Index a document. Returns a document_id.
(self, file_path: str, mode: str = "auto")
| 53 | self._load_workspace() |
| 54 | |
| 55 | def index(self, file_path: str, mode: str = "auto") -> str: |
| 56 | """Index a document. Returns a document_id.""" |
| 57 | # Persist a canonical absolute path so workspace reloads do not |
| 58 | # reinterpret caller-relative paths against the workspace directory. |
| 59 | file_path = os.path.abspath(os.path.expanduser(file_path)) |
| 60 | if not os.path.exists(file_path): |
| 61 | raise FileNotFoundError(f"File not found: {file_path}") |
| 62 | |
| 63 | doc_id = str(uuid.uuid4()) |
| 64 | ext = os.path.splitext(file_path)[1].lower() |
| 65 | |
| 66 | is_pdf = ext == '.pdf' |
| 67 | is_md = ext in ['.md', '.markdown'] |
| 68 | |
| 69 | if mode == "pdf" or (mode == "auto" and is_pdf): |
| 70 | print(f"Indexing PDF: {file_path}") |
| 71 | result = page_index( |
| 72 | doc=file_path, |
| 73 | model=self.model, |
| 74 | if_add_node_summary='yes', |
| 75 | if_add_node_text='yes', |
| 76 | if_add_node_id='yes', |
| 77 | if_add_doc_description='yes' |
| 78 | ) |
| 79 | # Extract per-page text so queries don't need the original PDF |
| 80 | pages = [] |
| 81 | with open(file_path, 'rb') as f: |
| 82 | pdf_reader = PyPDF2.PdfReader(f) |
| 83 | for i, page in enumerate(pdf_reader.pages, 1): |
| 84 | pages.append({'page': i, 'content': page.extract_text() or ''}) |
| 85 | |
| 86 | self.documents[doc_id] = { |
| 87 | 'id': doc_id, |
| 88 | 'type': 'pdf', |
| 89 | 'path': file_path, |
| 90 | 'doc_name': result.get('doc_name', ''), |
| 91 | 'doc_description': result.get('doc_description', ''), |
| 92 | 'page_count': len(pages), |
| 93 | 'structure': result['structure'], |
| 94 | 'pages': pages, |
| 95 | } |
| 96 | |
| 97 | elif mode == "md" or (mode == "auto" and is_md): |
| 98 | print(f"Indexing Markdown: {file_path}") |
| 99 | coro = md_to_tree( |
| 100 | md_path=file_path, |
| 101 | if_thinning=False, |
| 102 | if_add_node_summary='yes', |
| 103 | summary_token_threshold=200, |
| 104 | model=self.model, |
| 105 | if_add_doc_description='yes', |
| 106 | if_add_node_text='yes', |
| 107 | if_add_node_id='yes' |
| 108 | ) |
| 109 | try: |
| 110 | asyncio.get_running_loop() |
| 111 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: |
| 112 | result = pool.submit(asyncio.run, coro).result() |
no test coverage detected