MCP · copy
hub / github.com/VectifyAI/PageIndex / index

Method index

pageindex/client.py:55–130  ·  view source on GitHub ↗

Index a document. Returns a document_id.

(self, file_path: str, mode: str = "auto")

Source from the content-addressed store, hash-verified

53 self._load_workspace()
54
55 def index(self, file_path: str, mode: str = "auto") -> str:
56 """Index a document. Returns a document_id."""
57 # Persist a canonical absolute path so workspace reloads do not
58 # reinterpret caller-relative paths against the workspace directory.
59 file_path = os.path.abspath(os.path.expanduser(file_path))
60 if not os.path.exists(file_path):
61 raise FileNotFoundError(f"File not found: {file_path}")
62
63 doc_id = str(uuid.uuid4())
64 ext = os.path.splitext(file_path)[1].lower()
65
66 is_pdf = ext == '.pdf'
67 is_md = ext in ['.md', '.markdown']
68
69 if mode == "pdf" or (mode == "auto" and is_pdf):
70 print(f"Indexing PDF: {file_path}")
71 result = page_index(
72 doc=file_path,
73 model=self.model,
74 if_add_node_summary='yes',
75 if_add_node_text='yes',
76 if_add_node_id='yes',
77 if_add_doc_description='yes'
78 )
79 # Extract per-page text so queries don't need the original PDF
80 pages = []
81 with open(file_path, 'rb') as f:
82 pdf_reader = PyPDF2.PdfReader(f)
83 for i, page in enumerate(pdf_reader.pages, 1):
84 pages.append({'page': i, 'content': page.extract_text() or ''})
85
86 self.documents[doc_id] = {
87 'id': doc_id,
88 'type': 'pdf',
89 'path': file_path,
90 'doc_name': result.get('doc_name', ''),
91 'doc_description': result.get('doc_description', ''),
92 'page_count': len(pages),
93 'structure': result['structure'],
94 'pages': pages,
95 }
96
97 elif mode == "md" or (mode == "auto" and is_md):
98 print(f"Indexing Markdown: {file_path}")
99 coro = md_to_tree(
100 md_path=file_path,
101 if_thinning=False,
102 if_add_node_summary='yes',
103 summary_token_threshold=200,
104 model=self.model,
105 if_add_doc_description='yes',
106 if_add_node_text='yes',
107 if_add_node_id='yes'
108 )
109 try:
110 asyncio.get_running_loop()
111 with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
112 result = pool.submit(asyncio.run, coro).result()

Callers 1

Calls 3

_save_doc · Method · 0.95
page_index · Function · 0.90
md_to_tree · Function · 0.85

Tested by

no test coverage detected