hub / github.com/VectifyAI/PageIndex / index

Method index

pageindex/client.py:55–130 · view source on GitHub ↗

Index a document. Returns a document_id.

(self, file_path: str, mode: str = "auto")

Source from the content-addressed store, hash-verified

53	self._load_workspace()
54
55	def index(self, file_path: str, mode: str = "auto") -> str:
56	"""Index a document. Returns a document_id."""
57	# Persist a canonical absolute path so workspace reloads do not
58	# reinterpret caller-relative paths against the workspace directory.
59	file_path = os.path.abspath(os.path.expanduser(file_path))
60	if not os.path.exists(file_path):
61	raise FileNotFoundError(f"File not found: {file_path}")
62
63	doc_id = str(uuid.uuid4())
64	ext = os.path.splitext(file_path)[1].lower()
65
66	is_pdf = ext == '.pdf'
67	is_md = ext in ['.md', '.markdown']
68
69	if mode == "pdf" or (mode == "auto" and is_pdf):
70	print(f"Indexing PDF: {file_path}")
71	result = page_index(
72	doc=file_path,
73	model=self.model,
74	if_add_node_summary='yes',
75	if_add_node_text='yes',
76	if_add_node_id='yes',
77	if_add_doc_description='yes'
78	)
79	# Extract per-page text so queries don't need the original PDF
80	pages = []
81	with open(file_path, 'rb') as f:
82	pdf_reader = PyPDF2.PdfReader(f)
83	for i, page in enumerate(pdf_reader.pages, 1):
84	pages.append({'page': i, 'content': page.extract_text() or ''})
85
86	self.documents[doc_id] = {
87	'id': doc_id,
88	'type': 'pdf',
89	'path': file_path,
90	'doc_name': result.get('doc_name', ''),
91	'doc_description': result.get('doc_description', ''),
92	'page_count': len(pages),
93	'structure': result['structure'],
94	'pages': pages,
95	}
96
97	elif mode == "md" or (mode == "auto" and is_md):
98	print(f"Indexing Markdown: {file_path}")
99	coro = md_to_tree(
100	md_path=file_path,
101	if_thinning=False,
102	if_add_node_summary='yes',
103	summary_token_threshold=200,
104	model=self.model,
105	if_add_doc_description='yes',
106	if_add_node_text='yes',
107	if_add_node_id='yes'
108	)
109	try:
110	asyncio.get_running_loop()
111	with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
112	result = pool.submit(asyncio.run, coro).result()

Callers 1

agentic_vectorless_rag_demo.py — File · 0.80

Calls 3

_save_doc — Method · 0.95

page_index — Function · 0.90

md_to_tree — Function · 0.85

Tested by

no test coverage detected