MCPcopy Index your code
hub / github.com/VectifyAI/PageIndex / PageIndexClient

Class PageIndexClient

pageindex/client.py:28–234  ·  view source on GitHub ↗

A client for indexing and retrieving document content. Flow: index() -> get_document() / get_document_structure() / get_page_content(). For agent-based QA, see examples/agentic_vectorless_rag_demo.py.

Source from the content-addressed store, hash-verified

26
27
28class PageIndexClient:
29 """
30 A client for indexing and retrieving document content.
31 Flow: index() -> get_document() / get_document_structure() / get_page_content()
32
33 For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
34 """
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
    """Set up API credentials, model selection and the optional workspace.

    Args:
        api_key: When truthy, stored in the ``OPENAI_API_KEY`` environment
            variable. When falsy and ``OPENAI_API_KEY`` is unset or empty,
            a non-empty ``CHATGPT_API_KEY`` is copied into it instead.
        model: Optional override passed to the config loader as ``model``.
        retrieve_model: Optional override passed to the config loader as
            ``retrieve_model``. If the loaded config has no retrieve model,
            the resolved ``model`` is used.
        workspace: Optional directory path. ``~`` is expanded, the
            directory is created if missing, and ``_load_workspace()`` is
            called after ``self.documents`` has been initialised.
    """
    # Resolve which key (if any) should end up in OPENAI_API_KEY. An explicit
    # argument wins; the CHATGPT_API_KEY fallback only applies when
    # OPENAI_API_KEY is not already populated.
    resolved_key = api_key
    if not resolved_key and not os.getenv("OPENAI_API_KEY"):
        resolved_key = os.getenv("CHATGPT_API_KEY")
    if resolved_key:
        os.environ["OPENAI_API_KEY"] = resolved_key

    self.workspace = None if not workspace else Path(workspace).expanduser()

    # Only forward the overrides the caller actually supplied; an empty
    # mapping is passed to the loader as None.
    supplied = (("model", model), ("retrieve_model", retrieve_model))
    overrides = {name: value for name, value in supplied if value}
    opt = ConfigLoader().load(overrides or None)

    self.model = opt.model
    retrieve_choice = opt.retrieve_model or self.model
    self.retrieve_model = _normalize_retrieve_model(retrieve_choice)

    has_workspace = bool(self.workspace)
    if has_workspace:
        self.workspace.mkdir(parents=True, exist_ok=True)
    # The document registry must exist before the workspace is loaded.
    self.documents = {}
    if has_workspace:
        self._load_workspace()
54
55 def index(self, file_path: str, mode: str = "auto") -> str:
56 """Index a document. Returns a document_id."""
57 # Persist a canonical absolute path so workspace reloads do not
58 # reinterpret caller-relative paths against the workspace directory.
59 file_path = os.path.abspath(os.path.expanduser(file_path))
60 if not os.path.exists(file_path):
61 raise FileNotFoundError(f"File not found: {file_path}")
62
63 doc_id = str(uuid.uuid4())
64 ext = os.path.splitext(file_path)[1].lower()
65
66 is_pdf = ext == '.pdf'
67 is_md = ext in ['.md', '.markdown']
68
69 if mode == "pdf" or (mode == "auto" and is_pdf):
70 print(f"Indexing PDF: {file_path}")
71 result = page_index(
72 doc=file_path,
73 model=self.model,
74 if_add_node_summary='yes',
75 if_add_node_text='yes',
76 if_add_node_id='yes',
77 if_add_doc_description='yes'
78 )
79 # Extract per-page text so queries don't need the original PDF
80 pages = []
81 with open(file_path, 'rb') as f:
82 pdf_reader = PyPDF2.PdfReader(f)
83 for i, page in enumerate(pdf_reader.pages, 1):
84 pages.append({'page': i, 'content': page.extract_text() or ''})
85

Callers 1

Calls

no outgoing calls

Tested by

no test coverage detected