A client for indexing and retrieving document content. Flow: index() -> get_document() / get_document_structure() / get_page_content(). For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
| 26 | |
| 27 | |
| 28 | class PageIndexClient: |
| 29 | """ |
| 30 | A client for indexing and retrieving document content. |
| 31 | Flow: index() -> get_document() / get_document_structure() / get_page_content() |
| 32 | |
| 33 | For agent-based QA, see examples/agentic_vectorless_rag_demo.py. |
| 34 | """ |
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
    """Configure credentials, resolve model settings, and attach a workspace.

    Args:
        api_key: When truthy, exported as the ``OPENAI_API_KEY`` environment
            variable for the whole process. When omitted and
            ``OPENAI_API_KEY`` is unset or empty, the value of
            ``CHATGPT_API_KEY`` (if any) is copied into it instead.
        model: Optional override for the ``model`` setting passed to
            ``ConfigLoader().load``.
        retrieve_model: Optional override for the ``retrieve_model``
            setting; when the loaded value is falsy, the resolved ``model``
            is used in its place.
        workspace: Optional directory path. ``~`` is expanded, the directory
            is created if missing, and ``_load_workspace()`` is then called.
    """
    # Credentials: an explicit key wins; otherwise fall back to the
    # CHATGPT_API_KEY variable only when OPENAI_API_KEY is not already set.
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
    else:
        primary_missing = not os.getenv("OPENAI_API_KEY")
        if primary_missing and os.getenv("CHATGPT_API_KEY"):
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")

    if workspace:
        self.workspace = Path(workspace).expanduser()
    else:
        self.workspace = None

    # Only forward the settings the caller actually supplied; an empty
    # mapping is passed to the loader as None.
    requested = {"model": model, "retrieve_model": retrieve_model}
    overrides = {name: value for name, value in requested.items() if value}
    settings = ConfigLoader().load(overrides if overrides else None)

    self.model = settings.model
    fallback_model = settings.retrieve_model or self.model
    self.retrieve_model = _normalize_retrieve_model(fallback_model)

    # Path objects are always truthy, so a None check is equivalent here.
    has_workspace = self.workspace is not None
    if has_workspace:
        self.workspace.mkdir(parents=True, exist_ok=True)
    self.documents = {}
    if has_workspace:
        self._load_workspace()
| 54 | |
| 55 | def index(self, file_path: str, mode: str = "auto") -> str: |
| 56 | """Index a document. Returns a document_id.""" |
| 57 | # Persist a canonical absolute path so workspace reloads do not |
| 58 | # reinterpret caller-relative paths against the workspace directory. |
| 59 | file_path = os.path.abspath(os.path.expanduser(file_path)) |
| 60 | if not os.path.exists(file_path): |
| 61 | raise FileNotFoundError(f"File not found: {file_path}") |
| 62 | |
| 63 | doc_id = str(uuid.uuid4()) |
| 64 | ext = os.path.splitext(file_path)[1].lower() |
| 65 | |
| 66 | is_pdf = ext == '.pdf' |
| 67 | is_md = ext in ['.md', '.markdown'] |
| 68 | |
| 69 | if mode == "pdf" or (mode == "auto" and is_pdf): |
| 70 | print(f"Indexing PDF: {file_path}") |
| 71 | result = page_index( |
| 72 | doc=file_path, |
| 73 | model=self.model, |
| 74 | if_add_node_summary='yes', |
| 75 | if_add_node_text='yes', |
| 76 | if_add_node_id='yes', |
| 77 | if_add_doc_description='yes' |
| 78 | ) |
| 79 | # Extract per-page text so queries don't need the original PDF |
| 80 | pages = [] |
| 81 | with open(file_path, 'rb') as f: |
| 82 | pdf_reader = PyPDF2.PdfReader(f) |
| 83 | for i, page in enumerate(pdf_reader.pages, 1): |
| 84 | pages.append({'page': i, 'content': page.extract_text() or ''}) |
| 85 |
no outgoing calls
no test coverage detected