(self, doc_id: str)
| 155 | return None |
| 156 | |
def _save_doc(self, doc_id: str):
    """Write one document to ``<workspace>/<doc_id>.json`` and update its meta entry.

    A shallow copy of ``self.documents[doc_id]`` is what gets serialized. When
    that copy has a truthy ``structure`` and its ``type`` is ``'pdf'``, the
    structure is first passed through ``remove_fields(..., fields=['text'])``.
    The same (possibly stripped) copy is handed to ``_make_meta_entry`` and the
    result is stored via ``_save_meta``. Finally the ``structure`` and
    ``pages`` keys are removed from the in-memory entry.

    Raises:
        KeyError: if ``doc_id`` is not present in ``self.documents``.
    """
    snapshot = self.documents[doc_id].copy()

    # Strip text from structure nodes — redundant with pages (PDF only)
    structure = snapshot.get('structure')
    if structure and snapshot.get('type') == 'pdf':
        snapshot['structure'] = remove_fields(structure, fields=['text'])

    target = self.workspace / f"{doc_id}.json"
    with open(target, "w", encoding="utf-8") as out:
        json.dump(snapshot, out, ensure_ascii=False, indent=2)

    meta_entry = self._make_meta_entry(snapshot)
    self._save_meta(doc_id, meta_entry)

    # Drop heavy fields; will lazy-load on demand
    # (re-fetch the entry here rather than reusing an earlier reference)
    cached = self.documents[doc_id]
    for heavy_key in ('structure', 'pages'):
        cached.pop(heavy_key, None)
| 169 | |
| 170 | def _rebuild_meta(self) -> dict: |
| 171 | """Scan individual doc JSON files and return a meta dict.""" |
no test coverage detected