Resolve the parser engine key for a full_docs row during processing. Returns a registry key: the internal ``reuse``/``passthrough`` handlers for the no-op formats, or a real engine name for ``pending_parse``. Never raises and never filters an unknown stored engine — the parse worker decides fallback/validation (so its warning path actually fires).
(
file_path: str | Path,
content_data: dict[str, Any] | None,
)
| 1205 | |
| 1206 | |
def resolve_stored_document_parser_engine(
    file_path: str | Path,
    content_data: dict[str, Any] | None,
) -> str:
    """Pick the parser registry key used to process a stored full_docs row.

    The decision is driven by the row's ``parse_format`` (treated as RAW when
    the key is absent):

    * lightrag rows map to the internal ``reuse`` handler;
    * any other format that is not ``pending_parse`` maps to the internal
      ``passthrough`` handler;
    * ``pending_parse`` rows yield the stored ``parse_engine`` (as returned by
      ``normalize_parser_engine``), or the filename-derived engine when no
      engine is stored.

    When ``content_data`` is ``None`` or empty, the filename rules decide.

    The stored engine is not checked against the set of known engines here:
    fallback and validation are left to the parse worker so that its warning
    path actually fires.

    Args:
        file_path: Path of the document, consulted only when the stored
            metadata does not settle the engine.
        content_data: The stored row for the document, or ``None``.

    Returns:
        The parser registry key to use for this row.
    """
    if not content_data:
        # Nothing stored for this row: only the filename rules can decide.
        return resolve_file_parser_engine(file_path)

    stored_format = content_data.get("parse_format", FULL_DOCS_FORMAT_RAW)

    if stored_format == FULL_DOCS_FORMAT_LIGHTRAG:
        # Already-parsed rows (seen on resume/retry) reuse their sidecar; the
        # sidecar is optional and ReuseParser tolerates a missing one.
        return PARSER_ENGINE_REUSE

    if stored_format != FULL_DOCS_FORMAT_PENDING_PARSE:
        # Not pending means the content is already extracted (raw
        # direct-insert or legacy-extracted RAW), so hand it through verbatim.
        return PARSER_ENGINE_PASSTHROUGH

    # PENDING_PARSE: the stored engine wins; the filename rules apply only
    # when no engine was recorded.
    stored_engine = normalize_parser_engine(content_data.get("parse_engine"))
    return stored_engine or resolve_file_parser_engine(file_path)