()
| 101 | therefore identical chunk_args_hash on chunk-0.""" |
| 102 | |
| 103 | async def _run(): |
| 104 | input_dir = tmp_path / "input" |
| 105 | input_dir.mkdir() |
| 106 | monkeypatch.setenv("INPUT_DIR", str(input_dir)) |
| 107 | |
| 108 | source_path = input_dir / "stable.docx" |
| 109 | source_path.write_bytes(b"fake docx bytes") |
| 110 | |
| 111 | # Stub extract_docx_blocks at the adapter so the upstream DOCX |
| 112 | # parser is never invoked. The adapter still does all the |
| 113 | # LightRAG-specific writing — that is what we want under test. |
| 114 | stable_blocks = [ |
| 115 | _block( |
| 116 | "Title\nFirst paragraph body.\nSecond paragraph body.", |
| 117 | heading="Title", |
| 118 | level=1, |
| 119 | ), |
| 120 | ] |
| 121 | |
| 122 | def _stub_extract(file_path, drawing_context=None, **kwargs): |
| 123 | return [dict(b) for b in stable_blocks] |
| 124 | |
| 125 | monkeypatch.setattr( |
| 126 | "lightrag.parser.docx.parse_document.extract_docx_blocks", |
| 127 | _stub_extract, |
| 128 | ) |
| 129 | |
| 130 | rag = _MiniRag(tmp_path / "work") |
| 131 | |
| 132 | # ---- First parse ---- |
| 133 | # parse_native archives the source after writing, so re-create it |
| 134 | # before the second parse for a fair comparison. |
| 135 | result1 = await _parse_via_registry( |
| 136 | rag, |
| 137 | "native", |
| 138 | "doc-stable", |
| 139 | str(source_path), |
| 140 | {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""}, |
| 141 | ) |
| 142 | merged1 = result1["content"] |
| 143 | assert merged1, "first parse produced empty merged_text" |
| 144 | |
| 145 | # ---- Second parse ---- |
| 146 | # Restore the source file (archive moved it), reset the in-memory |
| 147 | # full_docs row, and remove the parsed_dir so the writer rewrites |
| 148 | # both meta (with a fresh parse_time) and content lines. |
| 149 | source_path.write_bytes(b"fake docx bytes") |
| 150 | rag.full_docs.data.clear() |
| 151 | parsed_artifact_dir = input_dir / PARSED_DIR_NAME / f"{source_path.name}.parsed" |
| 152 | if parsed_artifact_dir.exists(): |
| 153 | import shutil |
| 154 | |
| 155 | shutil.rmtree(parsed_artifact_dir) |
| 156 | |
| 157 | result2 = await _parse_via_registry( |
| 158 | rag, |
| 159 | "native", |
| 160 | "doc-stable", |
no test coverage detected