Run --diff A..B mode: compare two extractors on each file.
(
gz_files: list[str],
diff_left: tuple,
diff_right: tuple,
run_dir: str,
batch_size: int | None = None,
)
| 103 | |
| 104 | |
| 105 | def _run_diff_extractors( |
| 106 | gz_files: list[str], |
| 107 | diff_left: tuple, |
| 108 | diff_right: tuple, |
| 109 | run_dir: str, |
| 110 | batch_size: int | None = None, |
| 111 | ) -> BatchResult: |
| 112 | """Run --diff A..B mode: compare two extractors on each file.""" |
| 113 | left_mode, left_model = diff_left |
| 114 | right_mode, right_model = diff_right |
| 115 | left_label = left_mode if not left_model else f"{left_mode} ({left_model})" |
| 116 | right_label = right_mode if not right_model else f"{right_mode} ({right_model})" |
| 117 | label = f"{left_label} vs {right_label}" |
| 118 | |
| 119 | left_cfg = ExtractorConfig(model=left_model, run_dir=run_dir) |
| 120 | right_cfg = ExtractorConfig(model=right_model, run_dir=run_dir) |
| 121 | left_ext = make_extractor(left_mode, left_cfg) |
| 122 | right_ext = make_extractor(right_mode, right_cfg) |
| 123 | |
| 124 | left_files: list[ExtractionResult] = [] |
| 125 | right_files: list[ExtractionResult] = [] |
| 126 | |
| 127 | left_bs = batch_size |
| 128 | right_bs = batch_size |
| 129 | |
| 130 | logger.info("running %s extractor on %d file(s)...", left_label, len(gz_files)) |
| 131 | run( |
| 132 | left_ext, |
| 133 | gz_files, |
| 134 | batch_size=left_bs, |
| 135 | on_result=lambda _p, e: left_files.append(e), |
| 136 | ) |
| 137 | logger.info("running %s extractor on %d file(s)...", right_label, len(gz_files)) |
| 138 | run( |
| 139 | right_ext, |
| 140 | gz_files, |
| 141 | batch_size=right_bs, |
| 142 | on_result=lambda _p, e: right_files.append(e), |
| 143 | ) |
| 144 | |
| 145 | batch = BatchResult() |
| 146 | for left_entry, right_entry in zip(left_files, right_files): |
| 147 | gz_path = left_entry.gz_path |
| 148 | short_path = config.source_from_path(gz_path) |
| 149 | left_ok = left_entry.outcome == ExtractionOutcome.SUCCESS |
| 150 | right_ok = right_entry.outcome == ExtractionOutcome.SUCCESS |
| 151 | |
| 152 | # Always accumulate stats from successful extractions, even when |
| 153 | # the other side failed — the tokens were consumed either way. |
| 154 | if left_ok: |
| 155 | batch.stats += left_entry.stats |
| 156 | if right_ok: |
| 157 | batch.stats += right_entry.stats |
| 158 | |
| 159 | if not left_ok or not right_ok: |
| 160 | logger.info("=== %s (%s) ===", short_path, label) |
| 161 | if not left_ok: |
| 162 | logger.info( |