Run --diff db mode: compare fresh extraction against the DB.
(
gz_files: list[str],
mode: str,
model: str | None,
run_dir: str,
debug: bool,
s: store.Store,
batch_size: int | None = None,
)
| 215 | |
| 216 | |
def _run_diff_db(
    gz_files: list[str],
    mode: str,
    model: str | None,
    run_dir: str,
    debug: bool,
    s: store.Store,
    batch_size: int | None = None,
) -> BatchResult:
    """Run --diff db mode: compare fresh extraction against the DB.

    An extractor is built for ``mode`` and driven over ``gz_files`` via
    ``run``. Each successfully extracted page is looked up in ``s`` and
    the output of ``format_diff`` is logged line by line; nothing is
    written to the store from this function.

    Args:
        gz_files: Paths handed to ``run`` for extraction.
        mode: Extractor mode; passed to ``make_extractor`` and echoed in
            the progress log line.
        model: Forwarded to ``ExtractorConfig``.
        run_dir: Forwarded to ``ExtractorConfig``.
        debug: Forwarded to ``ExtractorConfig``.
        s: Store queried with ``find_man_page`` for the stored page.
        batch_size: Forwarded unchanged to ``run``.

    Returns:
        Whatever ``run`` returns for this batch.
    """
    extractor = make_extractor(
        mode, ExtractorConfig(model=model, run_dir=run_dir, debug=debug)
    )

    from explainshell import manpage as _manpage

    n_files = len(gz_files)
    started = 0

    def on_start(gz_path: str) -> None:
        """Log a ``[i/total]`` progress line as each file begins."""
        nonlocal started
        started += 1
        logger.info(
            "[%d/%d] [%s] extracting (%s)...",
            started,
            n_files,
            config.source_from_path(gz_path),
            mode,
        )

    def lookup_stored(source: str, name: str):
        """Return the first stored page for ``source``, else for ``name``."""
        # Prefer exact source match (fully populated) over name lookup.
        try:
            matches = s.find_man_page(source)
        except errors.ProgramDoesNotExist:
            matches = s.find_man_page(name)
        return matches[0]

    def on_result(gz_path: str, entry: ExtractionResult) -> None:
        """Log the outcome for one file, diffing against the DB on success."""
        source = config.source_from_path(gz_path)
        outcome = entry.outcome
        if outcome == ExtractionOutcome.SKIPPED:
            logger.info("[%s] skipped: %s", source, entry.error)
        elif outcome == ExtractionOutcome.FAILED:
            logger.error("failed to process %s: %s", source, entry.error)
        else:
            page_name = _manpage.extract_name(gz_path)
            logger.info("=== %s ===", source)
            try:
                stored = lookup_stored(source, page_name)
            except errors.ProgramDoesNotExist:
                # Neither the source path nor the name is known to the store.
                logger.info(" (not in DB, nothing to diff)")
                return
            for diff_line in format_diff(stored, entry.mp):
                logger.info(diff_line)

    return run(
        extractor,
        gz_files,
        batch_size=batch_size,
        on_start=on_start,
        on_result=on_result,
    )
| 268 | |
| 269 | |
| 270 | def _format_decision(d: prefilter.Decision) -> str: |