Run LLM extraction via provider batch API. Files are finalized as soon as their batch completes (per-batch), not after all batches finish. The optional ``on_result`` callback is invoked immediately after each file is finalized, always from the main thread. When ``jobs > 1``, up to that many provider batches are submitted and polled concurrently via a thread pool.
(
extractor: BatchExtractor,
gz_files: list[str],
*,
manifest: BatchManifestWriter,
batch_size: int = 50,
jobs: int = 1,
on_start: Callable[[str], None] | None = None,
on_result: Callable[[str, ExtractionResult], None] | None = None,
)
| 467 | |
| 468 | |
| 469 | def run_batch( |
| 470 | extractor: BatchExtractor, |
| 471 | gz_files: list[str], |
| 472 | *, |
| 473 | manifest: BatchManifestWriter, |
| 474 | batch_size: int = 50, |
| 475 | jobs: int = 1, |
| 476 | on_start: Callable[[str], None] | None = None, |
| 477 | on_result: Callable[[str, ExtractionResult], None] | None = None, |
| 478 | ) -> BatchResult: |
| 479 | """Run LLM extraction via provider batch API. |
| 480 | |
| 481 | Files are finalized as soon as their batch completes (per-batch), |
| 482 | not after all batches finish. The optional ``on_result`` callback |
| 483 | is invoked immediately after each file is finalized, always from the |
| 484 | main thread. |
| 485 | |
| 486 | When ``jobs > 1``, up to that many provider batches are submitted |
| 487 | and polled concurrently via a thread pool. |
| 488 | """ |
| 489 | result = BatchResult() |
| 490 | |
| 491 | # Phase 1: prepare all files. |
| 492 | work_items: list[WorkItem] = [] |
| 493 | for gz_path in gz_files: |
| 494 | if on_start: |
| 495 | on_start(gz_path) |
| 496 | try: |
| 497 | prepared = extractor.prepare(gz_path) |
| 498 | except SkippedExtraction as e: |
| 499 | entry = ExtractionResult( |
| 500 | gz_path=gz_path, |
| 501 | outcome=ExtractionOutcome.SKIPPED, |
| 502 | stats=e.stats, |
| 503 | error=e.reason, |
| 504 | reason_class=e.reason_class, |
| 505 | ) |
| 506 | _tally(result, entry) |
| 507 | if on_result: |
| 508 | on_result(gz_path, entry) |
| 509 | continue |
| 510 | except ExtractionError as e: |
| 511 | logger.error("failed to prepare %s: %s", gz_path, e) |
| 512 | entry = ExtractionResult( |
| 513 | gz_path=gz_path, |
| 514 | outcome=ExtractionOutcome.FAILED, |
| 515 | error=str(e), |
| 516 | reason_class=e.reason_class, |
| 517 | ) |
| 518 | _tally(result, entry) |
| 519 | if on_result: |
| 520 | on_result(gz_path, entry) |
| 521 | continue |
| 522 | except Exception as e: |
| 523 | logger.error("failed to prepare %s: %s", gz_path, e) |
| 524 | entry = ExtractionResult( |
| 525 | gz_path=gz_path, |
| 526 | outcome=ExtractionOutcome.FAILED, |