LLM-based option extractor. Implements the base ``Extractor`` protocol via ``extract()``. Also satisfies ``BatchExtractor`` via ``prepare()``, ``finalize()``, and ``batch_provider``.
| 186 | |
| 187 | |
| 188 | class LLMExtractor: |
| 189 | """LLM-based option extractor. |
| 190 | |
| 191 | Implements the base ``Extractor`` protocol via ``extract()``. |
| 192 | Also satisfies ``BatchExtractor`` via ``prepare()``, ``finalize()``, |
| 193 | and ``batch_provider``. |
| 194 | """ |
| 195 | |
def __init__(self, config: ExtractorConfig) -> None:
    """Build an extractor from *config* and create its LLM providers.

    Stores the model name (a falsy ``config.model`` becomes ``""``),
    ``run_dir``, ``repo_root`` and ``debug`` from *config*, then creates
    the synchronous provider via ``make_provider``.

    ``batch_provider`` is assigned only when ``make_batch_provider``
    accepts the model. If it raises ``ValueError`` the attribute is left
    unset on the instance.
    """
    # Flag checked by extract() before each chunk is submitted.
    self._cancelled = threading.Event()

    model_name = config.model or ""
    self._model = model_name
    self._run_dir = config.run_dir
    self._repo_root = config.repo_root
    self._debug = config.debug

    self.provider: LLMProvider = make_provider(model_name)
    try:
        self.batch_provider: BatchProvider = make_batch_provider(model_name)
    except ValueError:
        # This model has no batch support. The attribute is intentionally
        # left undefined; it is only read when the --batch flag is used.
        pass
| 207 | |
def cancel(self) -> None:
    """Request that running ``extract()`` calls stop.

    Sets the shared cancellation event. ``extract()`` checks that event
    before handling each chunk and raises ``ExtractionError`` when it is
    set, so no further chunk is submitted. A request that is already in
    flight is not interrupted; it runs to completion first.
    """
    stop_flag = self._cancelled
    stop_flag.set()
| 213 | |
| 214 | def extract(self, gz_path: str) -> ExtractionResult: |
| 215 | """Full extraction pipeline: prepare → LLM calls → finalize.""" |
| 216 | prepared = self.prepare(gz_path) |
| 217 | basename = prepared.basename |
| 218 | n_chunks = prepared.n_chunks |
| 219 | |
| 220 | logger.info( |
| 221 | "%s: %d chars (%d numbered), %d chunk(s)", |
| 222 | basename, |
| 223 | prepared.plain_text_len, |
| 224 | len(prepared.numbered_text), |
| 225 | n_chunks, |
| 226 | ) |
| 227 | |
| 228 | stats = ExtractionStats( |
| 229 | chunks=n_chunks, |
| 230 | plain_text_len=prepared.plain_text_len, |
| 231 | ) |
| 232 | |
| 233 | all_chunk_data: list[ChunkResult] = [] |
| 234 | t0 = time.monotonic() |
| 235 | |
| 236 | for i, user_content in enumerate(prepared.requests): |
| 237 | if self._cancelled.is_set(): |
| 238 | raise ExtractionError("cancelled", reason_class=FailureReason.CANCELLED) |
| 239 | |
| 240 | chunk_label = ( |
| 241 | f"chunk {i + 1}/{n_chunks}" if n_chunks > 1 else "single chunk" |
| 242 | ) |
| 243 | logger.info( |
| 244 | "%s: calling LLM (%s, %d chars)...", |
| 245 | basename, |
no outgoing calls