Full extraction pipeline: prepare → LLM calls → finalize.
(self, gz_path: str)
| 212 | self._cancelled.set() |
| 213 | |
def extract(self, gz_path: str) -> ExtractionResult:
    """Run the complete extraction: prepare, one LLM call per request, finalize.

    Args:
        gz_path: Path passed to ``prepare``, to ``_dump_failed_response``
            (on failure) and to ``_finalize``.

    Returns:
        The value produced by ``_finalize`` from the prepared input, the
        per-request chunk results and the accumulated stats.

    Raises:
        ExtractionError: With ``FailureReason.CANCELLED`` when the cancel
            flag is set before a request is sent. Errors raised by
            ``_call_llm`` are re-raised unchanged, after any raw response
            attached to them has been dumped.
    """
    prepared = self.prepare(gz_path)
    basename = prepared.basename
    n_chunks = prepared.n_chunks

    logger.info(
        "%s: %d chars (%d numbered), %d chunk(s)",
        basename,
        prepared.plain_text_len,
        len(prepared.numbered_text),
        n_chunks,
    )

    stats = ExtractionStats(
        chunks=n_chunks,
        plain_text_len=prepared.plain_text_len,
    )

    collected: list[ChunkResult] = []
    started_at = time.monotonic()

    for idx, payload in enumerate(prepared.requests):
        # Cancellation is only checked between requests, never mid-call.
        if self._cancelled.is_set():
            raise ExtractionError("cancelled", reason_class=FailureReason.CANCELLED)

        if n_chunks > 1:
            label = f"chunk {idx + 1}/{n_chunks}"
        else:
            label = "single chunk"

        logger.info(
            "%s: calling LLM (%s, %d chars)...",
            basename,
            label,
            len(payload),
        )

        try:
            result = self._call_llm(payload)
        except ExtractionError as err:
            # Keep whatever raw response came with the error before
            # propagating it; the dump receives the 0-based request index.
            if err.raw_response:
                self._dump_failed_response(gz_path, idx, err.raw_response)
            raise
        else:
            # Fold this call's token usage into the running totals.
            stats.input_tokens += result.usage.input_tokens
            stats.output_tokens += result.usage.output_tokens
            stats.reasoning_tokens += result.usage.reasoning_tokens

            option_count = len(result.data["options"])
            logger.info(
                "%s: LLM returned %d option(s) for %s",
                basename,
                option_count,
                label,
            )
            collected.append(result)

    # Elapsed time spans the request loop only; prepare and finalize are excluded.
    stats.elapsed_seconds = time.monotonic() - started_at
    return self._finalize(gz_path, prepared, collected, stats)
| 271 |