Pre-process a manpage without calling the LLM. Raises SkippedExtraction if the source is blacklisted or the manpage is too large, and ExtractionError if the text splits into too many chunks.
(self, gz_path: str)
| 270 | return self._finalize(gz_path, prepared, all_chunk_data, stats) |
| 271 | |
def prepare(self, gz_path: str) -> PreparedFile:
    """Turn one manpage into ready-to-send request payloads; no LLM call is made.

    The manpage text is cleaned, run through ``filter_sections``, numbered,
    and split into chunks; one user-content string is built per chunk.

    ``gz_path`` is the path to the manpage file. The basename stored on the
    result is the file name with its last two extensions stripped.

    Returns a ``PreparedFile`` carrying the synopsis, aliases, line mapping,
    numbered text, the cleaned plain text (and its length), and the list of
    per-chunk request strings.

    Raises SkippedExtraction if the source is blacklisted or the cleaned
    text exceeds MAX_MANPAGE_CHARS, and ExtractionError if the text splits
    into more than MAX_CHUNKS chunks.
    """
    # Blacklisted sources are rejected before any manpage content is read.
    if config.source_from_path(gz_path) in _BLACKLISTED_SOURCES:
        raise SkippedExtraction(
            "blacklisted", reason_class=FailureReason.BLACKLISTED
        )

    synopsis, aliases = manpage.get_synopsis_and_aliases(gz_path)
    raw_text = get_manpage_text(gz_path)
    plain_text = clean_mandoc_artifacts(raw_text)
    plain_len = len(plain_text)

    # Strip two extensions in turn (e.g. "name.1.gz" -> "name").
    stem, _ = os.path.splitext(os.path.basename(gz_path))
    basename, _ = os.path.splitext(stem)

    # The size limit applies to the cleaned text, before section filtering.
    if plain_len > MAX_MANPAGE_CHARS:
        raise SkippedExtraction(
            f"manpage too large ({plain_len:,} chars, limit {MAX_MANPAGE_CHARS:,})",
            stats=ExtractionStats(plain_text_len=plain_len),
            reason_class=FailureReason.MANPAGE_TOO_LARGE,
        )

    filtered_text, removal_counts = filter_sections(plain_text)
    if removal_counts:
        removed_summary = ", ".join(
            f"{label} ({count})" for label, count in sorted(removal_counts.items())
        )
        logger.debug(
            "%s: filtered sections: %s (saved %d chars)",
            basename,
            removed_summary,
            plain_len - len(filtered_text),
        )

    numbered_text, original_lines = number_lines(filtered_text)
    # NOTE(review): chunks are cut from the filtered text, not from
    # numbered_text -- confirm this is intended.
    chunks = chunk_text(filtered_text)
    n_chunks = len(chunks)

    if n_chunks > MAX_CHUNKS:
        raise ExtractionError(
            f"too many chunks ({n_chunks:,}, limit {MAX_CHUNKS})",
            reason_class=FailureReason.TOO_MANY_CHUNKS,
        )

    # A "(part X of N)" suffix is only added when there is more than one chunk.
    multipart = n_chunks > 1
    requests: list[str] = [
        self._build_user_content(
            chunk, f" (part {part_no} of {n_chunks})" if multipart else ""
        )
        for part_no, chunk in enumerate(chunks, start=1)
    ]

    return PreparedFile(
        synopsis=synopsis,
        aliases=aliases,
        original_lines=original_lines,
        basename=basename,
        numbered_text=numbered_text,
        plain_text_len=plain_len,
        plain_text=plain_text,
        requests=requests,
    )
| 329 |