MCPcopy
hub / github.com/idank/explainshell / prepare

Method prepare

explainshell/extraction/llm/extractor.py:272–328  ·  view source on GitHub ↗

Pre-process a manpage without calling the LLM. Raises SkippedExtraction if the manpage is blacklisted or too large, and ExtractionError if it splits into too many chunks.

(self, gz_path: str)

Source from the content-addressed store, hash-verified

270 return self._finalize(gz_path, prepared, all_chunk_data, stats)
271
def prepare(self, gz_path: str) -> PreparedFile:
    """Turn a manpage into ready-to-send request payloads; no LLM call is made.

    The manpage text is cleaned, filtered, line-numbered and chunked, and one
    request string is built per chunk.

    Raises SkippedExtraction if the source is blacklisted or the cleaned
    text is longer than MAX_MANPAGE_CHARS.
    Raises ExtractionError if the filtered text produces more than
    MAX_CHUNKS chunks.
    """
    # Blacklisted sources are rejected before any file content is read.
    if config.source_from_path(gz_path) in _BLACKLISTED_SOURCES:
        raise SkippedExtraction(
            "blacklisted", reason_class=FailureReason.BLACKLISTED
        )

    synopsis, aliases = manpage.get_synopsis_and_aliases(gz_path)
    raw_text = get_manpage_text(gz_path)
    plain_text = clean_mandoc_artifacts(raw_text)

    # Drop the last two extensions from the file name, one at a time.
    stem, _ = os.path.splitext(os.path.basename(gz_path))
    basename, _ = os.path.splitext(stem)

    # The size limit applies to the cleaned text, before section filtering.
    if len(plain_text) > MAX_MANPAGE_CHARS:
        raise SkippedExtraction(
            f"manpage too large ({len(plain_text):,} chars, limit {MAX_MANPAGE_CHARS:,})",
            stats=ExtractionStats(plain_text_len=len(plain_text)),
            reason_class=FailureReason.MANPAGE_TOO_LARGE,
        )

    filtered_text, removal_counts = filter_sections(plain_text)
    if removal_counts:
        removed_summary = ", ".join(
            f"{k} ({v})" for k, v in sorted(removal_counts.items())
        )
        chars_saved = len(plain_text) - len(filtered_text)
        logger.debug(
            "%s: filtered sections: %s (saved %d chars)",
            basename,
            removed_summary,
            chars_saved,
        )

    numbered_text, original_lines = number_lines(filtered_text)
    chunks = chunk_text(filtered_text)
    n_chunks = len(chunks)

    if n_chunks > MAX_CHUNKS:
        raise ExtractionError(
            f"too many chunks ({n_chunks:,}, limit {MAX_CHUNKS})",
            reason_class=FailureReason.TOO_MANY_CHUNKS,
        )

    # A "(part X of Y)" suffix is only added when there is more than one chunk.
    multipart = n_chunks > 1
    requests: list[str] = [
        self._build_user_content(
            chunk,
            f" (part {i + 1} of {n_chunks})" if multipart else "",
        )
        for i, chunk in enumerate(chunks)
    ]

    return PreparedFile(
        synopsis=synopsis,
        aliases=aliases,
        original_lines=original_lines,
        basename=basename,
        numbered_text=numbered_text,
        plain_text_len=len(plain_text),
        plain_text=plain_text,
        requests=requests,
    )
329

Callers 5

extractMethod · 0.95
run_batchFunction · 0.45
test_blacklisted_skipMethod · 0.45

Calls 10

_build_user_contentMethod · 0.95
SkippedExtractionClass · 0.90
clean_mandoc_artifactsFunction · 0.90
get_manpage_textFunction · 0.90
ExtractionStatsClass · 0.90
filter_sectionsFunction · 0.90
number_linesFunction · 0.90
chunk_textFunction · 0.90
ExtractionErrorClass · 0.90
PreparedFileClass · 0.85

Tested by 3

test_blacklisted_skipMethod · 0.36