MCPcopy
hub / github.com/langroid/langroid / __init__

Method __init__

langroid/parsing/document_parser.py:1091–1139  ·  view source on GitHub ↗
(self, source: Union[str, bytes], config: ParsingConfig)

Source from the content-addressed store, hash-verified

1089 """.strip()
1090
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
    """
    Set up an LLM-based PDF parser.

    Args:
        source: Forwarded unchanged to the base-class initializer. When it
            is a string that names an existing filesystem path, that path's
            stem becomes the prefix of the generated output file name;
            otherwise the prefix is "output_".
        config: Parsing configuration. `config.pdf.llm_parser_config` must
            be set (truthy).

    Raises:
        ValueError: If `config.pdf.llm_parser_config` is missing or falsy.
    """
    super().__init__(source, config)

    llm_cfg = config.pdf.llm_parser_config
    if not llm_cfg:
        raise ValueError(
            "LLMPdfParser requires a llm-based config in pdf parsing config"
        )
    self.llm_parser_config: LLMPdfParserConfig = llm_cfg
    self.model_name = llm_cfg.model_name

    # The output directory must exist before a file can be reserved in it.
    self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Name the output after the input file when `source` is a real path.
    stem_prefix = "output_"
    if isinstance(source, str) and Path(source).exists():
        stem_prefix = Path(source).stem + "_"

    # Reserve a unique ".md" file: delete=False keeps it on disk after the
    # handle is closed on leaving the `with` block, so only its path is kept.
    with tempfile.NamedTemporaryFile(
        suffix=".md",
        prefix=stem_prefix,
        dir=str(self.OUTPUT_DIR),
        delete=False,
    ) as reserved:
        self.output_filename = Path(reserved.name)

    # Unset/falsy config values fall back to defaults.
    self.max_tokens = llm_cfg.max_tokens or self.DEFAULT_MAX_TOKENS

    # If True, each PDF page is processed as its own chunk (one LLM request
    # per page). If False, pages are grouped into chunks by a token limit
    # before being sent to the LLM.
    self.split_on_page = llm_cfg.split_on_page or False

    # Rate limiting parameters
    import asyncio

    self.requests_per_minute = llm_cfg.requests_per_minute or 5

    # Semaphore sized by `requests_per_minute`, meant to bound the number of
    # concurrent LLM requests and so avoid rate-limit errors. Slots are
    # expected to be acquired before a request and released afterwards
    # (the acquire/release code lives outside this method).
    self.semaphore = asyncio.Semaphore(self.requests_per_minute)
    self.retry_delay = 5  # seconds, for exponential backoff
    self.max_retries = 3
1140
1141 def _extract_page(self, page_num: int) -> Dict[str, Any]:
1142 """

Callers 2

__init__Method · 0.45
__init__Method · 0.45

Calls 1

closeMethod · 0.45

Tested by

no test coverage detected