hub / github.com/langroid/langroid / __init__

Method __init__

langroid/parsing/document_parser.py:1091–1139 · view source on GitHub ↗

(self, source: Union[str, bytes], config: ParsingConfig)

Source from the content-addressed store, hash-verified

1089	""".strip()
1090
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
    """
    Set up the LLM-based PDF parser: validate the config, reserve an
    output markdown file, and initialise chunking/throttling settings.

    Args:
        source: the document to parse, as a string or raw bytes. When it
            is a string naming an existing filesystem path, that path's
            stem is used as the prefix of the output filename.
        config: parsing configuration; `config.pdf.llm_parser_config`
            must be set.

    Raises:
        ValueError: if `config.pdf.llm_parser_config` is missing/falsy.
    """
    super().__init__(source, config)

    llm_cfg = config.pdf.llm_parser_config
    if not llm_cfg:
        raise ValueError(
            "LLMPdfParser requires a llm-based config in pdf parsing config"
        )
    self.llm_parser_config: LLMPdfParserConfig = llm_cfg
    self.model_name = llm_cfg.model_name

    # Make sure the directory that receives the output exists.
    self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Name the output after the source file when the source is a string
    # pointing at an existing path; otherwise use a generic prefix.
    prefix = "output_"
    if isinstance(source, str):
        source_path = Path(source)
        if source_path.exists():
            prefix = source_path.stem + "_"

    # Reserve a uniquely named .md file inside OUTPUT_DIR. With
    # delete=False the (empty) file stays on disk once the handle is
    # closed at the end of the `with` block; only its path is kept.
    with tempfile.NamedTemporaryFile(
        suffix=".md",
        prefix=prefix,
        dir=str(self.OUTPUT_DIR),
        delete=False,
    ) as reserved:
        reserved_name = reserved.name
    self.output_filename = Path(reserved_name)

    # Fall back to the class default when the config gives no token cap.
    self.max_tokens = llm_cfg.max_tokens or self.DEFAULT_MAX_TOKENS

    # If True, each PDF page is processed as a separate chunk, resulting
    # in one LLM request per page. If False, pages are grouped into
    # chunks based on `max_token_limit` before being sent to the LLM.
    self.split_on_page = llm_cfg.split_on_page or False

    # Rate limiting parameters
    import asyncio

    self.requests_per_minute = llm_cfg.requests_per_minute or 5

    # Semaphore controlling the number of concurrent requests to the
    # LLM, to help avoid rate limit errors. It is meant to be acquired
    # before an LLM request and released once the request completes.
    # NOTE(review): it is sized by `requests_per_minute`, so it bounds
    # in-flight requests rather than enforcing a per-minute rate.
    self.semaphore = asyncio.Semaphore(self.requests_per_minute)
    self.retry_delay = 5  # seconds, for exponential backoff
    self.max_retries = 3
1140
1141	def _extract_page(self, page_num: int) -> Dict[str, Any]:
1142	"""

Callers 2

__init__Method · 0.45

Calls 1

closeMethod · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers 2

Calls 1

Tested by

Method __init__