| 1089 | """.strip() |
| 1090 | |
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
    """
    Set up an LLM-backed PDF parser.

    Checks that the PDF parsing config carries an LLM parser config,
    reserves a uniquely named ``.md`` output file under ``OUTPUT_DIR``,
    and initializes the chunking, concurrency and retry settings.

    Args:
        source: Forwarded to the parent initializer. When it is a string
            naming an existing filesystem path, that path's stem is used
            as the prefix of the output filename; otherwise the prefix is
            ``output_``.
        config: Parsing configuration; ``config.pdf.llm_parser_config``
            must be set.

    Raises:
        ValueError: If ``config.pdf.llm_parser_config`` is missing or falsy.
    """
    super().__init__(source, config)

    llm_config = config.pdf.llm_parser_config
    if not llm_config:
        raise ValueError(
            "LLMPdfParser requires a llm-based config in pdf parsing config"
        )
    self.llm_parser_config: LLMPdfParserConfig = llm_config
    self.model_name = llm_config.model_name

    # The output directory must exist before a file can be reserved in it.
    self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Name the output after the input file when the source is a real path.
    if isinstance(source, str) and Path(source).exists():
        name_prefix = Path(source).stem + "_"
    else:
        name_prefix = "output_"

    # Reserve a unique filename: the file is created and closed right away,
    # and kept on disk (delete=False) so that the name stays claimed.
    with tempfile.NamedTemporaryFile(
        suffix=".md",
        prefix=name_prefix,
        dir=str(self.OUTPUT_DIR),
        delete=False,
    ) as placeholder:
        reserved_name = placeholder.name
    self.output_filename = Path(reserved_name)

    # Falls back to the class default when the config leaves it unset (or 0).
    self.max_tokens = llm_config.max_tokens or self.DEFAULT_MAX_TOKENS

    # If True, each PDF page is processed as a separate chunk, resulting in
    # one LLM request per page. If False, pages are grouped into chunks
    # based on a token limit before being sent to the LLM
    # (presumably `self.max_tokens` -- TODO confirm).
    self.split_on_page = llm_config.split_on_page or False

    # Rate limiting parameters
    import asyncio

    self.requests_per_minute = llm_config.requests_per_minute or 5

    # Semaphore with `requests_per_minute` slots; a slot is meant to be
    # acquired before an LLM request and released once it completes.
    # NOTE(review): this caps the number of concurrent requests, not the
    # number of requests per time window -- confirm that this is intended.
    # NOTE(review): on older Python versions an asyncio.Semaphore may bind
    # to the event loop current at construction time -- confirm that the
    # parser is always used on the same loop it was created under.
    self.semaphore = asyncio.Semaphore(self.requests_per_minute)
    self.retry_delay = 5  # seconds, for exponential backoff
    self.max_retries = 3
| 1140 | |
| 1141 | def _extract_page(self, page_num: int) -> Dict[str, Any]: |
| 1142 | """ |