Parse document with caching support. Args: file_path: Path to the file to parse; output_dir: Output directory (defaults to config.parser_output_dir); parse_method: Parse method (defaults to config.parse_method); display_stats: Whether to display content statistics (defaults to config.display_content_stats).
(
self,
file_path: str,
output_dir: str = None,
parse_method: str = None,
display_stats: bool = None,
**kwargs,
)
| 384 | self.logger.warning(f"Error storing to parse cache: {e}") |
| 385 | |
| 386 | async def parse_document( |
| 387 | self, |
| 388 | file_path: str, |
| 389 | output_dir: str = None, |
| 390 | parse_method: str = None, |
| 391 | display_stats: bool = None, |
| 392 | **kwargs, |
| 393 | ) -> tuple[List[Dict[str, Any]], str]: |
| 394 | """ |
| 395 | Parse document with caching support |
| 396 | |
| 397 | Args: |
| 398 | file_path: Path to the file to parse |
| 399 | output_dir: Output directory (defaults to config.parser_output_dir) |
| 400 | parse_method: Parse method (defaults to config.parse_method) |
| 401 | display_stats: Whether to display content statistics (defaults to config.display_content_stats) |
| 402 | **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) |
| 403 | |
| 404 | Returns: |
| 405 | tuple[List[Dict[str, Any]], str]: (content_list, doc_id) |
| 406 | """ |
| 407 | # Use config defaults if not provided |
| 408 | if output_dir is None: |
| 409 | output_dir = self.config.parser_output_dir |
| 410 | if parse_method is None: |
| 411 | parse_method = self.config.parse_method |
| 412 | if display_stats is None: |
| 413 | display_stats = self.config.display_content_stats |
| 414 | |
| 415 | self.logger.info(f"Starting document parsing: {file_path}") |
| 416 | |
| 417 | file_path = Path(file_path) |
| 418 | if not file_path.exists(): |
| 419 | raise FileNotFoundError(f"File not found: {file_path}") |
| 420 | |
| 421 | callback_file = str(file_path) |
| 422 | callback_manager = getattr(self, "callback_manager", None) |
| 423 | parse_start_time = time.time() |
| 424 | if callback_manager is not None: |
| 425 | callback_manager.dispatch( |
| 426 | "on_parse_start", |
| 427 | file_path=callback_file, |
| 428 | parser=self.config.parser, |
| 429 | ) |
| 430 | |
| 431 | # Generate cache key based on file and configuration |
| 432 | cache_key = self._generate_cache_key(file_path, parse_method, **kwargs) |
| 433 | |
| 434 | # Check cache first |
| 435 | cached_result = await self._get_cached_result( |
| 436 | cache_key, file_path, parse_method, **kwargs |
| 437 | ) |
| 438 | if cached_result is not None: |
| 439 | content_list, doc_id = cached_result |
| 440 | self.logger.info(f"Using cached parsing result for: {file_path}") |
| 441 | if display_stats: |
| 442 | self.logger.info( |
| 443 | f"* Total blocks in cached content_list: {len(content_list)}" |
no test coverage detected