MCPcopy
hub / github.com/HKUDS/RAG-Anything / parse_document

Method parse_document

raganything/processor.py:386–605  ·  view source on GitHub ↗

Parse document with caching support. Args: file_path: Path to the file to parse; output_dir: Output directory (defaults to config.parser_output_dir); parse_method: Parse method (defaults to config.parse_method); display_stats: Whether to display content statistics (defaults to config.display_content_stats).

(
        self,
        file_path: str,
        output_dir: str = None,
        parse_method: str = None,
        display_stats: bool = None,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

384 self.logger.warning(f"Error storing to parse cache: {e}")
385
386 async def parse_document(
387 self,
388 file_path: str,
389 output_dir: str = None,
390 parse_method: str = None,
391 display_stats: bool = None,
392 **kwargs,
393 ) -> tuple[List[Dict[str, Any]], str]:
394 """
395 Parse document with caching support
396
397 Args:
398 file_path: Path to the file to parse
399 output_dir: Output directory (defaults to config.parser_output_dir)
400 parse_method: Parse method (defaults to config.parse_method)
401 display_stats: Whether to display content statistics (defaults to config.display_content_stats)
402 **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
403
404 Returns:
405 tuple[List[Dict[str, Any]], str]: (content_list, doc_id)
406 """
407 # Use config defaults if not provided
408 if output_dir is None:
409 output_dir = self.config.parser_output_dir
410 if parse_method is None:
411 parse_method = self.config.parse_method
412 if display_stats is None:
413 display_stats = self.config.display_content_stats
414
415 self.logger.info(f"Starting document parsing: {file_path}")
416
417 file_path = Path(file_path)
418 if not file_path.exists():
419 raise FileNotFoundError(f"File not found: {file_path}")
420
421 callback_file = str(file_path)
422 callback_manager = getattr(self, "callback_manager", None)
423 parse_start_time = time.time()
424 if callback_manager is not None:
425 callback_manager.dispatch(
426 "on_parse_start",
427 file_path=callback_file,
428 parser=self.config.parser,
429 )
430
431 # Generate cache key based on file and configuration
432 cache_key = self._generate_cache_key(file_path, parse_method, **kwargs)
433
434 # Check cache first
435 cached_result = await self._get_cached_result(
436 cache_key, file_path, parse_method, **kwargs
437 )
438 if cached_result is not None:
439 content_list, doc_id = cached_result
440 self.logger.info(f"Using cached parsing result for: {file_path}")
441 if display_stats:
442 self.logger.info(
443 f"* Total blocks in cached content_list: {len(content_list)}"

Calls 12

_generate_cache_key · Method · 0.95
_get_cached_result · Method · 0.95
_store_cached_result · Method · 0.95
get_parser · Function · 0.90
MineruParser · Class · 0.90
dispatch · Method · 0.80
get · Method · 0.80
items · Method · 0.80
info · Method · 0.45
warning · Method · 0.45
error · Method · 0.45

Tested by

no test coverage detected