Complete document processing workflow. Args: file_path: Path to the file to process; output_dir: output directory (defaults to config.parser_output_dir); parse_method: Parse method (defaults to config.parse_method); display_stats: Whether to display content statistics (defaults to config.display_content_stats)
(
self,
file_path: str,
output_dir: str = None,
parse_method: str = None,
display_stats: bool = None,
split_by_character: str | None = None,
split_by_character_only: bool = False,
doc_id: str | None = None,
file_name: str | None = None,
**kwargs,
)
| 1652 | } |
| 1653 | |
| 1654 | async def process_document_complete( |
| 1655 | self, |
| 1656 | file_path: str, |
| 1657 | output_dir: str = None, |
| 1658 | parse_method: str = None, |
| 1659 | display_stats: bool = None, |
| 1660 | split_by_character: str | None = None, |
| 1661 | split_by_character_only: bool = False, |
| 1662 | doc_id: str | None = None, |
| 1663 | file_name: str | None = None, |
| 1664 | **kwargs, |
| 1665 | ): |
| 1666 | """ |
| 1667 | Complete document processing workflow |
| 1668 | |
| 1669 | Args: |
| 1670 | file_path: Path to the file to process |
| 1671 | output_dir: output directory (defaults to config.parser_output_dir) |
| 1672 | parse_method: Parse method (defaults to config.parse_method) |
| 1673 | display_stats: Whether to display content statistics (defaults to config.display_content_stats) |
| 1674 | split_by_character: Optional character to split the text by |
| 1675 | split_by_character_only: If True, split only by the specified character |
| 1676 | doc_id: Optional document ID, if not provided will be generated from content |
| 1677 | **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) |
| 1678 | """ |
| 1679 | callback_manager = getattr(self, "callback_manager", None) |
| 1680 | doc_start_time = time.time() |
| 1681 | stage = "parse" |
| 1682 | file_name = file_name or self._get_file_reference(file_path) |
| 1683 | |
| 1684 | try: |
| 1685 | # Ensure LightRAG is initialized |
| 1686 | init_result = await self._ensure_lightrag_initialized() |
| 1687 | if not init_result or not init_result.get("success"): |
| 1688 | raise RuntimeError( |
| 1689 | f"LightRAG initialization failed: {(init_result or {}).get('error', 'unknown error')}" |
| 1690 | ) |
| 1691 | |
| 1692 | # Use config defaults if not provided |
| 1693 | if output_dir is None: |
| 1694 | output_dir = self.config.parser_output_dir |
| 1695 | if parse_method is None: |
| 1696 | parse_method = self.config.parse_method |
| 1697 | if display_stats is None: |
| 1698 | display_stats = self.config.display_content_stats |
| 1699 | |
| 1700 | self.logger.info(f"Starting complete document processing: {file_path}") |
| 1701 | |
| 1702 | # Step 1: Parse document |
| 1703 | content_list, content_based_doc_id = await self.parse_document( |
| 1704 | file_path, output_dir, parse_method, display_stats, **kwargs |
| 1705 | ) |
| 1706 | |
| 1707 | # Use provided doc_id or fall back to content-based doc_id |
| 1708 | if doc_id is None: |
| 1709 | doc_id = content_based_doc_id |
| 1710 | |
| 1711 | # Step 2: Separate text and multimodal content |
nothing calls this directly
no test coverage detected