Loads the content of a file based on its input type. Parameters: source (str): The path to the source file. input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md"). Returns: list: A list containing a Document object with the loaded content and metadata.
(self, source, input_type)
| 169 | return state |
| 170 | |
def load_file_content(self, source, input_type):
    """
    Loads the content of a file based on its input type.

    Parameters:
        source (str): The path to the source file.
        input_type (str): The type of the input file
            (e.g., "pdf", "csv", "json", "xml", "md").

    Returns:
        list: For "pdf", whatever ``PyPDFLoader.load()`` returns. For
            "csv", "json", "xml" and "md", a single-element list holding a
            Document whose ``page_content`` is the textual file content and
            whose metadata is ``{"source": <input_type>}``.
            Any other ``input_type`` falls through and yields ``None``.

    Raises:
        TimeoutError: If ``self.timeout`` is set and PDF parsing does not
            finish within that many seconds.
        ImportError: If ``input_type`` is "csv" and pandas is not installed.
    """

    if input_type == "pdf":
        from langchain_community.document_loaders import PyPDFLoader

        loader = PyPDFLoader(source)
        # No timeout configured: parse synchronously in the calling thread.
        if self.timeout is None:
            return loader.load()

        # PyPDFLoader.load() can block for large PDFs, so run it in a worker
        # thread and bound the wait. The executor is deliberately NOT used
        # as a context manager: its __exit__ calls shutdown(wait=True),
        # which would join the still-running worker and keep the caller
        # blocked until parsing finished, defeating the timeout.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = executor.submit(loader.load)
        try:
            return future.result(timeout=self.timeout)
        except concurrent.futures.TimeoutError as exc:
            raise TimeoutError(
                f"PDF parsing exceeded timeout of {self.timeout} seconds"
            ) from exc
        finally:
            # Return immediately; the worker cannot be interrupted and
            # releases its resources on its own once load() completes.
            executor.shutdown(wait=False)

    if input_type == "csv":
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "pandas is not installed. Please install it using `pip install pandas`."
            ) from exc
        # The document text is the DataFrame's string rendering.
        return [
            Document(
                page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
            )
        ]

    if input_type == "json":
        with open(source, encoding="utf-8") as f:
            # The document text is the str() of the parsed JSON object.
            return [
                Document(
                    page_content=str(json.load(f)), metadata={"source": "json"}
                )
            ]

    if input_type in ("xml", "md"):
        # XML and Markdown are passed through as raw text.
        with open(source, "r", encoding="utf-8") as f:
            data = f.read()
        return [Document(page_content=data, metadata={"source": input_type})]

    # Unrecognised input types yield None, as before.
    return None
| 222 | |
| 223 | def handle_local_source(self, state, source): |
| 224 | """ |