MCPcopy Index your code
hub / github.com/ScrapeGraphAI/Scrapegraph-ai / load_file_content

Method load_file_content

scrapegraphai/nodes/fetch_node.py:171–221  ·  view source on GitHub ↗

Loads the content of a file based on its input type. Parameters: source (str): The path to the source file. input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md"). Returns: list: A list containing a Document object with the loaded content and metadata.

(self, source, input_type)

Source from the content-addressed store, hash-verified

169 return state
170
def load_file_content(self, source, input_type):
    """
    Loads the content of a file based on its input type.

    Parameters:
        source (str): The path to the source file.
        input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md").

    Returns:
        list: A list containing a Document object with the loaded content and metadata.
            For "pdf" this is whatever ``PyPDFLoader.load()`` returns. An
            unrecognised ``input_type`` matches no branch and yields ``None``.

    Raises:
        TimeoutError: If ``self.timeout`` is set and PDF parsing exceeds it.
        ImportError: If ``input_type`` is "csv" and pandas is not installed.
    """

    if input_type == "pdf":
        from langchain_community.document_loaders import PyPDFLoader

        loader = PyPDFLoader(source)
        if self.timeout is None:
            return loader.load()

        # PyPDFLoader.load() can be blocking for large PDFs. Run it in a worker
        # thread and enforce the configured timeout.
        #
        # The executor is deliberately NOT used as a context manager: leaving a
        # `with ThreadPoolExecutor(...)` block calls shutdown(wait=True), which
        # would block until load() finishes and so defeat the timeout.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = executor.submit(loader.load)
        try:
            return future.result(timeout=self.timeout)
        except concurrent.futures.TimeoutError as exc:
            raise TimeoutError(
                f"PDF parsing exceeded timeout of {self.timeout} seconds"
            ) from exc
        finally:
            # Do not wait for the worker; on timeout the thread cannot be
            # killed and keeps running in the background until load() returns.
            executor.shutdown(wait=False)
    elif input_type == "csv":
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "pandas is not installed. Please install it using `pip install pandas`."
            ) from exc
        # The whole DataFrame is stored as its string representation.
        return [
            Document(
                page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
            )
        ]
    elif input_type == "json":
        with open(source, encoding="utf-8") as f:
            # The parsed JSON is stored as the str() of the Python object.
            return [
                Document(
                    page_content=str(json.load(f)), metadata={"source": "json"}
                )
            ]
    elif input_type in ("xml", "md"):
        # Text formats are passed through verbatim.
        with open(source, "r", encoding="utf-8") as f:
            data = f.read()
        return [Document(page_content=data, metadata={"source": input_type})]

    # Unrecognised input types are not an error here; the caller receives None.
    return None
222
223 def handle_local_source(self, state, source):
224 """

Callers 1

handle_fileMethod · 0.95

Calls 1

loadMethod · 0.45

Tested by

no test coverage detected