hub / github.com/ScrapeGraphAI/Scrapegraph-ai / load_file_content

Method load_file_content

scrapegraphai/nodes/fetch_node.py:171–221 · view source on GitHub ↗

Loads the content of a file based on its input type. Parameters: source (str): The path to the source file. input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md"). Returns: list: A list containing a Document object with the loaded content and metadata.

(self, source, input_type)

Source from the content-addressed store, hash-verified

169	return state
170
def load_file_content(self, source, input_type):
    """
    Loads the content of a file based on its input type.

    Parameters:
        source (str): The path to the source file.
        input_type (str): The type of the input file
            (e.g., "pdf", "csv", "json", "xml", "md").

    Returns:
        list: For "csv", "json", "xml" and "md", a single-element list holding a
        Document whose ``metadata["source"]`` is the input type. For "pdf", the
        value returned by ``PyPDFLoader.load()``. ``None`` is returned when
        ``input_type`` matches none of the supported types.

    Raises:
        TimeoutError: If ``self.timeout`` is set and PDF parsing does not finish
            within that many seconds.
        ImportError: If ``input_type`` is "csv" and pandas is not installed.
    """

    if input_type == "pdf":
        from langchain_community.document_loaders import PyPDFLoader

        loader = PyPDFLoader(source)
        # PyPDFLoader.load() can be blocking for large PDFs. Run it in a thread and
        # enforce the configured timeout if provided.
        if self.timeout is None:
            return loader.load()

        # The executor is deliberately NOT used as a context manager: leaving a
        # ``with`` block calls ``shutdown(wait=True)``, which would block until
        # ``loader.load`` finished and delay the TimeoutError until the parse
        # completed anyway. ``shutdown(wait=False)`` lets the timeout surface
        # immediately. The worker thread cannot be interrupted, so it may keep
        # running in the background until the load returns.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(loader.load)
            try:
                return future.result(timeout=self.timeout)
            except concurrent.futures.TimeoutError as exc:
                raise TimeoutError(
                    f"PDF parsing exceeded timeout of {self.timeout} seconds"
                ) from exc
        finally:
            executor.shutdown(wait=False)
    elif input_type == "csv":
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "pandas is not installed. Please install it using `pip install pandas`."
            ) from exc
        # NOTE(review): str(DataFrame) uses pandas' display settings, so large
        # frames may be abbreviated in page_content -- confirm this is intended.
        return [
            Document(
                page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
            )
        ]
    elif input_type == "json":
        with open(source, encoding="utf-8") as f:
            # The parsed object is stored via str(), i.e. its Python repr rather
            # than re-serialized JSON.
            return [
                Document(
                    page_content=str(json.load(f)), metadata={"source": "json"}
                )
            ]
    elif input_type == "xml" or input_type == "md":
        with open(source, "r", encoding="utf-8") as f:
            data = f.read()
        return [Document(page_content=data, metadata={"source": input_type})]

    # Unrecognised input types fall through without loading anything.
    return None
222
223	def handle_local_source(self, state, source):
224	"""

Callers 1

handle_fileMethod · 0.95

Calls 1

loadMethod · 0.45

Tested by

no test coverage detected