MCPcopy
hub / github.com/ScrapeGraphAI/Scrapegraph-ai / handle_web_source

Method handle_web_source

scrapegraphai/nodes/fetch_node.py:266–409  ·  view source on GitHub ↗

Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown, and updating the state. Parameters: state (dict): The current state of the graph. source (str): The URL of the web source to fetch HTML content from. Returns: dict: The updated state with the processed content.

(self, state, source)

Source from the content-addressed store, hash-verified

264 return state
265
266 def handle_web_source(self, state, source):
267 """
268 Handles the web source by fetching HTML content from a URL,
269 optionally converting it to Markdown, and updating the state.
270
271 Parameters:
272 state (dict): The current state of the graph.
273 source (str): The URL of the web source to fetch HTML content from.
274
275 Returns:
276 dict: The updated state with the processed content.
277
278 Raises:
279 ValueError: If the fetched HTML content is empty or contains only whitespace.
280 """
281
282 self.logger.info(f"--- (Fetching HTML from: {source}) ---")
283 if self.use_soup:
284 # Apply configured timeout to blocking HTTP requests. If timeout is None,
285 # don't pass the timeout argument (requests will block until completion).
286 if self.timeout is None:
287 response = requests.get(source)
288 else:
289 response = requests.get(source, timeout=self.timeout)
290 if response.status_code == 200:
291 if not response.text.strip():
292 raise ValueError("No HTML body content found in the response.")
293
294 if not self.cut:
295 parsed_content = cleanup_html(response, source)
296
297 if (
298 isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI))
299 and not self.script_creator
300 or (self.force and not self.script_creator)
301 ):
302 parsed_content = convert_to_md(source, parsed_content)
303
304 compressed_document = [Document(page_content=parsed_content)]
305 else:
306 self.logger.warning(
307 f"Failed to retrieve contents from the webpage at url: {source}"
308 )
309 else:
310 loader_kwargs = {}
311
312 if self.node_config:
313 loader_kwargs = self.node_config.get("loader_kwargs", {})
314
315 # If a global timeout is configured on the node and no loader-specific timeout
316 # was provided, propagate it to ChromiumLoader so it can apply the same limit.
317 if "timeout" not in loader_kwargs and self.timeout is not None:
318 loader_kwargs["timeout"] = self.timeout
319
320 if self.browser_base:
321 try:
322 from ..docloaders.browser_base import browser_base_fetch
323 except ImportError:

Callers 1

execute · Method · 0.95

Calls 11

load · Method · 0.95
cleanup_html · Function · 0.90
convert_to_md · Function · 0.90
browser_base_fetch · Function · 0.85
scrape_do_fetch · Function · 0.85
PlasmateLoader · Class · 0.85
ChromiumLoader · Class · 0.85
get · Method · 0.80
warning · Method · 0.80
update · Method · 0.80
info · Method · 0.45

Tested by

no test coverage detected