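Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown, and updating the state. Parameters: state (dict): The current state of the graph. source (str): The URL of the web source to fetch HTML content from. Returns: dict: The updated state with the processed content. Raises: ValueError: If the fetched HTML content is empty or contains only whitespace.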
(self, state, source)
| 264 | return state |
| 265 | |
| 266 | def handle_web_source(self, state, source): |
| 267 | """ |
| 268 | Handles the web source by fetching HTML content from a URL, |
| 269 | optionally converting it to Markdown, and updating the state. |
| 270 | |
| 271 | Parameters: |
| 272 | state (dict): The current state of the graph. |
| 273 | source (str): The URL of the web source to fetch HTML content from. |
| 274 | |
| 275 | Returns: |
| 276 | dict: The updated state with the processed content. |
| 277 | |
| 278 | Raises: |
| 279 | ValueError: If the fetched HTML content is empty or contains only whitespace. |
| 280 | """ |
| 281 | |
| 282 | self.logger.info(f"--- (Fetching HTML from: {source}) ---") |
| 283 | if self.use_soup: |
| 284 | # Apply configured timeout to blocking HTTP requests. If timeout is None, |
| 285 | # don't pass the timeout argument (requests will block until completion). |
| 286 | if self.timeout is None: |
| 287 | response = requests.get(source) |
| 288 | else: |
| 289 | response = requests.get(source, timeout=self.timeout) |
| 290 | if response.status_code == 200: |
| 291 | if not response.text.strip(): |
| 292 | raise ValueError("No HTML body content found in the response.") |
| 293 | |
| 294 | if not self.cut: |
| 295 | parsed_content = cleanup_html(response, source) |
| 296 | |
| 297 | if ( |
| 298 | isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) |
| 299 | and not self.script_creator |
| 300 | or (self.force and not self.script_creator) |
| 301 | ): |
| 302 | parsed_content = convert_to_md(source, parsed_content) |
| 303 | |
| 304 | compressed_document = [Document(page_content=parsed_content)] |
| 305 | else: |
| 306 | self.logger.warning( |
| 307 | f"Failed to retrieve contents from the webpage at url: {source}" |
| 308 | ) |
| 309 | else: |
| 310 | loader_kwargs = {} |
| 311 | |
| 312 | if self.node_config: |
| 313 | loader_kwargs = self.node_config.get("loader_kwargs", {}) |
| 314 | |
| 315 | # If a global timeout is configured on the node and no loader-specific timeout |
| 316 | # was provided, propagate it to ChromiumLoader so it can apply the same limit. |
| 317 | if "timeout" not in loader_kwargs and self.timeout is not None: |
| 318 | loader_kwargs["timeout"] = self.timeout |
| 319 | |
| 320 | if self.browser_base: |
| 321 | try: |
| 322 | from ..docloaders.browser_base import browser_base_fetch |
| 323 | except ImportError: |
no test coverage detected