A node responsible for fetching the HTML content of a specified URL and updating the graph's state with this content. It uses ChromiumLoader to fetch the content from a web page asynchronously (with proxy protection). This node acts as a starting point in many scraping workflows, preparing the state with the necessary HTML content for further processing by subsequent nodes in the graph.
| 17 | |
| 18 | |
class FetchNode(BaseNode):
    """
    Graph node that fetches the HTML content of a given URL and stores it in
    the graph's state. According to the original documentation, the page is
    loaded asynchronously through ChromiumLoader (with proxy protection).

    It typically sits at the start of a scraping workflow, so that later nodes
    in the graph find the HTML content they need already present in the state.

    Attributes:
        headless (bool): Whether the browser should run in headless mode
            (config key "headless", default True).
        verbose (bool): Whether to print verbose output during execution
            (config key "verbose", default False).
        use_soup (bool): Value of the "use_soup" config key (default False).
        loader_kwargs (dict): Value of the "loader_kwargs" config key
            (default {}); presumably forwarded to the loader - TODO confirm.
        llm_model: Value of the "llm_model" config key (default {}).
        force (bool): Value of the "force" config key (default False).
        script_creator (bool): Value of the "script_creator" config key
            (default False).
        openai_md_enabled (bool): Value of the "openai_md_enabled" config key
            (default False).
        timeout: Timeout in seconds for blocking operations (HTTP requests,
            PDF parsing, etc.); None means no timeout is applied.
        cut (bool): Value of the "cut" config key.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "Fetch".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "Fetch",
    ):
        super().__init__(node_name, "node", input, output, 1, node_config)

        # Resolve the configuration mapping once; a missing config behaves
        # like an empty one for every option except `timeout` and `cut`.
        has_config = node_config is not None
        settings = node_config if has_config else {}

        self.headless = settings.get("headless", True)
        self.verbose = settings.get("verbose", False)
        self.use_soup = settings.get("use_soup", False)
        self.loader_kwargs = settings.get("loader_kwargs", {})
        self.llm_model = settings.get("llm_model", {})
        self.force = settings.get("force", False)
        self.script_creator = settings.get("script_creator", False)
        self.openai_md_enabled = settings.get("openai_md_enabled", False)

        # Timeout in seconds for blocking operations (HTTP requests, PDF
        # parsing, etc.). None disables the timeout.
        # NOTE(review): with no node_config at all the timeout is None, while
        # a config that merely omits "timeout" yields 30 - confirm this
        # asymmetry is intended.
        self.timeout = settings.get("timeout", 30) if has_config else None

        # NOTE(review): same asymmetry here - False without any config, True
        # when a config is given but lacks the "cut" key.
        self.cut = settings.get("cut", True) if has_config else False
| 76 |
no outgoing calls