Checks if a website is scrapeable based on the robots.txt file and updates the state with the scrapeability status. The method constructs a prompt for the language model, submits it, and parses the output to determine if scraping is allowed. Args: state (dict): The current state of the graph.
(self, state: dict)
| 55 | ) |
| 56 | |
| 57 | def execute(self, state: dict) -> dict: |
| 58 | """ |
| 59 | Checks if a website is scrapeable based on the robots.txt file and updates the state |
| 60 | with the scrapeability status. The method constructs a prompt for the language model, |
| 61 | submits it, and parses the output to determine if scraping is allowed. |
| 62 | |
| 63 | Args: |
| 64 | state (dict): The current state of the graph. The input keys will be used to fetch the input data from the state. |
| 65 | |
| 66 | Returns: |
| 67 | dict: The updated state with the output key containing the scrapeability status. |
| 68 | |
| 69 | Raises: |
| 70 | KeyError: If the input keys are not found in the state, indicating that the |
| 71 | necessary information for checking scrapeability is missing. |
| 72 | (No KeyError is raised when the large language model is not found in the robots_dictionary; the model name itself is then used as the agent.) |
| 73 | ValueError: If the website is not scrapeable based on the robots.txt file and |
| 74 | scraping is not enforced. |
| 75 | """ |
| 76 | |
| 77 | self.logger.info(f"--- Executing {self.node_name} Node ---") |
| 78 | |
| 79 | input_keys = self.get_input_keys(state) |
| 80 | |
| 81 | input_data = [state[key] for key in input_keys] |
| 82 | |
| 83 | source = input_data[0] |
| 84 | output_parser = CommaSeparatedListOutputParser() |
| 85 | |
| 86 | if not source.startswith("http"): |
| 87 | raise ValueError("Operation not allowed") |
| 88 | |
| 89 | else: |
| 90 | parsed_url = urlparse(source) |
| 91 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" |
| 92 | from langchain_community.document_loaders import AsyncChromiumLoader |
| 93 | loader = AsyncChromiumLoader(f"{base_url}/robots.txt") |
| 94 | document = loader.load() |
| 95 | if "ollama" in self.llm_model.model: |
| 96 | self.llm_model.model = self.llm_model.model.split("/")[-1] |
| 97 | model = self.llm_model.model.split("/")[-1] |
| 98 | else: |
| 99 | model = self.llm_model.model |
| 100 | try: |
| 101 | agent = robots_dictionary[model] |
| 102 | |
| 103 | except KeyError: |
| 104 | agent = model |
| 105 | |
| 106 | prompt = PromptTemplate( |
| 107 | template=TEMPLATE_ROBOT, |
| 108 | input_variables=["path"], |
| 109 | partial_variables={"context": document, "agent": agent}, |
| 110 | ) |
| 111 | |
| 112 | chain = prompt | self.llm_model | output_parser |
| 113 | is_scrapable = chain.invoke({"path": source})[0] |
| 114 |