Executes the node's logic to parse the HTML document content and split it into chunks. Args: state (dict): The current state of the graph. The input keys will be used to fetch the correct data from the state. Returns: dict: The updated state with the output key containing the parsed content chunks.
(self, state: dict)
| 60 | self.chunk_size = node_config.get("chunk_size") |
| 61 | |
| 62 | def execute(self, state: dict) -> dict: |
| 63 | """ |
| 64 | Executes the node's logic to parse the HTML document content and split it into chunks. |
| 65 | |
| 66 | Args: |
| 67 | state (dict): The current state of the graph. The input keys will be used to fetch the |
| 68 | correct data from the state. |
| 69 | |
| 70 | Returns: |
| 71 | dict: The updated state with the output key containing the parsed content chunks. |
| 72 | |
| 73 | Raises: |
| 74 | KeyError: If the input keys are not found in the state, indicating that the |
| 75 | necessary information for parsing the content is missing. |
| 76 | """ |
| 77 | |
| 78 | self.logger.info(f"--- Executing {self.node_name} Node ---") |
| 79 | |
| 80 | input_keys = self.get_input_keys(state) |
| 81 | input_data = [state[key] for key in input_keys] |
| 82 | docs_transformed = input_data[0] |
| 83 | source = input_data[1] if self.parse_urls else None |
| 84 | |
| 85 | if self.parse_html: |
| 86 | docs_transformed = Html2TextTransformer( |
| 87 | ignore_links=False |
| 88 | ).transform_documents(input_data[0]) |
| 89 | docs_transformed = docs_transformed[0] |
| 90 | |
| 91 | link_urls, img_urls = self._extract_urls( |
| 92 | docs_transformed.page_content, source |
| 93 | ) |
| 94 | |
| 95 | chunks = split_text_into_chunks( |
| 96 | text=docs_transformed.page_content, |
| 97 | chunk_size=self.chunk_size - 250, |
| 98 | ) |
| 99 | else: |
| 100 | docs_transformed = docs_transformed[0] |
| 101 | |
| 102 | try: |
| 103 | link_urls, img_urls = self._extract_urls( |
| 104 | docs_transformed.page_content, source |
| 105 | ) |
| 106 | except Exception: |
| 107 | link_urls, img_urls = "", "" |
| 108 | |
| 109 | chunk_size = self.chunk_size |
| 110 | chunk_size = min(chunk_size - 500, int(chunk_size * 0.8)) |
| 111 | |
| 112 | if isinstance(docs_transformed, Document): |
| 113 | chunks = split_text_into_chunks( |
| 114 | text=docs_transformed.page_content, |
| 115 | chunk_size=chunk_size, |
| 116 | ) |
| 117 | else: |
| 118 | chunks = split_text_into_chunks( |
| 119 | text=docs_transformed, chunk_size=chunk_size |
nothing calls this directly
no test coverage detected