(self, url: str, ix:int, html: str)
| 92 | |
| 93 | |
def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
    """Ask the LLM to extract blocks from one piece of HTML and parse its reply.

    The prompt template is chosen from the instance configuration: the plain
    block-extraction prompt by default, the instruction variant when
    ``self.instruction`` is set, and the schema variant when
    ``self.extract_type == "schema"`` and ``self.schema`` is set (the schema
    variant wins if both apply). ``{NAME}`` placeholders in the template are
    then replaced with the collected values.

    Args:
        url: Substituted for ``{URL}`` in the prompt and shown in log output.
        ix: Block index; used only in log output.
        html: Passed through ``sanitize_html`` and ``escape_json_string``
            before being substituted for ``{HTML}``.

    Returns:
        The list of extracted blocks. When the ``blocks`` payload of the
        reply parses as JSON, every block gets ``"error": False``. Otherwise
        the result is whatever ``split_and_parse_json_objects`` could parse,
        plus one trailing error block holding the unparsed remainder, if any.
    """
    print(f"[LOG] Call LLM for {url} - block index: {ix}")

    # Placeholder name -> replacement text; insertion order is the
    # substitution order below.
    substitutions = {
        "URL": url,
        "HTML": escape_json_string(sanitize_html(html)),
    }

    template = PROMPT_EXTRACT_BLOCKS
    if self.instruction:
        substitutions["REQUEST"] = self.instruction
        template = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
    if self.extract_type == "schema" and self.schema:
        substitutions["SCHEMA"] = json.dumps(self.schema, indent=2)
        template = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

    prompt = template
    for placeholder, replacement in substitutions.items():
        prompt = prompt.replace("{" + placeholder + "}", replacement)

    response = perform_completion_with_backoff(
        self.provider,
        prompt,
        self.api_token,
        base_url=self.api_base or self.base_url,
        extra_args=self.extra_args,
    )

    try:
        # Preferred path: the reply wraps a JSON array in a "blocks" XML tag.
        payload = extract_xml_data(
            ["blocks"], response.choices[0].message.content
        )["blocks"]
        blocks = json.loads(payload)
        for entry in blocks:
            entry["error"] = False
    except Exception:
        # Best-effort fallback: salvage whatever JSON objects can be parsed
        # from the raw reply and report the remainder as a single error block.
        blocks, leftover = split_and_parse_json_objects(
            response.choices[0].message.content
        )
        if leftover:
            blocks.append(
                {
                    "index": 0,
                    "error": True,
                    "tags": ["error"],
                    "content": leftover,
                }
            )

    if self.verbose:
        print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
    return blocks
| 142 | |
| 143 | def _merge(self, documents, chunk_token_threshold, overlap): |
| 144 | chunks = [] |
nothing calls this directly
no test coverage detected