MCPcopy Index your code
hub / github.com/unclecode/crawl4ai / extract

Method extract

crawl4ai/extraction_strategy.py:94–141  ·  view source on GitHub ↗
(self, url: str, ix:int, html: str)

Source from the content-addressed store, hash-verified

92
93
def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
    """Extract blocks from one piece of HTML by prompting the configured LLM.

    The HTML is passed through ``sanitize_html`` and ``escape_json_string``,
    substituted into a prompt template together with the URL (and, when set,
    the instruction and the JSON schema), and sent to the provider via
    ``perform_completion_with_backoff``. The completion text is then parsed
    into a list of block dictionaries.

    Args:
        url: URL associated with ``html``; substituted into the prompt and
            shown in log messages.
        ix: Block index; used only in log messages.
        html: HTML content to extract from.

    Returns:
        The list of extracted blocks. When the ``blocks`` section returned
        by ``extract_xml_data`` parses as JSON, every block is tagged with
        ``"error": False``. Otherwise the objects recovered by
        ``split_and_parse_json_objects`` are returned, followed by a single
        ``"error": True`` block carrying the unparsed remainder, if any.
    """
    # Function-scope import keeps this method self-contained.
    import re

    if self.verbose:
        print(f"[LOG] Call LLM for {url} - block index: {ix}")

    variable_values = {
        "URL": url,
        "HTML": escape_json_string(sanitize_html(html)),
    }

    # Later assignments win: schema template > instruction template > default.
    prompt_template = PROMPT_EXTRACT_BLOCKS
    if self.instruction:
        variable_values["REQUEST"] = self.instruction
        prompt_template = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

    if self.extract_type == "schema" and self.schema:
        variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
        prompt_template = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

    # Fill all placeholders in ONE pass over the template. Chained
    # str.replace() calls would re-scan text that was already substituted,
    # so a literal "{REQUEST}" or "{SCHEMA}" inside the page's HTML would be
    # expanded as if it were part of the template.
    placeholder_pattern = re.compile(
        "|".join(re.escape("{" + name + "}") for name in variable_values)
    )
    prompt_with_variables = placeholder_pattern.sub(
        lambda match: variable_values[match.group(0)[1:-1]],
        prompt_template,
    )

    response = perform_completion_with_backoff(
        self.provider,
        prompt_with_variables,
        self.api_token,
        base_url=self.api_base or self.base_url,
        extra_args=self.extra_args,
    )
    content = response.choices[0].message.content

    try:
        blocks = json.loads(extract_xml_data(["blocks"], content)["blocks"])
        for block in blocks:
            block["error"] = False
    except Exception:
        # Deliberate best-effort fallback: whatever went wrong with the
        # structured parse, salvage the JSON objects that can be recovered
        # and report the rest as a single error block.
        parsed, unparsed = split_and_parse_json_objects(content)
        blocks = parsed
        if unparsed:
            blocks.append(
                {
                    "index": 0,
                    "error": True,
                    "tags": ["error"],
                    "content": unparsed,
                }
            )

    if self.verbose:
        print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
    return blocks
142
143 def _merge(self, documents, chunk_token_threshold, overlap):
144 chunks = []

Callers

nothing calls this directly

Calls 5

escape_json_string · Function · 0.85
sanitize_html · Function · 0.85
extract_xml_data · Function · 0.85

Tested by

no test coverage detected