Function extract_blocks_batch

crawl4ai/utils.py:880–921 · view source on GitHub ↗

(batch_data, provider = "groq/llama3-70b-8192", api_token = None)

Source from the content-addressed store, hash-verified

878	return blocks
879
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
    """Extract content blocks from several HTML pages with one batched LLM call.

    A prompt is built for every ``(url, html)`` pair by substituting the
    ``{URL}`` and ``{HTML}`` placeholders in ``PROMPT_EXTRACT_BLOCKS``; all
    prompts are sent together through ``litellm.batch_completion``.

    Args:
        batch_data: Iterable of ``(url, html)`` pairs. Both values are passed
            to ``str.replace`` and therefore must be strings.
        provider: Model identifier handed to litellm as ``model``.
        api_token: API key forwarded to litellm as ``api_key``. When omitted
            and ``provider`` is a ``groq/`` model, ``GROQ_API_KEY`` from the
            environment is used; otherwise litellm resolves credentials
            itself.

    Returns:
        A single flat list containing the blocks of every response, in
        response order. A response that cannot be turned into a JSON list of
        blocks contributes one placeholder block tagged ``"error"``.
    """
    from litellm import batch_completion

    # The Groq environment key is only a sensible fallback for Groq models;
    # forwarding it for any other provider would override that provider's
    # own credentials with a key it cannot accept.
    if not api_token and str(provider).startswith("groq/"):
        api_token = os.getenv('GROQ_API_KEY', None)

    messages = []
    for url, html in batch_data:
        variable_values = {
            "URL": url,
            "HTML": html,
        }

        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
        for variable, value in variable_values.items():
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", value
            )

        messages.append([{"role": "user", "content": prompt_with_variables}])

    # Nothing to ask the model: skip the provider round-trip entirely.
    if not messages:
        return []

    # Forward the key only when there is one, so litellm's own credential
    # lookup stays in effect otherwise.
    completion_kwargs = {}
    if api_token:
        completion_kwargs["api_key"] = api_token

    responses = batch_completion(
        model=provider,
        messages=messages,
        temperature=0.01,
        **completion_kwargs,
    )

    all_blocks = []
    for response in responses:
        try:
            # NOTE(review): extract_xml_data is assumed to return a mapping
            # keyed by tag name holding the raw tag text - confirm against
            # its definition.
            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
            blocks = json.loads(blocks)
            # Valid JSON that is not a list (e.g. a lone object) cannot be
            # merged into the flat result; treat it like any other failure.
            if not isinstance(blocks, list):
                raise ValueError("expected a JSON list of blocks")
        except Exception:
            # Deliberate best-effort: one bad response must not discard the
            # rest of the batch, so it is replaced by a placeholder block.
            blocks = [{
                "index": 0,
                "tags": ["error"],
                "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
                "questions": ["What went wrong during the block extraction process?"]
            }]
        # extend() flattens in linear time; sum(list_of_lists, []) re-copies
        # the accumulator on every addition.
        all_blocks.extend(blocks)

    return all_blocks
922
923	def merge_chunks_based_on_token_threshold(chunks, token_threshold):
924	"""

nothing calls this directly

extract_xml_data · Function · 0.85

no test coverage detected

searching dependent graphs…