MCPcopy Index your code
hub / github.com/unclecode/crawl4ai / extract_blocks_batch

Function extract_blocks_batch

crawl4ai/utils.py:880–921  ·  view source on GitHub ↗
(batch_data, provider = "groq/llama3-70b-8192", api_token = None)

Source from the content-addressed store, hash-verified

878 return blocks
879
def extract_blocks_batch(batch_data, provider="groq/llama3-70b-8192", api_token=None):
    """Extract content blocks from several pages with one batched LLM call.

    One prompt is built per ``(url, html)`` pair by substituting ``{URL}`` and
    ``{HTML}`` in ``PROMPT_EXTRACT_BLOCKS``; all prompts are then sent together
    through ``litellm.batch_completion``.

    Args:
        batch_data: Iterable of ``(url, html)`` string pairs.
        provider: litellm model identifier.
        api_token: API key for the provider. When omitted and the provider is
            a ``groq/`` model, ``GROQ_API_KEY`` from the environment is used.

    Returns:
        A flat list holding the blocks of every response, in response order.
        A response that cannot be parsed into a list of blocks contributes a
        single placeholder block tagged ``"error"`` instead.
    """
    messages = []
    for url, html in batch_data:
        # Same substitution order as the placeholders are listed: URL, then HTML.
        prompt = PROMPT_EXTRACT_BLOCKS.replace("{URL}", url).replace("{HTML}", html)
        messages.append([{"role": "user", "content": prompt}])

    # Nothing to extract: skip the provider round-trip entirely.
    if not messages:
        return []

    # Imported lazily so the module can be loaded without litellm installed.
    from litellm import batch_completion

    # Only fall back to the Groq key for Groq models; handing it to another
    # provider would be sending the wrong credential.
    if not api_token and str(provider).startswith("groq/"):
        api_token = os.getenv('GROQ_API_KEY', None)

    completion_kwargs = {
        "model": provider,
        "messages": messages,
        "temperature": 0.01,
    }
    if api_token:
        # Previously the resolved token was never passed on, so an explicit
        # api_token argument had no effect. Extra keyword arguments are
        # forwarded by batch_completion to the underlying completion call.
        completion_kwargs["api_key"] = api_token

    responses = batch_completion(**completion_kwargs)

    all_blocks = []
    for response in responses:
        try:
            raw_blocks = extract_xml_data(
                ["blocks"], response.choices[0].message.content
            )['blocks']
            blocks = json.loads(raw_blocks)
            if not isinstance(blocks, list):
                raise ValueError("expected a JSON list of blocks")
        except Exception:
            # Deliberate best-effort: one bad response (malformed output, or a
            # failed request without `.choices`) must not sink the whole batch.
            blocks = [{
                "index": 0,
                "tags": ["error"],
                "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
                "questions": ["What went wrong during the block extraction process?"]
            }]
        # Extend in place instead of sum(list_of_lists, []), which is quadratic.
        all_blocks.extend(blocks)

    return all_blocks
922
923def merge_chunks_based_on_token_threshold(chunks, token_threshold):
924 """

Callers

nothing calls this directly

Calls 1

extract_xml_data — Function · 0.85

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…