hub / github.com/AsyncFuncAI/deepwiki-open / OllamaDocumentProcessor

Class OllamaDocumentProcessor

api/ollama_patch.py:62–105 · view source on GitHub ↗

Processes documents for Ollama embeddings one document at a time. The Adalflow Ollama client does not support batch embedding, so each document must be embedded individually.

Source from the content-addressed store, hash-verified

60	return False
61
class OllamaDocumentProcessor(DataComponent):
    """
    Process documents for Ollama embeddings by processing one document at a time.
    Adalflow Ollama Client does not support batch embedding, so we need to process each document individually.

    Documents are dropped from the result (with a log message) when the
    embedder returns no data, returns an empty embedding, returns an embedding
    whose length differs from the first successful one, or raises.
    """

    def __init__(self, embedder: adal.Embedder) -> None:
        """Store the embedder that is invoked once per document."""
        super().__init__()
        self.embedder = embedder

    @staticmethod
    def _document_label(doc, index: int) -> str:
        """Return a name for log messages: the document's file path, else ``document_<index>``."""
        # `meta_data` may be missing *or* explicitly None. This helper is also
        # called from the exception handler, so it must not raise in either
        # case -- otherwise one bad document would abort the whole run.
        meta_data = getattr(doc, 'meta_data', None) or {}
        return meta_data.get('file_path', f'document_{index}')

    def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
        """
        Embed each document individually and return those that succeeded.

        Args:
            documents: Documents to embed. The input is deep-copied, so the
                caller's objects are not modified.

        Returns:
            A list of the copied documents whose ``vector`` was set. All
            returned vectors have the same length as the first successful
            embedding; failed or inconsistent documents are omitted.
        """
        output = deepcopy(documents)
        logger.info(f"Processing {len(output)} documents individually for Ollama embeddings")

        successful_docs = []
        expected_embedding_size = None

        for i, doc in enumerate(tqdm(output, desc="Processing documents for Ollama embeddings")):
            # The try block deliberately spans the whole per-document step:
            # processing is best-effort and any failure only skips that document.
            try:
                # Get embedding for a single document
                result = self.embedder(input=doc.text)
                embedding = result.data[0].embedding if result.data else None

                # An empty embedding is as unusable as a missing one; accepting
                # it would also fix the expected size at 0 and reject every
                # later, valid embedding.
                if embedding is None or len(embedding) == 0:
                    file_path = self._document_label(doc, i)
                    logger.warning(f"Failed to get embedding for document '{file_path}', skipping")
                    continue

                # Validate embedding size consistency against the first success
                if expected_embedding_size is None:
                    expected_embedding_size = len(embedding)
                    logger.info(f"Expected embedding size set to: {expected_embedding_size}")
                elif len(embedding) != expected_embedding_size:
                    file_path = self._document_label(doc, i)
                    logger.warning(f"Document '{file_path}' has inconsistent embedding size {len(embedding)} != {expected_embedding_size}, skipping")
                    continue

                # Assign the embedding to the (copied) document
                doc.vector = embedding
                successful_docs.append(doc)
            except Exception as e:
                file_path = self._document_label(doc, i)
                logger.error(f"Error processing document '{file_path}': {e}, skipping")

        logger.info(f"Successfully processed {len(successful_docs)}/{len(output)} documents with consistent embeddings")
        return successful_docs

Callers 1

prepare_data_pipelineFunction · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected