MCPcopy
hub / github.com/AsyncFuncAI/deepwiki-open / OllamaDocumentProcessor

Class OllamaDocumentProcessor

api/ollama_patch.py:62–105  ·  view source on GitHub ↗

Process documents for Ollama embeddings by processing one document at a time. Adalflow Ollama Client does not support batch embedding, so we need to process each document individually.

Source from the content-addressed store, hash-verified

60 return False
61
class OllamaDocumentProcessor(DataComponent):
    """
    Process documents for Ollama embeddings by processing one document at a time.
    Adalflow Ollama Client does not support batch embedding, so we need to process each document individually.

    Documents are skipped (and logged) rather than aborting the batch when:
      * the embedder returns no data or an empty embedding,
      * the embedding length differs from the first successful embedding,
      * the embedder raises an exception for that document.
    """
    def __init__(self, embedder: adal.Embedder) -> None:
        """Store the embedder that is invoked once per document."""
        super().__init__()
        self.embedder = embedder

    @staticmethod
    def _doc_label(doc, index: int) -> str:
        """Return a label for log messages: the document's file path, else a positional fallback.

        Handles a `meta_data` attribute that is missing *or* set to None, so that
        building a log message can never raise and abort the whole batch.
        """
        meta = getattr(doc, 'meta_data', None) or {}
        return meta.get('file_path', f'document_{index}')

    def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
        """Embed each document individually and return those embedded successfully.

        Args:
            documents: Documents to embed. The input is deep-copied, so the
                caller's objects are not modified.

        Returns:
            A list of the copied documents whose `vector` was set, in input
            order. All returned vectors have the same length.
        """
        output = deepcopy(documents)
        logger.info(f"Processing {len(output)} documents individually for Ollama embeddings")

        successful_docs = []
        # Length of the first usable embedding; every later one must match it.
        expected_embedding_size = None

        for i, doc in enumerate(tqdm(output, desc="Processing documents for Ollama embeddings")):
            try:
                # Get embedding for a single document
                result = self.embedder(input=doc.text)
                data = result.data
                embedding = data[0].embedding if data else None

                # An empty embedding counts as a failure; otherwise it would set the
                # expected size to 0 and cause every real embedding to be rejected.
                if embedding is None or len(embedding) == 0:
                    file_path = self._doc_label(doc, i)
                    logger.warning(f"Failed to get embedding for document '{file_path}', skipping")
                    continue

                # Validate embedding size consistency
                if expected_embedding_size is None:
                    expected_embedding_size = len(embedding)
                    logger.info(f"Expected embedding size set to: {expected_embedding_size}")
                elif len(embedding) != expected_embedding_size:
                    file_path = self._doc_label(doc, i)
                    logger.warning(f"Document '{file_path}' has inconsistent embedding size {len(embedding)} != {expected_embedding_size}, skipping")
                    continue

                # Assign the embedding to the document
                doc.vector = embedding
                successful_docs.append(doc)
            except Exception as e:
                # Best effort: one failing document must not stop the rest.
                file_path = self._doc_label(doc, i)
                logger.error(f"Error processing document '{file_path}': {e}, skipping")

        logger.info(f"Successfully processed {len(successful_docs)}/{len(output)} documents with consistent embeddings")
        return successful_docs

Callers 1

prepare_data_pipeline · Function · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected