Embeds documents with Ollama one document at a time. The AdalFlow Ollama client does not support batch embedding, so each document must be embedded individually.
| 60 | return False |
| 61 | |
class OllamaDocumentProcessor(DataComponent):
    """
    Process documents for Ollama embeddings by processing one document at a time.
    Adalflow Ollama Client does not support batch embedding, so we need to process each document individually.

    Processing is best-effort: a document whose embedding call fails, returns
    no data, or yields a vector whose length differs from the first successful
    one is logged and skipped instead of aborting the run.
    """

    def __init__(self, embedder: adal.Embedder) -> None:
        """Store the embedder that is invoked once per document."""
        super().__init__()
        self.embedder = embedder

    @staticmethod
    def _doc_label(doc, index: int) -> str:
        """Return a label for log messages: the document's file path or a positional fallback."""
        # `meta_data` may be missing *or* explicitly None; `or {}` covers both
        # so that building a log message can never raise on its own.
        meta = getattr(doc, 'meta_data', None) or {}
        return meta.get('file_path', f'document_{index}')

    def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
        """
        Embed each document individually and return the ones that succeeded.

        The input is deep-copied first, so the caller's documents are never
        mutated; vectors are assigned to the copies.

        Args:
            documents: Documents whose ``text`` is passed to the embedder.

        Returns:
            A list with the copied documents that received an embedding whose
            length matches the first successful embedding. Skipped documents
            are omitted, so the result may be shorter than the input.
        """
        output = deepcopy(documents)
        logger.info(f"Processing {len(output)} documents individually for Ollama embeddings")

        successful_docs = []
        # Length of the first successful embedding; later ones must match it.
        expected_embedding_size = None

        for i, doc in enumerate(tqdm(output, desc="Processing documents for Ollama embeddings")):
            try:
                # Get embedding for a single document
                result = self.embedder(input=doc.text)
                if not result.data:
                    file_path = self._doc_label(doc, i)
                    logger.warning(f"Failed to get embedding for document '{file_path}', skipping")
                    continue

                embedding = result.data[0].embedding

                # Validate embedding size consistency
                if expected_embedding_size is None:
                    expected_embedding_size = len(embedding)
                    logger.info(f"Expected embedding size set to: {expected_embedding_size}")
                elif len(embedding) != expected_embedding_size:
                    file_path = self._doc_label(doc, i)
                    logger.warning(f"Document '{file_path}' has inconsistent embedding size {len(embedding)} != {expected_embedding_size}, skipping")
                    continue

                # Assign the embedding to the document
                doc.vector = embedding
                successful_docs.append(doc)
            except Exception as e:
                # Deliberate best-effort boundary: one bad document must not
                # abort the batch. The label helper is None-safe, so this
                # handler cannot itself raise while formatting the message.
                file_path = self._doc_label(doc, i)
                logger.error(f"Error processing document '{file_path}': {e}, skipping")

        logger.info(f"Successfully processed {len(successful_docs)}/{len(output)} documents with consistent embeddings")
        return successful_docs
no outgoing calls
no test coverage detected