Multi-modal embedder with built-in support for common modalities. Supports: text and image out of the box (extensible; video and audio handlers are planned but not yet registered)
| 112 | |
| 113 | |
| 114 | class MultiModalEmbedder(BaseEmbedder): |
| 115 | """ |
| 116 | Multi-modal embedder with built-in support for common modalities. |
| 117 | |
| 118 | Supports: text, image, video (extensible) |
| 119 | """ |
| 120 | |
def __init__(
    self,
    text_model: str = "all-MiniLM-L6-v2",
    image_model: str = "openai/clip-vit-base-patch32",
    config: Optional[EmbeddingConfig] = None,
):
    """
    Record the model identifiers and leave every model unloaded.

    Args:
        text_model: Identifier stored as ``text_model_name``.
        image_model: Identifier stored as ``image_model_name``.
        config: Optional embedding configuration, passed unchanged to
            the base-class initializer.
    """
    # Model slots start out empty: they are lazy-loaded, so nothing is
    # loaded during construction.
    self._text_model = self._image_model = self._image_processor = None

    self.text_model_name, self.image_model_name = text_model, image_model

    # NOTE(review): the base initializer runs last, once every attribute
    # above exists - presumably it relies on them; confirm in BaseEmbedder.
    super().__init__(config)
| 136 | |
def _register_default_modalities(self) -> None:
    """Register the handlers for the modalities available out of the box."""
    # Only text and image are wired up; video and audio handlers are
    # future work and are intentionally not registered here.
    builtin_handlers = (
        ("text", self._embed_text),
        ("image", self._embed_image),
    )
    for modality_name, handler in builtin_handlers:
        self.register_modality(modality_name, handler)
| 144 | |
def embed(self, inputs: List[Any], modality: str) -> "np.ndarray":
    """
    Dispatch ``inputs`` to the handler registered for ``modality``.

    Args:
        inputs: Items to embed; handed to the handler unchanged.
        modality: Key used to look up the handler.

    Returns:
        Whatever the selected handler returns for ``inputs``.

    Raises:
        ValueError: If no handler is registered under ``modality``.
    """
    registry = self._modality_handlers
    if modality in registry:
        return registry[modality](inputs)

    # Unknown modality: report it together with the supported ones.
    message = (
        f"Unsupported modality: '{modality}'. "
        f"Supported: {self.supported_modalities}"
    )
    raise ValueError(message)
| 155 | |
| 156 | def get_embedding_dim(self, modality: str) -> Optional[int]: |
| 157 | """ |
| 158 | Return the embedding dimension for a given modality. |
| 159 | |
| 160 | For "text", this queries the SentenceTransformer model's dimension |
| 161 | (which triggers lazy model loading). |
| 162 | |
| 163 | Args: |
| 164 | modality: The modality to query (e.g. "text", "image"). |
| 165 | |
| 166 | Returns: |
| 167 | The embedding dimension, or None if unknown. |
| 168 | """ |
| 169 | if modality == "text": |
| 170 | return self.text_model.get_sentence_embedding_dimension() |
| 171 | elif modality == "image": |
no outgoing calls