Embed a list of n documents/words into an n-dimensional matrix of embeddings. Arguments: documents: A list of documents or words to be embedded; verbose: Controls the verbosity of the process. Returns: Document/words embeddings with shape (n, m), with `n` documents/words that each have an embedding size of `m`.
(self, documents: List[str], verbose: bool = False)
| 88 | ) |
| 89 | |
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Encode documents/words with the embedding model, distilling it first if requested.

    When `self.distill` is truthy and no distillation has been recorded on
    this instance yet, the embedding model is replaced by the result of
    `model2vec.distill.distill` before anything is encoded. If
    `self.distill_kwargs` carries no (non-empty) "vocabulary" entry, one is
    built by fitting `self.distill_vectorizer` on `documents` and ordering
    its feature names by their column sums, largest first; the result is
    stored back into `self.distill_kwargs`.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Forwarded to the model's `encode` as `show_progress_bar`

    Returns:
        Whatever `self.embedding_model.encode` returns for `documents`;
        annotated as an array of shape (n, m) with `n` documents/words
        that each have an embedding size of `m`
    """
    needs_distillation = self.distill and not self._has_distilled
    if needs_distillation:
        # Imported lazily so model2vec is only needed when distillation is used
        from model2vec.distill import distill

        # Fall back to a vocabulary derived from the documents themselves
        if not self.distill_kwargs.get("vocabulary"):
            term_matrix = self.distill_vectorizer.fit_transform(documents)
            frequencies = np.array(term_matrix.sum(axis=0)).flatten()
            terms = self.distill_vectorizer.get_feature_names_out()

            # Stable sort: terms with equal totals keep the vectorizer's order
            ranked = list(zip(terms, frequencies))
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            self.distill_kwargs["vocabulary"] = [term for term, _ in ranked]

        self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

        # Remember that distillation ran so later embed calls skip it;
        # only the documents of the first call shape the vocabulary
        self._has_distilled = True

    return self.embedding_model.encode(documents, show_progress_bar=verbose)
| 124 | |
| 125 | def _check_model2vec_installation(self): |
| 126 | try: |
nothing calls this directly
no test coverage detected