MCPcopy
hub / github.com/MaartenGr/BERTopic / embed

Method embed

bertopic/backend/_model2vec.py:90–123  ·  view source on GitHub ↗

Embed a list of n documents/words into an (n, m) matrix of embeddings. Arguments: documents: A list of documents or words to be embedded. verbose: Controls the verbosity of the process. Returns: Document/words embeddings with shape (n, m), where each of the `n` documents/words has an embedding of size `m`.

(self, documents: List[str], verbose: bool = False)

Source from the content-addressed store, hash-verified

88 )
89
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed `n` documents/words into an (n, m) matrix of embeddings.

    When `self.distill` is set and no distillation has happened yet, the
    embedding model is distilled before anything is embedded. If
    `self.distill_kwargs` holds no (non-empty) "vocabulary", one is fitted
    on `documents` with `self.distill_vectorizer` and stored back into
    `self.distill_kwargs`. The distilled model replaces
    `self.embedding_model`, and subsequent calls skip this step.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    needs_distillation = self.distill and not self._has_distilled
    if needs_distillation:
        # Imported lazily so the dependency is only touched when distilling
        from model2vec.distill import distill

        if not self.distill_kwargs.get("vocabulary"):
            # Build the vocabulary from the documents passed to this call
            term_matrix = self.distill_vectorizer.fit_transform(documents)
            totals = np.array(term_matrix.sum(axis=0)).flatten()
            terms = self.distill_vectorizer.get_feature_names_out()

            # Highest column total first; the sort is stable, so terms with
            # equal totals keep the vectorizer's feature order
            ranking = sorted(range(len(terms)), key=lambda idx: totals[idx], reverse=True)
            self.distill_kwargs["vocabulary"] = [terms[idx] for idx in ranking]

        self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

        # Flag the instance so later calls reuse the distilled model
        # instead of distilling again
        self._has_distilled = True

    return self.embedding_model.encode(documents, show_progress_bar=verbose)
124
125 def _check_model2vec_installation(self):
126 try:

Callers

nothing calls this directly

Calls 1

fit_transform · Method · 0.80

Tested by

no test coverage detected