MCPcopy
hub / github.com/MaartenGr/BERTopic / HFTransformerBackend

Class HFTransformerBackend

bertopic/backend/_hftransformers.py:12–91  ·  view source on GitHub ↗

Hugging Face transformers model. This uses the `transformers.pipelines.pipeline` to define and create a feature generation pipeline from which embeddings can be extracted. Arguments: `embedding_model`: a Hugging Face feature extraction pipeline. Examples: to use a Hugging Face transformers model, load in a pipeline and point to any model found on their model hub (https://huggingface.co/models).

Source from the content-addressed store, hash-verified

10
11
12class HFTransformerBackend(BaseEmbedder):
13 """Hugging Face transformers model.
14
15 This uses the `transformers.pipelines.pipeline` to define and create
16 a feature generation pipeline from which embeddings can be extracted.
17
18 Arguments:
19 embedding_model: A Hugging Face feature extraction pipeline
20
21 Examples:
22 To use a Hugging Face transformers model, load in a pipeline and point
23 to any model found on their model hub (https://huggingface.co/models):
24
25 ```python
26 from bertopic.backend import HFTransformerBackend
27 from transformers.pipelines import pipeline
28
29 hf_model = pipeline("feature-extraction", model="distilbert-base-cased")
30 embedding_model = HFTransformerBackend(hf_model)
31 ```
32 """
33
def __init__(self, embedding_model: Pipeline):
    """Wrap a Hugging Face pipeline as an embedding backend.

    Arguments:
        embedding_model: The transformers `Pipeline` used to produce features

    Raises:
        ValueError: If `embedding_model` is not a `Pipeline` instance
    """
    super().__init__()

    # Guard clause: anything that is not a transformers Pipeline is rejected
    # before it can be stored on the instance.
    if not isinstance(embedding_model, Pipeline):
        raise ValueError(
            "Please select a correct transformers pipeline. For example: "
            "pipeline('feature-extraction', model='distilbert-base-cased', device=0)"
        )

    self.embedding_model = embedding_model
44
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Turn a list of n documents/words into a matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process; the progress bar
                 is disabled when this is False

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    dataset = MyDataset(documents)

    # Run the pipeline over the wrapped documents with truncation and padding
    # enabled, then pair each raw document with the features produced for it.
    pipeline_output = self.embedding_model(dataset, truncation=True, padding=True)
    paired = tqdm(
        zip(documents, pipeline_output),
        total=len(dataset),
        disable=not verbose,
    )

    # Each (document, features) pair is reduced to one embedding by `_embed`.
    return np.array([self._embed(text, feats) for text, feats in paired])
68
69 def _embed(self, document: str, features: np.ndarray) -> np.ndarray:

Callers 1

select_backendFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected