Create a BERTopic model from a variety of inputs.

Arguments:
- topics: A dictionary containing topic metadata, including topic representations, labels, sizes, custom labels, etc.
- params: BERTopic-specific hyperparameters, including the HF embedding_model ID if given.
(
topics: Mapping[str, Any],
params: Mapping[str, Any],
tensors: Mapping[str, np.array],
ctfidf_tensors: Mapping[str, Any] | None = None,
ctfidf_config: Mapping[str, Any] | None = None,
images: Mapping[int, Any] | None = None,
warn_no_backend: bool = True,
)
| 5016 | |
| 5017 | |
| 5018 | def _create_model_from_files( |
| 5019 | topics: Mapping[str, Any], |
| 5020 | params: Mapping[str, Any], |
| 5021 | tensors: Mapping[str, np.array], |
| 5022 | ctfidf_tensors: Mapping[str, Any] | None = None, |
| 5023 | ctfidf_config: Mapping[str, Any] | None = None, |
| 5024 | images: Mapping[int, Any] | None = None, |
| 5025 | warn_no_backend: bool = True, |
| 5026 | ): |
| 5027 | """Create a BERTopic model from a variety of inputs. |
| 5028 | |
| 5029 | Arguments: |
| 5030 | topics: A dictionary containing topic metadata, including: |
| 5031 | - Topic representations, labels, sizes, custom labels, etc. |
| 5032 | params: BERTopic-specific hyperparams, including HF embedding_model ID |
| 5033 | if given. |
| 5034 | tensors: The topic embeddings |
| 5035 | ctfidf_tensors: The c-TF-IDF representations |
| 5036 | ctfidf_config: The config for CountVectorizer and c-TF-IDF |
| 5037 | images: The images per topic |
| 5038 | warn_no_backend: Whether to warn the user if no backend is given |
| 5039 | """ |
| 5040 | params["n_gram_range"] = tuple(params["n_gram_range"]) |
| 5041 | |
| 5042 | if ctfidf_config is not None: |
| 5043 | ngram_range = ctfidf_config["vectorizer_model"]["params"]["ngram_range"] |
| 5044 | ctfidf_config["vectorizer_model"]["params"]["ngram_range"] = tuple(ngram_range) |
| 5045 | |
| 5046 | params["n_gram_range"] = tuple(params["n_gram_range"]) |
| 5047 | |
| 5048 | # Select HF model through SentenceTransformers |
| 5049 | try: |
| 5050 | from sentence_transformers import SentenceTransformer |
| 5051 | |
| 5052 | embedding_model = select_backend(SentenceTransformer(params["embedding_model"])) |
| 5053 | except: # noqa: E722 |
| 5054 | embedding_model = BaseEmbedder() |
| 5055 | |
| 5056 | if warn_no_backend: |
| 5057 | logger.warning( |
| 5058 | "You are loading a BERTopic model without explicitly defining an embedding model." |
| 5059 | " If you want to also load in an embedding model, make sure to use" |
| 5060 | " `BERTopic.load(my_model, embedding_model=my_embedding_model)`." |
| 5061 | ) |
| 5062 | |
| 5063 | if params.get("embedding_model") is not None: |
| 5064 | del params["embedding_model"] |
| 5065 | |
| 5066 | # Prepare our empty sub-models |
| 5067 | empty_dimensionality_model = BaseDimensionalityReduction() |
| 5068 | empty_cluster_model = BaseCluster() |
| 5069 | |
| 5070 | # Fit BERTopic without actually performing any clustering |
| 5071 | topic_model = BERTopic( |
| 5072 | embedding_model=embedding_model, |
| 5073 | umap_model=empty_dimensionality_model, |
| 5074 | hdbscan_model=empty_cluster_model, |
| 5075 | **params, |
no test coverage detected