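Fit the models on a collection of documents and generate topics. Arguments: `documents`: A list of documents to fit on. `embeddings`: Pre-trained document embeddings; these can be used instead of the sentence-transformer model. `images`: A list of paths to the images to fit on, or the images themselves. `y`: The target class for (semi)-supervised modeling; use -1 if no class is specified for a specific instance.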
fit(
    self,
    documents: List[str],
    embeddings: np.ndarray = None,
    images: List[str] | None = None,
    y: Union[List[int], np.ndarray] = None,
)
| 348 | return topic_labels |
| 349 | |
| 350 | def fit( |
| 351 | self, |
| 352 | documents: List[str], |
| 353 | embeddings: np.ndarray = None, |
| 354 | images: List[str] | None = None, |
| 355 | y: Union[List[int], np.ndarray] = None, |
| 356 | ): |
| 357 | """Fit the models on a collection of documents, generate topics, and return the fitted model. |
| 358 | |
| 359 | Arguments: |
| 360 | documents: A list of documents to fit on |
| 361 | embeddings: Pre-trained document embeddings. These can be used |
| 362 | instead of the sentence-transformer model |
| 363 | images: A list of paths to the images to fit on or the images themselves |
| 364 | y: The target class for (semi)-supervised modeling. Use -1 if no class for a |
| 365 | specific instance is specified. |
| 366 | |
| 367 | Examples: |
| 368 | ```python |
| 369 | from bertopic import BERTopic |
| 370 | from sklearn.datasets import fetch_20newsgroups |
| 371 | |
| 372 | docs = fetch_20newsgroups(subset='all')['data'] |
| 373 | topic_model = BERTopic().fit(docs) |
| 374 | ``` |
| 375 | |
| 376 | If you want to use your own pre-computed embeddings, pass them as follows: |
| 377 | |
| 378 | ```python |
| 379 | from bertopic import BERTopic |
| 380 | from sklearn.datasets import fetch_20newsgroups |
| 381 | from sentence_transformers import SentenceTransformer |
| 382 | |
| 383 | # Create embeddings |
| 384 | docs = fetch_20newsgroups(subset='all')['data'] |
| 385 | sentence_model = SentenceTransformer("all-MiniLM-L6-v2") |
| 386 | embeddings = sentence_model.encode(docs, show_progress_bar=True) |
| 387 | |
| 388 | # Create topic model |
| 389 | topic_model = BERTopic().fit(docs, embeddings) |
| 390 | ``` |
| 391 | """ |
| 392 | self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)  # all fitting is delegated to fit_transform; its return value is discarded |
| 393 | return self  # returning the model itself allows chaining, e.g. BERTopic().fit(docs) |
| 394 | |
| 395 | def fit_transform( |
| 396 | self, |