Reduce dimensionality of embeddings using UMAP and train a UMAP model. Arguments: embeddings: The extracted embeddings using the sentence transformer module. y: The target class for (semi)-supervised dimensionality reduction. partial_fit: Whether to run `partial_fit` for online learning.
(
self,
embeddings: Union[np.ndarray, csr_matrix],
y: Union[List[int], np.ndarray] = None,
partial_fit: bool = False,
)
| 3905 | return mapped_predictions |
| 3906 | |
def _reduce_dimensionality(
    self,
    embeddings: Union[np.ndarray, csr_matrix],
    y: Union[List[int], np.ndarray] = None,
    partial_fit: bool = False,
) -> np.ndarray:
    """Reduce dimensionality of embeddings using UMAP and train a UMAP model.

    The model stored in `self.umap_model` is fitted on `embeddings` and the
    same embeddings are then projected with it. `fit_transform` is used when
    the model provides it; otherwise `fit` followed by `transform` is used.

    Arguments:
        embeddings: The extracted embeddings using the sentence transformer module.
        y: The target class for (semi)-supervised dimensionality reduction.
           Only used in the regular (non-partial) fit. If the model raises a
           `TypeError` when given `y`, the fit is retried without it.
        partial_fit: Whether to run `partial_fit` for online learning

    Returns:
        umap_embeddings: The reduced embeddings, passed through
                         `np.nan_to_num` so NaN/inf values are replaced
                         by finite numbers.
    """
    logger.info("Dimensionality - Fitting the dimensionality reduction algorithm")
    # Partial fit
    if partial_fit:
        if hasattr(self.umap_model, "partial_fit"):
            # Incrementally update the model, then project the current batch.
            self.umap_model = self.umap_model.partial_fit(embeddings)
            umap_embeddings = self.umap_model.transform(embeddings)
        elif self.topic_representations_ is None:
            # The model cannot learn incrementally and no topic representations
            # exist yet, so fit it from scratch on this batch.
            if hasattr(self.umap_model, "fit_transform"):
                umap_embeddings = self.umap_model.fit_transform(embeddings)
            else:
                self.umap_model.fit(embeddings)
                umap_embeddings = self.umap_model.transform(embeddings)
        else:
            # Topic representations already exist: reuse the model as-is.
            umap_embeddings = self.umap_model.transform(embeddings)

    # Regular fit
    else:
        try:
            # cuml umap needs y to be an numpy array
            y = np.array(y) if y is not None else None
            if hasattr(self.umap_model, "fit_transform"):
                umap_embeddings = self.umap_model.fit_transform(embeddings, y=y)
            else:
                self.umap_model.fit(embeddings, y=y)
                umap_embeddings = self.umap_model.transform(embeddings)
        except TypeError:
            # The model does not accept a `y` argument. Retry WITHOUT `y`;
            # repeating the call with `y=y` would raise the same TypeError.
            if hasattr(self.umap_model, "fit_transform"):
                umap_embeddings = self.umap_model.fit_transform(embeddings)
            else:
                self.umap_model.fit(embeddings)
                umap_embeddings = self.umap_model.transform(embeddings)

    logger.info("Dimensionality - Completed \u2713")
    return np.nan_to_num(umap_embeddings)
| 3957 | |
| 3958 | def _cluster_embeddings( |
| 3959 | self, |