Find documents that could be assigned to any one of the topics in `self.zeroshot_topic_list`. We transform the topics in `self.zeroshot_topic_list` to embeddings and compare them through cosine similarity with the document embeddings. If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.
(
self, documents: pd.DataFrame, embeddings: np.ndarray
)
| 4008 | return documents, probabilities |
| 4009 | |
def _zeroshot_topic_modeling(
    self, documents: pd.DataFrame, embeddings: np.ndarray
) -> Tuple[pd.DataFrame, np.ndarray, pd.DataFrame, np.ndarray]:
    """Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list.

    We transform the topics in `self.zeroshot_topic_list` to embeddings and
    compare them through cosine similarity with the document embeddings.
    Each document is matched to its most similar zero-shot topic; if that
    similarity passes the `self.zeroshot_min_similarity` threshold, the
    document is assigned to the topic.

    Both returned dataframes get their "ID" column renumbered from 0 and
    keep the previous value in a new "Old_ID" column.

    If no documents are left over, `self.topics_` and `self.topic_mapper_`
    are set from the assigned topics.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs
        embeddings: The document embeddings

    Returns:
        documents: The leftover documents that were not assigned to any topic
        embeddings: The leftover embeddings that were not assigned to any topic
        assigned_documents: The documents assigned to a zero-shot topic,
                            with the index of that topic in the "Topic" column
        assigned_embeddings: The embeddings of the assigned documents

    Raises:
        ValueError: If `self.nr_topics` is set to a number that does not exceed
                    the number of matched zero-shot topics.
    """
    logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics")
    # Similarity between document and zero-shot topic embeddings
    zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)
    cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)
    assignment = np.argmax(cosine_similarities, 1)
    assignment_vals = np.max(cosine_similarities, 1)

    # One vectorized comparison; taking the complement for the leftovers
    # guarantees every document lands in exactly one of the two groups.
    assigned_mask = assignment_vals >= self.zeroshot_min_similarity
    assigned_ids = np.flatnonzero(assigned_mask)
    non_assigned_ids = np.flatnonzero(~assigned_mask)

    # Assign topics
    # Explicit copy: new columns are added below, and writing to a bare
    # `.iloc` selection triggers pandas' SettingWithCopyWarning.
    assigned_documents = documents.iloc[assigned_ids].copy()
    assigned_documents["Topic"] = assignment[assigned_ids]
    assigned_documents["Old_ID"] = assigned_documents["ID"].copy()
    assigned_documents["ID"] = range(len(assigned_documents))
    assigned_embeddings = embeddings[assigned_ids]

    # Check that if a number of topics was specified, it exceeds the number of zeroshot topics matched
    num_zeroshot_topics = len(assigned_documents["Topic"].unique())
    if self.nr_topics != "auto" and self.nr_topics and self.nr_topics <= num_zeroshot_topics:
        raise ValueError(
            f"The set nr_topics ({self.nr_topics}) must exceed the number of matched zero-shot topics "
            f"({num_zeroshot_topics}). Consider raising nr_topics or raising the "
            f"zeroshot_min_similarity ({self.zeroshot_min_similarity})."
        )

    # Select non-assigned topics to be clustered
    documents = documents.iloc[non_assigned_ids].copy()
    documents["Old_ID"] = documents["ID"].copy()
    documents["ID"] = range(len(documents))
    embeddings = embeddings[non_assigned_ids]

    # Every document matched a zero-shot topic, so nothing is left to
    # cluster: the assigned topics are the final topics.
    if len(documents) == 0:
        self.topics_ = assigned_documents["Topic"].values.tolist()
        self.topic_mapper_ = TopicMapper(self.topics_)

    logger.info("Zeroshot Step 1 - Completed \u2713")
    return documents, embeddings, assigned_documents, assigned_embeddings
| 4067 |
no test coverage detected