Add new column(s) of topic mappings. Arguments: `mappings`: the mappings to add; `topic_model`: the topic model this TopicMapper belongs to.
(self, mappings: Mapping[int, int], topic_model: BERTopic)
| 4948 | return mappings |
| 4949 | |
def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
    """Append one more mapping step to every entry of ``self.mappings_``.

    Each entry is extended with the value ``mappings`` holds for the entry's
    last topic id, or with -1 when that id is not a key of ``mappings``.

    If ``topic_model`` reports being zero-shot and still tracks zero-shot
    topics, its ``_topic_id_to_zeroshot_topic_idx`` lookup is then rebuilt so
    that it is keyed by the topic ids produced by the latest mapping step.

    Arguments:
        mappings: The mappings to add
        topic_model: The topic model this TopicMapper belongs to
    """
    for history in self.mappings_:
        latest = history[-1]
        history.append(mappings[latest] if latest in mappings else -1)

    # Nothing to reconcile unless zero-shot topics are actually being tracked.
    if not topic_model._is_zeroshot() or len(topic_model._topic_id_to_zeroshot_topic_idx) == 0:
        return

    zeroshot_idx_by_topic = topic_model._topic_id_to_zeroshot_topic_idx

    # The last two columns of the mapping table give the latest
    # (topic id before -> topic id after) step.
    last_step = np.array(topic_model.topic_mapper_.mappings_)[:, -2:]
    previous_to_new = {pair[0]: pair[1] for pair in last_step}

    # Invert that step: new topic id -> all previous ids merged into it.
    sources_by_target = {}
    for previous_id, new_id in previous_to_new.items():
        sources_by_target.setdefault(new_id, []).append(previous_id)

    # When zero-shot topic(s) are among the merged topics, decide whether the
    # new topic keeps one of their zero-shot labels; otherwise it gets no
    # entry and falls back to a calculated representation.
    remapped = {}
    for new_id, previous_ids in sources_by_target.items():
        zeroshot_ids = [tid for tid in previous_ids if tid in zeroshot_idx_by_topic]
        if not zeroshot_ids:
            continue

        # Embed the candidate zero-shot labels and compare each of them with
        # the embedding stored for the new topic.
        labels = [topic_model.zeroshot_topic_list[zeroshot_idx_by_topic[tid]] for tid in zeroshot_ids]
        label_embeddings = topic_model._extract_embeddings(labels)
        similarities = cosine_similarity(label_embeddings, [topic_model.topic_embeddings_[new_id]]).flatten()

        # Keep the best-fitting label only if it reaches the zero-shot threshold.
        best = np.argmax(similarities)
        if similarities[best] >= topic_model.zeroshot_min_similarity:
            # The pre-mapping topic id gives the index into the zero-shot topic list.
            remapped[new_id] = zeroshot_idx_by_topic[zeroshot_ids[best]]

    topic_model._topic_id_to_zeroshot_topic_idx = remapped
| 5005 | |
| 5006 | def add_new_topics(self, mappings: Mapping[int, int]): |
| 5007 | """Add new row(s) of topic mappings. |
no test coverage detected