MCPcopy
hub / github.com/MaartenGr/BERTopic / add_mappings

Method add_mappings

bertopic/_bertopic.py:4950–5004  ·  view source on GitHub ↗

Add new column(s) of topic mappings. Arguments: `mappings` — the mappings to add; `topic_model` — the topic model this TopicMapper belongs to.

(self, mappings: Mapping[int, int], topic_model: BERTopic)

Source from the content-addressed store, hash-verified

4948 return mappings
4949
def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
    """Append a new column of topic mappings to every tracked row.

    Each row in `self.mappings_` is extended with the topic that its most
    recent entry maps to in `mappings`, or with -1 when that entry has no
    mapping. If `topic_model._is_zeroshot()` is true and the model currently
    holds zero-shot topic assignments, the model's
    `_topic_id_to_zeroshot_topic_idx` table is then rebuilt so that it is
    keyed by the new topic ids.

    Arguments:
        mappings: The mappings to add, keyed by the current topic id
        topic_model: The topic model this TopicMapper belongs to
    """
    # Extend each row with the target of its latest topic; -1 when unmapped.
    for row in self.mappings_:
        latest_topic = row[-1]
        row.append(mappings[latest_topic] if latest_topic in mappings else -1)

    # Only models with zero-shot assignments need their label table rebuilt.
    if not topic_model._is_zeroshot() or len(topic_model._topic_id_to_zeroshot_topic_idx) == 0:
        return

    zeroshot_idx_by_topic = topic_model._topic_id_to_zeroshot_topic_idx

    # Pair up the last two columns of the model's mapper: (topic before, topic after).
    # Presumably topic_model.topic_mapper_ is this very mapper, so the "after"
    # column is the one appended above; verify against callers.
    last_two_columns = np.array(topic_model.topic_mapper_.mappings_)[:, -2:]
    previous_to_new = {pair[0]: pair[1] for pair in last_two_columns}

    # Invert the pairs: new topic id -> every previous topic id that maps to it.
    merged_from = defaultdict(list)
    for previous_topic, new_topic in previous_to_new.items():
        merged_from[new_topic].append(previous_topic)

    rebuilt = {}
    for topic_to, topics_from in merged_from.items():
        # Keep only the source topics that currently carry a zero-shot label.
        zeroshot_sources = [topic_id for topic_id in topics_from if topic_id in zeroshot_idx_by_topic]
        if not zeroshot_sources:
            continue

        # Embed the candidate zero-shot labels and compare each of them with
        # the embedding stored for the new topic.
        candidate_labels = [
            topic_model.zeroshot_topic_list[zeroshot_idx_by_topic[topic_id]] for topic_id in zeroshot_sources
        ]
        label_embeddings = topic_model._extract_embeddings(candidate_labels)
        # NOTE(review): topic_to is used as a raw index here; a value of -1
        # (the "unmapped" marker appended above) would select the last row of
        # topic_embeddings_ - confirm that this is intended.
        target_embedding = topic_model.topic_embeddings_[topic_to]
        similarities = cosine_similarity(label_embeddings, [target_embedding]).flatten()

        # The best-matching label is kept only if it clears the model's
        # zero-shot similarity threshold; otherwise the new topic gets no
        # zero-shot entry at all.
        winner = np.argmax(similarities)
        if similarities[winner] >= topic_model.zeroshot_min_similarity:
            # Look up the winner by its pre-mapping topic id to get its index
            # into the zero-shot topic list.
            rebuilt[topic_to] = zeroshot_idx_by_topic[zeroshot_sources[winner]]

    topic_model._topic_id_to_zeroshot_topic_idx = rebuilt
5005
5006 def add_new_topics(self, mappings: Mapping[int, int]):
5007 """Add new row(s) of topic mappings.

Callers 5

merge_topicsMethod · 0.80
delete_topicsMethod · 0.80
_reduce_to_n_topicsMethod · 0.80
_auto_reduce_topicsMethod · 0.80

Calls 2

_is_zeroshotMethod · 0.80
_extract_embeddingsMethod · 0.45

Tested by

no test coverage detected