Approximate the most representative documents per topic by sampling a subset of the documents in each topic and calculating which are most representative of their topic based on the cosine similarity between c-TF-IDF representations. Arguments: c_tf_idf: The topic c-TF-IDF representation.
(
self,
c_tf_idf: csr_matrix,
documents: pd.DataFrame,
topics: Mapping[str, List[Tuple[str, float]]],
nr_samples: int = 500,
nr_repr_docs: int = 5,
diversity: float | None = None,
)
| 4233 | self.representative_docs_ = repr_docs |
| 4234 | |
| 4235 | def _extract_representative_docs( |
| 4236 | self, |
| 4237 | c_tf_idf: csr_matrix, |
| 4238 | documents: pd.DataFrame, |
| 4239 | topics: Mapping[str, List[Tuple[str, float]]], |
| 4240 | nr_samples: int = 500, |
| 4241 | nr_repr_docs: int = 5, |
| 4242 | diversity: float | None = None, |
| 4243 | ) -> Union[List[str], List[List[int]]]: |
| 4244 | """Approximate most representative documents per topic by sampling |
| 4245 | a subset of the documents in each topic and calculating which are |
| 4246 | most representative to their topic based on the cosine similarity between |
| 4247 | c-TF-IDF representations. |
| 4248 | |
| 4249 | Arguments: |
| 4250 | c_tf_idf: The topic c-TF-IDF representation |
| 4251 | documents: All input documents |
| 4252 | topics: The candidate topics as calculated with c-TF-IDF |
| 4253 | nr_samples: The number of candidate documents to extract per topic |
| 4254 | nr_repr_docs: The number of representative documents to extract per topic |
| 4255 | diversity: The diversity between the most representative documents. |
| 4256 | If None, no MMR is used. Otherwise, accepts values between 0 and 1. |
| 4257 | |
| 4258 | Returns: |
| 4259 | repr_docs_mappings: A dictionary from topic to representative documents |
| 4260 | representative_docs: A flat list of representative documents |
| 4261 | repr_doc_indices: Ordered indices of representative documents |
| 4262 | that belong to each topic |
| 4263 | repr_doc_ids: The indices of representative documents |
| 4264 | that belong to each topic |
| 4265 | """ |
| 4266 | # Sample documents per topic |
| 4267 | documents_per_topic = ( |
| 4268 | documents.drop("Image", axis=1, errors="ignore") |
| 4269 | .groupby("Topic") |
| 4270 | .sample(n=nr_samples, replace=True, random_state=42) |
| 4271 | .drop_duplicates() |
| 4272 | ) |
| 4273 | |
| 4274 | # Find and extract documents that are most similar to the topic |
| 4275 | repr_docs = [] |
| 4276 | repr_docs_indices = [] |
| 4277 | repr_docs_mappings = {} |
| 4278 | repr_docs_ids = [] |
| 4279 | labels = sorted(list(topics.keys())) |
| 4280 | for index, topic in enumerate(labels): |
| 4281 | # Slice data |
| 4282 | selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :] |
| 4283 | selected_docs = selection["Document"].values |
| 4284 | selected_docs_ids = selection.index.tolist() |
| 4285 | |
| 4286 | # Calculate similarity |
| 4287 | nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs) |
| 4288 | bow = self.vectorizer_model.transform(selected_docs) |
| 4289 | ctfidf = self.ctfidf_model.transform(bow) |
| 4290 | sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index]) |
| 4291 | |
| 4292 | # Use MMR to find representative but diverse documents |
no test coverage detected