Get annotations by replicating linkage function calculation in scipy. Arguments: topic_model: A fitted BERTopic instance. hierarchical_topics: A dataframe that contains a hierarchy of topics represented by their parents and their children.
(
topic_model,
hierarchical_topics: pd.DataFrame,
embeddings: csr_matrix,
linkage_function: Callable[[csr_matrix], np.ndarray],
distance_function: Callable[[csr_matrix], csr_matrix],
orientation: str,
custom_labels: bool = False,
)
| 228 | |
| 229 | |
| 230 | def _get_annotations( |
| 231 | topic_model, |
| 232 | hierarchical_topics: pd.DataFrame, |
| 233 | embeddings: csr_matrix, |
| 234 | linkage_function: Callable[[csr_matrix], np.ndarray], |
| 235 | distance_function: Callable[[csr_matrix], csr_matrix], |
| 236 | orientation: str, |
| 237 | custom_labels: bool = False, |
| 238 | ) -> List[List[str]]: |
| 239 | """Get annotations by replicating linkage function calculation in scipy. |
| 240 | |
| 241 | Arguments: |
| 242 | topic_model: A fitted BERTopic instance. |
| 243 | hierarchical_topics: A dataframe that contains a hierarchy of topics |
| 244 | represented by their parents and their children. |
| 245 | NOTE: The hierarchical topic names are only visualized |
| 246 | if both `topics` and `top_n_topics` are not set. |
| 247 | embeddings: The c-TF-IDF matrix on which to model the hierarchy |
| 248 | linkage_function: The linkage function to use. Default is: |
| 249 | `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` |
| 250 | NOTE: Make sure to use the same `linkage_function` as used |
| 251 | in `topic_model.hierarchical_topics`. |
| 252 | distance_function: The distance function to use on the c-TF-IDF matrix. Default is: |
| 253 | `lambda x: 1 - cosine_similarity(x)`. |
| 254 | You can pass any function that returns either a square matrix of |
| 255 | shape (n_samples, n_samples) with zeros on the diagonal and |
| 256 | non-negative values, or a condensed distance matrix of shape |
| 257 | (n_samples * (n_samples - 1) / 2,) containing the upper |
| 258 | triangle of the distance matrix. |
| 259 | NOTE: Make sure to use the same `distance_function` as used |
| 260 | in `topic_model.hierarchical_topics`. |
| 261 | orientation: The orientation of the figure. |
| 262 | Either 'left' or 'bottom' |
| 263 | custom_labels: Whether to use custom topic labels that were defined using |
| 264 | `topic_model.set_topic_labels`. |
| 265 | NOTE: Custom labels are only generated for the original |
| 266 | un-merged topics. |
| 267 | |
| 268 | Returns: |
| 269 | text_annotations: Annotations to be used within Plotly's `ff.create_dendrogram` |
| 270 | """ |
| 271 | df = hierarchical_topics.loc[hierarchical_topics.Parent_Name != "Top", :] |
| 272 | |
| 273 | # Calculate distance |
| 274 | X = distance_function(embeddings) |
| 275 | X = validate_distance_matrix(X, embeddings.shape[0]) |
| 276 | |
| 277 | # Calculate linkage and generate dendrogram |
| 278 | Z = linkage_function(X) |
| 279 | P = sch.dendrogram(Z, orientation=orientation, no_plot=True) |
| 280 | |
| 281 | # Map each topic number (dendrogram leaf) to its x-tick position |
| 282 | x_ticks = np.arange(5, len(P["leaves"]) * 10 + 5, 10) |
| 283 | x_topic = dict(zip(P["leaves"], x_ticks)) |
| 284 | |
| 285 | topic_vals = dict() |
| 286 | for key, val in x_topic.items(): |
| 287 | topic_vals[val] = [key] |
no test coverage detected