Runs k-means on given data and initial set of centroids. maxiter: maximum number of iterations to run (default=500). record_heterogeneity: (optional) a list, to store the history of heterogeneity as a function of iterations; if None, do not store the history.
(
data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
)
| 146 | |
| 147 | |
def kmeans(
    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
):
    """Run k-means on ``data`` starting from ``initial_centroids``.

    Each iteration assigns every data point to a cluster with
    ``assign_clusters`` and then recomputes the centroids with
    ``revise_centroids``. The loop stops early as soon as an iteration yields
    exactly the same assignments as the previous one.

    Args:
        data: the data points; passed unchanged to the helper functions.
        k: number of clusters.
        initial_centroids: starting centroids; must support slicing (``[:]``).
        maxiter: maximum number of iterations to run (default=500).
        record_heterogeneity: (optional) a list to which the heterogeneity
            score is appended after every iteration that did not trigger the
            convergence stop. If None, the history is not stored.
        verbose: if True, print the iteration number and how many data points
            changed their cluster label in each iteration.

    Returns:
        A ``(centroids, cluster_assignment)`` tuple. When ``maxiter`` is zero
        or negative no iteration runs and the result is
        ``(initial_centroids[:], None)``.
    """
    centroids = initial_centroids[:]
    prev_cluster_assignment = None
    # Bound up front so the final ``return`` is valid even when the loop body
    # never executes (maxiter <= 0).
    cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            # No newline here: the change count printed further down completes
            # the line. NOTE: on the first iteration and on the converging
            # iteration nothing is printed after the iteration number.
            print(itr, end="")

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all data
        #    points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)

        # Check for convergence: if none of the assignments changed, stop
        if (
            prev_cluster_assignment is not None
            and (prev_cluster_assignment == cluster_assignment).all()
        ):
            break

        # Print number of new assignments (only counted when it will be shown)
        if verbose and prev_cluster_assignment is not None:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            print(f" {num_changed:5d} elements changed their cluster assignment.")

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        # Remember this iteration's labels for the next convergence check.
        prev_cluster_assignment = cluster_assignment[:]

    return centroids, cluster_assignment
| 196 | |
| 197 | |
| 198 | # Mock test below |
no test coverage detected