Randomly choose k data points as initial centroids
(data, k, seed=None)
| 60 | |
| 61 | |
def get_initial_centroids(data, k, seed=None):
    """Randomly choose k data points as initial centroids.

    Args:
        data: 2-D container exposing ``shape`` and supporting
            ``data[rows, :]`` indexing (presumably a NumPy array --
            confirm against callers).
        k: Number of centroids to select.
        seed: Optional seed forwarded to ``np.random.default_rng``;
            pass a fixed value to obtain reproducible results.

    Returns:
        The selected rows of ``data`` (``data[indices, :]``), one row per
        centroid.
    """
    # A seeded generator is useful for obtaining consistent results.
    rng = np.random.default_rng(seed)
    n = data.shape[0]  # number of data points

    # Pick k indices from [0, n) WITHOUT replacement so that the same data
    # point is never used for two centroids (duplicate centroids produce
    # empty clusters). Repeats are only allowed when k > n, where they are
    # unavoidable, so such calls keep working as they did before.
    rand_indices = rng.choice(n, size=k, replace=k > n)

    return data[rand_indices, :]
| 77 | |
| 78 | |
| 79 | def centroid_pairwise_dist(x, centroids): |