| 3 | |
| 4 | |
| 5 | class LDA(object): |
| 6 | def __init__(self, T=10): |
| 7 | """ |
| 8 | Vanilla (non-smoothed) LDA model trained using variational EM. |
| 9 | Generates maximum-likelihood estimates for model paramters |
| 10 | `alpha` and `beta`. |
| 11 | |
| 12 | Parameters |
| 13 | ---------- |
| 14 | T : int |
| 15 | Number of topics |
| 16 | |
| 17 | Attributes |
| 18 | ---------- |
| 19 | D : int |
| 20 | Number of documents |
| 21 | N : list of length `D` |
| 22 | Number of words in each document |
| 23 | V : int |
| 24 | Number of unique word tokens across all documents |
| 25 | phi : :py:class:`ndarray <numpy.ndarray>` of shape `(D, N[d], T)` |
| 26 | Variational approximation to word-topic distribution |
| 27 | gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)` |
| 28 | Variational approximation to document-topic distribution |
| 29 | alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(1, T)` |
| 30 | Parameter for the Dirichlet prior on the document-topic distribution |
| 31 | beta : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)` |
| 32 | Word-topic distribution |
| 33 | """ |
| 34 | self.T = T |
| 35 | |
def _maximize_phi(self):
    """
    Optimize variational parameter phi
    ϕ_{t, n} ∝ β_{t, w_n} e^( Ψ(γ_t) )

    For every word position `n` of every document `d`, the unnormalized
    weight of topic `t` is ``beta[w_n, t] * exp(dg(gamma, d, t))``, where
    `w_n` is the integer token id read from ``corpus[d][n]``. Each row
    ``phi[d][n, :]`` is then rescaled so that it sums to one.

    Notes
    -----
    `dg` is defined outside this method. This implementation evaluates
    it once per `(d, t)` pair instead of once per `(d, n, t)` triple,
    which assumes `dg` has no side effects -- TODO confirm against its
    definition.

    No guard is applied when a row sums to zero; the division then
    follows NumPy's usual divide-by-zero behaviour.

    Returns
    -------
    phi
        The same object as `self.phi`, updated in place.
    """
    D = self.D
    N = self.N
    T = self.T

    phi = self.phi
    beta = self.beta
    gamma = self.gamma
    corpus = self.corpus

    for d in range(D):
        # exp(dg(gamma, d, t)) depends only on the document and the topic,
        # so evaluate it once per document rather than once per word.
        topic_weights = np.array([np.exp(dg(gamma, d, t)) for t in range(T)])
        phi_d = phi[d]

        for n in range(N[d]):
            # The token id is the same for every topic; convert it once.
            w_n = int(corpus[d][n])

            # Write exactly the first T columns, as the per-topic loop did.
            phi_d[n, :T] = beta[w_n, :T] * topic_weights

            # Normalize over topics
            phi_d[n, :] = phi_d[n, :] / np.sum(phi_d[n, :])
    return phi
| 59 | |
| 60 | def _maximize_gamma(self): |
| 61 | """ |
| 62 | Optimize variational parameter gamma |