Train the LDA model on a corpus of documents (bags of words). Parameters ---------- corpus : list of length `D` A list of lists, with each sublist containing the tokenized text of a single document. verbose : bool Whether
(self, corpus, verbose=False, max_iter=1000, tol=5)
| 204 | self.gamma = np.tile(self.alpha, (D, 1)) + np.tile(N / T, (T, 1)).T |
| 205 | |
def train(self, corpus, verbose=False, max_iter=1000, tol=5):
    """
    Train the LDA model on a corpus of documents (bags of words).

    Stores the corpus and its summary statistics on the instance (`D`, `V`,
    `N`, `corpus`), calls `initialize_parameters`, and then alternates
    `_E_step` / `_M_step` until the change in the variational lower bound
    (VLB) drops below `tol` or `max_iter` iterations have been run.

    Parameters
    ----------
    corpus : list of length `D`
        A list of lists, with each sublist containing the tokenized text of
        a single document.
    verbose : bool
        Whether to print the VLB at each training iteration. Default is
        False.
    max_iter : int
        The maximum number of training iterations to perform before
        breaking. Default is 1000.
    tol : int or float
        Break the training loop if the difference between the VLB on the
        current iteration and the previous iteration is less than `tol`.
        Default is 5.

    Raises
    ------
    ValueError
        If `corpus` contains no documents.
    """
    # Fail early, before touching any instance state, with a message that
    # names the real problem. Without this guard `np.concatenate` raises a
    # ValueError about "at least one array", which is hard to relate back
    # to an empty corpus.
    if len(corpus) == 0:
        raise ValueError("`corpus` must contain at least one document")

    self.D = len(corpus)
    # Vocabulary size: number of distinct tokens across all documents.
    self.V = len(set(np.concatenate(corpus)))
    # Per-document token counts.
    self.N = np.array([len(d) for d in corpus])
    self.corpus = corpus

    self.initialize_parameters()

    # Start from -inf so the first delta is +inf and the loop can never
    # stop on the very first iteration.
    vlb = -np.inf

    for i in range(max_iter):
        old_vlb = vlb

        self._E_step()
        self._M_step()

        vlb = self.VLB()
        delta = vlb - old_vlb

        if verbose:
            print("Iteration {}: {:.3f} (delta: {:.2f})".format(i + 1, vlb, delta))

        # Converged: the bound improved by less than `tol`.
        if delta < tol:
            break
| 249 | |
| 250 | ####################################################################### |
no test coverage detected