(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU)
| 131 | |
| 132 | |
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    """Train an RNN on a sentence corpus, then save its embeddings and vocabulary.

    Args:
        we_file: Path handed to ``np.save`` for the value of ``rnn.We``.
        w2i_file: Path of the JSON file that receives the ``word2idx`` mapping.
        RecurrentUnit: Recurrent unit class, ``GRU`` by default.
            NOTE(review): this argument is never referenced in the body, so
            callers cannot currently change the unit through it. Presumably it
            was meant to be forwarded to ``RNN``/``fit``; confirm against
            their signatures.
    """
    # Data source switch: the active loader is the limited-vocabulary one.
    # To train on Wikipedia instead, comment out the active line and
    # uncomment the call below.
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print("finished retrieving data")

    vocab_size = len(word2idx)
    print("vocab size:", vocab_size, "number of sentences:", len(sentences))

    # The vocabulary size is the last constructor argument.
    rnn = RNN(30, [30], vocab_size)
    rnn.fit(
        sentences,
        learning_rate=2*1e-4,
        epochs=10,
        show_fig=True,
        activation=T.nnet.relu,
    )

    # Persist both artifacts so the embeddings can be mapped back to words later.
    embeddings = rnn.We.get_value()
    np.save(we_file, embeddings)
    with open(w2i_file, 'w') as out:
        json.dump(word2idx, out)
| 147 | |
| 148 | |
| 149 |
no test coverage detected