Compute the vocabulary across a collection of documents. Parameters ---------- corpus_fps : str or list of strs The filepath / list of filepaths for the document(s) to be encoded. Each document is expected to be encoded as a newline-separated
(self, corpus_fps, encoding="utf-8-sig")
| 1164 | return [self.idx2token[i] if i in self.idx2token else unk for i in indices] |
| 1165 | |
| 1166 | def fit(self, corpus_fps, encoding="utf-8-sig"): |
| 1167 | """ |
| 1168 | Compute the vocabulary across a collection of documents. |
| 1169 | |
| 1170 | Parameters |
| 1171 | ---------- |
| 1172 | corpus_fps : str or list of strs |
| 1173 | The filepath / list of filepaths for the document(s) to be encoded. |
| 1174 | Each document is expected to be encoded as newline-separated |
| 1175 | string of text, with adjacent tokens separated by a whitespace |
| 1176 | character. |
| 1177 | encoding : str |
| 1178 | Specifies the text encoding for corpus. Common entries are either |
| 1179 | 'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default is |
| 1180 | 'utf-8-sig'. |
| 1181 | |
| 1182 | Returns |
| 1183 | ------- |
| 1184 | self |
| 1185 | """ |
| 1186 | if isinstance(corpus_fps, str): |
| 1187 | corpus_fps = [corpus_fps] |
| 1188 | |
| 1189 | for corpus_fp in corpus_fps: |
| 1190 | assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp) |
| 1191 | |
| 1192 | tokens = [] |
| 1193 | H = self.hyperparameters |
| 1194 | idx2word, word2idx = {}, {} |
| 1195 | |
| 1196 | tokenizer_dict = { |
| 1197 | "words": tokenize_words, |
| 1198 | "characters": tokenize_chars, |
| 1199 | "whitespace": tokenize_whitespace, |
| 1200 | "bytes": tokenize_bytes_raw, |
| 1201 | } |
| 1202 | |
| 1203 | min_count = H["min_count"] |
| 1204 | lowercase = H["lowercase"] |
| 1205 | max_tokens = H["max_tokens"] |
| 1206 | filter_stop = H["filter_stopwords"] |
| 1207 | filter_punc = H["filter_punctuation"] |
| 1208 | tokenizer = tokenizer_dict[H["tokenizer"]] |
| 1209 | |
| 1210 | H["encoding"] = encoding |
| 1211 | H["corpus_fps"] = corpus_fps |
| 1212 | |
| 1213 | # encode special tokens |
| 1214 | for tt in ["<bol>", "<eol>", "<unk>"]: |
| 1215 | word2idx[tt] = len(tokens) |
| 1216 | idx2word[len(tokens)] = tt |
| 1217 | tokens.append(Token(tt)) |
| 1218 | |
| 1219 | bol_ix = word2idx["<bol>"] |
| 1220 | eol_ix = word2idx["<eol>"] |
| 1221 | |
| 1222 | for d_ix, doc_fp in enumerate(corpus_fps): |
| 1223 | with open(doc_fp, "r", encoding=H["encoding"]) as doc: |