MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / fit

Method fit

numpy_ml/preprocessing/nlp.py:1166–1262  ·  view source on GitHub ↗

Compute the vocabulary across a collection of documents. Parameters: `corpus_fps` (str or list of strs) — the filepath / list of filepaths for the document(s) to be encoded. Each document is expected to be encoded as a newline-separated string of text, with adjacent tokens separated by a whitespace character.

(self, corpus_fps, encoding="utf-8-sig")

Source from the content-addressed store, hash-verified

1164 return [self.idx2token[i] if i in self.idx2token else unk for i in indices]
1165
1166 def fit(self, corpus_fps, encoding="utf-8-sig"):
1167 """
1168 Compute the vocabulary across a collection of documents.
1169
1170 Parameters
1171 ----------
1172 corpus_fps : str or list of strs
1173 The filepath / list of filepaths for the document(s) to be encoded.
1174 Each document is expected to be encoded as newline-separated
1175 string of text, with adjacent tokens separated by a whitespace
1176 character.
1177 encoding : str
1178 Specifies the text encoding for corpus. Common entries are either
1179 'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default is
1180 'utf-8-sig'.
1181
1182 Returns
1183 -------
1184 self
1185 """
1186 if isinstance(corpus_fps, str):
1187 corpus_fps = [corpus_fps]
1188
1189 for corpus_fp in corpus_fps:
1190 assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp)
1191
1192 tokens = []
1193 H = self.hyperparameters
1194 idx2word, word2idx = {}, {}
1195
1196 tokenizer_dict = {
1197 "words": tokenize_words,
1198 "characters": tokenize_chars,
1199 "whitespace": tokenize_whitespace,
1200 "bytes": tokenize_bytes_raw,
1201 }
1202
1203 min_count = H["min_count"]
1204 lowercase = H["lowercase"]
1205 max_tokens = H["max_tokens"]
1206 filter_stop = H["filter_stopwords"]
1207 filter_punc = H["filter_punctuation"]
1208 tokenizer = tokenizer_dict[H["tokenizer"]]
1209
1210 H["encoding"] = encoding
1211 H["corpus_fps"] = corpus_fps
1212
1213 # encode special tokens
1214 for tt in ["<bol>", "<eol>", "<unk>"]:
1215 word2idx[tt] = len(tokens)
1216 idx2word[len(tokens)] = tt
1217 tokens.append(Token(tt))
1218
1219 bol_ix = word2idx["<bol>"]
1220 eol_ix = word2idx["<eol>"]
1221
1222 for d_ix, doc_fp in enumerate(corpus_fps):
1223 with open(doc_fp, "r", encoding=H["encoding"]) as doc:

Callers 3

fit (Method) · 0.45
test_HMM (Function) · 0.45
plot_logistic (Function) · 0.45

Calls 3

_drop_low_freq_tokens (Method) · 0.95
_keep_top_n_tokens (Method) · 0.95
Token (Class) · 0.85

Tested by 1

test_HMM (Function) · 0.36