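Train a byte pair codebook on a set of documents. Parameters ---------- corpus_fps : str or list of strs The filepath / list of filepaths for the document(s) to be used to learn the byte pair codebook. encoding : str The text encoding for documents. Common entries are either 'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default is 'utf-8'.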
(self, corpus_fps, encoding="utf-8")
| 218 | self.token2byte = OrderedDict({v: k for k, v in self.byte2token.items()}) |
| 219 | |
def fit(self, corpus_fps, encoding="utf-8"):
    """
    Train a byte pair codebook on a set of documents.

    Parameters
    ----------
    corpus_fps : str or list of strs
        The filepath / list of filepaths for the document(s) to be used to
        learn the byte pair codebook.
    encoding : str
        The text encoding for documents. Common entries are either 'utf-8'
        (no header byte), or 'utf-8-sig' (header byte). Default is
        'utf-8'.

    Returns
    -------
    self
        The same instance, with the learned multi-byte tokens added to
        ``self.token2byte`` and ``self.byte2token``.

    Notes
    -----
    Merging stops early if no byte bigrams remain, so fewer than
    ``max_merges`` tokens may be learned. New token ids are assigned in
    sorted order of their byte tuples, which makes the codebook
    reproducible across interpreter runs.
    """
    vocab = (
        Vocabulary(
            lowercase=False,
            min_count=None,
            max_tokens=None,
            filter_stopwords=False,
            filter_punctuation=False,
            tokenizer="bytes",
        )
        .fit(corpus_fps, encoding=encoding)
        .counts
    )

    # iteratively merge the most common byte bigram across the documents
    for _ in range(self.parameters["max_merges"]):
        pair_counts = self._get_counts(vocab)
        if not pair_counts:
            # Nothing left to merge (empty corpus, or every entry has
            # already collapsed); max() would raise ValueError here.
            break
        most_common_bigram = max(pair_counts, key=pair_counts.get)
        vocab = self._merge(most_common_bigram, vocab)

    # Entries are space-delimited; the pieces containing "-" are the
    # merged (multi-byte) tokens produced above.
    merged_tokens = set()
    for entry in vocab.keys():
        merged_tokens.update(w for w in entry.split(" ") if "-" in w)

    # Sort before numbering: iterating the set directly would hand out ids
    # in an order that depends on string hashing and so varies per run.
    byte_tuples = sorted(
        tuple(int(j) for j in token.split("-")) for token in merged_tokens
    )

    # Ids start at 256 -- presumably just past the single-byte tokens set
    # up elsewhere in the class (TODO confirm against __init__).
    for token_id, byte_tuple in enumerate(byte_tuples, start=256):
        self.token2byte[token_id] = byte_tuple
        self.byte2token[byte_tuple] = token_id

    return self
| 263 | |
| 264 | def _get_counts(self, vocab): |
| 265 | """Collect bigram counts for the tokens in vocab""" |
nothing calls this directly
no test coverage detected