| 429 | |
| 430 | |
| 431 | class HuffmanEncoder(object): |
| 432 | def fit(self, text): |
| 433 | """ |
| 434 | Build a Huffman tree for the tokens in `text` and compute each token's |
| 435 | binary encoding. |
| 436 | |
| 437 | Notes |
| 438 | ----- |
| 439 | In a Huffman code, tokens that occur more frequently are (generally) |
| 440 | represented using fewer bits. Huffman codes produce the minimum expected |
| 441 | codeword length among all methods for encoding tokens individually. |
| 442 | |
| 443 | Huffman codes correspond to paths through a binary tree, with 1 |
| 444 | corresponding to "move right" and 0 corresponding to "move left". In |
| 445 | contrast to standard binary trees, the Huffman tree is constructed from the |
| 446 | bottom up. Construction begins by initializing a min-heap priority queue |
| 447 | consisting of each token in the corpus, with priority corresponding to the |
| 448 | token frequency. At each step, the two most infrequent tokens in the corpus |
| 449 | are removed and become the children of a parent pseudotoken whose |
| 450 | "frequency" is the sum of the frequencies of its children. This new parent |
| 451 | pseudotoken is added to the priority queue and the process is repeated |
| 452 | recursively until no tokens remain. |
| 453 | |
| 454 | Parameters |
| 455 | ---------- |
| 456 | text: list of strs or :class:`Vocabulary` instance |
| 457 | The tokenized text or a pretrained :class:`Vocabulary` object to use for |
| 458 | building the Huffman code. |
| 459 | """ |
| 460 | self._build_tree(text) |
| 461 | self._generate_codes() |
| 462 | |
| 463 | def transform(self, text): |
| 464 | """ |
| 465 | Transform the words in `text` into their Huffman-code representations. |
| 466 | |
| 467 | Parameters |
| 468 | ---------- |
| 469 | text: list of `N` strings |
| 470 | The list of words to encode |
| 471 | |
| 472 | Returns |
| 473 | ------- |
| 474 | codes : list of `N` binary strings |
| 475 | The encoded words in `text` |
| 476 | """ |
| 477 | if isinstance(text, str): |
| 478 | text = [text] |
| 479 | for token in set(text): |
| 480 | if token not in self._item2code: |
| 481 | raise Warning("Token '{}' not in Huffman tree. Skipping".format(token)) |
| 482 | return [self._item2code.get(t, None) for t in text] |
| 483 | |
| 484 | def inverse_transform(self, codes): |
| 485 | """ |
| 486 | Transform an encoded sequence of bit-strings back into words. |
| 487 | |
| 488 | Parameters |