Compute term-frequencies and inverse document frequencies on a collection of documents. Parameters ---------- corpus_seq : str or list of strs The filepath / list of filepaths / raw string contents of the document(s) to be encoded, in accordance with the `input_type` parameter passed to the :meth:`__init__` method.
(self, corpus_seq, encoding="utf-8-sig")
| 689 | } |
| 690 | |
| 691 | def fit(self, corpus_seq, encoding="utf-8-sig"): |
| 692 | """ |
| 693 | Compute term-frequencies and inverse document frequencies on a |
| 694 | collection of documents. |
| 695 | |
| 696 | Parameters |
| 697 | ---------- |
| 698 | corpus_seq : str or list of strs |
| 699 | The filepath / list of filepaths / raw string contents of the |
| 700 | document(s) to be encoded, in accordance with the `input_type` |
| 701 | parameter passed to the :meth:`__init__` method. Each document is |
| 702 | expected to be a string of tokens separated by whitespace. |
| 703 | encoding : str |
| 704 | Specifies the text encoding for corpus if `input_type` is `files`. |
| 705 | Common entries are either 'utf-8' (no header byte), or 'utf-8-sig' |
| 706 | (header byte). Default is 'utf-8-sig'. |
| 707 | |
| 708 | Returns |
| 709 | ------- |
| 710 | self |
| 711 | """ |
| 712 | H = self.hyperparameters |
| 713 | |
| 714 | if isinstance(corpus_seq, str): |
| 715 | corpus_seq = [corpus_seq] |
| 716 | |
| 717 | if H["input_type"] == "files": |
| 718 | for corpus_fp in corpus_seq: |
| 719 | assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp) |
| 720 | |
| 721 | tokens = [] |
| 722 | idx2token, token2idx = {}, {} |
| 723 | |
| 724 | # encode special tokens |
| 725 | for tt in ["<bol>", "<eol>", "<unk>"]: |
| 726 | token2idx[tt] = len(tokens) |
| 727 | idx2token[len(tokens)] = tt |
| 728 | tokens.append(Token(tt)) |
| 729 | |
| 730 | min_count = H["min_count"] |
| 731 | max_tokens = H["max_tokens"] |
| 732 | H["encoding"] = encoding |
| 733 | |
| 734 | bol_ix = token2idx["<bol>"] |
| 735 | eol_ix = token2idx["<eol>"] |
| 736 | idx2doc, term_freq = {}, {} |
| 737 | |
| 738 | # encode the text in `corpus_seq` without any filtering ... |
| 739 | for d_ix, doc in enumerate(corpus_seq): |
| 740 | doc_count = {} |
| 741 | idx2doc[d_ix] = doc if H["input_type"] == "files" else None |
| 742 | token2idx, idx2token, tokens, doc_count = self._encode_document( |
| 743 | doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix, |
| 744 | ) |
| 745 | term_freq[d_ix] = doc_count |
| 746 | |
| 747 | self._tokens = tokens |
| 748 | self._idx2doc = idx2doc |