MCPcopy
hub / github.com/ddbourgin/numpy-ml / fit

Method fit

numpy_ml/preprocessing/nlp.py:691–767  ·  view source on GitHub ↗

Compute term-frequencies and inverse document frequencies on a collection of documents. Parameters: `corpus_seq` (str or list of strs) — the filepath / list of filepaths / raw string contents of the document(s) to be encoded, in accordance with the `input_type` parameter passed to the `__init__` method.

(self, corpus_seq, encoding="utf-8-sig")

Source from the content-addressed store, hash-verified

689 }
690
691 def fit(self, corpus_seq, encoding="utf-8-sig"):
692 """
693 Compute term-frequencies and inverse document frequencies on a
694 collection of documents.
695
696 Parameters
697 ----------
698 corpus_seq : str or list of strs
699 The filepath / list of filepaths / raw string contents of the
700 document(s) to be encoded, in accordance with the `input_type`
701 parameter passed to the :meth:`__init__` method. Each document is
702 expected to be a string of tokens separated by whitespace.
703 encoding : str
704 Specifies the text encoding for corpus if `input_type` is `files`.
705 Common entries are either 'utf-8' (no header byte), or 'utf-8-sig'
706 (header byte). Default is 'utf-8-sig'.
707
708 Returns
709 -------
710 self
711 """
712 H = self.hyperparameters
713
714 if isinstance(corpus_seq, str):
715 corpus_seq = [corpus_seq]
716
717 if H["input_type"] == "files":
718 for corpus_fp in corpus_seq:
719 assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp)
720
721 tokens = []
722 idx2token, token2idx = {}, {}
723
724 # encode special tokens
725 for tt in ["<bol>", "<eol>", "<unk>"]:
726 token2idx[tt] = len(tokens)
727 idx2token[len(tokens)] = tt
728 tokens.append(Token(tt))
729
730 min_count = H["min_count"]
731 max_tokens = H["max_tokens"]
732 H["encoding"] = encoding
733
734 bol_ix = token2idx["<bol>"]
735 eol_ix = token2idx["<eol>"]
736 idx2doc, term_freq = {}, {}
737
738 # encode the text in `corpus_seq` without any filtering ...
739 for d_ix, doc in enumerate(corpus_seq):
740 doc_count = {}
741 idx2doc[d_ix] = doc if H["input_type"] == "files" else None
742 token2idx, idx2token, tokens, doc_count = self._encode_document(
743 doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix,
744 )
745 term_freq[d_ix] = doc_count
746
747 self._tokens = tokens
748 self._idx2doc = idx2doc

Callers 1

test_tfidf Function · 0.95

Calls 6

_encode_document Method · 0.95
_keep_top_n_tokens Method · 0.95
_drop_low_freq_tokens Method · 0.95
_sort_tokens Method · 0.95
_calc_idf Method · 0.95
Token Class · 0.85

Tested by 1

test_tfidf Function · 0.76