MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / _calc_idf

Method _calc_idf

numpy_ml/preprocessing/nlp.py:934–962  ·  view source on GitHub ↗

Compute the (smoothed-) inverse-document frequency for each token in the corpus. For a word token `w`, the IDF is simply IDF(w) = log ( |D| / |{ d in D: w in d }| ) + 1, where D is the set of all documents in the corpus, D = {d1, d2, ..., dD}.

(self)

Source from the content-addressed store, hash-verified

932 self.vocab_counts = Counter({t.word: t.count for t in self._tokens})
933
def _calc_idf(self):
    """
    Compute the (smoothed-) inverse-document frequency for each token in
    the corpus.

    For a word token `w`, the IDF is simply

        IDF(w) = log ( |D| / |{ d in D: w in d }| ) + 1

    where D is the set of all documents in the corpus,

        D = {d1, d2, ..., dD}

    If `smooth_idf` is True, we perform additive smoothing on the number of
    documents containing a given word, equivalent to pretending that there
    exists a final D+1st document that contains every word in the corpus:

        SmoothedIDF(w) = log ( (|D| + 1) / [1 + |{ d in D: w in d }|] ) + 1

    Notes
    -----
    Reads ``self.hyperparameters["smooth_idf"]``, ``self.term_freq``,
    ``self._idx2doc`` and ``self.token2idx``. The result is stored in
    ``self.inv_doc_freq`` as a dict mapping each token index to its IDF;
    nothing is returned.

    A token that occurs in no document (only possible when smoothing is
    disabled) is assigned an IDF of 1 instead of dividing by zero.
    """
    # 1 when smoothing is enabled, 0 otherwise: the imaginary extra document
    # is added to both the corpus size and every per-token document count.
    smoothing = int(self.hyperparameters["smooth_idf"])

    # Fetch each document's term-frequency entry once, rather than once per
    # (token, document) pair inside the loop below.
    tf = self.term_freq
    doc_term_freqs = [tf[d_ix] for d_ix in self._idx2doc]
    n_docs = len(doc_term_freqs) + smoothing

    inv_doc_freq = {}
    for w_ix in self.token2idx.values():
        # Number of documents (plus the smoothing document) containing w_ix.
        d_count = smoothing + sum(
            1 for doc_tf in doc_term_freqs if w_ix in doc_tf
        )
        inv_doc_freq[w_ix] = 1 if d_count == 0 else np.log(n_docs / d_count) + 1
    self.inv_doc_freq = inv_doc_freq
963
964 def transform(self, ignore_special_chars=True):
965 """

Callers 1

fitMethod · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected