(
self, content: str, domain: Optional[str],
precompute_ngrams: bool = False,
precompute_hash_features: bool = False,
dsir_buckets: Optional[int] = None
)
| 47 | ) |
| 48 | |
| 49 | def __init__( |
| 50 | self, content: str, domain: Optional[str], |
| 51 | precompute_ngrams: bool = False, |
| 52 | precompute_hash_features: bool = False, |
| 53 | dsir_buckets: Optional[int] = None |
| 54 | ): |
| 55 | self._raw_content = content |
| 56 | self._domain = domain |
| 57 | |
| 58 | # the normalized content: lowercased and punctuation removed |
| 59 | self._normalized_content = normalize(content) |
| 60 | |
| 61 | # the lines of the document (split by newline) |
| 62 | self._raw_lines: Tuple[TextSlice] = split_paragraphs( |
| 63 | text=content, normalizer=lambda x: x, remove_empty=False |
| 64 | ) |
| 65 | |
| 66 | # the lines of the document (split by newline), normalized |
| 67 | self._normalized_lines: Tuple[TextSlice] = split_paragraphs( |
| 68 | text=content, normalizer=normalize, remove_empty=False |
| 69 | ) |
| 70 | |
| 71 | # the raw words of the document (tokenized from the raw content, before normalization) |
| 72 | self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content)) |
| 73 | |
| 74 | # the normalized words of the document (split by whitespace) |
| 75 | self._normalized_words = tuple(self._normalized_content.split()) |
| 76 | |
| 77 | # get number of words before and after normalization |
| 78 | self._num_raw_words = len(self._raw_words) |
| 79 | self._num_normalized_words = len(self._normalized_words) |
| 80 | |
| 81 | # precompute ngrams |
| 82 | if precompute_ngrams: |
| 83 | # raw grams |
| 84 | self._raw_2grams = _compute_ngrams(self._raw_words, 2) |
| 85 | self._raw_3grams = _compute_ngrams(self._raw_words, 3) |
| 86 | |
| 87 | # normalized grams |
| 88 | self._norm_2grams = _compute_ngrams(self._normalized_words, 2) |
| 89 | self._norm_3grams = _compute_ngrams(self._normalized_words, 3) |
| 90 | self._norm_4grams = _compute_ngrams(self._normalized_words, 4) |
| 91 | else: |
| 92 | self._raw_2grams = None |
| 93 | self._raw_3grams = None |
| 94 | self._norm_2grams = None |
| 95 | self._norm_3grams = None |
| 96 | self._norm_4grams = None |
| 97 | |
| 98 | # precompute hash features |
| 99 | if precompute_hash_features: |
| 100 | bigrams = self._raw_2grams or _compute_ngrams(self._raw_words, 2) |
| 101 | self._hash_features = hash_feature( |
| 102 | unigrams=self._raw_words, |
| 103 | bigrams=bigrams, |
| 104 | buckets=dsir_buckets |
| 105 | ) |
| 106 | else: |
nothing calls this directly
no test coverage detected