| 38 | |
| 39 | |
| 40 | class Document: |
| 41 | __slots__ = ( |
| 42 | "_raw_content", "_normalized_content", "_raw_lines", |
| 43 | "_normalized_lines", "_raw_words", "_normalized_words", |
| 44 | "_num_raw_words", "_num_normalized_words", "_domain", "_raw_2grams", |
| 45 | "_raw_3grams", "_norm_2grams", "_norm_3grams", "_norm_4grams", |
| 46 | "_hash_features" |
| 47 | ) |
| 48 | |
| 49 | def __init__( |
| 50 | self, content: str, domain: Optional[str], |
| 51 | precompute_ngrams: bool = False, |
| 52 | precompute_hash_features: bool = False, |
| 53 | dsir_buckets: Optional[int] = None |
| 54 | ): |
| 55 | self._raw_content = content |
| 56 | self._domain = domain |
| 57 | |
| 58 | # the normalized content: lowercased and punctuation removed |
| 59 | self._normalized_content = normalize(content) |
| 60 | |
| 61 | # the lines of the document (split by newline) |
| 62 | self._raw_lines: Tuple[TextSlice] = split_paragraphs( |
| 63 | text=content, normalizer=lambda x: x, remove_empty=False |
| 64 | ) |
| 65 | |
| 66 | # the lines of the document (split by newline), normalized |
| 67 | self._normalized_lines: Tuple[TextSlice] = split_paragraphs( |
| 68 | text=content, normalizer=normalize, remove_empty=False |
| 69 | ) |
| 70 | |
| 71 | # the raw words of the document (tokenized from the raw content) |
| 72 | self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content)) |
| 73 | |
| 74 | # the normalized words of the document (split by whitespace) |
| 75 | self._normalized_words = tuple(self._normalized_content.split()) |
| 76 | |
| 77 | # get number of words before and after normalization |
| 78 | self._num_raw_words = len(self._raw_words) |
| 79 | self._num_normalized_words = len(self._normalized_words) |
| 80 | |
| 81 | # precompute ngrams |
| 82 | if precompute_ngrams: |
| 83 | # raw grams |
| 84 | self._raw_2grams = _compute_ngrams(self._raw_words, 2) |
| 85 | self._raw_3grams = _compute_ngrams(self._raw_words, 3) |
| 86 | |
| 87 | # normalized grams |
| 88 | self._norm_2grams = _compute_ngrams(self._normalized_words, 2) |
| 89 | self._norm_3grams = _compute_ngrams(self._normalized_words, 3) |
| 90 | self._norm_4grams = _compute_ngrams(self._normalized_words, 4) |
| 91 | else: |
| 92 | self._raw_2grams = None |
| 93 | self._raw_3grams = None |
| 94 | self._norm_2grams = None |
| 95 | self._norm_3grams = None |
| 96 | self._norm_4grams = None |
| 97 |
no outgoing calls
no test coverage detected