MCPcopy Index your code
hub / github.com/togethercomputer/RedPajama-Data / __init__

Method __init__

app/src/core/document.py:49–107  ·  view source on GitHub ↗
(
            self, content: str, domain: Optional[str],
            precompute_ngrams: bool = False,
            precompute_hash_features: bool = False,
            dsir_buckets: Optional[int] = None
    )

Source from the content-addressed store, hash-verified

47 )
48
49 def __init__(
50 self, content: str, domain: Optional[str],
51 precompute_ngrams: bool = False,
52 precompute_hash_features: bool = False,
53 dsir_buckets: Optional[int] = None
54 ):
55 self._raw_content = content
56 self._domain = domain
57
58 # the normalized content: lowercased and punctuation removed
59 self._normalized_content = normalize(content)
60
61 # the lines of the document (split by newline)
62 self._raw_lines: Tuple[TextSlice] = split_paragraphs(
63 text=content, normalizer=lambda x: x, remove_empty=False
64 )
65
66 # the lines of the document (split by newline), normalized
67 self._normalized_lines: Tuple[TextSlice] = split_paragraphs(
68 text=content, normalizer=normalize, remove_empty=False
69 )
70
71 # the words of the raw document (tokenized before normalization)
72 self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content))
73
74 # the normalized words of the document (split by whitespace)
75 self._normalized_words = tuple(self._normalized_content.split())
76
77 # get number of words before and after normalization
78 self._num_raw_words = len(self._raw_words)
79 self._num_normalized_words = len(self._normalized_words)
80
81 # precompute ngrams
82 if precompute_ngrams:
83 # raw grams
84 self._raw_2grams = _compute_ngrams(self._raw_words, 2)
85 self._raw_3grams = _compute_ngrams(self._raw_words, 3)
86
87 # normalized grams
88 self._norm_2grams = _compute_ngrams(self._normalized_words, 2)
89 self._norm_3grams = _compute_ngrams(self._normalized_words, 3)
90 self._norm_4grams = _compute_ngrams(self._normalized_words, 4)
91 else:
92 self._raw_2grams = None
93 self._raw_3grams = None
94 self._norm_2grams = None
95 self._norm_3grams = None
96 self._norm_4grams = None
97
98 # precompute hash features
99 if precompute_hash_features:
100 bigrams = self._raw_2grams or _compute_ngrams(self._raw_words, 2)
101 self._hash_features = hash_feature(
102 unigrams=self._raw_words,
103 bigrams=bigrams,
104 buckets=dsir_buckets
105 )
106 else:

Callers

nothing calls this directly

Calls 4

normalize · Function · 0.90
hash_feature · Function · 0.90
split_paragraphs · Function · 0.85
_compute_ngrams · Function · 0.85

Tested by

no test coverage detected