hub / github.com/togethercomputer/RedPajama-Data / __init__

Method __init__

app/src/core/document.py:49–107 · view source on GitHub ↗

(
            self, content: str, domain: Optional[str],
            precompute_ngrams: bool = False,
            precompute_hash_features: bool = False,
            dsir_buckets: Optional[int] = None
    )

Source from the content-addressed store, hash-verified

47	)
48
49	def __init__(
50	self, content: str, domain: Optional[str],
51	precompute_ngrams: bool = False,
52	precompute_hash_features: bool = False,
53	dsir_buckets: Optional[int] = None
54	):
55	self._raw_content = content
56	self._domain = domain
57
58	# the normalized content: lowercased and punctuation removed
59	self._normalized_content = normalize(content)
60
61	# the lines of the document (split by newline)
62	self._raw_lines: Tuple[TextSlice] = split_paragraphs(
63	text=content, normalizer=lambda x: x, remove_empty=False
64	)
65
66	# the lines of the document (split by newline), normalized
67	self._normalized_lines: Tuple[TextSlice] = split_paragraphs(
68	text=content, normalizer=normalize, remove_empty=False
69	)
70
71	# the raw words of the document (tokenized before normalization)
72	self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content))
73
74	# the normalized words of the document (split by whitespace)
75	self._normalized_words = tuple(self._normalized_content.split())
76
77	# get number of words before and after normalization
78	self._num_raw_words = len(self._raw_words)
79	self._num_normalized_words = len(self._normalized_words)
80
81	# precompute ngrams
82	if precompute_ngrams:
83	# raw grams
84	self._raw_2grams = _compute_ngrams(self._raw_words, 2)
85	self._raw_3grams = _compute_ngrams(self._raw_words, 3)
86
87	# normalized grams
88	self._norm_2grams = _compute_ngrams(self._normalized_words, 2)
89	self._norm_3grams = _compute_ngrams(self._normalized_words, 3)
90	self._norm_4grams = _compute_ngrams(self._normalized_words, 4)
91	else:
92	self._raw_2grams = None
93	self._raw_3grams = None
94	self._norm_2grams = None
95	self._norm_3grams = None
96	self._norm_4grams = None
97
98	# precompute hash features
99	if precompute_hash_features:
100	bigrams = self._raw_2grams or _compute_ngrams(self._raw_words, 2)
101	self._hash_features = hash_feature(
102	unigrams=self._raw_words,
103	bigrams=bigrams,
104	buckets=dsir_buckets
105	)
106	else:

Callers

nothing calls this directly

Calls 4

normalizeFunction · 0.90

hash_featureFunction · 0.90

split_paragraphsFunction · 0.85

_compute_ngramsFunction · 0.85

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 4

Tested by

Method __init__