hub / github.com/togethercomputer/RedPajama-Data / Document

Class Document

app/src/core/document.py:40–178 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

38
39
40	class Document:
41	__slots__ = (
42	"_raw_content", "_normalized_content", "_raw_lines",
43	"_normalized_lines", "_raw_words", "_normalized_words",
44	"_num_raw_words", "_num_normalized_words", "_domain", "_raw_2grams",
45	"_raw_3grams", "_norm_2grams", "_norm_3grams", "_norm_4grams",
46	"_hash_features"
47	)
48
49	def __init__(
50	self, content: str, domain: Optional[str],
51	precompute_ngrams: bool = False,
52	precompute_hash_features: bool = False,
53	dsir_buckets: Optional[int] = None
54	):
55	self._raw_content = content
56	self._domain = domain
57
58	# the normalized content: lowercased and punctuation removed
59	self._normalized_content = normalize(content)
60
61	# the lines of the document (split by newline)
62	self._raw_lines: Tuple[TextSlice] = split_paragraphs(
63	text=content, normalizer=lambda x: x, remove_empty=False
64	)
65
66	# the lines of the document (split by newline), normalized
67	self._normalized_lines: Tuple[TextSlice] = split_paragraphs(
68	text=content, normalizer=normalize, remove_empty=False
69	)
70
71	# the raw (un-normalized) words of the document, produced by the word tokenizer
72	self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content))
73
74	# the normalized words of the document (split by whitespace)
75	self._normalized_words = tuple(self._normalized_content.split())
76
77	# get number of words before and after normalization
78	self._num_raw_words = len(self._raw_words)
79	self._num_normalized_words = len(self._normalized_words)
80
81	# precompute ngrams
82	if precompute_ngrams:
83	# raw grams
84	self._raw_2grams = _compute_ngrams(self._raw_words, 2)
85	self._raw_3grams = _compute_ngrams(self._raw_words, 3)
86
87	# normalized grams
88	self._norm_2grams = _compute_ngrams(self._normalized_words, 2)
89	self._norm_3grams = _compute_ngrams(self._normalized_words, 3)
90	self._norm_4grams = _compute_ngrams(self._normalized_words, 4)
91	else:
92	self._raw_2grams = None
93	self._raw_3grams = None
94	self._norm_2grams = None
95	self._norm_3grams = None
96	self._norm_4grams = None
97

Callers 3

__process_record · Method · 0.90

__write_train_chunk · Method · 0.90

_compute_hash_features · Function · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected