MCPcopy Index your code
hub / github.com/togethercomputer/RedPajama-Data / Document

Class Document

app/src/core/document.py:40–178  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

38
39
40class Document:
41 __slots__ = (
42 "_raw_content", "_normalized_content", "_raw_lines",
43 "_normalized_lines", "_raw_words", "_normalized_words",
44 "_num_raw_words", "_num_normalized_words", "_domain", "_raw_2grams",
45 "_raw_3grams", "_norm_2grams", "_norm_3grams", "_norm_4grams",
46 "_hash_features"
47 )
48
49 def __init__(
50 self, content: str, domain: Optional[str],
51 precompute_ngrams: bool = False,
52 precompute_hash_features: bool = False,
53 dsir_buckets: Optional[int] = None
54 ):
55 self._raw_content = content
56 self._domain = domain
57
58 # the normalized content: lowercased and punctuation removed
59 self._normalized_content = normalize(content)
60
61 # the lines of the document (split by newline)
62 self._raw_lines: Tuple[TextSlice] = split_paragraphs(
63 text=content, normalizer=lambda x: x, remove_empty=False
64 )
65
66 # the lines of the document (split by newline), normalized
67 self._normalized_lines: Tuple[TextSlice] = split_paragraphs(
68 text=content, normalizer=normalize, remove_empty=False
69 )
70
71 # the words of the document before normalization (tokenized raw content)
72 self._raw_words = tuple(_word_tokenizer.tokenize(self._raw_content))
73
74 # the normalized words of the document (split by whitespace)
75 self._normalized_words = tuple(self._normalized_content.split())
76
77 # get number of words before and after normalization
78 self._num_raw_words = len(self._raw_words)
79 self._num_normalized_words = len(self._normalized_words)
80
81 # precompute ngrams
82 if precompute_ngrams:
83 # raw grams
84 self._raw_2grams = _compute_ngrams(self._raw_words, 2)
85 self._raw_3grams = _compute_ngrams(self._raw_words, 3)
86
87 # normalized grams
88 self._norm_2grams = _compute_ngrams(self._normalized_words, 2)
89 self._norm_3grams = _compute_ngrams(self._normalized_words, 3)
90 self._norm_4grams = _compute_ngrams(self._normalized_words, 4)
91 else:
92 self._raw_2grams = None
93 self._raw_3grams = None
94 self._norm_2grams = None
95 self._norm_3grams = None
96 self._norm_4grams = None
97

Callers 3

__process_record · Method · 0.90
__write_train_chunk · Method · 0.90
_compute_hash_features · Function · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected