MCPcopy Index your code
hub / github.com/togethercomputer/RedPajama-Data / __process_record

Method __process_record

app/src/core/worker.py:217–289  ·  view source on GitHub ↗
(
            self, idx: int, record, uri_id: str, snapshot_id: str
    )

Source from the content-addressed store, hash-verified

215 return callables
216
217 def __process_record(
218 self, idx: int, record, uri_id: str, snapshot_id: str
219 ):
220 # Setup document; this precomputes ngrams and hash features
221 document = Document(
222 record.raw_content,
223 domain=record.source_domain,
224 precompute_ngrams=True,
225 precompute_hash_features=True,
226 dsir_buckets=self._dsir_buckets
227 )
228
229 # compute signals
230 rp_v2_signals = {}
231 for func in self._quality_signals:
232 rp_v2_signals[func.field_name] = func(document) # noqa
233
234 # compute minhash signatures
235 minhash_signatures = self._minhash.compute_banded_signatures(
236 tokens=document.normalized_words
237 )
238
239 # compute document ids
240 doc_id = f"{uri_id}/{idx}"
241 doc_id_int = int.from_bytes(
242 hashlib.sha1(doc_id.encode("utf-8")).digest()[:8], # take 8 bytes
243 byteorder=_BYTE_ORDER, signed=False
244 )
245
246 record_data = {
247 "id": f"{uri_id}/{idx}",
248 "id_int": doc_id_int,
249 }
250
251 metadata = {
252 "cc_segment": record.cc_segment,
253 "cc_net_source": uri_id,
254 "url": record.url,
255 "source_domain": record.source_domain,
256 "language": record.language,
257 "snapshot_id": snapshot_id
258 }
259
260 ccnet_quality_signals = {
261 "ccnet_length": (
262 (0, len(document), float(record.length)),
263 ),
264 "ccnet_original_length": (
265 (0, len(document), float(record.original_length)),
266 ),
267 "ccnet_nlines": (
268 (0, len(document), float(record.nlines)),
269 ),
270 "ccnet_original_nlines": (
271 (0, len(document), float(record.original_nlines)),
272 ),
273 "ccnet_language_score": (
274 (0, len(document), float(record.language_score)),

Callers 1

__process_uri Method · 0.95

Calls 3

Document Class · 0.90
_ccnet_bucket_to_int Function · 0.85

Tested by

no test coverage detected