(
self, idx: int, record, uri_id: str, snapshot_id: str
)
| 215 | return callables |
| 216 | |
| 217 | def __process_record( |
| 218 | self, idx: int, record, uri_id: str, snapshot_id: str |
| 219 | ): |
| 220 | # Setup document; this precomputes ngrams and hash features |
| 221 | document = Document( |
| 222 | record.raw_content, |
| 223 | domain=record.source_domain, |
| 224 | precompute_ngrams=True, |
| 225 | precompute_hash_features=True, |
| 226 | dsir_buckets=self._dsir_buckets |
| 227 | ) |
| 228 | |
| 229 | # compute signals |
| 230 | rp_v2_signals = {} |
| 231 | for func in self._quality_signals: |
| 232 | rp_v2_signals[func.field_name] = func(document) # noqa |
| 233 | |
| 234 | # compute minhash signatures |
| 235 | minhash_signatures = self._minhash.compute_banded_signatures( |
| 236 | tokens=document.normalized_words |
| 237 | ) |
| 238 | |
| 239 | # compute document ids |
| 240 | doc_id = f"{uri_id}/{idx}" |
| 241 | doc_id_int = int.from_bytes( |
| 242 | hashlib.sha1(doc_id.encode("utf-8")).digest()[:8], # take 8 bytes |
| 243 | byteorder=_BYTE_ORDER, signed=False |
| 244 | ) |
| 245 | |
| 246 | record_data = { |
| 247 | "id": f"{uri_id}/{idx}", |
| 248 | "id_int": doc_id_int, |
| 249 | } |
| 250 | |
| 251 | metadata = { |
| 252 | "cc_segment": record.cc_segment, |
| 253 | "cc_net_source": uri_id, |
| 254 | "url": record.url, |
| 255 | "source_domain": record.source_domain, |
| 256 | "language": record.language, |
| 257 | "snapshot_id": snapshot_id |
| 258 | } |
| 259 | |
| 260 | ccnet_quality_signals = { |
| 261 | "ccnet_length": ( |
| 262 | (0, len(document), float(record.length)), |
| 263 | ), |
| 264 | "ccnet_original_length": ( |
| 265 | (0, len(document), float(record.original_length)), |
| 266 | ), |
| 267 | "ccnet_nlines": ( |
| 268 | (0, len(document), float(record.nlines)), |
| 269 | ), |
| 270 | "ccnet_original_nlines": ( |
| 271 | (0, len(document), float(record.original_nlines)), |
| 272 | ), |
| 273 | "ccnet_language_score": ( |
| 274 | (0, len(document), float(record.language_score)), |
No test coverage detected.