MCPcopy
hub / github.com/facebookresearch/MetaCLIP / parse_wat

Method parse_wat

metaclip/curation/parse_wat.py:21–39  ·  view source on GitHub ↗
(self, wat_fn, snapshot, warc_id)

Source from the content-addressed store, hash-verified

19 KOI = ["alt", "title", "data-image-title"]
20
def parse_wat(self, wat_fn, snapshot, warc_id):
    """Scan a gzip-compressed WAT file and forward its JSON lines.

    The file is read line by line. Every line starting with
    ``WARC-Target-URI`` updates the current target URI; the first later
    line that starts with ``{`` is passed to ``self.parse_json`` together
    with that URI, ``snapshot``, ``warc_id`` and the shared result list.
    Further ``{`` lines are ignored until the next URI header is seen.

    Args:
        wat_fn: Path (or file object) accepted by ``gzip.open``.
        snapshot: Opaque value forwarded unchanged to ``self.parse_json``.
        warc_id: Opaque value forwarded unchanged to ``self.parse_json``.

    Returns:
        The list handed to every ``self.parse_json`` call. This method
        never appends to it itself; presumably ``parse_json`` fills it
        in place (confirm against that method).
    """
    data = []
    with gzip.open(wat_fn) as fr:
        looking_for_json = False
        target_uri = None
        for raw_line in fr:
            try:
                line = raw_line.decode().strip()
            except UnicodeDecodeError as e:
                # Best effort: report the undecodable line and keep going.
                print(e)
                continue
            if line.startswith("WARC-Target-URI"):
                target_uri = line[len("WARC-Target-URI: ") :]
                looking_for_json = True
            elif looking_for_json and line.startswith("{"):
                # Only the first JSON line after a URI header is consumed.
                looking_for_json = False
                self.parse_json(line, target_uri, snapshot, warc_id, data)
    return data
40
41 def parse_json(self, line, target_uri, snapshot, warc_id, data):
42 try:

Callers 1

parseFunction · 0.80

Calls 2

parse_jsonMethod · 0.95
decodeMethod · 0.80

Tested by

no test coverage detected