MCPcopy
hub / github.com/facebookresearch/MetaCLIP / parse

Method parse

metaclip/metaclip1/cc_matching.py:196–214  ·  view source on GitHub ↗
(self, cc_file, verbose=False)

Source from the content-addressed store, hash-verified

194 self.lid = lid
195
def parse(self, cc_file, verbose=False):
    """Scan a gzip-compressed file and pass each record's JSON line to ``parse_json``.

    The file is read line by line. Every ``WARC-Target-URI:`` header line
    records the current target URI; the first subsequent line that starts
    with ``{`` is handed to ``self.parse_json`` together with that URI.
    Further ``{`` lines are ignored until the next ``WARC-Target-URI:``
    header is seen.

    Args:
        cc_file: Anything ``gzip.open`` accepts (it is opened in the
            default binary read mode).
        verbose: Accepted for interface compatibility; not used by this
            method.

    Returns:
        The list that was passed to ``self.parse_json`` as its ``data``
        argument on every call (whatever ``parse_json`` stored in it).
        Empty if no JSON line followed a URI header.
    """
    uri_header = "WARC-Target-URI:"
    data = []
    target_uri = None
    looking_for_json = False
    with gzip.open(cc_file) as fr:
        for raw_line in fr:
            try:
                line = raw_line.decode().strip()
            except UnicodeDecodeError as e:
                # Best effort: report the undecodable line and keep scanning.
                print(e)
                continue
            if line.startswith(uri_header):
                # Strip instead of assuming exactly one space after the
                # colon, so "WARC-Target-URI:http://..." is not truncated.
                target_uri = line[len(uri_header):].strip()
                looking_for_json = True
            elif looking_for_json and line.startswith("{"):
                # Only the first JSON-looking line per URI header is used.
                looking_for_json = False
                self.parse_json(line, target_uri, data)
    return data
215
216 def parse_json(self, line, target_uri, data):
217 try:

Callers 1

tokenizeFunction · 0.45

Calls 2

parse_jsonMethod · 0.95
decodeMethod · 0.80

Tested by

no test coverage detected