(self, cc_file, verbose=False)
| 194 | self.lid = lid |
| 195 | |
def parse(self, cc_file, verbose=False):
    """Scan a gzip-compressed file for WARC target URIs and their JSON lines.

    The file is read line by line. Each ``WARC-Target-URI`` header arms the
    scanner; the first subsequent line that starts with ``{`` is handed to
    ``self.parse_json`` together with that URI and the shared result list.
    Any further ``{`` lines are ignored until the next header is seen.

    Args:
        cc_file: Path (or file object) accepted by ``gzip.open``.
        verbose: Accepted for interface compatibility; not used by this
            method.

    Returns:
        The list that was passed to ``self.parse_json`` for every matched
        record. What ends up in it is decided by ``parse_json``.
    """
    header = "WARC-Target-URI:"
    data = []
    target_uri = None
    looking_for_json = False
    with gzip.open(cc_file) as fr:
        for raw in fr:
            try:
                line = raw.decode().strip()
            except UnicodeDecodeError as e:
                # Best effort: report the undecodable line and keep going.
                print(e)
                continue
            if line.startswith(header):
                # Strip instead of slicing a fixed width so the URI survives
                # headers written with no (or extra) space after the colon.
                target_uri = line[len(header):].strip()
                looking_for_json = True
            if looking_for_json and line.startswith("{"):
                # Disarm first: only one JSON line is consumed per header.
                looking_for_json = False
                self.parse_json(line, target_uri, data)
    return data
| 215 | |
| 216 | def parse_json(self, line, target_uri, data): |
| 217 | try: |
no test coverage detected