(self, wat_fn, snapshot, warc_id)
| 19 | KOI = ["alt", "title", "data-image-title"] |
| 20 | |
def parse_wat(self, wat_fn, snapshot, warc_id):
    """Scan a gzip-compressed file and hand each record's JSON line to ``parse_json``.

    The file is read line by line. Every ``WARC-Target-URI`` header line
    arms the scanner; the first following line that starts with ``{`` is
    passed to ``self.parse_json`` together with that URI, after which the
    scanner is disarmed until the next header is seen.

    Args:
        wat_fn: Path of the gzip file to read (presumably a Common Crawl
            WAT file, going by the name -- confirm against the caller).
        snapshot: Forwarded untouched to ``self.parse_json``.
        warc_id: Forwarded untouched to ``self.parse_json``.

    Returns:
        The list that was passed to ``self.parse_json`` as its ``data``
        argument on every call; this method itself never appends to it.
    """
    header_prefix = "WARC-Target-URI:"
    data = []
    target_uri = None
    looking_for_json = False

    with gzip.open(wat_fn) as fr:
        for raw_line in fr:
            try:
                line = raw_line.decode().strip()
            except UnicodeDecodeError as e:
                # Best effort: report the undecodable line and keep going.
                print(e)
                continue

            if line.startswith(header_prefix):
                # Split on the colon rather than slicing a fixed width, so
                # the URI survives intact whether or not a space follows it.
                target_uri = line[len(header_prefix):].strip()
                looking_for_json = True
                continue

            if looking_for_json and line.startswith("{"):
                # Only the first JSON-looking line after a header is used.
                looking_for_json = False
                self.parse_json(line, target_uri, snapshot, warc_id, data)

    return data
| 40 | |
| 41 | def parse_json(self, line, target_uri, snapshot, warc_id, data): |
| 42 | try: |
no test coverage detected