(self, json_line)
| 67 | Encoder.splitter = IdentitySplitter() |
| 68 | |
def split(self, json_line):
    """Split the configured text fields of one JSON record into pieces.

    Parses ``json_line`` and, for every key in ``self.args.json_keys``,
    passes the field's text to ``Encoder.splitter.tokenize`` in consecutive
    slices of at most 1,000,000 characters. The pieces returned for each
    slice are concatenated in order.

    Args:
        json_line: A string holding one JSON object; it must contain every
            key listed in ``self.args.json_keys`` (a missing key raises
            ``KeyError``).

    Returns:
        A ``(serialized, length)`` tuple: ``serialized`` is the JSON dump of
        a mapping from each key to its list of pieces, and ``length`` is
        ``len(json_line)``.
    """
    record = json.loads(json_line)
    chunk_size = 1000000
    pieces_by_key = {}
    for field in self.args.json_keys:
        content = record[field]
        pieces = []
        # Feed the splitter one fixed-size slice at a time
        # (presumably to bound the input size it sees -- TODO confirm).
        for start in range(0, len(content), chunk_size):
            pieces.extend(
                Encoder.splitter.tokenize(content[start:start + chunk_size])
            )
        pieces_by_key[field] = pieces
    return json.dumps(pieces_by_key), len(json_line)
| 78 | |
| 79 | def encode(self, json_line): |
| 80 | data = json.loads(json_line) |