(self, json_line)
| 77 | return json.dumps(output), len(json_line) |
| 78 | |
def encode(self, json_line):
    """Tokenize the configured fields of one JSON-lines record.

    Args:
        json_line: One JSON document as a string. Every key listed in
            ``self.args.json_keys`` is looked up in it; the value may be
            a single text or a list of sentences.

    Returns:
        A tuple ``(ids, lens, size)``:
        ``ids[key]`` is the flat list of token ids for that field,
        ``lens[key]`` holds the token count of each non-empty sentence
        (the appended EOD token, if any, is counted in the last one), and
        ``size`` is ``len(json_line)``.

    Raises:
        KeyError: If a configured key is missing from the record.
        ValueError: If ``json_line`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    data = json.loads(json_line)
    ids = {}
    lens = {}
    for key in self.args.json_keys:
        text = data[key]
        # A field is either already split into sentences or a single text.
        sentences = text if isinstance(text, list) else [text]
        doc_ids = []
        sentence_lens = []
        for sentence in sentences:
            sentence_ids = Encoder.tokenizer.tokenize(sentence)
            # Sentences that tokenize to nothing contribute no entry.
            if len(sentence_ids) > 0:
                doc_ids.extend(sentence_ids)
                sentence_lens.append(len(sentence_ids))
        if len(doc_ids) > 0 and self.args.append_eod:
            doc_ids.append(Encoder.tokenizer.eod)
            # Account for the EOD token in the final sentence so that
            # sum(sentence_lens) stays equal to len(doc_ids). A non-empty
            # doc_ids guarantees sentence_lens has at least one entry.
            sentence_lens[-1] += 1
        ids[key] = doc_ids
        lens[key] = sentence_lens
    return ids, lens, len(json_line)
| 101 | |
| 102 | |
| 103 | class Partition(object): |
no test coverage detected