(self, split)
| 111 | self.tokenizer = tokenizer |
| 112 | |
def create_examples(self, split):
    """Build the list of generation examples for one CMRC data split.

    Reads ``<split>.json`` from ``self.data_dir`` and emits one
    ``InputExample`` per distinct (question, answer text) pair. The file is
    walked as ``data -> paragraphs -> qas -> answers``.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects whose ``text_a`` is the paragraph
        context and whose ``meta`` holds ``"answer"``, ``"question"`` and
        ``"ref"`` (the answer text after an encode/decode round trip through
        ``self.tokenizer``).

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"
    print_rank_0(f"Creating CMRC-{split} dataset from {self.data_dir}")

    # Parse the whole file up front so the handle is released before the
    # (tokenizer-heavy) example construction starts.
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    example_list = []
    idx = 0
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                if split == 'test':
                    # The test branch never reads qa["answers"]; a
                    # placeholder answer is used instead.
                    answers = ["FAKE_ANSWER"]
                else:
                    # Deduplicate while keeping first-seen order. A plain set
                    # would iterate in an order that depends on per-process
                    # string hash randomization, making guids and example
                    # order differ between runs and between ranks.
                    answers = list(dict.fromkeys(
                        answer['text'] for answer in qa["answers"]))
                for answer in answers:
                    guid = f"{split}-{idx}"
                    meta = {
                        "answer": answer,
                        "question": question,
                        "ref": self.tokenizer.DecodeIds(
                            self.tokenizer.EncodeAsIds(answer).tokenization),
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    if idx < 10:
                        # Log the first few examples for a quick sanity check.
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
| 147 | |
| 148 | |
| 149 | class SQuADGenerationProcessor: |
no test coverage detected