(self, split)
| 152 | self.tokenizer = tokenizer |
| 153 | |
def create_examples(self, split):
    """Build the list of CMRC examples for one dataset split.

    Reads ``train.json``, ``dev.json`` or ``test.json`` from
    ``self.data_dir`` and walks its ``data`` -> ``paragraphs`` -> ``qas``
    nesting, emitting one ``InputExample`` per distinct answer text of
    every question. For the ``test`` split the answers stored in the file
    are ignored and the single placeholder ``"FAKE_ANSWER"`` is used.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects. Each has ``guid`` set to
        ``"<split>-<running index>"``, ``text_a`` set to the paragraph
        context, and ``meta`` holding the answer, the question and ``ref``
        (the answer after an encode/decode round trip through
        ``self.tokenizer``).

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"
    print_rank_0(f"Creating CMRC-{split} dataset from {self.data_dir}")

    # Parse the whole file first so the handle is closed before the
    # (potentially slow) tokenization loop starts.
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    example_list = []
    idx = 0
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                if split == 'test':
                    answers = ["FAKE_ANSWER"]
                else:
                    # De-duplicate while keeping first-seen order. A set
                    # would iterate in a hash-dependent order that can
                    # differ between processes, making the guid -> answer
                    # mapping non-reproducible.
                    answers = list(dict.fromkeys(
                        answer['text'] for answer in qa["answers"]))
                for answer in answers:
                    guid = f"{split}-{idx}"
                    token_ids = self.tokenizer.EncodeAsIds(answer).tokenization
                    meta = {
                        "answer": answer,
                        "question": question,
                        "ref": self.tokenizer.DecodeIds(token_ids),
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    # Log the first few examples as a sanity check.
                    if idx < 10:
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
| 188 | |
| 189 | |
| 190 | class SQuADProcessor: |
nothing calls this directly
no test coverage detected