(self, split)
| 193 | self.tokenizer = tokenizer |
| 194 | |
def create_examples(self, split):
    """Build one ``InputExample`` per distinct answer of every question in a split.

    Reads ``train.json``, ``dev.json`` or ``test.json`` from ``self.data_dir``.
    The loaded JSON is iterated directly as a sequence of entries, each with
    a ``"paragraphs"`` list; every paragraph has a ``"context"`` string and a
    ``"qas"`` list whose items carry ``"question"`` and ``"answers"`` (each
    answer with ``"text"`` and ``"answer_start"``).

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects. Each has ``guid`` set to
        ``"<split>-<running index>"``, ``text_a`` set to the paragraph
        context, and ``meta`` holding ``answer_start``, ``answer``,
        ``question`` and ``ref`` (the question after an encode/decode
        round-trip through ``self.tokenizer``).

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"
    print_rank_0(f"Creating SQuAD-{split} dataset from {self.data_dir}")
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)
    example_list = []
    idx = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                # Deduplicate answers by text. A dict keeps first-seen order,
                # whereas iterating a set of strings depends on per-process
                # hash randomization and would make guids and example order
                # differ between runs and between ranks. When a text repeats,
                # the answer_start of its last occurrence is kept.
                answer_starts = {answer["text"]: answer["answer_start"] for answer in qa["answers"]}
                if not answer_starts:
                    # No answers: nothing to emit, so skip the tokenizer call too.
                    continue
                # The reference text depends only on the question, so compute
                # it once instead of once per answer.
                ref = self.tokenizer.DecodeIds(self.tokenizer.EncodeAsIds(question).tokenization)
                for answer, answer_start in answer_starts.items():
                    guid = f"{split}-{idx}"
                    meta = {
                        "answer_start": answer_start,
                        "answer": answer,
                        "question": question,
                        "ref": ref,
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    if idx < 10:
                        # Log the first few examples for a quick sanity check.
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
| 231 | |
| 232 | |
| 233 | def generate_token_to_char_map(tokens, raw_text, tokenizer): |
nothing calls this directly
no test coverage detected