Method create_examples

tasks/seq2seq/dataset.py:154–187 · view source on GitHub ↗

(self, split)

Source from the content-addressed store, hash-verified

152	self.tokenizer = tokenizer
153
def create_examples(self, split):
    """Build the example list for one split of the dataset.

    Loads ``<split>.json`` from ``self.data_dir``, walks the
    ``data -> paragraphs -> qas`` nesting of the file and emits one
    ``InputExample`` per distinct answer text of every question. For the
    ``"test"`` split the answers in the file are ignored and a single
    ``"FAKE_ANSWER"`` placeholder is used instead.

    Args:
        split: Name of the split; one of ``"train"``, ``"dev"``, ``"test"``.

    Returns:
        A list of ``InputExample`` objects. Each has ``text_a`` set to the
        paragraph context, a guid of the form ``"<split>-<running index>"``
        and a ``meta`` dict with the keys ``"answer"``, ``"question"`` and
        ``"ref"`` (the answer text after an encode/decode round trip through
        ``self.tokenizer``).

    Raises:
        NotImplementedError: If ``split`` is not one of the known names.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"

    print_rank_0(f"Creating CMRC-{split} dataset from {self.data_dir}")
    example_list = []
    idx = 0
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                if split != 'test':
                    # De-duplicate while keeping first-occurrence order.
                    # Iterating a set of str here would follow an order that
                    # depends on per-process hash randomization, so the
                    # guid -> answer assignment could differ between runs
                    # or between worker processes.
                    answers = list(dict.fromkeys(
                        answer['text'] for answer in qa["answers"]))
                else:
                    answers = ["FAKE_ANSWER"]
                for answer in answers:
                    guid = f"{split}-{idx}"
                    # "ref" is the answer as the tokenizer reproduces it
                    # after encoding to ids and decoding back to text.
                    token_ids = self.tokenizer.EncodeAsIds(answer).tokenization
                    meta = {
                        "answer": answer,
                        "question": question,
                        "ref": self.tokenizer.DecodeIds(token_ids),
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    # Dump the first ten examples as a quick sanity check.
                    if idx < 10:
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
188
189
190	class SQuADProcessor:

nothing calls this directly

print_rank_0Function · 0.90

InputExampleClass · 0.90

loadMethod · 0.80

appendMethod · 0.80

DecodeIdsMethod · 0.45

EncodeAsIdsMethod · 0.45

encodeMethod · 0.45

no test coverage detected