Method create_examples

tasks/seq2seq/dataset.py:113–146 · view source on GitHub ↗

(self, split)

Source from the content-addressed store, hash-verified

111	self.tokenizer = tokenizer
112
def create_examples(self, split):
    """Build one example per distinct answer of every question in a split.

    Reads ``train.json``, ``dev.json`` or ``test.json`` from
    ``self.data_dir`` and walks ``data -> paragraphs -> qas``. Each
    paragraph's ``context`` becomes ``text_a``; the question, the answer
    text and the answer round-tripped through ``self.tokenizer`` go into
    ``meta``.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects with guids ``"<split>-<idx>"``,
        where ``idx`` counts examples from 0 in creation order.

    Raises:
        NotImplementedError: If ``split`` is not a recognised split name.
    """
    if split == "train":
        filename = "train.json"
    elif split == "dev":
        filename = "dev.json"
    elif split == "test":
        filename = "test.json"
    else:
        raise NotImplementedError(split)
    print_rank_0(f"Creating CMRC-{split} dataset from {self.data_dir}")
    example_list = []
    idx = 0
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                if split != 'test':
                    # Deduplicate while keeping first-occurrence order.
                    # Iterating a set of strings depends on per-process
                    # hash randomisation, which would make the example
                    # order and guid assignment differ between runs and
                    # between ranks.
                    answers = list(dict.fromkeys(
                        answer['text'] for answer in qa["answers"]))
                else:
                    # The test split's answers are not read; a single
                    # placeholder is used instead.
                    answers = ["FAKE_ANSWER"]
                for answer in answers:
                    guid = "%s-%s" % (split, idx)
                    # "ref" is the answer after an encode/decode round
                    # trip through the tokenizer.
                    ref = self.tokenizer.DecodeIds(
                        self.tokenizer.EncodeAsIds(answer).tokenization)
                    meta = {
                        "answer": answer,
                        "question": question,
                        "ref": ref,
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    if idx < 10:
                        # Log the first few examples for a quick sanity check.
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
147
148
149	class SQuADGenerationProcessor:

__init__Method · 0.45

print_rank_0Function · 0.90

InputExampleClass · 0.90

loadMethod · 0.80

appendMethod · 0.80

DecodeIdsMethod · 0.45

EncodeAsIdsMethod · 0.45

encodeMethod · 0.45

no test coverage detected