Method create_examples

tasks/seq2seq/dataset.py:195–230 · view source on GitHub ↗

(self, split)

Source from the content-addressed store, hash-verified

193	self.tokenizer = tokenizer
194
def create_examples(self, split):
    """Build one ``InputExample`` per distinct answer text in a split's JSON file.

    Reads ``train.json``, ``dev.json`` or ``test.json`` from ``self.data_dir``
    and walks the nested ``paragraphs`` / ``qas`` / ``answers`` structure.
    Every distinct answer text of a question becomes one example whose
    ``text_a`` is the paragraph context and whose ``meta`` carries the answer,
    its start offset, the question, and ``ref`` (the question after an
    encode/decode round trip through ``self.tokenizer``).

    Examples are emitted in the order answers first appear in the file, so
    the guid given to each example is the same in every process and run.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects with guids ``"<split>-<index>"``.

    Raises:
        NotImplementedError: If ``split`` is not one of the names above.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"

    print_rank_0(f"Creating SQuAD-{split} dataset from {self.data_dir}")
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    example_list = []
    for article in dataset:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                # A dict keyed by answer text de-duplicates repeated answers
                # (the last duplicate's start offset wins) while keeping
                # first-appearance order. Iterating a set of strings instead
                # would follow hash order, which is randomized per interpreter
                # process and so would shuffle example order and guids.
                answer_starts = {
                    answer["text"]: answer["answer_start"]
                    for answer in qa["answers"]
                }
                if not answer_starts:
                    # No answers: nothing to emit, so skip the tokenizer call.
                    continue
                # The round trip depends only on the question, so do it once
                # per question rather than once per answer.
                # NOTE(review): assumes EncodeAsIds/DecodeIds are deterministic
                # and side-effect free for a given question -- confirm.
                ref = self.tokenizer.DecodeIds(
                    self.tokenizer.EncodeAsIds(question).tokenization)
                for answer, answer_start in answer_starts.items():
                    idx = len(example_list)
                    meta = {
                        "answer_start": answer_start,
                        "answer": answer,
                        "question": question,
                        "ref": ref,
                    }
                    # Echo the first few examples for manual inspection.
                    if idx < 10:
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), ref.encode('utf-8')))
                    example_list.append(
                        InputExample(guid=f"{split}-{idx}", text_a=context, meta=meta))

    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
231
232
233	def generate_token_to_char_map(tokens, raw_text, tokenizer):

nothing calls this directly

print_rank_0 (Function) · 0.90

InputExample (Class) · 0.90

load (Method) · 0.80

append (Method) · 0.80

DecodeIds (Method) · 0.45

EncodeAsIds (Method) · 0.45

encode (Method) · 0.45

no test coverage detected