hub / github.com/THUDM/GLM / _create_examples

Method _create_examples

tasks/superglue/dataset.py:432–517 · view source on GitHub ↗

(self, path: str, set_type: str, cloze_eval=True)

Source from the content-addressed store, hash-verified

430	return text_a, text_b
431
432	def _create_examples(self, path: str, set_type: str, cloze_eval=True) -> List[InputExample]:
433	examples = []
434
435	with open(path, encoding='utf8') as f:
436	for line in f:
437	example_json = json.loads(line)
438	idx = example_json['idx']
439	label = str(example_json['label']) if 'label' in example_json else None
440	guid = "%s-%s" % (set_type, idx)
441	text_a = punctuation_standardization(example_json['text'])
442	meta = {
443	'span1_text': example_json['target']['span1_text'],
444	'span2_text': example_json['target']['span2_text'],
445	'span1_index': example_json['target']['span1_index'],
446	'span2_index': example_json['target']['span2_index']
447	}
448	if 'candidates' in example_json:
449	candidates = [cand['text'] for cand in example_json['candidates']]
450	# candidates = list(set(candidates))
451	filtered = []
452	for i, cand in enumerate(candidates):
453	if not cand in candidates[:i]:
454	filtered.append(cand)
455	candidates = filtered
456
457	# the indices in the dataset are wrong for some examples, so we manually fix them
458	span1_index, span1_text = meta['span1_index'], meta['span1_text']
459	span2_index, span2_text = meta['span2_index'], meta['span2_text']
460	words_a = text_a.split()
461	words_a_lower = text_a.lower().split()
462	words_span1_text = span1_text.lower().split()
463	span1_len = len(words_span1_text)
464
465	if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
466	for offset in [-1, +1]:
467	if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
468	span1_index += offset
469
470	# if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
471	# print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
472	# f"'{words_span1_text}' at index {span1_index} for '{words_a}'")
473
474	if words_a[span2_index] != span2_text:
475	for offset in [-1, +1]:
476	if words_a[span2_index + offset] == span2_text:
477	span2_index += offset
478
479	if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
480	words_a = words_a[:span2_index] \
481	+ [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
482	+ words_a[span2_index + 1:]
483
484	assert words_a[span2_index] == span2_text, \
485	f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"
486
487	text_a = ' '.join(words_a)
488	meta['span1_index'], meta['span2_index'] = span1_index, span2_index
489

Callers 1

get_train_examples · Method · 0.95

Calls 3

punctuation_standardization · Function · 0.90

InputExample · Class · 0.90

append · Method · 0.80

Tested by

no test coverage detected