MCPcopy Index your code
hub / github.com/THUDM/GLM / _create_examples

Method _create_examples

tasks/superglue/dataset.py:432–517  ·  view source on GitHub ↗
(self, path: str, set_type: str, cloze_eval=True)

Source from the content-addressed store, hash-verified

430 return text_a, text_b
431
432 def _create_examples(self, path: str, set_type: str, cloze_eval=True) -> List[InputExample]:
433 examples = []
434
435 with open(path, encoding='utf8') as f:
436 for line in f:
437 example_json = json.loads(line)
438 idx = example_json['idx']
439 label = str(example_json['label']) if 'label' in example_json else None
440 guid = "%s-%s" % (set_type, idx)
441 text_a = punctuation_standardization(example_json['text'])
442 meta = {
443 'span1_text': example_json['target']['span1_text'],
444 'span2_text': example_json['target']['span2_text'],
445 'span1_index': example_json['target']['span1_index'],
446 'span2_index': example_json['target']['span2_index']
447 }
448 if 'candidates' in example_json:
449 candidates = [cand['text'] for cand in example_json['candidates']]
450 # candidates = list(set(candidates))
451 filtered = []
452 for i, cand in enumerate(candidates):
453 if not cand in candidates[:i]:
454 filtered.append(cand)
455 candidates = filtered
456
457 # the indices in the dataset are wrong for some examples, so we manually fix them
458 span1_index, span1_text = meta['span1_index'], meta['span1_text']
459 span2_index, span2_text = meta['span2_index'], meta['span2_text']
460 words_a = text_a.split()
461 words_a_lower = text_a.lower().split()
462 words_span1_text = span1_text.lower().split()
463 span1_len = len(words_span1_text)
464
465 if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
466 for offset in [-1, +1]:
467 if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
468 span1_index += offset
469
470 # if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
471 # print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
472 # f"'{words_span1_text}' at index {span1_index} for '{words_a}'")
473
474 if words_a[span2_index] != span2_text:
475 for offset in [-1, +1]:
476 if words_a[span2_index + offset] == span2_text:
477 span2_index += offset
478
479 if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
480 words_a = words_a[:span2_index] \
481 + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
482 + words_a[span2_index + 1:]
483
484 assert words_a[span2_index] == span2_text, \
485 f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"
486
487 text_a = ' '.join(words_a)
488 meta['span1_index'], meta['span2_index'] = span1_index, span2_index
489

Callers 1

get_train_examples (Method) · 0.95

Calls 3

InputExample (Class) · 0.90
append (Method) · 0.80

Tested by

no test coverage detected