(self, path: str, set_type: str, cloze_eval=True)
| 430 | return text_a, text_b |
| 431 | |
| 432 | def _create_examples(self, path: str, set_type: str, cloze_eval=True) -> List[InputExample]: |
| 433 | examples = [] |
| 434 | |
| 435 | with open(path, encoding='utf8') as f: |
| 436 | for line in f: |
| 437 | example_json = json.loads(line) |
| 438 | idx = example_json['idx'] |
| 439 | label = str(example_json['label']) if 'label' in example_json else None |
| 440 | guid = "%s-%s" % (set_type, idx) |
| 441 | text_a = punctuation_standardization(example_json['text']) |
| 442 | meta = { |
| 443 | 'span1_text': example_json['target']['span1_text'], |
| 444 | 'span2_text': example_json['target']['span2_text'], |
| 445 | 'span1_index': example_json['target']['span1_index'], |
| 446 | 'span2_index': example_json['target']['span2_index'] |
| 447 | } |
| 448 | if 'candidates' in example_json: |
| 449 | candidates = [cand['text'] for cand in example_json['candidates']] |
| 450 | # candidates = list(set(candidates)) |
| 451 | filtered = [] |
| 452 | for i, cand in enumerate(candidates): |
| 453 | if not cand in candidates[:i]: |
| 454 | filtered.append(cand) |
| 455 | candidates = filtered |
| 456 | |
| 457 | # the indices in the dataset are wrong for some examples, so we manually fix them |
| 458 | span1_index, span1_text = meta['span1_index'], meta['span1_text'] |
| 459 | span2_index, span2_text = meta['span2_index'], meta['span2_text'] |
| 460 | words_a = text_a.split() |
| 461 | words_a_lower = text_a.lower().split() |
| 462 | words_span1_text = span1_text.lower().split() |
| 463 | span1_len = len(words_span1_text) |
| 464 | |
| 465 | if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text: |
| 466 | for offset in [-1, +1]: |
| 467 | if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text: |
| 468 | span1_index += offset |
| 469 | |
| 470 | # if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text: |
| 471 | # print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected " |
| 472 | # f"'{words_span1_text}' at index {span1_index} for '{words_a}'") |
| 473 | |
| 474 | if words_a[span2_index] != span2_text: |
| 475 | for offset in [-1, +1]: |
| 476 | if words_a[span2_index + offset] == span2_text: |
| 477 | span2_index += offset |
| 478 | |
| 479 | if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text): |
| 480 | words_a = words_a[:span2_index] \ |
| 481 | + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \ |
| 482 | + words_a[span2_index + 1:] |
| 483 | |
| 484 | assert words_a[span2_index] == span2_text, \ |
| 485 | f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'" |
| 486 | |
| 487 | text_a = ' '.join(words_a) |
| 488 | meta['span1_index'], meta['span2_index'] = span1_index, span2_index |
| 489 |
no test coverage detected