| 466 | |
| 467 | |
def SCAN_ONE(root, model, dataset):
    """Report API failures in the results of one model on one dataset.

    The inference result is expected at ``<root>/<model>/<model>_<dataset>.xlsx``.
    If that file does not exist, a warning is printed and nothing else is
    checked. Otherwise the function prints:

    * how many entries of the ``prediction`` column contain the API failure
      message, and
    * for every evaluation file listed by ``ls`` for the prefix
      ``<model>_<dataset>_`` whose path contains ``<dataset>_openai`` or
      ``<dataset>_gpt`` and ends with ``.xlsx``, how many records failed.

    Args:
        root: Directory that holds one sub-directory per model.
        model: Model name; used as the sub-directory name and file-name prefix.
        dataset: Dataset name; also selects which column marks a failed
            evaluation record.

    Returns:
        None. All findings are printed.

    Raises:
        AssertionError: If the inference file has no ``prediction`` entry.
    """
    from termcolor import colored

    FAIL_MSG = 'Failed to obtain answer via API.'
    root = osp.join(root, model)
    pth = osp.join(root, f'{model}_{dataset}.xlsx')

    # Guard clause: without the inference file there is nothing to scan.
    if not osp.exists(pth):
        print(colored(f'Model {model} x Dataset {dataset} Inference Result Missing! ', 'red'))
        return

    # NOTE(review): `load` presumably returns a pandas DataFrame, so the `in`
    # checks below test for column names -- confirm against its definition.
    data = load(pth)

    # Explicit raise instead of a bare `assert` so the check survives `python -O`;
    # AssertionError is kept so existing callers see the same exception type.
    if 'prediction' not in data:
        raise AssertionError(f"Column 'prediction' is missing in {pth}")

    # Detect inference failures. Values are stringified because a cell may hold
    # a non-string value (e.g. a number or NaN).
    fail = [FAIL_MSG in str(x) for x in data['prediction']]
    nfail, ntot = sum(fail), len(fail)
    if nfail:
        print(colored(f'Model {model} x Dataset {dataset} Inference: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ', 'light_red'))

    # NOTE(review): `ls` is assumed to return paths that `load` can open
    # directly -- confirm against its definition.
    eval_files = [
        x for x in ls(root, match=f'{model}_{dataset}_')
        if listinstr([f'{dataset}_openai', f'{dataset}_gpt'], x) and x.endswith('.xlsx')
    ]

    for eval_file in eval_files:
        data = load(eval_file)

        # Each benchmark marks a failed evaluation record differently.
        if 'MMVet' in dataset:
            nbad = sum('All 5 retries failed.' in str(x) for x in data['log'])
        elif 'MathVista' in dataset:
            nbad = sum(FAIL_MSG in str(x) for x in data['res'])
        elif dataset == 'LLaVABench':
            # A score of -1 marks a failed record.
            nbad = len(data[data['gpt4_score'] == -1])
        elif 'log' in data:
            nbad = sum(FAIL_MSG in str(x) for x in data['log'])
        else:
            # No known failure marker in this file: nothing to report.
            nbad = 0

        if nbad:
            print(f'Evaluation ({eval_file}): {nbad} out of {len(data)} failed.')
| 514 | |
| 515 | |
| 516 | def SCAN(root, models, datasets): |