MCPcopy
hub / github.com/open-compass/VLMEvalKit / SCAN_ONE

Function SCAN_ONE

vlmeval/tools.py:468–513  ·  view source on GitHub ↗
(root, model, dataset)

Source from the content-addressed store, hash-verified

466
467
def SCAN_ONE(root, model, dataset):
    """Report API failures found in one model/dataset result set.

    Looks for the inference file ``<root>/<model>/<model>_<dataset>.xlsx``.
    If it is missing, a red warning is printed and nothing else is checked.
    Otherwise the function prints how many predictions contain the API
    failure message, then inspects every judge output file in the same
    directory (``.xlsx`` files matching ``<dataset>_openai`` or
    ``<dataset>_gpt``) and prints how many of their rows failed.

    Args:
        root: Directory that holds one sub-directory per model.
        model: Model name; used as the sub-directory and file-name prefix.
        dataset: Dataset name; also selects which column is inspected in
            the judge files (``log``, ``res`` or ``gpt4_score``).

    Returns:
        None. All findings are printed to stdout.

    Raises:
        AssertionError: If the inference file has no ``prediction`` column.
    """
    from termcolor import colored

    FAIL_MSG = 'Failed to obtain answer via API.'
    root = osp.join(root, model)
    pth = osp.join(root, f'{model}_{dataset}.xlsx')

    # Guard clause: without the inference file there is nothing to scan.
    if not osp.exists(pth):
        print(colored(f'Model {model} x Dataset {dataset} Inference Result Missing! ', 'red'))
        return

    # NOTE(review): `load` presumably returns a pandas DataFrame, so the
    # `in` checks below test for column names -- confirm against `load`.
    data = load(pth)

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped under `python -O`; the exception type is kept unchanged.
    if 'prediction' not in data:
        raise AssertionError(f"'prediction' column not found in {pth}")

    # Detect inference failures.
    predictions = [str(x) for x in data['prediction']]
    nfail = sum(FAIL_MSG in x for x in predictions)
    if nfail:
        ntot = len(predictions)
        print(colored(f'Model {model} x Dataset {dataset} Inference: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ', 'light_red'))

    # Judge outputs live next to the inference file.
    judge_tags = [f'{dataset}_openai', f'{dataset}_gpt']
    eval_files = [
        x for x in ls(root, match=f'{model}_{dataset}_')
        if listinstr(judge_tags, x) and x.endswith('.xlsx')
    ]

    for eval_file in eval_files:
        data = load(eval_file)

        # Each dataset family records judge failures in a different place.
        if 'MMVet' in dataset:
            nbad = sum('All 5 retries failed.' in str(x) for x in data['log'])
        elif 'MathVista' in dataset:
            nbad = sum(FAIL_MSG in str(x) for x in data['res'])
        elif dataset == 'LLaVABench':
            # A score of -1 marks a failed judgement.
            nbad = len(data[data['gpt4_score'] == -1])
        elif 'log' in data:
            nbad = sum(FAIL_MSG in str(x) for x in data['log'])
        else:
            # No known failure column: nothing to report for this file.
            nbad = 0

        if nbad:
            print(f'Evaluation ({eval_file}): {nbad} out of {len(data)} failed.')
514
515
516def SCAN(root, models, datasets):

Callers 1

SCAN — Function · 0.85

Calls 4

load — Function · 0.85
colored — Function · 0.85
ls — Function · 0.85
listinstr — Function · 0.85

Tested by

no test coverage detected