hub / github.com/open-compass/VLMEvalKit / SCAN_ONE

Function SCAN_ONE

vlmeval/tools.py:468–513 · view source on GitHub ↗

(root, model, dataset)

Source from the content-addressed store, hash-verified

466
467
def _scan_count_flagged(data, column, marker):
    """Count the entries of ``data[column]`` whose string form contains ``marker``.

    Returns 0 when ``column`` is not in ``data``, so an evaluation file that
    lacks the expected column is skipped instead of raising ``KeyError``.
    """
    if column not in data:
        return 0
    return sum(marker in str(x) for x in data[column])


def SCAN_ONE(root, model, dataset):
    """Report failed inference / evaluation records for one model-dataset pair.

    Looks for ``<root>/<model>/<model>_<dataset>.xlsx``. If that file is
    missing, a "Result Missing" notice is printed and nothing else happens.
    Otherwise the function:

    1. counts predictions containing the API failure message and prints a
       summary line when there is at least one;
    2. lists files under ``<root>/<model>`` matching ``<model>_<dataset>_``,
       keeps the ``.xlsx`` ones whose name contains ``<dataset>_openai`` or
       ``<dataset>_gpt``, and for each prints how many rows are flagged as
       failed (the column and marker inspected depend on ``dataset``).

    Args:
        root: Directory that contains one sub-directory per model.
        model: Model name; used as sub-directory and file-name prefix.
        dataset: Dataset name; selects the result file and the failure check.

    Returns:
        None. All findings are written to stdout.

    Raises:
        AssertionError: if the inference result has no ``prediction`` entry.

    NOTE(review): ``load`` is assumed to return a pandas DataFrame (the
    LLaVABench branch uses boolean-mask indexing) -- confirm against ``load``.
    """
    from termcolor import colored

    FAIL_MSG = 'Failed to obtain answer via API.'
    root = osp.join(root, model)
    pth = osp.join(root, f'{model}_{dataset}.xlsx')

    if not osp.exists(pth):
        print(colored(f'Model {model} x Dataset {dataset} Inference Result Missing! ', 'red'))
        return

    # --- Inference failures -------------------------------------------------
    data = load(pth)
    assert 'prediction' in data
    predictions = [str(x) for x in data['prediction']]
    nfail = sum(FAIL_MSG in x for x in predictions)
    if nfail:
        # nfail > 0 implies ntot > 0, so the division below is safe.
        ntot = len(predictions)
        print(colored(f'Model {model} x Dataset {dataset} Inference: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ', 'light_red'))

    # --- Evaluation (judge) failures ----------------------------------------
    judge_tags = [f'{dataset}_openai', f'{dataset}_gpt']
    eval_files = [
        x for x in ls(root, match=f'{model}_{dataset}_')
        if listinstr(judge_tags, x) and x.endswith('.xlsx')
    ]

    for eval_file in eval_files:
        data = load(eval_file)

        if 'MMVet' in dataset:
            nbad = _scan_count_flagged(data, 'log', 'All 5 retries failed.')
        elif 'MathVista' in dataset:
            nbad = _scan_count_flagged(data, 'res', FAIL_MSG)
        elif dataset == 'LLaVABench':
            # A score of -1 marks a failed row; a missing column is skipped
            # like in the other branches.
            if 'gpt4_score' in data:
                nbad = len(data[data['gpt4_score'] == -1])
            else:
                nbad = 0
        else:
            nbad = _scan_count_flagged(data, 'log', FAIL_MSG)

        if nbad:
            print(f'Evaluation ({eval_file}): {nbad} out of {len(data)} failed.')
514
515
516	def SCAN(root, models, datasets):

Callers 1

SCAN · Function · 0.85

Calls 4

load · Function · 0.85

colored · Function · 0.85

ls · Function · 0.85

listinstr · Function · 0.85

Tested by

no test coverage detected