hub / github.com/open-compass/VLMEvalKit / evaluate

Method evaluate

vlmeval/dataset/image_yorn.py:26–95 · view source on GitHub ↗

(self, eval_file, **judge_kwargs)

Source from the content-addressed store, hash-verified

24
25	# It returns a dataframe
26	def evaluate(self, eval_file, **judge_kwargs):
27	from .utils.yorn import YOrN_Extraction, YOrN_auxeval
28	from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
29
30	dataset = self.dataset_name
31	data = load(eval_file)
32	data['prediction'] = [str(x) for x in data['prediction']]
33	storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
34	tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
35	nproc = judge_kwargs.pop('nproc', 4)
36
37	if not osp.exists(storage):
38	ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
39	if osp.exists(tmp_file):
40	tmp = load(tmp_file)
41	for k in tmp:
42	if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
43	ans_map[k] = tmp[k]
44
45	data['extracted'] = [ans_map[x] for x in data['index']]
46	unknown = data[data['extracted'] == 'Unknown']
47
48	model = judge_kwargs.get('model', 'exact_matching')
49	if model == 'exact_matching':
50	model = None
51	elif gpt_key_set():
52	model = build_judge(**judge_kwargs)
53	if not model.working():
54	warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
55	warnings.warn(DEBUG_MESSAGE)
56	model = None
57	else:
58	model = None
59	warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')
60
61	if model is not None:
62	lt = len(unknown)
63	lines = [unknown.iloc[i] for i in range(lt)]
64	tups = [(model, line) for line in lines]
65	indices = list(unknown['index'])
66	if len(tups):
67	res = track_progress_rich(
68	YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
69	for k, v in zip(indices, res):
70	ans_map[k] = v
71
72	data['extracted'] = [ans_map[x] for x in data['index']]
73	dump(data, storage)
74
75	data = load(storage)
76	if listinstr(['AMBER'], dataset):
77	data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
78	else:
79	data['score'] = (data['answer'] == data['extracted'])
80	dump(data, storage)
81
82	if dataset is not None and listinstr(['MME'], dataset):
83	score = MME_rating(storage)

Callers

nothing calls this directly

Calls 14

load · Function · 0.85

YOrN_Extraction · Function · 0.85

gpt_key_set · Function · 0.85

build_judge · Function · 0.85

track_progress_rich · Function · 0.85

dump · Function · 0.85

listinstr · Function · 0.85

MME_rating · Function · 0.85

Hallusion_rating · Function · 0.85

POPE_rating · Function · 0.85

AMBER_rating · Function · 0.85

default_rating · Function · 0.85

Tested by

no test coverage detected