MCPcopy
hub / github.com/open-compass/VLMEvalKit / evaluate

Method evaluate

vlmeval/dataset/image_yorn.py:26–95  ·  view source on GitHub ↗
(self, eval_file, **judge_kwargs)

Source from the content-addressed store, hash-verified

24
25 # It returns a dataframe
26 def evaluate(self, eval_file, **judge_kwargs):
27 from .utils.yorn import YOrN_Extraction, YOrN_auxeval
28 from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
29
30 dataset = self.dataset_name
31 data = load(eval_file)
32 data['prediction'] = [str(x) for x in data['prediction']]
33 storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
34 tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
35 nproc = judge_kwargs.pop('nproc', 4)
36
37 if not osp.exists(storage):
38 ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
39 if osp.exists(tmp_file):
40 tmp = load(tmp_file)
41 for k in tmp:
42 if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
43 ans_map[k] = tmp[k]
44
45 data['extracted'] = [ans_map[x] for x in data['index']]
46 unknown = data[data['extracted'] == 'Unknown']
47
48 model = judge_kwargs.get('model', 'exact_matching')
49 if model == 'exact_matching':
50 model = None
51 elif gpt_key_set():
52 model = build_judge(**judge_kwargs)
53 if not model.working():
54 warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
55 warnings.warn(DEBUG_MESSAGE)
56 model = None
57 else:
58 model = None
59 warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')
60
61 if model is not None:
62 lt = len(unknown)
63 lines = [unknown.iloc[i] for i in range(lt)]
64 tups = [(model, line) for line in lines]
65 indices = list(unknown['index'])
66 if len(tups):
67 res = track_progress_rich(
68 YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
69 for k, v in zip(indices, res):
70 ans_map[k] = v
71
72 data['extracted'] = [ans_map[x] for x in data['index']]
73 dump(data, storage)
74
75 data = load(storage)
76 if listinstr(['AMBER'], dataset):
77 data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
78 else:
79 data['score'] = (data['answer'] == data['extracted'])
80 dump(data, storage)
81
82 if dataset is not None and listinstr(['MME'], dataset):
83 score = MME_rating(storage)

Callers

nothing calls this directly

Calls 14

load · Function · 0.85
YOrN_Extraction · Function · 0.85
gpt_key_set · Function · 0.85
build_judge · Function · 0.85
track_progress_rich · Function · 0.85
dump · Function · 0.85
listinstr · Function · 0.85
MME_rating · Function · 0.85
Hallusion_rating · Function · 0.85
POPE_rating · Function · 0.85
AMBER_rating · Function · 0.85
default_rating · Function · 0.85

Tested by

no test coverage detected