MCP · copy
hub / github.com/open-compass/VLMEvalKit / evaluate

Method evaluate

vlmeval/dataset/mmalignbench.py:171–298  ·  view source on GitHub ↗
(self, eval_file, **judge_kwargs)

Source from the content-addressed store, hash-verified

169 # It returns a DataFrame
170 @classmethod
171 def evaluate(self, eval_file, **judge_kwargs):
172 # We adopt pairwise evaluation (twice for a pair) for this dataset
173 suffix = eval_file.split('.')[-1]
174 model = judge_kwargs['model']
175 storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
176 score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
177 tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
178 nproc = judge_kwargs.pop('nproc', 4)
179
180 if not osp.exists(storage):
181 raw_data = MMAlignBench('MMAlignBench').data
182 b64_map = {x: y for x, y in zip(raw_data['index'], raw_data['image'])}
183 data = self.gen_eval_base(eval_file, b64_map)
184
185 # judge_kwargs['system_prompt'] = SYSTEM_PROMPT
186 judge_kwargs['temperature'] = 0
187 judge_kwargs['img_detail'] = 'high'
188 judge_kwargs['timeout'] = 300
189 model = build_judge(max_tokens=4096, **judge_kwargs)
190
191 assert model.working(), (
192 'MMAlignBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
193 )
194
195 lt = len(data)
196 lines = [data.iloc[i] for i in range(lt)]
197 tups = [(model, line) for line in lines]
198 indices = [line['index'] for line in lines]
199
200 ans = load(tmp_file) if osp.exists(tmp_file) else {}
201 tups = [x for x, i in zip(tups, indices) if i not in ans]
202 indices = [i for i in indices if i not in ans]
203
204 if len(indices):
205 new_results = track_progress_rich(
206 MMAlignBench_auxeval,
207 tups,
208 nproc=nproc,
209 chunksize=nproc,
210 keys=indices,
211 save=tmp_file,
212 )
213 ans = load(tmp_file)
214 for k, v in zip(indices, new_results):
215 ans[k] = {'score': v[0], 'resp': v[1]}
216 else:
217 for k,v in ans.items():
218 ans[k] = {'score': v[0], 'resp': v[1]}
219 # breakpoint()
220 data['score'] = [ans[x]['score'] for x in data['index']]
221 data['judge'] = [ans[x]['resp'] for x in data['index']]
222 data.pop('image')
223 dump(data, storage)
224
225 data = load(storage)
226 lt = len(data)
227
228 scores = defaultdict(lambda: 0)

Callers

nothing calls this directly

Calls 8

gen_eval_base · Method · 0.95
MMAlignBench · Class · 0.85
build_judge · Function · 0.85
load · Function · 0.85
track_progress_rich · Function · 0.85
dump · Function · 0.85
working · Method · 0.80
get · Method · 0.80

Tested by

no test coverage detected