MCP · copy
hub / github.com/open-compass/VLMEvalKit / evaluate

Method evaluate

vlmeval/dataset/creation.py:652–738  ·  view source on GitHub ↗
(self, eval_file, **judge_kwargs)

Source from the content-addressed store, hash-verified

650 return tgt_path
651
652 def evaluate(self, eval_file, **judge_kwargs):
653 rating_rev = None
654 dual_eval = judge_kwargs.pop('dual_eval', True)
655 if dual_eval:
656 print('Dual Evaluation Strategy is enabled.')
657 src = load(eval_file)
658 tgt = load(eval_file)
659 tgt['reference_answer_by_gpt4o'] = src['prediction']
660 tgt['prediction'] = src['reference_answer_by_gpt4o']
661 tgt_file_name = eval_file.replace('.xlsx', '_rev.xlsx')
662 dump(tgt, tgt_file_name)
663 judge_kwargs['dual_eval'] = False
664 rating_rev = self.evaluate(tgt_file_name, **judge_kwargs)
665 judge_kwargs.pop('dual_eval', None)
666
667 suffix = '.' + eval_file.split('.')[-1]
668
669 score_file = eval_file.replace(suffix, '_score.csv')
670 tgt_file = eval_file.replace(suffix, '_rating.json')
671
672 model = judge_kwargs.pop('model', 'gpt-4o-0806')
673 model_name = model.split('/')[-1] if '/' in model else model
674 tmp_file = eval_file.replace(suffix, f'_{model_name}.pkl')
675
676 nproc = judge_kwargs.pop('nproc', 4)
677
678 if not osp.exists(score_file):
679 data = load(eval_file)
680 lt = len(data)
681 lines = [data.iloc[i] for i in range(len(data))]
682 judge_kwargs['max_tokens'] = 4096
683
684 model = build_judge(model=model, **judge_kwargs)
685 assert model.working(), ('CreationMMBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
686
687 prompts = [build_prompt(line, self.dataset_name) for line in lines]
688
689 indices = [line['index'] for line in lines]
690
691 if listinstr(['Creation_MMBench'], self.dataset_name):
692 no_relative_image_list = [self.dump_image(line) for idx, line in self.data.iterrows()]
693 assert len(no_relative_image_list) == len(lines)
694 image_list = []
695 for subimage_list in no_relative_image_list:
696 sublist = []
697 for image_path in subimage_list:
698 image_path = osp.join(self.img_root, image_path)
699 assert osp.exists(image_path), f"Image not found: {image_path}"
700 sublist.append(image_path)
701 image_list.append(sublist)
702 else:
703 image_list = [[] * len(lines)]
704 tups = [(model, image, prompt) for prompt, image in zip(prompts, image_list)]
705
706 ans = {}
707 if osp.exists(tmp_file):
708 ans = load(tmp_file)
709 ans = {k: v for k, v in ans.items() if model.fail_msg not in str(v)}

Callers

nothing calls this directly

Calls 11

dump_image · Method · 0.95
load · Function · 0.85
dump · Function · 0.85
build_judge · Function · 0.85
listinstr · Function · 0.85
track_progress_rich · Function · 0.85
Creation_MMBench_extract · Function · 0.85
merge_dual · Function · 0.85
working · Method · 0.80
build_prompt · Function · 0.70
get_dimension_rating · Function · 0.70

Tested by

no test coverage detected