(self, eval_file, **judge_kwargs)
| 650 | return tgt_path |
| 651 | |
| 652 | def evaluate(self, eval_file, **judge_kwargs): |
| 653 | rating_rev = None |
| 654 | dual_eval = judge_kwargs.pop('dual_eval', True) |
| 655 | if dual_eval: |
| 656 | print('Dual Evaluation Strategy is enabled.') |
| 657 | src = load(eval_file) |
| 658 | tgt = load(eval_file) |
| 659 | tgt['reference_answer_by_gpt4o'] = src['prediction'] |
| 660 | tgt['prediction'] = src['reference_answer_by_gpt4o'] |
| 661 | tgt_file_name = eval_file.replace('.xlsx', '_rev.xlsx') |
| 662 | dump(tgt, tgt_file_name) |
| 663 | judge_kwargs['dual_eval'] = False |
| 664 | rating_rev = self.evaluate(tgt_file_name, **judge_kwargs) |
| 665 | judge_kwargs.pop('dual_eval', None) |
| 666 | |
| 667 | suffix = '.' + eval_file.split('.')[-1] |
| 668 | |
| 669 | score_file = eval_file.replace(suffix, '_score.csv') |
| 670 | tgt_file = eval_file.replace(suffix, '_rating.json') |
| 671 | |
| 672 | model = judge_kwargs.pop('model', 'gpt-4o-0806') |
| 673 | model_name = model.split('/')[-1] if '/' in model else model |
| 674 | tmp_file = eval_file.replace(suffix, f'_{model_name}.pkl') |
| 675 | |
| 676 | nproc = judge_kwargs.pop('nproc', 4) |
| 677 | |
| 678 | if not osp.exists(score_file): |
| 679 | data = load(eval_file) |
| 680 | lt = len(data) |
| 681 | lines = [data.iloc[i] for i in range(len(data))] |
| 682 | judge_kwargs['max_tokens'] = 4096 |
| 683 | |
| 684 | model = build_judge(model=model, **judge_kwargs) |
| 685 | assert model.working(), ('CreationMMBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) |
| 686 | |
| 687 | prompts = [build_prompt(line, self.dataset_name) for line in lines] |
| 688 | |
| 689 | indices = [line['index'] for line in lines] |
| 690 | |
| 691 | if listinstr(['Creation_MMBench'], self.dataset_name): |
| 692 | no_relative_image_list = [self.dump_image(line) for idx, line in self.data.iterrows()] |
| 693 | assert len(no_relative_image_list) == len(lines) |
| 694 | image_list = [] |
| 695 | for subimage_list in no_relative_image_list: |
| 696 | sublist = [] |
| 697 | for image_path in subimage_list: |
| 698 | image_path = osp.join(self.img_root, image_path) |
| 699 | assert osp.exists(image_path), f"Image not found: {image_path}" |
| 700 | sublist.append(image_path) |
| 701 | image_list.append(sublist) |
| 702 | else: |
| 703 | image_list = [[] * len(lines)] |
| 704 | tups = [(model, image, prompt) for prompt, image in zip(prompts, image_list)] |
| 705 | |
| 706 | ans = {} |
| 707 | if osp.exists(tmp_file): |
| 708 | ans = load(tmp_file) |
| 709 | ans = {k: v for k, v in ans.items() if model.fail_msg not in str(v)} |
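Nothing outside this method calls `evaluate` directly; the only visible call site is its own recursive call on the reversed file when dual evaluation is enabled.
No test coverage was detected for this method.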