(self, eval_file, **judge_kwargs)
| 24 | |
| 25 | # evaluate() returns a DataFrame. |
| 26 | def evaluate(self, eval_file, **judge_kwargs): |
| 27 | from .utils.yorn import YOrN_Extraction, YOrN_auxeval |
| 28 | from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating |
| 29 | |
| 30 | dataset = self.dataset_name |
| 31 | data = load(eval_file) |
| 32 | data['prediction'] = [str(x) for x in data['prediction']] |
| 33 | storage = eval_file.replace('.xlsx', '_auxmatch.xlsx') |
| 34 | tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') |
| 35 | nproc = judge_kwargs.pop('nproc', 4) |
| 36 | |
| 37 | if not osp.exists(storage): |
| 38 | ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} |
| 39 | if osp.exists(tmp_file): |
| 40 | tmp = load(tmp_file) |
| 41 | for k in tmp: |
| 42 | if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown': |
| 43 | ans_map[k] = tmp[k] |
| 44 | |
| 45 | data['extracted'] = [ans_map[x] for x in data['index']] |
| 46 | unknown = data[data['extracted'] == 'Unknown'] |
| 47 | |
| 48 | model = judge_kwargs.get('model', 'exact_matching') |
| 49 | if model == 'exact_matching': |
| 50 | model = None |
| 51 | elif gpt_key_set(): |
| 52 | model = build_judge(**judge_kwargs) |
| 53 | if not model.working(): |
| 54 | warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') |
| 55 | warnings.warn(DEBUG_MESSAGE) |
| 56 | model = None |
| 57 | else: |
| 58 | model = None |
| 59 | warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation') |
| 60 | |
| 61 | if model is not None: |
| 62 | lt = len(unknown) |
| 63 | lines = [unknown.iloc[i] for i in range(lt)] |
| 64 | tups = [(model, line) for line in lines] |
| 65 | indices = list(unknown['index']) |
| 66 | if len(tups): |
| 67 | res = track_progress_rich( |
| 68 | YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file) |
| 69 | for k, v in zip(indices, res): |
| 70 | ans_map[k] = v |
| 71 | |
| 72 | data['extracted'] = [ans_map[x] for x in data['index']] |
| 73 | dump(data, storage) |
| 74 | |
| 75 | data = load(storage) |
| 76 | if listinstr(['AMBER'], dataset): |
| 77 | data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower()) |
| 78 | else: |
| 79 | data['score'] = (data['answer'] == data['extracted']) |
| 80 | dump(data, storage) |
| 81 | |
| 82 | if dataset is not None and listinstr(['MME'], dataset): |
| 83 | score = MME_rating(storage) |
nothing calls this directly
no test coverage detected