(self, eval_file, **judge_kwargs)
| 169 | # It returns a DataFrame |
| 170 | @classmethod |
| 171 | def evaluate(self, eval_file, **judge_kwargs): |
| 172 | # We adopt pairwise evaluation (twice for a pair) for this dataset |
| 173 | suffix = eval_file.split('.')[-1] |
| 174 | model = judge_kwargs['model'] |
| 175 | storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') |
| 176 | score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') |
| 177 | tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') |
| 178 | nproc = judge_kwargs.pop('nproc', 4) |
| 179 | |
| 180 | if not osp.exists(storage): |
| 181 | raw_data = MMAlignBench('MMAlignBench').data |
| 182 | b64_map = {x: y for x, y in zip(raw_data['index'], raw_data['image'])} |
| 183 | data = self.gen_eval_base(eval_file, b64_map) |
| 184 | |
| 185 | # judge_kwargs['system_prompt'] = SYSTEM_PROMPT |
| 186 | judge_kwargs['temperature'] = 0 |
| 187 | judge_kwargs['img_detail'] = 'high' |
| 188 | judge_kwargs['timeout'] = 300 |
| 189 | model = build_judge(max_tokens=4096, **judge_kwargs) |
| 190 | |
| 191 | assert model.working(), ( |
| 192 | 'MMAlignBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE |
| 193 | ) |
| 194 | |
| 195 | lt = len(data) |
| 196 | lines = [data.iloc[i] for i in range(lt)] |
| 197 | tups = [(model, line) for line in lines] |
| 198 | indices = [line['index'] for line in lines] |
| 199 | |
| 200 | ans = load(tmp_file) if osp.exists(tmp_file) else {} |
| 201 | tups = [x for x, i in zip(tups, indices) if i not in ans] |
| 202 | indices = [i for i in indices if i not in ans] |
| 203 | |
| 204 | if len(indices): |
| 205 | new_results = track_progress_rich( |
| 206 | MMAlignBench_auxeval, |
| 207 | tups, |
| 208 | nproc=nproc, |
| 209 | chunksize=nproc, |
| 210 | keys=indices, |
| 211 | save=tmp_file, |
| 212 | ) |
| 213 | ans = load(tmp_file) |
| 214 | for k, v in zip(indices, new_results): |
| 215 | ans[k] = {'score': v[0], 'resp': v[1]} |
| 216 | else: |
| 217 | for k,v in ans.items(): |
| 218 | ans[k] = {'score': v[0], 'resp': v[1]} |
| 219 | # breakpoint() |
| 220 | data['score'] = [ans[x]['score'] for x in data['index']] |
| 221 | data['judge'] = [ans[x]['resp'] for x in data['index']] |
| 222 | data.pop('image') |
| 223 | dump(data, storage) |
| 224 | |
| 225 | data = load(storage) |
| 226 | lt = len(data) |
| 227 | |
| 228 | scores = defaultdict(lambda: 0) |
nothing calls this directly
no test coverage detected