(dataset_name, data_file, **kwargs)
| 384 | |
| 385 | |
def EVAL(dataset_name, data_file, **kwargs):
    """Evaluate ``data_file`` with the dataset named ``dataset_name`` and log the result.

    The dataset object is obtained from ``build_dataset(dataset_name)`` and its
    ``evaluate`` method is called with ``data_file`` plus the judge settings
    assembled here.

    Args:
        dataset_name: Name passed to ``build_dataset``; also matched against
            lists of benchmark names to choose a default judge model.
        data_file: Forwarded unchanged as the first argument of
            ``dataset.evaluate``.
        **kwargs: Only ``model`` and ``nproc`` are read. ``model`` overrides
            the default judge model; ``nproc`` defaults to 4.

    Returns:
        The value returned by ``dataset.evaluate`` (may be ``None``).

    Raises:
        AssertionError: If the evaluation result is neither ``None``, a dict,
            nor a ``pandas.DataFrame``.
    """
    from vlmeval.dataset import build_dataset

    log = get_logger('VLMEvalKit Tool-Eval')
    ds = build_dataset(dataset_name)

    # Judge settings are fixed up front, before any evaluation happens.
    judge_cfg = {'nproc': kwargs.get('nproc', 4), 'verbose': True}

    # Default judge models keyed by benchmark names; checked in this order.
    name_based_judges = (
        (['MMVet', 'LLaVABench', 'MMBench-Video'], 'gpt-4-turbo'),
        (['MMLongBench', 'MMDU'], 'gpt-4o'),
        (['DynaMath', 'MathVerse', 'MathVista', 'MathVision'], 'gpt-4o-mini'),
    )
    if 'model' in kwargs:
        # An explicitly requested judge always wins.
        judge_cfg['model'] = kwargs['model']
    elif ds.TYPE in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro']:
        judge_cfg['model'] = 'chatgpt-0125'
    else:
        # If nothing matches, no 'model' entry is passed to evaluate().
        for keywords, judge in name_based_judges:
            if listinstr(keywords, dataset_name):
                judge_cfg['model'] = judge
                break

    results = ds.evaluate(data_file, **judge_cfg)
    if results is None:
        return results

    assert isinstance(results, (dict, pd.DataFrame))
    log.info('Evaluation Results:')
    if isinstance(results, dict):
        log.info('\n' + json.dumps(results, indent=4))
    elif isinstance(results, pd.DataFrame):
        log.info('\n')
        # Frames with fewer rows than columns are logged transposed via tabulate.
        if len(results) < len(results.columns):
            log.info(tabulate(results.T))
        else:
            log.info(results)
    return results
| 414 | |
| 415 | |
| 416 | def parse_args_eval(): |
no test coverage detected