| 347 | |
| 348 | |
def mllm_eval_test(model_path, eval_path, case_name, port=DEFAULT_PORT, test_type='infer', extra_config=None):
    """Run a VLMEvalKit ``run.py`` pass for one case and return its outcome.

    Builds the ``python run.py`` command line for the requested mode, runs it
    through ``execute_command_with_logging`` with output going to a
    timestamped log file under ``eval_path``, and attaches that log to the
    allure report. In ``'eval'`` mode ``mllm_summary`` is called afterwards
    with the command outcome.

    Args:
        model_path: Model path; only used in the progress messages here.
        eval_path: Directory that receives the work directory and log file.
        case_name: Case identifier; passed to ``run.py`` as ``--model`` and
            used to name the work directory and the log file.
        port: Port of the API server. Used for ``--base-url`` in ``'infer'``
            mode and for ``--judge-base-url`` in ``'eval'`` mode.
        test_type: Either ``'infer'`` or ``'eval'``.
        extra_config: Optional mapping converted to extra CLI arguments via
            ``get_cli_str``. Only appended to the ``'infer'`` command.

    Returns:
        The ``(result, msg)`` pair produced by
        ``execute_command_with_logging``.

    Raises:
        ValueError: If ``test_type`` is neither ``'infer'`` nor ``'eval'``.
    """
    # Fail fast with a clear message instead of hitting an unbound ``cmd``
    # after the work directory has already been created.
    if test_type not in ('infer', 'eval'):
        raise ValueError(f"Unsupported test_type {test_type!r}; expected 'infer' or 'eval'")

    # Avoid a shared mutable default: each call gets its own empty mapping.
    if extra_config is None:
        extra_config = {}

    # Single source of truth for the benchmark subsets, used both on the
    # command line and by the summary step.
    dataset_list = ['MMBench_V11_MINI', 'MMStar_MINI', 'AI2D_MINI', 'OCRBench_MINI']

    work_dir = os.path.join(eval_path, f'wk_{case_name}')
    timestamp = time.strftime('%Y%m%d_%H%M%S')
    eval_log = os.path.join(eval_path, f'log_{case_name}_{timestamp}.log')

    print(f'Starting VLMEvalKit evaluation for model: {model_path}')
    print(f'Model path: {model_path}')
    print(f'Case: {case_name}')
    print(f'Work directory: {work_dir}')

    os.makedirs(work_dir, exist_ok=True)

    extra_config_str = get_cli_str(extra_config)

    base_cmd = f'python run.py --data {" ".join(dataset_list)} --model {case_name}'
    if test_type == 'infer':
        cmd = (f'{base_cmd} --base-url http://{DEFAULT_SERVER}:{port}/v1 '
               f'--reuse --work-dir {work_dir} --timeout 7200 --mode infer {extra_config_str}')
    else:
        # NOTE(review): the model base URL uses the literal port "empty" and
        # extra_config_str is not appended in this mode; both are preserved
        # from the original command - confirm this is intended.
        cmd = (f'{base_cmd} --base-url http://{DEFAULT_SERVER}:empty/v1 '
               f'--reuse --work-dir {work_dir} --api-nproc 32 --mode eval '
               f'--judge turbomind_Qwen2.5-32B-Instruct_nccl_tp2_0 '
               f'--judge-base-url http://{DEFAULT_SERVER}:{port}/v1')

    result, msg = execute_command_with_logging(cmd, eval_log)

    allure.attach.file(eval_log, name=eval_log, attachment_type=allure.attachment_type.TEXT)

    if test_type == 'eval':
        mllm_summary(case_name, result, msg, work_dir, eval_path, dataset_list=dataset_list)
    return result, msg