Test the benchmark manager specifically for LeetCode using a REAL model. Uses PIPELINE mode: inference and evaluation run in parallel. 流水线模式说明: (1) 推理完成 batch_size 个任务后,立即 push + 评测; (2) 同时继续推理下一批任务; (3) 推理失败的任务直接保存结果,不入队列; (4) 共享一个浏览器实例。
(benchmark_name: str = "leetcode")
| 246 | |
| 247 | |
| 248 | async def test_leetcode_benchmark(benchmark_name: str = "leetcode"): |
| 249 | """ |
| 250 | Test the benchmark manager specifically for LeetCode using a REAL model. |
| 251 | Uses PIPELINE mode: inference and evaluation run in parallel. |
| 252 | |
| 253 | 流水线模式说明: |
| 254 | - 推理完成 batch_size 个任务后,立即 push + 评测 |
| 255 | - 同时继续推理下一批任务 |
| 256 | - 推理失败的任务直接保存结果,不入队列 |
| 257 | - 共享一个浏览器实例 |
| 258 | """ |
| 259 | print(f"\n{'='*60}") |
| 260 | print(f"🧪 LeetCode Benchmark Test (Pipeline Mode)") |
| 261 | print(f"{'='*60}") |
| 262 | print(f"🤖 Model: {TARGET_MODEL}") |
| 263 | print(f"💻 Language: {TARGET_LANGUAGE}") |
| 264 | print(f"⚡ Max concurrent inference: {MAX_CONCURRENT_INFERENCE}") |
| 265 | print(f"📦 Batch size for evaluation: {BATCH_SIZE}") |
| 266 | print(f"{'='*60}\n") |
| 267 | |
| 268 | # Define save directory |
| 269 | save_dir = os.path.join(config.workdir, "benchmark", benchmark_name) |
| 270 | if not os.path.exists(save_dir): |
| 271 | os.makedirs(save_dir, exist_ok=True) |
| 272 | print(f"📁 Created output directory: {save_dir}") |
| 273 | |
| 274 | # 1. Reset and collect all tasks |
| 275 | print(f"🔄 Resetting progress for LeetCode...") |
| 276 | task = await benchmark_manager.reset(benchmark_name) |
| 277 | |
| 278 | if not task: |
| 279 | logger.warning("⚠️ No tasks available to run (Dataset empty or all finished).") |
| 280 | summarize_benchmark_results(benchmark_name) |
| 281 | return |
| 282 | |
| 283 | # ========================================== |
| 284 | # 收集所有待处理任务 |
| 285 | # ========================================== |
| 286 | all_tasks: List[Task] = [task] |
| 287 | while True: |
| 288 | next_task = await benchmark_manager.step(benchmark_name) |
| 289 | if next_task is None: |
| 290 | break |
| 291 | all_tasks.append(next_task) |
| 292 | |
| 293 | total_tasks = len(all_tasks) |
| 294 | print(f"📋 Collected {total_tasks} tasks for processing\n") |
| 295 | |
| 296 | # ========================================== |
| 297 | # 获取 benchmark 实例 |
| 298 | # ========================================== |
| 299 | benchmark = await benchmark_manager.get(benchmark_name) |
| 300 | |
| 301 | # ========================================== |
| 302 | # 流水线模式:推理完成直接入队,满5个自动评测 |
| 303 | # ========================================== |
| 304 | inference_semaphore = asyncio.Semaphore(MAX_CONCURRENT_INFERENCE) |
| 305 |
no test coverage detected