刷新评测队列:批量 push 代码并依次评测。返回值(Returns):评测完成的任务列表。
(self)
| 1005 | return False |
| 1006 | |
| 1007 | async def flush_eval_queue(self) -> List[Task]: |
| 1008 | """ |
| 1009 | 刷新评测队列:批量 push 代码并依次评测。 |
| 1010 | |
| 1011 | Returns: |
| 1012 | 评测完成的任务列表 |
| 1013 | """ |
| 1014 | if not self._pending_queue: |
| 1015 | return [] |
| 1016 | |
| 1017 | async with submit_lock: |
| 1018 | logger.info(f"| 🔒 Acquired lock for batch evaluation ({len(self._pending_queue)} tasks)") |
| 1019 | |
| 1020 | # 1. 确保 submitter 已初始化 |
| 1021 | if not self._submitter_started: |
| 1022 | logger.info("| 🔍 Starting CodeSubmitter browser...") |
| 1023 | try: |
| 1024 | await self._submitter.initialize() |
| 1025 | self._submitter_started = True |
| 1026 | except Exception as e: |
| 1027 | logger.error(f"| ❌ Failed to start submitter: {e}") |
| 1028 | # 标记所有任务为失败,并保存结果(output_file 在 __init__ 中已设置) |
| 1029 | for task in self._pending_queue: |
| 1030 | task.score = 0.0 |
| 1031 | task.extra["submit_time"] = 0.0 |
| 1032 | task.extra["spend_time"] = task.extra.get("inference_time", 0.0) |
| 1033 | self._tasks.append(task) |
| 1034 | |
| 1035 | # 保存错误结果 |
| 1036 | error_result = Result( |
| 1037 | task_id=task.task_id, |
| 1038 | prompt=task.input, |
| 1039 | prediction="browser_init_error", |
| 1040 | answer="None", |
| 1041 | score=0.0, |
| 1042 | metrics={"error": str(e), "inference_time": task.extra.get("inference_time", 0.0), "submit_time": 0.0}, |
| 1043 | extra=None, |
| 1044 | start_time=task.extra.get("inference_start_time", time.time()), |
| 1045 | end_time=time.time(), |
| 1046 | spend_time=task.extra.get("inference_time", 0.0) |
| 1047 | ) |
| 1048 | await self._submitter.save_result(error_result) |
| 1049 | |
| 1050 | result = self._pending_queue.copy() |
| 1051 | self._pending_queue.clear() |
| 1052 | return result |
| 1053 | |
| 1054 | # 2. 准备批量写入的文件 |
| 1055 | batch_tasks = self._pending_queue.copy() |
| 1056 | self._pending_queue.clear() |
| 1057 | |
| 1058 | file_contents = [] |
| 1059 | valid_tasks = [] # 有代码的任务 |
| 1060 | |
| 1061 | for task in batch_tasks: |
| 1062 | code_content = task.result |
| 1063 | if not code_content: |
| 1064 | # 无代码,直接标记失败 |