| 870 | return None |
| 871 | |
| 872 | def evaluate(self, |
| 873 | llm: Union[LLM, PyTorchLLM], |
| 874 | sampling_params: Optional[SamplingParams] = None, |
| 875 | streaming: bool = False) -> float: |
| 876 | import lm_eval |
| 877 | |
| 878 | lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper |
| 879 | results = lm_eval.evaluate( |
| 880 | lm=lm_cls(llm, |
| 881 | sampling_params=sampling_params, |
| 882 | streaming=streaming, |
| 883 | chat_template_kwargs=self.chat_template_kwargs), |
| 884 | task_dict=self.task_dict, |
| 885 | limit=self.num_samples, |
| 886 | apply_chat_template=self.apply_chat_template, |
| 887 | fewshot_as_multiturn=self.fewshot_as_multiturn, |
| 888 | system_instruction=self.system_prompt, |
| 889 | log_samples=self.log_samples) |
| 890 | logger.info( |
| 891 | f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}" |
| 892 | ) |
| 893 | |
| 894 | # Save results if output_path is specified |
| 895 | if self.output_path: |
| 896 | self.save_results(results) |
| 897 | |
| 898 | # LongBench is a group task in lm-eval. lm-eval already computes subgroup |
| 899 | # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...). |
| 900 | # To keep this implementation simple and aligned with the printed table, |
| 901 | # we compute the final LongBench score as the unweighted mean of subgroup |
| 902 | # scores. |
| 903 | group_results: Dict[str, Dict[str, Any]] = results.get("groups", {}) |
| 904 | subgroup_names = results.get("group_subtasks", |
| 905 | {}).get(self.task_name, []) |
| 906 | if not subgroup_names: |
| 907 | raise KeyError( |
| 908 | f"lm-eval did not provide subgroup list for group '{self.task_name}'. " |
| 909 | "Expected `results['group_subtasks'][task_name]` to exist.") |
| 910 | |
| 911 | subgroup_scores: List[float] = [] |
| 912 | missing: List[str] = [] |
| 913 | for name in subgroup_names: |
| 914 | m = group_results.get(name, None) |
| 915 | score = self._get_group_score(m) |
| 916 | if score is None: |
| 917 | missing.append(name) |
| 918 | else: |
| 919 | subgroup_scores.append(score) |
| 920 | |
| 921 | if not subgroup_scores: |
| 922 | raise KeyError( |
| 923 | f"lm-eval did not provide subgroup 'score' metrics for '{self.task_name}'. " |
| 924 | f"Missing subgroups: {missing[:10]}") |
| 925 | |
| 926 | result_acc = float(np.mean(subgroup_scores)) * 100 |
| 927 | logger.info( |
| 928 | f"lm-eval {self.task_name} average 'score' across {len(subgroup_scores)} subgroups: {result_acc:.2f}" |
| 929 | ) |