MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / evaluate

Method evaluate

tensorrt_llm/evaluate/lm_eval.py:872–930  ·  view source on GitHub ↗
(self,
                 llm: Union[LLM, PyTorchLLM],
                 sampling_params: Optional[SamplingParams] = None,
                 streaming: bool = False)

Source from the content-addressed store, hash-verified

870 return None
871
872 def evaluate(self,
873 llm: Union[LLM, PyTorchLLM],
874 sampling_params: Optional[SamplingParams] = None,
875 streaming: bool = False) -> float:
876 import lm_eval
877
878 lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
879 results = lm_eval.evaluate(
880 lm=lm_cls(llm,
881 sampling_params=sampling_params,
882 streaming=streaming,
883 chat_template_kwargs=self.chat_template_kwargs),
884 task_dict=self.task_dict,
885 limit=self.num_samples,
886 apply_chat_template=self.apply_chat_template,
887 fewshot_as_multiturn=self.fewshot_as_multiturn,
888 system_instruction=self.system_prompt,
889 log_samples=self.log_samples)
890 logger.info(
891 f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
892 )
893
894 # Save results if output_path is specified
895 if self.output_path:
896 self.save_results(results)
897
898 # LongBench is a group task in lm-eval. lm-eval already computes subgroup
899 # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
900 # To keep this implementation simple and aligned with the printed table,
901 # we compute the final LongBench score as the unweighted mean of subgroup
902 # scores.
903 group_results: Dict[str, Dict[str, Any]] = results.get("groups", {})
904 subgroup_names = results.get("group_subtasks",
905 {}).get(self.task_name, [])
906 if not subgroup_names:
907 raise KeyError(
908 f"lm-eval did not provide subgroup list for group '{self.task_name}'. "
909 "Expected `results['group_subtasks'][task_name]` to exist.")
910
911 subgroup_scores: List[float] = []
912 missing: List[str] = []
913 for name in subgroup_names:
914 m = group_results.get(name, None)
915 score = self._get_group_score(m)
916 if score is None:
917 missing.append(name)
918 else:
919 subgroup_scores.append(score)
920
921 if not subgroup_scores:
922 raise KeyError(
923 f"lm-eval did not provide subgroup 'score' metrics for '{self.task_name}'. "
924 f"Missing subgroups: {missing[:10]}")
925
926 result_acc = float(np.mean(subgroup_scores)) * 100
927 logger.info(
928 f"lm-eval {self.task_name} average 'score' across {len(subgroup_scores)} subgroups: {result_acc:.2f}"
929 )

Callers 8

command · Method · 0.95
evaluate · Method · 0.45
command_harness · Method · 0.45
command · Method · 0.45
command · Method · 0.45
command · Method · 0.45

Calls 6

_get_group_score · Method · 0.95
save_results · Method · 0.80
mean · Method · 0.80
info · Method · 0.45
get · Method · 0.45
append · Method · 0.45