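Evaluate a model on the dataset using a specific approach. Returns a tuple of (metrics, detailed results): a dict of metrics and a list of detailed result dicts. When max_samples is given, only the first max_samples examples of the dataset are evaluated.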
(
client: OpenAI,
model: str,
dataset: datasets.Dataset,
approach: str,
approach_extra_body: Dict[str, Any] = None,
max_samples: int = None
)
| 301 | return f"Question: {question}" |
| 302 | |
| 303 | def evaluate_model( |
| 304 | client: OpenAI, |
| 305 | model: str, |
| 306 | dataset: datasets.Dataset, |
| 307 | approach: str, |
| 308 | approach_extra_body: Dict[str, Any] = None, |
| 309 | max_samples: int = None |
| 310 | ) -> Tuple[Dict[str, float], List[Dict[str, Any]]]: |
| 311 | """ |
| 312 | Evaluate a model on the dataset using a specific approach. |
| 313 | Returns metrics and detailed results. |
| 314 | """ |
| 315 | metrics = { |
| 316 | "total_correct": 0, |
| 317 | "total_time": 0, |
| 318 | "samples": 0, |
| 319 | } |
| 320 | |
| 321 | # Initialize category-specific metrics |
| 322 | category_metrics = {} |
| 323 | |
| 324 | # Detailed results for each example |
| 325 | detailed_results = [] |
| 326 | |
| 327 | # Prepare the dataset |
| 328 | examples = dataset if max_samples is None else dataset.select(range(max_samples)) |
| 329 | |
| 330 | # Check if we need to do multiple runs (for pass@1 calculation) |
| 331 | num_runs = approach_extra_body.get("num_runs", 1) if approach_extra_body else 1 |
| 332 | # Check if we're using n parameter for parallel generation |
| 333 | n_param = approach_extra_body.get("n", 1) if approach_extra_body else 1 |
| 334 | |
| 335 | # Handle special approach names and create model names |
| 336 | if approach.startswith("avg@") or approach.startswith("pass@"): |
| 337 | # For avg@N and pass@N, use base model without any prefix |
| 338 | full_model_name = model |
| 339 | elif approach.startswith("maj@"): |
| 340 | # For majority voting, use the plugin prefix |
| 341 | full_model_name = f"majority_voting-{model}" |
| 342 | elif approach.startswith("genselect@"): |
| 343 | # For genselect, use the plugin prefix |
| 344 | full_model_name = f"genselect-{model}" |
| 345 | elif approach.startswith("thinkdeeper_"): |
| 346 | # For thinkdeeper, use base model name (decoding is passed in extra_body) |
| 347 | full_model_name = model |
| 348 | elif approach.startswith("majority_voting"): |
| 349 | # For other majority voting configurations |
| 350 | full_model_name = f"majority_voting-{model}" |
| 351 | elif approach == "none": |
| 352 | # For explicit none approach |
| 353 | full_model_name = model |
| 354 | else: |
| 355 | # Standard approach prefix |
| 356 | full_model_name = f"{approach}-{model}" |
| 357 | |
| 358 | for example in tqdm(examples, desc=f"Evaluating {approach}"): |
| 359 | # For avg@N and pass@N with n parameter, we generate n responses in parallel |
| 360 | if n_param > 1 and (approach.startswith("avg@") or approach.startswith("pass@")): |
no test coverage detected