MCPcopy
hub / github.com/algorithmicsuperintelligence/optillm / evaluate_model

Function evaluate_model

scripts/eval_optillmbench.py:303–721  ·  view source on GitHub ↗

Evaluate a model on the dataset using a specific approach. Returns metrics and detailed results.

(
    client: OpenAI,
    model: str,
    dataset: datasets.Dataset,
    approach: str,
    approach_extra_body: Dict[str, Any] = None,
    max_samples: int = None
)

Source from the content-addressed store, hash-verified

301 return f"Question: {question}"
302
303def evaluate_model(
304 client: OpenAI,
305 model: str,
306 dataset: datasets.Dataset,
307 approach: str,
308 approach_extra_body: Dict[str, Any] = None,
309 max_samples: int = None
310) -> Tuple[Dict[str, float], List[Dict[str, Any]]]:
311 """
312 Evaluate a model on the dataset using a specific approach.
313 Returns metrics and detailed results.
314 """
315 metrics = {
316 "total_correct": 0,
317 "total_time": 0,
318 "samples": 0,
319 }
320
321 # Initialize category-specific metrics
322 category_metrics = {}
323
324 # Detailed results for each example
325 detailed_results = []
326
327 # Prepare the dataset
328 examples = dataset if max_samples is None else dataset.select(range(max_samples))
329
330 # Check if we need to do multiple runs (for pass@1 calculation)
331 num_runs = approach_extra_body.get("num_runs", 1) if approach_extra_body else 1
332 # Check if we're using n parameter for parallel generation
333 n_param = approach_extra_body.get("n", 1) if approach_extra_body else 1
334
335 # Handle special approach names and create model names
336 if approach.startswith("avg@") or approach.startswith("pass@"):
337 # For avg@N and pass@N, use base model without any prefix
338 full_model_name = model
339 elif approach.startswith("maj@"):
340 # For majority voting, use the plugin prefix
341 full_model_name = f"majority_voting-{model}"
342 elif approach.startswith("genselect@"):
343 # For genselect, use the plugin prefix
344 full_model_name = f"genselect-{model}"
345 elif approach.startswith("thinkdeeper_"):
346 # For thinkdeeper, use base model name (decoding is passed in extra_body)
347 full_model_name = model
348 elif approach.startswith("majority_voting"):
349 # For other majority voting configurations
350 full_model_name = f"majority_voting-{model}"
351 elif approach == "none":
352 # For explicit none approach
353 full_model_name = model
354 else:
355 # Standard approach prefix
356 full_model_name = f"{approach}-{model}"
357
358 for example in tqdm(examples, desc=f"Evaluating {approach}"):
359 # For avg@N and pass@N with n parameter, we generate n responses in parallel
360 if n_param > 1 and (approach.startswith("avg@") or approach.startswith("pass@")):

Callers 1

main (Function) · 0.85

Calls 6

get_prompt_for_category (Function) · 0.85
update (Method) · 0.80
remove_thinking_blocks (Function) · 0.70
evaluate_response (Function) · 0.70
select (Method) · 0.45
create (Method) · 0.45

Tested by

no test coverage detected