MCPcopy Index your code
hub / github.com/algorithmicsuperintelligence/optillm / analyze_results

Function analyze_results

scripts/eval_aime_benchmark.py:475–794  ·  view source on GitHub ↗

Analyze and print summary statistics of the results. Args: results (List[Dict]): List of evaluation results; n (int): Number of attempts per problem; analyze_thoughts (bool): Whether to analyze thinking patterns; analyze_logits (bool): Whether to analyze token probabilities.

(results: List[Dict], n: int, analyze_thoughts: bool = False, analyze_logits: bool = False)

Source from the content-addressed store, hash-verified

473 return max(int(r.get('index', -1)) for r in results)
474
475def analyze_results(results: List[Dict], n: int, analyze_thoughts: bool = False, analyze_logits: bool = False):
476 """
477 Analyze and print summary statistics of the results.
478
479 Args:
480 results (List[Dict]): List of evaluation results
481 n (int): Number of attempts per problem
482 analyze_thoughts (bool): Whether to analyze thinking patterns
483 analyze_logits (bool): Whether to analyze token probabilities
484 """
485 total = len(results)
486 correct = sum(1 for r in results if r['is_correct'])
487 accuracy = correct / total if total > 0 else 0
488
489 print("\n=== Results Summary ===")
490 print(f"Evaluation mode: pass@{n}")
491 print(f"Total problems: {total}")
492 print(f"Correct answers: {correct}")
493 print(f"Accuracy: {accuracy:.2%}")
494
495 # Calculate attempt statistics
496 successful_attempts = [r['first_correct_attempt'] for r in results if r['is_correct']]
497 if successful_attempts:
498 avg_attempts = sum(successful_attempts) / len(successful_attempts)
499 print(f"\nFor correct solutions:")
500 print(f"Average attempts needed: {avg_attempts:.2f}")
501 print(f"Attempt distribution:")
502 for i in range(1, n + 1):
503 count = sum(1 for x in successful_attempts if x == i)
504 print(f" Attempt {i}: {count} problems")
505
506 if analyze_thoughts:
507 print("\n=== Thinking Pattern Analysis ===")
508
509 # Collect metrics about thinking patterns for correct vs incorrect attempts
510 correct_attempts = []
511 incorrect_attempts = []
512
513 for result in results:
514 for attempt in result['attempts']:
515 if 'thought_analysis' in attempt:
516 if result['is_correct'] and attempt['predicted_answer'] == result['correct_answer']:
517 correct_attempts.append(attempt)
518 else:
519 incorrect_attempts.append(attempt)
520
521 # Function to calculate statistics for a group of attempts
522 def calc_stats(attempts):
523 if not attempts:
524 return {
525 "count": 0,
526 "avg_thinking_tokens": 0,
527 "avg_thought_transitions": 0,
528 "transition_usage": {phrase: 0 for phrase in THOUGHT_TRANSITIONS},
529 "has_think_tags_pct": 0
530 }
531
532 thinking_tokens = [a['thought_analysis']['thinking_tokens'] for a in attempts]

Callers 1

main · Function · 0.70

Calls 2

calc_stats · Function · 0.85
calc_logit_stats · Function · 0.85

Tested by

no test coverage detected