Analyze and print comprehensive statistics
(results: List[Dict])
| 225 | |
| 226 | |
def _tally_by(results: List[Dict], field: str) -> Dict:
    """Group results by the value under *field*, counting total and correct.

    A missing key or a ``None`` value is filed under the label 'Unknown'.
    """
    tallies: Dict = {}
    for r in results:
        label = r.get(field)
        if label is None:
            label = 'Unknown'
        bucket = tallies.setdefault(label, {'total': 0, 'correct': 0})
        bucket['total'] += 1
        if r.get('is_correct', False):
            bucket['correct'] += 1
    return tallies


def _print_breakdown(title: str, tallies: Dict) -> None:
    """Print one 'label: correct/total (accuracy%)' line per bucket, sorted by label."""
    print(f"\n{title}")
    print("-" * 60)
    # Sort and format through str() so non-string labels (e.g. integer
    # difficulty levels) neither break the ordering nor the ':20s' format.
    for label, stats in sorted(tallies.items(), key=lambda item: str(item[0])):
        # Every bucket holds at least one result, so 'total' is never zero.
        acc = stats['correct'] / stats['total'] * 100
        print(f"{str(label):20s}: {stats['correct']:3d}/{stats['total']:3d} ({acc:5.1f}%)")


def _sum_response_field(results: List[Dict], key: str) -> int:
    """Sum ``result['response'][key]`` over all results, skipping absent data."""
    total = 0
    for r in results:
        response = r.get('response')
        # Results without a usable response dict contribute nothing instead
        # of aborting the whole report.
        if not isinstance(response, dict):
            continue
        total += response.get(key, 0) or 0
    return total


def analyze_results(results: List[Dict]):
    """Analyze and print comprehensive statistics.

    Prints, in order: overall accuracy, a per-category breakdown, a
    per-difficulty breakdown (only when at least one result carries a
    difficulty other than 'Unknown'), and token totals.

    Args:
        results: One dict per evaluated problem. The keys read here are
            'is_correct', 'category', 'difficulty' and 'response' (a dict
            holding 'total_tokens' and 'reasoning_tokens'). Every key is
            optional; missing values are treated as incorrect / 'Unknown' /
            zero tokens.

    Returns:
        None. All output goes to stdout. If *results* is empty, a single
        notice is printed and nothing else.
    """
    if not results:
        print("No results to analyze")
        return

    total_problems = len(results)
    correct = sum(1 for r in results if r.get('is_correct', False))

    print("\n" + "=" * 80)
    print("IMO-Bench AnswerBench Evaluation Results")
    print("=" * 80)
    print(f"Total problems: {total_problems}")
    print(f"Correct: {correct}")
    print(f"Accuracy: {correct / total_problems * 100:.2f}%")

    # Category breakdown
    _print_breakdown("Performance by Category:", _tally_by(results, 'category'))

    # Difficulty breakdown if available: results with an empty or 'Unknown'
    # difficulty are left out, and the section is omitted when nothing remains.
    difficulties = {
        label: stats
        for label, stats in _tally_by(results, 'difficulty').items()
        if label and label != 'Unknown'
    }
    if difficulties:
        _print_breakdown("Performance by Difficulty:", difficulties)

    # Token statistics
    total_tokens = _sum_response_field(results, 'total_tokens')
    reasoning_tokens = _sum_response_field(results, 'reasoning_tokens')

    print("\nToken Statistics:")
    print("-" * 60)
    print(f"Total tokens: {total_tokens:,}")
    print(f"Reasoning tokens: {reasoning_tokens:,}")