MCPcopy Index your code
hub / github.com/algorithmicsuperintelligence/optillm / analyze_results

Function analyze_results

scripts/eval_imobench_answer.py:227–292  ·  view source on GitHub ↗

Analyze and print comprehensive statistics

(results: List[Dict])

Source from the content-addressed store, hash-verified

225
226
def _tally_by_key(results: List[Dict], key: str, skip_unknown: bool = False) -> Dict:
    """Group results by ``r.get(key, 'Unknown')`` and count total/correct per group.

    When ``skip_unknown`` is true, results whose group value is falsy or
    equal to ``'Unknown'`` are left out of the tally entirely.
    """
    tallies = {}
    for r in results:
        group = r.get(key, 'Unknown')
        if skip_unknown and (not group or group == 'Unknown'):
            continue
        bucket = tallies.setdefault(group, {'total': 0, 'correct': 0})
        bucket['total'] += 1
        if r.get('is_correct', False):
            bucket['correct'] += 1
    return tallies


def _print_breakdown(title: str, tallies: Dict) -> None:
    """Print one ``label: correct/total (acc%)`` row per group, ordered by label."""
    print(f"\n{title}:")
    print("-" * 60)
    # Labels are ordered and formatted via str() so that non-string labels
    # (e.g. an integer difficulty or a None category) neither break sorted()
    # on mixed types nor the ':20s' format code, which only accepts strings.
    for label, stats in sorted(tallies.items(), key=lambda item: str(item[0])):
        # Every bucket is created on its first increment, so total >= 1 here.
        acc = stats['correct'] / stats['total'] * 100
        print(f"{str(label):20s}: {stats['correct']:3d}/{stats['total']:3d} ({acc:5.1f}%)")


def _sum_response_field(results: List[Dict], field: str):
    """Sum ``field`` over each result's ``'response'`` mapping.

    A missing or ``None`` response, and a missing or ``None`` field value,
    all contribute 0 instead of raising.
    """
    total = 0
    for r in results:
        response = r.get('response') or {}
        total += response.get(field) or 0
    return total


def analyze_results(results: List[Dict]):
    """Analyze and print comprehensive statistics.

    Prints, in order: the overall problem count, correct count and accuracy;
    a per-category breakdown; a per-difficulty breakdown (only if at least
    one result has a truthy difficulty other than ``'Unknown'``); and the
    summed token counts.

    Args:
        results: Result dicts. The keys read here are ``'is_correct'``
            (truthy means correct), ``'category'``, ``'difficulty'`` and
            ``'response'`` (a mapping that may hold ``'total_tokens'`` and
            ``'reasoning_tokens'``). Every key is optional; absent
            categories are reported under ``'Unknown'`` and absent token
            data counts as 0.

    Returns:
        None. All output goes to stdout. An empty ``results`` prints a
        short notice and returns immediately.
    """
    if not results:
        print("No results to analyze")
        return

    total_problems = len(results)
    correct = sum(1 for r in results if r.get('is_correct', False))

    print("\n" + "="*80)
    print("IMO-Bench AnswerBench Evaluation Results")
    print("="*80)
    print(f"Total problems: {total_problems}")
    print(f"Correct: {correct}")
    print(f"Accuracy: {correct/total_problems*100:.2f}%")

    # Category breakdown
    _print_breakdown("Performance by Category", _tally_by_key(results, 'category'))

    # Difficulty breakdown if available
    difficulties = _tally_by_key(results, 'difficulty', skip_unknown=True)
    if difficulties:
        _print_breakdown("Performance by Difficulty", difficulties)

    # Token statistics
    total_tokens = _sum_response_field(results, 'total_tokens')
    reasoning_tokens = _sum_response_field(results, 'reasoning_tokens')

    print("\nToken Statistics:")
    print("-" * 60)
    print(f"Total tokens: {total_tokens:,}")
    print(f"Reasoning tokens: {reasoning_tokens:,}")

Callers 1

mainFunction · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected