Save evaluation results to files.
(metrics: Dict[str, float], detailed_results: List[Dict[str, Any]],
model: str, approach: str, output_dir: str)
| 721 | return final_metrics, detailed_results |
| 722 | |
def save_results(metrics: Dict[str, float], detailed_results: List[Dict[str, Any]],
                 model: str, approach: str, output_dir: str) -> None:
    """Save evaluation results to files.

    Three files are written into ``<output_dir>/<model>/`` (with every ``/``
    in ``model`` replaced by ``_``), all sharing the prefix
    ``<approach>_<YYYYmmdd_HHMMSS>``:

    * ``<prefix>_metrics.json``  -- ``metrics`` serialized as JSON.
    * ``<prefix>_detailed.json`` -- ``detailed_results`` serialized as JSON.
    * ``<prefix>_summary.csv``   -- one row per entry of ``detailed_results``
      with the ``raw_response`` and ``processed_response`` keys left out.

    Args:
        metrics: Metric values to dump to the metrics JSON file.
        detailed_results: Per-item result dictionaries.
        model: Model identifier; used (sanitized) as the sub-directory name.
        approach: Approach name; used as the leading part of the file names.
        output_dir: Root directory under which the model directory is created.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
        TypeError: If ``metrics`` or ``detailed_results`` contain values that
            ``json.dump`` cannot serialize.

    Note:
        The timestamp has one-second resolution and files are opened in
        ``"w"`` mode, so two calls for the same model and approach within the
        same second write to the same paths.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Model names may contain '/', which would otherwise be read as a
    # path separator and create nested directories.
    model_dir = os.path.join(output_dir, model.replace('/', '_'))
    os.makedirs(model_dir, exist_ok=True)

    base_filename = os.path.join(model_dir, f"{approach}_{timestamp}")

    # Explicit encoding keeps the output independent of the platform's
    # default locale encoding.
    with open(f"{base_filename}_metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    with open(f"{base_filename}_detailed.json", "w", encoding="utf-8") as f:
        json.dump(detailed_results, f, indent=2)

    # Summary table for easier analysis: the response fields are dropped,
    # every other key becomes a CSV column.
    excluded_keys = {'raw_response', 'processed_response'}
    df = pd.DataFrame([
        {k: v for k, v in result.items() if k not in excluded_keys}
        for result in detailed_results
    ])
    df.to_csv(f"{base_filename}_summary.csv", index=False)

    logger.info("Results saved to %s_*", base_filename)
| 750 | |
| 751 | def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False): |
| 752 | """Generate a comprehensive report comparing all approaches.""" |