()
| 857 | logger.info(f"Report saved to {report_path}") |
| 858 | |
| 859 | def main(): |
| 860 | parser = argparse.ArgumentParser( |
| 861 | description="Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64." |
| 862 | ) |
| 863 | parser.add_argument("--model", required=True, help="Model identifier") |
| 864 | parser.add_argument("--base-url", default="http://localhost:8000/v1", |
| 865 | help="Base URL for API endpoint") |
| 866 | parser.add_argument("--max-samples", type=int, help="Maximum number of samples to evaluate") |
| 867 | parser.add_argument("--output-dir", default="results", |
| 868 | help="Directory to save results") |
| 869 | parser.add_argument("--approaches", nargs="+", |
| 870 | help="Specific approaches to evaluate (overrides default test-time compute)") |
| 871 | parser.add_argument("--test-time-compute", action="store_true", |
| 872 | help="Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values)") |
| 873 | parser.add_argument("--debug", action="store_true", help="Enable debug logging") |
| 874 | args = parser.parse_args() |
| 875 | |
| 876 | # Set debug logging if specified |
| 877 | if args.debug: |
| 878 | logging.getLogger().setLevel(logging.DEBUG) |
| 879 | |
| 880 | # Create output directory |
| 881 | os.makedirs(args.output_dir, exist_ok=True) |
| 882 | |
| 883 | # Get API key from environment |
| 884 | api_key = os.environ.get("OPENAI_API_KEY") |
| 885 | if not api_key: |
| 886 | raise ValueError("OPENAI_API_KEY environment variable must be set") |
| 887 | |
| 888 | # Initialize OpenAI client |
| 889 | client = OpenAI( |
| 890 | api_key=api_key, |
| 891 | base_url=args.base_url |
| 892 | ) |
| 893 | |
| 894 | # Load dataset |
| 895 | dataset = load_optillm_bench() |
| 896 | |
| 897 | # Determine which approaches to evaluate |
| 898 | if args.test_time_compute: |
| 899 | # Use test-time compute approaches |
| 900 | approaches_config = TEST_TIME_COMPUTE_APPROACHES |
| 901 | if args.approaches: |
| 902 | # Filter test-time compute approaches if specific ones are requested |
| 903 | approaches_config = [a for a in TEST_TIME_COMPUTE_APPROACHES if a[0] in args.approaches] |
| 904 | elif args.approaches: |
| 905 | # Specific approaches requested - check all available approach lists |
| 906 | all_available_approaches = APPROACHES + TEST_TIME_COMPUTE_APPROACHES + DEFAULT_TEST_TIME_COMPUTE |
| 907 | approaches_config = [] |
| 908 | for requested_approach in args.approaches: |
| 909 | found = False |
| 910 | for approach_tuple in all_available_approaches: |
| 911 | if approach_tuple[0] == requested_approach: |
| 912 | if approach_tuple not in approaches_config: # Avoid duplicates |
| 913 | approaches_config.append(approach_tuple) |
| 914 | found = True |
| 915 | break |
| 916 | if not found: |
no test coverage detected