MCPcopy
hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_optillmbench.py:859–961  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

857 logger.info(f"Report saved to {report_path}")
858
859def main():
860 parser = argparse.ArgumentParser(
861 description="Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64."
862 )
863 parser.add_argument("--model", required=True, help="Model identifier")
864 parser.add_argument("--base-url", default="http://localhost:8000/v1",
865 help="Base URL for API endpoint")
866 parser.add_argument("--max-samples", type=int, help="Maximum number of samples to evaluate")
867 parser.add_argument("--output-dir", default="results",
868 help="Directory to save results")
869 parser.add_argument("--approaches", nargs="+",
870 help="Specific approaches to evaluate (overrides default test-time compute)")
871 parser.add_argument("--test-time-compute", action="store_true",
872 help="Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values)")
873 parser.add_argument("--debug", action="store_true", help="Enable debug logging")
874 args = parser.parse_args()
875
876 # Set debug logging if specified
877 if args.debug:
878 logging.getLogger().setLevel(logging.DEBUG)
879
880 # Create output directory
881 os.makedirs(args.output_dir, exist_ok=True)
882
883 # Get API key from environment
884 api_key = os.environ.get("OPENAI_API_KEY")
885 if not api_key:
886 raise ValueError("OPENAI_API_KEY environment variable must be set")
887
888 # Initialize OpenAI client
889 client = OpenAI(
890 api_key=api_key,
891 base_url=args.base_url
892 )
893
894 # Load dataset
895 dataset = load_optillm_bench()
896
897 # Determine which approaches to evaluate
898 if args.test_time_compute:
899 # Use test-time compute approaches
900 approaches_config = TEST_TIME_COMPUTE_APPROACHES
901 if args.approaches:
902 # Filter test-time compute approaches if specific ones are requested
903 approaches_config = [a for a in TEST_TIME_COMPUTE_APPROACHES if a[0] in args.approaches]
904 elif args.approaches:
905 # Specific approaches requested - check all available approach lists
906 all_available_approaches = APPROACHES + TEST_TIME_COMPUTE_APPROACHES + DEFAULT_TEST_TIME_COMPUTE
907 approaches_config = []
908 for requested_approach in args.approaches:
909 found = False
910 for approach_tuple in all_available_approaches:
911 if approach_tuple[0] == requested_approach:
912 if approach_tuple not in approaches_config: # Avoid duplicates
913 approaches_config.append(approach_tuple)
914 found = True
915 break
916 if not found:

Callers 1

Calls 4

load_optillm_bench · Function · 0.85
evaluate_model · Function · 0.85
save_results · Function · 0.85
generate_report · Function · 0.85

Tested by

no test coverage detected