hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_optillmbench.py:859–961 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

857	logger.info(f"Report saved to {report_path}")
858
859	def main():
860	parser = argparse.ArgumentParser(
861	description="Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64."
862	)
863	parser.add_argument("--model", required=True, help="Model identifier")
864	parser.add_argument("--base-url", default="http://localhost:8000/v1",
865	help="Base URL for API endpoint")
866	parser.add_argument("--max-samples", type=int, help="Maximum number of samples to evaluate")
867	parser.add_argument("--output-dir", default="results",
868	help="Directory to save results")
869	parser.add_argument("--approaches", nargs="+",
870	help="Specific approaches to evaluate (overrides default test-time compute)")
871	parser.add_argument("--test-time-compute", action="store_true",
872	help="Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values)")
873	parser.add_argument("--debug", action="store_true", help="Enable debug logging")
874	args = parser.parse_args()
875
876	# Set debug logging if specified
877	if args.debug:
878	logging.getLogger().setLevel(logging.DEBUG)
879
880	# Create output directory
881	os.makedirs(args.output_dir, exist_ok=True)
882
883	# Get API key from environment
884	api_key = os.environ.get("OPENAI_API_KEY")
885	if not api_key:
886	raise ValueError("OPENAI_API_KEY environment variable must be set")
887
888	# Initialize OpenAI client
889	client = OpenAI(
890	api_key=api_key,
891	base_url=args.base_url
892	)
893
894	# Load dataset
895	dataset = load_optillm_bench()
896
897	# Determine which approaches to evaluate
898	if args.test_time_compute:
899	# Use test-time compute approaches
900	approaches_config = TEST_TIME_COMPUTE_APPROACHES
901	if args.approaches:
902	# Filter test-time compute approaches if specific ones are requested
903	approaches_config = [a for a in TEST_TIME_COMPUTE_APPROACHES if a[0] in args.approaches]
904	elif args.approaches:
905	# Specific approaches requested - check all available approach lists
906	all_available_approaches = APPROACHES + TEST_TIME_COMPUTE_APPROACHES + DEFAULT_TEST_TIME_COMPUTE
907	approaches_config = []
908	for requested_approach in args.approaches:
909	found = False
910	for approach_tuple in all_available_approaches:
911	if approach_tuple[0] == requested_approach:
912	if approach_tuple not in approaches_config: # Avoid duplicates
913	approaches_config.append(approach_tuple)
914	found = True
915	break
916	if not found:

Callers 1

eval_optillmbench.py — File · 0.70

Calls 4

load_optillm_bench — Function · 0.85

evaluate_model — Function · 0.85

save_results — Function · 0.85

generate_report — Function · 0.85

Tested by

no test coverage detected