Function main

scripts/eval_math500_benchmark.py:745–782 · view source on GitHub ↗

Main evaluation function.

(model: str)

Source from the content-addressed store, hash-verified

743	print("---")
744
def main(model: str):
    """Evaluate ``model`` on the MATH-500 problems, resuming any earlier run.

    The results file name is derived from ``model`` with every ``/``
    replaced by ``_``. Entries already present in that file are looked up
    by their ``index`` field and the matching problems are skipped, so an
    interrupted run can be restarted without re-querying the model for
    problems that were already recorded.

    For every remaining problem the model response is fetched, an answer is
    extracted from it, compared with the reference answer, and the outcome
    is passed to ``save_result`` right away. After the loop the results are
    loaded again and passed to ``analyze_results``.

    Args:
        model: Model identifier forwarded to ``get_llm_response``; also used
            to build the results file name.
    """
    os.makedirs("results", exist_ok=True)
    # NOTE(review): the name below has no directory component, so this
    # function itself does not place the file under "results" -- confirm
    # whether the load/save helpers add that prefix.
    results_file = f"evaluation_results_math500_{model.replace('/', '_')}.json"

    dataset = load_math500_dataset()

    # Indexes recorded by an earlier run; kept in a set for cheap membership tests.
    done = {entry['index'] for entry in load_existing_results(results_file)}

    for position, sample in enumerate(tqdm(dataset, desc="Evaluating problems")):
        if position in done:
            # Already evaluated previously; nothing to do for this problem.
            continue

        question = sample['problem']
        reference = sample['answer']

        # Query the model, then pull the final answer out of its raw response.
        raw_response = get_llm_response(question, model)
        extracted = extract_answer(raw_response)

        # Hand each record to save_result immediately rather than batching,
        # so finished problems are not held only in memory.
        save_result(
            results_file,
            {
                "index": position,
                "problem": question,
                "response": raw_response,
                "correct_answer": reference,
                "predicted_answer": extracted,
                "is_correct": compare_answers(reference, extracted),
            },
        )

    # Analyze everything on record, including entries from earlier runs.
    analyze_results(load_existing_results(results_file))
783
784	if __name__ == "__main__":
785	parser = argparse.ArgumentParser(description="Evaluate LLM performance on MATH-500 problems")

eval_math500_benchmark.py — File · 0.70

load_math500_dataset — Function · 0.85

load_existing_results — Function · 0.70

get_llm_response — Function · 0.70

extract_answer — Function · 0.70

compare_answers — Function · 0.70

save_result — Function · 0.70

analyze_results — Function · 0.70

no test coverage detected