MCP · copy
hub / github.com/karpathy/nanochat / evaluate_core

Function evaluate_core

scripts/base_eval.py:107–173  ·  view source on GitHub ↗

Evaluate a base model on the CORE benchmark. Returns dict with results, centered_results, and core_metric.

(model, tokenizer, device, max_per_task=-1)

Source from the content-addressed store, hash-verified

105
106
107def evaluate_core(model, tokenizer, device, max_per_task=-1):
108 """
109 Evaluate a base model on the CORE benchmark.
110 Returns dict with results, centered_results, and core_metric.
111 """
112 base_dir = get_base_dir()
113 eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
114 # Download the eval bundle if needed
115 if not os.path.exists(eval_bundle_dir):
116 download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
117
118 config_path = os.path.join(eval_bundle_dir, "core.yaml")
119 data_base_path = os.path.join(eval_bundle_dir, "eval_data")
120 eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
121
122 with open(config_path, 'r', encoding='utf-8') as f:
123 config = yaml.safe_load(f)
124 tasks = config['icl_tasks']
125
126 # Load random baseline values
127 random_baselines = {}
128 with open(eval_meta_data, 'r', encoding='utf-8') as f:
129 reader = csv.DictReader(f)
130 for row in reader:
131 task_name = row['Eval Task']
132 random_baseline = row['Random baseline']
133 random_baselines[task_name] = float(random_baseline)
134
135 # Evaluate each task
136 results = {}
137 centered_results = {}
138 for task in tasks:
139 start_time = time.time()
140 label = task['label']
141 task_meta = {
142 'task_type': task['icl_task_type'],
143 'dataset_uri': task['dataset_uri'],
144 'num_fewshot': task['num_fewshot'][0],
145 'continuation_delimiter': task.get('continuation_delimiter', ' ')
146 }
147 print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
148
149 data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
150 with open(data_path, 'r', encoding='utf-8') as f:
151 data = [json.loads(line.strip()) for line in f]
152
153 # Shuffle for consistent subsampling when using max_per_task
154 shuffle_rng = random.Random(1337)
155 shuffle_rng.shuffle(data)
156 if max_per_task > 0:
157 data = data[:max_per_task]
158
159 accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
160 results[label] = accuracy
161 random_baseline = random_baselines[label]
162 centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
163 centered_results[label] = centered_result
164 elapsed = time.time() - start_time

Callers 2

base_train.py (File) · 0.90
main (Function) · 0.85

Calls 4

get_base_dir (Function) · 0.90
download_file_with_lock (Function) · 0.90
print0 (Function) · 0.90
evaluate_task (Function) · 0.90

Tested by

no test coverage detected