Evaluate a base model on the CORE benchmark. Returns a dict with results, centered_results, and core_metric.
(model, tokenizer, device, max_per_task=-1)
| 105 | |
| 106 | |
| 107 | def evaluate_core(model, tokenizer, device, max_per_task=-1): |
| 108 | """ |
| 109 | Evaluate a base model on the CORE benchmark. |
| 110 | Returns dict with results, centered_results, and core_metric. |
| 111 | """ |
| 112 | base_dir = get_base_dir() |
| 113 | eval_bundle_dir = os.path.join(base_dir, "eval_bundle") |
| 114 | # Download the eval bundle if needed |
| 115 | if not os.path.exists(eval_bundle_dir): |
| 116 | download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle) |
| 117 | |
| 118 | config_path = os.path.join(eval_bundle_dir, "core.yaml") |
| 119 | data_base_path = os.path.join(eval_bundle_dir, "eval_data") |
| 120 | eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") |
| 121 | |
| 122 | with open(config_path, 'r', encoding='utf-8') as f: |
| 123 | config = yaml.safe_load(f) |
| 124 | tasks = config['icl_tasks'] |
| 125 | |
| 126 | # Load random baseline values |
| 127 | random_baselines = {} |
| 128 | with open(eval_meta_data, 'r', encoding='utf-8') as f: |
| 129 | reader = csv.DictReader(f) |
| 130 | for row in reader: |
| 131 | task_name = row['Eval Task'] |
| 132 | random_baseline = row['Random baseline'] |
| 133 | random_baselines[task_name] = float(random_baseline) |
| 134 | |
| 135 | # Evaluate each task |
| 136 | results = {} |
| 137 | centered_results = {} |
| 138 | for task in tasks: |
| 139 | start_time = time.time() |
| 140 | label = task['label'] |
| 141 | task_meta = { |
| 142 | 'task_type': task['icl_task_type'], |
| 143 | 'dataset_uri': task['dataset_uri'], |
| 144 | 'num_fewshot': task['num_fewshot'][0], |
| 145 | 'continuation_delimiter': task.get('continuation_delimiter', ' ') |
| 146 | } |
| 147 | print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='') |
| 148 | |
| 149 | data_path = os.path.join(data_base_path, task_meta['dataset_uri']) |
| 150 | with open(data_path, 'r', encoding='utf-8') as f: |
| 151 | data = [json.loads(line.strip()) for line in f] |
| 152 | |
| 153 | # Shuffle for consistent subsampling when using max_per_task |
| 154 | shuffle_rng = random.Random(1337) |
| 155 | shuffle_rng.shuffle(data) |
| 156 | if max_per_task > 0: |
| 157 | data = data[:max_per_task] |
| 158 | |
| 159 | accuracy = evaluate_task(model, tokenizer, data, device, task_meta) |
| 160 | results[label] = accuracy |
| 161 | random_baseline = random_baselines[label] |
| 162 | centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline) |
| 163 | centered_results[label] = centered_result |
| 164 | elapsed = time.time() - start_time |
no test coverage detected