Function evaluate

examples/mmlu.py:178–214 · view source on GitHub ↗

(args, subject, pipeline, dev_df, test_df)

Source from the content-addressed store, hash-verified

176
177
def evaluate(args, subject, pipeline, dev_df, test_df):
    """Score ``pipeline`` on the test rows of one subject and print the accuracy.

    For every row of ``test_df`` (stopping once the row index reaches
    ``args.max_ite``) a prompt is assembled from ``gen_prompt(dev_df, subject,
    k)`` followed by the test question formatted without its answer.  ``k``
    starts at ``args.ntrain`` and is decremented while
    ``pipeline.check_valid_length`` rejects the prompt and ``k`` is still
    positive (``k`` is presumably the number of few-shot examples -- confirm
    against ``gen_prompt``).

    The pipeline is invoked on every rank, but predictions are only scored
    where ``tensorrt_llm.mpi_rank()`` is 0.  A prediction counts as correct
    when, after stripping surrounding whitespace, it starts with the value
    stored in the last column of the test row.

    Returns:
        On rank 0, a tuple ``(cors, acc, all_probs)``: ``cors`` is a NumPy
        array of per-row correctness flags, ``acc`` is their mean, and
        ``all_probs`` is a NumPy array holding one all-zero placeholder row
        (one entry per item of ``get_choices()``) per scored question.
        On every other rank, ``(None, 0, None)``.
    """
    is_scoring_rank = tensorrt_llm.mpi_rank() == 0
    hits = []
    placeholder_probs = []

    for row in range(test_df.shape[0]):
        if row >= args.max_ite:
            break

        # Build the prompt, then drop examples one at a time until the
        # pipeline accepts its length or nothing is left to drop.
        question = format_example(test_df, row, include_answer=False)
        shots = args.ntrain
        prompt = gen_prompt(dev_df, subject, shots) + question
        while not pipeline.check_valid_length(prompt) and shots > 0:
            shots -= 1
            prompt = gen_prompt(dev_df, subject, shots) + question

        # The expected answer is read from the last column of the row.
        expected = test_df.iloc[row, -1]
        # Every rank runs the pipeline; only the scoring rank keeps results.
        answer = pipeline(prompt)

        if is_scoring_rank:
            hits.append(answer.strip().startswith(expected))
            # No per-choice probabilities are computed here; record zeros.
            placeholder_probs.append([0 for _ in get_choices()])

    if not is_scoring_rank:
        return None, 0, None

    cors = np.array(hits)
    acc = np.mean(cors)
    all_probs = np.array(placeholder_probs)

    summary = "Average accuracy {:.3f} - {}".format(acc, subject)
    print(summary)

    return cors, acc, all_probs
215
216
217	def get_tokenizer(ckpt_path, max_seq_len):

mainFunction · 0.85

format_exampleFunction · 0.85

gen_promptFunction · 0.85

get_choicesFunction · 0.85

check_valid_lengthMethod · 0.80

meanMethod · 0.80

appendMethod · 0.45

no test coverage detected