(args, subject, pipeline, dev_df, test_df)
| 176 | |
| 177 | |
def evaluate(args, subject, pipeline, dev_df, test_df):
    """Score ``pipeline`` on the test questions of a single subject.

    For each test row (at most ``args.max_ite`` of them) a prompt is built
    from up to ``args.ntrain`` examples produced by ``gen_prompt`` plus the
    unanswered test question. The number of examples is reduced one at a
    time while ``pipeline.check_valid_length`` rejects the prompt.

    Args:
        args: Namespace providing ``max_ite`` (upper bound on evaluated rows)
            and ``ntrain`` (initial number of few-shot examples).
        subject: Subject identifier forwarded to ``gen_prompt`` and printed
            in the summary line.
        pipeline: Callable mapping a prompt to generated text; must also
            expose ``check_valid_length(prompt)``.
        dev_df: DataFrame passed to ``gen_prompt`` for the few-shot examples.
        test_df: DataFrame of test questions; the last column of each row is
            used as the expected answer label.

    Returns:
        On MPI rank 0, a tuple ``(cors, acc, all_probs)`` where ``cors`` is
        an array with one correctness flag per evaluated row, ``acc`` is
        their mean (``0.0`` when no row was evaluated) and ``all_probs`` is
        an array of all-zero placeholder rows, one entry per choice.
        On every other rank, ``(None, 0, None)``.
    """
    rank = tensorrt_llm.mpi_rank()
    cors = []
    all_probs = []

    num_samples = min(test_df.shape[0], args.max_ite)
    for i in range(num_samples):
        # Build the prompt and shrink the few-shot context until it fits.
        k = args.ntrain
        prompt_end = format_example(test_df, i, include_answer=False)
        prompt = gen_prompt(dev_df, subject, k) + prompt_end

        # NOTE: if the prompt is still rejected at k == 0 it is used as-is.
        while not pipeline.check_valid_length(prompt) and k > 0:
            k -= 1
            prompt = gen_prompt(dev_df, subject, k) + prompt_end

        label = test_df.iloc[i, -1]
        # The pipeline is invoked on every rank; only rank 0 records scores.
        pred = pipeline(prompt)

        if rank == 0:
            cors.append(pred.strip().startswith(label))
            # Placeholder: no per-choice probabilities are computed here.
            all_probs.append([0 for _ in get_choices()])

    if rank != 0:
        return None, 0, None

    # np.mean([]) would warn and yield nan; report 0.0 when nothing was scored.
    acc = np.mean(cors) if cors else 0.0
    cors = np.array(cors)
    all_probs = np.array(all_probs)
    print("Average accuracy {:.3f} - {}".format(acc, subject))

    return cors, acc, all_probs
| 215 | |
| 216 | |
| 217 | def get_tokenizer(ckpt_path, max_seq_len): |
no test coverage detected