(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None)
| 27 | # Generative evaluation loop (we go one problem at a time, sample, evaluate) |
| 28 | |
def run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None):
    """Evaluate a generative task one problem at a time and return the pass rate.

    For every problem, `num_samples` completions are generated; the problem
    counts as passed if `task_object.evaluate` is truthy for at least one of
    them. Problems are strided across ranks (rank r handles r, r + world_size,
    ...) and the per-rank counts are summed with an all-reduce when running
    distributed.

    Args:
        task_object: Indexable, sized collection of conversations that also
            provides `evaluate(conversation, completion)`.
        tokenizer: Provides `render_for_completion(conversation)` and
            `decode(tokens)`.
        model: Only used for `get_device()`, to place the reduction tensors.
        engine: Provides `generate_batch(tokens, num_samples=..., max_tokens=...,
            temperature=..., top_k=...)`.
        num_samples: Number of completions to sample per problem.
        max_new_tokens: Forwarded to the engine as `max_tokens`.
        temperature: Sampling temperature forwarded to the engine.
        top_k: Top-k setting forwarded to the engine.
        max_problems: Optional cap on the number of problems evaluated;
            `None` means the whole task.

    Returns:
        Fraction of evaluated problems that passed (summed over all ranks),
        or 0.0 when no problem was evaluated at all.
    """
    # The local rank is not needed in this function.
    ddp, ddp_rank, _, ddp_world_size = get_dist_info()
    device = model.get_device()

    num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)

    # Run the evaluation
    num_passed, total = 0, 0
    for i in range(ddp_rank, num_problems, ddp_world_size):
        conversation = task_object[i]

        # Tokenize the prompt
        encoded_prompt = tokenizer.render_for_completion(conversation)
        # Get the completions
        results, _ = engine.generate_batch(
            encoded_prompt,
            num_samples=num_samples,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
        )
        # Decode the completions as text. Each result is sliced past the prompt
        # length, so it presumably starts with the prompt tokens -- TODO confirm
        # against the engine.
        prefix_length = len(encoded_prompt)
        completions = [tokenizer.decode(result_tokens[prefix_length:]) for result_tokens in results]
        # Evaluate success criteria (every completion is evaluated, no short-circuit)
        outcomes = [task_object.evaluate(conversation, completion) for completion in completions]
        passed = any(outcomes)

        # Keep stats
        total += 1
        num_passed += int(passed)

        # Logging (overwrite the same line in the console); total >= 1 here
        print(f"\r\033[KRank {ddp_rank} | {num_passed}/{total} ({100*num_passed/total:.2f}%)", end='', flush=True)

    # Finish the in-place progress line with a newline before final summary
    print()

    # Aggregate results across all ranks. Every rank must reach these calls,
    # including ranks whose strided loop above was empty.
    if ddp:
        num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
        total_tensor = torch.tensor([total], dtype=torch.long, device=device)
        dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
        dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
        num_passed = num_passed_tensor.item()
        total = total_tensor.item()

    # Guard the empty case (empty task or max_problems <= 0): dividing by a
    # zero total would raise ZeroDivisionError, so report 0% instead.
    percent = 100 * num_passed / total if total > 0 else 0.0
    print0("=" * 50)
    print0(f"Final: {num_passed}/{total} ({percent:.2f}%)")

    # Return the accuracy
    return num_passed / total if total > 0 else 0.0
| 82 | |
| 83 | # ----------------------------------------------------------------------------- |
| 84 | # Categorical evaluation loop |
no test coverage detected