MCPcopy
hub / github.com/karpathy/nanochat / run_generative_eval

Function run_generative_eval

scripts/chat_eval.py:29–81  ·  view source on GitHub ↗
(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None)

Source from the content-addressed store, hash-verified

27# Generative evaluation loop (we go one problem at a time, sample, evaluate)
28
def run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None):
    """Evaluate a generative task one problem at a time and return the pass rate.

    For every problem assigned to this rank, the prompt is rendered with
    ``tokenizer.render_for_completion``, ``num_samples`` completions are drawn
    via ``engine.generate_batch``, and the problem counts as passed if
    ``task_object.evaluate`` is truthy for at least one completion.

    Args:
        task_object: Indexable task; must support ``len()``, ``[i]`` and
            ``evaluate(conversation, completion)``.
        tokenizer: Provides ``render_for_completion`` and ``decode``.
        model: Only used for ``get_device()`` (placement of the reduction tensors).
        engine: Provides ``generate_batch``.
        num_samples: Number of completions sampled per problem.
        max_new_tokens: Forwarded to ``generate_batch`` as ``max_tokens``.
        temperature: Sampling temperature forwarded to ``generate_batch``.
        top_k: Top-k setting forwarded to ``generate_batch``.
        max_problems: Optional cap on the number of problems evaluated.

    Returns:
        Fraction of evaluated problems that passed, summed over all ranks
        when running distributed. Returns 0.0 when no problem was evaluated.
    """
    ddp, ddp_rank, _, ddp_world_size = get_dist_info()
    device = model.get_device()

    num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)

    # Run the evaluation; problems are strided across ranks
    num_passed, total = 0, 0
    for i in range(ddp_rank, num_problems, ddp_world_size):
        conversation = task_object[i]

        # Tokenize the prompt
        encoded_prompt = tokenizer.render_for_completion(conversation)
        # Get the completions
        results, _ = engine.generate_batch(
            encoded_prompt,
            num_samples=num_samples,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
        )
        # Decode the completions as text, dropping the leading prompt tokens
        prefix_length = len(encoded_prompt)
        completions = [tokenizer.decode(result_tokens[prefix_length:]) for result_tokens in results]
        # Evaluate success criteria: one passing sample is enough
        passed = any(task_object.evaluate(conversation, completion) for completion in completions)

        # Keep stats
        total += 1
        num_passed += int(passed)

        # Logging (overwrite the same line in the console)
        print(f"\r\033[KRank {ddp_rank} | {num_passed}/{total} ({100*num_passed/total:.2f}%)", end='', flush=True)

    # Finish the in-place progress line with a newline before final summary
    print()

    # Aggregate results across all ranks
    if ddp:
        num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
        total_tensor = torch.tensor([total], dtype=torch.long, device=device)
        dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
        dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
        num_passed = num_passed_tensor.item()
        total = total_tensor.item()

    # Guard against an empty evaluation (empty task or max_problems <= 0),
    # which would otherwise raise ZeroDivisionError below.
    accuracy = num_passed / total if total > 0 else 0.0

    print0("=" * 50)
    print0(f"Final: {num_passed}/{total} ({100*accuracy:.2f}%)")

    # Return the accuracy
    return accuracy
82
83# -----------------------------------------------------------------------------
84# Categorical evaluation loop

Callers 1

run_chat_eval · Function · 0.85

Calls 7

get_dist_info · Function · 0.90
print0 · Function · 0.90
render_for_completion · Method · 0.80
generate_batch · Method · 0.80
get_device · Method · 0.45
decode · Method · 0.45
evaluate · Method · 0.45

Tested by

no test coverage detected