(model, tokenizer, device, dtype, config)
| 18 | |
| 19 | |
def evaluate(model, tokenizer, device, dtype, config):
    """Return the mean answer reward of ``model`` on the test split.

    Builds the ``"test"`` split of ``CountdownTasksDataset``, generates a
    single answer per question with ``rollout`` and averages the
    ``"answer_reward"`` entry of each episode's ``reward_info``.

    Compared with the training settings read from ``config``, generation
    is allowed twice ``config["training"]["max_gen_len"]`` tokens and the
    batch size is halved to compensate.

    Args:
        model: Model handed to ``rollout`` to generate answers.
        tokenizer: Tokenizer shared by the dataset and ``rollout``.
        device: Device passed to ``rollout`` and used for the loader's
            ``torch.Generator``.
        dtype: Dtype passed through to ``rollout``.
        config: Nested mapping; reads ``config["data"]["path"]``,
            ``config["data"]["test_size"]``,
            ``config["training"]["batch_size"]`` and
            ``config["training"]["max_gen_len"]``.

    Returns:
        ``np.mean`` over the collected answer rewards. This is NaN (with a
        NumPy warning) if the loader yields no episodes.
    """
    test_dataset = CountdownTasksDataset(
        data_path=config["data"]["path"],
        tokenizer=tokenizer,
        split="test",
        test_size=config["data"]["test_size"],
    )
    # NOTE(review): with shuffle=False the sampler draws no random numbers;
    # the generator is presumably placed on `device` so it agrees with a
    # default device set elsewhere -- confirm before removing.
    generator = torch.Generator(device=device)
    # We reduce the batch size by half as we want to generate twice as long
    # trajectories. Clamp to 1: a training batch size of 1 would otherwise
    # floor-divide to 0, which DataLoader rejects with a ValueError.
    eval_batch_size = max(1, config["training"]["batch_size"] // 2)
    dataloader = DataLoader(
        test_dataset,
        shuffle=False,
        collate_fn=CountdownTasksDataset.collate_fn,
        generator=generator,
        batch_size=eval_batch_size,
        drop_last=False,  # score every test example, including the tail
    )
    success = []
    for batch in dataloader:
        episodes = rollout(
            model=model,
            tokenizer=tokenizer,
            batch=batch,
            max_gen_len=config["training"]["max_gen_len"] * 2,
            num_answer_per_question=1,
            reward_function=reward_function,
            device=device,
            dtype=dtype,
        )
        success.extend(
            episode.reward_info["answer_reward"] for episode in episodes
        )
    return np.mean(success)
| 52 | |
| 53 | |
| 54 | def main(config_path: str): |
no test coverage detected