Wrap a model in DDP when running multi-GPU; return it unchanged otherwise. Only the *trainable* model should be wrapped. Reference / old-policy / reward models carry no gradients and must NOT be wrapped (they are replicated identically per rank). Pass ``find_unused_parameters=True`` for a model where some parameters do not receive a gradient on every step.
(model: torch.nn.Module, ctx: DDPContext, find_unused_parameters: bool = False)
| 59 | |
| 60 | |
def ddp_wrap(model: torch.nn.Module, ctx: DDPContext, find_unused_parameters: bool = False) -> torch.nn.Module:
    """Wrap a model in DDP when running multi-GPU; return it unchanged otherwise.

    Only the model that is currently being *trained* should be wrapped. Frozen
    reference / old-policy / reward models carry no gradients and must NOT be wrapped
    (they are replicated identically per rank).

    Pass ``find_unused_parameters=True`` for a model where some parameters do not receive a
    gradient on every step (for example the reward model while it is being trained, which
    uses the backbone's ``forward_hidden`` and a reward head but never its ``lm_head``).
    Without it, DDP raises a runtime error about parameters that did not receive a gradient.

    Args:
        model: The module to (possibly) wrap.
        ctx: Distributed context. ``ctx.enabled`` decides whether wrapping happens at all;
            ``ctx.device`` and ``ctx.local_rank`` decide the ``device_ids`` handed to DDP.
        find_unused_parameters: Forwarded unchanged to the DDP constructor.

    Returns:
        ``model`` itself when ``ctx.enabled`` is falsy, otherwise a DDP wrapper around it.
    """
    if not ctx.enabled:
        return model

    # ``str()`` lets the check accept both a plain device string ("cuda:0") and a
    # ``torch.device`` object, which has no ``startswith`` method of its own.
    on_cuda = str(ctx.device).startswith("cuda")

    # On CUDA the replica is pinned to this rank's GPU; otherwise ``device_ids`` is
    # passed as None.
    device_ids = [ctx.local_rank] if on_cuda else None
    return DDP(model, device_ids=device_ids, find_unused_parameters=find_unused_parameters)
| 76 | |
| 77 | |
| 78 | def is_main_process(ctx: DDPContext) -> bool: |