Initialize model. Returns: The neural network model to be trained or evaluated.
()
| 44 | |
| 45 | |
def initialize_model():
    """
    Build the configured model, wrap it for mixed precision and sync its weights.

    The model class is looked up in ``MODEL_INITIALIZER`` by ``gpc.config.model_type``
    and instantiated with ``gpc.config.model`` as keyword arguments. The result is
    wrapped in ``NaiveAMPModel``; when the builder returns an ``nn.ModuleList``, each
    entry is wrapped individually and the wrappers are collected in a new
    ``nn.ModuleList``.

    Returns: The neural network model to be trained or evaluated.
    """

    def _amp_wrap(module, to_fp32):
        # The dtype is read from the config on every call and defaults to torch.half.
        return NaiveAMPModel(
            model=module,
            output_to_fp32=to_fp32,
            dtype=gpc.config.model.get("dtype", torch.half),
            sync_buffer=False,
        )

    model_builder = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)
    model = model_builder(**(gpc.config.model))

    if not isinstance(model, nn.ModuleList):
        model = _amp_wrap(model, is_no_pp_or_last_stage())
    else:
        # fp32 output conversion is manually controlled by the interleaved pipeline
        # scheduler, so it is disabled on every wrapped chunk.
        wrapped_chunks = []
        for chunk in model:
            wrapped_chunks.append(_amp_wrap(chunk, False))
        model = nn.ModuleList(wrapped_chunks)

    # This sync is very important: the model weights kept in the optimizer are copied
    # from the original parameters in memory, so the data-parallel sync must happen
    # here to keep the optimizer's copy identical to the original parameters.
    sync_model_param(model, parallel_mode=ParallelMode.DATA)

    # Parameters that are not split by tensor parallelism must be the same across
    # the tensor-parallel ranks.
    sync_model_param_within_tp(model)

    return model
| 84 | |
| 85 | |
| 86 | def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): |
no test coverage detected