Return the number of parameters in the model. For non-embedding count (default), the position embeddings get subtracted. The token embeddings would too, except due to the parameter sharing these params are actually used as weights in the final layer, so we include them.
(self, non_embedding=True)
| 148 | print("number of parameters: %.2fM" % (self.get_num_params()/1e6,)) |
| 149 | |
def get_num_params(self, non_embedding=True):
    """
    Count the parameters of the model.

    Sums ``numel()`` over everything yielded by ``self.parameters()``.
    When ``non_embedding`` is true (the default), the size of the position
    embedding weight (``self.transformer.wpe.weight``) is removed from the
    total. The token embeddings stay in the count: because of parameter
    sharing they also act as the weights of the final layer.
    """
    total = 0
    for param in self.parameters():
        total += param.numel()

    # Caller asked for the full count, embeddings included.
    if not non_embedding:
        return total

    # Default: leave the position-embedding table out of the count.
    return total - self.transformer.wpe.weight.numel()
| 161 | |
| 162 | def _init_weights(self, module): |
| 163 | if isinstance(module, nn.Linear): |
no outgoing calls
no test coverage detected