:param x_flat: The attention output. It should be [batch_size*seq_length, dim]. :param intermediate_size: The hidden projection size. By default this is the input_dim * 4. Note: in the original GPT we would return layer_norm(x_norm + h1) rather than layer_norm(x + h1). :return:
(x_flat, intermediate_size, initializer_range=0.02, hidden_dropout_prob=0.1)
| 227 | |
| 228 | |
def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02, hidden_dropout_prob=0.1):
    """
    Apply the MLP sub-layer with a residual connection.

    The input is layer-normalized, projected to `intermediate_size` units with a
    GELU activation, projected back to the input width, passed through dropout,
    added to the un-normalized input, and layer-normalized again.

    :param x_flat: The attention output. It should be [batch_size*seq_length, dim]
    :param intermediate_size: the hidden projection size. Conventionally this is the
        input_dim * 4; no default is applied here, so the caller must pass it.
    :param initializer_range: forwarded to `create_initializer` for the kernels of both
        dense layers (presumably the initializer's stddev -- confirm against
        `create_initializer`).
    :param hidden_dropout_prob: forwarded to `dropout`, applied to the output projection
        (presumably the drop probability -- confirm against `dropout`).
    :return: layer_norm(x_flat + dropout(output projection)). Note that in the original
        GPT we would return layer_norm(x_norm + h1) rather than layer_norm(x + h1).
    """
    # Only the feature width is needed below; the leading dimension is ignored.
    _, hidden_size = get_shape_list(x_flat, expected_rank=2)
    x_norm = layer_norm(x_flat, name='mlp_ln0')

    # Expand to the hidden projection width with a GELU non-linearity.
    intermediate_output = tf.layers.dense(
        x_norm,
        intermediate_size,
        activation=gelu,
        kernel_initializer=create_initializer(initializer_range),
        name='intermediate',
    )

    # Project back to the input width so the result can be added to x_flat.
    output_for_residual = tf.layers.dense(
        intermediate_output,
        hidden_size,
        name='output',
        kernel_initializer=create_initializer(initializer_range),
    )
    output_for_residual = dropout(output_for_residual, hidden_dropout_prob)

    # The residual is taken from the un-normalized input (x_flat), not x_norm.
    layer_output = layer_norm(x_flat + output_for_residual, name='mlp_ln1')
    return layer_output
| 256 | |
| 257 | |
| 258 | def embed(input_ids, |
no test coverage detected