| 419 | self.activation = ACT2FN['tanh'] |
| 420 | |
def forward(self, hidden_states, input_lengths, remove_input_padding):
    """Pool the model output by keeping only the first token of each sequence.

    The selected first-token hidden states are passed through ``self.dense``
    and then ``self.activation``.

    Args:
        hidden_states: Model hidden states. In padded mode the first token
            is read at index 0 along dim 1. In packed mode the tensor is
            expected to be [num_tokens, hidden_size] and rows are gathered
            along dim 0.
        input_lengths: Per-sequence token counts. Only read when
            ``remove_input_padding`` is truthy.
        remove_input_padding: Truthy when the sequences are packed back to
            back without padding; falsy for the padded layout.

    Returns:
        The result of ``self.activation(self.dense(first_tokens))``.
    """
    if remove_input_padding:
        # Packed layout: hidden_states is [num_tokens, hidden_size], so the
        # first token of every sequence sits at the running total of the
        # lengths of all sequences before it. For input_lengths [8, 5, 6]
        # the start offsets are cumsum([0, 8, 5]) == [0, 8, 13].
        sequence_count = shape(input_lengths)
        one = constant(np.array([1], dtype=np.int32))
        # Every length except the last one; the last sequence's length never
        # contributes to any start offset.
        leading_lengths = slice(input_lengths,
                                starts=[0],
                                sizes=(sequence_count - one))
        start_offsets = cumsum(concat([0, leading_lengths]), 0)
        first_tokens = index_select(hidden_states, 0, start_offsets)
    else:
        # Padded layout: the first token of each sequence is simply index 0
        # along dim 1.
        first_tokens = select(hidden_states, 1, 0)

    return self.activation(self.dense(first_tokens))
| 446 | |
| 447 | |
| 448 | class RobertaClassificationHead(Module): |