| 35 | |
| 36 | |
| 37 | class ELMo(nn.Layer): |
def __init__(
    self,
    batch_size=None,
    char_embed_dim=16,
    projection_dim=512,
    vocab_size=None,
    cnn_filters=None,
    char_vocab_size=262,
    max_characters_per_token=50,
    num_highways=2,
    num_layers=2,
    dropout=0.1,
    task="pre-train",
):
    """Assemble the character encoder, the bidirectional LM and the optional output projection.

    Args:
        batch_size: Passed to ``ELMoBiLM``. Mandatory when
            ``task="pre-train"``; defaults to 128 when ``task="fine-tune"``
            and the argument is left as ``None``.
        char_embed_dim: Passed to ``ELMoCharacterEncoderLayer``.
        projection_dim: Passed to ``ELMoCharacterEncoderLayer``, used as both
            size arguments of ``ELMoBiLM``, and used as the input width of
            the pre-training linear layer.
        vocab_size: Output width of the pre-training linear layer. Mandatory
            when ``task="pre-train"``; not used otherwise.
        cnn_filters: Passed to ``ELMoCharacterEncoderLayer``. ``None`` selects
            ``[[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512],
            [7, 1024]]``. Presumably ``[kernel_width, num_filters]`` pairs --
            TODO confirm against ``ELMoCharacterEncoderLayer``.
        char_vocab_size: Passed to ``ELMoCharacterEncoderLayer``.
        max_characters_per_token: Passed to ``ELMoCharacterEncoderLayer``.
        num_highways: Passed to ``ELMoCharacterEncoderLayer``.
        num_layers: Passed to ``ELMoBiLM``.
        dropout: Passed to ``ELMoBiLM``.
        task: Either ``"pre-train"`` or ``"fine-tune"``.

    Raises:
        ValueError: If ``task`` is not one of the two supported values, or if
            ``task="pre-train"`` and ``vocab_size`` or ``batch_size`` is
            ``None``.
    """
    super(ELMo, self).__init__()

    if task == "pre-train":
        if vocab_size is None or batch_size is None:
            raise ValueError('vocab_size and batch_size should be set when task="pre-train"')
    elif task == "fine-tune":
        if batch_size is None:
            batch_size = 128
    else:
        raise ValueError('task should be "pre-train" or "fine-tune"')

    if cnn_filters is None:
        # Build the default per call: a list literal in the signature would be
        # created once and shared (and mutable) across every ELMo instance.
        cnn_filters = [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]]

    self._projection_dim = projection_dim
    self._task = task

    # NOTE: the attribute name (including its spelling) is kept as-is; it is
    # referenced by forward() and renaming a sub-layer attribute would change
    # the names under which its parameters are registered.
    self._token_embding_layer = ELMoCharacterEncoderLayer(
        char_vocab_size, char_embed_dim, projection_dim, num_highways, cnn_filters, max_characters_per_token
    )
    self._elmobilm = ELMoBiLM(batch_size, projection_dim, projection_dim, num_layers, dropout, task)

    if task == "pre-train":
        # The projection onto the vocabulary only exists for pre-training.
        # Its weights are drawn from N(0, 1 / sqrt(projection_dim)).
        param_attr = paddle.ParamAttr(initializer=I.Normal(mean=0.0, std=1.0 / np.sqrt(projection_dim)))
        self._linear_layer = nn.Linear(projection_dim, vocab_size, weight_attr=param_attr)
| 73 | |
@property
def embedding_dim(self):
    """Return twice the configured ``projection_dim``."""
    doubled = 2 * self._projection_dim
    return doubled
| 77 | |
| 78 | def forward(self, inputs): |
| 79 | # [batch_size, seq_len, max_characters_per_token] |
| 80 | ids, ids_reverse = inputs |
| 81 | # [batch_size, seq_len, projection_dim] |
| 82 | token_embedding = self._token_embding_layer(ids) |
| 83 | token_embedding_reverse = self._token_embding_layer(ids_reverse) |
| 84 | |
| 85 | outs = self._elmobilm(token_embedding, token_embedding_reverse) |
| 86 | |
| 87 | if self._task == "pre-train": |
| 88 | # [batch_size, seq_len, projection_dim] |
| 89 | fw_out, bw_out = outs |
| 90 | |
| 91 | # [batch_size, max_seq_len, vocab_size] |
| 92 | fw_logits = self._linear_layer(fw_out) |
| 93 | bw_logits = self._linear_layer(bw_out) |
| 94 | return [fw_logits, bw_logits] |
no outgoing calls
no test coverage detected
searching dependent graphs…