| 96 | |
| 97 | |
class Attention(nn.Layer):
    """Multi-head scaled dot-product self-attention.

    A single linear layer produces queries, keys and values from the input,
    attention is computed independently per head, and the concatenated head
    outputs go through a final linear projection.

    Args:
        dim: Size of the last axis of the input (and of the output). Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Forwarded as ``bias_attr`` to the fused QKV ``nn.Linear``.
        qk_scale: Multiplier applied to the query-key scores. Any falsy value
            (``None`` or ``0``) selects the default ``head_dim ** -0.5``.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``dim`` evenly.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        # forward() reshapes with -1 on the batch axis, so a dim/num_heads
        # mismatch is not reliably caught there: depending on the batch size
        # the leftover elements can be folded into the batch axis and yield a
        # wrongly computed result. Reject the configuration up front instead.
        if num_heads <= 0 or dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Rank-3 tensor whose last axis has size ``dim``; the two
                trailing axes are read as (tokens, channels).

        Returns:
            Tensor reshaped to ``(-1, tokens, channels)``.
        """
        seq_len, channels = x.shape[1:]
        head_dim = channels // self.num_heads

        # (batch, tokens, 3 * channels)
        #   -> (batch, tokens, 3, heads, head_dim)
        #   -> (3, batch, heads, tokens, head_dim)
        # The batch axis stays -1 so it is inferred at run time.
        qkv = (
            self.qkv(x)
            .reshape((-1, seq_len, 3, self.num_heads, head_dim))
            .transpose((2, 0, 3, 1, 4))
        )
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled scores: (batch, heads, tokens, tokens).
        attn = q.matmul(k.transpose((0, 1, 3, 2))) * self.scale
        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        # Weighted values, then merge the heads back into the channel axis.
        out = attn.matmul(v).transpose((0, 2, 1, 3))
        out = out.reshape((-1, seq_len, channels))
        out = self.proj(out)
        out = self.proj_drop(out)
        return out
| 136 | |
| 137 | |
| 138 | class Block(nn.Layer): |
no outgoing calls
no test coverage detected
searching dependent graphs…