(self, inp_image, conditional=None, return_features=False)
| 465 | |
| 466 | |
def forward(self, inp_image, conditional=None, return_features=False):
    """Decode one extracted visual activation into a dense output map.

    The image is moved to the device of ``self.model.positional_embedding``,
    passed through ``self.visual_forward`` with ``self.extract_layer`` as the
    only extraction layer, and the first returned activation is reduced,
    modulated by the conditional vector (FiLM-style multiply/add), optionally
    reduced again, reshaped into a square token grid and fed to
    ``self.trans_conv``.

    Args:
        inp_image: Batched input; its first dimension is used as the batch
            size. It is used as given; no normalization is applied here.
        conditional: Forwarded unchanged to ``self.get_cond_vec`` together
            with the batch size.
        return_features: When true, intermediate results are returned
            alongside the decoded output.

    Returns:
        ``(out, visual_q, cond, activations)`` if ``return_features`` is
        true, otherwise the one-element tuple ``(out,)``.
    """
    image = inp_image.to(self.model.positional_embedding.device)
    batch_size = image.shape[0]

    cond = self.get_cond_vec(conditional, batch_size)

    # Third return value (affinities) is not used by this decoder.
    visual_q, activations, _ = self.visual_forward(
        image, extract_layers=[self.extract_layer]
    )

    feats = self.reduce(activations[0])
    # Conditional modulation: scale and shift derived from the same cond vector.
    feats = self.film_mul(cond) * feats + self.film_add(cond)
    if self.reduce2 is not None:
        feats = self.reduce2(feats)

    # the original model would execute a transformer block here

    # Drop the first (cls) token, then reorder to (BS, Feats, Tokens).
    # NOTE(review): assumes activations are laid out token-first - confirm
    # against visual_forward.
    feats = feats[1:].permute(1, 2, 0)

    # Remaining tokens are assumed to form a square grid.
    side = int(math.sqrt(feats.shape[2]))
    feats = feats.view(batch_size, feats.shape[1], side, side)
    out = self.trans_conv(feats)

    if not return_features:
        return (out,)
    return out, visual_q, cond, activations
| 500 | |
| 501 | |
| 502 | class CLIPSegMultiLabel(nn.Module): |
nothing calls this directly
no test coverage detected