r""" Reconstruct video tensors from patch embeddings. Args: x (List[Tensor]): List of patchified features, each with shape [L, C_out * prod(patch_size)] grid_sizes (Tensor): Original spatial-temporal grid dimensions before patc
(self, x, grid_sizes)
| 547 | return [u.float() for u in x] |
| 548 | |
def unpatchify(self, x, grid_sizes):
    r"""
    Fold per-sample patch tokens back into dense video tensors.

    Args:
        x (List[Tensor]):
            One tensor per sample, each with shape
            [L, C_out * prod(patch_size)]. Only the first
            F_patches * H_patches * W_patches rows of each tensor are
            used; any rows beyond that are dropped.
        grid_sizes (Tensor):
            Patch-grid extents of every sample before patching, shape
            [B, 3], ordered as (F_patches, H_patches, W_patches).

    Returns:
        List[Tensor]:
            One tensor per (x, grid_sizes) pair with shape
            [C_out, F_patches * p_f, H_patches * p_h, W_patches * p_w],
            where (p_f, p_h, p_w) is ``self.patch_size`` and C_out is
            ``self.out_dim``.
    """
    channels = self.out_dim
    patch = self.patch_size

    videos = []
    # zip() stops at the shorter of the two inputs, so surplus entries in
    # either x or grid_sizes are silently ignored.
    for tokens, grid in zip(x, grid_sizes.tolist()):
        num_patches = math.prod(grid)

        # Drop trailing rows, then expose every axis explicitly:
        # [f, h, w, p_f, p_h, p_w, c].
        blocks = tokens[:num_patches].view(*grid, *patch, channels)

        # Move channels to the front and place each patch axis right after
        # its grid axis: [c, f, p_f, h, p_h, w, p_w]
        # (the same reordering as einsum 'fhwpqrc->cfphqwr').
        interleaved = blocks.permute(6, 0, 3, 1, 4, 2, 5)

        # Merge every (grid, patch) axis pair into one dense axis.
        dense_shape = [n * p for n, p in zip(grid, patch)]
        videos.append(interleaved.reshape(channels, *dense_shape))
    return videos
| 573 | |
| 574 | def init_weights(self): |
| 575 | r""" |
no outgoing calls
no test coverage detected