| 89 | |
| 90 | |
| 91 | class VaceVideoProcessor(object): |
| 92 | |
| 93 | def __init__(self, downsample, min_area, max_area, min_fps, max_fps, |
| 94 | zero_start, seq_len, keep_last, **kwargs): |
| 95 | self.downsample = downsample |
| 96 | self.min_area = min_area |
| 97 | self.max_area = max_area |
| 98 | self.min_fps = min_fps |
| 99 | self.max_fps = max_fps |
| 100 | self.zero_start = zero_start |
| 101 | self.keep_last = keep_last |
| 102 | self.seq_len = seq_len |
| 103 | assert seq_len >= min_area / (self.downsample[1] * self.downsample[2]) |
| 104 | |
def set_area(self, area):
    """
    Pin both area bounds to a single value.

    Parameters:
        area - value written to both `min_area` and `max_area`

    Returns:
        None
    """
    # min_area is bound first, then max_area, both to the same object
    self.min_area = self.max_area = area
| 108 | |
def set_seq_len(self, seq_len):
    """
    Replace the stored sequence-length budget.

    The check performed in the constructor is not repeated here; the new
    value is stored unconditionally.

    Parameters:
        seq_len - new value for `self.seq_len`

    Returns:
        None
    """
    self.seq_len = seq_len
| 111 | |
| 112 | @staticmethod |
| 113 | def resize_crop(video: torch.Tensor, oh: int, ow: int): |
| 114 | """ |
| 115 | Resize, center crop and normalize for decord loaded video (torch.Tensor type) |
| 116 | |
| 117 | Parameters: |
| 118 | video - video to process (torch.Tensor): Tensor from `reader.get_batch(frame_ids)`, in shape of (T, H, W, C) |
| 119 | oh - target height (int) |
| 120 | ow - target width (int) |
| 121 | |
| 122 | Returns: |
| 123 | The processed video (torch.Tensor): Normalized tensor range [-1, 1], in shape of (C, T, H, W) |
| 124 | |
| 125 | Raises: |
| 126 | """ |
| 127 | # permute ([t, h, w, c] -> [t, c, h, w]) |
| 128 | video = video.permute(0, 3, 1, 2) |
| 129 | |
| 130 | # resize and crop |
| 131 | ih, iw = video.shape[2:] |
| 132 | if ih != oh or iw != ow: |
| 133 | # resize |
| 134 | scale = max(ow / iw, oh / ih) |
| 135 | video = F.interpolate( |
| 136 | video, |
| 137 | size=(round(scale * ih), round(scale * iw)), |
| 138 | mode='bicubic', |
| 139 | antialias=True) |
| 140 | assert video.size(3) >= ow and video.size(2) >= oh |
| 141 | |
| 142 | # center crop |
| 143 | x1 = (video.size(3) - ow) // 2 |
| 144 | y1 = (video.size(2) - oh) // 2 |
| 145 | video = video[:, :, y1:y1 + oh, x1:x1 + ow] |
| 146 | |
| 147 | # permute ([t, c, h, w] -> [c, t, h, w]) and normalize |
| 148 | video = video.transpose(0, 1).float().div_(127.5).sub_(1.) |