r""" Generates video frames from input image and text prompt using diffusion process. Args: input_prompt (`str`): Text prompt for content generation. img (PIL.Image.Image): Input image tensor. Shape: [3, H, W] max_a
(self,
input_prompt,
img,
max_area=720 * 1280,
frame_num=81,
shift=5.0,
sample_solver='unipc',
sampling_steps=40,
guide_scale=5.0,
n_prompt="",
seed=-1,
offload_model=True)
| 131 | self.sample_neg_prompt = config.sample_neg_prompt |
| 132 | |
| 133 | def generate(self, |
| 134 | input_prompt, |
| 135 | img, |
| 136 | max_area=720 * 1280, |
| 137 | frame_num=81, |
| 138 | shift=5.0, |
| 139 | sample_solver='unipc', |
| 140 | sampling_steps=40, |
| 141 | guide_scale=5.0, |
| 142 | n_prompt="", |
| 143 | seed=-1, |
| 144 | offload_model=True): |
| 145 | r""" |
| 146 | Generates video frames from input image and text prompt using diffusion process. |
| 147 | |
| 148 | Args: |
| 149 | input_prompt (`str`): |
| 150 | Text prompt for content generation. |
| 151 | img (PIL.Image.Image): |
| 152 | Input image tensor. Shape: [3, H, W] |
| 153 | max_area (`int`, *optional*, defaults to 720*1280): |
| 154 | Maximum pixel area for latent space calculation. Controls video resolution scaling |
| 155 | frame_num (`int`, *optional*, defaults to 81): |
| 156 | How many frames to sample from a video. The number should be 4n+1 |
| 157 | shift (`float`, *optional*, defaults to 5.0): |
| 158 | Noise schedule shift parameter. Affects temporal dynamics |
| 159 | [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0. |
| 160 | sample_solver (`str`, *optional*, defaults to 'unipc'): |
| 161 | Solver used to sample the video. |
| 162 | sampling_steps (`int`, *optional*, defaults to 40): |
| 163 | Number of diffusion sampling steps. Higher values improve quality but slow generation |
| 164 | guide_scale (`float`, *optional*, defaults to 5.0): |
| 165 | Classifier-free guidance scale. Controls prompt adherence vs. creativity |
| 166 | n_prompt (`str`, *optional*, defaults to ""): |
| 167 | Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` |
| 168 | seed (`int`, *optional*, defaults to -1): |
| 169 | Random seed for noise generation. If -1, use random seed |
| 170 | offload_model (`bool`, *optional*, defaults to True): |
| 171 | If True, offloads models to CPU during generation to save VRAM |
| 172 | |
| 173 | Returns: |
| 174 | torch.Tensor: |
| 175 | Generated video frames tensor. Dimensions: (C, N, H, W) where: |
| 176 | - C: Color channels (3 for RGB) |
| 177 | - N: Number of frames (81) |
| 178 | - H: Frame height (from max_area) |
| 179 | - W: Frame width (from max_area) |
| 180 | """ |
| 181 | img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device) |
| 182 | |
| 183 | F = frame_num |
| 184 | h, w = img.shape[1:] |
| 185 | aspect_ratio = h / w |
| 186 | lat_h = round( |
| 187 | np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] // |
| 188 | self.patch_size[1] * self.patch_size[1]) |
| 189 | lat_w = round( |
| 190 | np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] // |
nothing calls this directly
no test coverage detected