r""" Generates video frames from input first-last frame and text prompt using diffusion process. Args: input_prompt (`str`): Text prompt for content generation. first_frame (PIL.Image.Image): Input image tensor. Shape: [3, H, W
(self,
input_prompt,
first_frame,
last_frame,
max_area=720 * 1280,
frame_num=81,
shift=16,
sample_solver='unipc',
sampling_steps=50,
guide_scale=5.5,
n_prompt="",
seed=-1,
offload_model=True)
| 131 | self.sample_neg_prompt = config.sample_neg_prompt |
| 132 | |
| 133 | def generate(self, |
| 134 | input_prompt, |
| 135 | first_frame, |
| 136 | last_frame, |
| 137 | max_area=720 * 1280, |
| 138 | frame_num=81, |
| 139 | shift=16, |
| 140 | sample_solver='unipc', |
| 141 | sampling_steps=50, |
| 142 | guide_scale=5.5, |
| 143 | n_prompt="", |
| 144 | seed=-1, |
| 145 | offload_model=True): |
| 146 | r""" |
| 147 | Generates video frames from input first-last frame and text prompt using diffusion process. |
| 148 | |
| 149 | Args: |
| 150 | input_prompt (`str`): |
| 151 | Text prompt for content generation. |
| 152 | first_frame (PIL.Image.Image): |
| 153 | Input image for the first frame; converted to a tensor inside this method. Shape: [3, H, W] |
| 154 | last_frame (PIL.Image.Image): |
| 155 | Input image for the last frame; converted to a tensor inside this method. Shape: [3, H, W] |
| 156 | [NOTE] If the sizes of first_frame and last_frame are mismatched, last_frame will be cropped & resized |
| 157 | to match first_frame. |
| 158 | max_area (`int`, *optional*, defaults to 720*1280): |
| 159 | Maximum pixel area for latent space calculation. Controls video resolution scaling |
| 160 | frame_num (`int`, *optional*, defaults to 81): |
| 161 | How many frames to sample from a video. The number should be 4n+1 |
| 162 | shift (`float`, *optional*, defaults to 16): |
| 163 | Noise schedule shift parameter. Affects temporal dynamics |
| 164 | [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0. |
| 165 | sample_solver (`str`, *optional*, defaults to 'unipc'): |
| 166 | Solver used to sample the video. |
| 167 | sampling_steps (`int`, *optional*, defaults to 50): |
| 168 | Number of diffusion sampling steps. Higher values improve quality but slow generation |
| 169 | guide_scale (`float`, *optional*, defaults to 5.5): |
| 170 | Classifier-free guidance scale. Controls prompt adherence vs. creativity |
| 171 | n_prompt (`str`, *optional*, defaults to ""): |
| 172 | Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` |
| 173 | seed (`int`, *optional*, defaults to -1): |
| 174 | Random seed for noise generation. If -1, use random seed |
| 175 | offload_model (`bool`, *optional*, defaults to True): |
| 176 | If True, offloads models to CPU during generation to save VRAM |
| 177 | |
| 178 | Returns: |
| 179 | torch.Tensor: |
| 180 | Generated video frames tensor. Dimensions: (C, N, H, W) where: |
| 181 | - C: Color channels (3 for RGB) |
| 182 | - N: Number of frames (81) |
| 183 | - H: Frame height (from max_area) |
| 184 | - W: Frame width (from max_area) |
| 185 | """ |
| 186 | first_frame_size = first_frame.size |
| 187 | last_frame_size = last_frame.size |
| 188 | first_frame = TF.to_tensor(first_frame).sub_(0.5).div_(0.5).to( |
| 189 | self.device) |
| 190 | last_frame = TF.to_tensor(last_frame).sub_(0.5).div_(0.5).to( |
nothing calls this directly
no test coverage detected