r""" Generates video frames from text prompt using diffusion process. Args: input_prompt (`str`): Text prompt for content generation size (tupele[`int`], *optional*, defaults to (1280,720)): Controls video resolution, (width,he
(self,
input_prompt,
input_frames,
input_masks,
input_ref_images,
size=(1280, 720),
frame_num=81,
context_scale=1.0,
shift=5.0,
sample_solver='unipc',
sampling_steps=50,
guide_scale=5.0,
n_prompt="",
seed=-1,
offload_model=True)
| 293 | return vae.decode(trimed_zs) |
| 294 | |
| 295 | def generate(self, |
| 296 | input_prompt, |
| 297 | input_frames, |
| 298 | input_masks, |
| 299 | input_ref_images, |
| 300 | size=(1280, 720), |
| 301 | frame_num=81, |
| 302 | context_scale=1.0, |
| 303 | shift=5.0, |
| 304 | sample_solver='unipc', |
| 305 | sampling_steps=50, |
| 306 | guide_scale=5.0, |
| 307 | n_prompt="", |
| 308 | seed=-1, |
| 309 | offload_model=True): |
| 310 | r""" |
| 311 | Generates video frames from text prompt using diffusion process. |
| 312 | |
| 313 | Args: |
| 314 | input_prompt (`str`): |
| 315 | Text prompt for content generation |
| 316 | size (tuple[`int`], *optional*, defaults to (1280,720)): |
| 317 | Controls video resolution, (width,height). |
| 318 | frame_num (`int`, *optional*, defaults to 81): |
| 319 | How many frames to sample from a video. The number should be 4n+1 |
| 320 | shift (`float`, *optional*, defaults to 5.0): |
| 321 | Noise schedule shift parameter. Affects temporal dynamics |
| 322 | sample_solver (`str`, *optional*, defaults to 'unipc'): |
| 323 | Solver used to sample the video. |
| 324 | sampling_steps (`int`, *optional*, defaults to 50): |
| 325 | Number of diffusion sampling steps. Higher values improve quality but slow generation |
| 326 | guide_scale (`float`, *optional*, defaults to 5.0): |
| 327 | Classifier-free guidance scale. Controls prompt adherence vs. creativity |
| 328 | n_prompt (`str`, *optional*, defaults to ""): |
| 329 | Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` |
| 330 | seed (`int`, *optional*, defaults to -1): |
| 331 | Random seed for noise generation. If -1, use random seed. |
| 332 | offload_model (`bool`, *optional*, defaults to True): |
| 333 | If True, offloads models to CPU during generation to save VRAM |
| 334 | |
| 335 | Returns: |
| 336 | torch.Tensor: |
| 337 | Generated video frames tensor. Dimensions: (C, N, H, W) where: |
| 338 | - C: Color channels (3 for RGB) |
| 339 | - N: Number of frames (81) |
| 340 | - H: Frame height (from size) |
| 341 | - W: Frame width (from size) |
| 342 | """ |
| 343 | # preprocess |
| 344 | # F = frame_num |
| 345 | # target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1, |
| 346 | # size[1] // self.vae_stride[1], |
| 347 | # size[0] // self.vae_stride[2]) |
| 348 | # |
| 349 | # seq_len = math.ceil((target_shape[2] * target_shape[3]) / |
| 350 | # (self.patch_size[1] * self.patch_size[2]) * |
| 351 | # target_shape[1] / self.sp_size) * self.sp_size |
| 352 |
no test coverage detected