(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
| 101 | |
| 102 | @torch.no_grad() |
| 103 | def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): |
| 104 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) |
| 105 | total_latent_sections = int(max(round(total_latent_sections), 1)) |
| 106 | |
| 107 | job_id = generate_timestamp() |
| 108 | |
| 109 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))) |
| 110 | |
| 111 | try: |
| 112 | # Clean GPU |
| 113 | if not high_vram: |
| 114 | unload_complete_models( |
| 115 | text_encoder, text_encoder_2, image_encoder, vae, transformer |
| 116 | ) |
| 117 | |
| 118 | # Text encoding |
| 119 | |
| 120 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) |
| 121 | |
| 122 | if not high_vram: |
| 123 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. |
| 124 | load_model_as_complete(text_encoder_2, target_device=gpu) |
| 125 | |
| 126 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) |
| 127 | |
| 128 | if cfg == 1: |
| 129 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) |
| 130 | else: |
| 131 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) |
| 132 | |
| 133 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) |
| 134 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) |
| 135 | |
| 136 | # Processing input image |
| 137 | |
| 138 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) |
| 139 | |
| 140 | H, W, C = input_image.shape |
| 141 | height, width = find_nearest_bucket(H, W, resolution=640) |
| 142 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) |
| 143 | |
| 144 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) |
| 145 | |
| 146 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 |
| 147 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] |
| 148 | |
| 149 | # VAE encoding |
| 150 | |
| 151 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))) |
| 152 | |
| 153 | if not high_vram: |
| 154 | load_model_as_complete(vae, target_device=gpu) |
| 155 | |
| 156 | start_latent = vae_encode(input_image_pt, vae) |
| 157 | |
| 158 | # CLIP Vision |
| 159 | |
| 160 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))) |
nothing calls this directly
no test coverage detected