(self, request, context)
| 151 | return backend_pb2.Reply(message=bytes("OK", 'utf-8')) |
| 152 | |
| 153 | def LoadModel(self, request, context): |
| 154 | try: |
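| 155 | # CPU fallback: when CUDA is unavailable, default VLLM_TARGET_DEVICE to "cpu" and VLLM_CPU_KVCACHE_SPACE to "4"; setdefault leaves any value already present in the environment untouched. |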
| 156 | try: |
| 157 | if not torch.cuda.is_available(): |
| 158 | os.environ.setdefault("VLLM_TARGET_DEVICE", "cpu") |
| 159 | os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "4") |
| 160 | except Exception: |
| 161 | pass |
| 162 | |
| 163 | print(f"Loading model {request.Model}...", file=sys.stderr) |
| 164 | print(f"Request {request}", file=sys.stderr) |
| 165 | |
| 166 | # Parse options from request.Options using shared helper |
| 167 | self.options = parse_options(request.Options) |
| 168 | opts = self.options |
| 169 | |
| 170 | print(f"Options: {self.options}", file=sys.stderr) |
| 171 | |
| 172 | # Detect model type |
| 173 | self.model_name = request.Model |
| 174 | self.model_type = request.Type if request.Type else self._detect_model_type(request.Model) |
| 175 | print(f"Detected model type: {self.model_type}", file=sys.stderr) |
| 176 | |
| 177 | # Build DiffusionParallelConfig if diffusion model (image or video) |
| 178 | parallel_config = None |
| 179 | if self.model_type in ["image", "video"]: |
| 180 | parallel_config = DiffusionParallelConfig( |
| 181 | ulysses_degree=self.options.get("ulysses_degree", 1), |
| 182 | ring_degree=self.options.get("ring_degree", 1), |
| 183 | cfg_parallel_size=self.options.get("cfg_parallel_size", 1), |
| 184 | tensor_parallel_size=self.options.get("tensor_parallel_size", 1), |
| 185 | ) |
| 186 | |
| 187 | # Build cache_config dict if cache_backend specified |
| 188 | cache_backend = self.options.get("cache_backend") # "cache_dit" or "tea_cache" |
| 189 | cache_config = None |
| 190 | if cache_backend == "cache_dit": |
| 191 | cache_config = { |
| 192 | "Fn_compute_blocks": self.options.get("cache_dit_fn_compute_blocks", 1), |
| 193 | "Bn_compute_blocks": self.options.get("cache_dit_bn_compute_blocks", 0), |
| 194 | "max_warmup_steps": self.options.get("cache_dit_max_warmup_steps", 4), |
| 195 | "residual_diff_threshold": self.options.get("cache_dit_residual_diff_threshold", 0.24), |
| 196 | "max_continuous_cached_steps": self.options.get("cache_dit_max_continuous_cached_steps", 3), |
| 197 | "enable_taylorseer": self.options.get("cache_dit_enable_taylorseer", False), |
| 198 | "taylorseer_order": self.options.get("cache_dit_taylorseer_order", 1), |
| 199 | "scm_steps_mask_policy": self.options.get("cache_dit_scm_steps_mask_policy"), |
| 200 | "scm_steps_policy": self.options.get("cache_dit_scm_steps_policy", "dynamic"), |
| 201 | } |
| 202 | elif cache_backend == "tea_cache": |
| 203 | cache_config = { |
| 204 | "rel_l1_thresh": self.options.get("tea_cache_rel_l1_thresh", 0.2), |
| 205 | } |
| 206 | |
| 207 | # Base Omni initialization parameters |
| 208 | omni_kwargs = { |
| 209 | "model": request.Model, |
| 210 | } |
nothing calls this directly
no test coverage detected