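Loads a language model. Args: request: The load model request. context: The gRPC context. Returns: backend_pb2.Result: The load model result; success=False with an error message when applying the request's engine_args raises ValueError or when creating the engine fails.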
(self, request, context)
| 187 | return backend_pb2.Reply(message=bytes("OK", 'utf-8')) |
| 188 | |
| 189 | async def LoadModel(self, request, context): |
| 190 | """ |
| 191 | Loads a language model. |
| 192 | |
| 193 | Args: |
| 194 | request: The load model request. |
| 195 | context: The gRPC context. |
| 196 | |
| 197 | Returns: |
| 198 | backend_pb2.Result: The load model result. |
| 199 | """ |
| 200 | engine_args = AsyncEngineArgs( |
| 201 | model=request.Model, |
| 202 | ) |
| 203 | |
| 204 | if request.Quantization != "": |
| 205 | engine_args.quantization = request.Quantization |
| 206 | if request.LoadFormat != "": |
| 207 | engine_args.load_format = request.LoadFormat |
| 208 | if request.GPUMemoryUtilization != 0: |
| 209 | engine_args.gpu_memory_utilization = request.GPUMemoryUtilization |
| 210 | if request.TrustRemoteCode: |
| 211 | engine_args.trust_remote_code = request.TrustRemoteCode |
| 212 | if request.EnforceEager: |
| 213 | engine_args.enforce_eager = request.EnforceEager |
| 214 | if request.TensorParallelSize: |
| 215 | engine_args.tensor_parallel_size = request.TensorParallelSize |
| 216 | if request.SwapSpace != 0: |
| 217 | engine_args.swap_space = request.SwapSpace |
| 218 | if request.MaxModelLen != 0: |
| 219 | engine_args.max_model_len = request.MaxModelLen |
| 220 | if request.DisableLogStatus: |
| 221 | engine_args.disable_log_status = request.DisableLogStatus |
| 222 | if request.DType != "": |
| 223 | engine_args.dtype = request.DType |
| 224 | if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: |
| 225 | # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs |
| 226 | engine_args.limit_mm_per_prompt = { |
| 227 | "image": max(request.LimitImagePerPrompt, 1), |
| 228 | "video": max(request.LimitVideoPerPrompt, 1), |
| 229 | "audio": max(request.LimitAudioPerPrompt, 1) |
| 230 | } |
| 231 | |
| 232 | # engine_args from YAML overrides typed fields above so operators can |
| 233 | # tune anything the AsyncEngineArgs dataclass exposes without waiting |
| 234 | # on protobuf changes. |
| 235 | try: |
| 236 | engine_args = self._apply_engine_args(engine_args, request.EngineArgs) |
| 237 | except ValueError as err: |
| 238 | print(f"engine_args error: {err}", file=sys.stderr) |
| 239 | return backend_pb2.Result(success=False, message=str(err)) |
| 240 | |
| 241 | try: |
| 242 | self.llm = AsyncLLMEngine.from_engine_args(engine_args) |
| 243 | except Exception as err: |
| 244 | print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) |
| 245 | return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") |
| 246 |
nothing calls this directly
no test coverage detected