Serve LLMs with a RESTful API using FastAPI.
(args)
| 208 | |
| 209 | @staticmethod |
| 210 | def api_server(args): |
| 211 | """Serve LLMs with restful api using fastapi.""" |
| 212 | from lmdeploy.archs import autoget_backend |
| 213 | |
| 214 | max_batch_size = args.max_batch_size if args.max_batch_size \ |
| 215 | else get_max_batch_size(args.device) |
| 216 | backend = args.backend |
| 217 | if backend != 'pytorch': |
| 218 | # set auto backend mode |
| 219 | backend = autoget_backend(args.model_path, trust_remote_code=args.trust_remote_code) |
| 220 | |
| 221 | if backend == 'pytorch': |
| 222 | from lmdeploy.messages import PytorchEngineConfig |
| 223 | adapters = get_lora_adapters(args.adapters) |
| 224 | backend_config = PytorchEngineConfig( |
| 225 | dtype=args.dtype, |
| 226 | tp=args.tp, |
| 227 | dp=args.dp, |
| 228 | ep=args.ep, |
| 229 | max_batch_size=max_batch_size, |
| 230 | cache_max_entry_count=args.cache_max_entry_count, |
| 231 | block_size=args.cache_block_seq_len, |
| 232 | kernel_block_size=args.kernel_block_size, |
| 233 | session_len=args.session_len, |
| 234 | adapters=adapters, |
| 235 | enable_prefix_caching=args.enable_prefix_caching, |
| 236 | device_type=args.device, |
| 237 | quant_policy=args.quant_policy, |
| 238 | eager_mode=args.eager_mode, |
| 239 | max_prefill_token_num=args.max_prefill_token_num, |
| 240 | enable_microbatch=args.enable_microbatch, |
| 241 | enable_eplb=args.enable_eplb, |
| 242 | enable_metrics=not args.disable_metrics, |
| 243 | role=EngineRole[args.role], |
| 244 | migration_backend=MigrationBackend[args.migration_backend], |
| 245 | model_format=args.model_format, |
| 246 | hf_overrides=args.hf_overrides, |
| 247 | disable_vision_encoder=args.disable_vision_encoder, |
| 248 | logprobs_mode=args.logprobs_mode, |
| 249 | dllm_block_length=args.dllm_block_length, |
| 250 | dllm_unmasking_strategy=args.dllm_unmasking_strategy, |
| 251 | dllm_denoising_steps=args.dllm_denoising_steps, |
| 252 | dllm_confidence_threshold=args.dllm_confidence_threshold, |
| 253 | enable_return_routed_experts=args.enable_return_routed_experts, |
| 254 | distributed_executor_backend=args.distributed_executor_backend, |
| 255 | ) |
| 256 | else: |
| 257 | from lmdeploy.messages import TurbomindEngineConfig |
| 258 | backend_config = TurbomindEngineConfig(dtype=args.dtype, |
| 259 | tp=args.tp, |
| 260 | dp=args.dp, |
| 261 | cp=args.cp, |
| 262 | nnodes=args.nnodes, |
| 263 | node_rank=args.node_rank, |
| 264 | dist_init_addr=args.dist_init_addr, |
| 265 | max_batch_size=max_batch_size, |
| 266 | session_len=args.session_len, |
| 267 | model_format=args.model_format, |
No direct callers of `api_server` were found.
No test coverage was detected for `api_server`.