MCPcopy
hub / github.com/InternLM/lmdeploy / api_server

Method api_server

lmdeploy/cli/serve.py:210–347  ·  view source on GitHub ↗

Serve LLMs through a RESTful API using FastAPI.

(args)

Source from the content-addressed store, hash-verified

208
209 @staticmethod
210 def api_server(args):
211 """Serve LLMs with restful api using fastapi."""
212 from lmdeploy.archs import autoget_backend
213
214 max_batch_size = args.max_batch_size if args.max_batch_size \
215 else get_max_batch_size(args.device)
216 backend = args.backend
217 if backend != 'pytorch':
218 # set auto backend mode
219 backend = autoget_backend(args.model_path, trust_remote_code=args.trust_remote_code)
220
221 if backend == 'pytorch':
222 from lmdeploy.messages import PytorchEngineConfig
223 adapters = get_lora_adapters(args.adapters)
224 backend_config = PytorchEngineConfig(
225 dtype=args.dtype,
226 tp=args.tp,
227 dp=args.dp,
228 ep=args.ep,
229 max_batch_size=max_batch_size,
230 cache_max_entry_count=args.cache_max_entry_count,
231 block_size=args.cache_block_seq_len,
232 kernel_block_size=args.kernel_block_size,
233 session_len=args.session_len,
234 adapters=adapters,
235 enable_prefix_caching=args.enable_prefix_caching,
236 device_type=args.device,
237 quant_policy=args.quant_policy,
238 eager_mode=args.eager_mode,
239 max_prefill_token_num=args.max_prefill_token_num,
240 enable_microbatch=args.enable_microbatch,
241 enable_eplb=args.enable_eplb,
242 enable_metrics=not args.disable_metrics,
243 role=EngineRole[args.role],
244 migration_backend=MigrationBackend[args.migration_backend],
245 model_format=args.model_format,
246 hf_overrides=args.hf_overrides,
247 disable_vision_encoder=args.disable_vision_encoder,
248 logprobs_mode=args.logprobs_mode,
249 dllm_block_length=args.dllm_block_length,
250 dllm_unmasking_strategy=args.dllm_unmasking_strategy,
251 dllm_denoising_steps=args.dllm_denoising_steps,
252 dllm_confidence_threshold=args.dllm_confidence_threshold,
253 enable_return_routed_experts=args.enable_return_routed_experts,
254 distributed_executor_backend=args.distributed_executor_backend,
255 )
256 else:
257 from lmdeploy.messages import TurbomindEngineConfig
258 backend_config = TurbomindEngineConfig(dtype=args.dtype,
259 tp=args.tp,
260 dp=args.dp,
261 cp=args.cp,
262 nnodes=args.nnodes,
263 node_rank=args.node_rank,
264 dist_init_addr=args.dist_init_addr,
265 max_batch_size=max_batch_size,
266 session_len=args.session_len,
267 model_format=args.model_format,

Callers

nothing calls this directly

Calls 9

get_max_batch_size — Function · 0.90
autoget_backend — Function · 0.90
PytorchEngineConfig — Class · 0.90
VisionConfig — Class · 0.90
launch_server — Function · 0.90
get_lora_adapters — Function · 0.85
get_speculative_config — Function · 0.85
get_chat_template — Function · 0.70

Tested by

no test coverage detected