hub / github.com/InternLM/lmdeploy / api_server

Method api_server

lmdeploy/cli/serve.py:210–347 · view source on GitHub ↗

Serve LLMs with a RESTful API using FastAPI.

(args)

Source from the content-addressed store, hash-verified

208
209	@staticmethod
210	def api_server(args):
211	"""Serve LLMs with restful api using fastapi."""
212	from lmdeploy.archs import autoget_backend
213
214	max_batch_size = args.max_batch_size if args.max_batch_size \
215	else get_max_batch_size(args.device)
216	backend = args.backend
217	if backend != 'pytorch':
218	# set auto backend mode
219	backend = autoget_backend(args.model_path, trust_remote_code=args.trust_remote_code)
220
221	if backend == 'pytorch':
222	from lmdeploy.messages import PytorchEngineConfig
223	adapters = get_lora_adapters(args.adapters)
224	backend_config = PytorchEngineConfig(
225	dtype=args.dtype,
226	tp=args.tp,
227	dp=args.dp,
228	ep=args.ep,
229	max_batch_size=max_batch_size,
230	cache_max_entry_count=args.cache_max_entry_count,
231	block_size=args.cache_block_seq_len,
232	kernel_block_size=args.kernel_block_size,
233	session_len=args.session_len,
234	adapters=adapters,
235	enable_prefix_caching=args.enable_prefix_caching,
236	device_type=args.device,
237	quant_policy=args.quant_policy,
238	eager_mode=args.eager_mode,
239	max_prefill_token_num=args.max_prefill_token_num,
240	enable_microbatch=args.enable_microbatch,
241	enable_eplb=args.enable_eplb,
242	enable_metrics=not args.disable_metrics,
243	role=EngineRole[args.role],
244	migration_backend=MigrationBackend[args.migration_backend],
245	model_format=args.model_format,
246	hf_overrides=args.hf_overrides,
247	disable_vision_encoder=args.disable_vision_encoder,
248	logprobs_mode=args.logprobs_mode,
249	dllm_block_length=args.dllm_block_length,
250	dllm_unmasking_strategy=args.dllm_unmasking_strategy,
251	dllm_denoising_steps=args.dllm_denoising_steps,
252	dllm_confidence_threshold=args.dllm_confidence_threshold,
253	enable_return_routed_experts=args.enable_return_routed_experts,
254	distributed_executor_backend=args.distributed_executor_backend,
255	)
256	else:
257	from lmdeploy.messages import TurbomindEngineConfig
258	backend_config = TurbomindEngineConfig(dtype=args.dtype,
259	tp=args.tp,
260	dp=args.dp,
261	cp=args.cp,
262	nnodes=args.nnodes,
263	node_rank=args.node_rank,
264	dist_init_addr=args.dist_init_addr,
265	max_batch_size=max_batch_size,
266	session_len=args.session_len,
267	model_format=args.model_format,

Callers

nothing calls this directly

Calls 9

get_max_batch_size Function · 0.90

autoget_backend Function · 0.90

PytorchEngineConfig Class · 0.90

TurbomindEngineConfig Class · 0.90

VisionConfig Class · 0.90

launch_server Function · 0.90

get_lora_adapters Function · 0.85

get_speculative_config Function · 0.85

get_chat_template Function · 0.70

Tested by

no test coverage detected