hub / github.com/NVIDIA/TensorRT-LLM / _enqueue_request

Method _enqueue_request

tensorrt_llm/executor/base_worker.py:386–605 · view source on GitHub ↗

(self,
                         request: GenerationRequest,
                         result_wait_queue=None)

Source from the content-addressed store, hash-verified

384	uids=[str(prompt_adapter_request.adapter_id)])
385
386	def _enqueue_request(self,
387	request: GenerationRequest,
388	result_wait_queue=None) -> int:
389	assert request.id is not None
390	py_lora_path = None
391	if self._lora_manager is not None and request.lora_request is not None:
392	adapter_in_cache = self._lora_manager.is_adapter_in_cpu_cache(
393	request.lora_request.adapter_id)
394	self._load_lora_adapter(request.lora_request)
395	uid = str(request.lora_request.adapter_id)
396	lora_config = tllm.LoraConfig(
397	task_id=request.lora_request.adapter_id,
398	weights=self._lora_manager.cpp_lora_weights[uid]
399	if not adapter_in_cache else None,
400	config=self._lora_manager.cpp_lora_config[uid])
401	py_lora_path = request.lora_request.lora_path
402	else:
403	lora_config = None
404
405	prompt_token_ids = copy.deepcopy(request.prompt_token_ids)
406	prompt_tuning_config = None
407	if request.prompt_adapter_request is not None:
408	self._load_prompt_adapter(request.prompt_adapter_request)
409	uid = str(request.prompt_adapter_request.adapter_id)
410	prompt_tuning_config = tllm.PromptTuningConfig(
411	self._prompt_adapter_manager.uid_to_weights[uid])
412	vocab_size = self._runtime_model_config.vocab_size
413	pa_length = prompt_tuning_config.embedding_table.size(0)
414	prompt_token_ids = list(range(
415	vocab_size, vocab_size + pa_length)) + prompt_token_ids
416
417	# MULTIMODAL
418	# NOTE: Since we only support the PyTorch backend for multimodal, we send multimodal_data through the 'py_multimodal_data' field,
419	# except for `multimodal_input`, which needs to go through the C++ runtime.
420	multimodal_input = None
421	if request.multimodal_params is not None and request.multimodal_params.has_content(
422	):
423	if request.multimodal_params.multimodal_input is not None:
424	multimodal_input = tllm.MultimodalInput(
425	multimodal_hashes=request.multimodal_params.
426	multimodal_input.multimodal_hashes,
427	multimodal_positions=request.multimodal_params.
428	multimodal_input.multimodal_positions,
429	multimodal_lengths=request.multimodal_params.
430	multimodal_input.multimodal_lengths)
431	# NOTE: Setting to None here to avoid sending multimodal_input again through the 'py_multimodal_data' field
432	request.multimodal_params.multimodal_input = None
433
434	context_phase_params = None
435	request_type = tllm.RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION
436	if request.disaggregated_params is not None:
437	assert (
438	not self._is_pytorch_backend
439	or self.engine.kv_cache_transceiver is not None
440	or request.disaggregated_params.request_type
441	== "context_and_generation"
442	), "kv_cache_transceiver is disabled, please set 'cache_transceiver_config: backend:<backend_type>` in config file for disaggregated serving"
443	request_type = request.disaggregated_params.get_request_type()

Callers 2

submit · Method · 0.95

enqueue_request · Method · 0.80

Calls 15

_load_lora_adapter · Method · 0.95

_load_prompt_adapter · Method · 0.95

RequestError · Class · 0.85

is_adapter_in_cpu_cache · Method · 0.80

has_content · Method · 0.80

get_request_type · Method · 0.80

get_context_phase_params · Method · 0.80

_get_sampling_config · Method · 0.80

_get_output_config · Method · 0.80

maybe_to_pybind · Method · 0.80

_get_guided_decoding_params · Method · 0.80

_get_bad_words · Method · 0.80

Tested by

no test coverage detected