MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / _enqueue_request

Method _enqueue_request

tensorrt_llm/executor/base_worker.py:386–605  ·  view source on GitHub ↗
(self,
                         request: GenerationRequest,
                         result_wait_queue=None)

Source from the content-addressed store, hash-verified

384 uids=[str(prompt_adapter_request.adapter_id)])
385
386 def _enqueue_request(self,
387 request: GenerationRequest,
388 result_wait_queue=None) -> int:
389 assert request.id is not None
390 py_lora_path = None
391 if self._lora_manager is not None and request.lora_request is not None:
392 adapter_in_cache = self._lora_manager.is_adapter_in_cpu_cache(
393 request.lora_request.adapter_id)
394 self._load_lora_adapter(request.lora_request)
395 uid = str(request.lora_request.adapter_id)
396 lora_config = tllm.LoraConfig(
397 task_id=request.lora_request.adapter_id,
398 weights=self._lora_manager.cpp_lora_weights[uid]
399 if not adapter_in_cache else None,
400 config=self._lora_manager.cpp_lora_config[uid])
401 py_lora_path = request.lora_request.lora_path
402 else:
403 lora_config = None
404
405 prompt_token_ids = copy.deepcopy(request.prompt_token_ids)
406 prompt_tuning_config = None
407 if request.prompt_adapter_request is not None:
408 self._load_prompt_adapter(request.prompt_adapter_request)
409 uid = str(request.prompt_adapter_request.adapter_id)
410 prompt_tuning_config = tllm.PromptTuningConfig(
411 self._prompt_adapter_manager.uid_to_weights[uid])
412 vocab_size = self._runtime_model_config.vocab_size
413 pa_length = prompt_tuning_config.embedding_table.size(0)
414 prompt_token_ids = list(range(
415 vocab_size, vocab_size + pa_length)) + prompt_token_ids
416
417 # MULTIMODAL
418 # NOTE: Since we only support the PyTorch backend for multimodal, we send multimodal_data through the 'py_multimodal_data' field,
419 # except `multimodal_input`, which needs to go through the C++ runtime.
420 multimodal_input = None
421 if request.multimodal_params is not None and request.multimodal_params.has_content(
422 ):
423 if request.multimodal_params.multimodal_input is not None:
424 multimodal_input = tllm.MultimodalInput(
425 multimodal_hashes=request.multimodal_params.
426 multimodal_input.multimodal_hashes,
427 multimodal_positions=request.multimodal_params.
428 multimodal_input.multimodal_positions,
429 multimodal_lengths=request.multimodal_params.
430 multimodal_input.multimodal_lengths)
431 # NOTE: Setting to None here to avoid sending multimodal_input again through the 'py_multimodal_data' field
432 request.multimodal_params.multimodal_input = None
433
434 context_phase_params = None
435 request_type = tllm.RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION
436 if request.disaggregated_params is not None:
437 assert (
438 not self._is_pytorch_backend
439 or self.engine.kv_cache_transceiver is not None
440 or request.disaggregated_params.request_type
441 == "context_and_generation"
442 ), "kv_cache_transceiver is disabled, please set 'cache_transceiver_config: backend:<backend_type>` in config file for disaggregated serving"
443 request_type = request.disaggregated_params.get_request_type()

Callers 2

submitMethod · 0.95
enqueue_requestMethod · 0.80

Calls 15

_load_lora_adapterMethod · 0.95
_load_prompt_adapterMethod · 0.95
RequestErrorClass · 0.85
has_contentMethod · 0.80
get_request_typeMethod · 0.80
_get_sampling_configMethod · 0.80
_get_output_configMethod · 0.80
maybe_to_pybindMethod · 0.80
_get_bad_wordsMethod · 0.80

Tested by

no test coverage detected