(self,
request: GenerationRequest,
result_wait_queue=None)
| 384 | uids=[str(prompt_adapter_request.adapter_id)]) |
| 385 | |
| 386 | def _enqueue_request(self, |
| 387 | request: GenerationRequest, |
| 388 | result_wait_queue=None) -> int: |
| 389 | assert request.id is not None |
| 390 | py_lora_path = None |
| 391 | if self._lora_manager is not None and request.lora_request is not None: |
| 392 | adapter_in_cache = self._lora_manager.is_adapter_in_cpu_cache( |
| 393 | request.lora_request.adapter_id) |
| 394 | self._load_lora_adapter(request.lora_request) |
| 395 | uid = str(request.lora_request.adapter_id) |
| 396 | lora_config = tllm.LoraConfig( |
| 397 | task_id=request.lora_request.adapter_id, |
| 398 | weights=self._lora_manager.cpp_lora_weights[uid] |
| 399 | if not adapter_in_cache else None, |
| 400 | config=self._lora_manager.cpp_lora_config[uid]) |
| 401 | py_lora_path = request.lora_request.lora_path |
| 402 | else: |
| 403 | lora_config = None |
| 404 | |
| 405 | prompt_token_ids = copy.deepcopy(request.prompt_token_ids) |
| 406 | prompt_tuning_config = None |
| 407 | if request.prompt_adapter_request is not None: |
| 408 | self._load_prompt_adapter(request.prompt_adapter_request) |
| 409 | uid = str(request.prompt_adapter_request.adapter_id) |
| 410 | prompt_tuning_config = tllm.PromptTuningConfig( |
| 411 | self._prompt_adapter_manager.uid_to_weights[uid]) |
| 412 | vocab_size = self._runtime_model_config.vocab_size |
| 413 | pa_length = prompt_tuning_config.embedding_table.size(0) |
| 414 | prompt_token_ids = list(range( |
| 415 | vocab_size, vocab_size + pa_length)) + prompt_token_ids |
| 416 | |
| 417 | # MULTIMODAL |
| 418 | # NOTE: Since we only support the PyTorch backend for multimodal, we send multimodal_data through the 'py_multimodal_data' field, |
| 419 | # except `multimodal_input`, which needs to go through the C++ runtime. |
| 420 | multimodal_input = None |
| 421 | if request.multimodal_params is not None and request.multimodal_params.has_content( |
| 422 | ): |
| 423 | if request.multimodal_params.multimodal_input is not None: |
| 424 | multimodal_input = tllm.MultimodalInput( |
| 425 | multimodal_hashes=request.multimodal_params. |
| 426 | multimodal_input.multimodal_hashes, |
| 427 | multimodal_positions=request.multimodal_params. |
| 428 | multimodal_input.multimodal_positions, |
| 429 | multimodal_lengths=request.multimodal_params. |
| 430 | multimodal_input.multimodal_lengths) |
| 431 | # NOTE: Setting to None here to avoid sending multimodal_input again through the 'py_multimodal_data' field |
| 432 | request.multimodal_params.multimodal_input = None |
| 433 | |
| 434 | context_phase_params = None |
| 435 | request_type = tllm.RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION |
| 436 | if request.disaggregated_params is not None: |
| 437 | assert ( |
| 438 | not self._is_pytorch_backend |
| 439 | or self.engine.kv_cache_transceiver is not None |
| 440 | or request.disaggregated_params.request_type |
| 441 | == "context_and_generation" |
| 442 | ), "kv_cache_transceiver is disabled, please set 'cache_transceiver_config: backend:<backend_type>` in config file for disaggregated serving" |
| 443 | request_type = request.disaggregated_params.get_request_type() |
no test coverage detected