(self)
| 887 | |
| 888 | @torch.inference_mode() |
| 889 | def profile_run(self) -> None: |
| 890 | # Enable top-k sampling to reflect the accurate memory usage. |
| 891 | sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) |
| 892 | max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens |
| 893 | max_num_seqs = self.scheduler_config.max_num_seqs |
| 894 | |
| 895 | # This represents the maximum number of different requests |
| 896 | # that will have unique loras, and therefore the max amount of memory |
| 897 | # consumption. Create dummy lora request copies from the lora request |
| 898 | # passed in, which contains a lora from the lora warmup path. |
| 899 | dummy_lora_requests = [] |
| 900 | dummy_lora_requests_per_seq = [] |
| 901 | if self.lora_config: |
| 902 | for idx in range(self.lora_config.max_loras): |
| 903 | lora_id = idx + 1 |
| 904 | dummy_lora_request = LoRARequest( |
| 905 | lora_name=f"warmup_{lora_id}", |
| 906 | lora_int_id=lora_id, |
| 907 | lora_local_path="/not/a/real/path", |
| 908 | ) |
| 909 | self.lora_manager.add_dummy_lora(dummy_lora_request, |
| 910 | rank=LORA_WARMUP_RANK) |
| 911 | dummy_lora_requests.append(dummy_lora_request) |
| 912 | dummy_lora_requests_per_seq = [ |
| 913 | dummy_lora_requests[idx % len(dummy_lora_requests)] |
| 914 | for idx in range(max_num_seqs) |
| 915 | ] |
| 916 | |
| 917 | # Profile memory usage with max_num_seqs sequences and the total |
| 918 | # number of tokens equal to max_num_batched_tokens. |
| 919 | seqs: List[SequenceGroupMetadata] = [] |
| 920 | # Additional GPU memory may be needed for vision encoding, which needs |
| 921 | # to be accounted for when calculating the GPU blocks for |
| 922 | # vLLM block manager. |
| 923 | # To exercise the worst scenario for GPU memory consumption, |
| 924 | # the number of seqs (batch_size) is chosen to maximize the number |
| 925 | # of images processed. |
| 926 | if self.vision_language_config: |
| 927 | max_num_seqs = min( |
| 928 | max_num_seqs, |
| 929 | int(max_num_batched_tokens / |
| 930 | self.vision_language_config.image_feature_size)) |
| 931 | for group_id in range(max_num_seqs): |
| 932 | seq_len = (max_num_batched_tokens // max_num_seqs + |
| 933 | (group_id < max_num_batched_tokens % max_num_seqs)) |
| 934 | seq_data, fake_multi_modal_input = _prepare_fake_inputs( |
| 935 | seq_len, self.vision_language_config) |
| 936 | seq = SequenceGroupMetadata( |
| 937 | request_id=str(group_id), |
| 938 | is_prompt=True, |
| 939 | seq_data={group_id: seq_data}, |
| 940 | sampling_params=sampling_params, |
| 941 | block_tables=None, |
| 942 | lora_request=dummy_lora_requests_per_seq[group_id] |
| 943 | if dummy_lora_requests_per_seq else None, |
| 944 | multi_modal_data=fake_multi_modal_input, |
| 945 | ) |
| 946 | seqs.append(seq) |
no test coverage detected