MCPcopy
hub / github.com/FoundationVision/LlamaGen / profile_run

Method profile_run

autoregressive/serve/model_runner.py:889–953  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

887
888 @torch.inference_mode()
889 def profile_run(self) -> None:
890 # Enable top-k sampling to reflect the accurate memory usage.
891 sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
892 max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
893 max_num_seqs = self.scheduler_config.max_num_seqs
894
895 # This represents the maximum number of different requests
896 # that will have unique loras, and therefore the max amount of memory
897 # consumption. Create dummy lora request copies from the lora request
898 # passed in, which contains a lora from the lora warmup path.
899 dummy_lora_requests = []
900 dummy_lora_requests_per_seq = []
901 if self.lora_config:
902 for idx in range(self.lora_config.max_loras):
903 lora_id = idx + 1
904 dummy_lora_request = LoRARequest(
905 lora_name=f"warmup_{lora_id}",
906 lora_int_id=lora_id,
907 lora_local_path="/not/a/real/path",
908 )
909 self.lora_manager.add_dummy_lora(dummy_lora_request,
910 rank=LORA_WARMUP_RANK)
911 dummy_lora_requests.append(dummy_lora_request)
912 dummy_lora_requests_per_seq = [
913 dummy_lora_requests[idx % len(dummy_lora_requests)]
914 for idx in range(max_num_seqs)
915 ]
916
917 # Profile memory usage with max_num_sequences sequences and the total
918 # number of tokens equal to max_num_batched_tokens.
919 seqs: List[SequenceGroupMetadata] = []
920 # Additional GPU memory may be needed for vision encoding, which needs
921 # to be accounted for when calculating the GPU blocks for
922 # vLLM block manager.
923 # To exercise the worst scenario for GPU memory consumption,
924 # the number of seqs (batch_size) is chosen to maximize the number
925 # of images processed.
926 if self.vision_language_config:
927 max_num_seqs = min(
928 max_num_seqs,
929 int(max_num_batched_tokens /
930 self.vision_language_config.image_feature_size))
931 for group_id in range(max_num_seqs):
932 seq_len = (max_num_batched_tokens // max_num_seqs +
933 (group_id < max_num_batched_tokens % max_num_seqs))
934 seq_data, fake_multi_modal_input = _prepare_fake_inputs(
935 seq_len, self.vision_language_config)
936 seq = SequenceGroupMetadata(
937 request_id=str(group_id),
938 is_prompt=True,
939 seq_data={group_id: seq_data},
940 sampling_params=sampling_params,
941 block_tables=None,
942 lora_request=dummy_lora_requests_per_seq[group_id]
943 if dummy_lora_requests_per_seq else None,
944 multi_modal_data=fake_multi_modal_input,
945 )
946 seqs.append(seq)

Callers 1

Calls 2

execute_modelMethod · 0.95
_prepare_fake_inputsFunction · 0.85

Tested by

no test coverage detected