hub / github.com/FoundationVision/LlamaGen / profile_run

Method profile_run

autoregressive/serve/model_runner.py:889–953 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

887
888	@torch.inference_mode()
889	def profile_run(self) -> None:
890	# Enable top-k sampling to reflect the accurate memory usage.
891	sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
892	max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
893	max_num_seqs = self.scheduler_config.max_num_seqs
894
895	# This represents the maximum number of different requests
896	# that will have unique loras, and therefore the max amount of memory
897	# consumption. Create dummy lora request copies from the lora request
898	# passed in, which contains a lora from the lora warmup path.
899	dummy_lora_requests = []
900	dummy_lora_requests_per_seq = []
901	if self.lora_config:
902	for idx in range(self.lora_config.max_loras):
903	lora_id = idx + 1
904	dummy_lora_request = LoRARequest(
905	lora_name=f"warmup_{lora_id}",
906	lora_int_id=lora_id,
907	lora_local_path="/not/a/real/path",
908	)
909	self.lora_manager.add_dummy_lora(dummy_lora_request,
910	rank=LORA_WARMUP_RANK)
911	dummy_lora_requests.append(dummy_lora_request)
912	dummy_lora_requests_per_seq = [
913	dummy_lora_requests[idx % len(dummy_lora_requests)]
914	for idx in range(max_num_seqs)
915	]
916
917	# Profile memory usage with max_num_sequences sequences and the total
918	# number of tokens equal to max_num_batched_tokens.
919	seqs: List[SequenceGroupMetadata] = []
920	# Additional GPU memory may be needed for vision encoding, which needs
921	# to be accounted for when calculating the GPU blocks for
922	# vLLM block manager.
923	# To exercise the worst scenario for GPU memory consumption,
924	# the number of seqs (batch_size) is chosen to maximize the number
925	# of images processed.
926	if self.vision_language_config:
927	max_num_seqs = min(
928	max_num_seqs,
929	int(max_num_batched_tokens /
930	self.vision_language_config.image_feature_size))
931	for group_id in range(max_num_seqs):
932	seq_len = (max_num_batched_tokens // max_num_seqs +
933	(group_id < max_num_batched_tokens % max_num_seqs))
934	seq_data, fake_multi_modal_input = _prepare_fake_inputs(
935	seq_len, self.vision_language_config)
936	seq = SequenceGroupMetadata(
937	request_id=str(group_id),
938	is_prompt=True,
939	seq_data={group_id: seq_data},
940	sampling_params=sampling_params,
941	block_tables=None,
942	lora_request=dummy_lora_requests_per_seq[group_id]
943	if dummy_lora_requests_per_seq else None,
944	multi_modal_data=fake_multi_modal_input,
945	)
946	seqs.append(seq)

Callers 1

determine_num_available_blocksMethod · 0.80

Calls 2

execute_modelMethod · 0.95

_prepare_fake_inputsFunction · 0.85

Tested by

no test coverage detected