MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / _set_weight_streaming

Method _set_weight_streaming

tensorrt_llm/runtime/session.py:239–266  ·  view source on GitHub ↗
(self, gpu_weights_percent)

Source from the content-addressed store, hash-verified

237 return outputs
238
def _set_weight_streaming(self, gpu_weights_percent):
    """Apply a weight-streaming budget to the engine and rebuild its contexts.

    Args:
        gpu_weights_percent: Fraction of the engine's streamable weights to
            keep on the GPU. It is multiplied by
            ``engine.streamable_weights_size`` and truncated to whole bytes.
            Must be exactly 1 when the engine reports no streamable weights.

    Raises:
        AssertionError: If the engine is missing, if a value other than 1 is
            requested for an engine without streamable weights, or if the
            engine does not report back the budget that was just written.
        torch.cuda.OutOfMemoryError: If preparing the execution contexts
            fails and the free GPU memory is smaller than the budget.
    """
    # Validate the engine before its first attribute access; checking it
    # after reading `streamable_weights_size` could never fire.
    assert self.engine is not None

    streamable_size = self.engine.streamable_weights_size
    if not streamable_size:
        assert gpu_weights_percent == 1, "Engine built without weight streaming. Cannot set gpu_weights_percent to a value other than 1."
        return

    # Drop the current context before touching the budget.
    # NOTE(review): presumably the budget cannot be changed while a context
    # is alive -- confirm against the TensorRT weight streaming docs.
    self._context = None

    min_budget = 0
    max_budget = streamable_size
    budget = int(gpu_weights_percent * max_budget)

    self.engine.weight_streaming_budget_v2 = budget
    # Read the value back to confirm the engine accepted it.
    assert self.engine.weight_streaming_budget_v2 == budget, "Failed to set weight streaming budget!"
    logger.info(
        f"Set gpu weights percent to {gpu_weights_percent}, which is {budget} bytes. Valid range: {min_budget} bytes ~ {max_budget} bytes."
    )

    try:
        self.__prepare_execution_contexts()
    except Exception as err:
        # Only translate ordinary failures; KeyboardInterrupt / SystemExit
        # must propagate untouched instead of triggering a CUDA query.
        free_mem = torch.cuda.mem_get_info()[0]
        if free_mem < budget:
            raise torch.cuda.OutOfMemoryError(
                f"Out of Memory: Memory budget is {budget} bytes but only {free_mem} bytes are available on the GPU."
            ) from err
        raise
267
268 def run(self,
269 inputs: Dict[str, Any],

Callers 2

from_engineMethod · 0.45
from_dirMethod · 0.45

Calls 2

infoMethod · 0.45

Tested by

no test coverage detected