(self, gpu_weights_percent)
| 237 | return outputs |
| 238 | |
| 239 | def _set_weight_streaming(self, gpu_weights_percent): |
| 240 | if not self.engine.streamable_weights_size: |
| 241 | assert gpu_weights_percent == 1, "Engine built without weight streaming. Cannot set gpu_weights_percent to a value other than 1." |
| 242 | return |
| 243 | |
| 244 | assert self.engine is not None |
| 245 | |
| 246 | self._context = None |
| 247 | |
| 248 | min = 0 |
| 249 | max = self.engine.streamable_weights_size |
| 250 | budget = int(gpu_weights_percent * max) |
| 251 | |
| 252 | self.engine.weight_streaming_budget_v2 = budget |
| 253 | assert self.engine.weight_streaming_budget_v2 == budget, "Failed to set weight streaming budget!" |
| 254 | logger.info( |
| 255 | f"Set gpu weights percent to {gpu_weights_percent}, which is {budget} bytes. Valid range: {min} bytes ~ {max} bytes." |
| 256 | ) |
| 257 | |
| 258 | try: |
| 259 | self.__prepare_execution_contexts() |
| 260 | except: |
| 261 | free_mem = torch.cuda.mem_get_info()[0] |
| 262 | if free_mem < budget: |
| 263 | raise torch.cuda.OutOfMemoryError( |
| 264 | f"Out of Memory: Memory budget is {budget} bytes but only {free_mem} bytes are available on the GPU." |
| 265 | ) |
| 266 | raise |
| 267 | |
| 268 | def run(self, |
| 269 | inputs: Dict[str, Any], |
no test coverage detected