(
self,
model_w_task,
dtype,
enable_cuda_graph,
enable_triton,
query,
inf_kwargs,
assert_fn,
perf_meas=True,
)
| 307 | world_size = 1 |
| 308 | |
| 309 | def test( |
| 310 | self, |
| 311 | model_w_task, |
| 312 | dtype, |
| 313 | enable_cuda_graph, |
| 314 | enable_triton, |
| 315 | query, |
| 316 | inf_kwargs, |
| 317 | assert_fn, |
| 318 | perf_meas=True, |
| 319 | ): |
| 320 | invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton) |
| 321 | if invalid_test_msg: |
| 322 | pytest.skip(invalid_test_msg) |
| 323 | |
| 324 | if dtype not in get_accelerator().supported_dtypes(): |
| 325 | pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") |
| 326 | |
| 327 | if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: |
| 328 | pytest.skip("This op had not been implemented on this system.", allow_module_level=True) |
| 329 | |
| 330 | model, task = model_w_task |
| 331 | local_rank = int(os.getenv("LOCAL_RANK", "0")) |
| 332 | |
| 333 | # Load the model on CPU first to avoid OOM for large models @fp32 |
| 334 | pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") |
| 335 | if dtype == torch.half: |
| 336 | pipe.model.half() |
| 337 | |
| 338 | # Switch device to GPU after converting to half |
| 339 | device = torch.device(get_accelerator().device_name(local_rank)) |
| 340 | pipe.device = device |
| 341 | pipe.model.to(device) |
| 342 | |
| 343 | # Warm-up queries for perf measurement |
| 344 | #for i in range(10): |
| 345 | # _ = pipe(query, **inf_kwargs) |
| 346 | get_accelerator().synchronize() |
| 347 | start = time.time() |
| 348 | bs_output = pipe(query, **inf_kwargs) |
| 349 | get_accelerator().synchronize() |
| 350 | bs_time = time.time() - start |
| 351 | |
| 352 | args = { |
| 353 | 'mp_size': 1, |
| 354 | 'dtype': dtype, |
| 355 | 'replace_with_kernel_inject': True, |
| 356 | 'enable_cuda_graph': enable_cuda_graph, |
| 357 | 'use_triton': enable_triton, |
| 358 | 'triton_autotune': False, |
| 359 | } |
| 360 | if pipe.tokenizer.model_max_length < deepspeed.ops.transformer.inference.config.DeepSpeedInferenceConfig( |
| 361 | ).max_out_tokens: |
| 362 | args.update({'max_out_tokens': pipe.tokenizer.model_max_length}) |
| 363 | pipe.model = deepspeed.init_inference(pipe.model, **args) |
| 364 | check_injection(pipe.model) |
| 365 | # Warm-up queries for perf measurement |
| 366 | #for i in range(10): |
nothing calls this directly
no test coverage detected