MCPcopy
hub / github.com/deepspeedai/DeepSpeed / test

Method test

tests/unit/inference/test_inference.py:309–388  ·  view source on GitHub ↗
(
        self,
        model_w_task,
        dtype,
        enable_cuda_graph,
        enable_triton,
        query,
        inf_kwargs,
        assert_fn,
        perf_meas=True,
    )

Source from the content-addressed store, hash-verified

307 world_size = 1
308
309 def test(
310 self,
311 model_w_task,
312 dtype,
313 enable_cuda_graph,
314 enable_triton,
315 query,
316 inf_kwargs,
317 assert_fn,
318 perf_meas=True,
319 ):
320 invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton)
321 if invalid_test_msg:
322 pytest.skip(invalid_test_msg)
323
324 if dtype not in get_accelerator().supported_dtypes():
325 pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
326
327 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
328 pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
329
330 model, task = model_w_task
331 local_rank = int(os.getenv("LOCAL_RANK", "0"))
332
333 # Load the model on CPU first to avoid OOM for large models @fp32
334 pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
335 if dtype == torch.half:
336 pipe.model.half()
337
338 # Switch device to GPU after converting to half
339 device = torch.device(get_accelerator().device_name(local_rank))
340 pipe.device = device
341 pipe.model.to(device)
342
343 # Warm-up queries for perf measurement
344 #for i in range(10):
345 # _ = pipe(query, **inf_kwargs)
346 get_accelerator().synchronize()
347 start = time.time()
348 bs_output = pipe(query, **inf_kwargs)
349 get_accelerator().synchronize()
350 bs_time = time.time() - start
351
352 args = {
353 'mp_size': 1,
354 'dtype': dtype,
355 'replace_with_kernel_inject': True,
356 'enable_cuda_graph': enable_cuda_graph,
357 'use_triton': enable_triton,
358 'triton_autotune': False,
359 }
360 if pipe.tokenizer.model_max_length < deepspeed.ops.transformer.inference.config.DeepSpeedInferenceConfig(
361 ).max_out_tokens:
362 args.update({'max_out_tokens': pipe.tokenizer.model_max_length})
363 pipe.model = deepspeed.init_inference(pipe.model, **args)
364 check_injection(pipe.model)
365 # Warm-up queries for perf measurement
366 #for i in range(10):

Callers

nothing calls this directly

Calls 10

get_accelerator · Function · 0.90
validate_test · Function · 0.85
check_injection · Function · 0.85
assert_fn · Function · 0.85
supported_dtypes · Method · 0.45
device_name · Method · 0.45
device · Method · 0.45
to · Method · 0.45
synchronize · Method · 0.45
update · Method · 0.45

Tested by

no test coverage detected