hub / github.com/deepspeedai/DeepSpeed / test

Method test

tests/unit/inference/test_inference.py:309–388 · view source on GitHub ↗

(
        self,
        model_w_task,
        dtype,
        enable_cuda_graph,
        enable_triton,
        query,
        inf_kwargs,
        assert_fn,
        perf_meas=True,
    )

Source from the content-addressed store, hash-verified

307	world_size = 1
308
309	def test(
310	self,
311	model_w_task,
312	dtype,
313	enable_cuda_graph,
314	enable_triton,
315	query,
316	inf_kwargs,
317	assert_fn,
318	perf_meas=True,
319	):
320	invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton)
321	if invalid_test_msg:
322	pytest.skip(invalid_test_msg)
323
324	if dtype not in get_accelerator().supported_dtypes():
325	pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
326
327	if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
328	pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
329
330	model, task = model_w_task
331	local_rank = int(os.getenv("LOCAL_RANK", "0"))
332
333	# Load the model on CPU first to avoid OOM for large models @fp32
334	pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
335	if dtype == torch.half:
336	pipe.model.half()
337
338	# Switch device to GPU after converting to half
339	device = torch.device(get_accelerator().device_name(local_rank))
340	pipe.device = device
341	pipe.model.to(device)
342
343	# Warm-up queries for perf measurement
344	#for i in range(10):
345	# _ = pipe(query, **inf_kwargs)
346	get_accelerator().synchronize()
347	start = time.time()
348	bs_output = pipe(query, **inf_kwargs)
349	get_accelerator().synchronize()
350	bs_time = time.time() - start
351
352	args = {
353	'mp_size': 1,
354	'dtype': dtype,
355	'replace_with_kernel_inject': True,
356	'enable_cuda_graph': enable_cuda_graph,
357	'use_triton': enable_triton,
358	'triton_autotune': False,
359	}
360	if pipe.tokenizer.model_max_length < deepspeed.ops.transformer.inference.config.DeepSpeedInferenceConfig(
361	).max_out_tokens:
362	args.update({'max_out_tokens': pipe.tokenizer.model_max_length})
363	pipe.model = deepspeed.init_inference(pipe.model, **args)
364	check_injection(pipe.model)
365	# Warm-up queries for perf measurement
366	#for i in range(10):

Callers

nothing calls this directly

Calls 10

get_accelerator · Function · 0.90

validate_test · Function · 0.85

check_injection · Function · 0.85

assert_fn · Function · 0.85

supported_dtypes · Method · 0.45

device_name · Method · 0.45

device · Method · 0.45

to · Method · 0.45

synchronize · Method · 0.45

update · Method · 0.45

Tested by

no test coverage detected