hub / github.com/NVIDIA/TensorRT-LLM / allocate_workspace

Method allocate_workspace

tensorrt_llm/plugin/plugin.py:635–689 · view source on GitHub ↗

(mapping: Mapping,
                           size: int)

Source from the content-addressed store, hash-verified

633
@staticmethod
def allocate_workspace(mapping: Mapping,
                       size: int) -> Tuple[List[IpcMemory], "torch.tensor"]:
    """Allocate the IPC buffers used by the all-reduce workspace.

    Seven ``IpcMemory`` regions are created, in this order: two data
    buffers (ping/pong), two barrier buffers (in/out) and three lamport
    buffers. A small CUDA ``int64`` flag tensor is created afterwards.

    Args:
        mapping: Parallel mapping; ``mapping.tp_size`` scales the buffer
            sizes and the mapping is forwarded to every ``IpcMemory``.
        size: Base size of the data buffers. It is multiplied by
            ``mapping.tp_size`` unless deterministic all-reduce is forced.

    Returns:
        A pair ``(buffers, workspace)``:

        * ``buffers`` - the seven ``IpcMemory`` objects followed by the
          CUDA flag tensor, so everything stays referenced by the caller.
        * ``workspace`` - a CPU ``int64`` tensor holding the concatenated
          ``serialize()`` output of the seven regions, followed by the
          data pointers of the flag tensor at element offsets 0, 1 and 2.

    NOTE(review): the return annotation spells ``"torch.tensor"`` (the
    factory function) and the list also contains a tensor, not only
    ``IpcMemory`` - annotation kept as-is here; confirm before tightening.
    """
    import torch

    deterministic = force_all_reduce_deterministic()
    peer_access = can_access_peer(mapping)
    tp_size = mapping.tp_size

    def _alloc(buffer_size):
        # Every region shares the same mapping and peer-access setting.
        return IpcMemory(mapping, buffer_size, peer_access)

    # Force pull mode and disable lamport when force deterministic is
    # enabled, for reducing device memory usage.
    if deterministic:
        data_size = size
        lamport_size = 1
    else:
        data_size = size * tp_size
        lamport_size = size * tp_size
    barrier_size = IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * tp_size * 2 * tp_size

    # The construction order below is kept identical to the original
    # sequence: ping, pong, barriers in/out, then the three lamport buffers.
    data_ping = _alloc(data_size)
    data_pong = _alloc(data_size)
    barriers_in = _alloc(barrier_size)
    barriers_out = _alloc(barrier_size)
    lamport_0 = _alloc(lamport_size)
    lamport_1 = _alloc(lamport_size)
    lamport_2 = _alloc(lamport_size)

    # TODO: it seems we may need to initialize lamport buffers for all tp groups
    # just like its cpp counterpart (AllReduceBuffers::AllReduceBuffers()) does.
    if peer_access:
        lamport_initialize_all(
            lamport_0.local_ptr,
            lamport_1.local_ptr,
            lamport_2.local_ptr,
            lamport_size,
        )

    # Start from 1 since 0 represents released state for barrier at the
    # beginning of the all_reduce.
    # The last element is the barrier flag counter.
    flags = torch.tensor([1, 1, 0], dtype=torch.int64, device="cuda")

    ipc_regions = [
        data_ping,
        data_pong,
        barriers_in,
        barriers_out,
        lamport_0,
        lamport_1,
        lamport_2,
    ]
    buffers = ipc_regions + [flags]

    # Flatten the per-region serialized values, then append one pointer
    # per flag element (offsets 0, 1 and 2 into the flag tensor).
    workspace_values = []
    for region in ipc_regions:
        workspace_values = workspace_values + region.serialize()
    workspace_values = workspace_values + [flags.data_ptr()]
    workspace_values = workspace_values + [flags[1:].data_ptr()]
    workspace_values = workspace_values + [flags[2:].data_ptr()]

    workspace = torch.tensor(workspace_values,
                             dtype=torch.int64,
                             device="cpu")
    return buffers, workspace
690
691	@staticmethod
692	def allocate_lowprecision_workspace(

Callers 8

test_allreduceMethod · 0.80

test_pp_reduce_scatterMethod · 0.80

test_allreduceMethod · 0.80

__init__Method · 0.80

encoder_runMethod · 0.80

__init__Method · 0.80

Calls 5

serializeMethod · 0.95

force_all_reduce_deterministicFunction · 0.85

can_access_peerFunction · 0.85

IpcMemoryClass · 0.85

data_ptrMethod · 0.45

Tested by 4

test_allreduceMethod · 0.64

test_pp_reduce_scatterMethod · 0.64

test_allreduceMethod · 0.64