MCPcopy
hub / github.com/tinygrad/tinygrad / __init__

Method __init__

tinygrad/runtime/ops_amd.py:955–1037  ·  view source on GitHub ↗
(self, device:str="")

Source from the content-addressed store, hash-verified

def is_usb(self) -> bool:
  """Return True when this device's interface (`self.iface`) is a `USBIface` instance."""
  iface = self.iface
  return isinstance(iface, USBIface)
954
def __init__(self, device:str=""):
  """Open an AMD device: select its interface, derive sizing from its properties, and create its queues.

  `device` is the device string. When it contains a ":", the field right after the first ":" is parsed as the
  integer device index; otherwise index 0 is used.

  Raises AssertionError when the decoded target is not one of the accepted architectures.

  NOTE(review): this listing appears to stop before the method ends (the page header gives a longer line range);
  only the statements shown here are documented -- confirm against the full source.
  """
  self.device_id = int(device.split(":")[1]) if ":" in device else 0

  # Choose the interface object; every property and allocation below goes through it.
  self.iface = self._select_iface()

  # gfx_target_version is decoded as major * 10000 + minor * 100 + stepping.
  self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
  # The arch string prints the major in decimal and the minor/stepping in hex.
  self.arch = "gfx%d%x%x" % self.target
  # Accepted targets: exactly (9,4,2) or (9,5,0), or any target whose major is 11 or 12.
  assert (self.target in ((9,4,2),(9,5,0))) or self.target[0] in (11, 12), f"Unsupported arch: {self.arch}"
  if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")

  # 'num_xcc' is optional in props and defaults to 1; se_cnt and cu_cnt are divided by it.
  self.xccs = self.iface.props.get('num_xcc', 1)
  self.se_cnt = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine'] // self.xccs
  self.cu_cnt = self.iface.props['simd_count'] // self.iface.props['simd_per_cu'] // self.xccs
  self.waves_per_cu = self.iface.props['max_waves_per_simd'] * self.iface.props['simd_per_cu']
  # Major 9 uses a capped formula (40 per CU, at most 512 per SE per XCC); other targets use cu_cnt * waves_per_cu.
  self.wave_cnt = (self.cu_cnt * self.waves_per_cu) if self.target[0] != 9 else min(self.cu_cnt * 40, self.se_cnt * self.xccs * 512)

  # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
  # Per-CU sizes feeding the context save/restore size passed to create_queue below.
  sgrp_size_per_cu, hwreg_size_per_cu = 0x4000, 0x1000
  # For (9,5,x) the LDS size comes from props (a KB value shifted left by 10); all other targets use 0x10000.
  lds_size_per_cu = self.iface.props["lds_size_in_kb"] << 10 if self.target[:2] == (9,5) else 0x10000
  # 0x60000 for the listed 11.x/12.x targets, 0x80000 for any other major-9 target, 0x40000 for everything else.
  vgpr_size_per_cu = 0x60000 if self.target in {(11,0,0), (11,0,1), (11,5,1), (12,0,0), (12,0,1)} else 0x80000 if self.target[0] == 9 else 0x40000
  # Sum of the per-CU sizes times the CU count, rounded up to the page size.
  wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * self.cu_cnt, mmap.PAGESIZE)
  # 12 per wave (8 on major 9) plus a fixed 8 + 40, rounded up to the page size.
  ctl_stack_size = round_up((12 if self.target[0] != 9 else 8) * self.wave_cnt + 8 + 40, mmap.PAGESIZE)
  # 32 per wave, rounded up to a multiple of 64.
  debug_memory_size = round_up(self.wave_cnt * 32, 64)

  # Autogenerated modules are chosen by target major: 'vega' offsets / 'soc15' pm4 for 9, 'navi' offsets / 'nv' pm4 otherwise.
  self.ip_off = importlib.import_module(f"tinygrad.runtime.autogen.am.{'vega' if self.target[0] == 9 else 'navi'}_offsets")
  self.soc = import_soc(self.target)
  self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'soc15' if self.target[0] == 9 else 'nv'}")
  # The SDMA version used for the module lookup is capped at (6, 0, 0).
  self.sdma = import_module('sdma', min(self.iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
  # Base tables are read from ip_off for instances 0..5; attributes missing from ip_off become 0.
  self.gc = AMDIP('gc', self.iface.ip_versions[am.GC_HWIP],
                  bases={i: tuple(getattr(self.ip_off, f'GC_BASE__INST{i}_SEG{s}', 0) for s in range(6)) for i in range(6)})

  # The IP is named 'nbio' when the target major is below 12 and 'nbif' otherwise; NBIO bases have 9 segments per instance.
  self.nbio = AMDIP('nbio' if self.target[0] < 12 else 'nbif', self.iface.ip_versions[am.NBIF_HWIP],
                    bases={i: tuple(getattr(self.ip_off, f'NBIO_BASE__INST{i}_SEG{s}', 0) for s in range(9)) for i in range(6)})

  # AQL mode is controlled by the AMD_AQL env var and defaults to on only when there is more than one XCC.
  self.is_aql = getenv("AMD_AQL", int(self.xccs > 1))
  if self.is_aql:
    # Only allocated in AQL mode: a buffer (0x2000 on USB, 16 << 20 otherwise) handed out by a wrapping bump allocator.
    self.pm4_ibs = self.iface.alloc(0x2000 if self.is_usb() else (16 << 20), uncached=True, cpu_access=True)
    self.pm4_ib_alloc = BumpAllocator(self.pm4_ibs.size, wrap=True)

  # The queue type follows is_aql and the size follows is_usb(); ctx_save_restore_size is 0 when is_am() is true.
  self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL if self.is_aql else kfd.KFD_IOC_QUEUE_TYPE_COMPUTE,
    0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
    ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

  # The copy-size limit depends on the SDMA major version (>= 5 gets the larger limit).
  self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
  self.sdma_queues:dict = {}
  # sdma_queue(0) returning None means no copy queue is handed to the base class below.
  self.has_sdma_queue = self.sdma_queue(0) is not None

  # Base-class init receives the allocator, renderer list, program/signal types and queue factories; USB uses smaller kernargs/signal sizes.
  super().__init__(device, AMDAllocator(self), [HIPRenderer, AMDLLVMRenderer, HIPCCRenderer], functools.partial(AMDProgram, self), AMDSignal,
                   functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
                   functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size) if self.has_sdma_queue else None,
                   kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000,
                   can_recover=self.is_am(), arch=self.arch)

  # Scratch setup
  self.max_private_segment_size = 0
  self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

  # PMC is enabled only when both PROFILE and PMC are positive.
  self.pmc_enabled:bool = PROFILE > 0 and PMC > 0

Callers

nothing calls this directly

Calls 15

is_usbMethod · 0.95
create_queueMethod · 0.95
is_amMethod · 0.95
sdma_queueMethod · 0.95
round_upFunction · 0.90
import_socFunction · 0.90
import_moduleFunction · 0.90
AMDIPClass · 0.90
getenvFunction · 0.90
import_pmcFunction · 0.90
unwrapFunction · 0.90

Tested by

no test coverage detected