(self, device:str="")
def is_usb(self) -> bool:
  """Return True when this device's interface (``self.iface``) is a ``USBIface`` instance."""
  return isinstance(self.iface, USBIface)
| 954 | |
def __init__(self, device:str=""):
  """Open an AMD device and set up its queues and memory sizing.

  The steps visible here are: parse the device index from ``device``, select an interface,
  decode the gfx target/arch from the interface properties, derive CU / wave counts and the
  context-save sizes, load the target-specific register/packet modules, create the compute
  queue (and probe SDMA queue 0), call the base-class constructor, and set the default scratch size.

  Args:
    device: device string; when it contains ":", the part after the first ":" is parsed as the
      integer device index, otherwise index 0 is used.

  Raises:
    AssertionError: if the decoded target is not (9,4,2), (9,5,0), or a major version of 11 or 12.
  """
  # "<name>:<idx>" -> idx; a string without ":" selects device 0
  self.device_id = int(device.split(":")[1]) if ":" in device else 0

  self.iface = self._select_iface()

  # gfx_target_version is decoded as (major, minor, stepping) = (v // 10000, (v // 100) % 100, v % 100)
  self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
  # major is formatted in decimal, minor and stepping in hex (e.g. (9,4,2) -> "gfx942")
  self.arch = "gfx%d%x%x" % self.target
  assert (self.target in ((9,4,2),(9,5,0))) or self.target[0] in (11, 12), f"Unsupported arch: {self.arch}"
  if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")

  # num_xcc defaults to 1 when the interface does not report it; se_cnt and cu_cnt below are divided by it
  self.xccs = self.iface.props.get('num_xcc', 1)
  self.se_cnt = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine'] // self.xccs
  self.cu_cnt = self.iface.props['simd_count'] // self.iface.props['simd_per_cu'] // self.xccs
  self.waves_per_cu = self.iface.props['max_waves_per_simd'] * self.iface.props['simd_per_cu']
  # for major version 9 the wave count is capped: min(40 per CU, 512 per (se_cnt * xccs))
  self.wave_cnt = (self.cu_cnt * self.waves_per_cu) if self.target[0] != 9 else min(self.cu_cnt * 40, self.se_cnt * self.xccs * 512)

  # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
  # NOTE(review): the per-CU sizes and the control-stack formula below presumably mirror the kernel code linked above -- verify there
  sgrp_size_per_cu, hwreg_size_per_cu = 0x4000, 0x1000
  # LDS size is read from the interface properties only for (9,5,x) targets; all others use a fixed 0x10000
  lds_size_per_cu = self.iface.props["lds_size_in_kb"] << 10 if self.target[:2] == (9,5) else 0x10000
  vgpr_size_per_cu = 0x60000 if self.target in {(11,0,0), (11,0,1), (11,5,1), (12,0,0), (12,0,1)} else 0x80000 if self.target[0] == 9 else 0x40000
  # both sizes are rounded up to the page size; they are passed to create_queue below
  wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * self.cu_cnt, mmap.PAGESIZE)
  ctl_stack_size = round_up((12 if self.target[0] != 9 else 8) * self.wave_cnt + 8 + 40, mmap.PAGESIZE)
  debug_memory_size = round_up(self.wave_cnt * 32, 64)

  # register-offset and PM4 modules are chosen by major version: 9 -> vega / soc15, otherwise navi / nv
  self.ip_off = importlib.import_module(f"tinygrad.runtime.autogen.am.{'vega' if self.target[0] == 9 else 'navi'}_offsets")
  self.soc = import_soc(self.target)
  self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'soc15' if self.target[0] == 9 else 'nv'}")
  # the SDMA module version is capped at (6, 0, 0) by the min()
  self.sdma = import_module('sdma', min(self.iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
  # base tables: 6 instances x 6 segments for GC, 6 instances x 9 segments for NBIO; missing offsets default to 0
  self.gc = AMDIP('gc', self.iface.ip_versions[am.GC_HWIP],
                  bases={i: tuple(getattr(self.ip_off, f'GC_BASE__INST{i}_SEG{s}', 0) for s in range(6)) for i in range(6)})

  # the IP is named 'nbio' for major versions below 12 and 'nbif' from 12 on
  self.nbio = AMDIP('nbio' if self.target[0] < 12 else 'nbif', self.iface.ip_versions[am.NBIF_HWIP],
                    bases={i: tuple(getattr(self.ip_off, f'NBIO_BASE__INST{i}_SEG{s}', 0) for s in range(9)) for i in range(6)})

  # AQL mode comes from the AMD_AQL env var; its default is 1 when there is more than one XCC, else 0
  self.is_aql = getenv("AMD_AQL", int(self.xccs > 1))
  if self.is_aql:
    # the PM4 indirect-buffer area is only allocated in AQL mode (0x2000 bytes on USB, 16 MiB otherwise)
    self.pm4_ibs = self.iface.alloc(0x2000 if self.is_usb() else (16 << 20), uncached=True, cpu_access=True)
    self.pm4_ib_alloc = BumpAllocator(self.pm4_ibs.size, wrap=True)

  # the queue type follows is_aql; ctx_save_restore_size is 0 when is_am() is true
  self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL if self.is_aql else kfd.KFD_IOC_QUEUE_TYPE_COMPUTE,
    0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
    ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

  # SDMA major version >= 5 gets the larger max copy size
  self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
  self.sdma_queues:dict = {}
  # no copy queue is passed to the base class below when sdma_queue(0) returns None
  self.has_sdma_queue = self.sdma_queue(0) is not None

  # base-class init runs only after iface, arch and is_aql are set, since its arguments are built from them
  super().__init__(device, AMDAllocator(self), [HIPRenderer, AMDLLVMRenderer, HIPCCRenderer], functools.partial(AMDProgram, self), AMDSignal,
                   functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
                   functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size) if self.has_sdma_queue else None,
                   kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000,
                   can_recover=self.is_am(), arch=self.arch)

  # Scratch setup
  self.max_private_segment_size = 0
  self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

  # performance counters are on only when both PROFILE and PMC are > 0
  self.pmc_enabled:bool = PROFILE > 0 and PMC > 0
nothing calls this directly
no test coverage detected