hub / github.com/tinygrad/tinygrad / __init__

Method __init__

tinygrad/runtime/ops_amd.py:563–609 · view source on GitHub ↗

(self, dev:AMDDevice, name:str, lib:bytes, **kwargs)

Source from the content-addressed store, hash-verified

561
562	class AMDProgram(HCQProgram):
def __init__(self, dev:AMDDevice, name:str, lib:bytes, **kwargs):
  """Load a compiled kernel ELF onto `dev` and cache the values needed to launch it.

  The ELF in `lib` is parsed with `elf_loader`, its relocations are applied to the image, the image is copied into a
  buffer from the device allocator, and the kernel descriptor located at the `.rodata` section address is decoded
  into the attributes set on this object.

  Args:
    dev: device whose allocator receives the code image.
    name: program name, forwarded to the base-class constructor.
    lib: ELF binary of the compiled kernel.
    **kwargs: accepted but not used by this constructor.

  Raises:
    AssertionError: if the ELF has no `.rodata` section.
    RuntimeError: if a relocation has a type other than 5 (R_AMDGPU_REL64), or if the kernel's group segment
      needs more 512-byte units than the device's `lds_size_in_kb` property allows.
  """
  # TODO: this API needs the type signature of the function and global_size/local_size
  self.dev, self.name, self.lib = dev, name, lib

  image, sections, relocs = elf_loader(self.lib)

  # -1 is the "not found" sentinel; a real section address is never negative.
  rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
  assert rodata_entry >= 0, ".rodata section not found"

  for apply_image_offset, rel_sym_offset, typ, addend in relocs:
    if typ == 5: image[apply_image_offset:apply_image_offset+8] = struct.pack('<q', rel_sym_offset - apply_image_offset + addend) # R_AMDGPU_REL64
    else: raise RuntimeError(f"unknown AMD reloc {typ}")

  # The buffer spec is kept in a local because the finalizer registered at the end needs the same spec.
  buf_spec = BufferSpec(nolru=True)
  self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec)
  self.dev.allocator._copyin(self.lib_gpu, image)
  self.dev.synchronize()

  # Decode the kernel descriptor stored at the .rodata section address.
  desc_sz = ctypes.sizeof(amdgpu_kd.llvm_amdhsa_kernel_descriptor_t)
  desc = amdgpu_kd.llvm_amdhsa_kernel_descriptor_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+desc_sz]))
  self.group_segment_size = desc.group_segment_fixed_size
  self.private_segment_size = desc.private_segment_fixed_size
  self.kernargs_segment_size = desc.kernarg_size

  # The group segment size is expressed in 512-byte units, rounded up. The limit check must use the unmasked count:
  # masking to 9 bits first would let an oversized request wrap around (e.g. 512 units -> 0) and pass the check.
  lds_units = (self.group_segment_size + 511) // 512
  if lds_units > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
  lds_size = lds_units & 0x1FF

  # Ensure scratch size
  self.dev._ensure_has_local_memory(self.private_segment_size)

  # True when bit 0x400 of kernel_code_properties is set.
  self.wave32: bool = desc.kernel_code_properties & 0x400 == 0x400

  # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
  self.rsrc1: int = desc.compute_pgm_rsrc1 | ((1 << 20) if self.dev.target[0] == 11 else 0)
  # The 512-byte unit count is OR-ed into rsrc2 starting at bit 15.
  self.rsrc2: int = desc.compute_pgm_rsrc2 | (lds_size << 15)
  self.rsrc3: int = desc.compute_pgm_rsrc3
  # aql_prog_addr is the device address of the descriptor; prog_addr adds the descriptor's code entry offset to it.
  self.aql_prog_addr: int = self.lib_gpu.va_addr + rodata_entry
  self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + desc.kernel_code_entry_byte_offset
  # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
  # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
  self.enable_dispatch_ptr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
  self.enable_private_segment_sgpr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
  additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

  # libhash is only set when SQTT is enabled: the first 8 bytes of the md5 of the binary, repeated twice.
  if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2

  super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
                   base=self.lib_gpu.va_addr)
  # Run `_fini` with the device, the image buffer and its spec once this object is collected
  # (presumably frees the buffer; `_fini` is defined outside this block).
  weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
610
611	def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int\|None, ...]=(),
612	wait=False, timeout:int\|None=None):

Callers

nothing calls this directly

Calls 9

elf_loader Function · 0.90

round_up Function · 0.90

BufferSpec Class · 0.90

alloc Method · 0.45

_copyin Method · 0.45

synchronize Method · 0.45

_ensure_has_local_memory Method · 0.45

__init__ Method · 0.45

finalize Method · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 9

Tested by

Method __init__