MCPcopy
hub / github.com/tinygrad/tinygrad / __init__

Method __init__

tinygrad/runtime/ops_amd.py:563–609  ·  view source on GitHub ↗
(self, dev:AMDDevice, name:str, lib:bytes, **kwargs)

Source from the content-addressed store, hash-verified

561
562class AMDProgram(HCQProgram):
def __init__(self, dev:AMDDevice, name:str, lib:bytes, **kwargs):
  """Load a compiled kernel ELF, upload it to the device and cache its launch parameters.

  Args:
    dev: device whose allocator receives the relocated program image.
    name: kernel name, forwarded to the base class initializer.
    lib: raw ELF bytes of the compiled kernel.
    **kwargs: accepted for signature compatibility; not read by this method.

  Raises:
    AssertionError: if the ELF has no ".rodata" section.
    RuntimeError: on a relocation type other than 5 (R_AMDGPU_REL64), or when the kernel's
      group segment needs more 512-byte LDS granules than the device reports.
  """
  # TODO: this API needs the type signature of the function and global_size/local_size
  self.dev, self.name, self.lib = dev, name, lib

  image, sections, relocs = elf_loader(self.lib)

  # The kernel descriptor is read from the start of .rodata below, so its address is required.
  rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
  assert rodata_entry >= 0, ".rodata section not found"

  # Patch relocations into the host-side image before it is copied to the device.
  for apply_image_offset, rel_sym_offset, typ, addent in relocs:
    if typ == 5: image[apply_image_offset:apply_image_offset+8] = struct.pack('<q', rel_sym_offset - apply_image_offset + addent) # R_AMDGPU_REL64
    else: raise RuntimeError(f"unknown AMD reloc {typ}")

  # NOTE(review): the finalizer that receives lib_gpu is only registered on the last line of this method, so an
  # exception raised between this allocation and that point presumably leaves the buffer unreleased -- confirm.
  self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(nolru=True))
  self.dev.allocator._copyin(self.lib_gpu, image)
  self.dev.synchronize()

  desc_sz = ctypes.sizeof(amdgpu_kd.llvm_amdhsa_kernel_descriptor_t)
  desc = amdgpu_kd.llvm_amdhsa_kernel_descriptor_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+desc_sz]))
  self.group_segment_size = desc.group_segment_fixed_size
  self.private_segment_size = desc.private_segment_fixed_size
  self.kernargs_segment_size = desc.kernarg_size

  # LDS is requested in 512-byte granules. Validate the *unmasked* granule count: masking to 9 bits first would let an
  # oversized request wrap around (e.g. 512 granules -> 0) and slip past the limit check.
  lds_granules = (self.group_segment_size + 511) // 512
  if lds_granules > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
  lds_size = lds_granules & 0x1FF  # 9-bit field, placed at bit 15 of rsrc2 below

  # Ensure scratch size
  self.dev._ensure_has_local_memory(self.private_segment_size)

  self.wave32: bool = (desc.kernel_code_properties & 0x400) == 0x400

  # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
  self.rsrc1: int = desc.compute_pgm_rsrc1 | ((1 << 20) if self.dev.target[0] == 11 else 0)
  self.rsrc2: int = desc.compute_pgm_rsrc2 | (lds_size << 15)
  self.rsrc3: int = desc.compute_pgm_rsrc3
  self.aql_prog_addr: int = self.lib_gpu.va_addr + rodata_entry
  self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + desc.kernel_code_entry_byte_offset
  # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
  # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
  self.enable_dispatch_ptr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
  self.enable_private_segment_sgpr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
  additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

  # The first 8 bytes of the library's md5, duplicated into a 2-tuple; only computed when SQTT is enabled.
  if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2

  super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
                   base=self.lib_gpu.va_addr)
  # Hand the device, buffer and its spec to _fini when this object is collected.
  weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
610
611 def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
612 wait=False, timeout:int|None=None):

Callers

nothing calls this directly

Calls 9

elf_loaderFunction · 0.90
round_upFunction · 0.90
BufferSpecClass · 0.90
allocMethod · 0.45
_copyinMethod · 0.45
synchronizeMethod · 0.45
__init__Method · 0.45
finalizeMethod · 0.45

Tested by

no test coverage detected