(self, dev:AMDDevice, name:str, lib:bytes, **kwargs)
| 561 | |
| 562 | class AMDProgram(HCQProgram): |
def __init__(self, dev:AMDDevice, name:str, lib:bytes, **kwargs):
  """Load the kernel contained in `lib` onto `dev` and cache its dispatch parameters.

  The image returned by `elf_loader` is relocated on the host, validated against the device's
  LDS limit, copied into a page-rounded GPU buffer, and the kernel descriptor found at the start
  of `.rodata` is decoded into the `rsrc*`, address and segment-size attributes.

  `**kwargs` is accepted but not used by this constructor.

  Raises:
    AssertionError: the image has no `.rodata` section.
    RuntimeError: a relocation type other than 5 (R_AMDGPU_REL64) is present, or the kernel
      requests more group (LDS) memory than `dev.iface.props['lds_size_in_kb']` allows.
  """
  # TODO: this API needs the type signature of the function and global_size/local_size
  self.dev, self.name, self.lib = dev, name, lib

  image, sections, relocs = elf_loader(self.lib)

  rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
  assert rodata_entry >= 0, ".rodata section not found"

  # Patch relocations in the host copy of the image before anything reads it.
  for apply_image_offset, rel_sym_offset, typ, addent in relocs:
    if typ != 5: raise RuntimeError(f"unknown AMD reloc {typ}")
    # R_AMDGPU_REL64: 64-bit signed value of (symbol - place + addend)
    image[apply_image_offset:apply_image_offset+8] = struct.pack('<q', rel_sym_offset - apply_image_offset + addent)

  # The descriptor is decoded from the host image, so it can be parsed and validated before any GPU memory
  # is allocated. The finalizer that releases lib_gpu is only registered at the end of this method; checking
  # first means a rejected kernel does not leave an allocated buffer without a finalizer.
  desc_sz = ctypes.sizeof(amdgpu_kd.llvm_amdhsa_kernel_descriptor_t)
  desc = amdgpu_kd.llvm_amdhsa_kernel_descriptor_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+desc_sz]))
  self.group_segment_size = desc.group_segment_fixed_size
  self.private_segment_size = desc.private_segment_fixed_size
  self.kernargs_segment_size = desc.kernarg_size

  # LDS is accounted in 512-byte granules. Compare the *unmasked* granule count with the device limit:
  # masking to 9 bits first would let an oversized request wrap around (e.g. 256 KiB -> 0) and slip through.
  lds_granules = (self.group_segment_size + 511) // 512
  if lds_granules > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512:
    raise RuntimeError("Too many resources requested: group_segment_size")
  lds_size = lds_granules & 0x1FF  # 9-bit field OR-ed into rsrc2 at bit 15 below

  # Upload the relocated image; the allocation is rounded up to a 0x1000-byte multiple.
  buf_spec = BufferSpec(nolru=True)
  self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec)
  self.dev.allocator._copyin(self.lib_gpu, image)
  self.dev.synchronize()

  # Ensure scratch size
  self.dev._ensure_has_local_memory(self.private_segment_size)

  # Bit 10 (0x400) of kernel_code_properties is what this class treats as the wave32 flag.
  self.wave32: bool = desc.kernel_code_properties & 0x400 == 0x400

  # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
  self.rsrc1: int = desc.compute_pgm_rsrc1 | ((1 << 20) if self.dev.target[0] == 11 else 0)
  self.rsrc2: int = desc.compute_pgm_rsrc2 | (lds_size << 15)
  self.rsrc3: int = desc.compute_pgm_rsrc3
  # aql_prog_addr is the GPU address of the descriptor itself; prog_addr adds the descriptor's code entry offset.
  self.aql_prog_addr: int = self.lib_gpu.va_addr + rodata_entry
  self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + desc.kernel_code_entry_byte_offset
  # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
  # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
  self.enable_dispatch_ptr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
  self.enable_private_segment_sgpr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
  additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

  # When SQTT is enabled, keep the first 8 bytes of md5(lib) as a little-endian u64, repeated twice in a tuple.
  if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2

  super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
                   base=self.lib_gpu.va_addr)
  # Run _fini with (dev, lib_gpu, buf_spec) once this program object is garbage collected.
  weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
| 610 | |
| 611 | def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(), |
| 612 | wait=False, timeout:int|None=None): |
nothing calls this directly
no test coverage detected