(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
wait=False, timeout:int|None=None)
| 609 | weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) |
| 610 | |
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
             wait=False, timeout:int|None=None):
  """Launch the program through the parent `__call__`, gathering PMC / SQTT profiling data when the device enables them.

  If `self.dev.sqtt_enabled` is set, an `sqtt_start` queue is submitted before the launch and an `sqtt_stop` queue after it;
  each buffer in `self.dev.sqtt_buffers` is then copied out and recorded as a `ProfileSQTTEvent`.
  If `self.dev.pmc_enabled` is set, a `pmc_read` queue is submitted after the launch and the contents of
  `self.dev.pmc_buffer` are recorded as a `ProfilePMCEvent`.
  All events are appended to `Compiled.profile_events`.

  Returns the value produced by the parent `__call__`.
  """
  dev = self.dev

  # Tracing has to be started before the kernel is launched.
  if dev.sqtt_enabled:
    start_q = cast(AMDComputeQueue, dev.hw_compute_queue_t())
    start_q.sqtt_start(dev.sqtt_buffers).submit(dev)

  res = super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait, timeout=timeout)

  if dev.pmc_enabled:
    # Read the counters on a fresh queue that signals the next timeline value, then copy them to host memory.
    pmc_q = cast(AMDComputeQueue, dev.hw_compute_queue_t()).pmc_read(dev.pmc_buffer, dev.pmc_sched)
    pmc_q.signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
    dev.allocator._copyout(pmc_mv:=memoryview(bytearray(dev.pmc_buffer.size)), dev.pmc_buffer)
    Compiled.profile_events += [ProfilePMCEvent(dev.device, self.prof_prg_counter, dev.pmc_sched, bytes(pmc_mv),
                                                dev.prof_exec_counter)]

  # Nothing more to collect unless SQTT tracing is on.
  if not dev.sqtt_enabled: return res

  stop_q = cast(AMDComputeQueue, dev.hw_compute_queue_t()).sqtt_stop(dev.sqtt_wptrs)
  stop_q.signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
  dev.synchronize()

  for se_idx, trace_buf in enumerate(dev.sqtt_buffers):
    # Low 29 bits of this SE's write-pointer dword, scaled by 32; the result is used below as a byte count.
    # NOTE(review): presumably the hardware reports the pointer in 32-byte units -- confirm against the register spec.
    nbytes = (dev.sqtt_wptrs.cpu_view().view(fmt='I')[se_idx] & 0x1FFFFFFF) * 32
    # On target (11, 0) the buffer's own base (same masking and scaling) is subtracted to get an offset into the buffer.
    if dev.target[:2] == (11, 0): nbytes -= ((trace_buf.va_addr // 32) & 0x1FFFFFFF) * 32

    if DEBUG >= 5: print(f'\t{dev.device}: SE {se_idx} blob size {nbytes:#x}')
    assert 0 <= nbytes <= trace_buf.size, f"{nbytes} > {trace_buf.size}, should never happen"

    # On overflow the write pointer stops at the last dword, so a value this close to the end means the buffer filled up.
    if nbytes >= trace_buf.size - 32:
      print(colored(f"{dev.device}: Warning: SQTT buffer is full (SE {se_idx})! Increase SQTT buffer with SQTT_BUFFER_SIZE=X (in MB)", "yellow"))

    dev.allocator._copyout(trace_mv:=memoryview(bytearray(nbytes)), trace_buf)

    # Targets with major version 9 get an 8-byte little-endian header, carrying the SE index at bit 24, ahead of the raw trace.
    # NOTE(review): the meaning of the remaining header bits is not visible from this code.
    if dev.target[0] == 9:
      payload = struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se_idx << 24)) + bytes(trace_mv)
    else:
      payload = bytes(trace_mv)

    Compiled.profile_events += [ProfileSQTTEvent(dev.device, self.prof_prg_counter, se_idx, payload,
                                                 bool((SQTT_ITRACE_SE_MASK.value >> se_idx) & 1), dev.prof_exec_counter)]
  return res
| 642 | |
| 643 | class AMDAllocator(HCQAllocator['AMDDevice']): |
| 644 | def __init__(self, dev:AMDDevice): |
nothing calls this directly
no test coverage detected