MCPcopy
hub / github.com/tinygrad/tinygrad / __call__

Method __call__

tinygrad/runtime/ops_amd.py:611–641  ·  view source on GitHub ↗
(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
               wait=False, timeout:int|None=None)

Source from the content-addressed store, hash-verified

609 weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
610
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
             wait=False, timeout:int|None=None):
  """Run the program via the parent ``__call__`` and, when enabled on the device, collect PMC / SQTT profiling data around it.

  All launch arguments are forwarded unchanged to ``super().__call__``; its result is returned as-is.

  Profiling side effects (each gated by a flag on ``self.dev``):
    * ``sqtt_enabled``: an ``sqtt_start`` queue is submitted before the launch. After the launch an ``sqtt_stop`` queue
      (followed by a timeline signal) is submitted, the device is synchronized, and one ``ProfileSQTTEvent`` per entry of
      ``dev.sqtt_buffers`` is added to ``Compiled.profile_events``.
    * ``pmc_enabled``: after the launch a ``pmc_read`` queue (followed by a timeline signal) is submitted, ``dev.pmc_buffer``
      is copied to host memory and a ``ProfilePMCEvent`` is added to ``Compiled.profile_events``.
  """
  dev = self.dev

  # Start the SQTT trace before the kernel is launched.
  if dev.sqtt_enabled:
    start_q = cast(AMDComputeQueue, dev.hw_compute_queue_t())
    start_q.sqtt_start(dev.sqtt_buffers).submit(dev)

  res = super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait, timeout=timeout)

  if dev.pmc_enabled:
    # Queue the counter read, then a timeline signal, and submit. next_timeline() is called only after pmc_read.
    pmc_q = cast(AMDComputeQueue, dev.hw_compute_queue_t())
    pmc_q = pmc_q.pmc_read(dev.pmc_buffer, dev.pmc_sched)
    pmc_q = pmc_q.signal(dev.timeline_signal, dev.next_timeline())
    pmc_q.submit(dev)

    # Copy the whole PMC buffer into a host-side bytearray and record it as a profile event.
    pmc_mv = memoryview(bytearray(dev.pmc_buffer.size))
    dev.allocator._copyout(pmc_mv, dev.pmc_buffer)
    pmc_event = ProfilePMCEvent(dev.device, self.prof_prg_counter, dev.pmc_sched, bytes(pmc_mv), dev.prof_exec_counter)
    Compiled.profile_events += [pmc_event]

  if dev.sqtt_enabled:
    # Stop the trace, signal the timeline and wait for the device before reading write pointers.
    stop_q = cast(AMDComputeQueue, dev.hw_compute_queue_t())
    stop_q = stop_q.sqtt_stop(dev.sqtt_wptrs)
    stop_q = stop_q.signal(dev.timeline_signal, dev.next_timeline())
    stop_q.submit(dev)
    dev.synchronize()

    for se, buf in enumerate(dev.sqtt_buffers):
      # Low 29 bits of the 32-bit slot for this buffer, scaled by 32 (presumably 32-byte units -> bytes; confirm against HW docs).
      raw_wptr = dev.sqtt_wptrs.cpu_view().view(fmt='I')[se]
      wptr = (raw_wptr & 0x1FFFFFFF) * 32
      # For target (11, 0) the value is rebased by the buffer's own address, masked and scaled the same way.
      if dev.target[:2] == (11, 0):
        wptr -= ((buf.va_addr // 32) & 0x1FFFFFFF) * 32

      if DEBUG >= 5:
        print(f'\t{self.dev.device}: SE {se} blob size {wptr:#x}')
      assert 0 <= wptr <= buf.size, f"{wptr} > {buf.size}, should never happen"

      # When sqtt buffer overflows, wptr stops at the last dword
      if wptr >= buf.size - 32:
        print(colored(f"{self.dev.device}: Warning: SQTT buffer is full (SE {se})! Increase SQTT buffer with SQTT_BUFFER_SIZE=X (in MB)", "yellow"))

      # Copy only the written part of the buffer (wptr bytes) to the host.
      sqtt_mv = memoryview(bytearray(wptr))
      dev.allocator._copyout(sqtt_mv, buf)

      # When target[0] == 9, an 8-byte little-endian header word that encodes `se` is prepended to the blob.
      resbuf = bytes(sqtt_mv)
      if dev.target[0] == 9:
        resbuf = struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + resbuf

      # The flag passed to the event is this index's bit in SQTT_ITRACE_SE_MASK.
      itrace = bool((SQTT_ITRACE_SE_MASK.value >> se) & 1)
      sqtt_event = ProfileSQTTEvent(dev.device, self.prof_prg_counter, se, resbuf, itrace, dev.prof_exec_counter)
      Compiled.profile_events += [sqtt_event]

  return res
642
643class AMDAllocator(HCQAllocator['AMDDevice']):
644 def __init__(self, dev:AMDDevice):

Callers

nothing calls this directly

Calls 14

coloredFunction · 0.90
castFunction · 0.85
ProfilePMCEventClass · 0.85
ProfileSQTTEventClass · 0.85
submitMethod · 0.80
sqtt_startMethod · 0.80
pmc_readMethod · 0.80
next_timelineMethod · 0.80
sqtt_stopMethod · 0.80
signalMethod · 0.45
_copyoutMethod · 0.45
synchronizeMethod · 0.45

Tested by

no test coverage detected