(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
wait=False, timeout:int|None=None)
| 324 | start_off += (sz if typ == 0x4 else 0) + 4 |
| 325 | |
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int|None, ...]=(),
             wait=False, timeout:int|None=None):
  """Validate the launch configuration, run the parent `__call__`, and record a profiling event when enabled.

  Args:
    *bufs: Positional buffers, forwarded unchanged to the parent `__call__`.
    global_size: Three global launch dimensions; each is checked against (2147483647, 65535, 65535).
    local_size: Three local launch dimensions; each is checked against (1024, 1024, 64), and their
      product is checked against both 1024 and `self.max_threads`.
    vals: Forwarded unchanged to the parent `__call__`.
    wait: Forwarded unchanged to the parent `__call__`.
    timeout: Forwarded unchanged to the parent `__call__`.

  Returns:
    Whatever the parent `__call__` returns.

  Raises:
    RuntimeError: If the local size product exceeds 1024 or `self.max_threads`, if `self.lcmem_usage`
      exceeds the device's `slm_per_thread`, or if any individual dimension exceeds its limit.
  """
  # Resource check: total threads per launch group and per-thread local memory.
  over_thread_cap = prod(local_size) > 1024 or self.max_threads < prod(local_size)
  if over_thread_cap or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
    raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")

  # Per-axis check: each dimension is compared with the limit at the same position.
  global_limits, local_limits = (2147483647, 65535, 65535), (1024, 1024, 64)
  global_too_big = any(dim > cap for dim, cap in zip(global_size, global_limits))
  if global_too_big or any(dim > cap for dim, cap in zip(local_size, local_limits)):
    raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")

  launch_result = super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait, timeout=timeout)

  # Nothing more to do unless the device has PMA enabled.
  if not self.dev.pma_enabled:
    return launch_result

  # Synchronize before reading back so the blob is collected after the device has been waited on.
  self.dev.synchronize()
  blob = self.dev._prof_readback()
  if blob:
    Compiled.profile_events += [ProfilePMAEvent(self.dev.device, self.name, blob, self.dev.prof_exec_counter)]
  return launch_result
| 338 | |
| 339 | class NVAllocator(HCQAllocator['NVDevice']): |
| 340 | def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer: |
nothing calls this directly
no test coverage detected