(A:UOp)
| 146 | return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg="AMD"), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts])))) |
| 147 | |
def custom_data_deps(A:UOp) -> UOp:
  """Build a hand-assembled program UOp that reads a 32-bit value per lane, adds 1.0 and writes it back.

  `A` is flattened first; its element count sizes the "lidx0" special index and
  its `.base` is the only buffer wired into the sink. The returned UOp is an
  `Ops.PROGRAM` whose sources are the sink, an "AMD" device UOp and an
  `Ops.LINEAR` UOp holding one `Ops.INS` UOp per finalized instruction.

  NOTE(review): the two waits sit between each load and the first use of its
  result, which is presumably the "data deps" the name refers to -- confirm.
  """
  flat = A.flatten()
  lane_idx = UOp.special(flat.numel(), "lidx0")

  asm = Kernel()
  # NOTE(review): s[0:1] presumably holds the kernarg pointer on entry and is
  # overwritten with the loaded 64-bit value -- confirm against the launch ABI.
  asm.emit(s_load_b64(s[0:1], s[0:1], soffset=NULL))
  asm.emit(s_waitcnt_lgkmcnt(sdst=NULL, simm16=0))
  # v[0] <<= 2: presumably turns the lane id into a byte offset for 4-byte elements.
  asm.emit(v_lshlrev_b32_e32(v[0], 2, v[0]))
  asm.emit(global_load_b32(v[1], v[0], saddr=s[0:1]))
  asm.emit(s_waitcnt_vmcnt(sdst=NULL, simm16=0))
  asm.emit(v_add_f32_e32(v[1], 1.0, v[1]))
  asm.emit(global_store_b32(addr=v[0], data=v[1], saddr=s[0:1]))
  asm.emit(s_endpgm())
  finalized = asm.finalize()

  kernel_sink = UOp.sink(flat.base, lane_idx, arg=KernelInfo("custom_data_deps"))
  target = UOp(Ops.DEVICE, arg="AMD")
  linear = UOp(Ops.LINEAR, src=tuple(UOp(Ops.INS, arg=inst) for inst in finalized))
  return UOp(Ops.PROGRAM, src=(kernel_sink, target, linear))
| 163 | |
| 164 | @unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device") |
| 165 | class TestCustomKernel(unittest.TestCase): |
nothing calls this directly
no test coverage detected
searching dependent graphs…