(dtype)
| 49 | dtype, nv_dtype = input |
| 50 | |
def _create_mod(dtype):
    """Build the elementwise-add IR module specialized for ``dtype``.

    The returned module's ``main`` takes three 64-element buffers ``A``,
    ``B`` and ``C`` whose element type is ``dtype`` and, for every index
    ``i``, stores ``cast(dtype, cast(float16, A[i]) + cast(float16, B[i]))``
    into ``C[i]``.

    Args:
        dtype: Element type used for all three buffers and for the final
            cast applied to the float16 sum.

    Returns:
        The decorated ``Module`` object defined inside this function.
    """

    @I.ir_module(s_tir=True)
    class Module:
        @T.prim_func(s_tir=True)
        def main(
            A: T.Buffer((64,), dtype),
            B: T.Buffer((64,), dtype),
            C: T.Buffer((64,), dtype),
        ):
            # NOTE(review): presumably declares that A, B and C do not alias
            # each other -- confirm against the "tirx.noalias" attribute docs.
            T.func_attr({"tirx.noalias": True})
            # The 64 elements are covered by 2 iterations bound to
            # blockIdx.x times 32 iterations bound to threadIdx.x.
            for i_0 in T.thread_binding(2, thread="blockIdx.x"):
                for i_1 in T.thread_binding(32, thread="threadIdx.x"):
                    with T.sblock("C"):
                        # Flatten the (i_0, i_1) pair into one spatial index
                        # in the range [0, 64).
                        v_i = T.axis.spatial(64, i_0 * 32 + i_1)
                        T.reads(A[v_i], B[v_i])
                        T.writes(C[v_i])
                        # Both operands are cast to float16 before the add;
                        # the sum is then cast back to `dtype` for the store.
                        C[v_i] = T.Cast(
                            dtype, T.Cast("float16", A[v_i]) + T.Cast("float16", B[v_i])
                        )

    return Module
| 72 | |
| 73 | mod = _create_mod(dtype) |
| 74 | target = "cuda" |
no outgoing calls
no test coverage detected
searching dependent graphs…