(A: T.Buffer((16, 16), "float32"), B: T.Buffer((16, 16), "float32"))
| 52 | |
# TIR prim_func `add`: element-wise B[i, j] = A[i, j] + 1.0 over two 16x16
# float32 buffers (A is read, B is written).
#   - The function attribute "global_symbol" is set to "add".
#   - The outer loop `i` (extent 16) is bound to "threadIdx.x"; the inner
#     loop `j` is a plain range(16) loop.
#   - Inside the "update" block, (i, j) are remapped to block axes (vi, vj)
#     with the "SS" pattern -- presumably both spatial axes; confirm against
#     the TVMScript `T.axis.remap` documentation.
# NOTE(review): the original indentation was lost in this view; the nesting
# described above (root block > i loop > j loop > "update" block) is inferred
# from statement order -- confirm against the real file.
| 53 | @T.prim_func(s_tir=True) |
| 54 | def add(A: T.Buffer((16, 16), "float32"), B: T.Buffer((16, 16), "float32")): |
| 55 | T.func_attr({"global_symbol": "add"}) |
| 56 | with T.sblock("root"): |
| 57 | for i in T.thread_binding(16, thread="threadIdx.x"): |
| 58 | for j in range(16): |
| 59 | with T.sblock("update"): |
| 60 | vi, vj = T.axis.remap("SS", [i, j]) |
| 61 | B[vi, vj] = A[vi, vj] + T.float32(1) |
| 62 | |
| 63 | @R.function |
| 64 | def cuda_graph_alloc() -> R.Tuple(R.Object, R.Object): |
no test coverage detected