(self, buf0s:list[HCQBuffer])
| 225 | mode=int(tracing), **trace_ctrl) |
| 226 | |
def sqtt_start(self, buf0s:list[HCQBuffer]):
    """Program the SQ thread trace (SQTT) registers for a trace start, one ``buf0s`` entry per index ``se``.

    Each buffer's ``va_addr`` and ``size`` are written shifted right by 12 bits, i.e. expressed in 4096-byte units.

    Two register layouts are handled:
      * ``self.dev.target[0] == 9``: SQ_THREAD_TRACE_BASE/BASE2/SIZE/CTRL/MODE, with every per-SE write issued
        inside a ``pred_exec`` context.
      * otherwise: SQ_THREAD_TRACE_BUF0_*, with a further split on ``self.dev.target >= (12,0,0)``, followed by
        ``sqtt_config(tracing=True)`` for each SE.

    Instruction-level tokens are enabled for index ``se`` only when bit ``se`` of ``SQTT_ITRACE_SE_MASK.value`` is set.

    Args:
        buf0s: trace buffers indexed by ``se``; only their ``va_addr`` and ``size`` attributes are read here.

    NOTE(review): the visible listing stops right after the final ``set_grbm()``; confirm against the full file
    whether further steps or a return value follow.
    """
    self.memory_barrier()
    # Target major version 9 uses a different SQ_THREAD_TRACE_* register set than the other branch.
    if self.dev.target[0] == 9:
        # Called without se/sh, unlike the per-SE calls below.
        # NOTE(review): presumably selects broadcast so the MASK write below reaches every SE -- confirm.
        self.set_grbm()
        self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, simd_en=0xf, cu_sel=0, sq_stall_en=1, spi_stall_en=1, reg_stall_en=1, vm_id_mask=0)
        for se in range(len(buf0s)):
            # The walrus targets only label each token bit; none of these names is read again in the code shown here.
            mask = (__SQTT_MISC:=1<<0) | (__SQTT_TIME:=1<<1) | (__SQTT_REG:=1<<2) | (__SQTT_WAVE_START:=1<<3) | (__SQTT_WAVE_END:=1<<6) \
                | (__SQTT_USERDATA:=1<<12) | (__SQTT_REG_CS:=1<<5) | (__SQTT_REG_CS_PRIV:=1<<15)
            # Instruction-related token bits are added only when bit `se` of SQTT_ITRACE_SE_MASK is set.
            if (SQTT_ITRACE_SE_MASK.value >> se) & 0b1: mask |= (__SQTTINST:=1<<10) | (__SQTT_INST_PC:=1<<11) | (__SQTT_ISSUE:=1<<13)

            # `se` is split into a pred_exec mask bit (se // se_cnt) and the index handed to set_grbm (se % se_cnt).
            # NOTE(review): presumably XCC index and SE-within-XCC respectively -- confirm.
            with self.pred_exec(xcc_mask=1<<(se // self.dev.se_cnt)):
                self.set_grbm(se=se % self.dev.se_cnt, sh=0)
                self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_mask=0xf, token_mask=mask)
                self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK2, inst_mask=0xffffffff)
                # The shifted buffer address is split via lo32/hi32 into the addr and addr_hi fields.
                self.wreg(self.gc.regSQ_THREAD_TRACE_BASE, addr=lo32(buf0s[se].va_addr >> 12))
                self.wreg(self.gc.regSQ_THREAD_TRACE_BASE2, addr_hi=hi32(buf0s[se].va_addr >> 12))
                self.wreg(self.gc.regSQ_THREAD_TRACE_SIZE, size=buf0s[se].size >> 12)
                self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, reset_buffer=1)
                self.wreg(self.gc.regSQ_THREAD_TRACE_MODE, mask_cs=1, autoflush_en=1, mode=1)
    else:
        self.spi_config(tracing=True)
        # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
        for se in range(len(buf0s)):
            self.set_grbm(se=se, sh=0)

            # data64_le returns the pair unpacked here as (low part, high part) of the shifted address.
            buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12)
            # On >= (12,0,0) the high address bits have their own BASE_HI register; below that they share the
            # BUF0_SIZE register with the size field.
            if self.dev.target >= (12,0,0):
                self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, size=buf0s[se].size >> 12)
                self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_LO, base_lo=buf0_lo)
                self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_HI, base_hi=buf0_hi)
            else:
                self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
                self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
            # NOTE: SQTT can only trace instructions on one simd per se, this selects the simd in first wgp in first sa.
            # For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
            # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
            # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
            # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
            # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
            # Use SQTT_SIMD_SEL to select which SIMD to trace (0-3). Memory ops show different InstOp values (0x2x vs 0x5x) based on SIMD.
            # On >= (12,0,0) a literal (1 << 6) is used in place of the soc constant.
            cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT
            self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=SQTT_SIMD_SEL.value, wgp_sel=0, sa_sel=0)
            reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
            # Starts from the user-configurable SQTT_TOKEN_EXCLUDE; the PERF exclude bit is added only below (12,0,0).
            token_exclude = SQTT_TOKEN_EXCLUDE.value | ((1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0)

            # disable instr tracing
            if not (SQTT_ITRACE_SE_MASK.value >> se) & 0b1:
                # gfx12 doesn't have enums with all fields, so it's hardcoded, but it's the same as gfx11.
                token_exclude |= (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
                    1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
                    1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT) if self.dev.target < (12,0,0) else 0x927

            # exclude_barrier_wait is passed only on >= (12,0,0); below that no extra keyword is added.
            self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1,
                      **({} if self.dev.target < (12,0,0) else {'exclude_barrier_wait': 1}))
            self.sqtt_config(tracing=True)

    # NOTE(review): argument-less set_grbm() after the per-SE selections above; presumably restores the default
    # (broadcast) selection -- confirm.
    self.set_grbm()
no test coverage detected