MCPcopy
hub / github.com/tinygrad/tinygrad / sqtt_start

Method sqtt_start

tinygrad/runtime/ops_amd.py:227–287  ·  view source on GitHub ↗
(self, buf0s:list[HCQBuffer])

Source from the content-addressed store, hash-verified

225 mode=int(tracing), **trace_ctrl)
226
227 def sqtt_start(self, buf0s:list[HCQBuffer]):
228 self.memory_barrier()
229 if self.dev.target[0] == 9:
230 self.set_grbm()
231 self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, simd_en=0xf, cu_sel=0, sq_stall_en=1, spi_stall_en=1, reg_stall_en=1, vm_id_mask=0)
232 for se in range(len(buf0s)):
233 mask = (__SQTT_MISC:=1<<0) | (__SQTT_TIME:=1<<1) | (__SQTT_REG:=1<<2) | (__SQTT_WAVE_START:=1<<3) | (__SQTT_WAVE_END:=1<<6) \
234 | (__SQTT_USERDATA:=1<<12) | (__SQTT_REG_CS:=1<<5) | (__SQTT_REG_CS_PRIV:=1<<15)
235 if (SQTT_ITRACE_SE_MASK.value >> se) & 0b1: mask |= (__SQTTINST:=1<<10) | (__SQTT_INST_PC:=1<<11) | (__SQTT_ISSUE:=1<<13)
236
237 with self.pred_exec(xcc_mask=1<<(se // self.dev.se_cnt)):
238 self.set_grbm(se=se % self.dev.se_cnt, sh=0)
239 self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_mask=0xf, token_mask=mask)
240 self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK2, inst_mask=0xffffffff)
241 self.wreg(self.gc.regSQ_THREAD_TRACE_BASE, addr=lo32(buf0s[se].va_addr >> 12))
242 self.wreg(self.gc.regSQ_THREAD_TRACE_BASE2, addr_hi=hi32(buf0s[se].va_addr >> 12))
243 self.wreg(self.gc.regSQ_THREAD_TRACE_SIZE, size=buf0s[se].size >> 12)
244 self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, reset_buffer=1)
245 self.wreg(self.gc.regSQ_THREAD_TRACE_MODE, mask_cs=1, autoflush_en=1, mode=1)
246 else:
247 self.spi_config(tracing=True)
248 # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
249 for se in range(len(buf0s)):
250 self.set_grbm(se=se, sh=0)
251
252 buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12)
253 if self.dev.target >= (12,0,0):
254 self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, size=buf0s[se].size >> 12)
255 self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_LO, base_lo=buf0_lo)
256 self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_HI, base_hi=buf0_hi)
257 else:
258 self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
259 self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
260 # NOTE: SQTT can only trace instructions on one simd per se, this selects the simd in first wgp in first sa.
261 # For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
262 # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
263 # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
264 # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
265 # CUs you want by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
266 # Use SQTT_SIMD_SEL to select which SIMD to trace (0-3). Memory ops show different InstOp values (0x2x vs 0x5x) based on SIMD.
267 cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT
268 self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=SQTT_SIMD_SEL.value, wgp_sel=0, sa_sel=0)
269 reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
270 self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
271 token_exclude = SQTT_TOKEN_EXCLUDE.value | ((1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0)
272
273 # disable instr tracing
274 if not (SQTT_ITRACE_SE_MASK.value >> se) & 0b1:
275 # gfx12 doesn't have enums with all fields, so it's hardcoded, but it's the same as gfx11.
276 token_exclude |= (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
277 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
278 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT) if self.dev.target < (12,0,0) else 0x927
279
280 self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1,
281 **({} if self.dev.target < (12,0,0) else {'exclude_barrier_wait': 1}))
282 self.sqtt_config(tracing=True)
283
284 self.set_grbm()

Callers 1

__call__Method · 0.80

Calls 9

memory_barrierMethod · 0.95
set_grbmMethod · 0.95
wregMethod · 0.95
pred_execMethod · 0.95
spi_configMethod · 0.95
sqtt_configMethod · 0.95
lo32Function · 0.90
hi32Function · 0.90
data64_leFunction · 0.90

Tested by

no test coverage detected