MCPcopy
hub / github.com/tinygrad/tinygrad / sqtt_setup_exec

Method sqtt_setup_exec

tinygrad/runtime/ops_amd.py:199–215  ·  view source on GitHub ↗
(self, prg, global_size)

Source from the content-addressed store, hash-verified

197 ### SQTT ###
198
def sqtt_setup_exec(self, prg, global_size):
  """Emit the SQTT userdata markers for a dispatch and optionally program the per-SE thread-management masks.

  Two markers are always sent through `sqtt_userdata`: a pipeline-bind marker whose `api_pso_hash` is built from
  `prg.libhash[0]`, and an event marker carrying the next value of `prg.dev.sqtt_next_cmd_id`, with the entries of
  `global_size` appended as extra dwords.

  When `SQTT_LIMIT_SE` is truthy, `regCOMPUTE_STATIC_THREAD_MGMT_SE{i}` is additionally written for every XCC:
  with `SQTT_LIMIT_SE > 1` each register gets 1 or 0 depending on bit `i` of `SQTT_ITRACE_SE_MASK.value`;
  otherwise a CU mask derived from the launch size is used.

  Args:
    prg: program being launched; `libhash`, `dev.sqtt_next_cmd_id` and `dev.target` are read from it.
    global_size: launch dimensions. Entries that are not `int` count as 1 when sizing the CU mask
      (presumably symbolic dimensions -- TODO confirm).
  """
  bind_point_compute = 1
  bind_marker = sqtt.struct_rgp_sqtt_marker_pipeline_bind(identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
                                                          bind_point=bind_point_compute,
                                                          api_pso_hash=data64_le(prg.libhash[0]))
  self.sqtt_userdata(bind_marker)

  event_marker = sqtt.struct_rgp_sqtt_marker_event(has_thread_dims=1, cmd_id=next(prg.dev.sqtt_next_cmd_id))
  self.sqtt_userdata(event_marker, *global_size)

  # Nothing else to do unless CU/SE limiting was requested.
  if not SQTT_LIMIT_SE: return

  # Calculate number of CUs per SE to enable based on blocks count. 4 is maximum simd per CU, but on rdna we can trace only 1.
  total_blocks = prod([dim if isinstance(dim, int) else 1 for dim in global_size])
  cu_per_se = total_blocks // ((self.dev.cu_cnt // self.dev.se_cnt) * 4)

  for xcc in range(self.dev.xccs):
    with self.pred_exec(xcc_mask=1 << xcc):
      # 4 SE registers are written when prg.dev.target[0] is 9, otherwise 8.
      for i in range(4 if prg.dev.target[0] == 9 else 8):
        if SQTT_LIMIT_SE > 1:
          # only run unmasked shader engines
          mask = int(bool(SQTT_ITRACE_SE_MASK.value & (1 << i)))
        else:
          sa_bits = (1 << (self.dev.iface.props['cu_per_simd_array'] // 2)) - 1
          # The first SE gets one CU more than the others.
          enabled_cus = (1 << (cu_per_se + (1 if i == 0 else 0))) - 1
          low_half = enabled_cus & sa_bits
          # NOTE(review): after the extra << 16 this term only has bits at position >= 32, so if lo32 keeps
          # just the low 32 bits it never contributes to the mask -- confirm this is intended.
          high_half = (enabled_cus & (sa_bits << 16)) << 16
          mask = lo32(low_half | high_half)
        self.wreg(getattr(self.gc, f'regCOMPUTE_STATIC_THREAD_MGMT_SE{i}'), mask)
216
217 def sqtt_userdata(self, data, *extra_dwords):
218 data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)

Callers 2

execMethod · 0.95
execMethod · 0.80

Calls 6

sqtt_userdataMethod · 0.95
pred_execMethod · 0.95
wregMethod · 0.95
data64_leFunction · 0.90
prodFunction · 0.90
lo32Function · 0.90

Tested by

no test coverage detected