Method sqtt_setup_exec

tinygrad/runtime/ops_amd.py:199–215 · view source on GitHub ↗

(self, prg, global_size)

Source from the content-addressed store, hash-verified

197	### SQTT ###
198
def sqtt_setup_exec(self, prg, global_size):
    """Queue the SQTT markers for a compute launch and optionally limit which CUs may run it.

    Two userdata packets are sent through `sqtt_userdata`: a pipeline-bind marker carrying
    `prg.libhash[0]` as the pipeline hash, then an event marker holding the next command id
    taken from `prg.dev.sqtt_next_cmd_id`, followed by the entries of `global_size` as extra
    dwords.

    When `SQTT_LIMIT_SE` is set, the `regCOMPUTE_STATIC_THREAD_MGMT_SE{i}` registers are then
    written once per XCC (inside `pred_exec`) for every shader engine index.

    Args:
        prg: program being launched; `libhash`, `dev.sqtt_next_cmd_id` and `dev.target` are read.
        global_size: launch dimensions; entries that are not `int` count as 1 in the block total.
    """
    bind_point_compute = 1
    bind_marker = sqtt.struct_rgp_sqtt_marker_pipeline_bind(
        identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
        bind_point=bind_point_compute,
        api_pso_hash=data64_le(prg.libhash[0]),
    )
    self.sqtt_userdata(bind_marker)

    event_marker = sqtt.struct_rgp_sqtt_marker_event(has_thread_dims=1, cmd_id=next(prg.dev.sqtt_next_cmd_id))
    self.sqtt_userdata(event_marker, *global_size)

    if not SQTT_LIMIT_SE:
        return

    # Number of CUs per SE to enable, derived from the block count. 4 is the maximum SIMD count
    # per CU, but on RDNA only 1 can be traced.
    block_total = prod([dim if isinstance(dim, int) else 1 for dim in global_size])
    cu_per_se = block_total // ((self.dev.cu_cnt // self.dev.se_cnt) * 4)

    for xcc in range(self.dev.xccs):
        with self.pred_exec(xcc_mask=1 << xcc):
            # Targets whose major version is 9 program 4 SE registers, all others 8.
            se_total = 4 if prg.dev.target[0] == 9 else 8
            for i in range(se_total):
                if SQTT_LIMIT_SE > 1:
                    # Only run unmasked shader engines.
                    mask = 1 if SQTT_ITRACE_SE_MASK.value & (1 << i) else 0
                else:
                    sa_mask = (1 << (self.dev.iface.props['cu_per_simd_array'] // 2)) - 1
                    # SE 0 is granted one CU more than the other shader engines.
                    extra_cu = 1 if i == 0 else 0
                    cu_mask = (1 << (cu_per_se + extra_cu)) - 1
                    low_bits = cu_mask & sa_mask
                    # NOTE(review): `<<` binds tighter than `|`, so this term was already shifted by
                    # 16 a second time in the original expression; kept as-is - confirm it is intended.
                    high_bits = (cu_mask & (sa_mask << 16)) << 16
                    mask = lo32(low_bits | high_bits)
                self.wreg(getattr(self.gc, f'regCOMPUTE_STATIC_THREAD_MGMT_SE{i}'), mask)
216
217	def sqtt_userdata(self, data, *extra_dwords):
218	data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)

execMethod · 0.95

execMethod · 0.80

sqtt_userdataMethod · 0.95

pred_execMethod · 0.95

wregMethod · 0.95

data64_leFunction · 0.90

prodFunction · 0.90

lo32Function · 0.90

no test coverage detected