| 197 | ### SQTT ### |
| 198 | |
def sqtt_setup_exec(self, prg, global_size):
    """Queue the SQTT markers for a dispatch and optionally restrict the CU masks.

    Two markers are sent through ``self.sqtt_userdata``: a pipeline-bind marker
    whose ``api_pso_hash`` is built from ``prg.libhash[0]``, then an event marker
    using the next value of ``prg.dev.sqtt_next_cmd_id`` followed by the entries
    of ``global_size`` as extra dwords.

    When ``SQTT_LIMIT_SE`` is truthy, ``regCOMPUTE_STATIC_THREAD_MGMT_SE{i}`` is
    additionally written for every shader-engine index on every XCC.
    """
    compute_bind_point = 1
    bind_marker = sqtt.struct_rgp_sqtt_marker_pipeline_bind(
        identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
        bind_point=compute_bind_point,
        api_pso_hash=data64_le(prg.libhash[0]),
    )
    self.sqtt_userdata(bind_marker)

    event_marker = sqtt.struct_rgp_sqtt_marker_event(has_thread_dims=1, cmd_id=next(prg.dev.sqtt_next_cmd_id))
    self.sqtt_userdata(event_marker, *global_size)

    # Nothing else to do unless SE limiting was requested.
    if not SQTT_LIMIT_SE:
        return

    # Number of CUs per SE to enable, derived from the block count. Non-int
    # dimensions count as 1. The factor 4 is the maximum SIMDs per CU (on RDNA
    # only one of them can be traced).
    int_dims = [dim if isinstance(dim, int) else 1 for dim in global_size]
    block_count = prod(int_dims)
    cu_per_se = block_count // ((self.dev.cu_cnt // self.dev.se_cnt) * 4)

    for xcc_idx in range(self.dev.xccs):
        with self.pred_exec(xcc_mask=1 << xcc_idx):
            # 4 SE registers are programmed when target[0] is 9, otherwise 8.
            se_total = 4 if prg.dev.target[0] == 9 else 8
            for se_idx in range(se_total):
                if SQTT_LIMIT_SE > 1:
                    # only run unmasked shader engines
                    se_mask = 1 if SQTT_ITRACE_SE_MASK.value & (1 << se_idx) else 0
                else:
                    sa_bits = (1 << (self.dev.iface.props['cu_per_simd_array'] // 2)) - 1
                    # SE0 gets one CU more than the other shader engines.
                    extra_cu = 1 if se_idx == 0 else 0
                    cu_bits = (1 << (cu_per_se + extra_cu)) - 1
                    low_half = cu_bits & sa_bits
                    # NOTE(review): this term only keeps bits >= 16 of cu_bits and then shifts them
                    # by another 16; if lo32 truncates to 32 bits they are all dropped - confirm intent.
                    high_half = (cu_bits & (sa_bits << 16)) << 16
                    se_mask = lo32(low_half | high_half)
                self.wreg(getattr(self.gc, f'regCOMPUTE_STATIC_THREAD_MGMT_SE{se_idx}'), se_mask)
| 216 | |
| 217 | def sqtt_userdata(self, data, *extra_dwords): |
| 218 | data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords) |