};
static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
- 0xbf820001, 0xbf8202ea,
+ 0xbf820001, 0xbf8202d8,
0xb8f8f802, 0x8978ff78,
0x00020006, 0xb8fbf803,
0x866eff78, 0x00002000,
0xe0724300, 0x701d0300,
0xbefe00c1, 0xbeff00c1,
0xb8fb5306, 0x867bc17b,
- 0xbf840064, 0xbf8a0000,
+ 0xbf840052, 0xbf8a0000,
0x867aff6f, 0x04000000,
- 0xbf840060, 0x8e7b867b,
+ 0xbf84004e, 0x8e7b867b,
0x8e7b827b, 0xbef6007b,
0xb8f02985, 0x80708170,
0x8e708a70, 0x8e708170,
0x000204c1, 0x867aff78,
0x00400000, 0xbf850003,
0xb8faf803, 0x897a7aff,
- 0x10000000, 0xbf850030,
- 0x24040682, 0xd86e4000,
+ 0x10000000, 0xbf85001d,
+ 0x24040682, 0xd86c0000,
0x00000002, 0xbf8cc07f,
0xbe840080, 0xd2890000,
0x00000900, 0x80048104,
0x80048104, 0xc069003a,
0x00000070, 0xbf8cc07f,
0x80709070, 0xbf06c004,
- 0xbf84ffee, 0xbe840080,
- 0xd2890000, 0x00000901,
- 0x80048104, 0xd2890001,
- 0x00000901, 0x80048104,
- 0xd2890002, 0x00000901,
- 0x80048104, 0xd2890003,
- 0x00000901, 0x80048104,
- 0xc069003a, 0x00000070,
- 0xbf8cc07f, 0x80709070,
- 0xbf06c004, 0xbf84ffee,
- 0x680404ff, 0x00000200,
+ 0xbf84ffee, 0x680404ff,
+ 0x00000100, 0xd0c9006a,
+ 0x0000f702, 0xbf87ffe5,
+ 0xbf820016, 0xd1060002,
+ 0x00011103, 0x7e0602ff,
+ 0x00000200, 0xbefc00ff,
+ 0x00010000, 0xbe800077,
+ 0x8677ff77, 0xff7fffff,
+ 0x8777ff77, 0x00058000,
+ 0xd8ec0000, 0x00000002,
+ 0xbf8cc07f, 0xe0765000,
+ 0x701d0002, 0x68040702,
0xd0c9006a, 0x0000f702,
- 0xbf87ffd2, 0xbf820015,
- 0xd1060002, 0x00011103,
- 0x7e0602ff, 0x00000200,
- 0xbefc00ff, 0x00010000,
- 0xbe800077, 0x8677ff77,
- 0xff7fffff, 0x8777ff77,
- 0x00058000, 0xd8ec0000,
- 0x00000002, 0xbf8cc07f,
- 0xe0765000, 0x701d0002,
- 0x68040702, 0xd0c9006a,
- 0x0000f702, 0xbf87fff7,
+ 0xbefe016a, 0xbf87fff6,
0xbef70000, 0xbef000ff,
0x00000400, 0xbefe00c1,
0xbeff00c1, 0xb8fb2b05,
0x701d0300, 0x807c847c,
0x8070ff70, 0x00000400,
0xbf0a7b7c, 0xbf85ffeb,
- 0xbf9c0000, 0xbf8200ee,
+ 0xbf9c0000, 0xbf8200f4,
0xbef4007e, 0x8675ff7f,
0x0000ffff, 0x8775ff75,
0x00040000, 0xbef60080,
0xbef700ff, 0x00807fac,
0x866eff7f, 0x04000000,
- 0xbf84001f, 0xbefe00c1,
+ 0xbf840025, 0xbefe00c1,
0xbeff00c1, 0xb8ef5306,
- 0x866fc16f, 0xbf84001a,
+ 0x866fc16f, 0xbf840020,
0x8e6f866f, 0x8e6f826f,
0xbef6006f, 0xb8f82985,
0x80788178, 0x8e788a78,
0x01000000, 0xbefc0080,
0xe0510000, 0x781d0000,
0xe0510100, 0x781d0000,
- 0x807cff7c, 0x00000200,
- 0x8078ff78, 0x00000200,
- 0xbf0a6f7c, 0xbf85fff6,
+ 0xe0510200, 0x781d0000,
+ 0xe0510300, 0x781d0000,
+ 0xe0510400, 0x781d0000,
+ 0x807cff7c, 0x00000500,
+ 0x8078ff78, 0x00000500,
+ 0xbf0a6f7c, 0xbf85fff0,
0xbefe00c1, 0xbeff00c1,
0xbef600ff, 0x01000000,
0xb8ef2b05, 0x806f816f,
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
#if ASIC_FAMILY >= CHIP_GC_9_5_0
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11
+var LDS_RESTORE_GRANULARITY_BYTES = 1280
#else
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+var LDS_RESTORE_GRANULARITY_BYTES = 512
#endif
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
v_lshlrev_b32 v2, 2, v3
L_SAVE_LDS_LOOP_SQC:
+#if ASIC_FAMILY < CHIP_GC_9_5_0
ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
s_waitcnt lgkmcnt(0)
-
write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
v_add_u32 v2, 0x200, v2
+#else
+ // gfx950 needs to save in multiple of 256 bytes.
+ ds_read_b32 v0, v2
+ s_waitcnt lgkmcnt(0)
+ write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+ v_add_u32 v2, 0x100, v2
+#endif
+
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
// v_add_u32 v2, vcc[0:1], v2, v3
v_add_u32 v2, v2, v3
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+ s_mov_b64 exec, vcc
+#endif
s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
// restore rsrc3
L_RESTORE_LDS_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
- s_add_u32 m0, m0, 256*2 // 128 DW
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // forth 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW
+#endif
+ s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?