0xbf8a0000, 0xbe801f6c,
0xbf9b0000, 0x00000000,
};
+
+static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
+ 0xbfa00001, 0xbfa002a2,
+ 0xb0804009, 0xb8f8f804,
+ 0x9178ff78, 0x00008c00,
+ 0xb8fbf811, 0x8b6eff78,
+ 0x00004000, 0xbfa10008,
+ 0x8b6eff7b, 0x00000080,
+ 0xbfa20018, 0x8b6ea07b,
+ 0xbfa2004f, 0xbf830010,
+ 0xb8fbf811, 0xbfa0fffb,
+ 0x8b6eff7b, 0x00000bd0,
+ 0xbfa20010, 0xb8eef812,
+ 0x8b6f8f7b, 0xbfa10002,
+ 0x8c6eff6e, 0x00000080,
+ 0xb8eff813, 0x8b6e6e6f,
+ 0xbfa20008, 0x8b6eff6d,
+ 0xf0000000, 0xbfa20005,
+ 0x8b6fff6f, 0x00000200,
+ 0xbfa20002, 0x8b6ea07b,
+ 0xbfa20039, 0x9177ff77,
+ 0x007fc000, 0xb8fa04a1,
+ 0x847a967a, 0x8c777a77,
+ 0xb8fa0421, 0x847a957a,
+ 0x8c777a77, 0xb8fa3021,
+ 0x847a8e7a, 0x8c777a77,
+ 0xb980f821, 0x00000000,
+ 0xbefa4d82, 0xbf8a0000,
+ 0x84fa887a, 0xbf0d987b,
+ 0xbfa10002, 0x8c7bff7b,
+ 0xfe000000, 0xf4601bbd,
+ 0xf8000010, 0xbf8a0000,
+ 0x846e976e, 0x9177ff77,
+ 0x00800000, 0x8c776e77,
+ 0xf4603bbd, 0xf8000000,
+ 0xbf8a0000, 0xf4603ebd,
+ 0xf8000008, 0xbf8a0000,
+ 0x8bee6e6e, 0xbfa10001,
+ 0xbe80486e, 0x8b6eff6d,
+ 0xf0000000, 0xbfa20009,
+ 0xb8eef811, 0x8b6eff6e,
+ 0x00000080, 0xbfa20007,
+ 0x8c78ff78, 0x00004000,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa00002, 0x806c846c,
+ 0x826d806d, 0x8b6dff6d,
+ 0x01ffffff, 0x8bfe7e7e,
+ 0x8bea6a6a, 0x85788978,
+ 0xb9783244, 0xbe804a6c,
+ 0xb8faf802, 0xbf0d987a,
+ 0xbfa10001, 0xbfb00000,
+ 0x8b6dff6d, 0x01ffffff,
+ 0xbefa0080, 0xb97a0151,
+ 0xbeee007e, 0xbeef007f,
+ 0xbefe0180, 0xbefe4d84,
+ 0xbf8a0000, 0x8b7aff7f,
+ 0x04000000, 0x847a857a,
+ 0x8c6d7a6d, 0x9177ff77,
+ 0x007fc000, 0xb8fa04a1,
+ 0x847a967a, 0x8c777a77,
+ 0xb8fa0421, 0x847a957a,
+ 0x8c777a77, 0xb8fa3021,
+ 0x847a8e7a, 0x8c777a77,
+ 0xb980f821, 0x00000000,
+ 0xb8eff822, 0xb980f822,
+ 0x00000000, 0xb8fa2b01,
+ 0x847a997a, 0x8c6d7a6d,
+ 0xb9802b01, 0x00000000,
+ 0xbefa007e, 0x8b7bff7f,
+ 0x01ffffff, 0xbefe00c1,
+ 0xbeff00c1, 0xee0a407a,
+ 0x000c0000, 0x00000000,
+ 0x7e000280, 0xbefe007a,
+ 0xbeff007b, 0xb8fb0742,
+ 0x847b997b, 0xb8fa3b05,
+ 0x807a817a, 0xbf0d997b,
+ 0xbfa20002, 0x847a897a,
+ 0xbfa00001, 0x847a8a7a,
+ 0x8b7bff7f, 0x01ffffff,
+ 0x807aff7a, 0x000001c0,
+ 0x807a7e7a, 0x827b807b,
+ 0xd7610000, 0x00010870,
+ 0xd7610000, 0x00010a71,
+ 0xd7610000, 0x00010c72,
+ 0xd7610000, 0x00010e73,
+ 0xd7610000, 0x00011074,
+ 0xd7610000, 0x00011275,
+ 0xd7610000, 0x00011476,
+ 0xd7610000, 0x00011677,
+ 0xd7610000, 0x00011a79,
+ 0xd7610000, 0x00011c7e,
+ 0xd7610000, 0x00011e7f,
+ 0xbefe00ff, 0x00003fff,
+ 0xbeff0080, 0xee0a407a,
+ 0x000c0000, 0x00000000,
+ 0xd760007a, 0x00011d00,
+ 0xd760007b, 0x00011f00,
+ 0xbefe007a, 0xbeff007b,
+ 0xbef4007e, 0x8b75ff7f,
+ 0x01ffffff, 0xbef1007d,
+ 0xb8f30742, 0x84739973,
+ 0xbefe00c1, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00002, 0xbeff00c1,
+ 0xbfa0000a, 0xee0a4074,
+ 0x008c0000, 0x00008000,
+ 0xee0a4074, 0x010c0000,
+ 0x00010000, 0xee0a4074,
+ 0x018c0000, 0x00018000,
+ 0xbfa00009, 0xee0a4074,
+ 0x008c0000, 0x00010000,
+ 0xee0a4074, 0x010c0000,
+ 0x00020000, 0xee0a4074,
+ 0x018c0000, 0x00030000,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0x8070ff70,
+ 0x00000200, 0x7e000280,
+ 0x7e020280, 0x7e040280,
+ 0xbefd0080, 0xb8faf802,
+ 0xbf0c8b7a, 0xbfa20003,
+ 0xbe804fc2, 0xbf94fffe,
+ 0xbfa10001, 0xbe804ec4,
+ 0xbf94fffc, 0xb8faf804,
+ 0x8b7aff7a, 0x0001000c,
+ 0x9178ff78, 0x0001000c,
+ 0x8c787a78, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xd7610002, 0x0000fa6c,
+ 0x807d817d, 0x917aff6d,
+ 0x80000000, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xd7610002, 0x0000fa6e,
+ 0x807d817d, 0xbefa0080,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa78, 0x807d817d,
+ 0xb8faf811, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xd7610002, 0x0000fa6f,
+ 0x807d817d, 0xb8f1f801,
+ 0x937aff6d, 0x00060019,
+ 0x847a8c7a, 0x8c717a71,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f814,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f815,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f812,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f813,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8faf802,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xbefa50c1,
+ 0xbfc70000, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xbefa4c88, 0xbfc70000,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xbefe00ff,
+ 0x0000ffff, 0xbeff0080,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x010c0000,
+ 0x00000000, 0xbefe00c1,
+ 0x7e040280, 0xbefa5081,
+ 0xbfc70000, 0xd7610002,
+ 0x0001007a, 0xbefa5082,
+ 0xbfc70000, 0xd7610002,
+ 0x0001027a, 0xbefa5083,
+ 0xbfc70000, 0xd7610002,
+ 0x0001047a, 0xbefa5084,
+ 0xbfc70000, 0xd7610002,
+ 0x0001067a, 0xbefa5085,
+ 0xbfc70000, 0xd7610002,
+ 0x0001087a, 0xbefa5086,
+ 0xbfc70000, 0xd7610002,
+ 0x00010a7a, 0xbefa5087,
+ 0xbfc70000, 0xd7610002,
+ 0x00010c7a, 0xbefa5088,
+ 0xbfc70000, 0xd7610002,
+ 0x00010e7a, 0xbefa5089,
+ 0xbfc70000, 0xd7610002,
+ 0x0001107a, 0xbefa508a,
+ 0xbfc70000, 0xd7610002,
+ 0x0001127a, 0xbefa508b,
+ 0xbfc70000, 0xd7610002,
+ 0x0001147a, 0xbefa508c,
+ 0xbfc70000, 0xd7610002,
+ 0x0001167a, 0xbefa508d,
+ 0xbfc70000, 0xd7610002,
+ 0x0001187a, 0xbefa508e,
+ 0xbfc70000, 0xd7610002,
+ 0x00011a7a, 0xbefa508f,
+ 0xbfc70000, 0xd7610002,
+ 0x00011c7a, 0xbefa5090,
+ 0xbfc70000, 0xd7610002,
+ 0x00011e7a, 0xee0a4076,
+ 0x010c0000, 0x00008000,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0xbef90080,
+ 0xbefd0080, 0xbf800000,
+ 0xbe804100, 0xbe824102,
+ 0xbe844104, 0xbe864106,
+ 0xbe884108, 0xbe8a410a,
+ 0xbe8c410c, 0xbe8e410e,
+ 0xd7610002, 0x0000f200,
+ 0x80798179, 0xd7610002,
+ 0x0000f201, 0x80798179,
+ 0xd7610002, 0x0000f202,
+ 0x80798179, 0xd7610002,
+ 0x0000f203, 0x80798179,
+ 0xd7610002, 0x0000f204,
+ 0x80798179, 0xd7610002,
+ 0x0000f205, 0x80798179,
+ 0xd7610002, 0x0000f206,
+ 0x80798179, 0xd7610002,
+ 0x0000f207, 0x80798179,
+ 0xd7610002, 0x0000f208,
+ 0x80798179, 0xd7610002,
+ 0x0000f209, 0x80798179,
+ 0xd7610002, 0x0000f20a,
+ 0x80798179, 0xd7610002,
+ 0x0000f20b, 0x80798179,
+ 0xd7610002, 0x0000f20c,
+ 0x80798179, 0xd7610002,
+ 0x0000f20d, 0x80798179,
+ 0xd7610002, 0x0000f20e,
+ 0x80798179, 0xd7610002,
+ 0x0000f20f, 0x80798179,
+ 0xbf06a079, 0xbfa10009,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x010c0000,
+ 0x00000000, 0x8070ff70,
+ 0x00000080, 0xbef90080,
+ 0x7e040280, 0x807d907d,
+ 0xbf0aff7d, 0x00000060,
+ 0xbfa2ffb9, 0xbe804100,
+ 0xbe824102, 0xbe844104,
+ 0xbe864106, 0xbe884108,
+ 0xbe8a410a, 0xd7610002,
+ 0x0000f200, 0x80798179,
+ 0xd7610002, 0x0000f201,
+ 0x80798179, 0xd7610002,
+ 0x0000f202, 0x80798179,
+ 0xd7610002, 0x0000f203,
+ 0x80798179, 0xd7610002,
+ 0x0000f204, 0x80798179,
+ 0xd7610002, 0x0000f205,
+ 0x80798179, 0xd7610002,
+ 0x0000f206, 0x80798179,
+ 0xd7610002, 0x0000f207,
+ 0x80798179, 0xd7610002,
+ 0x0000f208, 0x80798179,
+ 0xd7610002, 0x0000f209,
+ 0x80798179, 0xd7610002,
+ 0x0000f20a, 0x80798179,
+ 0xd7610002, 0x0000f20b,
+ 0x80798179, 0xbefe00ff,
+ 0x0000ffff, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x010c0000, 0x00000000,
+ 0xbefe00c1, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8fb4306, 0x8b7bc17b,
+ 0xbfa10042, 0x8b7aff6d,
+ 0x80000000, 0xbfa1003f,
+ 0x847b8a7b, 0xb8f03b05,
+ 0x80708170, 0xbf0d9973,
+ 0xbfa20002, 0x84708970,
+ 0xbfa00001, 0x84708a70,
+ 0x8070ff70, 0x00000200,
+ 0x8070ff70, 0x00000200,
+ 0xd71f0000, 0x000100c1,
+ 0xd7200000, 0x000200c1,
+ 0x16000084, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbefd0080, 0xbfa20015,
+ 0xbe8300ff, 0x00000080,
+ 0xbf800000, 0xbf800000,
+ 0xbf800000, 0xd8d80000,
+ 0x01000000, 0xbf8a0000,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x008c0000,
+ 0x00000000, 0x807d037d,
+ 0x80700370, 0xd5250000,
+ 0x0001ff00, 0x00000080,
+ 0xbf0a7b7d, 0xbfa2fff1,
+ 0xbfa00014, 0xbe8300ff,
+ 0x00000100, 0xbf800000,
+ 0xbf800000, 0xbf800000,
+ 0xd8d80000, 0x01000000,
+ 0xbf8a0000, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x008c0000, 0x00000000,
+ 0x807d037d, 0x80700370,
+ 0xd5250000, 0x0001ff00,
+ 0x00000100, 0xbf0a7b7d,
+ 0xbfa2fff1, 0xbefe00c1,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20004,
+ 0xbef000ff, 0x00000200,
+ 0xbeff0080, 0xbfa00003,
+ 0xbef000ff, 0x00000400,
+ 0xbeff00c1, 0xb8fb3b05,
+ 0x807b817b, 0x847b827b,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa2001b,
+ 0xbefd0084, 0xbf0a7b7d,
+ 0xbfa10032, 0x7e008700,
+ 0x7e028701, 0x7e048702,
+ 0x7e068703, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x000c0000, 0x00000000,
+ 0xee0a4076, 0x008c0000,
+ 0x00008000, 0xee0a4076,
+ 0x010c0000, 0x00010000,
+ 0xee0a4076, 0x018c0000,
+ 0x00018000, 0x807d847d,
+ 0x8070ff70, 0x00000200,
+ 0xbf0a7b7d, 0xbfa2ffe9,
+ 0xbfa0001a, 0xbefd0084,
+ 0xbf0a7b7d, 0xbfa10017,
+ 0x7e008700, 0x7e028701,
+ 0x7e048702, 0x7e068703,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x000c0000,
+ 0x00000000, 0xee0a4076,
+ 0x008c0000, 0x00010000,
+ 0xee0a4076, 0x010c0000,
+ 0x00020000, 0xee0a4076,
+ 0x018c0000, 0x00030000,
+ 0x807d847d, 0x8070ff70,
+ 0x00000400, 0xbf0a7b7d,
+ 0xbfa2ffe9, 0xbfa00180,
+ 0xbef4007e, 0x8b75ff7f,
+ 0x01ffffff, 0xbef1007f,
+ 0xb8f20742, 0x84729972,
+ 0x8b6eff7f, 0x04000000,
+ 0xbfa10044, 0xbefe00c1,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20002,
+ 0xbeff0080, 0xbfa00001,
+ 0xbeff00c1, 0xb8ef4306,
+ 0x8b6fc16f, 0xbfa10039,
+ 0x846f8a6f, 0xb8f83b05,
+ 0x80788178, 0xbf0d9972,
+ 0xbfa20002, 0x84788978,
+ 0xbfa00001, 0x84788a78,
+ 0x8078ff78, 0x00000200,
+ 0x8078ff78, 0x00000200,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbefd0080,
+ 0xd71f0001, 0x000100c1,
+ 0xd7200001, 0x000202c1,
+ 0x30020282, 0xbfa20012,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xbf8a0000,
+ 0xd8340000, 0x00000001,
+ 0xd5250001, 0x0001ff01,
+ 0x00000080, 0x807dff7d,
+ 0x00000080, 0x8078ff78,
+ 0x00000080, 0xbf0a6f7d,
+ 0xbfa2ffef, 0xbfa00011,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xbf8a0000,
+ 0xd8340000, 0x00000001,
+ 0xd5250001, 0x0001ff01,
+ 0x00000100, 0x807dff7d,
+ 0x00000100, 0x8078ff78,
+ 0x00000100, 0xbf0a6f7d,
+ 0xbfa2ffef, 0xbef80080,
+ 0xbefe00c1, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8ef3b05, 0x806f816f,
+ 0x846f826f, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa2002c, 0xbeee0078,
+ 0x8078ff78, 0x00000200,
+ 0xbefd0084, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00008000, 0xee0a0076,
+ 0x000c0002, 0x00010000,
+ 0xee0a0076, 0x000c0003,
+ 0x00018000, 0xbf8a0000,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807d847d, 0x8078ff78,
+ 0x00000200, 0xbf0a6f7d,
+ 0xbfa2ffe8, 0x80766e74,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00008000, 0xee0a0076,
+ 0x000c0002, 0x00010000,
+ 0xee0a0076, 0x000c0003,
+ 0x00018000, 0xbf8a0000,
+ 0xbfa0002d, 0xbeee0078,
+ 0x8078ff78, 0x00000400,
+ 0xbefd0084, 0xbf0a6f7d,
+ 0xbfa10018, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00010000, 0xee0a0076,
+ 0x000c0002, 0x00020000,
+ 0xee0a0076, 0x000c0003,
+ 0x00030000, 0xbf8a0000,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807d847d, 0x8078ff78,
+ 0x00000400, 0xbf0a6f7d,
+ 0xbfa2ffe8, 0x80766e74,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00010000, 0xee0a0076,
+ 0x000c0002, 0x00020000,
+ 0xee0a0076, 0x000c0003,
+ 0x00030000, 0xbf8a0000,
+ 0xb8f83b05, 0x80788178,
+ 0xbf0d9972, 0xbfa20002,
+ 0x84788978, 0xbfa00001,
+ 0x84788a78, 0x8078ff78,
+ 0x00000200, 0x80f8ff78,
+ 0x00000060, 0x80767874,
+ 0x82778075, 0xbefd00ff,
+ 0x0000006c, 0xf460403b,
+ 0xf8000000, 0xbf8a0000,
+ 0x80fd847d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0x80f6a076, 0x82f78077,
+ 0xf460603b, 0xf8000000,
+ 0xbf8a0000, 0x80fd887d,
+ 0xbf800000, 0xbe804300,
+ 0xbe824302, 0xbe844304,
+ 0xbe864306, 0x80f6c076,
+ 0x82f78077, 0xf460803b,
+ 0xf8000000, 0xbf8a0000,
+ 0x80fd907d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0xbe844304, 0xbe864306,
+ 0xbe884308, 0xbe8a430a,
+ 0xbe8c430c, 0xbe8e430e,
+ 0xbf06807d, 0xbfa1ffef,
+ 0xb980f801, 0x00000000,
+ 0xb8f83b05, 0x80788178,
+ 0xbf0d9972, 0xbfa20002,
+ 0x84788978, 0xbfa00001,
+ 0x84788a78, 0x8078ff78,
+ 0x00000200, 0x80767874,
+ 0x82778075, 0xbeff0071,
+ 0xf4601bfb, 0xf8000000,
+ 0xf4601b3b, 0xf8000004,
+ 0xf4601b7b, 0xf8000008,
+ 0xf4601c3b, 0xf800000c,
+ 0xf4601c7b, 0xf8000010,
+ 0xf4601ebb, 0xf8000014,
+ 0xf4601efb, 0xf8000018,
+ 0xf4601e7b, 0xf800001c,
+ 0xf4601cfb, 0xf8000020,
+ 0xf4601bbb, 0xf8000024,
+ 0xbf8a0000, 0xb96ef814,
+ 0xf4601bbb, 0xf8000028,
+ 0xbf8a0000, 0xb96ef815,
+ 0xf4601bbb, 0xf800002c,
+ 0xbf8a0000, 0xb96ef812,
+ 0xf4601bbb, 0xf8000030,
+ 0xbf8a0000, 0xb96ef813,
+ 0x8b6eff7f, 0x04000000,
+ 0xbfa10022, 0xf4601bbb,
+ 0xf8000038, 0xbf8a0000,
+ 0xbf0d806e, 0xbfa1001d,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804ec1,
+ 0x816ec16e, 0xbfa0fffb,
+ 0xbef800ff, 0x00000080,
+ 0xbefd0081, 0xf4601bbb,
+ 0xf0000000, 0xbfc70000,
+ 0x80788478, 0x937eff6e,
+ 0x00070004, 0x847e907e,
+ 0x8c7d7e7d, 0xbe80517d,
+ 0x917dff7d, 0x007f0000,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804e7d,
+ 0x816ec16e, 0xbfa0fffb,
+ 0x807d817d, 0xbf08907d,
+ 0xbfa1ffec, 0xf4601bbb,
+ 0xf800003c, 0xbfc70000,
+ 0xbf0d806e, 0xbfa1000c,
+ 0xbf0d9a7f, 0xbfa10002,
+ 0xbf068180, 0xbe804fc4,
+ 0xbf94fffc, 0xbfa10006,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804ec3,
+ 0x816ec16e, 0xbfa0fffb,
+ 0xbefd006f, 0xbefe0070,
+ 0xbeff0071, 0xb979f822,
+ 0xb97b2011, 0x857b867b,
+ 0xb97b0191, 0x857b827b,
+ 0xb97bba11, 0xb973f801,
+ 0xb8ee3b05, 0x806e816e,
+ 0xbf0d9972, 0xbfa20002,
+ 0x846e896e, 0xbfa00001,
+ 0x846e8a6e, 0x806eff6e,
+ 0x000001c0, 0x806e746e,
+ 0x826f8075, 0xf4605c37,
+ 0xf8000010, 0xf4605d37,
+ 0xf8000020, 0xf4601e77,
+ 0xf8000034, 0xbf8a0000,
+ 0x856e9677, 0xb96e04a1,
+ 0x856e9577, 0xb96e0421,
+ 0x856e8e77, 0xb96e3021,
+ 0x8b6dff6d, 0x01ffffff,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0xb97af804, 0xb8eef802,
+ 0xbf0c8b6e, 0xbfa20003,
+ 0xbe804fc2, 0xbf94fffe,
+ 0xbfa10001, 0xbe804ec4,
+ 0xbf94fffc, 0x857a897a,
+ 0xb97a0244, 0xbe804a6c,
+ 0xb8eef802, 0xbf0c8b6e,
+ 0xbfa20003, 0xbe804fc2,
+ 0xbf94fffe, 0xbfa10001,
+ 0xbe804ec4, 0xbf94fffc,
+ 0xbfb10000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+};
*/
#define CHIP_GFX12 37
+#define CHIP_GC_12_0_3 38
+
+#define HAVE_XNACK (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define HAVE_57BIT_ADDRESS (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
+#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
+#define WAVE32_ONLY (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define SAVE_TTMPS_IN_SGPR_BLOCK (ASIC_FAMILY >= CHIP_GC_12_0_3)
+
+#if HAVE_XNACK && !WAVE32_ONLY
+# error
+#endif
-var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4
+#define ADDRESS_HI32_NUM_BITS ((HAVE_57BIT_ADDRESS ? 57 : 48) - 32)
+#define ADDRESS_HI32_MASK ((1 << ADDRESS_HI32_NUM_BITS) - 1)
+
+var SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK = 0x4 | (NUM_NAMED_BARRIERS ? 0x8 : 0) | (HAVE_CLUSTER_BARRIER ? 0x10000 : 0)
var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000
var SQ_WAVE_STATUS_WAVE64_SHIFT = 29
var SQ_WAVE_STATUS_WAVE64_SIZE = 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24
+var SQ_WAVE_STATUS_IN_WG_SHIFT = 11
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
+
+#if ASIC_FAMILY < CHIP_GC_12_0_3
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9
+#else
+var SQ_WAVE_LDS_ALLOC_GRANULARITY = 10
+#endif
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2
var BARRIER_STATE_SIGNAL_OFFSET = 16
+var BARRIER_STATE_MEMBER_OFFSET = 4
+var BARRIER_STATE_MEMBER_SIZE = 7
var BARRIER_STATE_VALID_OFFSET = 0
var TTMP11_SCHED_MODE_SHIFT = 26
var TTMP11_SCHED_MODE_SIZE = 2
var TTMP11_SCHED_MODE_MASK = 0xC000000
+
+var NAMED_BARRIERS_SR_OFFSET_FROM_HWREG = 0x80
+var S_BARRIER_INIT_MEMBERCNT_MASK = 0x7F0000
+var S_BARRIER_INIT_MEMBERCNT_SHIFT = 0x10
+
+var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT = 18
+var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE = 1
+var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT = 16
+var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE = 1
+var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT = 0
+var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE = 7
+
+#if HAVE_BANKED_VGPRS
+var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 12
+var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
+#endif
+
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000
+var TTMP11_FIRST_REPLAY_SHIFT = 22
+var TTMP11_FIRST_REPLAY_MASK = 0x400000
+var TTMP11_REPLAY_W64H_SHIFT = 21
+var TTMP11_REPLAY_W64H_MASK = 0x200000
+var TTMP11_FXPTR_SHIFT = 14
+var TTMP11_FXPTR_MASK = 0x1FC000
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31
+#if HAVE_BANKED_VGPRS
+var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 25
+var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
+#endif
+
var s_sgpr_save_num = 108
var s_save_spi_init_lo = exec_lo
var s_save_exec_hi = ttmp3
var s_save_state_priv = ttmp12
var s_save_excp_flag_priv = ttmp15
-var s_save_xnack_mask = s_save_excp_flag_priv
+var s_save_xnack_mask = s_save_exec_hi
var s_wave_size = ttmp7
-var s_save_buf_rsrc0 = ttmp8
-var s_save_buf_rsrc1 = ttmp9
-var s_save_buf_rsrc2 = ttmp10
-var s_save_buf_rsrc3 = ttmp11
+var s_save_base_addr_lo = ttmp8
+var s_save_base_addr_hi = ttmp9
+var s_save_addr_lo = ttmp10
+var s_save_addr_hi = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_excp_flag_priv
var s_save_tmp = ttmp14
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_excp_flag_priv
-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
-
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25
var s_restore_state_priv = ttmp14
var s_restore_excp_flag_priv = ttmp15
var s_restore_xnack_mask = ttmp13
-var s_restore_buf_rsrc0 = ttmp8
-var s_restore_buf_rsrc1 = ttmp9
-var s_restore_buf_rsrc2 = ttmp10
-var s_restore_buf_rsrc3 = ttmp11
+var s_restore_base_addr_lo = ttmp8
+var s_restore_base_addr_hi = ttmp9
+var s_restore_addr_lo = ttmp10
+var s_restore_addr_hi = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
var s_restore_spi_init_hi_save = s_restore_exec_hi
+#if SAVE_TTMPS_IN_SGPR_BLOCK
+var TTMP_SR_OFFSET_FROM_HWREG = -0x40
+#else
+var TTMP_SR_OFFSET_FROM_HWREG = 0x40
+#endif
+
shader main
asic(DEFAULT)
type(CS)
s_cbranch_scc1 L_SAVE
L_FETCH_2ND_TRAP:
+#if HAVE_XNACK
+ save_and_clear_xnack_state_priv(ttmp14)
+#endif
+
// Read second-level TBA/TMA from first-level TMA and jump if available.
// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
// ttmp12 holds SQ_WAVE_STATUS
s_wait_idle
s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
- s_bitcmp1_b32 ttmp15, 0xF
+ s_bitcmp1_b32 ttmp15, (ADDRESS_HI32_NUM_BITS - 1)
s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA
- s_or_b32 ttmp15, ttmp15, 0xFFFF0000
+ s_or_b32 ttmp15, ttmp15, ~ADDRESS_HI32_MASK
L_NO_SIGN_EXTEND_TMA:
#if ASIC_FAMILY == CHIP_GFX12
// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
s_addc_u32 ttmp1, ttmp1, 0x0
L_EXIT_TRAP:
- s_and_b32 ttmp1, ttmp1, 0xFFFF
+ s_and_b32 ttmp1, ttmp1, ADDRESS_HI32_MASK
// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
- // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
+ // STATE_PRIV.*BARRIER_COMPLETE may have changed since we read it.
// Only restore fields which the trap handler changes.
s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
s_cbranch_scc0 L_HAVE_VGPRS
s_endpgm
L_HAVE_VGPRS:
-
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, ADDRESS_HI32_MASK
s_mov_b32 s_save_tmp, 0
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit
s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+#if HAVE_XNACK
+ save_and_clear_xnack_state_priv(s_save_tmp)
+ s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_WAVE_XNACK_MASK)
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_MASK), 0
+#endif
+
+#if HAVE_BANKED_VGPRS
+ // Save and clear shader's DST/SRC0/SRC1 VGPR bank selection so we can use v[0-255].
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE)
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE), 0
+#endif
+
// Trap temporaries must be saved via VGPR but all VGPRs are in use.
// There is no ttmp space to hold the resource constant for VGPR save.
// Save v0 by itself since it requires only two SGPRs.
s_mov_b32 s_save_ttmps_lo, exec_lo
- s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
+ s_and_b32 s_save_ttmps_hi, exec_hi, ADDRESS_HI32_MASK
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0xFFFFFFFF
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
s_mov_b32 exec_hi, s_save_ttmps_hi
// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
- // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
+ // ttmp SR memory offset:
+ // - gfx12: size(VGPR)+size(SGPR)+0x40
+ // - gfx12.5: size(VGPR)+size(SGPR)-0x40
get_wave_size2(s_save_ttmps_hi)
get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
- get_svgpr_size_bytes(s_save_ttmps_hi)
- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
- s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
+ s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
+ s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0
v_writelane_b32 v0, ttmp13, 0xD
v_writelane_b32 v0, exec_lo, 0xE
v_writelane_b32 v0, exec_hi, 0xF
- valu_sgpr_hazard()
s_mov_b32 exec_lo, 0x3FFF
s_mov_b32 exec_hi, 0x0
- global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS
+ global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
v_readlane_b32 ttmp14, v0, 0xE
v_readlane_b32 ttmp15, v0, 0xF
s_mov_b32 exec_lo, ttmp14
s_mov_b32 exec_hi, ttmp15
- /* setup Resource Contants */
- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
-
+ s_mov_b32 s_save_base_addr_lo, s_save_spi_init_lo
+ s_and_b32 s_save_base_addr_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
s_mov_b32 s_save_m0, m0
- /* global mem offset */
- s_mov_b32 s_save_mem_offset, 0x0
get_wave_size2(s_wave_size)
/* save first 4 VGPRs, needed for SGPR save */
s_mov_b32 exec_hi, 0xFFFFFFFF
s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
-
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
+ global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128
+ global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*3
s_branch L_SAVE_HWREG
L_SAVE_4VGPR_WAVE64:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
-
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3
+ global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256
+ global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*3
/* save HW registers */
L_SAVE_HWREG:
- // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource
v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource
v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store
+ s_mov_b32 m0, 0x0 //Next lane of v2 to write to
// Ensure no further changes to barrier or LDS state.
- // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
- s_barrier_signal -2
- s_barrier_wait -2
+ // STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
+ wait_trap_barriers(s_save_tmp)
- // Re-read final state of BARRIER_COMPLETE field for save.
+ // Re-read final state of *BARRIER_COMPLETE fields for save.
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
- s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
+ s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp
+ write_hwreg_to_v2(s_save_m0)
+ write_hwreg_to_v2(s_save_pc_lo)
s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
- v_writelane_b32 v2, s_save_m0, 0x0
- v_writelane_b32 v2, s_save_pc_lo, 0x1
- v_writelane_b32 v2, s_save_tmp, 0x2
- v_writelane_b32 v2, s_save_exec_lo, 0x3
- v_writelane_b32 v2, s_save_exec_hi, 0x4
- v_writelane_b32 v2, s_save_state_priv, 0x5
- v_writelane_b32 v2, s_save_xnack_mask, 0x7
- valu_sgpr_hazard()
+ write_hwreg_to_v2(s_save_tmp)
+ write_hwreg_to_v2(s_save_exec_lo)
+#if WAVE32_ONLY
+ s_mov_b32 s_save_tmp, 0
+ write_hwreg_to_v2(s_save_tmp)
+#else
+ write_hwreg_to_v2(s_save_exec_hi)
+#endif
+ write_hwreg_to_v2(s_save_state_priv)
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
- v_writelane_b32 v2, s_save_tmp, 0x6
+ write_hwreg_to_v2(s_save_tmp)
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE)
- v_writelane_b32 v2, s_save_tmp, 0x8
+#if HAVE_XNACK
+ write_hwreg_to_v2(s_save_xnack_mask)
+#else
+ s_mov_b32 s_save_tmp, 0
+ write_hwreg_to_v2(s_save_tmp)
+#endif
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
- v_writelane_b32 v2, s_save_tmp, 0x9
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_MODE)
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
- v_writelane_b32 v2, s_save_tmp, 0xA
+#if HAVE_BANKED_VGPRS
+ s_bfe_u32 s_save_tmp, s_save_pc_hi, (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT | (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE << 0x10))
+ s_lshl_b32 s_save_tmp, s_save_tmp, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT
+ s_or_b32 s_save_m0, s_save_m0, s_save_tmp
+#endif
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
- v_writelane_b32 v2, s_save_tmp, 0xB
+ write_hwreg_to_v2(s_save_m0)
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_TRAP_CTRL)
- v_writelane_b32 v2, s_save_tmp, 0xC
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
+ write_hwreg_to_v2(s_save_m0)
+
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
+ write_hwreg_to_v2(s_save_m0)
+
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
+ write_hwreg_to_v2(s_save_m0)
+
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
+ write_hwreg_to_v2(s_save_m0)
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
- v_writelane_b32 v2, s_save_tmp, 0xD
+ write_hwreg_to_v2(s_save_tmp)
s_get_barrier_state s_save_tmp, -1
s_wait_kmcnt (0)
- v_writelane_b32 v2, s_save_tmp, 0xE
- valu_sgpr_hazard()
+ write_hwreg_to_v2(s_save_tmp)
+
+#if HAVE_CLUSTER_BARRIER
+ s_sendmsg_rtn_b32 s_save_tmp, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
+ s_wait_kmcnt 0
+ write_hwreg_to_v2(s_save_tmp)
+#endif
// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
s_mov_b32 exec_lo, 0xFFFF
s_mov_b32 exec_hi, 0x0
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
s_mov_b32 exec_lo, 0xFFFFFFFF
+#if NUM_NAMED_BARRIERS
+ v_mov_b32 v2, 0
+
+ for var bar_idx = 0; bar_idx < NUM_NAMED_BARRIERS; bar_idx ++
+ s_get_barrier_state s_save_tmp, (bar_idx + 1)
+ s_wait_kmcnt 0
+ v_writelane_b32 v2, s_save_tmp, bar_idx
+ end
+
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
+#endif
+
/* save SGPRs */
// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
- // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+ // SGPR SR memory offset : size(VGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into
s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
- s_cmp_eq_u32 ttmp13, 0x0
- s_cbranch_scc0 L_WRITE_V2_SECOND_HALF
- write_16sgpr_to_v2(s0, 0x0)
- s_branch L_SAVE_SGPR_SKIP_TCP_STORE
-L_WRITE_V2_SECOND_HALF:
- write_16sgpr_to_v2(s0, 0x10)
+ write_16sgpr_to_v2(s0)
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled?
+ s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE
+
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
s_mov_b32 ttmp13, 0x0
v_mov_b32 v2, 0x0
s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
write_12sgpr_to_v2(s0)
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+#if SAVE_TTMPS_IN_SGPR_BLOCK
+ // Last 16 dwords of the SGPR block already contain the TTMPS. Make
+ // sure to not override them.
+ s_mov_b32 exec_lo, 0xFFFF
+#endif
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
/* save LDS */
// first wave do LDS save;
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
- // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
//load 0~63*4(byte address) to vgpr v0
v_mbcnt_lo_u32_b32 v0, -1, 0
v_mbcnt_hi_u32_b32 v0, -1, v0
L_SAVE_LDS_LOOP_W32:
ds_read_b32 v1, v0
s_wait_idle
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
L_SAVE_LDS_LOOP_W64:
ds_read_b32 v1, v0
s_wait_idle
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_VGPR_WAVE64
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
// VGPR store using dw burst
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
s_branch L_SAVE_VGPR_END
L_SAVE_VGPR_WAVE64:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR store using dw burst
s_mov_b32 m0, 0x4 //VGPR initial index value =4
s_cmp_lt_u32 m0, s_save_alloc_size
- s_cbranch_scc0 L_SAVE_SHARED_VGPR
+ s_cbranch_scc0 L_SAVE_VGPR_END
L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?
-L_SAVE_SHARED_VGPR:
- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
- s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS
- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
- //save shared_vgpr will start from the index of m0
- s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
- s_mov_b32 exec_lo, 0xFFFFFFFF
- s_mov_b32 exec_hi, 0x00000000
-
-L_SAVE_SHARED_VGPR_WAVE64_LOOP:
- v_movrels_b32 v0, v0 //v0 = v[0+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- s_add_u32 m0, m0, 1 //next vgpr index
- s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
- s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
-
L_SAVE_VGPR_END:
s_branch L_END_PGM
L_RESTORE:
- /* Setup Resource Contants */
- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_mov_b32 s_restore_base_addr_lo, s_restore_spi_init_lo
+ s_and_b32 s_restore_base_addr_hi, s_restore_spi_init_hi, ADDRESS_HI32_MASK
// Save s_restore_spi_init_hi for later use.
s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
- // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
+
+ v_mbcnt_lo_u32_b32 v1, -1, 0
+ v_mbcnt_hi_u32_b32 v1, -1, v1
+ v_lshlrev_b32 v1, 2, v1 // 0, 4, 8, ... 124 (W32) or 252 (W64)
+
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
L_RESTORE_LDS_LOOP_W32:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
s_wait_idle
- ds_store_addtid_b32 v0
+ ds_store_b32 v1, v0
+ v_add_nc_u32 v1, v1, 128
s_add_u32 m0, m0, 128 // 128 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_branch L_RESTORE_VGPR
L_RESTORE_LDS_LOOP_W64:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
s_wait_idle
- ds_store_addtid_b32 v0
+ ds_store_b32 v1, v0
+ v_add_nc_u32 v1, v1, 256
s_add_u32 m0, m0, 256 // 256 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
- s_cmp_lt_u32 m0, s_restore_alloc_size
- s_cbranch_scc0 L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE32_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
s_wait_idle
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
/* VGPR restore on v0 */
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
s_wait_idle
s_branch L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE64:
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
s_cmp_lt_u32 m0, s_restore_alloc_size
- s_cbranch_scc0 L_RESTORE_SHARED_VGPR
+ s_cbranch_scc0 L_RESTORE_V0
L_RESTORE_VGPR_WAVE64_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
s_wait_idle
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
-L_RESTORE_SHARED_VGPR:
- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
- s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
- //restore shared_vgpr will start from the index of m0
- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
- s_mov_b32 exec_lo, 0xFFFFFFFF
- s_mov_b32 exec_hi, 0x00000000
-L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- s_wait_idle
- v_movreld_b32 v0, v0 //v[0+m0] = v0
- s_add_u32 m0, m0, 1 //next vgpr index
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
- s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
-
- s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
-
/* VGPR restore on v0 */
L_RESTORE_V0:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
s_wait_idle
/* restore SGPRs */
//will be 2+8+16*6
- // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+ // SGPR SR memory offset : size(VGPR)
L_RESTORE_SGPR:
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved
-
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 24*4 // s[104:107]
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
s_mov_b32 m0, s_sgpr_save_num
- read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b128 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104]
s_movreld_b64 s0, s0 //s[0+m0] = s0
s_movreld_b64 s2, s2
- read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 8*4 // s[96:103]
+ s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0
+ s_load_b256 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96]
s_movreld_b64 s6, s6
L_RESTORE_SGPR_LOOP:
- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 16*4 // s[0,16,32,48,64,80]
+ s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0
+ s_load_b512 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
/* restore HW registers */
L_RESTORE_HWREG:
- // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
-
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
// Restore s_restore_spi_init_hi before the saved value gets clobbered.
s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save
- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_m0, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS
+ s_load_b32 s_restore_pc_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x4
+ s_load_b32 s_restore_pc_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x8
+ s_load_b32 s_restore_exec_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0xC
+ s_load_b32 s_restore_exec_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x10
+ s_load_b32 s_restore_state_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x14
+ s_load_b32 s_restore_excp_flag_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x18
+ s_load_b32 s_restore_xnack_mask, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x1C
+ s_load_b32 s_restore_mode, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x20
+ s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x24
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch
- read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x28
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x2C
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x30
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
- // Only the first wave needs to restore the workgroup barrier.
+ // Only the first wave needs to restore group barriers.
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
+ s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE
// Skip over WAVE_STATUS, since there is no state to restore from it
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x38
s_wait_idle
+ // Skip group barriers if wave is not part of a group.
s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
+ s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE
- // extract the saved signal count from s_restore_tmp
- s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
+ // Restore workgroup barrier signal count.
+ restore_barrier_signal_count(-1)
- // We need to call s_barrier_signal repeatedly to restore the signal
- // count of the work group barrier. The member count is already
- // initialized with the number of waves in the work group.
-L_BARRIER_RESTORE_LOOP:
- s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
- s_barrier_signal -1
- s_add_i32 s_restore_tmp, s_restore_tmp, -1
- s_branch L_BARRIER_RESTORE_LOOP
+#if NUM_NAMED_BARRIERS
+ s_mov_b32 s_restore_mem_offset, NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
+ s_mov_b32 m0, 1
+
+L_RESTORE_NAMED_BARRIER_LOOP:
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], s_restore_mem_offset scope:SCOPE_SYS
+ s_wait_kmcnt 0
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 0x4
+
+ // Restore named barrier member count.
+ s_bfe_u32 exec_lo, s_restore_tmp, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 16))
+ s_lshl_b32 exec_lo, exec_lo, S_BARRIER_INIT_MEMBERCNT_SHIFT
+ s_or_b32 m0, m0, exec_lo
+ s_barrier_init m0
+ s_andn2_b32 m0, m0, S_BARRIER_INIT_MEMBERCNT_MASK
-L_SKIP_BARRIER_RESTORE:
+ // Restore named barrier signal count.
+ restore_barrier_signal_count(m0)
+
+ s_add_u32 m0, m0, 1
+ s_cmp_gt_u32 m0, NUM_NAMED_BARRIERS
+ s_cbranch_scc0 L_RESTORE_NAMED_BARRIER_LOOP
+#endif
+
+L_SKIP_GROUP_BARRIER_RESTORE:
+#if HAVE_CLUSTER_BARRIER
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x3C
+ s_wait_kmcnt 0
+
+ // Skip cluster barrier restore if wave is not part of a cluster.
+ s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET
+ s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE
+
+ // Only the first wave in the group signals the trap cluster barrier.
+ s_bitcmp1_b32 s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT
+ s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
+
+ // Clear SCC: s_barrier_signal_isfirst -4 writes SCC=>1 but not SCC=>0.
+ s_cmp_eq_u32 0, 1
+ s_barrier_signal_isfirst -4
+L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_wait -4
+
+ // Only the first wave in the cluster restores the barrier.
+ s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE
+
+ // Restore cluster barrier signal count.
+ restore_barrier_signal_count(-3)
+L_SKIP_CLUSTER_BARRIER_RESTORE:
+#endif
s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi
+#if HAVE_XNACK
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_MASK), s_restore_xnack_mask
+#endif
+
// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
// Only restore the other fields to avoid clobbering them.
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode
// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
- // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
+ // ttmp SR memory offset :
+ // - gfx12: size(VGPR)+size(SGPR)+0x40
+ // - gfx12.5: size(VGPR)+size(SGPR)-0x40
get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
- get_svgpr_size_bytes(s_restore_ttmps_hi)
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
- s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
- s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
- s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS
- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS
- s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_base_addr_lo
+ s_addc_u32 s_restore_ttmps_hi, s_restore_base_addr_hi, 0x0
+ s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x10 scope:SCOPE_SYS
+ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x20 scope:SCOPE_SYS
+ s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x34 scope:SCOPE_SYS
s_wait_idle
- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+#if HAVE_XNACK
+ restore_xnack_state_priv(s_restore_tmp)
+#endif
+
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, ADDRESS_HI32_MASK //Do it here in order not to affect STATUS
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu
- // Make barrier and LDS state visible to all waves in the group.
- // STATE_PRIV.BARRIER_COMPLETE may change after this point.
- s_barrier_signal -2
- s_barrier_wait -2
+ // Make barrier and LDS state visible to all waves in the group/cluster.
+ // STATE_PRIV.*BARRIER_COMPLETE may change after this point.
+ wait_trap_barriers(s_restore_tmp)
+
+#if HAVE_CLUSTER_BARRIER
+ // SCC is changed by wait_trap_barriers, restore it separately.
+ s_lshr_b32 s_restore_state_priv, s_restore_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, 1), s_restore_state_priv
+#endif
s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
L_END_PGM:
- // Make sure that no wave of the workgroup can exit the trap handler
- // before the workgroup barrier state is saved.
- s_barrier_signal -2
- s_barrier_wait -2
+ // Make sure that no wave of the group/cluster can exit the trap handler
+ // before the group/cluster barrier state is saved.
+ wait_trap_barriers(s_restore_tmp)
+
s_endpgm_saved
end
-function write_16sgpr_to_v2(s, lane_offset)
+function write_hwreg_to_v2(s)
+ // Copy into VGPR for later TCP store.
+ v_writelane_b32 v2, s, m0
+ s_add_u32 m0, m0, 0x1
+end
+
+
+function write_16sgpr_to_v2(s)
// Copy into VGPR for later TCP store.
for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
- v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + lane_offset
+ v_writelane_b32 v2, s[sgpr_idx], ttmp13
+ s_add_u32 ttmp13, ttmp13, 0x1
end
- valu_sgpr_hazard()
- s_add_u32 ttmp13, ttmp13, 0x10
end
function write_12sgpr_to_v2(s)
// Copy into VGPR for later TCP store.
for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
- v_writelane_b32 v2, s[sgpr_idx], sgpr_idx
+ v_writelane_b32 v2, s[sgpr_idx], ttmp13
+ s_add_u32 ttmp13, ttmp13, 0x1
end
- valu_sgpr_hazard()
-end
-
-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
- s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS
- s_add_u32 s_mem_offset, s_mem_offset, 4
-end
-
-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
-end
-
-function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*8
- s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
-end
-
-function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*4
- s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
L_SHIFT_DONE:
end
-function get_svgpr_size_bytes(s_svgpr_size_byte)
- s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
- s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
-end
-
function get_sgpr_size_bytes
return 512
end
function get_hwreg_size_bytes
+#if ASIC_FAMILY >= CHIP_GC_12_0_3
+ return 512
+#else
return 128
+#endif
end
function get_wave_size2(s_reg)
s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
end
-function valu_sgpr_hazard
-#if HAVE_VALU_SGPR_HAZARD
- for var rep = 0; rep < 8; rep ++
- ds_nop
- end
+#if HAVE_XNACK
+function save_and_clear_xnack_state_priv(s_tmp)
+ // Preserve and clear XNACK state before issuing further translations.
+ // Save XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into ttmp11[22:14].
+ s_andn2_b32 ttmp11, ttmp11, (TTMP11_FIRST_REPLAY_MASK | TTMP11_REPLAY_W64H_MASK | TTMP11_FXPTR_MASK)
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_FIRST_REPLAY_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_REPLAY_W64H_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_FXPTR_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV), 0
+end
+
+function restore_xnack_state_priv(s_tmp)
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_FIRST_REPLAY_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE), s_tmp
+
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_REPLAY_W64H_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE), s_tmp
+
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_FXPTR_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE), s_tmp
+end
+#endif
+
+function wait_trap_barriers(s_tmp)
+#if HAVE_CLUSTER_BARRIER
+ // If not in a WG then wave cannot use s_barrier_signal_isfirst.
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_STATUS)
+ s_bitcmp0_b32 s_tmp, SQ_WAVE_STATUS_IN_WG_SHIFT
+ s_cbranch_scc1 L_TRAP_CLUSTER_BARRIER_SIGNAL
+
+ s_barrier_signal_isfirst -2
+ s_barrier_wait -2
+
+ // Only the first wave in the group signals the trap cluster barrier.
+ s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
+
+L_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_signal -4
+
+L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_wait -4
+#else
+ s_barrier_signal -2
+ s_barrier_wait -2
#endif
end
+
function restore_sched_mode(s_tmp)
s_bfe_u32 s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
+
+function restore_barrier_signal_count(barrier_id)
+ // extract the saved signal count from s_restore_tmp
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
+
+ // We need to call s_barrier_signal repeatedly to restore the signal count
+ // of the group/cluster barrier. The member count is already initialized.
+L_BARRIER_RESTORE_LOOP:
+ s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp
+ s_cbranch_scc0 L_BARRIER_RESTORE_DONE
+ s_barrier_signal barrier_id
+ s_add_i32 s_restore_tmp, s_restore_tmp, -1
+ s_branch L_BARRIER_RESTORE_LOOP
+
+L_BARRIER_RESTORE_DONE:
end