Commit a36e8967 authored by Jay Cornwall, committed by Alex Deucher

drm/amdkfd: Replace gfx10 trap handler with correct branch

Previously submitted code was taken from an incorrect branch and
was non-functional.

Cc: Oak Zeng <oak.zeng@amd.com>
Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Oak Zeng <oak.zeng@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 7c2eaf5c
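For context: arrays like cwsr_trap_gfx10_hex are not edited by hand; the trap handler .asm source (the second file in this diff) is assembled and the resulting binary is converted into the uint32_t initializer lists below. A minimal sketch of that conversion step in C, assuming the assembler has already emitted a raw little-endian binary (file names and layout here are illustrative, not part of this commit):

/* gen_cwsr_hex.c -- illustrative sketch; the real tree drives an AMD
 * shader assembler over the .asm source before a step like this one. */
#include <stdio.h>
#include <stdint.h>

int main(int argc, char **argv)
{
	FILE *f;
	uint32_t w;
	int n = 0;

	if (argc < 2 || !(f = fopen(argv[1], "rb")))
		return 1;
	printf("static const uint32_t cwsr_trap_gfx10_hex[] = {\n");
	/* two words per line, matching the layout used in cwsr_trap_handler.h */
	while (fread(&w, sizeof(w), 1, f) == 1)	/* assumes little-endian host */
		printf(n++ % 2 ? " 0x%08x,\n" : "\t0x%08x,", w);
	if (n % 2)
		printf("\n");
	printf("};\n");
	fclose(f);
	return 0;
}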
@@ -680,24 +680,47 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
};
static const uint32_t cwsr_trap_gfx10_hex[] = {
0xbf820001, 0xbf82012e, 0xbf820001, 0xbf8201b2,
0xb0804004, 0xb970f802, 0xb0804004, 0xb978f802,
0x8a708670, 0xb971f803, 0x8a788678, 0xb971f803,
0x8771ff71, 0x00000400, 0x876eff71, 0x00000400,
0xbf850008, 0xb971f803, 0xbf850033, 0x876eff71,
0x8771ff71, 0x000001ff, 0x00000100, 0xbf840002,
0xbf850001, 0x806c846c, 0x8878ff78, 0x00002000,
0x8a77ff77, 0xff000000,
0xb96ef807, 0x876fff6e,
0x02000000, 0x8f6f866f,
0x88776f77, 0x876fff6e,
0x003f8000, 0x8f6f896f,
0x88776f77, 0x8a6eff6e,
0x023f8000, 0xb9eef807,
0xb970f812, 0xb971f813,
0x8ff08870, 0xf4051bb8,
0xfa000000, 0xbf8cc07f,
0xf4051c38, 0xfa000008,
0xbf8cc07f, 0x87ee6e6e,
0xbf840001, 0xbe80206e,
0xb971f803, 0x8771ff71,
0x000001ff, 0xbf850002,
0x806c846c, 0x826d806d,
0x876dff6d, 0x0000ffff,
0x906e8977, 0x876fff6e,
0x003f8000, 0x906e8677,
0x876eff6e, 0x02000000,
0x886e6f6e, 0xb9eef807,
0x87fe7e7e, 0x87ea6a6a,
0xb9f8f802, 0xbe80226c,
0xb971f803, 0x8771ff71,
0x00000100, 0xbf840006,
0xbef60380, 0xb9f60203,
0x876dff6d, 0x0000ffff,
0xbe80226c, 0xb971f803, 0x80ec886c, 0x82ed806d,
0x8771ff71, 0x00000100, 0xbef60380, 0xb9f60283,
0xbf840006, 0xbef60380, 0xb972f816, 0xb9762c07,
0xb9f60203, 0x876dff6d, 0x8f769a76, 0x886d766d,
0x0000ffff, 0x80ec886c, 0xb97603c7, 0x8f769976,
0x82ed806d, 0xbef60380, 0x886d766d, 0xb9760647,
0xb9f60283, 0xb973f816, 0x8f769876, 0x886d766d,
0xb9762c07, 0x8f769c76,
0x886d766d, 0xb97603c7,
0x8f769b76, 0x886d766d,
0xb976f807, 0x8776ff76,
0x00007fff, 0xb9f6f807,
0xbeee037e, 0xbeef037f,
@@ -706,32 +729,167 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xbef4037e, 0x8775ff7f,
0x0000ffff, 0x8875ff75,
0x00040000, 0xbef60380,
0xbef703ff, 0x00807fac, 0xbef703ff, 0x10807fac,
0x8776ff7f, 0x08000000,
0x90768376, 0x88777677,
0x8776ff7f, 0x70000000,
0x90768176, 0x88777677,
0xbefb037c, 0xbefa0380,
0xb97202dc, 0x8872727f, 0xb97302dc, 0x8f739973,
0xbefe03c1, 0x877c8172, 0x8873737f, 0xb97a2a05,
0x807a817a, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850002, 0x8f7a897a,
0xbf820001, 0x8f7a8a7a,
0xb9761e06, 0x8f768a76,
0x807a767a, 0x807aff7a,
0x00000200, 0xbef603ff,
0x01000000, 0xbefe037c,
0xbefc037a, 0xf4611efa,
0xf8000000, 0x807a847a,
0xbefc037e, 0xbefe037c,
0xbefc037a, 0xf4611b3a,
0xf8000000, 0x807a847a,
0xbefc037e, 0xbefe037c,
0xbefc037a, 0xf4611b7a,
0xf8000000, 0x807a847a,
0xbefc037e, 0xbefe037c,
0xbefc037a, 0xf4611bba,
0xf8000000, 0x807a847a,
0xbefc037e, 0xbefe037c,
0xbefc037a, 0xf4611bfa,
0xf8000000, 0x807a847a,
0xbefc037e, 0xbefe037c,
0xbefc037a, 0xf4611e3a,
0xf8000000, 0x807a847a,
0xbefc037e, 0xb971f803,
0xbefe037c, 0xbefc037a,
0xf4611c7a, 0xf8000000,
0x807a847a, 0xbefc037e,
0xbefe037c, 0xbefc037a,
0xf4611cba, 0xf8000000,
0x807a847a, 0xbefc037e,
0xb97bf801, 0xbefe037c,
0xbefc037a, 0xf4611efa,
0xf8000000, 0x807a847a,
0xbefc037e, 0x8776ff7f,
0x04000000, 0xbeef0380,
0x886f6f76, 0xb97a2a05,
0x807a817a, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850002, 0x8f7a897a,
0xbf820001, 0x8f7a8a7a,
0xb9761e06, 0x8f768a76,
0x807a767a, 0xbef603ff,
0x01000000, 0xbef20374,
0x80747a74, 0x82758075,
0xbefc0380, 0xbf800000,
0xbe802f00, 0xbe822f02,
0xbe842f04, 0xbe862f06,
0xbe882f08, 0xbe8a2f0a,
0xbe8c2f0c, 0xbe8e2f0e,
0xf469003a, 0xfa000000,
0xf469013a, 0xfa000010,
0xf469023a, 0xfa000020,
0xf469033a, 0xfa000030,
0x8074c074, 0x82758075,
0x807c907c, 0xbf0aff7c,
0x00000060, 0xbf85ffea,
0xbe802f00, 0xbe822f02,
0xbe842f04, 0xbe862f06,
0xbe882f08, 0xf469003a,
0xfa000000, 0xf469013a,
0xfa000010, 0xf465023a,
0xfa000020, 0x8074c074,
0x82758075, 0xbef40372,
0xbefa0380, 0xbefe03c1,
0x907c9973, 0x877c817c,
0xbf06817c, 0xbf850002,
0xbeff0380, 0xbf820001, 0xbeff0380, 0xbf820002,
0xbeff03c1, 0xb9712a05, 0xbeff03c1, 0xbf82000b,
0x80718171, 0x8f718271,
0x877c8172, 0xbf06817c,
0xbf85000d, 0x8f768771,
0xbef603ff, 0x01000000,
0xbefc0380, 0x7e008700,
0xe0704000, 0x7a5d0000,
0x807c817c, 0x807aff7a, 0xe0704080, 0x7a5d0100,
0x00000080, 0xbf0a717c, 0xe0704100, 0x7a5d0200,
0xbf85fff8, 0xbf82001b, 0xe0704180, 0x7a5d0300,
0x8f768871, 0xbef603ff, 0xbf82000a, 0xbef603ff,
0x01000000, 0xbefc0380, 0x01000000, 0xe0704000,
0x7e008700, 0xe0704000, 0x7a5d0000, 0xe0704100,
0x7a5d0000, 0x807c817c, 0x7a5d0100, 0xe0704200,
0x807aff7a, 0x00000100, 0x7a5d0200, 0xe0704300,
0xbf0a717c, 0xbf85fff8, 0x7a5d0300, 0xbefe03c1,
0x907c9973, 0x877c817c,
0xbf06817c, 0xbf850002,
0xbeff0380, 0xbf820001,
0xbeff03c1, 0xb9714306,
0x8771c171, 0xbf840046,
0xbf8a0000, 0x8776ff6f,
0x04000000, 0xbf840042,
0x8f718671, 0x8f718271,
0xbef60371, 0xb97a2a05,
0x807a817a, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850002, 0x8f7a897a,
0xbf820001, 0x8f7a8a7a,
0xb9761e06, 0x8f768a76,
0x807a767a, 0x807aff7a,
0x00000200, 0x807aff7a,
0x00000080, 0xbef603ff,
0x01000000, 0xd7650000,
0x000100c1, 0xd7660000,
0x000200c1, 0x16000084,
0x907c9973, 0x877c817c,
0xbf06817c, 0xbefc0380,
0xbf850012, 0xbe8303ff,
0x00000080, 0xbf800000,
0xbf800000, 0xbf800000,
0xd8d80000, 0x01000000,
0xbf8c0000, 0xe0704000,
0x7a5d0100, 0x807c037c,
0x807a037a, 0xd5250000,
0x0001ff00, 0x00000080,
0xbf0a717c, 0xbf85fff4,
0xbf820011, 0xbe8303ff,
0x00000100, 0xbf800000,
0xbf800000, 0xbf800000,
0xd8d80000, 0x01000000,
0xbf8c0000, 0xe0704000,
0x7a5d0100, 0x807c037c,
0x807a037a, 0xd5250000,
0x0001ff00, 0x00000100,
0xbf0a717c, 0xbf85fff4,
0xbefe03c1, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850004, 0xbefa03ff,
0x00000200, 0xbeff0380,
0xbf820003, 0xbefa03ff,
0x00000400, 0xbeff03c1,
0xb9712a05, 0x80718171,
0x8f718271, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850017, 0xbef603ff,
0x01000000, 0xbefc0384,
0xbf0a717c, 0xbf840037,
0x7e008700, 0x7e028701,
0x7e048702, 0x7e068703,
0xe0704000, 0x7a5d0000,
0xe0704080, 0x7a5d0100,
0xe0704100, 0x7a5d0200,
0xe0704180, 0x7a5d0300,
0x807c847c, 0x807aff7a,
0x00000200, 0xbf0a717c,
0xbf85ffef, 0xbf820025,
0xbef603ff, 0x01000000,
0xbefc0384, 0xbf0a717c,
0xbf840020, 0x7e008700,
0x7e028701, 0x7e048702,
0x7e068703, 0xe0704000,
0x7a5d0000, 0xe0704100,
0x7a5d0100, 0xe0704200,
0x7a5d0200, 0xe0704300,
0x7a5d0300, 0x807c847c,
0x807aff7a, 0x00000400,
0xbf0a717c, 0xbf85ffef,
0xb9711e06, 0x8771c171,
0xbf84000c, 0x8f718371,
0x80717c71, 0xbefe03c1,
@@ -739,133 +897,82 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xe0704000, 0x7a5d0000,
0x807c817c, 0x807aff7a,
0x00000080, 0xbf0a717c,
0xbf85fff8, 0xbf8a0000, 0xbf85fff8, 0xbf820138,
0x8776ff72, 0x04000000,
0xbf84002b, 0xbefe03c1,
0x877c8172, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb9714306, 0x8771c171,
0xbf840021, 0x8f718671,
0x8f718271, 0xbef60371,
0xbef603ff, 0x01000000,
0xd7650000, 0x000100c1,
0xd7660000, 0x000200c1,
0x16000084, 0x877c8172,
0xbf06817c, 0xbefc0380,
0xbf85000a, 0x807cff7c,
0x00000080, 0x807aff7a,
0x00000080, 0xd5250000,
0x0001ff00, 0x00000080,
0xbf0a717c, 0xbf85fff7,
0xbf820009, 0x807cff7c,
0x00000100, 0x807aff7a,
0x00000100, 0xd5250000,
0x0001ff00, 0x00000100,
0xbf0a717c, 0xbf85fff7,
0x877c8172, 0xbf06817c,
0xbf850003, 0x8f7687ff,
0x0000006a, 0xbf820002,
0x8f7688ff, 0x0000006a,
0xbef603ff, 0x01000000,
0x877c8172, 0xbf06817c,
0xbefc0380, 0xbf800000,
0xbf85000b, 0xbe802e00,
0x7e000200, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x807c817c,
0xbf0aff7c, 0x0000006a,
0xbf85fff6, 0xbf82000a,
0xbe802e00, 0x7e000200,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x807c817c, 0xbf0aff7c,
0x0000006a, 0xbf85fff6,
0xbef60384, 0xbef603ff,
0x01000000, 0x877c8172,
0xbf06817c, 0xbf850030,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e00026c,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e00026d, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e00026e,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e00026f, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e000270,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0xb971f803, 0x7e000271,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e000273, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0xb97bf801,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0xbf82002f,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e00026c,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e00026d, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e00026e,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e00026f, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e000270,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0xb971f803, 0x7e000271,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e000273, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0xb97bf801,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0xbf820119,
0xbef4037e, 0x8775ff7f,
0x0000ffff, 0x8875ff75,
0x00040000, 0xbef60380,
0xbef703ff, 0x00807fac, 0xbef703ff, 0x10807fac,
0x8772ff7f, 0x08000000,
0x90728372, 0x88777277,
0x8772ff7f, 0x70000000,
0x90728172, 0x88777277,
0xb97902dc, 0x8879797f, 0xb97302dc, 0x8f739973,
0xbef80380, 0xbefe03c1, 0x8873737f, 0x8772ff7f,
0x877c8179, 0xbf06817c, 0x04000000, 0xbf840036,
0xbefe03c1, 0x907c9973,
0x877c817c, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb96f2a05, 0x806f816f, 0xb96f4306, 0x876fc16f,
0x8f6f826f, 0x877c8179, 0xbf84002b, 0x8f6f866f,
0xbf06817c, 0xbf850013, 0x8f6f826f, 0xbef6036f,
0x8f76876f, 0xbef603ff, 0xb9782a05, 0x80788178,
0x01000000, 0xbef20378, 0x907c9973, 0x877c817c,
0x8078ff78, 0x00000080, 0xbf06817c, 0xbf850002,
0xbefc0381, 0xe0304000, 0x8f788978, 0xbf820001,
0x785d0000, 0xbf8c3f70, 0x8f788a78, 0xb9721e06,
0x7e008500, 0x807c817c, 0x8f728a72, 0x80787278,
0x8078ff78, 0x00000200,
0x8078ff78, 0x00000080,
0xbf0a6f7c, 0xbf85fff7, 0xbef603ff, 0x01000000,
0xe0304000, 0x725d0000, 0x907c9973, 0x877c817c,
0xbf820023, 0x8f76886f, 0xbf06817c, 0xbefc0380,
0xbf850009, 0xe0310000,
0x781d0000, 0x807cff7c,
0x00000080, 0x8078ff78,
0x00000080, 0xbf0a6f7c,
0xbf85fff8, 0xbf820008,
0xe0310000, 0x781d0000,
0x807cff7c, 0x00000100,
0x8078ff78, 0x00000100,
0xbf0a6f7c, 0xbf85fff8,
0xbef80380, 0xbefe03c1,
0x907c9973, 0x877c817c,
0xbf06817c, 0xbf850002,
0xbeff0380, 0xbf820001,
0xbeff03c1, 0xb96f2a05,
0x806f816f, 0x8f6f826f,
0x907c9973, 0x877c817c,
0xbf06817c, 0xbf850021,
0xbef603ff, 0x01000000,
0xbef20378, 0x8078ff78,
0x00000100, 0xbefc0381, 0x00000200, 0xbefc0384,
0xe0304000, 0x785d0000,
0xe0304080, 0x785d0100,
0xe0304100, 0x785d0200,
0xe0304180, 0x785d0300,
0xbf8c3f70, 0x7e008500,
0x807c817c, 0x8078ff78, 0x7e028501, 0x7e048502,
0x00000100, 0xbf0a6f7c, 0x7e068503, 0x807c847c,
0xbf85fff7, 0xb96f1e06, 0x8078ff78, 0x00000200,
0xbf0a6f7c, 0xbf85ffee,
0xe0304000, 0x725d0000,
0xe0304080, 0x725d0100,
0xe0304100, 0x725d0200,
0xe0304180, 0x725d0300,
0xbf820031, 0xbef603ff,
0x01000000, 0xbef20378,
0x8078ff78, 0x00000400,
0xbefc0384, 0xe0304000,
0x785d0000, 0xe0304100,
0x785d0100, 0xe0304200,
0x785d0200, 0xe0304300,
0x785d0300, 0xbf8c3f70,
0x7e008500, 0x7e028501,
0x7e048502, 0x7e068503,
0x807c847c, 0x8078ff78,
0x00000400, 0xbf0a6f7c,
0xbf85ffee, 0xb96f1e06,
0x876fc16f, 0xbf84000e,
0x8f6f836f, 0x806f7c6f,
0xbefe03c1, 0xbeff0380,
@@ -875,107 +982,81 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0x00000080, 0xbf0a6f7c,
0xbf85fff7, 0xbeff03c1,
0xe0304000, 0x725d0000,
0x8772ff79, 0x04000000, 0xe0304080, 0x725d0100,
0xbf840020, 0xbefe03c1, 0xe0304100, 0x725d0200,
0x877c8179, 0xbf06817c, 0xe0304180, 0x725d0300,
0xbf850002, 0xbeff0380, 0xb9782a05, 0x80788178,
0xbf820001, 0xbeff03c1, 0x907c9973, 0x877c817c,
0xb96f4306, 0x876fc16f, 0xbf06817c, 0xbf850002,
0xbf840016, 0x8f6f866f, 0x8f788978, 0xbf820001,
0x8f6f826f, 0xbef6036f, 0x8f788a78, 0xb9721e06,
0xbef603ff, 0x01000000, 0x8f728a72, 0x80787278,
0x877c8172, 0xbf06817c, 0x8078ff78, 0x00000200,
0xbefc0380, 0xbf850007, 0x80f8ff78, 0x00000058,
0x807cff7c, 0x00000080, 0x80f88878, 0xbef603ff,
0x8078ff78, 0x00000080, 0x01000000, 0xbefc03ff,
0xbf0a6f7c, 0xbf85fffa, 0x0000006a, 0xf425003a,
0xbf820006, 0x807cff7c, 0xf0000000, 0x80f8a078,
0x00000100, 0x8078ff78, 0xbf8cc07f, 0x80fc827c,
0x00000100, 0xbf0a6f7c, 0xbf800000, 0xbe803100,
0xbf85fffa, 0x877c8179, 0xf42d003a, 0xf0000000,
0xbf06817c, 0xbf850003, 0x80f8c078, 0xbf8cc07f,
0x8f7687ff, 0x0000006a, 0x80fc887c, 0xbf800000,
0xbf820002, 0x8f7688ff, 0xbe803100, 0xbe823102,
0x0000006a, 0xbef603ff, 0xbe843104, 0xbe863106,
0x01000000, 0x877c8179, 0xf431003a, 0xf0000000,
0xbf06817c, 0xbf850012, 0x80f8c078, 0xbf8cc07f,
0xf4211cba, 0xf0000000, 0x80fc907c, 0xbf800000,
0x8078ff78, 0x00000080, 0xbe803100, 0xbe823102,
0xbefc0381, 0xf421003a, 0xbe843104, 0xbe863106,
0xf0000000, 0x8078ff78, 0xbe883108, 0xbe8a310a,
0x00000080, 0xbf8cc07f, 0xbe8c310c, 0xbe8e310e,
0xbe803000, 0xbf800000, 0xbf06807c, 0xbf84fff0,
0x807c817c, 0xbf0aff7c, 0xb9782a05, 0x80788178,
0x0000006a, 0xbf85fff5, 0x907c9973, 0x877c817c,
0xbe800372, 0xbf820011, 0xbf06817c, 0xbf850002,
0xf4211cba, 0xf0000000, 0x8f788978, 0xbf820001,
0x8078ff78, 0x00000100, 0x8f788a78, 0xb9721e06,
0xbefc0381, 0xf421003a, 0x8f728a72, 0x80787278,
0xf0000000, 0x8078ff78, 0x8078ff78, 0x00000200,
0x00000100, 0xbf8cc07f,
0xbe803000, 0xbf800000,
0x807c817c, 0xbf0aff7c,
0x0000006a, 0xbf85fff5,
0xbe800372, 0xbef60384,
0xbef603ff, 0x01000000,
0x877c8179, 0xbf06817c,
0xbf850025, 0xf4211bfa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211b3a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211b7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211eba,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211efa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211c3a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211c7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211cfa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211e7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xbf820024,
0xf4211bfa, 0xf0000000,
0x8078ff78, 0x00000100, 0x80788478, 0xf4211b3a,
0xf4211b3a, 0xf0000000, 0xf0000000, 0x80788478,
0x8078ff78, 0x00000100,
0xf4211b7a, 0xf0000000,
0x8078ff78, 0x00000100, 0x80788478, 0xf4211eba,
0xf4211eba, 0xf0000000, 0xf0000000, 0x80788478,
0x8078ff78, 0x00000100,
0xf4211efa, 0xf0000000,
0x8078ff78, 0x00000100, 0x80788478, 0xf4211c3a,
0xf4211c3a, 0xf0000000, 0xf0000000, 0x80788478,
0x8078ff78, 0x00000100,
0xf4211c7a, 0xf0000000,
0x8078ff78, 0x00000100, 0x80788478, 0xf4211e7a,
0xf0000000, 0x80788478,
0xf4211cfa, 0xf0000000,
0x8078ff78, 0x00000100, 0x80788478, 0xbf8cc07f,
0xf4211e7a, 0xf0000000, 0xbef2036d, 0x876dff72,
0x8078ff78, 0x00000100,
0xbf8cc07f, 0x876dff6d,
0x0000ffff, 0xbefc036f,
0xbefe037a, 0xbeff037b,
0x876f71ff, 0x000003ff,
0xb9ef4803, 0xb9f3f816, 0xb9ef4803, 0xb9f9f816,
0x876f71ff, 0xfffff800,
0x906f8b6f, 0xb9efa2c3,
0xb9f9f801, 0x876fff6d, 0xb9f3f801, 0x876fff72,
0xf0000000, 0x906f9c6f, 0xfc000000, 0x906f9a6f,
0x8f6f906f, 0xbef20380, 0x8f6f906f, 0xbef30380,
0x88726f72, 0x876fff6d, 0x88736f73, 0x876fff72,
0x08000000, 0x906f9b6f, 0x02000000, 0x906f996f,
0x8f6f8f6f, 0x88726f72, 0x8f6f8f6f, 0x88736f73,
0x876fff70, 0x00800000, 0x876fff72, 0x01000000,
0x906f976f, 0xb9f2f807, 0x906f986f, 0x8f6f996f,
0xb9f0f802, 0xbf8a0000, 0x88736f73, 0x876fff70,
0xbe80226c, 0xbf810000, 0x00800000, 0x906f976f,
0xb9f3f807, 0x87fe7e7e,
0x87ea6a6a, 0xb9f0f802,
0xbf8a0000, 0xbe80226c,
0xbf810000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0x00000000,
};
static const uint32_t cwsr_trap_arcturus_hex[] = {
0xbf820001, 0xbf8202c4,
...
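Between the two files in this patch: the driver embeds cwsr_trap_handler.h and hands the selected per-ASIC blob to the hardware at init time. A rough C sketch of how such a blob might be picked up; the struct and function names are illustrative rather than taken from this patch, and it assumes the header is included so sizeof() sees the whole array:

/* Illustrative sketch only -- kfd's real init code differs in detail. */
struct cwsr_info {
	const void *isa;	/* trap handler machine code */
	unsigned int isa_size;	/* in bytes */
};

static void cwsr_select_gfx10(struct cwsr_info *info)
{
	/* cwsr_trap_gfx10_hex is defined in cwsr_trap_handler.h above */
	info->isa = cwsr_trap_gfx10_hex;
	info->isa_size = sizeof(cwsr_trap_gfx10_hex);
}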
@@ -20,1105 +20,933 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25
var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE = 1
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1
var SQ_WAVE_IB_STS_RCNT_SIZE = 6
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31
var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_RCNT_SHIFT = 26
var S_SAVE_PC_HI_RCNT_MASK = 0xFC000000
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 25
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x02000000
var S_SAVE_PC_HI_REPLAY_W64H_SHIFT = 24
var S_SAVE_PC_HI_REPLAY_W64H_MASK = 0x01000000
var s_sgpr_save_num = 106
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp12
var s_save_trapsts = ttmp5
var s_save_xnack_mask = ttmp6
var s_wave_size = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp14
var s_save_alloc_size = s_save_trapsts
var s_save_tmp = s_save_buf_rsrc2
var s_save_m0 = ttmp15
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp6
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp14
var s_restore_exec_hi = ttmp15
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
var s_restore_xnack_mask = ttmp13
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp7
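// Illustrative summary, derived from the S_SAVE_PC_HI_* definitions above:
// PC[47:32] only occupies pc_hi[15:0], so the save path stashes wave-replay
// state in the otherwise-unused high bits of s_save_pc_hi:
//   IB_STS.RCNT[21:16]      -> pc_hi[31:26] (S_SAVE_PC_HI_RCNT_SHIFT = 26)
//   IB_STS.FIRST_REPLAY[15] -> pc_hi[25]    (S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 25)
//   IB_STS.REPLAY_W64H[25]  -> pc_hi[24]    (S_SAVE_PC_HI_REPLAY_W64H_SHIFT = 24)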
shader main
asic(DEFAULT)
type(CS)
wave_size(32)

s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
/*************************************************************************/
/* control on how to run the shader */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU runs)
var EMU_RUN_HACK = 0
var EMU_RUN_HACK_RESTORE_NORMAL = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS = 0
var WG_BASE_ADDR_LO = 0x9000a000
var WG_BASE_ADDR_HI = 0x0
var WAVE_SPACE = 0x9000 //memory size that each wave occupies in workgroup state mem; increased from 0x5000 to 0x9000 because more SGPRs need to be saved
var CTX_SAVE_CONTROL = 0x0
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in RTL runs)
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
var SAVE_RESTORE_HWID_DDID = 0
var RESTORE_DDID_IN_SGPR18 = 0
/**************************************************************************/
/* variables */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
/* Save */
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp4
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
var s_wave_size = ttmp6 //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK
var s_save_xnack_mask = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp14
var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0 = ttmp15
/* Restore */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp6
var s_restore_mem_offset_save = s_restore_tmp //no conflict
var s_restore_m0 = s_restore_alloc_size //no conflict
var s_restore_mode = ttmp13
var s_restore_hwid1 = ttmp2
var s_restore_ddid = s_restore_hwid1
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp14
var s_restore_exec_hi = ttmp15
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
//var s_restore_xnack_mask_lo = xnack_mask_lo
//var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_xnack_mask = ttmp7
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp13 //ttmp13 has no conflict
/**************************************************************************/
/* trap handler entry points */
/**************************************************************************/
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
else
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
end
L_JUMP_TO_RESTORE:
s_branch L_RESTORE
L_SKIP_RESTORE:
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
s_cbranch_scc1 L_SAVE
// If STATUS.MEM_VIOL is asserted then halt the wave to prevent
// ********* Handle non-CWSR traps ******************* // the exception raising again and blocking context save.
if (!EMU_RUN_HACK) s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) s_cbranch_scc0 L_FETCH_2ND_TRAP
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 L_FETCH_2ND_TRAP:
// Preserve and clear scalar XNACK state before issuing scalar loads.
L_EXCP_CASE: // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
s_and_b32 ttmp1, ttmp1, 0xFFFF // unused space ttmp11[31:24].
s_rfe_b64 [ttmp0, ttmp1] s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
end s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
// ********* End handling of non-CWSR traps ******************* s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
/**************************************************************************/ s_or_b32 ttmp11, ttmp11, ttmp3
/* save routine */ s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
/**************************************************************************/ s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
s_or_b32 ttmp11, ttmp11, ttmp3
L_SAVE: s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
// Read second-level TBA/TMA from first-level TMA and jump if available.
// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
// ttmp12 holds SQ_WAVE_STATUS
s_getreg_b32 ttmp4, hwreg(HW_REG_SHADER_TMA_LO)
s_getreg_b32 ttmp5, hwreg(HW_REG_SHADER_TMA_HI)
s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
s_waitcnt lgkmcnt(0)
s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
s_waitcnt lgkmcnt(0)
s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set
s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
L_NO_NEXT_TRAP:
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
s_addc_u32 ttmp1, ttmp1, 0
L_EXCP_CASE:
s_and_b32 ttmp1, ttmp1, 0xFFFF
// Restore SQ_WAVE_IB_STS.
s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
s_or_b32 ttmp2, ttmp2, ttmp3
s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status
s_rfe_b64 [ttmp0, ttmp1]
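// Note on the exit sequence above (explanatory, inferred from the inline
// comments): EXECZ and VCCZ are not writable via s_setreg_b32, so the
// redundant s_and_b64 on exec and vcc makes the hardware recompute those
// status bits before STATUS is restored and s_rfe_b64 returns to the wave.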
L_SAVE:
//check whether there is mem_viol
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
s_cbranch_scc0 L_NO_PC_REWIND
//if so, need rewind PC assuming GDS operation gets NACKed
s_mov_b32 s_save_tmp, 0
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
L_NO_PC_REWIND:
s_mov_b32 s_save_tmp, 0
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
//s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
//s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
/* inform SPI the readiness and wait for SPI's go signal */
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
if (EMU_RUN_HACK)
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
else
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC L_SLEEP:
end // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
// SQ hang, since the 7th/8th wave could not get arbitration to execute instructions, while
L_SLEEP: // other waves are stuck into the sleep-loop and waiting for wrexec!=0
s_sleep 0x2
s_cbranch_execz L_SLEEP
if (EMU_RUN_HACK)
/* setup Resource Constants */
else s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
s_cbranch_execz L_SLEEP s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
end s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
/* setup Resource Constants */ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
//calculate wd_addr using absolute thread id s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
v_readlane_b32 s_save_tmp, v9, 0 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
//determine it is wave32 or wave64 s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
s_cmp_eq_u32 s_wave_size, 0
s_cbranch_scc1 L_SAVE_WAVE32 s_mov_b32 s_save_m0, m0
s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
s_branch L_SAVE_CON /* global mem offset */
L_SAVE_WAVE32: s_mov_b32 s_save_mem_offset, 0x0
s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32 s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
L_SAVE_CON: s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi, it's at bit25
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI /* save HW registers */
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else L_SAVE_HWREG:
end // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO get_svgpr_size_bytes(s_save_tmp)
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
else
end s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)
write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
s_mov_b32 s_save_m0, m0 //save M0
/* the first wave in the threadgroup */
/* global mem offset */ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 s_mov_b32 s_save_exec_hi, 0x0
s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi
/* save SGPRs */
/* save VGPRs */ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
//////////////////////////////
L_SAVE_VGPR: // SGPR SR memory offset : size(VGPR)+size(SVGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on get_svgpr_size_bytes(s_save_tmp)
s_and_b32 m0, s_wave_size, 1 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_cmp_eq_u32 m0, 1 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
s_mov_b32 exec_hi, 0x00000000 // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
s_branch L_SAVE_VGPR_NORMAL s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0
L_ENABLE_SAVE_VGPR_EXEC_HI: s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
s_mov_b32 exec_hi, 0xFFFFFFFF s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
L_SAVE_VGPR_NORMAL:
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size s_mov_b32 m0, 0x0 //SGPR initial index value =0
//for wave32 and wave64, the num of vgpr function is the same? s_nop 0x0 //Manually inserted wait states
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 L_SAVE_SGPR_LOOP:
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible // SGPR is allocated in 16 SGPR granularity
//determine it is wave32 or wave64 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
s_and_b32 m0, s_wave_size, 1 s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
s_cmp_eq_u32 m0, 1 s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
s_cbranch_scc1 L_SAVE_VGPR_WAVE64 s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
//zhenxu added it for save vgpr for wave32 s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4) s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
if (SWIZZLE_EN) s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes s_add_u32 m0, m0, 16 //next sgpr index
end s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete?
s_mov_b32 m0, 0x0 //VGPR initial index value =0
//s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 //save the rest 10 SGPR
//s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
L_SAVE_VGPR_WAVE32_LOOP: s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
v_movrels_b32 v0, v0 //v0 = v[0+m0] s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
if(USE_MTBUF_INSTEAD_OF_MUBUF) write_10sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else // restore s_save_buf_rsrc0,1
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
end
/* save first 4 VGPR, then LDS save could use */
s_add_u32 m0, m0, 1 //next vgpr index // each wave will alloc 4 vgprs at least...
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_mov_b32 s_save_mem_offset, 0
s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete? s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_branch L_SAVE_LDS s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
//save vgpr for wave32 ends s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
L_SAVE_VGPR_WAVE64: s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) s_mov_b32 exec_hi, 0x00000000
if (SWIZZLE_EN) s_branch L_SAVE_4VGPR_WAVE32
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? L_ENABLE_SAVE_4VGPR_EXEC_HI:
else s_mov_b32 exec_hi, 0xFFFFFFFF
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes s_branch L_SAVE_4VGPR_WAVE64
end L_SAVE_4VGPR_WAVE32:
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
s_mov_b32 m0, 0x0 //VGPR initial index value =0
//s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 // VGPR Allocated in 4-GPR granularity
//s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
L_SAVE_VGPR_WAVE64_LOOP: buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
v_movrels_b32 v0, v0 //v0 = v[0+m0] buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
if(USE_MTBUF_INSTEAD_OF_MUBUF) s_branch L_SAVE_LDS
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else L_SAVE_4VGPR_WAVE64:
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
// VGPR Allocated in 4-GPR granularity
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete? buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
//s_set_gpr_idx_off buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
//
//Below part will be the save shared vgpr part (new for gfx10) /* save LDS */
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? L_SAVE_LDS:
s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS // Change EXEC to all threads...
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
//save shared_vgpr will start from the index of m0 s_and_b32 m0, m0, 1
s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 s_cmp_eq_u32 m0, 1
s_mov_b32 exec_lo, 0xFFFFFFFF s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
s_mov_b32 exec_hi, 0x00000000 s_mov_b32 exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP: s_branch L_SAVE_LDS_NORMAL
v_movrels_b32 v0, v0 //v0 = v[0+m0] L_ENABLE_SAVE_LDS_EXEC_HI:
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 s_mov_b32 exec_hi, 0xFFFFFFFF
s_add_u32 m0, m0, 1 //next vgpr index L_SAVE_LDS_NORMAL:
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 256 bytes s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE
/* save LDS */ s_barrier //LDS is used? wait for other waves in the same TG
////////////////////////////// s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
L_SAVE_LDS: s_cbranch_scc0 L_SAVE_LDS_DONE
//Only check whether the first wave needs LDS // first wave does the LDS save;
/* the first wave in the threadgroup */
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG" s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
s_cbranch_scc0 L_SAVE_SGPR s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
s_and_b32 m0, s_wave_size, 1 //
s_cmp_eq_u32 m0, 1 get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI get_svgpr_size_bytes(s_save_tmp)
s_mov_b32 exec_hi, 0x00000000 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_branch L_SAVE_LDS_NORMAL s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
L_ENABLE_SAVE_LDS_EXEC_HI: s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL: s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? //load 0~63*4(byte address) to vgpr v0
s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_VGPR v_mbcnt_lo_u32_b32 v0, -1, 0
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw v_mbcnt_hi_u32_b32 v0, -1, v0
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes v_mul_u32_u24 v0, 4, v0
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN) s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? s_and_b32 m0, m0, 1
else s_cmp_eq_u32 m0, 1
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes s_mov_b32 m0, 0x0
end s_cbranch_scc1 L_SAVE_LDS_W64
//load 0~63*4(byte address) to vgpr v15 L_SAVE_LDS_W32:
v_mbcnt_lo_u32_b32 v0, -1, 0 s_mov_b32 s3, 128
v_mbcnt_hi_u32_b32 v0, -1, v0 s_nop 0
v_mul_u32_u24 v0, 4, v0 s_nop 0
s_nop 0
s_and_b32 m0, s_wave_size, 1 L_SAVE_LDS_LOOP_W32:
s_cmp_eq_u32 m0, 1 ds_read_b32 v1, v0
s_mov_b32 m0, 0x0 s_waitcnt 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
L_SAVE_LDS_LOOP_W32: s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
if (SAVE_LDS) s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
ds_read_b32 v1, v0 v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
s_waitcnt 0 //ensure data ready s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
//buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10
end s_branch L_SAVE_LDS_DONE
s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes L_SAVE_LDS_W64:
v_add_nc_u32 v0, v0, 128 s_mov_b32 s3, 256
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 s_nop 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? s_nop 0
s_branch L_SAVE_SGPR s_nop 0
L_SAVE_LDS_LOOP_W64:
L_SAVE_LDS_LOOP_W64: ds_read_b32 v1, v0
if (SAVE_LDS) s_waitcnt 0
ds_read_b32 v1, v0 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_waitcnt 0 //ensure data ready
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
//buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
end v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
v_add_nc_u32 v0, v0, 256
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 L_SAVE_LDS_DONE:
L_SAVE_LDS_DONE:
	/* save VGPRs - set the Rest VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32	exec_lo, 0xFFFFFFFF			//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)		// for the rest VGPRs
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)		// for the rest VGPRs
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2	//Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	s_mov_b32	s_save_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4					//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END
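	// Note: the burst loop below starts at VGPR index 4, and s_save_mem_offset
	// above skips 4 registers' worth of lanes (128*4 or 256*4 bytes); v0-v3
	// are still in use as scratch at this point and are presumably stored
	// separately, at the start of the VGPR save area, earlier in the handler
	// (hence the "Rest VGPRs" wording).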
L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32	v0, v0					//v0 = v[0+m0]
	v_movrels_b32	v1, v1					//v1 = v[1+m0]
	v_movrels_b32	v2, v2					//v2 = v[2+m0]
	v_movrels_b32	v3, v3					//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

	s_add_u32	m0, m0, 4				//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4	//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size			//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP			//VGPR save is complete?

	s_branch	L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4					//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32	v0, v0					//v0 = v[0+m0]
	v_movrels_b32	v1, v1					//v1 = v[1+m0]
	v_movrels_b32	v2, v2					//v2 = v[2+m0]
	v_movrels_b32	v3, v3					//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

	s_add_u32	m0, m0, 4				//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4	//every buffer_store_dword does 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size			//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP			//VGPR save is complete?

	//Below part will be the save shared vgpr part (new for gfx10)
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_SAVE_VGPR_END				//no shared_vgpr used?
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3	//Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
	//save shared_vgpr will start from the index of m0
	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32	v0, v0					//v0 = v[0+m0]
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	s_add_u32	m0, m0, 1				//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
	s_cmp_lt_u32	m0, s_save_alloc_size			//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP		//SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
	s_branch	L_END_PGM
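	// Note: the "dw burst" gathers four consecutive VGPRs through v_movrels
	// (indexed relative to m0) and issues four buffer_store_dword ops per
	// iteration, so each VGPR occupies one lane-stride slot in the save area:
	// 128 bytes in wave32 and 256 bytes in wave64, matching the offset:128*n
	// and offset:256*n operands above.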
L_RESTORE:
	/* Setup Resource Constants */
	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo	//base_addr_lo
	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_restore_buf_rsrc2, 0			//NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or ATC
	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or MTYPE
	//determine it is wave32 or wave64
	s_getreg_b32	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
	s_lshl_b32	s_restore_size, s_restore_size, S_WAVE_SIZE
	s_or_b32	s_restore_size, s_restore_spi_init_hi, s_restore_size

	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_RESTORE_VGPR
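	// Note: the wave64 bit from IB_STS2 is shifted up to bit S_WAVE_SIZE and
	// OR'd into s_restore_size together with s_restore_spi_init_hi, so one
	// SGPR carries both the SPI init flags and the wave size from here on.
	// s_and_b32 sets SCC when its result is non-zero, so the scc0 branch
	// above skips the LDS restore for every wave but the first in the
	// threadgroup.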
	/* restore LDS */
L_RESTORE_LDS:
	s_mov_b32	exec_lo, 0xFFFFFFFF			//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_RESTORE_VGPR				//no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6	//LDS size in dwords = lds_size * 64dw
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2	//LDS size in bytes
	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size	//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
	s_add_u32	m0, m0, 128				// 128 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128	//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size		//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32			//LDS restore is complete?
	s_branch	L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
	s_add_u32	m0, m0, 256				// 256 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256	//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size		//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64			//LDS restore is complete?
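	// Note: the lds:1 modifier makes buffer_load_dword deposit the fetched
	// dwords directly into LDS instead of a VGPR, so no staging register or
	// explicit ds_write is needed on this path; m0 appears to serve double
	// duty here, as the LDS write base used by the lds:1 loads and as the
	// byte counter checked against the allocation size.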
	/* restore VGPRs */
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32	s_restore_mem_offset, 0x0
	s_mov_b32	exec_lo, 0xFFFFFFFF			//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2	//Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset	// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32	m0, 4					//VGPR initial index value = 4

L_RESTORE_VGPR_WAVE32_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0					//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4				//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size		//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP		//VGPR restore (except v0) is complete?

	/* VGPR restore on v0 */
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3

	s_branch	L_RESTORE_SGPR
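	// Note: v0 supplies the per-lane address for every buffer_load_dword in
	// this section, so v0-v3 cannot be overwritten while the loop is still
	// running; the loop therefore restores v4 upward via v_movreld (relative
	// to m0), and v0-v3 are loaded last, from the saved base offset, once no
	// further addressing through v0 is needed.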
L_RESTORE_VGPR_WAVE64:
	s_mov_b32	s_restore_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset	// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32	m0, 4					//VGPR initial index value = 4

L_RESTORE_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0					//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4				//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size		//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP		//VGPR restore (except v0) is complete?

	//Below part will be the restore shared vgpr part (new for gfx10)
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_RESTORE_V0				//no shared_vgpr used?
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3	//Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
	//restore shared_vgpr will start from the index of m0
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0					//v[0+m0] = v0
	s_add_u32	m0, m0, 1				//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
	s_cmp_lt_u32	m0, s_restore_alloc_size		//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP	//VGPR restore (except v0) is complete?

	s_mov_b32	exec_hi, 0xFFFFFFFF			//restore back exec_hi before restoring V0!!

	/* VGPR restore on v0 */
L_RESTORE_V0:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
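	// Note: shared VGPRs are a wave64-only gfx10 feature, allocated in groups
	// of 8 (hence the shift by 3 above); each holds 32 lanes of data, which is
	// why the loop runs with exec set to 0xFFFFFFFF:0 and advances the memory
	// offset by only 128 bytes (32 lanes * 4) per register.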
	/* restore SGPRs */
	//will be 2+8+16*6
	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 22*4	//s106~s127 is not saved
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 2*4	// restore SGPR from S[n] to S[0], by 2 sgprs group

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	s_mov_b32	m0, s_sgpr_save_num

	read_2sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 2				// Restore from S[n] to S[0]
	s_nop		0					// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0					//s[0+m0] = s0

	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 8				// Restore from S[n] to S[0]
	s_nop		0					// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0					//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6

L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 16				// Restore from S[n] to S[0]
	s_nop		0					// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0					//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6
	s_movreld_b64	s8, s8
	s_movreld_b64	s10, s10
	s_movreld_b64	s12, s12
	s_movreld_b64	s14, s14

	s_cmp_eq_u32	m0, 0					//scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0	L_RESTORE_SGPR_LOOP
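	// Note on the arithmetic: s_sgpr_save_num is 106 here (s0..s105, since
	// s106~s127 are not saved), which decomposes as 2 + 8 + 16*6 per the
	// "//will be 2+8+16*6" comment: one 2-dword read, one 8-dword read, then
	// six 16-dword reads, restored from the highest saved index down to s0.
	// Each s_sub_u32 on m0 is followed by s_nop to cover the SALU-writes-M0
	// to s_movreld hazard.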
	/* restore HW registers */
L_RESTORE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000		//NUM_RECORDS in bytes

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)

	s_waitcnt	lgkmcnt(0)				//from now on, it is safe to restore STATUS and IB_STS

	s_mov_b32	s_restore_tmp, s_restore_pc_hi
	s_and_b32	s_restore_pc_hi, s_restore_tmp, 0x0000ffff	//pc[47:32] //Do it here in order not to affect STATUS

	s_mov_b32	m0, s_restore_m0
	s_mov_b32	exec_lo, s_restore_exec_lo
	s_mov_b32	exec_hi, s_restore_exec_hi
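	// Note: all nine read_hwreg_from_mem calls are issued back to back and
	// covered by a single s_waitcnt lgkmcnt(0), letting the SMEM loads
	// overlap. PC_HI is copied to s_restore_tmp first because its upper bits
	// also carry the saved RCNT/FIRST_REPLAY/REPLAY_W64H fields, which are
	// unpacked into IB_STS below.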
	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode

	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
	s_mov_b32	s_restore_mode, 0x0
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0

	s_and_b32	s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
	s_setreg_b32	hwreg(HW_REG_IB_STS), s_restore_mode

	s_and_b64	exec, exec, exec			// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc				// Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status	// SCC is included, which is changed by previous salu
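	// Note: the self-ANDs above recompute STATUS.EXECZ from the just-restored
	// exec mask and STATUS.VCCZ from vcc; both zero flags are read-only
	// through s_setreg_b32, so they have to be refreshed this way before
	// STATUS itself is written.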
	s_barrier						//barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG

	s_rfe_b64	s_restore_pc_lo				//Return to the main shader program and resume execution

L_END_PGM:
	s_endpgm
end
/**************************************************************************/
/* the helper functions */
/**************************************************************************/
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
	s_mov_b32	exec_lo, m0
	s_mov_b32	m0, s_mem_offset
	s_buffer_store_dword	s, s_rsrc, m0 glc:1
	s_add_u32	s_mem_offset, s_mem_offset, 4
	s_mov_b32	m0, exec_lo
end

function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
	s_buffer_store_dwordx4	s[12], s_rsrc, 48 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
end

function write_10sgpr_to_mem(s, s_rsrc, s_mem_offset)
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx2	s[8], s_rsrc, 32 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
end
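	// Note: write_16sgpr_to_mem and write_10sgpr_to_mem store at fixed
	// offsets 0/16/32/48 and then advance the buffer descriptor's base
	// address (s_rsrc[0], with carry into s_rsrc[1]) rather than the soffset.
	// write_hwreg_to_mem instead stashes m0 in exec_lo so that m0 can supply
	// the SMEM store offset, presumably safe because exec has already been
	// saved by the time this helper runs.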
function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
s_add_u32 s_mem_offset, s_mem_offset, 4
end end
function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset glc:1
	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset glc:1
	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
end

function read_2sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dwordx2	s, s_rsrc, s_mem_offset glc:1
	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
end
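	// Note: the subtraction in each reader positions s_mem_offset for the
	// next, lower group rather than undoing the current read: after the
	// 2-dword read the next group holds 8 dwords (so subtract 4*8), and after
	// the 8-dword and 16-dword reads the next group holds 16 dwords (so
	// subtract 4*16), matching the descending 2 + 8 + 16*6 restore order.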
function get_lds_size_bytes(s_lds_size_byte)
s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
end
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
s_lshr_b32 m0, s_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SHIFT_W64
s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value)
s_branch L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value)
L_SHIFT_DONE:
end
function get_svgpr_size_bytes(s_svgpr_size_byte)
s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
function get_sgpr_size_bytes
return 512
end
function get_hwreg_size_bytes
return 128
end
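	// Note on the fixed sizes: get_sgpr_size_bytes() = 512 reserves room for
	// a full 128 SGPRs (128 * 4 bytes) even though only 106 are written, and
	// get_hwreg_size_bytes() = 128 reserves 32 dwords for the handful of
	// hardware registers actually saved. get_vgpr_size_bytes computes
	// (vgpr_size + 1) * 4 registers * lanes * 4 bytes via the (2+7) wave32
	// or (2+8) wave64 shift.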