Commit 385093fd authored by Jay Cornwall, committed by Alex Deucher

drm/amdkfd: Move trap handler coherence flags to preprocessor

No functional change. Preparation for gfx12 support.

v2: drop unrelated change (Alex)
Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 90e4fc83
...@@ -46,6 +46,9 @@ ...@@ -46,6 +46,9 @@
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger #define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
#define S_COHERENCE glc:1
#define V_COHERENCE slc:1 glc:1
var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
...@@ -298,15 +301,15 @@ L_FETCH_2ND_TRAP: ...@@ -298,15 +301,15 @@ L_FETCH_2ND_TRAP:
s_or_b32 ttmp15, ttmp15, 0xFFFF0000 s_or_b32 ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA: L_NO_SIGN_EXTEND_TMA:
s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE // debug trap enabled flag
s_waitcnt lgkmcnt(0) s_waitcnt lgkmcnt(0)
s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
s_or_b32 ttmp11, ttmp11, ttmp2 s_or_b32 ttmp11, ttmp11, ttmp2
s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE // second-level TBA
s_waitcnt lgkmcnt(0) s_waitcnt lgkmcnt(0)
s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE // second-level TMA
s_waitcnt lgkmcnt(0) s_waitcnt lgkmcnt(0)
s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
...@@ -399,7 +402,7 @@ L_SLEEP: ...@@ -399,7 +402,7 @@ L_SLEEP:
s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0xFFFFFFFF s_mov_b32 exec_hi, 0xFFFFFFFF
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1 global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE
v_mov_b32 v0, 0x0 v_mov_b32 v0, 0x0
s_mov_b32 exec_lo, s_save_ttmps_lo s_mov_b32 exec_lo, s_save_ttmps_lo
s_mov_b32 exec_hi, s_save_ttmps_hi s_mov_b32 exec_hi, s_save_ttmps_hi
...@@ -431,15 +434,15 @@ L_SLEEP: ...@@ -431,15 +434,15 @@ L_SLEEP:
s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_lo, 0x3FFF
s_mov_b32 exec_hi, 0x0 s_mov_b32 exec_hi, 0x0
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1 global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE
v_readlane_b32 ttmp14, v0, 0xE v_readlane_b32 ttmp14, v0, 0xE
v_readlane_b32 ttmp15, v0, 0xF v_readlane_b32 ttmp15, v0, 0xF
s_mov_b32 exec_lo, ttmp14 s_mov_b32 exec_lo, ttmp14
s_mov_b32 exec_hi, ttmp15 s_mov_b32 exec_hi, ttmp15
#else #else
s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE
s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE
s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE
#endif #endif
/* setup Resource Contants */ /* setup Resource Contants */
...@@ -488,11 +491,11 @@ L_SAVE_FIRST_VGPRS32_WITH_TCP: ...@@ -488,11 +491,11 @@ L_SAVE_FIRST_VGPRS32_WITH_TCP:
#endif #endif
#if !NO_SQC_STORE #if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif #endif
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
s_branch L_SAVE_HWREG s_branch L_SAVE_HWREG
L_SAVE_4VGPR_WAVE64: L_SAVE_4VGPR_WAVE64:
...@@ -511,11 +514,11 @@ L_SAVE_FIRST_VGPRS64_WITH_TCP: ...@@ -511,11 +514,11 @@ L_SAVE_FIRST_VGPRS64_WITH_TCP:
#endif #endif
#if !NO_SQC_STORE #if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif #endif
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3
/* save HW registers */ /* save HW registers */
...@@ -562,7 +565,7 @@ L_SAVE_HWREG: ...@@ -562,7 +565,7 @@ L_SAVE_HWREG:
// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
s_mov_b32 exec_lo, 0xFFFF s_mov_b32 exec_lo, 0xFFFF
s_mov_b32 exec_hi, 0x0 s_mov_b32 exec_hi, 0x0
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_lo, 0xFFFFFFFF
...@@ -605,7 +608,7 @@ L_SAVE_SGPR_LOOP: ...@@ -605,7 +608,7 @@ L_SAVE_SGPR_LOOP:
s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled?
s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
s_mov_b32 ttmp13, 0x0 s_mov_b32 ttmp13, 0x0
v_mov_b32 v2, 0x0 v_mov_b32 v2, 0x0
...@@ -626,7 +629,7 @@ L_SAVE_SGPR_SKIP_TCP_STORE: ...@@ -626,7 +629,7 @@ L_SAVE_SGPR_SKIP_TCP_STORE:
write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
#if NO_SQC_STORE #if NO_SQC_STORE
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#else #else
// restore s_save_buf_rsrc0,1 // restore s_save_buf_rsrc0,1
s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
...@@ -709,7 +712,7 @@ L_SAVE_LDS_WITH_TCP_W32: ...@@ -709,7 +712,7 @@ L_SAVE_LDS_WITH_TCP_W32:
L_SAVE_LDS_LOOP_W32: L_SAVE_LDS_LOOP_W32:
ds_read_b32 v1, v0 ds_read_b32 v1, v0
s_waitcnt 0 s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
...@@ -747,7 +750,7 @@ L_SAVE_LDS_WITH_TCP_W64: ...@@ -747,7 +750,7 @@ L_SAVE_LDS_WITH_TCP_W64:
L_SAVE_LDS_LOOP_W64: L_SAVE_LDS_LOOP_W64:
ds_read_b32 v1, v0 ds_read_b32 v1, v0
s_waitcnt 0 s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
...@@ -814,10 +817,10 @@ L_SAVE_VGPR_W32_LOOP: ...@@ -814,10 +817,10 @@ L_SAVE_VGPR_W32_LOOP:
v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
...@@ -859,10 +862,10 @@ L_SAVE_VGPR_W64_LOOP: ...@@ -859,10 +862,10 @@ L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3
s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
...@@ -899,7 +902,7 @@ L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC: ...@@ -899,7 +902,7 @@ L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
L_SAVE_SHARED_VGPR_WAVE64_LOOP: L_SAVE_SHARED_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v0, v0 //v0 = v[0+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
...@@ -1017,10 +1020,10 @@ L_RESTORE_VGPR_NORMAL: ...@@ -1017,10 +1020,10 @@ L_RESTORE_VGPR_NORMAL:
s_cbranch_scc0 L_RESTORE_SGPR s_cbranch_scc0 L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE32_LOOP: L_RESTORE_VGPR_WAVE32_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
s_waitcnt vmcnt(0) s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1 v_movreld_b32 v1, v1
...@@ -1032,10 +1035,10 @@ L_RESTORE_VGPR_WAVE32_LOOP: ...@@ -1032,10 +1035,10 @@ L_RESTORE_VGPR_WAVE32_LOOP:
s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
/* VGPR restore on v0 */ /* VGPR restore on v0 */
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
s_waitcnt vmcnt(0) s_waitcnt vmcnt(0)
s_branch L_RESTORE_SGPR s_branch L_RESTORE_SGPR
...@@ -1051,10 +1054,10 @@ L_RESTORE_VGPR_WAVE64: ...@@ -1051,10 +1054,10 @@ L_RESTORE_VGPR_WAVE64:
s_cbranch_scc0 L_RESTORE_SHARED_VGPR s_cbranch_scc0 L_RESTORE_SHARED_VGPR
L_RESTORE_VGPR_WAVE64_LOOP: L_RESTORE_VGPR_WAVE64_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
s_waitcnt vmcnt(0) s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1 v_movreld_b32 v1, v1
...@@ -1077,7 +1080,7 @@ L_RESTORE_SHARED_VGPR: ...@@ -1077,7 +1080,7 @@ L_RESTORE_SHARED_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000 s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP: L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
s_waitcnt vmcnt(0) s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 m0, m0, 1 //next vgpr index
...@@ -1089,10 +1092,10 @@ L_RESTORE_SHARED_VGPR_WAVE64_LOOP: ...@@ -1089,10 +1092,10 @@ L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
/* VGPR restore on v0 */ /* VGPR restore on v0 */
L_RESTORE_V0: L_RESTORE_V0:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
s_waitcnt vmcnt(0) s_waitcnt vmcnt(0)
/* restore SGPRs */ /* restore SGPRs */
...@@ -1207,9 +1210,9 @@ L_RESTORE_HWREG: ...@@ -1207,9 +1210,9 @@ L_RESTORE_HWREG:
s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
s_waitcnt lgkmcnt(0) s_waitcnt lgkmcnt(0)
#if HAVE_XNACK #if HAVE_XNACK
...@@ -1247,7 +1250,7 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) ...@@ -1247,7 +1250,7 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#else #else
s_mov_b32 exec_lo, m0 s_mov_b32 exec_lo, m0
s_mov_b32 m0, s_mem_offset s_mov_b32 m0, s_mem_offset
s_buffer_store_dword s, s_rsrc, m0 glc:1 s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE
s_add_u32 s_mem_offset, s_mem_offset, 4 s_add_u32 s_mem_offset, s_mem_offset, 4
s_mov_b32 m0, exec_lo s_mov_b32 m0, exec_lo
#endif #endif
...@@ -1262,10 +1265,10 @@ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) ...@@ -1262,10 +1265,10 @@ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
s_add_u32 ttmp13, ttmp13, 0x1 s_add_u32 ttmp13, ttmp13, 0x1
end end
#else #else
s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE
s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE
s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE
s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE
s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif #endif
...@@ -1279,32 +1282,32 @@ function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) ...@@ -1279,32 +1282,32 @@ function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
s_add_u32 ttmp13, ttmp13, 0x1 s_add_u32 ttmp13, ttmp13, 0x1
end end
#else #else
s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE
s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE
s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE
s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_add_u32 s_rsrc[0], s_rsrc[0], 4*12
s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif #endif
end end
function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE
s_add_u32 s_mem_offset, s_mem_offset, 4 s_add_u32 s_mem_offset, s_mem_offset, 4
end end
function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*16 s_sub_u32 s_mem_offset, s_mem_offset, 4*16
s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE
end end
function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*8 s_sub_u32 s_mem_offset, s_mem_offset, 4*8
s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1 s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE
end end
function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*4 s_sub_u32 s_mem_offset, s_mem_offset, 4*4
s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE
end end
#if SAVE_AFTER_XNACK_ERROR #if SAVE_AFTER_XNACK_ERROR
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment