Commit 14328aa5 authored by Philip Cox, committed by Alex Deucher

drm/amdkfd: Add navi10 support to amdkfd. (v3)

KFD (kernel fusion driver) is the kernel driver
for the compute backend of the usermode compute
stack.

v2: squash in updates (Alex)
v3: squash in rebase fixes (Alex)
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Signed-off-by: Philip Cox <Philip.Cox@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e0d07657
@@ -693,6 +693,14 @@ MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (defau
bool hws_gws_support;
module_param(hws_gws_support, bool, 0444);
MODULE_PARM_DESC(hws_gws_support, "MEC FW support gws barriers (false = not supported (Default), true = supported)");
/**
* DOC: queue_preemption_timeout_ms (int)
* queue preemption timeout in ms (1 = Minimum, 9000 = default)
*/
int queue_preemption_timeout_ms;
module_param(queue_preemption_timeout_ms, int, 0644);
MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)");
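/*
 * A minimal usage sketch (assuming this parameter is exposed by the amdgpu
 * module at load time, like the surrounding parameters):
 *   modprobe amdgpu queue_preemption_timeout_ms=2000
 */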
#endif
/**
@@ -36,16 +36,19 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
$(AMDKFD_PATH)/kfd_kernel_queue.o \
$(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
$(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
$(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
$(AMDKFD_PATH)/kfd_kernel_queue_v10.o \
$(AMDKFD_PATH)/kfd_packet_manager.o \
$(AMDKFD_PATH)/kfd_process_queue_manager.o \
$(AMDKFD_PATH)/kfd_device_queue_manager.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
$(AMDKFD_PATH)/kfd_interrupt.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
@@ -561,3 +561,302 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0xbf8a0000, 0x95806f6c,
0xbf810000, 0x00000000,
};
static const uint32_t cwsr_trap_gfx10_hex[] = {
0xbf820001, 0xbf82012e,
0xb0804004, 0xb970f802,
0x8a708670, 0xb971f803,
0x8771ff71, 0x00000400,
0xbf850008, 0xb971f803,
0x8771ff71, 0x000001ff,
0xbf850001, 0x806c846c,
0x876dff6d, 0x0000ffff,
0xbe80226c, 0xb971f803,
0x8771ff71, 0x00000100,
0xbf840006, 0xbef60380,
0xb9f60203, 0x876dff6d,
0x0000ffff, 0x80ec886c,
0x82ed806d, 0xbef60380,
0xb9f60283, 0xb973f816,
0xb9762c07, 0x8f769c76,
0x886d766d, 0xb97603c7,
0x8f769b76, 0x886d766d,
0xb976f807, 0x8776ff76,
0x00007fff, 0xb9f6f807,
0xbeee037e, 0xbeef037f,
0xbefe0480, 0xbf900004,
0xbf8e0002, 0xbf88fffe,
0xbef4037e, 0x8775ff7f,
0x0000ffff, 0x8875ff75,
0x00040000, 0xbef60380,
0xbef703ff, 0x00807fac,
0x8776ff7f, 0x08000000,
0x90768376, 0x88777677,
0x8776ff7f, 0x70000000,
0x90768176, 0x88777677,
0xbefb037c, 0xbefa0380,
0xb97202dc, 0x8872727f,
0xbefe03c1, 0x877c8172,
0xbf06817c, 0xbf850002,
0xbeff0380, 0xbf820001,
0xbeff03c1, 0xb9712a05,
0x80718171, 0x8f718271,
0x877c8172, 0xbf06817c,
0xbf85000d, 0x8f768771,
0xbef603ff, 0x01000000,
0xbefc0380, 0x7e008700,
0xe0704000, 0x7a5d0000,
0x807c817c, 0x807aff7a,
0x00000080, 0xbf0a717c,
0xbf85fff8, 0xbf82001b,
0x8f768871, 0xbef603ff,
0x01000000, 0xbefc0380,
0x7e008700, 0xe0704000,
0x7a5d0000, 0x807c817c,
0x807aff7a, 0x00000100,
0xbf0a717c, 0xbf85fff8,
0xb9711e06, 0x8771c171,
0xbf84000c, 0x8f718371,
0x80717c71, 0xbefe03c1,
0xbeff0380, 0x7e008700,
0xe0704000, 0x7a5d0000,
0x807c817c, 0x807aff7a,
0x00000080, 0xbf0a717c,
0xbf85fff8, 0xbf8a0000,
0x8776ff72, 0x04000000,
0xbf84002b, 0xbefe03c1,
0x877c8172, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb9714306, 0x8771c171,
0xbf840021, 0x8f718671,
0x8f718271, 0xbef60371,
0xbef603ff, 0x01000000,
0xd7650000, 0x000100c1,
0xd7660000, 0x000200c1,
0x16000084, 0x877c8172,
0xbf06817c, 0xbefc0380,
0xbf85000a, 0x807cff7c,
0x00000080, 0x807aff7a,
0x00000080, 0xd5250000,
0x0001ff00, 0x00000080,
0xbf0a717c, 0xbf85fff7,
0xbf820009, 0x807cff7c,
0x00000100, 0x807aff7a,
0x00000100, 0xd5250000,
0x0001ff00, 0x00000100,
0xbf0a717c, 0xbf85fff7,
0x877c8172, 0xbf06817c,
0xbf850003, 0x8f7687ff,
0x0000006a, 0xbf820002,
0x8f7688ff, 0x0000006a,
0xbef603ff, 0x01000000,
0x877c8172, 0xbf06817c,
0xbefc0380, 0xbf800000,
0xbf85000b, 0xbe802e00,
0x7e000200, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x807c817c,
0xbf0aff7c, 0x0000006a,
0xbf85fff6, 0xbf82000a,
0xbe802e00, 0x7e000200,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x807c817c, 0xbf0aff7c,
0x0000006a, 0xbf85fff6,
0xbef60384, 0xbef603ff,
0x01000000, 0x877c8172,
0xbf06817c, 0xbf850030,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e00026c,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e00026d, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e00026e,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e00026f, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0x7e000270,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0xb971f803, 0x7e000271,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000080,
0x7e000273, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0xb97bf801,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000080, 0xbf82002f,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e00026c,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e00026d, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e00026e,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e00026f, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0x7e000270,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0xb971f803, 0x7e000271,
0xe0704000, 0x7a5d0000,
0x807aff7a, 0x00000100,
0x7e000273, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0xb97bf801,
0x7e00027b, 0xe0704000,
0x7a5d0000, 0x807aff7a,
0x00000100, 0xbf820119,
0xbef4037e, 0x8775ff7f,
0x0000ffff, 0x8875ff75,
0x00040000, 0xbef60380,
0xbef703ff, 0x00807fac,
0x8772ff7f, 0x08000000,
0x90728372, 0x88777277,
0x8772ff7f, 0x70000000,
0x90728172, 0x88777277,
0xb97902dc, 0x8879797f,
0xbef80380, 0xbefe03c1,
0x877c8179, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb96f2a05, 0x806f816f,
0x8f6f826f, 0x877c8179,
0xbf06817c, 0xbf850013,
0x8f76876f, 0xbef603ff,
0x01000000, 0xbef20378,
0x8078ff78, 0x00000080,
0xbefc0381, 0xe0304000,
0x785d0000, 0xbf8c3f70,
0x7e008500, 0x807c817c,
0x8078ff78, 0x00000080,
0xbf0a6f7c, 0xbf85fff7,
0xe0304000, 0x725d0000,
0xbf820023, 0x8f76886f,
0xbef603ff, 0x01000000,
0xbef20378, 0x8078ff78,
0x00000100, 0xbefc0381,
0xe0304000, 0x785d0000,
0xbf8c3f70, 0x7e008500,
0x807c817c, 0x8078ff78,
0x00000100, 0xbf0a6f7c,
0xbf85fff7, 0xb96f1e06,
0x876fc16f, 0xbf84000e,
0x8f6f836f, 0x806f7c6f,
0xbefe03c1, 0xbeff0380,
0xe0304000, 0x785d0000,
0xbf8c3f70, 0x7e008500,
0x807c817c, 0x8078ff78,
0x00000080, 0xbf0a6f7c,
0xbf85fff7, 0xbeff03c1,
0xe0304000, 0x725d0000,
0x8772ff79, 0x04000000,
0xbf840020, 0xbefe03c1,
0x877c8179, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb96f4306, 0x876fc16f,
0xbf840016, 0x8f6f866f,
0x8f6f826f, 0xbef6036f,
0xbef603ff, 0x01000000,
0x877c8172, 0xbf06817c,
0xbefc0380, 0xbf850007,
0x807cff7c, 0x00000080,
0x8078ff78, 0x00000080,
0xbf0a6f7c, 0xbf85fffa,
0xbf820006, 0x807cff7c,
0x00000100, 0x8078ff78,
0x00000100, 0xbf0a6f7c,
0xbf85fffa, 0x877c8179,
0xbf06817c, 0xbf850003,
0x8f7687ff, 0x0000006a,
0xbf820002, 0x8f7688ff,
0x0000006a, 0xbef603ff,
0x01000000, 0x877c8179,
0xbf06817c, 0xbf850012,
0xf4211cba, 0xf0000000,
0x8078ff78, 0x00000080,
0xbefc0381, 0xf421003a,
0xf0000000, 0x8078ff78,
0x00000080, 0xbf8cc07f,
0xbe803000, 0xbf800000,
0x807c817c, 0xbf0aff7c,
0x0000006a, 0xbf85fff5,
0xbe800372, 0xbf820011,
0xf4211cba, 0xf0000000,
0x8078ff78, 0x00000100,
0xbefc0381, 0xf421003a,
0xf0000000, 0x8078ff78,
0x00000100, 0xbf8cc07f,
0xbe803000, 0xbf800000,
0x807c817c, 0xbf0aff7c,
0x0000006a, 0xbf85fff5,
0xbe800372, 0xbef60384,
0xbef603ff, 0x01000000,
0x877c8179, 0xbf06817c,
0xbf850025, 0xf4211bfa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211b3a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211b7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211eba,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211efa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211c3a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211c7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211cfa,
0xf0000000, 0x8078ff78,
0x00000080, 0xf4211e7a,
0xf0000000, 0x8078ff78,
0x00000080, 0xbf820024,
0xf4211bfa, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211b3a, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211b7a, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211eba, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211efa, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211c3a, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211c7a, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211cfa, 0xf0000000,
0x8078ff78, 0x00000100,
0xf4211e7a, 0xf0000000,
0x8078ff78, 0x00000100,
0xbf8cc07f, 0x876dff6d,
0x0000ffff, 0xbefc036f,
0xbefe037a, 0xbeff037b,
0x876f71ff, 0x000003ff,
0xb9ef4803, 0xb9f3f816,
0x876f71ff, 0xfffff800,
0x906f8b6f, 0xb9efa2c3,
0xb9f9f801, 0x876fff6d,
0xf0000000, 0x906f9c6f,
0x8f6f906f, 0xbef20380,
0x88726f72, 0x876fff6d,
0x08000000, 0x906f9b6f,
0x8f6f8f6f, 0x88726f72,
0x876fff70, 0x00800000,
0x906f976f, 0xb9f2f807,
0xb9f0f802, 0xbf8a0000,
0xbe80226c, 0xbf810000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0x00000000,
};
/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
shader main
asic(DEFAULT)
type(CS)
wave_size(32)
/*************************************************************************/
/* control on how to run the shader */
/*************************************************************************/
//any hack needed to run this code in EMU (either because various EMU code is not ready, or there is no compute save & restore in the EMU run)
var EMU_RUN_HACK = 0
var EMU_RUN_HACK_RESTORE_NORMAL = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS = 0
var WG_BASE_ADDR_LO = 0x9000a000
var WG_BASE_ADDR_HI = 0x0
var WAVE_SPACE = 0x9000 //memory size each wave occupies in workgroup state mem; increased from 0x5000 to 0x9000 because more SGPRs need to be saved
var CTX_SAVE_CONTROL = 0x0
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
var SIM_RUN_HACK = 0 //any hack needed to run this code in SIM (either because various RTL code is not ready, or there is no compute save & restore in the RTL run)
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
var SAVE_RESTORE_HWID_DDID = 0
var RESTORE_DDID_IN_SGPR18 = 0
/**************************************************************************/
/* variables */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
/* Save */
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp4
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
var s_wave_size = ttmp6 //ttmp6 is free now, since the xnack mask is only 32 bits; also used to determine wave32 vs wave64 in EMU_HACK
var s_save_xnack_mask = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp14
var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0 = ttmp15
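//A rough sketch of the per-wave save-buffer layout implied by the save
//routine below (offsets advance in this order; sizes depend on wave32/64):
//  VGPRs -> shared VGPRs (gfx10) -> LDS (first wave in TG only) -> SGPRs -> HW regs [-> DDID if enabled]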
/* Restore */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp6
var s_restore_mem_offset_save = s_restore_tmp //no conflict
var s_restore_m0 = s_restore_alloc_size //no conflict
var s_restore_mode = ttmp13
var s_restore_hwid1 = ttmp2
var s_restore_ddid = s_restore_hwid1
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp14
var s_restore_exec_hi = ttmp15
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
//var s_restore_xnack_mask_lo = xnack_mask_lo
//var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_xnack_mask = ttmp7
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp13 //ttmp13 has no conflict
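//The restore routine below reads this layout back in the same order the save
//routine wrote it: VGPRs, shared VGPRs, LDS, SGPRs, then HW registers.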
/**************************************************************************/
/* trap handler entry points */
/**************************************************************************/
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
else
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
end
L_JUMP_TO_RESTORE:
s_branch L_RESTORE //restore
L_SKIP_RESTORE:
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
s_cbranch_scc1 L_SAVE //this is the operation for save
// ********* Handle non-CWSR traps *******************
if (!EMU_RUN_HACK)
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
L_EXCP_CASE:
s_and_b32 ttmp1, ttmp1, 0xFFFF
s_rfe_b64 [ttmp0, ttmp1]
end
// ********* End handling of non-CWSR traps *******************
/**************************************************************************/
/* save routine */
/**************************************************************************/
L_SAVE:
//check whether there is mem_viol
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
s_cbranch_scc0 L_NO_PC_REWIND
//if so, need rewind PC assuming GDS operation gets NACKed
s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
L_NO_PC_REWIND:
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
//s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
//s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
/* inform SPI the readiness and wait for SPI's go signal */
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
if (EMU_RUN_HACK)
else
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
end
L_SLEEP:
s_sleep 0x2
if (EMU_RUN_HACK)
else
s_cbranch_execz L_SLEEP
end
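//SPI acknowledges by writing a non-zero value to EXEC, which is what breaks
//the s_cbranch_execz polling loop above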
/* setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_save_tmp, v9, 0
//determine it is wave32 or wave64
s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
s_cmp_eq_u32 s_wave_size, 0
s_cbranch_scc1 L_SAVE_WAVE32
s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
s_branch L_SAVE_CON
L_SAVE_WAVE32:
s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
L_SAVE_CON:
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
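//worked shifts: ATC moves from spi_init_hi bit 27 right by (27-24)=3 into the
//rsrc ATC position; MTYPE moves from bits 30:28 right by (28-27)=1 into bits 29:27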
s_mov_b32 s_save_m0, m0 //save M0
/* global mem offset */
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi
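//s_wave_size now packs the wave64 flag in bit 0 (from IB_STS2) together with
//the spi_init_hi flags (e.g. FirstWaveInTG in bit 26); both are tested below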
/* save VGPRs */
//////////////////////////////
L_SAVE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
//for wave32 and wave64, is the VGPR-count formula the same?
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
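//e.g. vgpr_size = 3 from GPR_ALLOC gives (3 + 1) * 4 = 16 VGPRs to save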
//determine it is wave32 or wave64
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_VGPR_WAVE64
//zhenxu added this to save VGPRs for wave32
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //VGPR initial index value =0
//s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
//s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
L_SAVE_VGPR_WAVE32_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
end
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes
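//(wave32: 32 lanes x 4 bytes = 128 bytes per VGPR row, hence the 128-byte stride)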
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete?
s_branch L_SAVE_LDS
//save vgpr for wave32 ends
L_SAVE_VGPR_WAVE64:
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //VGPR initial index value =0
//s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
//s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
L_SAVE_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
end
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete?
//s_set_gpr_idx_off
//
//The part below saves the shared VGPRs (new for gfx10)
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
//save shared_vgpr will start from the index of m0
s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes (32 active lanes)
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
/* save LDS */
//////////////////////////////
L_SAVE_LDS:
//Only the first wave in the threadgroup needs to save LDS
/* the first wave in the threadgroup */
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
s_cbranch_scc0 L_SAVE_SGPR
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_SGPR
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
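//e.g. lds_size = 2 granules -> 2 * 64 dwords = 128 dwords = 512 bytes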
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
//load 0~63*4 (byte addresses) into vgpr v0
v_mbcnt_lo_u32_b32 v0, -1, 0
v_mbcnt_hi_u32_b32 v0, -1, v0
v_mul_u32_u24 v0, 4, v0
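//(v_mbcnt_lo/hi over an all-ones mask yields the lane index; times 4 it is
//each lane's byte offset into LDS)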
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W64
L_SAVE_LDS_LOOP_W32:
if (SAVE_LDS)
ds_read_b32 v1, v0
s_waitcnt 0 //ensure data ready
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
//buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //saving LDS directly to memory doesn't exist in gfx10
end
s_add_u32 m0, m0, 128 //every buffer_store_dword covers 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes
v_add_nc_u32 v0, v0, 128
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
s_branch L_SAVE_SGPR
L_SAVE_LDS_LOOP_W64:
if (SAVE_LDS)
ds_read_b32 v1, v0
s_waitcnt 0 //ensure data ready
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
//buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //saving LDS directly to memory doesn't exist in gfx10
end
s_add_u32 m0, m0, 256 //every buffer_store_dword covers 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
v_add_nc_u32 v0, v0, 256
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
/* save SGPRs */
//////////////////////////////
//s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
//s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
//s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
//s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
L_SAVE_SGPR:
//need to check whether it is wave32 or wave64
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_SGPR_VMEM_WAVE64
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
end
s_branch L_SAVE_SGPR_CONT
L_SAVE_SGPR_VMEM_WAVE64:
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
end
L_SAVE_SGPR_CONT:
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
//s_mov_b32 m0, 0x0 //SGPR initial index value =0
//s_nop 0x0 //Manually inserted wait states
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0 //SGPR initial index value =0
s_nop 0x0 //Manually inserted wait states
s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64
L_SAVE_SGPR_LOOP_WAVE32:
s_movrels_b32 s0, s0 //s0 = s[0+m0]
//zhenxu: added one more argument to the SGPR save function; this only affects the vmem path, the SQC path is unchanged
write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE32 //SGPR save is complete?
s_branch L_SAVE_HWREG
L_SAVE_SGPR_LOOP_WAVE64:
s_movrels_b32 s0, s0 //s0 = s[0+m0]
//zhenxu: added one more argument to the SGPR save function; this only affects the vmem path, the SQC path is unchanged
write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 //SGPR save is complete?
/* save HW registers */
//////////////////////////////
L_SAVE_HWREG:
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_HWREG_WAVE64
write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
end
write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
//s_save_trapsts conflicts with s_save_alloc_size
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
//write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK
//using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
if(SAVE_RESTORE_HWID_DDID)
s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
end
s_branch L_S_PGM_END_SAVED
L_SAVE_HWREG_WAVE64:
write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
end
write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
//s_save_trapsts conflicts with s_save_alloc_size
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
//write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK
//using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
if(SAVE_RESTORE_HWID_DDID)
s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
/* save DDID */
//////////////////////////////
L_SAVE_DDID:
//EXEC has been saved; no vector instructions follow
s_mov_b32 exec_lo, 0x80000000 //Set MSB to 1. Cleared when draw index is returned
s_sendmsg sendmsg(MSG_GET_DDID)
L_WAIT_DDID_LOOP:
s_nop 7 // sleep a bit
s_bitcmp0_b32 exec_lo, 31 // test to see if MSB is cleared, meaning done
s_cbranch_scc0 L_WAIT_DDID_LOOP
s_mov_b32 s_save_m0, exec_lo
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_DDID_WAVE64
write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
L_SAVE_DDID_WAVE64:
write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
end
L_S_PGM_END_SAVED:
/* S_PGM_END_SAVED */ //FIXME graphics ONLY
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
s_rfe_b64 s_save_pc_lo //Return to the main shader program
else
end
s_branch L_END_PGM
/**************************************************************************/
/* restore routine */
/**************************************************************************/
L_RESTORE:
/* Setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_restore_tmp, v9, 0
//determine it is wave32 or wave64
s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
s_cmp_eq_u32 s_restore_size, 0
s_cbranch_scc1 L_RESTORE_WAVE32
s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64
s_branch L_RESTORE_CON
L_RESTORE_WAVE32:
s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32
L_RESTORE_CON:
s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
else
end
s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
//determine it is wave32 or wave64
s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size //share s_restore_size with exec_hi
/* global mem offset */
s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
/* restore VGPRs */
//////////////////////////////
L_RESTORE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
//determine it is wave32 or wave64
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
s_mov_b32 m0, 1 //VGPR initial index value = 1
//s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
//s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later, might not need this in gfx10
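//v0 doubles as the staging register for v_movreld, so its slot is remembered
//in s_restore_mem_offset_save and v0 itself is reloaded last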
L_RESTORE_VGPR_WAVE32_LOOP:
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
end
s_waitcnt vmcnt(0) //ensure data ready
v_movreld_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
//s_set_gpr_idx_off
/* VGPR restore on v0 */
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
end
s_branch L_RESTORE_LDS
L_RESTORE_VGPR_WAVE64:
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
s_mov_b32 m0, 1 //VGPR initial index value = 1
L_RESTORE_VGPR_WAVE64_LOOP:
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
end
s_waitcnt vmcnt(0) //ensure data ready
v_movreld_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
//s_set_gpr_idx_off
//
//Below part will be the restore shared vgpr part (new for gfx10)
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? jump to L_RESTORE_V0
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
//restore shared_vgpr will start from the index of m0
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
s_waitcnt vmcnt(0) //ensure data ready
v_movreld_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes (32 active lanes)
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
/* VGPR restore on v0 */
L_RESTORE_V0:
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
end
/* restore LDS */
//////////////////////////////
L_RESTORE_LDS:
//Only need to check the first wave
/* the first wave in the threadgroup */
s_and_b32 s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
s_cbranch_scc0 L_RESTORE_SGPR
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_SGPR //no lds used? jump to L_RESTORE_SGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_wave_size, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
L_RESTORE_LDS_LOOP_W32:
if (SAVE_LDS)
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
s_waitcnt 0
end
s_add_u32 m0, m0, 128 //every buffer_load_dword does 128 bytes
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
s_branch L_RESTORE_SGPR
L_RESTORE_LDS_LOOP_W64:
if (SAVE_LDS)
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
s_waitcnt 0
end
s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?
/* restore SGPRs */
//////////////////////////////
//s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
//s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
//s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
//s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
L_RESTORE_SGPR:
//need to check whether it is wave32 or wave64
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_SGPR_VMEM_WAVE64
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
end
s_branch L_RESTORE_SGPR_CONT
L_RESTORE_SGPR_VMEM_WAVE64:
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
end
L_RESTORE_SGPR_CONT:
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_SGPR_WAVE64
read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
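//s0 is the staging register for s_movreld, so its saved value is parked in
//s_restore_tmp and only written back after the loop completes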
s_mov_b32 m0, 0x1
L_RESTORE_SGPR_LOOP_WAVE32:
read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
s_waitcnt lgkmcnt(0) //ensure data ready
s_movreld_b32 s0, s0 //s[0+m0] = s0
s_nop 0 // hazard SALU M0=> S_MOVREL
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE32 //SGPR restore (except s0) is complete?
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
s_branch L_RESTORE_HWREG
L_RESTORE_SGPR_WAVE64:
read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
s_mov_b32 m0, 0x1 //SGPR initial index value = 1 //go on with s1
L_RESTORE_SGPR_LOOP_WAVE64:
read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
s_waitcnt lgkmcnt(0) //ensure data ready
s_movreld_b32 s0, s0 //s[0+m0] = s0
s_nop 0 // hazard SALU M0=> S_MOVREL
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE64 //SGPR restore (except s0) is complete?
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
/* restore HW registers */
//////////////////////////////
L_RESTORE_HWREG:
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_HWREG_WAVE64
read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
//read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
//read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
if(SAVE_RESTORE_HWID_DDID)
read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
end
s_branch L_RESTORE_HWREG_FINISH
L_RESTORE_HWREG_WAVE64:
read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
//read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
//read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
if(SAVE_RESTORE_HWID_DDID)
read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
end
L_RESTORE_HWREG_FINISH:
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
if(SAVE_RESTORE_HWID_DDID)
L_RESTORE_DDID:
s_mov_b32 m0, s_restore_hwid1 //virtual ttrace support: the save-context handler records the SE/SA/WGP/SIMD/wave of the original wave
s_ttracedata //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_and_b32 m0, s_restore_size, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_DDID_WAVE64
read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
s_branch L_RESTORE_DDID_FINISH
L_RESTORE_DDID_WAVE64:
read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
L_RESTORE_DDID_FINISH:
s_waitcnt lgkmcnt(0)
//s_mov_b32 m0, s_restore_ddid
//s_ttracedata
if (RESTORE_DDID_IN_SGPR18)
s_mov_b32 s18, s_restore_ddid
end
end
s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask //restore xnack_mask
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
//reuse s_restore_m0 as a temp register
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
s_rfe_b64 s_restore_pc_lo // s_restore_m0[0] is used to set STATUS.inst_atc
/**************************************************************************/
/* the END */
/**************************************************************************/
L_END_PGM:
s_endpgm
end
/**************************************************************************/
/* the helper functions */
/**************************************************************************/
function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
if (use_sqc)
s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
s_mov_b32 m0, s_mem_offset
s_buffer_store_dword s, s_rsrc, m0 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 4
s_mov_b32 m0, exec_lo
elsif (use_mtbuf)
v_mov_b32 v0, s
tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format:BUF_DATA_FORMAT_32 slc:1 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 128
else
v_mov_b32 v0, s
buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 128
end
end
function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
if (use_sqc)
s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
s_mov_b32 m0, s_mem_offset
s_buffer_store_dword s, s_rsrc, m0 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 4
s_mov_b32 m0, exec_lo
elsif (use_mtbuf)
v_mov_b32 v0, s
tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format:BUF_DATA_FORMAT_32 slc:1 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 256
else
v_mov_b32 v0, s
buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 256
end
end
function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
if (use_sqc)
s_add_u32 s_mem_offset, s_mem_offset, 4
else
s_add_u32 s_mem_offset, s_mem_offset, 128
end
end
function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
if (use_sqc)
s_add_u32 s_mem_offset, s_mem_offset, 4
else
s_add_u32 s_mem_offset, s_mem_offset, 256
end
end
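The differing offset increments in these helpers (4 vs. 128 vs. 256 bytes) follow from where the dword lands: the SQC scalar path stores a single dword per SGPR, while the buffer paths store one dword per lane, so the stride is the wave size times 4 bytes. A hedged C sketch of that stride rule (helper name invented):
#include <stdbool.h>
#include <stdint.h>
/* Sketch only: bytes to advance the save/restore offset per SGPR. */
static uint32_t sgpr_mem_stride(bool use_sqc, uint32_t wave_size)
{
	/* SQC: one dword total; buffer: one dword per lane. */
	return use_sqc ? 4 : wave_size * 4;	/* 4, 128 (wave32) or 256 (wave64) */
}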
......@@ -138,6 +138,8 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
/* TODO - check & update Vega10 cache details */
#define vega10_cache_info carrizo_cache_info
#define raven_cache_info carrizo_cache_info
/* TODO - check & update Navi10 cache details */
#define navi10_cache_info carrizo_cache_info
static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
struct crat_subtype_computeunit *cu)
......@@ -666,6 +668,9 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
case CHIP_RAVEN:
pcache_info = raven_cache_info;
num_of_cache_types = ARRAY_SIZE(raven_cache_info);
break;
case CHIP_NAVI10:
pcache_info = navi10_cache_info;
num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
break;
default:
return -EINVAL;
......
......@@ -317,6 +317,23 @@ static const struct kfd_device_info vega20_device_info = {
.num_sdma_queues_per_engine = 8,
};
static const struct kfd_device_info navi10_device_info = {
.asic_family = CHIP_NAVI10,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
.doorbell_size = 8,
.ih_ring_entry_size = 8 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_v9,
.num_of_watch_points = 4,
.mqd_size_aligned = MQD_SIZE_ALIGNED,
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
};
struct kfd_deviceid {
unsigned short did;
const struct kfd_device_info *device_info;
......@@ -434,7 +451,13 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x66a3, &vega20_device_info }, /* Vega20 */
{ 0x66a4, &vega20_device_info }, /* Vega20 */
{ 0x66a7, &vega20_device_info }, /* Vega20 */
{ 0x66af, &vega20_device_info }, /* Vega20 */
/* Navi10 */
{ 0x7310, &navi10_device_info }, /* Navi10 */
{ 0x7312, &navi10_device_info }, /* Navi10 */
{ 0x7318, &navi10_device_info }, /* Navi10 */
{ 0x731a, &navi10_device_info }, /* Navi10 */
{ 0x731f, &navi10_device_info }, /* Navi10 */
};
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
......@@ -516,10 +539,14 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx8_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
} else if (kfd->device_info->asic_family < CHIP_NAVI10) {
BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx9_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
} else {
BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx10_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex);
}
kfd->cwsr_enabled = true;
......
......@@ -1264,6 +1264,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
return 0;
retval = pm_send_runlist(&dqm->packets, &dqm->queues);
pr_debug("%s sent runlist\n", __func__);
if (retval) {
pr_err("failed to execute runlist\n");
return retval;
......@@ -1301,7 +1302,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
KFD_FENCE_COMPLETED);
/* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
queue_preemption_timeout_ms);
if (retval)
return retval;
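Conceptually, the wait above is a bounded poll on a fence dword that the scheduler firmware writes back once preemption completes; a simplified userspace-style sketch of such a wait (names invented; the real amdkfd_fence_wait_timeout also yields the CPU between polls):
#include <stdint.h>
extern uint64_t now_ms(void);	/* hypothetical monotonic clock */
static int fence_wait_timeout(volatile uint32_t *fence_addr,
			      uint32_t fence_value,
			      uint32_t timeout_ms)
{
	uint64_t deadline = now_ms() + timeout_ms;

	while (*fence_addr != fence_value)
		if (now_ms() > deadline)
			return -1;	/* timed out; the kernel returns -ETIME */
	return 0;
}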
......@@ -1785,6 +1786,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
case CHIP_RAVEN:
device_queue_manager_init_v9(&dqm->asic_ops);
break;
case CHIP_NAVI10:
device_queue_manager_init_v10_navi10(&dqm->asic_ops);
break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
......@@ -1876,12 +1880,13 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
int r = 0;
r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
&dump, &n_regs);
if (!r) {
seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n",
KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
KFD_CIK_HIQ_QUEUE);
seq_reg_dump(m, dump, n_regs);
kfree(dump);
......
......@@ -31,8 +31,6 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#define KFD_UNMAP_LATENCY_MS (4000)
#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
struct device_process_node {
struct qcm_process_device *qpd;
......@@ -212,6 +210,8 @@ void device_queue_manager_init_vi_tonga(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v9(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v10_navi10(
struct device_queue_manager_asic_ops *asic_ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
unsigned int get_queues_num(struct device_queue_manager *dqm);
......
/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "kfd_device_queue_manager.h"
#include "navi10_enum.h"
#include "gc/gc_10_1_0_offset.h"
#include "gc/gc_10_1_0_sh_mask.h"
static int update_qpd_v10(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
void device_queue_manager_init_v10_navi10(
struct device_queue_manager_asic_ops *asic_ops)
{
asic_ops->update_qpd = update_qpd_v10;
asic_ops->init_sdma_vm = init_sdma_vm_v10;
asic_ops->mqd_manager_init = mqd_manager_init_v10;
}
static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
{
uint32_t shared_base = pdd->lds_base >> 48;
uint32_t private_base = pdd->scratch_base >> 48;
return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
private_base;
}
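compute_sh_mem_bases_64bit() packs bits [63:48] of the two 64-bit aperture bases into one 32-bit register. A worked example (addresses invented, and assuming SH_MEM_BASES__SHARED_BASE__SHIFT is 16, i.e. SHARED_BASE occupies the upper half):
#include <stdint.h>
#include <stdio.h>
int main(void)
{
	uint64_t lds_base     = 0x0001000000000000ULL;	/* hypothetical */
	uint64_t scratch_base = 0x0002000000000000ULL;	/* hypothetical */
	uint32_t shared_base  = lds_base >> 48;		/* 0x0001 */
	uint32_t private_base = scratch_base >> 48;	/* 0x0002 */
	uint32_t sh_mem_bases = (shared_base << 16) | private_base;

	printf("SH_MEM_BASES = 0x%08x\n", sh_mem_bases);	/* 0x00010002 */
	return 0;
}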
static int update_qpd_v10(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct kfd_process_device *pdd;
pdd = qpd_to_pdd(qpd);
/* check if sh_mem_config register already configured */
if (qpd->sh_mem_config == 0) {
qpd->sh_mem_config =
SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
#if 0
/* TODO:
* This shouldn't be an issue with Navi10. Verify.
*/
if (vega10_noretry)
qpd->sh_mem_config |=
1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
#endif
qpd->sh_mem_ape1_limit = 0;
qpd->sh_mem_ape1_base = 0;
}
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
return 0;
}
static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
/* Not needed on SDMAv4 onwards */
q->properties.sdma_vm_addr = 0;
}
......@@ -405,6 +405,7 @@ int kfd_init_apertures(struct kfd_process *process)
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
case CHIP_NAVI10:
kfd_init_apertures_v9(pdd, id);
break;
default:
......
......@@ -332,6 +332,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
case CHIP_RAVEN:
kernel_queue_init_v9(&kq->ops_asic_specific);
break;
case CHIP_NAVI10:
kernel_queue_init_v10(&kq->ops_asic_specific);
break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
......
......@@ -102,5 +102,6 @@ struct kernel_queue {
void kernel_queue_init_cik(struct kernel_queue_ops *ops);
void kernel_queue_init_vi(struct kernel_queue_ops *ops);
void kernel_queue_init_v9(struct kernel_queue_ops *ops);
void kernel_queue_init_v10(struct kernel_queue_ops *ops);
#endif /* KFD_KERNEL_QUEUE_H_ */
/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "kfd_kernel_queue.h"
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers_ai.h"
#include "kfd_pm4_opcodes.h"
#include "gc/gc_10_1_0_sh_mask.h"
static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size);
static void uninitialize_v10(struct kernel_queue *kq);
static void submit_packet_v10(struct kernel_queue *kq);
void kernel_queue_init_v10(struct kernel_queue_ops *ops)
{
ops->initialize = initialize_v10;
ops->uninitialize = uninitialize_v10;
ops->submit_packet = submit_packet_v10;
}
static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size)
{
int retval;
retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
if (retval != 0)
return false;
kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
return true;
}
static void uninitialize_v10(struct kernel_queue *kq)
{
kfd_gtt_sa_free(kq->dev, kq->eop_mem);
}
static void submit_packet_v10(struct kernel_queue *kq)
{
*kq->wptr64_kernel = kq->pending_wptr64;
write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
kq->pending_wptr64);
}
static int pm_map_process_v10(struct packet_manager *pm,
uint32_t *buffer, struct qcm_process_device *qpd)
{
struct pm4_mes_map_process *packet;
uint64_t vm_page_table_base_addr = qpd->page_table_base;
packet = (struct pm4_mes_map_process *)buffer;
memset(buffer, 0, sizeof(struct pm4_mes_map_process));
packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
sizeof(struct pm4_mes_map_process));
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 1;
packet->bitfields2.pasid = qpd->pqm->process->pasid;
packet->bitfields14.gds_size = qpd->gds_size;
packet->bitfields14.num_gws = qpd->num_gws;
packet->bitfields14.num_oac = qpd->num_oac;
packet->bitfields14.sdma_enable = 1;
packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
packet->sh_mem_config = qpd->sh_mem_config;
packet->sh_mem_bases = qpd->sh_mem_bases;
if (qpd->tba_addr) {
packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) |
upper_32_bits(qpd->tba_addr >> 8);
packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
}
packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
packet->vm_context_page_table_base_addr_lo32 =
lower_32_bits(vm_page_table_base_addr);
packet->vm_context_page_table_base_addr_hi32 =
upper_32_bits(vm_page_table_base_addr);
return 0;
}
static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer,
uint64_t ib, size_t ib_size_in_dwords, bool chain)
{
struct pm4_mes_runlist *packet;
int concurrent_proc_cnt = 0;
struct kfd_dev *kfd = pm->dqm->dev;
/* Determine the number of processes to map together to HW:
 * it cannot exceed the number of VMIDs available to the
 * scheduler, and it is determined by the smaller of the number
 * of processes in the runlist and the kfd module parameter
 * hws_max_conc_proc.
* Note: the arbitration between the number of VMIDs and
* hws_max_conc_proc has been done in
* kgd2kfd_device_init().
*/
concurrent_proc_cnt = min(pm->dqm->processes_count,
kfd->max_proc_per_quantum);
packet = (struct pm4_mes_runlist *)buffer;
memset(buffer, 0, sizeof(struct pm4_mes_runlist));
packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
sizeof(struct pm4_mes_runlist));
packet->bitfields4.ib_size = ib_size_in_dwords;
packet->bitfields4.chain = chain ? 1 : 0;
packet->bitfields4.offload_polling = 0;
packet->bitfields4.valid = 1;
packet->bitfields4.process_cnt = concurrent_proc_cnt;
packet->ordinal2 = lower_32_bits(ib);
packet->ib_base_hi = upper_32_bits(ib);
return 0;
}
static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer,
struct queue *q, bool is_static)
{
struct pm4_mes_map_queues *packet;
bool use_static = is_static;
packet = (struct pm4_mes_map_queues *)buffer;
memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
sizeof(struct pm4_mes_map_queues));
packet->bitfields2.num_queues = 1;
packet->bitfields2.queue_sel =
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
packet->bitfields2.engine_sel =
engine_sel__mes_map_queues__compute_vi;
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_compute_vi;
switch (q->properties.type) {
case KFD_QUEUE_TYPE_COMPUTE:
if (use_static)
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_latency_static_queue_vi;
break;
case KFD_QUEUE_TYPE_DIQ:
packet->bitfields2.queue_type =
queue_type__mes_map_queues__debug_interface_queue_vi;
break;
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0_vi;
use_static = false; /* no static queues under SDMA */
break;
default:
WARN(1, "queue type %d\n", q->properties.type);
return -EINVAL;
}
packet->bitfields3.doorbell_offset =
q->properties.doorbell_off;
packet->mqd_addr_lo =
lower_32_bits(q->gart_mqd_addr);
packet->mqd_addr_hi =
upper_32_bits(q->gart_mqd_addr);
packet->wptr_addr_lo =
lower_32_bits((uint64_t)q->properties.write_ptr);
packet->wptr_addr_hi =
upper_32_bits((uint64_t)q->properties.write_ptr);
return 0;
}
static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer,
enum kfd_queue_type type,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param, bool reset,
unsigned int sdma_engine)
{
struct pm4_mes_unmap_queues *packet;
packet = (struct pm4_mes_unmap_queues *)buffer;
memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
sizeof(struct pm4_mes_unmap_queues));
switch (type) {
case KFD_QUEUE_TYPE_COMPUTE:
case KFD_QUEUE_TYPE_DIQ:
packet->bitfields2.engine_sel =
engine_sel__mes_unmap_queues__compute;
break;
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel =
engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
break;
default:
WARN(1, "queue type %d\n", type);
break;
}
if (reset)
packet->bitfields2.action =
action__mes_unmap_queues__reset_queues;
else
packet->bitfields2.action =
action__mes_unmap_queues__preempt_queues;
switch (filter) {
case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
packet->bitfields2.num_queues = 1;
packet->bitfields3b.doorbell_offset0 = filter_param;
break;
case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
packet->bitfields3a.pasid = filter_param;
break;
case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__unmap_all_queues;
break;
case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
/* in this case, we do not preempt static queues */
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
break;
default:
WARN(1, "filter %d\n", filter);
break;
}
return 0;
}
static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer,
uint64_t fence_address, uint32_t fence_value)
{
struct pm4_mes_query_status *packet;
packet = (struct pm4_mes_query_status *)buffer;
memset(buffer, 0, sizeof(struct pm4_mes_query_status));
packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
sizeof(struct pm4_mes_query_status));
packet->bitfields2.context_id = 0;
packet->bitfields2.interrupt_sel =
interrupt_sel__mes_query_status__completion_status;
packet->bitfields2.command =
command__mes_query_status__fence_only_after_write_ack;
packet->addr_hi = upper_32_bits((uint64_t)fence_address);
packet->addr_lo = lower_32_bits((uint64_t)fence_address);
packet->data_hi = upper_32_bits((uint64_t)fence_value);
packet->data_lo = lower_32_bits((uint64_t)fence_value);
return 0;
}
static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer)
{
struct pm4_mec_release_mem *packet;
WARN_ON(!buffer);
packet = (struct pm4_mec_release_mem *)buffer;
memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
sizeof(struct pm4_mec_release_mem));
packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
packet->bitfields2.tcl1_action_ena = 1;
packet->bitfields2.tc_action_ena = 1;
packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
packet->bitfields3.int_sel =
int_sel__mec_release_mem__send_interrupt_after_write_confirm;
packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
packet->address_hi = upper_32_bits(gpu_addr);
packet->data_lo = 0;
return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
}
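Note the low address is stored in dword granularity, hence the >> 2 above. A small worked example (address invented):
#include <stdint.h>
#include <stdio.h>
int main(void)
{
	uint64_t gpu_addr = 0x100012345678ULL;	/* hypothetical fence address */
	uint32_t lo_32b = (uint32_t)(gpu_addr & 0xffffffff) >> 2;	/* 0x048d159e */
	uint32_t hi = (uint32_t)(gpu_addr >> 32);			/* 0x00001000 */

	printf("address_lo_32b=0x%08x address_hi=0x%08x\n", lo_32b, hi);
	return 0;
}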
const struct packet_manager_funcs kfd_v10_pm_funcs = {
.map_process = pm_map_process_v10,
.runlist = pm_runlist_v10,
.set_resources = pm_set_resources_vi,
.map_queues = pm_map_queues_v10,
.unmap_queues = pm_unmap_queues_v10,
.query_status = pm_query_status_v10,
.release_mem = pm_release_mem_v10,
.map_process_size = sizeof(struct pm4_mes_map_process),
.runlist_size = sizeof(struct pm4_mes_runlist),
.set_resources_size = sizeof(struct pm4_mes_set_resources),
.map_queues_size = sizeof(struct pm4_mes_map_queues),
.unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
.query_status_size = sizeof(struct pm4_mes_query_status),
.release_mem_size = sizeof(struct pm4_mec_release_mem)
};
/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "v10_structs.h"
#include "gc/gc_10_1_0_offset.h"
#include "gc/gc_10_1_0_sh_mask.h"
#include "amdgpu_amdkfd.h"
static inline struct v10_compute_mqd *get_mqd(void *mqd)
{
return (struct v10_compute_mqd *)mqd;
}
static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
{
return (struct v10_sdma_mqd *)mqd;
}
static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v10_compute_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
if (q->cu_mask_count == 0)
return;
mqd_symmetrically_map_cu_mask(mm,
q->cu_mask, q->cu_mask_count, se_mask);
m = get_mqd(mqd);
m->compute_static_thread_mgmt_se0 = se_mask[0];
m->compute_static_thread_mgmt_se1 = se_mask[1];
m->compute_static_thread_mgmt_se2 = se_mask[2];
m->compute_static_thread_mgmt_se3 = se_mask[3];
pr_debug("update cu mask to %#x %#x %#x %#x\n",
m->compute_static_thread_mgmt_se0,
m->compute_static_thread_mgmt_se1,
m->compute_static_thread_mgmt_se2,
m->compute_static_thread_mgmt_se3);
}
static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
struct queue_properties *q)
{
int retval;
struct kfd_mem_obj *mqd_mem_obj = NULL;
/* From V9 onwards, for CWSR, the control stack is located on the next
 * page boundary after the MQD, so we use the GTT allocation function
 * instead of the sub-allocation function.
 */
if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
if (!mqd_mem_obj)
return NULL;
retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
ALIGN(q->ctl_stack_size, PAGE_SIZE) +
ALIGN(sizeof(struct v10_compute_mqd), PAGE_SIZE),
&(mqd_mem_obj->gtt_mem),
&(mqd_mem_obj->gpu_addr),
(void *)&(mqd_mem_obj->cpu_ptr), true);
} else {
retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd),
&mqd_mem_obj);
}
if (retval) {
kfree(mqd_mem_obj);
return NULL;
}
return mqd_mem_obj;
}
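For a sense of scale, the CWSR branch above sizes one GTT buffer holding the MQD and the control stack, each rounded up to a page so the stack starts on the next page boundary (see get_wave_state below). A sketch with invented sizes:
#include <stdio.h>
#define PAGE_SIZE 4096u
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
int main(void)
{
	unsigned int ctl_stack = 8192;	/* illustrative q->ctl_stack_size */
	unsigned int mqd_bytes = 1664;	/* illustrative MQD size, not the real sizeof */
	unsigned int total = ALIGN(ctl_stack, PAGE_SIZE) + ALIGN(mqd_bytes, PAGE_SIZE);

	printf("GTT allocation = %u bytes\n", total);	/* 12288 */
	return 0;
}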
static void init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
uint64_t addr;
struct v10_compute_mqd *m;
m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
memset(m, 0, sizeof(struct v10_compute_mqd));
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
m->cp_mqd_base_addr_lo = lower_32_bits(addr);
m->cp_mqd_base_addr_hi = upper_32_bits(addr);
m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
m->cp_hqd_pipe_priority = 1;
m->cp_hqd_queue_priority = 15;
if (q->format == KFD_QUEUE_FORMAT_AQL) {
m->cp_hqd_aql_control =
1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
}
if (mm->dev->cwsr_enabled) {
m->cp_hqd_persistent_state |=
(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
m->cp_hqd_ctx_save_base_addr_lo =
lower_32_bits(q->ctx_save_restore_area_address);
m->cp_hqd_ctx_save_base_addr_hi =
upper_32_bits(q->ctx_save_restore_area_address);
m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
m->cp_hqd_wg_state_offset = q->ctl_stack_size;
}
*mqd = m;
if (gart_addr)
*gart_addr = addr;
mm->update_mqd(mm, m, q);
}
static int load_mqd(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
struct queue_properties *p, struct mm_struct *mms)
{
int r = 0;
/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
(uint32_t __user *)p->write_ptr,
wptr_shift, 0, mms);
return r;
}
static void update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v10_compute_mqd *m;
m = get_mqd(mqd);
m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
m->cp_hqd_pq_control |=
ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
m->cp_hqd_pq_doorbell_control =
q->doorbell_off <<
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
m->cp_hqd_pq_doorbell_control);
m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT;
/*
* HW does not clamp this field correctly. Maximum EOP queue size
* is constrained by per-SE EOP done signal count, which is 8-bit.
* Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
* more than (EOP entry count - 1) so a queue size of 0x800 dwords
* is safe, giving a maximum field value of 0xA.
*/
m->cp_hqd_eop_control = min(0xA,
ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
m->cp_hqd_eop_base_addr_lo =
lower_32_bits(q->eop_ring_buffer_address >> 8);
m->cp_hqd_eop_base_addr_hi =
upper_32_bits(q->eop_ring_buffer_address >> 8);
m->cp_hqd_iq_timer = 0;
m->cp_hqd_vmid = q->vmid;
if (q->format == KFD_QUEUE_FORMAT_AQL) {
/* GC 10 removed WPP_CLAMP from PQ Control */
m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT;
m->cp_hqd_pq_doorbell_control |=
1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
}
if (mm->dev->cwsr_enabled)
m->cp_hqd_ctx_save_control = 0;
update_cu_mask(mm, mqd, q);
q->is_active = (q->queue_size > 0 &&
q->queue_address != 0 &&
q->queue_percent > 0 &&
!q->is_evicted);
}
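The ffs() arithmetic above encodes the ring size as an order: for a ring of 2^n dwords, ffs() returns n + 1, so subtracting 2 stores n - 1, which is how these RB_SIZE-style fields are read by hardware. A worked example (userspace ffs() from <strings.h>; the kernel provides its own):
#include <stdio.h>
#include <strings.h>
int main(void)
{
	unsigned int queue_size = 4096;	/* bytes, hypothetical */
	unsigned int dwords = queue_size / sizeof(unsigned int);	/* 1024 */
	int field = ffs(dwords) - 1 - 1;	/* log2(1024) - 1 = 9 */

	printf("queue size field = %d\n", field);
	return 0;
}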
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_destroy
(mm->dev->kgd, mqd, type, timeout,
pipe_id, queue_id);
}
static void free_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
struct kfd_dev *kfd = mm->dev;
if (mqd_mem_obj->gtt_mem) {
amdgpu_amdkfd_free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
kfree(mqd_mem_obj);
} else {
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
}
static bool is_occupied(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_is_occupied(
mm->dev->kgd, queue_address,
pipe_id, queue_id);
}
static int get_wave_state(struct mqd_manager *mm, void *mqd,
void __user *ctl_stack,
u32 *ctl_stack_used_size,
u32 *save_area_used_size)
{
struct v10_compute_mqd *m;
/* Control stack is located one page after MQD. */
void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
m = get_mqd(mqd);
*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
m->cp_hqd_cntl_stack_offset;
*save_area_used_size = m->cp_hqd_wg_state_offset -
m->cp_hqd_cntl_stack_size;
if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
return -EFAULT;
return 0;
}
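The arithmetic above implies this layout of the context-save area (a sketch; offsets from the start of the save area, derived only from the fields used here):
/*
 *   [0, cp_hqd_cntl_stack_offset)                 control stack, unused
 *   [cp_hqd_cntl_stack_offset, cntl_stack_size)   control stack, used
 *   [cntl_stack_size, cp_hqd_wg_state_offset)     wave state, used
 *
 * so ctl_stack_used_size = cntl_stack_size - cntl_stack_offset and
 *    save_area_used_size = wg_state_offset - cntl_stack_size.
 */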
static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct v10_compute_mqd *m;
init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
m = get_mqd(*mqd);
m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
}
static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v10_compute_mqd *m;
update_mqd(mm, mqd, q);
/* TODO: what's the point? update_mqd already does this. */
m = get_mqd(mqd);
m->cp_hqd_vmid = q->vmid;
}
static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct v10_sdma_mqd *m;
m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr;
memset(m, 0, sizeof(struct v10_sdma_mqd));
*mqd = m;
if (gart_addr)
*gart_addr = mqd_mem_obj->gpu_addr;
mm->update_mqd(mm, m, q);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
struct queue_properties *p, struct mm_struct *mms)
{
return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
(uint32_t __user *)p->write_ptr,
mms);
}
#define SDMA_RLC_DUMMY_DEFAULT 0xf
static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v10_sdma_mqd *m;
m = get_sdma_mqd(mqd);
m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->sdmax_rlcx_doorbell_offset =
q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
q->is_active = (q->queue_size > 0 &&
q->queue_address != 0 &&
q->queue_percent > 0 &&
!q->is_evicted);
}
/*
 * preempt type here is ignored because there is only one way
 * to preempt sdma queue
 */
static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
}
static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
}
#if defined(CONFIG_DEBUG_FS)
static int debugfs_show_mqd(struct seq_file *m, void *data)
{
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
data, sizeof(struct v10_compute_mqd), false);
return 0;
}
static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
{
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
data, sizeof(struct v10_sdma_mqd), false);
return 0;
}
#endif
struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
{
struct mqd_manager *mqd;
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
if (!mqd)
return NULL;
mqd->dev = dev;
switch (type) {
case KFD_MQD_TYPE_CP:
pr_debug("%s@%i\n", __func__, __LINE__);
/* fall through: CP queues share the compute MQD ops */
case KFD_MQD_TYPE_COMPUTE:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd;
mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v10_compute_mqd);
mqd->get_wave_state = get_wave_state;
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
pr_debug("%s@%i\n", __func__, __LINE__);
break;
case KFD_MQD_TYPE_HIQ:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v10_compute_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
pr_debug("%s@%i\n", __func__, __LINE__);
break;
case KFD_MQD_TYPE_DIQ:
mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v10_compute_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_SDMA:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_sdma_mqd;
mqd->init_mqd = init_mqd_sdma;
mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
mqd->mqd_size = sizeof(struct v10_sdma_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
pr_debug("%s@%i\n", __func__, __LINE__);
break;
default:
kfree(mqd);
return NULL;
}
return mqd;
}
......@@ -237,6 +237,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
case CHIP_RAVEN:
pm->pmf = &kfd_v9_pm_funcs;
break;
case CHIP_NAVI10:
pm->pmf = &kfd_v10_pm_funcs;
break;
default:
WARN(1, "Unexpected ASIC family %u",
dqm->dev->device_info->asic_family);
......
......@@ -105,6 +105,8 @@
#define KFD_KERNEL_QUEUE_SIZE 2048
#define KFD_UNMAP_LATENCY_MS (4000)
/*
* 512 = 0x200
* The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
......@@ -167,11 +169,20 @@ extern int halt_if_hws_hang;
*/
extern bool hws_gws_support;
/*
* Queue preemption timeout in ms
*/
extern int queue_preemption_timeout_ms;
enum cache_policy {
cache_policy_coherent,
cache_policy_noncoherent
};
#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11)
#define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \
(chip) <= CHIP_NAVI10) || \
(chip) == CHIP_HAWAII)
#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10)
struct kfd_event_interrupt_class {
......@@ -870,6 +881,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
......@@ -909,8 +922,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
u32 *save_area_used_size);
int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
unsigned int fence_value,
unsigned int timeout_ms);
/* Packet Manager */
......@@ -959,6 +972,7 @@ struct packet_manager_funcs {
extern const struct packet_manager_funcs kfd_vi_pm_funcs;
extern const struct packet_manager_funcs kfd_v9_pm_funcs;
extern const struct packet_manager_funcs kfd_v10_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
void pm_uninit(struct packet_manager *pm);
......@@ -978,7 +992,8 @@ void pm_release_ib(struct packet_manager *pm);
/* Following PM funcs can be shared among VI and AI */
unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
struct scheduling_resources *res);
uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
......
......@@ -1203,3 +1203,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
}
#endif
......@@ -1321,6 +1321,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
case CHIP_NAVI10:
dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
......