Commit d9c13156 authored by Christian König's avatar Christian König Committed by Alex Deucher

drm/amdgpu: add option to stop on VM fault

Signed-off-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent ce0c6bcd
...@@ -79,6 +79,7 @@ extern int amdgpu_bapm; ...@@ -79,6 +79,7 @@ extern int amdgpu_bapm;
extern int amdgpu_deep_color; extern int amdgpu_deep_color;
extern int amdgpu_vm_size; extern int amdgpu_vm_size;
extern int amdgpu_vm_block_size; extern int amdgpu_vm_block_size;
extern int amdgpu_vm_fault_stop;
extern int amdgpu_enable_scheduler; extern int amdgpu_enable_scheduler;
extern int amdgpu_sched_jobs; extern int amdgpu_sched_jobs;
extern int amdgpu_sched_hw_submission; extern int amdgpu_sched_hw_submission;
...@@ -960,6 +961,11 @@ struct amdgpu_ring { ...@@ -960,6 +961,11 @@ struct amdgpu_ring {
#define AMDGPU_PTE_FRAG_64KB (4 << 7) #define AMDGPU_PTE_FRAG_64KB (4 << 7)
#define AMDGPU_LOG2_PAGES_PER_FRAG 4 #define AMDGPU_LOG2_PAGES_PER_FRAG 4
/* How to programm VM fault handling */
#define AMDGPU_VM_FAULT_STOP_NEVER 0
#define AMDGPU_VM_FAULT_STOP_FIRST 1
#define AMDGPU_VM_FAULT_STOP_ALWAYS 2
struct amdgpu_vm_pt { struct amdgpu_vm_pt {
struct amdgpu_bo *bo; struct amdgpu_bo *bo;
uint64_t addr; uint64_t addr;
......
...@@ -75,6 +75,7 @@ int amdgpu_bapm = -1; ...@@ -75,6 +75,7 @@ int amdgpu_bapm = -1;
int amdgpu_deep_color = 0; int amdgpu_deep_color = 0;
int amdgpu_vm_size = 8; int amdgpu_vm_size = 8;
int amdgpu_vm_block_size = -1; int amdgpu_vm_block_size = -1;
int amdgpu_vm_fault_stop = 0;
int amdgpu_exp_hw_support = 0; int amdgpu_exp_hw_support = 0;
int amdgpu_enable_scheduler = 1; int amdgpu_enable_scheduler = 1;
int amdgpu_sched_jobs = 16; int amdgpu_sched_jobs = 16;
...@@ -141,6 +142,9 @@ module_param_named(vm_size, amdgpu_vm_size, int, 0444); ...@@ -141,6 +142,9 @@ module_param_named(vm_size, amdgpu_vm_size, int, 0444);
MODULE_PARM_DESC(vm_block_size, "VM page table size in bits (default depending on vm_size)"); MODULE_PARM_DESC(vm_block_size, "VM page table size in bits (default depending on vm_size)");
module_param_named(vm_block_size, amdgpu_vm_block_size, int, 0444); module_param_named(vm_block_size, amdgpu_vm_block_size, int, 0444);
MODULE_PARM_DESC(vm_fault_stop, "Stop on VM fault (0 = never (default), 1 = print first, 2 = always)");
module_param_named(vm_fault_stop, amdgpu_vm_fault_stop, int, 0444);
MODULE_PARM_DESC(exp_hw_support, "experimental hw support (1 = enable, 0 = disable (default))"); MODULE_PARM_DESC(exp_hw_support, "experimental hw support (1 = enable, 0 = disable (default))");
module_param_named(exp_hw_support, amdgpu_exp_hw_support, int, 0444); module_param_named(exp_hw_support, amdgpu_exp_hw_support, int, 0444);
......
...@@ -435,6 +435,33 @@ static int gmc_v7_0_gart_set_pte_pde(struct amdgpu_device *adev, ...@@ -435,6 +435,33 @@ static int gmc_v7_0_gart_set_pte_pde(struct amdgpu_device *adev,
return 0; return 0;
} }
/**
* gmc_v8_0_set_fault_enable_default - update VM fault handling
*
* @adev: amdgpu_device pointer
* @value: true redirects VM faults to the default page
*/
static void gmc_v7_0_set_fault_enable_default(struct amdgpu_device *adev,
bool value)
{
u32 tmp;
tmp = RREG32(mmVM_CONTEXT1_CNTL);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
VALID_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
READ_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
WREG32(mmVM_CONTEXT1_CNTL, tmp);
}
/** /**
* gmc_v7_0_gart_enable - gart enable * gmc_v7_0_gart_enable - gart enable
* *
...@@ -523,15 +550,13 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev) ...@@ -523,15 +550,13 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
tmp = RREG32(mmVM_CONTEXT1_CNTL); tmp = RREG32(mmVM_CONTEXT1_CNTL);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1); tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH, 1); tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, VALID_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, READ_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE, tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
amdgpu_vm_block_size - 9); amdgpu_vm_block_size - 9);
WREG32(mmVM_CONTEXT1_CNTL, tmp); WREG32(mmVM_CONTEXT1_CNTL, tmp);
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS)
gmc_v7_0_set_fault_enable_default(adev, false);
else
gmc_v7_0_set_fault_enable_default(adev, true);
if (adev->asic_type == CHIP_KAVERI) { if (adev->asic_type == CHIP_KAVERI) {
tmp = RREG32(mmCHUB_CONTROL); tmp = RREG32(mmCHUB_CONTROL);
...@@ -1268,6 +1293,9 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev, ...@@ -1268,6 +1293,9 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
if (!addr && !status) if (!addr && !status)
return 0; return 0;
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
gmc_v7_0_set_fault_enable_default(adev, false);
dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n", dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
entry->src_id, entry->src_data); entry->src_id, entry->src_data);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n", dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
......
...@@ -549,6 +549,35 @@ static int gmc_v8_0_gart_set_pte_pde(struct amdgpu_device *adev, ...@@ -549,6 +549,35 @@ static int gmc_v8_0_gart_set_pte_pde(struct amdgpu_device *adev,
return 0; return 0;
} }
/**
* gmc_v8_0_set_fault_enable_default - update VM fault handling
*
* @adev: amdgpu_device pointer
* @value: true redirects VM faults to the default page
*/
static void gmc_v8_0_set_fault_enable_default(struct amdgpu_device *adev,
bool value)
{
u32 tmp;
tmp = RREG32(mmVM_CONTEXT1_CNTL);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
VALID_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
READ_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
EXECUTE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
WREG32(mmVM_CONTEXT1_CNTL, tmp);
}
/** /**
* gmc_v8_0_gart_enable - gart enable * gmc_v8_0_gart_enable - gart enable
* *
...@@ -663,6 +692,10 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev) ...@@ -663,6 +692,10 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE, tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
amdgpu_vm_block_size - 9); amdgpu_vm_block_size - 9);
WREG32(mmVM_CONTEXT1_CNTL, tmp); WREG32(mmVM_CONTEXT1_CNTL, tmp);
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS)
gmc_v8_0_set_fault_enable_default(adev, false);
else
gmc_v8_0_set_fault_enable_default(adev, true);
gmc_v8_0_gart_flush_gpu_tlb(adev, 0); gmc_v8_0_gart_flush_gpu_tlb(adev, 0);
DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n", DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
...@@ -1268,6 +1301,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, ...@@ -1268,6 +1301,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
if (!addr && !status) if (!addr && !status)
return 0; return 0;
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
gmc_v8_0_set_fault_enable_default(adev, false);
dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n", dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
entry->src_id, entry->src_data); entry->src_id, entry->src_data);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n", dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment