Commit 9b54d201, authored by Eric Huang, committed by Alex Deucher

drm/amdkfd: add RAS ECC event support (v3)

The RAS ECC event is combined with the GPU reset event, because
ECC interrupts are caused by uncorrectable errors that trigger a
GPU reset.

v2: Fix misleading-indentation warning
v3: fix build with CONFIG_HSA_AMD disabled
Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0dee45a2
...@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) ...@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{ {
} }
/*
 * No-op stub of kgd2kfd_set_sram_ecc_flag(): accepts the KFD device pointer
 * and does nothing.
 * NOTE(review): presumably this is the variant built when CONFIG_HSA_AMD is
 * disabled (the commit notes mention a build fix for that case) -- confirm
 * against the enclosing #if/#else that ends at the following #endif.
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
}
#endif #endif
...@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm); ...@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence); struct dma_fence *fence);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
#endif /* AMDGPU_AMDKFD_H_INCLUDED */ #endif /* AMDGPU_AMDKFD_H_INCLUDED */
...@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
/* TODO ue will trigger an interrupt. */ /* TODO ue will trigger an interrupt. */
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev, 0); amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE; return AMDGPU_RAS_UE;
} }
......
...@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev, ...@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev, 0); amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE; return AMDGPU_RAS_UE;
} }
......
...@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
return 0; return 0;
} }
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev, 0); amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE; return AMDGPU_RAS_UE;
......
...@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, ...@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
memset(&kfd->doorbell_available_index, 0, memset(&kfd->doorbell_available_index, 0,
sizeof(kfd->doorbell_available_index)); sizeof(kfd->doorbell_available_index));
atomic_set(&kfd->sram_ecc_flag, 0);
return kfd; return kfd;
} }
...@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) ...@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
return ret; return ret;
count = atomic_dec_return(&kfd_locked); count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count != 0, "KFD reset ref. error"); WARN_ONCE(count != 0, "KFD reset ref. error");
atomic_set(&kfd->sram_ecc_flag, 0);
return 0; return 0;
} }
...@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) ...@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
return 0; return 0;
} }
/*
 * kgd2kfd_set_sram_ecc_flag - record that an SRAM ECC error was reported.
 * @kfd: KFD device whose sram_ecc_flag counter is bumped; may be NULL.
 *
 * Atomically increments kfd->sram_ecc_flag. A NULL @kfd is silently ignored
 * so callers do not need to check for a KFD device themselves.
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
	/* Nothing to flag when no KFD device is attached. */
	if (!kfd)
		return;

	atomic_inc(&kfd->sram_ecc_flag);
}
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS /* This function will send a package to HIQ to hang the HWS
......
...@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ...@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
void kfd_signal_reset_event(struct kfd_dev *dev) void kfd_signal_reset_event(struct kfd_dev *dev)
{ {
struct kfd_hsa_hw_exception_data hw_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_process *p; struct kfd_process *p;
struct kfd_event *ev; struct kfd_event *ev;
unsigned int temp; unsigned int temp;
uint32_t id, idx; uint32_t id, idx;
int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
KFD_HW_EXCEPTION_ECC :
KFD_HW_EXCEPTION_GPU_HANG;
/* Whole gpu reset caused by GPU hang and memory is lost */ /* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data)); memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id; hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1; hw_exception_data.memory_lost = 1;
hw_exception_data.reset_cause = reset_cause;
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu); idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex); mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID; id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id) idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data; ev->hw_exception_data = hw_exception_data;
set_event(ev); set_event(ev);
} }
if (ev->type == KFD_EVENT_TYPE_MEMORY &&
reset_cause == KFD_HW_EXCEPTION_ECC) {
ev->memory_exception_data = memory_exception_data;
set_event(ev);
}
}
mutex_unlock(&p->event_mutex); mutex_unlock(&p->event_mutex);
} }
srcu_read_unlock(&kfd_processes_srcu, idx); srcu_read_unlock(&kfd_processes_srcu, idx);
......
...@@ -276,6 +276,9 @@ struct kfd_dev { ...@@ -276,6 +276,9 @@ struct kfd_dev {
uint64_t hive_id; uint64_t hive_id;
bool pci_atomic_requested; bool pci_atomic_requested;
/* SRAM ECC flag */
atomic_t sram_ecc_flag;
}; };
enum kfd_mempool { enum kfd_mempool {
......
...@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args { ...@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
#define KFD_HW_EXCEPTION_GPU_HANG 0 #define KFD_HW_EXCEPTION_GPU_HANG 0
#define KFD_HW_EXCEPTION_ECC 1 #define KFD_HW_EXCEPTION_ECC 1
/*
 * Values for kfd_hsa_memory_exception_data.ErrorType.
 * Any value not listed here is reserved.
 */
/* No RAS error */
#define KFD_MEM_ERR_NO_RAS 0
/* SRAM ECC error */
#define KFD_MEM_ERR_SRAM_ECC 1
/* Poison consumed (described as Link_SYNFLOOD in the ErrorType field comment) */
#define KFD_MEM_ERR_POISON_CONSUMED 2
/* GPU hang that is not attributable to a specific cause */
#define KFD_MEM_ERR_GPU_HANG 3
struct kfd_ioctl_create_event_args { struct kfd_ioctl_create_event_args {
__u64 event_page_offset; /* from KFD */ __u64 event_page_offset; /* from KFD */
...@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data { ...@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
struct kfd_memory_exception_failure failure; struct kfd_memory_exception_failure failure;
__u64 va; __u64 va;
__u32 gpu_id; __u32 gpu_id;
__u32 pad; __u32 ErrorType; /* 0 = no RAS error,
* 1 = ECC_SRAM,
* 2 = Link_SYNFLOOD (poison),
* 3 = GPU hang (not attributable to a specific cause),
* other values reserved
*/
}; };
/* hw exception data */ /* hw exception data */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment