Commit c7490949 authored by Tao Zhou, committed by Alex Deucher

amd/amdkfd: add ras page retirement handling for sq/sdma (v3)

In ras poison mode, page retirement will be handled by the irq handler of the
module which consumes corrupted data.

v2: rename ras_process_cb to ras_poison_consumption_handler.
    move the handler's implementation from ASIC specific file to common
file.

v3: call gpu reset for xGMI connected mode.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e5d59cfa
...@@ -31,6 +31,8 @@ ...@@ -31,6 +31,8 @@
#include <linux/dma-buf.h> #include <linux/dma-buf.h>
#include "amdgpu_xgmi.h" #include "amdgpu_xgmi.h"
#include <uapi/linux/kfd_ioctl.h> #include <uapi/linux/kfd_ioctl.h>
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
/* Total memory size in system memory and all GPU VRAM. Used to /* Total memory size in system memory and all GPU VRAM. Used to
* estimate worst case amount of memory to reserve for page tables * estimate worst case amount of memory to reserve for page tables
...@@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) ...@@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
return adev->have_atomics_support; return adev->have_atomics_support;
} }
/*
 * amdgpu_amdkfd_ras_poison_consumption_handler - react to consumption of
 * RAS-poisoned data reported through a KFD interrupt.
 *
 * @kgd: KFD device handle; it is cast to struct amdgpu_device here.
 *
 * When gmc.xgmi.connected_to_cpu is set, page retirement is left to the
 * CPU MCA and only a GPU reset is requested. Otherwise
 * amdgpu_umc_process_ras_data_cb() is called with a zero-initialized
 * error record and a NULL entry argument.
 */
void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct ras_err_data err_data = {0};

	/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
	if (adev->gmc.xgmi.connected_to_cpu) {
		amdgpu_amdkfd_gpu_reset(kgd);
		return;
	}

	amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
}
...@@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, ...@@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
uint64_t *mmap_offset); uint64_t *mmap_offset);
int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
struct tile_config *config); struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd);
#if IS_ENABLED(CONFIG_HSA_AMD) #if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
......
...@@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, ...@@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
kfd_signal_poison_consumed_event(dev, pasid); kfd_signal_poison_consumed_event(dev, pasid);
amdgpu_amdkfd_gpu_reset(dev->kgd); amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
return; return;
} }
break; break;
...@@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, ...@@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) { } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
kfd_signal_poison_consumed_event(dev, pasid); kfd_signal_poison_consumed_event(dev, pasid);
amdgpu_amdkfd_gpu_reset(dev->kgd); amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
return; return;
} }
} else if (client_id == SOC15_IH_CLIENTID_VMC || } else if (client_id == SOC15_IH_CLIENTID_VMC ||
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment