Commit 91a1a52d, authored by Mukul Joshi, committed by Alex Deucher

drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran

During mode2 reset, the GPU is temporarily removed from the
mgpu_info list. As a result, page retirement fails because it
cannot find the GPU in the GPU list.
To fix this, create our own list of GPUs that support MCE notifier
based page retirement and use that list to check if the UMC error
occurred on a GPU that supports MCE notifier based page retirement.
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a4967a1e
...@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, ...@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr); uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD #ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(void); static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
/*
 * Driver-private list of amdgpu devices used by the MCE bad-page notifier.
 *
 * Entries are appended by amdgpu_register_bad_pages_mca_notifier() and
 * scanned by find_adev() to match a reported node id to a device, instead
 * of walking mgpu_info.
 *
 * NOTE(review): the visible append does not check num_gpu against
 * MAX_GPU_INSTANCE, and find_adev() reads the list without taking a lock
 * - confirm that callers make both safe.
 */
struct mce_notifier_adev_list {
/* Registered devices; slots [0, num_gpu) are populated. */
struct amdgpu_device *devs[MAX_GPU_INSTANCE];
/* Number of populated slots in devs[]. */
int num_gpu;
};
/* Single file-scope instance of the list (zero-initialized static storage). */
static struct mce_notifier_adev_list mce_adev_list;
#endif #endif
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
...@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
#ifdef CONFIG_X86_MCE_AMD #ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) && if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu)) (adev->gmc.xgmi.connected_to_cpu))
amdgpu_register_bad_pages_mca_notifier(); amdgpu_register_bad_pages_mca_notifier(adev);
#endif #endif
return 0; return 0;
...@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) ...@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
#ifdef CONFIG_X86_MCE_AMD #ifdef CONFIG_X86_MCE_AMD
static struct amdgpu_device *find_adev(uint32_t node_id) static struct amdgpu_device *find_adev(uint32_t node_id)
{ {
struct amdgpu_gpu_instance *gpu_instance;
int i; int i;
struct amdgpu_device *adev = NULL; struct amdgpu_device *adev = NULL;
mutex_lock(&mgpu_info.mutex); for (i = 0; i < mce_adev_list.num_gpu; i++) {
adev = mce_adev_list.devs[i];
for (i = 0; i < mgpu_info.num_gpu; i++) {
gpu_instance = &(mgpu_info.gpu_ins[i]);
adev = gpu_instance->adev;
if (adev->gmc.xgmi.connected_to_cpu && if (adev && adev->gmc.xgmi.connected_to_cpu &&
adev->gmc.xgmi.physical_node_id == node_id) adev->gmc.xgmi.physical_node_id == node_id)
break; break;
adev = NULL; adev = NULL;
} }
mutex_unlock(&mgpu_info.mutex);
return adev; return adev;
} }
...@@ -2718,8 +2717,18 @@ static struct notifier_block amdgpu_bad_page_nb = { ...@@ -2718,8 +2717,18 @@ static struct notifier_block amdgpu_bad_page_nb = {
.priority = MCE_PRIO_UC, .priority = MCE_PRIO_UC,
}; };
static void amdgpu_register_bad_pages_mca_notifier(void) static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{ {
/*
* Add the adev to the mce_adev_list.
* During mode2 reset, amdgpu device is temporarily
* removed from the mgpu_info list which can cause
* page retirement to fail.
* Use this list instead of mgpu_info to find the amdgpu
* device on which the UMC error was reported.
*/
mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
/* /*
* Register the x86 notifier only once * Register the x86 notifier only once
* with MCE subsystem. * with MCE subsystem.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment