Commit 5f7697bb authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: trigger mode1 reset for RAS RMA status

Check the RMA status in the bad page retirement flow and trigger a mode1 reset when the device has entered RMA status.

v2: fix coding bugs in v1.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 5afbbcfe
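
In short, the patch makes amdgpu_ras_reset_gpu() force a mode1 reset whenever the RAS context reports RMA status, and lets the bad page retirement and poison handling paths schedule that reset themselves once retired addresses are found, instead of relying only on the caller's reset request. The stand-alone sketch below (plain C, not driver code) models just this decision flow; every rma_demo_* name and DEMO_* flag is an illustrative stand-in, not a driver symbol.

/*
 * Stand-alone model of the reset selection this patch introduces.
 * rma_demo_* and DEMO_* identifiers are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_RESET_MODE1 0x1    /* stands in for AMDGPU_RAS_GPU_RESET_MODE1_RESET */
#define DEMO_RESET_MODE2 0x2    /* stands in for AMDGPU_RAS_GPU_RESET_MODE2_RESET */

struct rma_demo_ras {
        bool is_rma;                    /* device has reached RMA status    */
        unsigned int gpu_reset_flags;   /* reset mode requested by callers  */
};

/* Mirrors the new branch in amdgpu_ras_reset_gpu(): RMA status overrides
 * whatever a caller asked for and selects a mode1 reset only. */
static unsigned int rma_demo_pick_reset(struct rma_demo_ras *ras)
{
        if (ras->is_rma)
                ras->gpu_reset_flags = DEMO_RESET_MODE1;
        return ras->gpu_reset_flags;
}

/* Mirrors the new check after bad page handling: reset when the caller
 * requested one, or when bad pages were found and the device is in RMA. */
static bool rma_demo_need_reset(const struct rma_demo_ras *ras,
                                unsigned long err_addr_cnt,
                                unsigned int requested_reset)
{
        return requested_reset || (err_addr_cnt && ras->is_rma);
}

int main(void)
{
        /* a mode2 request is pending, but the device is already in RMA status */
        struct rma_demo_ras ras = { .is_rma = true, .gpu_reset_flags = DEMO_RESET_MODE2 };

        if (rma_demo_need_reset(&ras, 4 /* retired addresses */, 0 /* no reset requested */))
                printf("reset scheduled, mode flags 0x%x\n", rma_demo_pick_reset(&ras));

        return 0;
}

Built with any C compiler, this prints mode flags 0x1: the pending mode2 request is overridden once RMA status is set, which is what the amdgpu_ras_reset_gpu() hunk below does with AMDGPU_RAS_GPU_RESET_MODE1_RESET.
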
@@ -2068,8 +2068,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
         struct amdgpu_device *adev = obj->adev;
         struct amdgpu_ras_block_object *block_obj =
                 amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-        if (!block_obj)
+        if (!block_obj || !con)
                 return;
 
         /* both query_poison_status and handle_poison_consumption are optional,
@@ -2092,14 +2093,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
         if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
 
-        /* gpu reset is fallback for failed and default cases */
-        if (poison_stat) {
+        /* gpu reset is fallback for failed and default cases.
+         * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
+         */
+        if (poison_stat && !con->is_rma) {
                 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
                          block_obj->ras_comm.name);
                 amdgpu_ras_reset_gpu(adev);
-        } else {
-                amdgpu_gfx_poison_consumption_handler(adev, entry);
         }
+
+        if (!poison_stat)
+                amdgpu_gfx_poison_consumption_handler(adev, entry);
 }
 
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
@@ -2815,6 +2819,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
                                               page_retirement_dwork.work);
         struct amdgpu_device *adev = con->adev;
         struct ras_err_data err_data;
+        unsigned long err_cnt;
 
         if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
                 return;
@@ -2822,9 +2827,13 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
         amdgpu_ras_error_data_init(&err_data);
 
         amdgpu_umc_handle_bad_pages(adev, &err_data);
+        err_cnt = err_data.err_addr_cnt;
 
         amdgpu_ras_error_data_fini(&err_data);
 
+        if (err_cnt && con->is_rma)
+                amdgpu_ras_reset_gpu(adev);
+
         mutex_lock(&con->umc_ecc_log.lock);
         if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
                               UMC_ECC_NEW_DETECTED_TAG))
@@ -2881,7 +2890,8 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
         if (poison_msg->pasid_fn)
                 poison_msg->pasid_fn(adev, pasid, poison_msg->data);
 
-        if (reset) {
+        /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
+        if (reset && !con->is_rma) {
                 flush_delayed_work(&con->page_retirement_dwork);
 
                 con->gpu_reset_flags |= reset;
@@ -4010,6 +4020,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+        /* mode1 is the only selection for RMA status */
+        if (ras->is_rma) {
+                ras->gpu_reset_flags = 0;
+                ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+        }
+
         if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
                 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
         return 0;
......
@@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
         kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
         amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
-        if (err_data->ue_count && reset) {
+        if ((err_data->ue_count || err_data->de_count) &&
+            (reset || (con && con->is_rma))) {
                 con->gpu_reset_flags |= reset;
                 amdgpu_ras_reset_gpu(adev);
         }
@@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
                 .block = AMDGPU_RAS_BLOCK__UMC,
         };
         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         uint32_t timeout = timeout_ms;
 
         memset(&err_data, 0, sizeof(err_data));
@@ -243,9 +245,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
         kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-        if (reset) {
-                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
+        if (reset || (err_data.err_addr_cnt && con && con->is_rma)) {
                 con->gpu_reset_flags |= reset;
                 amdgpu_ras_reset_gpu(adev);
         }
......
@@ -85,6 +85,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
         if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
             (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
             !entry->vmid && !entry->pasid) {
+                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
                 uint32_t rlc_status0 = 0;
 
                 rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
@@ -96,7 +97,8 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
                         ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                 }
 
-                amdgpu_ras_reset_gpu(adev);
+                if (con && !con->is_rma)
+                        amdgpu_ras_reset_gpu(adev);
         }
 
         return 0;
......