Commit 6c23f3d1 authored by YiPeng Chai's avatar YiPeng Chai Committed by Alex Deucher

drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

v2:
  1. Change function name.
  2. Change the debugging information content.
Signed-off-by: default avatarYiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent ee9c3031
......@@ -120,6 +120,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
......@@ -2674,6 +2676,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(&con->page_retirement_req_cnt));
atomic_dec(&con->page_retirement_req_cnt);
amdgpu_umc_bad_page_polling_timeout(adev,
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
return 0;
......
......@@ -23,6 +23,7 @@
#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
......@@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
}
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset)
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
......@@ -163,19 +161,85 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
con->update_channel_flag = false;
}
}
}
if (reset) {
kfree(err_data->err_addr);
mutex_unlock(&con->page_retirement_lock);
}
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
if (err_data->ue_count && reset) {
/* use mode-2 reset for poison consumption */
if (!entry)
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
amdgpu_ras_reset_gpu(adev);
}
}
kfree(err_data->err_addr);
return AMDGPU_RAS_SUCCESS;
}
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms)
{
struct ras_err_data err_data;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
};
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
uint32_t timeout = timeout_ms;
memset(&err_data, 0, sizeof(err_data));
amdgpu_ras_error_data_init(&err_data);
do {
amdgpu_umc_handle_bad_pages(adev, &err_data);
if (timeout && !err_data.de_count) {
msleep(1);
timeout--;
}
} while (timeout && !err_data.de_count);
if (!timeout)
dev_warn(adev->dev, "Can't find bad pages\n");
if (err_data.de_count)
dev_info(adev->dev, "%ld new deferred hardware errors detected\n", err_data.de_count);
if (obj) {
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
obj->err_data.de_count += err_data.de_count;
}
amdgpu_ras_error_data_fini(&err_data);
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/* use mode-2 reset for poison consumption */
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
amdgpu_ras_reset_gpu(adev);
}
return 0;
}
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
{
int ret = AMDGPU_RAS_SUCCESS;
......@@ -193,6 +257,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
if (!amdgpu_sriov_vf(adev)) {
if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
struct ras_err_data err_data;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
......@@ -212,6 +277,18 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
amdgpu_ras_error_data_fini(&err_data);
} else {
if (reset) {
amdgpu_umc_bad_page_polling_timeout(adev,
reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
} else {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
atomic_inc(&con->page_retirement_req_cnt);
wake_up(&con->page_retirement_wq);
}
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev);
......
......@@ -118,4 +118,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment