Commit e8fbaf03 authored by Guchun Chen's avatar Guchun Chen Committed by Alex Deucher

drm/amdgpu: break GPU recovery once it's in bad state(v4)

When GPU executes recovery and retriving bad GPU tag
from external eerpom device, the recovery will be broken
and error message is printed as well for user's awareness.

v2: Refine warning message in threshold reaching case, and
    fix spelling typo.

v3: Fix explicit calling of bad gpu.

v4: Rename function names.
Signed-off-by: default avatarGuchun Chen <guchun.chen@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 9c06f91f
......@@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_fbdev_set_suspend(tmp_adev, 0);
/* must succeed. */
amdgpu_ras_resume(tmp_adev);
/*
* The GPU enters bad state once faulty pages
* by ECC has reached the threshold, and ras
* recovery is scheduled next. So add one check
* here to break recovery if it indeed exceeds
* bad page threshold, and remind user to
* retire this GPU or setting one bigger
* bad_page_threshold value to fix this once
* probing driver again.
*/
if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
/* must succeed. */
amdgpu_ras_resume(tmp_adev);
} else {
r = -EINVAL;
goto out;
}
/* Update PSP FW topology after reset */
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
......@@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
}
out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
......
......@@ -2205,3 +2205,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
return false;
}
bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
bool exc_err_limit = false;
if (con && (amdgpu_bad_page_threshold != 0))
amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
&exc_err_limit);
/*
* We are only interested in variable exc_err_limit,
* as it says if GPU is in bad state or not.
*/
return exc_err_limit;
}
......@@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
......
......@@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
return curr_address;
}
int amdgpu_ras_eeprom_check_err_threshold(
struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buff[EEPROM_ADDRESS_SIZE +
EEPROM_TABLE_HEADER_SIZE] = { 0 };
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct i2c_msg msg = {
.addr = control->i2c_address,
.flags = I2C_M_RD,
.len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
.buf = buff,
};
int ret;
*exceed_err_limit = false;
/* read EEPROM table header */
mutex_lock(&control->tbl_mutex);
ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
if (ret < 1) {
dev_err(adev->dev, "Failed to read EEPROM table header.\n");
goto err;
}
__decode_table_header_from_buff(hdr, &buff[2]);
if (hdr->header == EEPROM_TABLE_HDR_BAD) {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or setting one bigger "
"threshold value when reloading driver.\n");
*exceed_err_limit = true;
}
err:
mutex_unlock(&control->tbl_mutex);
return 0;
}
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records,
bool write,
......
......@@ -80,6 +80,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
int amdgpu_ras_eeprom_check_err_threshold(
struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit);
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records,
bool write,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment