Commit 69691c82 authored by Stanley.Yang's avatar Stanley.Yang Committed by Alex Deucher

drm/amdgpu: message smu to update bad channel info

It should notice SMU to update bad channel info when detected
uncorrectable error in UMC block
Signed-off-by: default avatarStanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent d510eccf
...@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock); mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0); atomic_set(&con->in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
...@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
goto free; goto free;
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
} }
#ifdef CONFIG_X86_MCE_AMD #ifdef CONFIG_X86_MCE_AMD
...@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) ...@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con; goto release_con;
} }
con->update_channel_flag = false;
con->features = 0; con->features = 0;
INIT_LIST_HEAD(&con->head); INIT_LIST_HEAD(&con->head);
/* Might need get this flag from vbios. */ /* Might need get this flag from vbios. */
......
...@@ -374,6 +374,9 @@ struct amdgpu_ras { ...@@ -374,6 +374,9 @@ struct amdgpu_ras {
/* record umc error info queried from smu */ /* record umc error info queried from smu */
struct umc_ecc_info umc_ecc; struct umc_ecc_info umc_ecc;
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
}; };
struct ras_fs_data { struct ras_fs_data {
......
...@@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) ...@@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{ {
struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum; u8 csum;
int res; int res;
...@@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) ...@@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
amdgpu_ras_debugfs_set_ret_size(control); amdgpu_ras_debugfs_set_ret_size(control);
mutex_unlock(&control->ras_tbl_mutex); mutex_unlock(&control->ras_tbl_mutex);
...@@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, ...@@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *record, struct eeprom_table_record *record,
const u32 num) const u32 num)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
u32 a, b, i; u32 a, b, i;
u8 *buf, *pp; u8 *buf, *pp;
int res; int res;
...@@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, ...@@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
/* Encode all of them in one go. /* Encode all of them in one go.
*/ */
pp = buf; pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__encode_table_record_to_buf(control, &record[i], pp); __encode_table_record_to_buf(control, &record[i], pp);
/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}
/* a, first record index to write into. /* a, first record index to write into.
* b, last record index to write into. * b, last record index to write into.
* a = first index to read (fri) + number of records in the table, * a = first index to read (fri) + number of records in the table,
...@@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, ...@@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
const u32 num) const u32 num)
{ {
struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int i, res; int i, res;
u8 *buf, *pp; u8 *buf, *pp;
u32 g0, g1; u32 g0, g1;
...@@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, ...@@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
/* Read up everything? Then transform. /* Read up everything? Then transform.
*/ */
pp = buf; pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__decode_table_record_from_buf(control, &record[i], pp); __decode_table_record_from_buf(control, &record[i], pp);
/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}
Out: Out:
kfree(buf); kfree(buf);
mutex_unlock(&control->ras_tbl_mutex); mutex_unlock(&control->ras_tbl_mutex);
......
...@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control { ...@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
/* Protect table access via this mutex. /* Protect table access via this mutex.
*/ */
struct mutex ras_tbl_mutex; struct mutex ras_tbl_mutex;
/* Record channel info which occurred bad pages
*/
u32 bad_channel_bitmap;
}; };
/* /*
......
...@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev); amdgpu_ras_save_bad_pages(adev);
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
} }
if (reset) if (reset)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment