Commit bd68fb94 authored by John Clements, committed by Alex Deucher

drm/amdgpu: resolve bug in UMC 6 error counter query

iterate over all error counter registers in SMN space

remove support for error counter access via MMIO
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a210d698
...@@ -21,38 +21,6 @@ ...@@ -21,38 +21,6 @@
#ifndef __AMDGPU_UMC_H__ #ifndef __AMDGPU_UMC_H__
#define __AMDGPU_UMC_H__ #define __AMDGPU_UMC_H__
/*
 * implement 64 bits REG operations via 32 bits interface
 *
 * The low 32 bits are accessed at reg and the high 32 bits at reg + 1.
 * Both macros expand reg twice (and WREG64_UMC expands v twice), so
 * arguments with side effects are evaluated more than once.
 */
#define RREG64_UMC(reg) (RREG32(reg) | \
((uint64_t)RREG32((reg) + 1) << 32))
#define WREG64_UMC(reg, v) \
do { \
WREG32((reg), lower_32_bits(v)); \
WREG32((reg) + 1, upper_32_bits(v)); \
} while (0)
/*
 * Call func once for every channel instance of every UMC instance.
 *
 * func has the signature:
 * void (*func)(struct amdgpu_device *adev, struct ras_err_data *err_data,
 * uint32_t umc_reg_offset, uint32_t channel_index)
 *
 * The expansion declares the locals err_data, umc_inst, channel_inst,
 * umc_reg_offset and channel_index, and reads adev and ras_error_status
 * from the enclosing scope, so both names must be visible where the macro
 * is used. Index mode is enabled for each UMC instance before its channels
 * are visited and disabled once after the outer loop finishes.
 */
#define amdgpu_umc_for_each_channel(func) \
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; \
uint32_t umc_inst, channel_inst, umc_reg_offset, channel_index; \
for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) { \
/* enable the index mode to query error count per channel */ \
adev->umc.funcs->enable_umc_index_mode(adev, umc_inst); \
for (channel_inst = 0; \
channel_inst < adev->umc.channel_inst_num; \
channel_inst++) { \
/* calc the register offset according to channel instance */ \
umc_reg_offset = adev->umc.channel_offs * channel_inst; \
/* get channel index of interleaved memory */ \
channel_index = adev->umc.channel_idx_tbl[ \
umc_inst * adev->umc.channel_inst_num + channel_inst]; \
(func)(adev, err_data, umc_reg_offset, channel_index); \
} \
} \
adev->umc.funcs->disable_umc_index_mode(adev);
struct amdgpu_umc_funcs { struct amdgpu_umc_funcs {
void (*err_cnt_init)(struct amdgpu_device *adev); void (*err_cnt_init)(struct amdgpu_device *adev);
int (*ras_late_init)(struct amdgpu_device *adev); int (*ras_late_init)(struct amdgpu_device *adev);
...@@ -60,9 +28,6 @@ struct amdgpu_umc_funcs { ...@@ -60,9 +28,6 @@ struct amdgpu_umc_funcs {
void *ras_error_status); void *ras_error_status);
void (*query_ras_error_address)(struct amdgpu_device *adev, void (*query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
void (*enable_umc_index_mode)(struct amdgpu_device *adev,
uint32_t umc_instance);
void (*disable_umc_index_mode)(struct amdgpu_device *adev);
void (*init_registers)(struct amdgpu_device *adev); void (*init_registers)(struct amdgpu_device *adev);
}; };
......
...@@ -32,6 +32,8 @@ ...@@ -32,6 +32,8 @@
#define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10
#define UMC_6_INST_DIST 0x40000
/* /*
* (addr / 256) * 8192, the higher 26 bits in ErrorAddr * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
* is the index of 8KB block * is the index of 8KB block
...@@ -50,41 +52,11 @@ const uint32_t ...@@ -50,41 +52,11 @@ const uint32_t
{9, 25, 0, 16}, {15, 31, 6, 22} {9, 25, 0, 16}, {15, 31, 6, 22}
}; };
static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev, static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
uint32_t umc_instance) uint32_t umc_inst,
{ uint32_t ch_inst)
uint32_t rsmu_umc_index;
rsmu_umc_index = RREG32_SOC15(RSMU, 0,
mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
RSMU_UMC_INDEX_MODE_EN, 1);
rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
RSMU_UMC_INDEX_INSTANCE, umc_instance);
rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
RSMU_UMC_INDEX_WREN, 1 << umc_instance);
WREG32_SOC15(RSMU, 0, mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
rsmu_umc_index);
}
/*
 * Turn UMC index mode off: writes 0 to the RSMU_UMC_INDEX_MODE_EN field of
 * RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU through WREG32_FIELD15.
 */
static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
RSMU_UMC_INDEX_MODE_EN, 0);
}
static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
{ {
uint32_t rsmu_umc_index; return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst;
rsmu_umc_index = RREG32_SOC15(RSMU, 0,
mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
return REG_GET_FIELD(rsmu_umc_index,
RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
RSMU_UMC_INDEX_INSTANCE);
} }
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
...@@ -174,25 +146,36 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev ...@@ -174,25 +146,36 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev
*error_count += 1; *error_count += 1;
} }
/*
 * Per-channel callback used with amdgpu_umc_for_each_channel.
 *
 * Forwards umc_reg_offset to the correctable and uncorrectable query
 * helpers, handing them pointers to err_data->ce_count and
 * err_data->ue_count so they can update the counters in place.
 * channel_index is part of the callback signature but is not used here.
 */
static void umc_v6_1_query_error_count(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint32_t umc_reg_offset,
uint32_t channel_index)
{
umc_v6_1_query_correctable_error_count(adev, umc_reg_offset,
&(err_data->ce_count));
umc_v6_1_querry_uncorrectable_error_count(adev, umc_reg_offset,
&(err_data->ue_count));
}
static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev, static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status) void *ras_error_status)
{ {
amdgpu_umc_for_each_channel(umc_v6_1_query_error_count); struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;
uint32_t umc_inst = 0;
uint32_t ch_inst = 0;
uint32_t umc_reg_offset = 0;
for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
umc_reg_offset = get_umc_6_reg_offset(adev,
umc_inst,
ch_inst);
umc_v6_1_query_correctable_error_count(adev,
umc_reg_offset,
&(err_data->ce_count));
umc_v6_1_querry_uncorrectable_error_count(adev,
umc_reg_offset,
&(err_data->ue_count));
}
}
} }
static void umc_v6_1_query_error_address(struct amdgpu_device *adev, static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, struct ras_err_data *err_data,
uint32_t umc_reg_offset, uint32_t channel_index) uint32_t umc_reg_offset,
uint32_t channel_index,
uint32_t umc_inst)
{ {
uint32_t lsb, mc_umc_status_addr; uint32_t lsb, mc_umc_status_addr;
uint64_t mc_umc_status, err_addr, retired_page; uint64_t mc_umc_status, err_addr, retired_page;
...@@ -244,7 +227,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, ...@@ -244,7 +227,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
err_rec->cu = 0; err_rec->cu = 0;
err_rec->mem_channel = channel_index; err_rec->mem_channel = channel_index;
err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); err_rec->mcumc_id = umc_inst;
err_data->err_addr_cnt++; err_data->err_addr_cnt++;
} }
...@@ -257,12 +240,30 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, ...@@ -257,12 +240,30 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev, static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
void *ras_error_status) void *ras_error_status)
{ {
amdgpu_umc_for_each_channel(umc_v6_1_query_error_address); struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;
uint32_t umc_inst = 0;
uint32_t ch_inst = 0;
uint32_t umc_reg_offset = 0;
for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
umc_reg_offset = get_umc_6_reg_offset(adev,
umc_inst,
ch_inst);
umc_v6_1_query_error_address(adev,
err_data,
umc_reg_offset,
ch_inst,
umc_inst);
}
}
} }
static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint32_t umc_reg_offset)
uint32_t umc_reg_offset, uint32_t channel_index)
{ {
uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
uint32_t ecc_err_cnt_addr; uint32_t ecc_err_cnt_addr;
...@@ -301,9 +302,19 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, ...@@ -301,9 +302,19 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev) static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
{ {
void *ras_error_status = NULL; uint32_t umc_inst = 0;
uint32_t ch_inst = 0;
uint32_t umc_reg_offset = 0;
amdgpu_umc_for_each_channel(umc_v6_1_err_cnt_init_per_channel); for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
umc_reg_offset = get_umc_6_reg_offset(adev,
umc_inst,
ch_inst);
umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
}
}
} }
const struct amdgpu_umc_funcs umc_v6_1_funcs = { const struct amdgpu_umc_funcs umc_v6_1_funcs = {
...@@ -311,6 +322,4 @@ const struct amdgpu_umc_funcs umc_v6_1_funcs = { ...@@ -311,6 +322,4 @@ const struct amdgpu_umc_funcs umc_v6_1_funcs = {
.ras_late_init = amdgpu_umc_ras_late_init, .ras_late_init = amdgpu_umc_ras_late_init,
.query_ras_error_count = umc_v6_1_query_ras_error_count, .query_ras_error_count = umc_v6_1_query_ras_error_count,
.query_ras_error_address = umc_v6_1_query_ras_error_address, .query_ras_error_address = umc_v6_1_query_ras_error_address,
.enable_umc_index_mode = umc_v6_1_enable_umc_index_mode,
.disable_umc_index_mode = umc_v6_1_disable_umc_index_mode,
}; };
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment