Commit e4e6a589 authored by Luben Tuikov's avatar Luben Tuikov Committed by Alex Deucher

drm/amdgpu: Use explicit cardinality for clarity

RAS_MAX_RECORD_NUM may mean the maximum record
number, as in the maximum house number on your
street, or it may mean the maximum number of
records, as in the count of records, which is also
a number. To make this distinction whether the
number is ordinal (index) or cardinal (count),
rename this macro to RAS_MAX_RECORD_COUNT.

This makes it easy to understand what it refers
to, especially when we compute quantities such as,
how many records do we have left in the table,
especially when there are so many other numbers,
quantities and numerical macros around.

Also rename the long,
amdgpu_ras_eeprom_get_record_max_length() to the
more succinct and clear,
amdgpu_ras_eeprom_max_record_count().

When computing the threshold, which also deals
with counts, i.e. "how many", use cardinal
"max_eeprom_records_count", than the quantitative
"max_eeprom_records_len".

Simplify the logic here and there, as well.

Cc: Guchun Chen <guchun.chen@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: default avatarLuben Tuikov <luben.tuikov@amd.com>
Acked-by: default avatarAlexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 803c6ebd
...@@ -870,11 +870,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac ...@@ -870,11 +870,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac
module_param_named(reset_method, amdgpu_reset_method, int, 0444); module_param_named(reset_method, amdgpu_reset_method, int, 0444);
/** /**
* DOC: bad_page_threshold (int) * DOC: bad_page_threshold (int) Bad page threshold is specifies the
* Bad page threshold is to specify the threshold value of faulty pages * threshold value of faulty pages detected by RAS ECC, which may
* detected by RAS ECC, that may result in GPU entering bad status if total * result in the GPU entering bad status when the number of total
* faulty pages by ECC exceed threshold value and leave it for user's further * faulty pages by ECC exceeds the threshold value.
* check.
*/ */
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
......
...@@ -71,8 +71,8 @@ const char *ras_block_string[] = { ...@@ -71,8 +71,8 @@ const char *ras_block_string[] = {
/* inject address is 52 bits */ /* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
enum amdgpu_ras_retire_page_reservation { enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_RESERVED,
...@@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) ...@@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{ {
struct amdgpu_ras_eeprom_control *control = struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras.ras->eeprom_control; &adev->psp.ras.ras->eeprom_control;
struct eeprom_table_record *bps = NULL; struct eeprom_table_record *bps;
int ret = 0; int ret;
/* no bad page record, skip eeprom access */ /* no bad page record, skip eeprom access */
if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0)
return ret; return 0;
bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
if (!bps) if (!bps)
return -ENOMEM; return -ENOMEM;
if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) { ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs);
if (ret)
dev_err(adev->dev, "Failed to load EEPROM table records!"); dev_err(adev->dev, "Failed to load EEPROM table records!");
ret = -EIO; else
goto out; ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
}
ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
out:
kfree(bps); kfree(bps);
return ret; return ret;
} }
...@@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, ...@@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
} }
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
uint32_t max_length) uint32_t max_count)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
/* /*
* Justification of value bad_page_cnt_threshold in ras structure * Justification of value bad_page_cnt_threshold in ras structure
...@@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, ...@@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
* take no effect. * take no effect.
*/ */
if (tmp_threshold < -1) if (amdgpu_bad_page_threshold < 0) {
tmp_threshold = -1; u64 val = adev->gmc.mc_vram_size;
else if (tmp_threshold > max_length)
tmp_threshold = max_length;
if (tmp_threshold == -1) { do_div(val, RAS_BAD_PAGE_COVER);
val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
con->bad_page_cnt_threshold = min(lower_32_bits(val), con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_length); max_count);
} else { } else {
con->bad_page_cnt_threshold = tmp_threshold; con->bad_page_cnt_threshold = min_t(int, max_count,
amdgpu_bad_page_threshold);
} }
} }
...@@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data; struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0; u32 max_eeprom_records_count = 0;
bool exc_err_limit = false; bool exc_err_limit = false;
int ret; int ret;
...@@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0); atomic_set(&con->in_recovery, 0);
con->adev = adev; con->adev = adev;
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
/* Todo: During test the SMU might fail to read the eeprom through I2C /* Todo: During test the SMU might fail to read the eeprom through I2C
* when the GPU is pending on XGMI reset during probe time * when the GPU is pending on XGMI reset during probe time
......
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
#define RAS_TBL_SIZE_BYTES (256 * 1024) #define RAS_TBL_SIZE_BYTES (256 * 1024)
#define RAS_HDR_START 0 #define RAS_HDR_START 0
#define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
#define RAS_MAX_RECORD_NUM ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ #define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
/ RAS_TABLE_RECORD_SIZE) / RAS_TABLE_RECORD_SIZE)
#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
...@@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, ...@@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control,
* TODO - Check the assumption is correct * TODO - Check the assumption is correct
*/ */
control->num_recs += num; control->num_recs += num;
control->num_recs %= RAS_MAX_RECORD_NUM; control->num_recs %= RAS_MAX_RECORD_COUNT;
control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num;
if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES)
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
...@@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, ...@@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
return amdgpu_ras_eeprom_xfer(control, records, num, true); return amdgpu_ras_eeprom_xfer(control, records, num, true);
} }
inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
{ {
return RAS_MAX_RECORD_NUM; return RAS_MAX_RECORD_COUNT;
} }
/* Used for testing if bugs encountered */ /* Used for testing if bugs encountered */
......
...@@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, ...@@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records, const u32 num); struct eeprom_table_record *records, const u32 num);
inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control); void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment