drm/amdgpu: Use explicit cardinality for clarity

RAS_MAX_RECORD_NUM may mean the maximum record number, as in the maximum house number on your street, or it may mean the maximum number of records, as in the count of records, which is also a number. To make this distinction whether the number is ordinal (index) or cardinal (count), rename this macro to RAS_MAX_RECORD_COUNT. This makes it easy to understand what it refers to, especially when we compute quantities such as, how many records do we have left in the table, especially when there are so many other numbers, quantities and numerical macros around. Also rename the long, amdgpu_ras_eeprom_get_record_max_length() to the more succinct and clear, amdgpu_ras_eeprom_max_record_count(). When computing the threshold, which also deals with counts, i.e. "how many", use cardinal "max_eeprom_records_count", than the quantitative "max_eeprom_records_len". Simplify the logic here and there, as well. Cc: Guchun Chen <guchun.chen@amd.com> Cc: John Clements <john.clements@amd.com> Cc: Hawking Zhang <Hawking.Zhang@amd.com> Cc: Alexander Deucher <Alexander.Deucher@amd.com> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com> Acked-by: Alexander Deucher <Alexander.Deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: Use explicit cardinality for clarity
RAS_MAX_RECORD_NUM may mean the maximum record number, as in the maximum house number on your street, or it may mean the maximum number of records, as in the count of records, which is also a number. To make this distinction whether the number is ordinal (index) or cardinal (count), rename this macro to RAS_MAX_RECORD_COUNT. This makes it easy to understand what it refers to, especially when we compute quantities such as, how many records do we have left in the table, especially when there are so many other numbers, quantities and numerical macros around. Also rename the long, amdgpu_ras_eeprom_get_record_max_length() to the more succinct and clear, amdgpu_ras_eeprom_max_record_count(). When computing the threshold, which also deals with counts, i.e. "how many", use cardinal "max_eeprom_records_count", than the quantitative "max_eeprom_records_len". Simplify the logic here and there, as well. Cc: Guchun Chen <guchun.chen@amd.com> Cc: John Clements <john.clements@amd.com> Cc: Hawking Zhang <Hawking.Zhang@amd.com> Cc: Alexander Deucher <Alexander.Deucher@amd.com> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com> Acked-by: Alexander Deucher <Alexander.Deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
e4e6a589 · Luben Tuikov · Alex Deucher · 803c6ebd · e4e6a589 · e4e6a589
Commit e4e6a589 authored Mar 27, 2021 by Luben Tuikov Committed by Alex Deucher Jul 01, 2021
4 changed files
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -870,11 +870,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac
 module_param_named(reset_method, amdgpu_reset_method, int, 0444);
 /**
- * DOC: bad_page_threshold (int)
+ * DOC: bad_page_threshold (int) Bad page threshold is specifies the
- * Bad page threshold is to specify the threshold value of faulty pages
+ * threshold value of faulty pages detected by RAS ECC, which may
- * detected by RAS ECC, that may result in GPU entering bad status if total
+ * result in the GPU entering bad status when the number of total
- * faulty pages by ECC exceed threshold value and leave it for user's further
+ * faulty pages by ECC exceeds the threshold value.
- * check.
 */
 MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,8 +71,8 @@ const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
-/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
+/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
-#define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
+#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
@@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras_eeprom_control *control =
-					&adev->psp.ras.ras->eeprom_control;
+		&adev->psp.ras.ras->eeprom_control;
-	struct eeprom_table_record *bps = NULL;
+	struct eeprom_table_record *bps;
-	int ret = 0;
+	int ret;
 	/* no bad page record, skip eeprom access */
-	if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
+	if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0)
-		return ret;
+		return 0;
 	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
 	if (!bps)
 		return -ENOMEM;
-	if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) {
+	ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs);
+	if (ret)
 		dev_err(adev->dev, "Failed to load EEPROM table records!");
-		ret = -EIO;
+	else
-		goto out;
+		ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
-	}
-	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
-out:
 	kfree(bps);
 	return ret;
 }
@@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 }
 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
-					uint32_t max_length)
+					  uint32_t max_count)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	int tmp_threshold = amdgpu_bad_page_threshold;
-	u64 val;
 	/*
 	 * Justification of value bad_page_cnt_threshold in ras structure
@@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
 	 *      take no effect.
 	 */
-	if (tmp_threshold < -1)
+	if (amdgpu_bad_page_threshold < 0) {
-		tmp_threshold = -1;
+		u64 val = adev->gmc.mc_vram_size;
-	else if (tmp_threshold > max_length)
-		tmp_threshold = max_length;
-	if (tmp_threshold == -1) {
+		do_div(val, RAS_BAD_PAGE_COVER);
-		val = adev->gmc.mc_vram_size;
-		do_div(val, RAS_BAD_PAGE_RATE);
 		con->bad_page_cnt_threshold = min(lower_32_bits(val),
-						max_length);
+						  max_count);
 	} else {
-		con->bad_page_cnt_threshold = tmp_threshold;
+		con->bad_page_cnt_threshold = min_t(int, max_count,
+						    amdgpu_bad_page_threshold);
 	}
 }
@@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
-	uint32_t max_eeprom_records_len = 0;
+	u32  max_eeprom_records_count = 0;
 	bool exc_err_limit = false;
 	int ret;
@@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
-	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
-	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
 	/* Todo: During test the SMU might fail to read the eeprom through I2C
 	 * when the GPU is pending on XGMI reset during probe time

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -54,7 +54,7 @@
 #define RAS_TBL_SIZE_BYTES      (256 * 1024)
 #define RAS_HDR_START           0
 #define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
-#define RAS_MAX_RECORD_NUM      ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
+#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
 				 / RAS_TABLE_RECORD_SIZE)
 #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
@@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control,
 		 * TODO - Check the assumption is correct
 		 */
 		control->num_recs += num;
-		control->num_recs %= RAS_MAX_RECORD_NUM;
+		control->num_recs %= RAS_MAX_RECORD_COUNT;
 		control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num;
 		if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES)
 			control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
@@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
 	return amdgpu_ras_eeprom_xfer(control, records, num, true);
 }
-inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
 {
-	return RAS_MAX_RECORD_NUM;
+	return RAS_MAX_RECORD_COUNT;
 }
 /* Used for testing if bugs encountered */

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
 int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
 			    struct eeprom_table_record *records, const u32 num);
-inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
+inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);