Commit f27defca authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: umc v12_0 logs ecc errors

1. umc v12_0 logs ecc errors.
2. Reserve newly detected ecc error pages.
3. Add tag for bad pages, so that they can
   be retired later.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent b2aa6b10
......@@ -21,10 +21,13 @@
*
*/
#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
#define MAX_UMC_HASH_STRING_SIZE 256
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
......@@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
status, ipid, addr);
return 0;
}
/*
 * Three-way comparator for two uint64_t values, used as the sort()
 * callback when ordering page frame numbers.
 *
 * Returns 1 if *a > *b, -1 if *a < *b, and 0 if they are equal.
 */
static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
{
	const uint64_t lhs = *(const uint64_t *)a;
	const uint64_t rhs = *(const uint64_t *)b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
/*
 * Use string hash to avoid logging the same bad pages repeatedly.
 *
 * @adev: device whose RAS context provides the siphash key
 * @pfns: array of page frame numbers; sorted in place (ascending)
 * @len:  number of entries in @pfns
 * @val:  receives the resulting 64-bit hash
 *
 * The pfns are sorted first, then formatted as concatenated hex strings
 * and hashed with siphash, so the result does not depend on the order
 * in which the caller supplied them.
 *
 * Returns 0 on success, or -EINVAL if @pfns or @val is NULL, @len is not
 * positive, or the formatted pfns do not fit in the
 * MAX_UMC_HASH_STRING_SIZE byte buffer.
 */
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
		uint64_t *pfns, int len, uint64_t *val)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
	int offset = 0;
	int written;
	int i;

	/* A negative len would reach sort() as a huge unsigned count. */
	if (!pfns || !val || len <= 0)
		return -EINVAL;

	sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);

	for (i = 0; i < len; i++) {
		written = snprintf(&buf[offset], sizeof(buf) - offset,
				   "%llx", pfns[i]);
		/*
		 * snprintf() returns the length the output would have had,
		 * not what was stored. Adding it blindly on truncation
		 * would push offset past the buffer, wrap the remaining
		 * size on the next pass and make siphash() read beyond buf.
		 */
		if (written < 0 || written >= (int)sizeof(buf) - offset)
			return -EINVAL;
		offset += written;
	}

	*val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);

	return 0;
}
/*
 * Record an ECC error entry in @ecc_tree, keyed by ecc_err->hash_index.
 *
 * @adev:     device the error belongs to
 * @ecc_tree: radix tree that stores logged ECC errors
 * @ecc_err:  entry to insert
 *
 * The whole operation runs under the umc_ecc_log lock of the device's
 * RAS context. When the insert succeeds, every page listed in
 * ecc_err->err_pages is passed to amdgpu_ras_reserve_page() and the new
 * entry is tagged with UMC_ECC_NEW_DETECTED_TAG. When the insert fails,
 * nothing is reserved or tagged.
 *
 * Returns the result of radix_tree_insert(): 0 on success, non-zero on
 * failure.
 */
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_log_info *ecc_log = &con->umc_ecc_log;
	struct ras_err_pages *err_pages;
	int idx;
	int ret;

	mutex_lock(&ecc_log->lock);

	ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
	if (ret)
		goto out_unlock;

	/* Reserve memory */
	err_pages = &ecc_err->err_pages;
	for (idx = 0; idx < err_pages->count; idx++)
		amdgpu_ras_reserve_page(adev, err_pages->pfn[idx]);

	radix_tree_tag_set(ecc_tree, ecc_err->hash_index,
			   UMC_ECC_NEW_DETECTED_TAG);

out_unlock:
	mutex_unlock(&ecc_log->lock);

	return ret;
}
......@@ -52,6 +52,8 @@
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
/*
 * Page retirement tag: set on a radix tree entry when a newly detected
 * ECC error is logged (see amdgpu_umc_logs_ecc_err), marking its bad
 * pages so that they can be retired later.
 */
#define UMC_ECC_NEW_DETECTED_TAG 0x1
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
uint32_t umc_inst, uint32_t ch_inst, void *data);
......@@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr);
/*
 * Compute a hash over the @len page frame numbers in @pfns and store it
 * in @val. @pfns is sorted in place. Returns 0 on success or -EINVAL on
 * invalid arguments.
 */
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
uint64_t *pfns, int len, uint64_t *val);
/*
 * Insert @ecc_err into @ecc_tree keyed by its hash_index; on success the
 * entry's pages are reserved and the entry is tagged with
 * UMC_ECC_NEW_DETECTED_TAG. Returns 0 on success, or the non-zero
 * radix_tree_insert() error otherwise.
 */
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
#endif
......@@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint16_t hwid, mcatype;
struct ta_ras_query_address_input addr_in;
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
uint64_t err_addr;
uint64_t err_addr, hash_val = 0;
struct ras_ecc_err *ecc_err;
int count;
int ret;
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
......@@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return 0;
}
ret = amdgpu_umc_build_pages_hash(adev,
page_pfn, count, &hash_val);
if (ret) {
dev_err(adev->dev, "Fail to build error pages hash\n");
return ret;
}
ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
if (!ecc_err)
return -ENOMEM;
ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
if (!ecc_err->err_pages.pfn) {
kfree(ecc_err);
return -ENOMEM;
}
memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
ecc_err->err_pages.count = count;
ecc_err->hash_index = hash_val;
ecc_err->status = status;
ecc_err->ipid = ipid;
ecc_err->addr = addr;
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) {
if (ret == -EEXIST)
con->umc_ecc_log.de_updated = true;
else
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
kfree(ecc_err->err_pages.pfn);
kfree(ecc_err);
return ret;
}
con->umc_ecc_log.de_updated = true;
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment