Commit 630ba007, authored by Shiju Jose, committed by David S. Miller

net: hns3: add handling of RDMA RAS errors

This patch handles the RDMA RAS errors.
1. Enable RAS interrupt, print error detail info and clear error status.
2. Do a CORE reset to recover when these non-fatal errors happen.
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent c3529177
...@@ -229,6 +229,9 @@ enum hclge_opcode_type { ...@@ -229,6 +229,9 @@ enum hclge_opcode_type {
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513, HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515, HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803, HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
HCLGE_IGU_COMMON_INT_EN = 0x1806, HCLGE_IGU_COMMON_INT_EN = 0x1806,
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14, HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
......
...@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = { ...@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
{ /* sentinel */ } { /* sentinel */ }
}; };
/*
 * ROCEE QMM overflow error descriptions.
 *
 * Unlike a bit-mask table, .int_msk here holds an error *type code*:
 * hclge_log_rocee_ovf_error() compares it for equality against the
 * value extracted with HCLGE_ROCEE_OVF_ERR_TYPE_MASK. The list ends
 * with an all-zero sentinel entry (.msg == NULL).
 */
static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
	/* sgid / smac */
	{ .int_msk = 0x00, .msg = "rocee qmm ovf: sgid invalid err" },
	{ .int_msk = 0x04, .msg = "rocee qmm ovf: sgid ovf err" },
	{ .int_msk = 0x08, .msg = "rocee qmm ovf: smac invalid err" },
	{ .int_msk = 0x0C, .msg = "rocee qmm ovf: smac ovf err" },

	/* cqc */
	{ .int_msk = 0x10, .msg = "rocee qmm ovf: cqc invalid err" },
	{ .int_msk = 0x11, .msg = "rocee qmm ovf: cqc ovf err" },
	{ .int_msk = 0x12, .msg = "rocee qmm ovf: cqc hopnum err" },
	{ .int_msk = 0x13, .msg = "rocee qmm ovf: cqc ba0 err" },

	/* srqc */
	{ .int_msk = 0x14, .msg = "rocee qmm ovf: srqc invalid err" },
	{ .int_msk = 0x15, .msg = "rocee qmm ovf: srqc ovf err" },
	{ .int_msk = 0x16, .msg = "rocee qmm ovf: srqc hopnum err" },
	{ .int_msk = 0x17, .msg = "rocee qmm ovf: srqc ba0 err" },

	/* mpt */
	{ .int_msk = 0x18, .msg = "rocee qmm ovf: mpt invalid err" },
	{ .int_msk = 0x19, .msg = "rocee qmm ovf: mpt ovf err" },
	{ .int_msk = 0x1A, .msg = "rocee qmm ovf: mpt hopnum err" },
	{ .int_msk = 0x1B, .msg = "rocee qmm ovf: mpt ba0 err" },

	/* qpc */
	{ .int_msk = 0x1C, .msg = "rocee qmm ovf: qpc invalid err" },
	{ .int_msk = 0x1D, .msg = "rocee qmm ovf: qpc ovf err" },
	{ .int_msk = 0x1E, .msg = "rocee qmm ovf: qpc hopnum err" },
	{ .int_msk = 0x1F, .msg = "rocee qmm ovf: qpc ba0 err" },

	{ /* sentinel */ }
};
static void hclge_log_error(struct device *dev, char *reg, static void hclge_log_error(struct device *dev, char *reg,
const struct hclge_hw_error *err, const struct hclge_hw_error *err,
u32 err_sts) u32 err_sts)
...@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev) ...@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
return ret; return ret;
} }
/*
 * hclge_log_rocee_ovf_error - query and log ROCEE overflow errors
 * @hdev: device whose ROCEE overflow status is queried
 *
 * Issues HCLGE_ROCEE_PF_RAS_INT_CMD and inspects the first three data
 * words of the response. Word 0 is decoded through
 * hclge_rocee_qmm_ovf_err_int[]; words 1 and 2 are reported as TSP and
 * SCC overflow respectively. A word is only reported when its
 * HCLGE_ROCEE_OVF_ERR_INT_MASK bit is set.
 *
 * Return: 0 after logging, or the error returned by the query command.
 */
static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev)
{
	struct device *dev = &hdev->pdev->dev;
	const struct hclge_hw_error *entry;
	struct hclge_desc desc[2];
	u32 qmm_sts, tsp_sts, scc_sts;
	int ret;

	/* read overflow error status */
	ret = hclge_cmd_query_error(hdev, desc, HCLGE_ROCEE_PF_RAS_INT_CMD,
				    0, 0, 0);
	if (ret) {
		dev_err(dev, "failed(%d) to query ROCEE OVF error sts\n", ret);
		return ret;
	}

	qmm_sts = le32_to_cpu(desc[0].data[0]);
	tsp_sts = le32_to_cpu(desc[0].data[1]);
	scc_sts = le32_to_cpu(desc[0].data[2]);

	/* log overflow error */
	if (qmm_sts & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
		u32 err_type = qmm_sts & HCLGE_ROCEE_OVF_ERR_TYPE_MASK;

		/* exact-match lookup; an unknown type code logs nothing */
		for (entry = hclge_rocee_qmm_ovf_err_int; entry->msg; entry++) {
			if (entry->int_msk != err_type)
				continue;

			dev_warn(dev, "%s [error status=0x%x] found\n",
				 entry->msg, qmm_sts);
			break;
		}
	}

	if (tsp_sts & HCLGE_ROCEE_OVF_ERR_INT_MASK)
		dev_warn(dev, "ROCEE TSP OVF [error status=0x%x] found\n",
			 tsp_sts);

	if (scc_sts & HCLGE_ROCEE_OVF_ERR_INT_MASK)
		dev_warn(dev, "ROCEE SCC OVF [error status=0x%x] found\n",
			 scc_sts);

	return 0;
}
/*
 * hclge_log_and_clear_rocee_ras_error - report and clear ROCEE RAS errors
 * @hdev: device whose ROCEE RAS interrupt status is handled
 *
 * Queries HCLGE_QUERY_CLEAR_ROCEE_RAS_INT, logs each error source found
 * in the first data word, sends the same descriptor back to clear the
 * status, and finally registers a default reset request. The request is
 * HNAE3_FUNC_RESET unless a 2bit ECC error was seen or any command
 * failed, in which case it is HNAE3_GLOBAL_RESET.
 *
 * Return: 0 on success, otherwise the error from the failing query,
 * overflow logging, or clear command.
 */
static int hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev)
{
	enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET;
	/*
	 * NOTE(review): ae_dev is not referenced explicitly below;
	 * presumably HCLGE_SET_DEFAULT_RESET_REQUEST() expands to code
	 * that uses it - confirm before renaming or removing.
	 */
	struct hnae3_ae_dev *ae_dev = hdev->ae_dev;
	struct device *dev = &hdev->pdev->dev;
	struct hclge_desc desc[2];
	unsigned int ras_sts;
	int ret;

	/* read RAS error interrupt status */
	ret = hclge_cmd_query_error(hdev, desc,
				    HCLGE_QUERY_CLEAR_ROCEE_RAS_INT,
				    0, 0, 0);
	if (ret) {
		dev_err(dev, "failed(%d) to query ROCEE RAS INT SRC\n", ret);
		/* reset everything for now */
		reset_type = HNAE3_GLOBAL_RESET;
		goto request_reset;
	}

	ras_sts = le32_to_cpu(desc[0].data[0]);

	if (ras_sts & HCLGE_ROCEE_RERR_INT_MASK)
		dev_warn(dev, "ROCEE RAS AXI rresp error\n");

	if (ras_sts & HCLGE_ROCEE_BERR_INT_MASK)
		dev_warn(dev, "ROCEE RAS AXI bresp error\n");

	if (ras_sts & HCLGE_ROCEE_ECC_INT_MASK) {
		dev_warn(dev, "ROCEE RAS 2bit ECC error\n");
		reset_type = HNAE3_GLOBAL_RESET;
	}

	if (ras_sts & HCLGE_ROCEE_OVF_INT_MASK) {
		ret = hclge_log_rocee_ovf_error(hdev);
		if (ret) {
			dev_err(dev, "failed(%d) to process ovf error\n", ret);
			/* reset everything for now */
			reset_type = HNAE3_GLOBAL_RESET;
			goto request_reset;
		}
	}

	/* clear error status by sending the queried descriptor back */
	hclge_cmd_reuse_desc(&desc[0], false);
	ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
	if (ret) {
		dev_err(dev, "failed(%d) to clear ROCEE RAS error\n", ret);
		/* reset everything for now */
		reset_type = HNAE3_GLOBAL_RESET;
	}

request_reset:
	HCLGE_SET_DEFAULT_RESET_REQUEST(reset_type);

	return ret;
}
/*
 * hclge_config_rocee_ras_interrupt - enable or disable ROCEE RAS interrupts
 * @hdev: device to configure
 * @en: true to enable the interrupts, false to leave the enable words zeroed
 *
 * Does nothing (and returns 0) when the PCI revision is below 0x21 or
 * the device does not support RoCE. When enabling, any ROCEE RAS error
 * status is logged and cleared before the configuration command is sent;
 * the result of that step is not checked.
 *
 * Return: 0 on success or when skipped, otherwise the error from
 * hclge_cmd_send().
 */
static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en)
{
	struct device *dev = &hdev->pdev->dev;
	struct hclge_desc desc;
	int ret;

	if (hdev->pdev->revision < 0x21)
		return 0;

	if (!hnae3_dev_roce_supported(hdev))
		return 0;

	hclge_cmd_setup_basic_desc(&desc, HCLGE_CONFIG_ROCEE_RAS_INT_EN, false);

	/* the mask words are written for both enable and disable */
	desc.data[2] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN_MASK);
	desc.data[3] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN_MASK);

	if (en) {
		/* enable ROCEE hw error interrupts */
		desc.data[0] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN);
		desc.data[1] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN);

		hclge_log_and_clear_rocee_ras_error(hdev);
	}

	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
	if (ret)
		dev_err(dev, "failed(%d) to config ROCEE RAS interrupt\n", ret);

	return ret;
}
/*
 * hclge_handle_rocee_ras_error - entry point for ROCEE RAS error handling
 * @ae_dev: ae device whose private data is the hclge device
 *
 * Handling is skipped while HCLGE_STATE_RST_HANDLING is set or when the
 * PCI revision is below 0x21.
 *
 * Return: HNAE3_NONE_RESET when skipped, otherwise the value returned by
 * hclge_log_and_clear_rocee_ras_error().
 */
static int hclge_handle_rocee_ras_error(struct hnae3_ae_dev *ae_dev)
{
	struct hclge_dev *hdev = ae_dev->priv;

	if (!test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) &&
	    hdev->pdev->revision >= 0x21)
		return hclge_log_and_clear_rocee_ras_error(hdev);

	return HNAE3_NONE_RESET;
}
static const struct hclge_hw_blk hw_blk[] = { static const struct hclge_hw_blk hw_blk[] = {
{ {
.msk = BIT(0), .name = "IGU_EGU", .msk = BIT(0), .name = "IGU_EGU",
...@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = { ...@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
{ {
const struct hclge_hw_blk *module = hw_blk; const struct hclge_hw_blk *module = hw_blk;
struct device *dev = &hdev->pdev->dev;
int ret = 0; int ret = 0;
while (module->name) { while (module->name) {
...@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) ...@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
module++; module++;
} }
ret = hclge_config_rocee_ras_interrupt(hdev, state);
if (ret)
dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret);
return ret; return ret;
} }
...@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
"HNS Non-Fatal RAS error(status=0x%x) identified\n", "HNS Non-Fatal RAS error(status=0x%x) identified\n",
status); status);
hclge_handle_all_ras_errors(hdev); hclge_handle_all_ras_errors(hdev);
return PCI_ERS_RESULT_NEED_RESET; } else {
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
hdev->pdev->revision < 0x21)
return PCI_ERS_RESULT_RECOVERED;
}
if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
dev_warn(dev, "ROCEE uncorrected RAS error identified\n");
hclge_handle_rocee_ras_error(ae_dev);
} }
if (status & HCLGE_RAS_REG_NFE_MASK ||
status & HCLGE_RAS_REG_ROCEE_ERR_MASK)
return PCI_ERS_RESULT_NEED_RESET;
return PCI_ERS_RESULT_RECOVERED; return PCI_ERS_RESULT_RECOVERED;
} }
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
#define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800 #define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00 #define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
...@@ -83,6 +84,17 @@ ...@@ -83,6 +84,17 @@
#define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0) #define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0)
#define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0) #define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0)
/* Values written to data[0]/data[1] of HCLGE_CONFIG_ROCEE_RAS_INT_EN when enabling */
#define HCLGE_ROCEE_RAS_NFE_INT_EN 0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN 0x1
/* Values written to data[2]/data[3] of the same command (enable and disable) */
#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK 0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK 0x1
/* Error source bits tested in data[0] of the HCLGE_QUERY_CLEAR_ROCEE_RAS_INT response */
#define HCLGE_ROCEE_RERR_INT_MASK BIT(0)
#define HCLGE_ROCEE_BERR_INT_MASK BIT(1)
#define HCLGE_ROCEE_ECC_INT_MASK BIT(2)
#define HCLGE_ROCEE_OVF_INT_MASK BIT(3)
/* Bit tested in data[0..2] of the HCLGE_ROCEE_PF_RAS_INT_CMD response before logging */
#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000
/* Field of data[0] compared against hclge_rocee_qmm_ovf_err_int[].int_msk */
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F
enum hclge_err_int_type { enum hclge_err_int_type {
HCLGE_ERR_INT_MSIX = 0, HCLGE_ERR_INT_MSIX = 0,
HCLGE_ERR_INT_RAS_CE = 1, HCLGE_ERR_INT_RAS_CE = 1,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment