From 17f59244029bf9c0673725efdd0386ed95e127a7 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 8 Jun 2021 21:08:27 +0800 Subject: [PATCH 1/5] net: hns3: add support for handling all errors through MSI-X Currently, hardware errors can be reported through AER or MSI-X mode. However, the AER mode is intended to handle only bus errors, but not hardware errors. On the other hand, virtual machines cannot handle AER errors. When an AER error is reported, virtual machines will be suspended. So add support for handling all these hardware errors through MSI-X mode which depends on a newer version of firmware, and reserve the handler of the AER mode for compatibility. Signed-off-by: Yufeng Mo Signed-off-by: Jiaran Zhang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_err.c | 16 +++++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 47 ++++++++++--------- .../hisilicon/hns3/hns3pf/hclge_main.h | 1 + 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index f125aa425872..540dd15d7771 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -1611,11 +1611,27 @@ static const struct hclge_hw_blk hw_blk[] = { { /* sentinel */ } }; +static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable) +{ + u32 reg_val; + + reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG); + + if (enable) + reg_val |= BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B); + else + reg_val &= ~BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B); + + hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val); +} + int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state) { const struct hclge_hw_blk *module = hw_blk; int ret = 0; + hclge_config_all_msix_error(hdev, state); + while (module->name) { if (module->config_err_int) { ret = module->config_err_int(hdev, state); diff 
--git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 45102681bd2a..d5be3bc50b5c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { - u32 cmdq_src_reg, msix_src_reg; + u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; /* fetch the events from their corresponding regs */ cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG); msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); + hw_err_src_reg = hclge_read_dev(&hdev->hw, + HCLGE_RAS_PF_OTHER_INT_STS_REG); /* Assumption: If by any chance reset and mailbox events are reported * together then we will only process reset event in this go and will @@ -3339,11 +3341,11 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) return HCLGE_VECTOR0_EVENT_RST; } - /* check for vector0 msix event source */ - if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { - *clearval = msix_src_reg; + /* check for vector0 msix event and hardware error event source */ + if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK || + hw_err_src_reg & HCLGE_RAS_REG_NFE_MASK || + hw_err_src_reg & HCLGE_RAS_REG_ROCEE_ERR_MASK) return HCLGE_VECTOR0_EVENT_ERR; - } /* check for vector0 mailbox(=CMDQ RX) event source */ if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) { @@ -3354,9 +3356,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) /* print other vector0 event source */ dev_info(&hdev->pdev->dev, - "CMDQ INT status:0x%x, other INT status:0x%x\n", - cmdq_src_reg, msix_src_reg); - *clearval = msix_src_reg; + "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n", + cmdq_src_reg, hw_err_src_reg, msix_src_reg); return HCLGE_VECTOR0_EVENT_OTHER; } @@ -3427,15 
+3428,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data) hclge_clear_event_cause(hdev, event_cause, clearval); - /* Enable interrupt if it is not cause by reset. And when - * clearval equal to 0, it means interrupt status may be - * cleared by hardware before driver reads status register. - * For this case, vector0 interrupt also should be enabled. - */ - if (!clearval || - event_cause == HCLGE_VECTOR0_EVENT_MBX) { + /* Enable interrupt if it is not caused by reset event or error event */ + if (event_cause == HCLGE_VECTOR0_EVENT_MBX || + event_cause == HCLGE_VECTOR0_EVENT_OTHER) hclge_enable_vector(&hdev->misc_vector, true); - } return IRQ_HANDLED; } @@ -4244,22 +4240,27 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); struct device *dev = &hdev->pdev->dev; + enum hnae3_reset_type reset_type; u32 msix_sts_reg; msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); - if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { - if (hclge_handle_hw_msix_error(hdev, - &hdev->default_reset_request)) + if (hclge_handle_hw_msix_error + (hdev, &hdev->default_reset_request)) dev_info(dev, "received msix interrupt 0x%x\n", msix_sts_reg); + } + hclge_enable_vector(&hdev->misc_vector, true); - if (hdev->default_reset_request) - if (ae_dev->ops->reset_event) - ae_dev->ops->reset_event(hdev->pdev, NULL); + hclge_handle_hw_ras_error(ae_dev); + if (ae_dev->hw_err_reset_req) { + reset_type = hclge_get_reset_level(ae_dev, + &ae_dev->hw_err_reset_req); + hclge_set_def_reset_request(ae_dev, reset_type); } - hclge_enable_vector(&hdev->misc_vector, true); + if (hdev->default_reset_request && ae_dev->ops->reset_event) + ae_dev->ops->reset_event(hdev->pdev, NULL); } static void hclge_errhand_service_task(struct hclge_dev *hdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 9b8abb5d7a8e..582972a6f60e 100644 
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE { #define HCLGE_VECTOR0_IMP_RESET_INT_B 1 #define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 4U #define HCLGE_VECTOR0_IMP_RD_POISON_B 5U +#define HCLGE_VECTOR0_ALL_MSIX_ERR_B 6U #define HCLGE_MAC_DEFAULT_FRAME \ (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN) From 2e2deee7618b062efe3aba9fcb017dadcf148819 Mon Sep 17 00:00:00 2001 From: Jiaran Zhang Date: Tue, 8 Jun 2021 21:08:28 +0800 Subject: [PATCH 2/5] net: hns3: add the RAS compatibility adaptation solution To adapt to hardware modification and ensure that the driver is compatible with the original error handling content, we need to add the RAS compatibility adaptation solution. Add a processing branch to the driver during error handling. In the new processing branch, NIC fault information is integrated by the IMP. An interaction command is added between the driver and IMP to query and clear the fault source and interrupt source. The IMP integrates error information and reports the highest reset level to the driver. Signed-off-by: Jiaran Zhang Signed-off-by: Guangbin Huang Signed-off-by: David S. 
Miller --- .../hisilicon/hns3/hns3pf/hclge_cmd.c | 3 +- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 2 + .../hisilicon/hns3/hns3pf/hclge_err.c | 320 ++++++++++++++++-- .../hisilicon/hns3/hns3pf/hclge_err.h | 69 ++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 54 ++- 5 files changed, 409 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 8f6ed8577aea..614763f5e877 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -178,7 +178,8 @@ static bool hclge_is_special_opcode(u16 opcode) HCLGE_QUERY_CLEAR_MPF_RAS_INT, HCLGE_QUERY_CLEAR_PF_RAS_INT, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT, - HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT}; + HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT, + HCLGE_QUERY_ALL_ERR_INFO}; int i; for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index da78a6477e46..234f0a3beec1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -293,6 +293,8 @@ enum hclge_opcode_type { HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514, HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515, + HCLGE_QUERY_ALL_ERR_BD_NUM = 0x1516, + HCLGE_QUERY_ALL_ERR_INFO = 0x1517, HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580, HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581, HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 540dd15d7771..36f8055bd859 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -631,6 +631,98 @@ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = { { /* sentinel */ } }; +static const struct 
hclge_hw_module_id hclge_hw_module_id_st[] = { + { + .module_id = MODULE_NONE, + .msg = "MODULE_NONE" + }, { + .module_id = MODULE_BIOS_COMMON, + .msg = "MODULE_BIOS_COMMON" + }, { + .module_id = MODULE_GE, + .msg = "MODULE_GE" + }, { + .module_id = MODULE_IGU_EGU, + .msg = "MODULE_IGU_EGU" + }, { + .module_id = MODULE_LGE, + .msg = "MODULE_LGE" + }, { + .module_id = MODULE_NCSI, + .msg = "MODULE_NCSI" + }, { + .module_id = MODULE_PPP, + .msg = "MODULE_PPP" + }, { + .module_id = MODULE_QCN, + .msg = "MODULE_QCN" + }, { + .module_id = MODULE_RCB_RX, + .msg = "MODULE_RCB_RX" + }, { + .module_id = MODULE_RTC, + .msg = "MODULE_RTC" + }, { + .module_id = MODULE_SSU, + .msg = "MODULE_SSU" + }, { + .module_id = MODULE_TM, + .msg = "MODULE_TM" + }, { + .module_id = MODULE_RCB_TX, + .msg = "MODULE_RCB_TX" + }, { + .module_id = MODULE_TXDMA, + .msg = "MODULE_TXDMA" + }, { + .module_id = MODULE_MASTER, + .msg = "MODULE_MASTER" + } +}; + +static const struct hclge_hw_type_id hclge_hw_type_id_st[] = { + { + .type_id = NONE_ERROR, + .msg = "none_error" + }, { + .type_id = FIFO_ERROR, + .msg = "fifo_error" + }, { + .type_id = MEMORY_ERROR, + .msg = "memory_error" + }, { + .type_id = POISON_ERROR, + .msg = "poison_error" + }, { + .type_id = MSIX_ECC_ERROR, + .msg = "msix_ecc_error" + }, { + .type_id = TQP_INT_ECC_ERROR, + .msg = "tqp_int_ecc_error" + }, { + .type_id = PF_ABNORMAL_INT_ERROR, + .msg = "pf_abnormal_int_error" + }, { + .type_id = MPF_ABNORMAL_INT_ERROR, + .msg = "mpf_abnormal_int_error" + }, { + .type_id = COMMON_ERROR, + .msg = "common_error" + }, { + .type_id = PORT_ERROR, + .msg = "port_error" + }, { + .type_id = ETS_ERROR, + .msg = "ets_error" + }, { + .type_id = NCSI_ERROR, + .msg = "ncsi_error" + }, { + .type_id = GLB_ERROR, + .msg = "glb_error" + } +}; + static void hclge_log_error(struct device *dev, char *reg, const struct hclge_hw_error *err, u32 err_sts, unsigned long *reset_requests) @@ -1892,11 +1984,8 @@ static int hclge_handle_pf_msix_error(struct 
hclge_dev *hdev, static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests) { - struct hclge_mac_tnl_stats mac_tnl_stats; - struct device *dev = &hdev->pdev->dev; u32 mpf_bd_num, pf_bd_num, bd_num; struct hclge_desc *desc; - u32 status; int ret; /* query the number of bds for the MSIx int status */ @@ -1919,29 +2008,7 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, if (ret) goto msi_error; - /* query and clear mac tnl interruptions */ - hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_QUERY_MAC_TNL_INT, - true); - ret = hclge_cmd_send(&hdev->hw, &desc[0], 1); - if (ret) { - dev_err(dev, "query mac tnl int cmd failed (%d)\n", ret); - goto msi_error; - } - - status = le32_to_cpu(desc->data[0]); - if (status) { - /* When mac tnl interrupt occurs, we record current time and - * register status here in a fifo, then clear the status. So - * that if link status changes suddenly at some time, we can - * query them by debugfs. - */ - mac_tnl_stats.time = local_clock(); - mac_tnl_stats.status = status; - kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats); - ret = hclge_clear_mac_tnl_int(hdev); - if (ret) - dev_err(dev, "clear mac tnl int failed (%d)\n", ret); - } + ret = hclge_handle_mac_tnl(hdev); msi_error: kfree(desc); @@ -1963,10 +2030,43 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev, return hclge_handle_all_hw_msix_error(hdev, reset_requests); } +int hclge_handle_mac_tnl(struct hclge_dev *hdev) +{ + struct hclge_mac_tnl_stats mac_tnl_stats; + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc; + u32 status; + int ret; + + /* query and clear mac tnl interruptions */ + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_TNL_INT, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(dev, "failed to query mac tnl int, ret = %d.\n", ret); + return ret; + } + + status = le32_to_cpu(desc.data[0]); + if (status) { + /* When mac tnl interrupt occurs, we record current time and + * 
register status here in a fifo, then clear the status. So + * that if link status changes suddenly at some time, we can + * query them by debugfs. + */ + mac_tnl_stats.time = local_clock(); + mac_tnl_stats.status = status; + kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats); + ret = hclge_clear_mac_tnl_int(hdev); + if (ret) + dev_err(dev, "failed to clear mac tnl int, ret = %d.\n", + ret); + } + + return ret; +} + void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) { -#define HCLGE_DESC_NO_DATA_LEN 8 - struct hclge_dev *hdev = ae_dev->priv; struct device *dev = &hdev->pdev->dev; u32 mpf_bd_num, pf_bd_num, bd_num; @@ -2015,3 +2115,167 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) msi_error: kfree(desc); } + +static void +hclge_handle_error_type_reg_log(struct device *dev, + struct hclge_mod_err_info *mod_info, + struct hclge_type_reg_err_info *type_reg_info) +{ +#define HCLGE_ERR_TYPE_MASK 0x7F +#define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7 + + u8 mod_id, total_module, type_id, total_type, i, is_ras; + + mod_id = mod_info->mod_id; + type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK; + is_ras = type_reg_info->type_id >> HCLGE_ERR_TYPE_IS_RAS_OFFSET; + + total_module = ARRAY_SIZE(hclge_hw_module_id_st); + total_type = ARRAY_SIZE(hclge_hw_type_id_st); + + if (mod_id < total_module && type_id < total_type) + dev_err(dev, + "found %s %s, is %s error.\n", + hclge_hw_module_id_st[mod_id].msg, + hclge_hw_type_id_st[type_id].msg, + is_ras ? 
"ras" : "msix"); + else + dev_err(dev, + "unknown module[%u] or type[%u].\n", mod_id, type_id); + + dev_err(dev, "reg_value:\n"); + for (i = 0; i < type_reg_info->reg_num; i++) + dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]); +} + +static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev, + const u32 *buf, u32 buf_size) +{ + struct hclge_type_reg_err_info *type_reg_info; + struct hclge_dev *hdev = ae_dev->priv; + struct device *dev = &hdev->pdev->dev; + struct hclge_mod_err_info *mod_info; + struct hclge_sum_err_info *sum_info; + u8 mod_num, err_num, i; + u32 offset = 0; + + sum_info = (struct hclge_sum_err_info *)&buf[offset++]; + if (sum_info->reset_type && + sum_info->reset_type != HNAE3_NONE_RESET) + set_bit(sum_info->reset_type, &ae_dev->hw_err_reset_req); + mod_num = sum_info->mod_num; + + while (mod_num--) { + if (offset >= buf_size) { + dev_err(dev, "The offset(%u) exceeds buf's size(%u).\n", + offset, buf_size); + return; + } + mod_info = (struct hclge_mod_err_info *)&buf[offset++]; + err_num = mod_info->err_num; + + for (i = 0; i < err_num; i++) { + if (offset >= buf_size) { + dev_err(dev, + "The offset(%u) exceeds buf size(%u).\n", + offset, buf_size); + return; + } + + type_reg_info = (struct hclge_type_reg_err_info *) + &buf[offset++]; + hclge_handle_error_type_reg_log(dev, mod_info, + type_reg_info); + + offset += type_reg_info->reg_num; + } + } +} + +static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num) +{ + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc_bd; + int ret; + + hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_ALL_ERR_BD_NUM, true); + ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1); + if (ret) { + dev_err(dev, "failed to query error bd_num, ret = %d.\n", ret); + return ret; + } + + *bd_num = le32_to_cpu(desc_bd.data[0]); + if (!(*bd_num)) { + dev_err(dev, "The value of bd_num is 0!\n"); + return -EINVAL; + } + + return 0; +} + +static int hclge_query_all_err_info(struct hclge_dev 
*hdev, + struct hclge_desc *desc, u32 bd_num) +{ + struct device *dev = &hdev->pdev->dev; + int ret; + + hclge_cmd_setup_basic_desc(desc, HCLGE_QUERY_ALL_ERR_INFO, true); + ret = hclge_cmd_send(&hdev->hw, desc, bd_num); + if (ret) + dev_err(dev, "failed to query error info, ret = %d.\n", ret); + + return ret; +} + +int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev) +{ + u32 bd_num, desc_len, buf_len, buf_size, i; + struct hclge_dev *hdev = ae_dev->priv; + struct hclge_desc *desc; + __le32 *desc_data; + u32 *buf; + int ret; + + ret = hclge_query_all_err_bd_num(hdev, &bd_num); + if (ret) + goto out; + + desc_len = bd_num * sizeof(struct hclge_desc); + desc = kzalloc(desc_len, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out; + } + + ret = hclge_query_all_err_info(hdev, desc, bd_num); + if (ret) + goto err_desc; + + buf_len = bd_num * sizeof(struct hclge_desc) - HCLGE_DESC_NO_DATA_LEN; + buf_size = buf_len / sizeof(u32); + + desc_data = kzalloc(buf_len, GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto err_buf_alloc; + } + + memcpy(desc_data, &desc[0].data[0], buf_len); + for (i = 0; i < buf_size; i++) + buf[i] = le32_to_cpu(desc_data[i]); + + hclge_handle_error_module_log(ae_dev, buf, buf_size); + kfree(buf); + +err_buf_alloc: + kfree(desc_data); +err_desc: + kfree(desc); +out: + return ret; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index d647f3c84134..27ab772c665e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -107,6 +107,10 @@ #define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000 #define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F +#define HCLGE_DESC_DATA_MAX 8 +#define HCLGE_REG_NUM_MAX 256 +#define HCLGE_DESC_NO_DATA_LEN 8 + enum hclge_err_int_type { HCLGE_ERR_INT_MSIX = 0, HCLGE_ERR_INT_RAS_CE = 1, @@ -114,6 +118,40 @@ 
enum hclge_err_int_type { HCLGE_ERR_INT_RAS_FE = 3, }; +enum hclge_mod_name_list { + MODULE_NONE = 0, + MODULE_BIOS_COMMON = 1, + MODULE_GE = 2, + MODULE_IGU_EGU = 3, + MODULE_LGE = 4, + MODULE_NCSI = 5, + MODULE_PPP = 6, + MODULE_QCN = 7, + MODULE_RCB_RX = 8, + MODULE_RTC = 9, + MODULE_SSU = 10, + MODULE_TM = 11, + MODULE_RCB_TX = 12, + MODULE_TXDMA = 13, + MODULE_MASTER = 14, +}; + +enum hclge_err_type_list { + NONE_ERROR = 0, + FIFO_ERROR = 1, + MEMORY_ERROR = 2, + POISON_ERROR = 3, + MSIX_ECC_ERROR = 4, + TQP_INT_ECC_ERROR = 5, + PF_ABNORMAL_INT_ERROR = 6, + MPF_ABNORMAL_INT_ERROR = 7, + COMMON_ERROR = 8, + PORT_ERROR = 9, + ETS_ERROR = 10, + NCSI_ERROR = 11, + GLB_ERROR = 12, +}; + struct hclge_hw_blk { u32 msk; const char *name; @@ -126,6 +164,35 @@ struct hclge_hw_error { enum hnae3_reset_type reset_level; }; +struct hclge_hw_module_id { + enum hclge_mod_name_list module_id; + const char *msg; +}; + +struct hclge_hw_type_id { + enum hclge_err_type_list type_id; + const char *msg; +}; + +struct hclge_sum_err_info { + u8 reset_type; + u8 mod_num; + u8 rsv[2]; +}; + +struct hclge_mod_err_info { + u8 mod_id; + u8 err_num; + u8 rsv[2]; +}; + +struct hclge_type_reg_err_info { + u8 type_id; + u8 reg_num; + u8 rsv[2]; + u32 hclge_reg[HCLGE_REG_NUM_MAX]; +}; + int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en); int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state); int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en); @@ -133,4 +200,6 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev); pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev); int hclge_handle_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests); +int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev); +int hclge_handle_mac_tnl(struct hclge_dev *hdev); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 
d5be3bc50b5c..3c08fc71b951 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4236,11 +4236,49 @@ static void hclge_reset_subtask(struct hclge_dev *hdev) hdev->reset_type = HNAE3_NONE_RESET; } +static void hclge_handle_err_reset_request(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + enum hnae3_reset_type reset_type; + + if (ae_dev->hw_err_reset_req) { + reset_type = hclge_get_reset_level(ae_dev, + &ae_dev->hw_err_reset_req); + hclge_set_def_reset_request(ae_dev, reset_type); + } + + if (hdev->default_reset_request && ae_dev->ops->reset_event) + ae_dev->ops->reset_event(hdev->pdev, NULL); + + /* enable interrupt after error handling complete */ + hclge_enable_vector(&hdev->misc_vector, true); +} + +static void hclge_handle_err_recovery(struct hclge_dev *hdev) +{ + u32 mask_val = HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + u32 msix_src_flag, hw_err_src_flag; + + msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) & + HCLGE_VECTOR0_REG_MSIX_MASK; + + hw_err_src_flag = hclge_read_dev(&hdev->hw, + HCLGE_RAS_PF_OTHER_INT_STS_REG) & + mask_val; + + if (msix_src_flag || hw_err_src_flag) { + hclge_handle_error_info_log(ae_dev); + hclge_handle_mac_tnl(hdev); + } + + hclge_handle_err_reset_request(hdev); +} + static void hclge_misc_err_recovery(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); struct device *dev = &hdev->pdev->dev; - enum hnae3_reset_type reset_type; u32 msix_sts_reg; msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); @@ -4250,17 +4288,10 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev) dev_info(dev, "received msix interrupt 0x%x\n", msix_sts_reg); } - hclge_enable_vector(&hdev->misc_vector, true); hclge_handle_hw_ras_error(ae_dev); - if (ae_dev->hw_err_reset_req) { - reset_type 
= hclge_get_reset_level(ae_dev, - &ae_dev->hw_err_reset_req); - hclge_set_def_reset_request(ae_dev, reset_type); - } - if (hdev->default_reset_request && ae_dev->ops->reset_event) - ae_dev->ops->reset_event(hdev->pdev, NULL); + hclge_handle_err_reset_request(hdev); } static void hclge_errhand_service_task(struct hclge_dev *hdev) @@ -4268,7 +4299,10 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev) if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state)) return; - hclge_misc_err_recovery(hdev); + if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) + hclge_handle_err_recovery(hdev); + else + hclge_misc_err_recovery(hdev); } static void hclge_reset_service_task(struct hclge_dev *hdev) From e65e9f5c2e4efc17657d016d767eb7010d9dd598 Mon Sep 17 00:00:00 2001 From: Jiaran Zhang Date: Tue, 8 Jun 2021 21:08:29 +0800 Subject: [PATCH 3/5] net: hns3: add support for imp-handle ras capability IMP (Intelligent Management Processor) firmware adds a new feature to handle and consolidate RAS information for new devices, so the NIC driver only needs to query the reported RAS information. Add support for this feature to the NIC driver. The driver queries the device capability to check whether the IMP supports this feature; if it does, the driver executes the new RAS processing branch. To provide a way to check whether the PF supports the imp-handle RAS feature, dump this capability in debugfs. Signed-off-by: Jiaran Zhang Signed-off-by: Guangbin Huang Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 4 ++++ drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 3 +++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index dc9b5bc3431b..e564aa32a414 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -91,6 +91,7 @@ enum HNAE3_DEV_CAP_BITS { HNAE3_DEV_SUPPORT_STASH_B, HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, HNAE3_DEV_SUPPORT_PAUSE_B, + HNAE3_DEV_SUPPORT_RAS_IMP_B, HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B, HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, @@ -129,6 +130,9 @@ enum HNAE3_DEV_CAP_BITS { #define hnae3_dev_phy_imp_supported(hdev) \ test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps) +#define hnae3_dev_ras_imp_supported(hdev) \ + test_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, (hdev)->ae_dev->caps) + #define hnae3_dev_tqp_txrx_indep_supported(hdev) \ test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index cf1efd2f4a0f..a0edca848392 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -349,6 +349,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = { }, { .name = "support imp-controlled PHY", .cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B, + }, { + .name = "support imp-controlled RAS", + .cap_bit = HNAE3_DEV_SUPPORT_RAS_IMP_B, }, { .name = "support rxd advanced layout", .cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 
614763f5e877..887297e37cf3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -387,6 +387,8 @@ static void hclge_parse_capability(struct hclge_dev *hdev, set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B)) set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps); + if (hnae3_get_bit(caps, HCLGE_CAP_RAS_IMP_B)) + set_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B)) set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 234f0a3beec1..221811af9473 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -392,6 +392,7 @@ enum HCLGE_CAP_BITS { HCLGE_CAP_HW_PAD_B, HCLGE_CAP_STASH_B, HCLGE_CAP_UDP_TUNNEL_CSUM_B, + HCLGE_CAP_RAS_IMP_B = 12, HCLGE_CAP_FEC_B = 13, HCLGE_CAP_PAUSE_B = 14, HCLGE_CAP_RXD_ADV_LAYOUT_B = 15, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3c08fc71b951..cf34216df171 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4299,7 +4299,7 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev) if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state)) return; - if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) + if (hnae3_dev_ras_imp_supported(hdev)) hclge_handle_err_recovery(hdev); else hclge_misc_err_recovery(hdev); From 8a95e360fd512f1cb55239645879b15d26bc7e21 Mon Sep 17 00:00:00 2001 From: Jiaran Zhang Date: Tue, 8 Jun 2021 21:08:30 +0800 Subject: [PATCH 4/5] net: hns3: update error recovery module and type Update error 
recovery module and type for RoCE. The enumeration values of module names and error types are no longer contiguous, so indexing the name tables directly by mod_id and type_id cannot print them correctly. Search the tables instead: if mod_id and type_id match an enumerated value, display the corresponding information. Signed-off-by: Jiaran Zhang Signed-off-by: Weihang Li Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_err.c | 58 ++++++++++++++++++- .../hisilicon/hns3/hns3pf/hclge_err.h | 18 ++++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 3 +- 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 36f8055bd859..0e942d11dbf3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -677,6 +677,36 @@ static const struct hclge_hw_module_id hclge_hw_module_id_st[] = { }, { .module_id = MODULE_MASTER, .msg = "MODULE_MASTER" + }, { + .module_id = MODULE_ROCEE_TOP, + .msg = "MODULE_ROCEE_TOP" + }, { + .module_id = MODULE_ROCEE_TIMER, + .msg = "MODULE_ROCEE_TIMER" + }, { + .module_id = MODULE_ROCEE_MDB, + .msg = "MODULE_ROCEE_MDB" + }, { + .module_id = MODULE_ROCEE_TSP, + .msg = "MODULE_ROCEE_TSP" + }, { + .module_id = MODULE_ROCEE_TRP, + .msg = "MODULE_ROCEE_TRP" + }, { + .module_id = MODULE_ROCEE_SCC, + .msg = "MODULE_ROCEE_SCC" + }, { + .module_id = MODULE_ROCEE_CAEP, + .msg = "MODULE_ROCEE_CAEP" + }, { + .module_id = MODULE_ROCEE_GEN_AC, + .msg = "MODULE_ROCEE_GEN_AC" + }, { + .module_id = MODULE_ROCEE_QMM, + .msg = "MODULE_ROCEE_QMM" + }, { + .module_id = MODULE_ROCEE_LSAN, + .msg = "MODULE_ROCEE_LSAN" } }; @@ -720,6 +750,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = { }, { .type_id = GLB_ERROR, .msg = "glb_error" + }, { + .type_id = ROCEE_NORMAL_ERR, + .msg = "rocee_normal_error" + }, { + .type_id = ROCEE_OVF_ERR, + .msg =
"rocee_ovf_error" } }; @@ -2125,6 +2161,8 @@ hclge_handle_error_type_reg_log(struct device *dev, #define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7 u8 mod_id, total_module, type_id, total_type, i, is_ras; + u8 index_module = MODULE_NONE; + u8 index_type = NONE_ERROR; mod_id = mod_info->mod_id; type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK; @@ -2133,11 +2171,25 @@ hclge_handle_error_type_reg_log(struct device *dev, total_module = ARRAY_SIZE(hclge_hw_module_id_st); total_type = ARRAY_SIZE(hclge_hw_type_id_st); - if (mod_id < total_module && type_id < total_type) + for (i = 0; i < total_module; i++) { + if (mod_id == hclge_hw_module_id_st[i].module_id) { + index_module = i; + break; + } + } + + for (i = 0; i < total_type; i++) { + if (type_id == hclge_hw_type_id_st[i].type_id) { + index_type = i; + break; + } + } + + if (index_module != MODULE_NONE && index_type != NONE_ERROR) dev_err(dev, "found %s %s, is %s error.\n", - hclge_hw_module_id_st[mod_id].msg, - hclge_hw_type_id_st[type_id].msg, + hclge_hw_module_id_st[index_module].msg, + hclge_hw_type_id_st[index_type].msg, is_ras ? 
"ras" : "msix"); else dev_err(dev, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index 27ab772c665e..ce4c96bbef8e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -15,6 +15,8 @@ #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 +#define HCLGE_RAS_REG_ERR_MASK \ + (HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK) #define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00 @@ -134,6 +136,18 @@ enum hclge_mod_name_list { MODULE_RCB_TX = 12, MODULE_TXDMA = 13, MODULE_MASTER = 14, + /* add new MODULE NAME for NIC here in order */ + MODULE_ROCEE_TOP = 40, + MODULE_ROCEE_TIMER = 41, + MODULE_ROCEE_MDB = 42, + MODULE_ROCEE_TSP = 43, + MODULE_ROCEE_TRP = 44, + MODULE_ROCEE_SCC = 45, + MODULE_ROCEE_CAEP = 46, + MODULE_ROCEE_GEN_AC = 47, + MODULE_ROCEE_QMM = 48, + MODULE_ROCEE_LSAN = 49, + /* add new MODULE NAME for RoCEE here in order */ }; enum hclge_err_type_list { @@ -150,6 +164,10 @@ enum hclge_err_type_list { ETS_ERROR = 10, NCSI_ERROR = 11, GLB_ERROR = 12, + /* add new ERROR TYPE for NIC here in order */ + ROCEE_NORMAL_ERR = 40, + ROCEE_OVF_ERR = 41, + /* add new ERROR TYPE for ROCEE here in order */ }; struct hclge_hw_blk { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index cf34216df171..9ff4210f6477 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3343,8 +3343,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) /* check for vector0 msix event and hardware error event source */ if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK || - hw_err_src_reg & HCLGE_RAS_REG_NFE_MASK || - hw_err_src_reg & HCLGE_RAS_REG_ROCEE_ERR_MASK) + hw_err_src_reg & 
HCLGE_RAS_REG_ERR_MASK) return HCLGE_VECTOR0_EVENT_ERR; /* check for vector0 mailbox(=CMDQ RX) event source */ From 1c360a4a077fc0f74a350fe2ef267cbe8a9388e3 Mon Sep 17 00:00:00 2001 From: Jiaran Zhang Date: Tue, 8 Jun 2021 21:08:31 +0800 Subject: [PATCH 5/5] net: hns3: add error handling compatibility during initialization During initialization, the driver logs and clears the hw errors that already occurred. For a device that supports the imp-handle ras capability, it needs to handle a different error status; otherwise it may cause a wrong reset. So fix it by adding a new processing branch. Signed-off-by: Jiaran Zhang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_err.c | 22 +++++++++++++++++++ .../hisilicon/hns3/hns3pf/hclge_err.h | 2 ++ .../hisilicon/hns3/hns3pf/hclge_main.c | 21 +++++++++--------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 0e942d11dbf3..bad9fda19398 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -2152,6 +2152,28 @@ msi_error: kfree(desc); } +bool hclge_find_error_source(struct hclge_dev *hdev) +{ + u32 msix_src_flag, hw_err_src_flag; + + msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) & + HCLGE_VECTOR0_REG_MSIX_MASK; + + hw_err_src_flag = hclge_read_dev(&hdev->hw, + HCLGE_RAS_PF_OTHER_INT_STS_REG) & + HCLGE_RAS_REG_ERR_MASK; + + return msix_src_flag || hw_err_src_flag; +} + +void hclge_handle_occurred_error(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + + if (hclge_find_error_source(hdev)) + hclge_handle_error_info_log(ae_dev); +} + static void hclge_handle_error_type_reg_log(struct device *dev, struct hclge_mod_err_info *mod_info, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index ce4c96bbef8e..07987fb8332e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -215,6 +215,8 @@ int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en); int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state); int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en); void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev); +bool hclge_find_error_source(struct hclge_dev *hdev); +void hclge_handle_occurred_error(struct hclge_dev *hdev); pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev); int hclge_handle_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 9ff4210f6477..d960e08850ae 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4255,18 +4255,11 @@ static void hclge_handle_err_reset_request(struct hclge_dev *hdev) static void hclge_handle_err_recovery(struct hclge_dev *hdev) { - u32 mask_val = HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK; struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); - u32 msix_src_flag, hw_err_src_flag; - msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) & - HCLGE_VECTOR0_REG_MSIX_MASK; + ae_dev->hw_err_reset_req = 0; - hw_err_src_flag = hclge_read_dev(&hdev->hw, - HCLGE_RAS_PF_OTHER_INT_STS_REG) & - mask_val; - - if (msix_src_flag || hw_err_src_flag) { + if (hclge_find_error_source(hdev)) { hclge_handle_error_info_log(ae_dev); hclge_handle_mac_tnl(hdev); } @@ -11558,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_clear_resetting_state(hdev); /* Log and clear the hw errors those already occurred */ - hclge_handle_all_hns_hw_errors(ae_dev); + if 
(hnae3_dev_ras_imp_supported(hdev)) + hclge_handle_occurred_error(hdev); + else + hclge_handle_all_hns_hw_errors(ae_dev); /* request delayed reset for the error recovery because an immediate * global reset on a PF affecting pending initialization of other PFs @@ -11911,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) } /* Log and clear the hw errors those already occurred */ - hclge_handle_all_hns_hw_errors(ae_dev); + if (hnae3_dev_ras_imp_supported(hdev)) + hclge_handle_occurred_error(hdev); + else + hclge_handle_all_hns_hw_errors(ae_dev); /* Re-enable the hw error interrupts because * the interrupts get disabled on global reset.