Merge branch 'hns3-RAS'

Guangbin Huang says:

====================
net: hns3: add RAS compatibility adaptation solution

This patchset adds a RAS compatibility adaptation solution for new devices.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-06-08 14:43:31 -07:00
commit 1c7536528c
8 changed files with 546 additions and 56 deletions

View File

@ -91,6 +91,7 @@ enum HNAE3_DEV_CAP_BITS {
HNAE3_DEV_SUPPORT_STASH_B, HNAE3_DEV_SUPPORT_STASH_B,
HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B,
HNAE3_DEV_SUPPORT_PAUSE_B, HNAE3_DEV_SUPPORT_PAUSE_B,
HNAE3_DEV_SUPPORT_RAS_IMP_B,
HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B, HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B,
HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B,
@ -129,6 +130,9 @@ enum HNAE3_DEV_CAP_BITS {
#define hnae3_dev_phy_imp_supported(hdev) \ #define hnae3_dev_phy_imp_supported(hdev) \
test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps) test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps)
/* Non-zero when HNAE3_DEV_SUPPORT_RAS_IMP_B is set in the capability bitmap
 * of the ae_dev that owns @hdev.
 */
#define hnae3_dev_ras_imp_supported(hdev) \
test_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, (hdev)->ae_dev->caps)
#define hnae3_dev_tqp_txrx_indep_supported(hdev) \ #define hnae3_dev_tqp_txrx_indep_supported(hdev) \
test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps) test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps)

View File

@ -349,6 +349,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = {
}, { }, {
.name = "support imp-controlled PHY", .name = "support imp-controlled PHY",
.cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B, .cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B,
}, {
.name = "support imp-controlled RAS",
.cap_bit = HNAE3_DEV_SUPPORT_RAS_IMP_B,
}, { }, {
.name = "support rxd advanced layout", .name = "support rxd advanced layout",
.cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, .cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,

View File

@ -178,7 +178,8 @@ static bool hclge_is_special_opcode(u16 opcode)
HCLGE_QUERY_CLEAR_MPF_RAS_INT, HCLGE_QUERY_CLEAR_MPF_RAS_INT,
HCLGE_QUERY_CLEAR_PF_RAS_INT, HCLGE_QUERY_CLEAR_PF_RAS_INT,
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT}; HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
HCLGE_QUERY_ALL_ERR_INFO};
int i; int i;
for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) { for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) {
@ -386,6 +387,8 @@ static void hclge_parse_capability(struct hclge_dev *hdev,
set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps); set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps);
if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B)) if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B))
set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps); set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps);
if (hnae3_get_bit(caps, HCLGE_CAP_RAS_IMP_B))
set_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, ae_dev->caps);
if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B)) if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B))
set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps); set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps);
if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) { if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) {

View File

@ -293,6 +293,8 @@ enum hclge_opcode_type {
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513, HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515, HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
HCLGE_QUERY_ALL_ERR_BD_NUM = 0x1516,
HCLGE_QUERY_ALL_ERR_INFO = 0x1517,
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580, HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581, HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584, HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
@ -390,6 +392,7 @@ enum HCLGE_CAP_BITS {
HCLGE_CAP_HW_PAD_B, HCLGE_CAP_HW_PAD_B,
HCLGE_CAP_STASH_B, HCLGE_CAP_STASH_B,
HCLGE_CAP_UDP_TUNNEL_CSUM_B, HCLGE_CAP_UDP_TUNNEL_CSUM_B,
HCLGE_CAP_RAS_IMP_B = 12,
HCLGE_CAP_FEC_B = 13, HCLGE_CAP_FEC_B = 13,
HCLGE_CAP_PAUSE_B = 14, HCLGE_CAP_PAUSE_B = 14,
HCLGE_CAP_RXD_ADV_LAYOUT_B = 15, HCLGE_CAP_RXD_ADV_LAYOUT_B = 15,

View File

@ -631,6 +631,134 @@ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
{ /* sentinel */ } { /* sentinel */ }
}; };
/* Module id -> printable name, searched linearly by
 * hclge_handle_error_type_reg_log().
 *
 * Entry 0 must remain MODULE_NONE: the lookup starts from index MODULE_NONE
 * and treats an index that is still MODULE_NONE afterwards as "not found".
 */
static const struct hclge_hw_module_id hclge_hw_module_id_st[] = {
	/* NIC modules */
	{ .module_id = MODULE_NONE,		.msg = "MODULE_NONE" },
	{ .module_id = MODULE_BIOS_COMMON,	.msg = "MODULE_BIOS_COMMON" },
	{ .module_id = MODULE_GE,		.msg = "MODULE_GE" },
	{ .module_id = MODULE_IGU_EGU,		.msg = "MODULE_IGU_EGU" },
	{ .module_id = MODULE_LGE,		.msg = "MODULE_LGE" },
	{ .module_id = MODULE_NCSI,		.msg = "MODULE_NCSI" },
	{ .module_id = MODULE_PPP,		.msg = "MODULE_PPP" },
	{ .module_id = MODULE_QCN,		.msg = "MODULE_QCN" },
	{ .module_id = MODULE_RCB_RX,		.msg = "MODULE_RCB_RX" },
	{ .module_id = MODULE_RTC,		.msg = "MODULE_RTC" },
	{ .module_id = MODULE_SSU,		.msg = "MODULE_SSU" },
	{ .module_id = MODULE_TM,		.msg = "MODULE_TM" },
	{ .module_id = MODULE_RCB_TX,		.msg = "MODULE_RCB_TX" },
	{ .module_id = MODULE_TXDMA,		.msg = "MODULE_TXDMA" },
	{ .module_id = MODULE_MASTER,		.msg = "MODULE_MASTER" },
	/* RoCEE modules */
	{ .module_id = MODULE_ROCEE_TOP,	.msg = "MODULE_ROCEE_TOP" },
	{ .module_id = MODULE_ROCEE_TIMER,	.msg = "MODULE_ROCEE_TIMER" },
	{ .module_id = MODULE_ROCEE_MDB,	.msg = "MODULE_ROCEE_MDB" },
	{ .module_id = MODULE_ROCEE_TSP,	.msg = "MODULE_ROCEE_TSP" },
	{ .module_id = MODULE_ROCEE_TRP,	.msg = "MODULE_ROCEE_TRP" },
	{ .module_id = MODULE_ROCEE_SCC,	.msg = "MODULE_ROCEE_SCC" },
	{ .module_id = MODULE_ROCEE_CAEP,	.msg = "MODULE_ROCEE_CAEP" },
	{ .module_id = MODULE_ROCEE_GEN_AC,	.msg = "MODULE_ROCEE_GEN_AC" },
	{ .module_id = MODULE_ROCEE_QMM,	.msg = "MODULE_ROCEE_QMM" },
	{ .module_id = MODULE_ROCEE_LSAN,	.msg = "MODULE_ROCEE_LSAN" },
};
/* Error type id -> printable name, searched linearly by
 * hclge_handle_error_type_reg_log().
 *
 * Entry 0 must remain NONE_ERROR: the lookup starts from index NONE_ERROR
 * and treats an index that is still NONE_ERROR afterwards as "not found".
 */
static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
	/* NIC error types */
	{ .type_id = NONE_ERROR,		.msg = "none_error" },
	{ .type_id = FIFO_ERROR,		.msg = "fifo_error" },
	{ .type_id = MEMORY_ERROR,		.msg = "memory_error" },
	{ .type_id = POISON_ERROR,		.msg = "poison_error" },
	{ .type_id = MSIX_ECC_ERROR,		.msg = "msix_ecc_error" },
	{ .type_id = TQP_INT_ECC_ERROR,		.msg = "tqp_int_ecc_error" },
	{ .type_id = PF_ABNORMAL_INT_ERROR,	.msg = "pf_abnormal_int_error" },
	{ .type_id = MPF_ABNORMAL_INT_ERROR,	.msg = "mpf_abnormal_int_error" },
	{ .type_id = COMMON_ERROR,		.msg = "common_error" },
	{ .type_id = PORT_ERROR,		.msg = "port_error" },
	{ .type_id = ETS_ERROR,			.msg = "ets_error" },
	{ .type_id = NCSI_ERROR,		.msg = "ncsi_error" },
	{ .type_id = GLB_ERROR,			.msg = "glb_error" },
	/* RoCEE error types */
	{ .type_id = ROCEE_NORMAL_ERR,		.msg = "rocee_normal_error" },
	{ .type_id = ROCEE_OVF_ERR,		.msg = "rocee_ovf_error" },
};
static void hclge_log_error(struct device *dev, char *reg, static void hclge_log_error(struct device *dev, char *reg,
const struct hclge_hw_error *err, const struct hclge_hw_error *err,
u32 err_sts, unsigned long *reset_requests) u32 err_sts, unsigned long *reset_requests)
@ -1611,11 +1739,27 @@ static const struct hclge_hw_blk hw_blk[] = {
{ /* sentinel */ } { /* sentinel */ }
}; };
/* Set (@enable true) or clear (@enable false) the
 * HCLGE_VECTOR0_ALL_MSIX_ERR_B bit in HCLGE_PF_OTHER_INT_REG with a
 * read-modify-write; every other bit of the register is written back
 * unchanged.
 */
static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable)
{
	u32 val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);

	val = enable ? (val | BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B)) :
		       (val & ~BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B));

	hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, val);
}
int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state) int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state)
{ {
const struct hclge_hw_blk *module = hw_blk; const struct hclge_hw_blk *module = hw_blk;
int ret = 0; int ret = 0;
hclge_config_all_msix_error(hdev, state);
while (module->name) { while (module->name) {
if (module->config_err_int) { if (module->config_err_int) {
ret = module->config_err_int(hdev, state); ret = module->config_err_int(hdev, state);
@ -1876,11 +2020,8 @@ static int hclge_handle_pf_msix_error(struct hclge_dev *hdev,
static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests) unsigned long *reset_requests)
{ {
struct hclge_mac_tnl_stats mac_tnl_stats;
struct device *dev = &hdev->pdev->dev;
u32 mpf_bd_num, pf_bd_num, bd_num; u32 mpf_bd_num, pf_bd_num, bd_num;
struct hclge_desc *desc; struct hclge_desc *desc;
u32 status;
int ret; int ret;
/* query the number of bds for the MSIx int status */ /* query the number of bds for the MSIx int status */
@ -1903,29 +2044,7 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
if (ret) if (ret)
goto msi_error; goto msi_error;
/* query and clear mac tnl interruptions */ ret = hclge_handle_mac_tnl(hdev);
hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_QUERY_MAC_TNL_INT,
true);
ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
if (ret) {
dev_err(dev, "query mac tnl int cmd failed (%d)\n", ret);
goto msi_error;
}
status = le32_to_cpu(desc->data[0]);
if (status) {
/* When mac tnl interrupt occurs, we record current time and
* register status here in a fifo, then clear the status. So
* that if link status changes suddenly at some time, we can
* query them by debugfs.
*/
mac_tnl_stats.time = local_clock();
mac_tnl_stats.status = status;
kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
ret = hclge_clear_mac_tnl_int(hdev);
if (ret)
dev_err(dev, "clear mac tnl int failed (%d)\n", ret);
}
msi_error: msi_error:
kfree(desc); kfree(desc);
@ -1947,10 +2066,43 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
return hclge_handle_all_hw_msix_error(hdev, reset_requests); return hclge_handle_all_hw_msix_error(hdev, reset_requests);
} }
int hclge_handle_mac_tnl(struct hclge_dev *hdev)
{
struct hclge_mac_tnl_stats mac_tnl_stats;
struct device *dev = &hdev->pdev->dev;
struct hclge_desc desc;
u32 status;
int ret;
/* query and clear mac tnl interruptions */
hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_TNL_INT, true);
ret = hclge_cmd_send(&hdev->hw, &desc, 1);
if (ret) {
dev_err(dev, "failed to query mac tnl int, ret = %d.\n", ret);
return ret;
}
status = le32_to_cpu(desc.data[0]);
if (status) {
/* When mac tnl interrupt occurs, we record current time and
* register status here in a fifo, then clear the status. So
* that if link status changes suddenly at some time, we can
* query them by debugfs.
*/
mac_tnl_stats.time = local_clock();
mac_tnl_stats.status = status;
kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
ret = hclge_clear_mac_tnl_int(hdev);
if (ret)
dev_err(dev, "failed to clear mac tnl int, ret = %d.\n",
ret);
}
return ret;
}
void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
{ {
#define HCLGE_DESC_NO_DATA_LEN 8
struct hclge_dev *hdev = ae_dev->priv; struct hclge_dev *hdev = ae_dev->priv;
struct device *dev = &hdev->pdev->dev; struct device *dev = &hdev->pdev->dev;
u32 mpf_bd_num, pf_bd_num, bd_num; u32 mpf_bd_num, pf_bd_num, bd_num;
@ -1999,3 +2151,205 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
msi_error: msi_error:
kfree(desc); kfree(desc);
} }
bool hclge_find_error_source(struct hclge_dev *hdev)
{
u32 msix_src_flag, hw_err_src_flag;
msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
HCLGE_VECTOR0_REG_MSIX_MASK;
hw_err_src_flag = hclge_read_dev(&hdev->hw,
HCLGE_RAS_PF_OTHER_INT_STS_REG) &
HCLGE_RAS_REG_ERR_MASK;
return msix_src_flag || hw_err_src_flag;
}
/* If hclge_find_error_source() reports a pending error, fetch and log the
 * error report via hclge_handle_error_info_log(); the result of that call is
 * not propagated.
 */
void hclge_handle_occurred_error(struct hclge_dev *hdev)
{
	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);

	if (!hclge_find_error_source(hdev))
		return;

	hclge_handle_error_info_log(ae_dev);
}
/* Log one error record: the module and error type names (or the raw ids when
 * either is unknown), whether it is a RAS or MSI-X error, and every register
 * value carried by the record.
 *
 * @dev: device used for the log messages
 * @mod_info: module header the record belongs to (supplies mod_id)
 * @type_reg_info: the error record; the low 7 bits of type_id select the
 *                 error type and bit 7 flags a RAS error, reg_num is the
 *                 number of entries of hclge_reg[] that are printed
 */
static void
hclge_handle_error_type_reg_log(struct device *dev,
				struct hclge_mod_err_info *mod_info,
				struct hclge_type_reg_err_info *type_reg_info)
{
#define HCLGE_ERR_TYPE_MASK 0x7F
#define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7

	const u8 module_cnt = ARRAY_SIZE(hclge_hw_module_id_st);
	const u8 type_cnt = ARRAY_SIZE(hclge_hw_type_id_st);
	const u8 mod_id = mod_info->mod_id;
	const u8 type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
	const u8 is_ras = type_reg_info->type_id >>
			  HCLGE_ERR_TYPE_IS_RAS_OFFSET;
	u8 mod_idx = MODULE_NONE;
	u8 type_idx = NONE_ERROR;
	u8 k;

	/* Table index 0 is the "none" entry, so 0 doubles as "not found" */
	for (k = 0; k < module_cnt; k++) {
		if (hclge_hw_module_id_st[k].module_id != mod_id)
			continue;
		mod_idx = k;
		break;
	}

	for (k = 0; k < type_cnt; k++) {
		if (hclge_hw_type_id_st[k].type_id != type_id)
			continue;
		type_idx = k;
		break;
	}

	if (mod_idx == MODULE_NONE || type_idx == NONE_ERROR)
		dev_err(dev,
			"unknown module[%u] or type[%u].\n", mod_id, type_id);
	else
		dev_err(dev,
			"found %s %s, is %s error.\n",
			hclge_hw_module_id_st[mod_idx].msg,
			hclge_hw_type_id_st[type_idx].msg,
			is_ras ? "ras" : "msix");

	dev_err(dev, "reg_value:\n");
	for (k = 0; k < type_reg_info->reg_num; k++)
		dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[k]);
}
/* Walk a decoded error report, record the requested reset type and log every
 * error record it contains.
 *
 * @ae_dev: device whose hw_err_reset_req receives the reported reset type
 * @buf: the report as CPU-endian 32-bit words
 * @buf_size: number of words in @buf
 *
 * Layout consumed here: one summary word (struct hclge_sum_err_info), then
 * mod_num module records. Each module record is one header word
 * (struct hclge_mod_err_info) followed by err_num error records, and each
 * error record is one header word (struct hclge_type_reg_err_info) followed
 * by reg_num register words.
 *
 * Parsing stops with an error message as soon as a record header would start
 * beyond @buf, or an error record claims more register words than remain in
 * @buf. The latter check must happen before the record is logged, because
 * the logging helper reads reg_num words following the header.
 */
static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
					  const u32 *buf, u32 buf_size)
{
	struct hclge_type_reg_err_info *type_reg_info;
	struct hclge_dev *hdev = ae_dev->priv;
	struct device *dev = &hdev->pdev->dev;
	struct hclge_mod_err_info *mod_info;
	struct hclge_sum_err_info *sum_info;
	u8 mod_num, err_num, i;
	u32 offset = 0;

	sum_info = (struct hclge_sum_err_info *)&buf[offset++];
	if (sum_info->reset_type &&
	    sum_info->reset_type != HNAE3_NONE_RESET)
		set_bit(sum_info->reset_type, &ae_dev->hw_err_reset_req);
	mod_num = sum_info->mod_num;

	while (mod_num--) {
		if (offset >= buf_size) {
			dev_err(dev, "The offset(%u) exceeds buf's size(%u).\n",
				offset, buf_size);
			return;
		}
		mod_info = (struct hclge_mod_err_info *)&buf[offset++];
		err_num = mod_info->err_num;

		for (i = 0; i < err_num; i++) {
			if (offset >= buf_size) {
				dev_err(dev,
					"The offset(%u) exceeds buf size(%u).\n",
					offset, buf_size);
				return;
			}

			type_reg_info = (struct hclge_type_reg_err_info *)
					    &buf[offset++];

			/* offset <= buf_size here, so the subtraction cannot
			 * wrap; reject records whose register words would run
			 * past the end of the buffer.
			 */
			if (type_reg_info->reg_num > buf_size - offset) {
				dev_err(dev,
					"The reg_num(%u) exceeds remaining buf size(%u).\n",
					type_reg_info->reg_num,
					buf_size - offset);
				return;
			}

			hclge_handle_error_type_reg_log(dev, mod_info,
							type_reg_info);

			offset += type_reg_info->reg_num;
		}
	}
}
/* Send HCLGE_QUERY_ALL_ERR_BD_NUM and store the first data word of the reply
 * in @bd_num.
 *
 * Return: 0 on success, the command error if the query fails, or -EINVAL if
 * the reported count is zero (@bd_num has been written with 0 in that case).
 */
static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
{
	struct device *dev = &hdev->pdev->dev;
	struct hclge_desc query;
	int err;

	hclge_cmd_setup_basic_desc(&query, HCLGE_QUERY_ALL_ERR_BD_NUM, true);
	err = hclge_cmd_send(&hdev->hw, &query, 1);
	if (err) {
		dev_err(dev, "failed to query error bd_num, ret = %d.\n", err);
		return err;
	}

	*bd_num = le32_to_cpu(query.data[0]);
	if (*bd_num)
		return 0;

	dev_err(dev, "The value of bd_num is 0!\n");
	return -EINVAL;
}
/* Send HCLGE_QUERY_ALL_ERR_INFO using @bd_num descriptors starting at @desc;
 * the reply is left in @desc.
 *
 * Return: 0 on success, otherwise the command error (also logged).
 */
static int hclge_query_all_err_info(struct hclge_dev *hdev,
				    struct hclge_desc *desc, u32 bd_num)
{
	int err;

	hclge_cmd_setup_basic_desc(desc, HCLGE_QUERY_ALL_ERR_INFO, true);
	err = hclge_cmd_send(&hdev->hw, desc, bd_num);
	if (!err)
		return 0;

	dev_err(&hdev->pdev->dev, "failed to query error info, ret = %d.\n",
		err);
	return err;
}
int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev)
{
u32 bd_num, desc_len, buf_len, buf_size, i;
struct hclge_dev *hdev = ae_dev->priv;
struct hclge_desc *desc;
__le32 *desc_data;
u32 *buf;
int ret;
ret = hclge_query_all_err_bd_num(hdev, &bd_num);
if (ret)
goto out;
desc_len = bd_num * sizeof(struct hclge_desc);
desc = kzalloc(desc_len, GFP_KERNEL);
if (!desc) {
ret = -ENOMEM;
goto out;
}
ret = hclge_query_all_err_info(hdev, desc, bd_num);
if (ret)
goto err_desc;
buf_len = bd_num * sizeof(struct hclge_desc) - HCLGE_DESC_NO_DATA_LEN;
buf_size = buf_len / sizeof(u32);
desc_data = kzalloc(buf_len, GFP_KERNEL);
if (!desc_data)
return -ENOMEM;
buf = kzalloc(buf_len, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
goto err_buf_alloc;
}
memcpy(desc_data, &desc[0].data[0], buf_len);
for (i = 0; i < buf_size; i++)
buf[i] = le32_to_cpu(desc_data[i]);
hclge_handle_error_module_log(ae_dev, buf, buf_size);
kfree(buf);
err_buf_alloc:
kfree(desc_data);
err_desc:
kfree(desc);
out:
return ret;
}

View File

@ -15,6 +15,8 @@
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
#define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 #define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
/* Union of the NFE and RoCEE error bits; applied to the value read from
 * HCLGE_RAS_PF_OTHER_INT_STS_REG to decide whether a hardware error is
 * pending.
 */
#define HCLGE_RAS_REG_ERR_MASK \
(HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK)
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00 #define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
@ -107,6 +109,10 @@
#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000 #define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F #define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F
/* Sizing constants for error-report handling.
 * HCLGE_REG_NUM_MAX sizes hclge_reg[] in struct hclge_type_reg_err_info.
 * HCLGE_DESC_NO_DATA_LEN is the byte count subtracted from the total
 * descriptor-array size when the report payload is copied out
 * (presumably the non-data header of the first descriptor; confirm against
 * struct hclge_desc).
 * NOTE(review): no user of HCLGE_DESC_DATA_MAX is visible in this view.
 */
#define HCLGE_DESC_DATA_MAX 8
#define HCLGE_REG_NUM_MAX 256
#define HCLGE_DESC_NO_DATA_LEN 8
enum hclge_err_int_type { enum hclge_err_int_type {
HCLGE_ERR_INT_MSIX = 0, HCLGE_ERR_INT_MSIX = 0,
HCLGE_ERR_INT_RAS_CE = 1, HCLGE_ERR_INT_RAS_CE = 1,
@ -114,6 +120,56 @@ enum hclge_err_int_type {
HCLGE_ERR_INT_RAS_FE = 3, HCLGE_ERR_INT_RAS_FE = 3,
}; };
/* Hardware module identifiers. The values are spelled out explicitly because
 * they are compared against the mod_id byte read from an error report.
 * NIC modules start at 0 and RoCEE modules start at 40.
 */
enum hclge_mod_name_list {
	MODULE_NONE		= 0,
	MODULE_BIOS_COMMON	= 1,
	MODULE_GE		= 2,
	MODULE_IGU_EGU		= 3,
	MODULE_LGE		= 4,
	MODULE_NCSI		= 5,
	MODULE_PPP		= 6,
	MODULE_QCN		= 7,
	MODULE_RCB_RX		= 8,
	MODULE_RTC		= 9,
	MODULE_SSU		= 10,
	MODULE_TM		= 11,
	MODULE_RCB_TX		= 12,
	MODULE_TXDMA		= 13,
	MODULE_MASTER		= 14,
	/* append new NIC module ids above this line, in order */

	MODULE_ROCEE_TOP	= 40,
	MODULE_ROCEE_TIMER	= 41,
	MODULE_ROCEE_MDB	= 42,
	MODULE_ROCEE_TSP	= 43,
	MODULE_ROCEE_TRP	= 44,
	MODULE_ROCEE_SCC	= 45,
	MODULE_ROCEE_CAEP	= 46,
	MODULE_ROCEE_GEN_AC	= 47,
	MODULE_ROCEE_QMM	= 48,
	MODULE_ROCEE_LSAN	= 49,
	/* append new RoCEE module ids above this line, in order */
};
/* Hardware error type identifiers. The values are spelled out explicitly
 * because they are compared against the low 7 bits of the type_id byte read
 * from an error report. NIC types start at 0 and RoCEE types start at 40.
 */
enum hclge_err_type_list {
	NONE_ERROR		= 0,
	FIFO_ERROR		= 1,
	MEMORY_ERROR		= 2,
	POISON_ERROR		= 3,
	MSIX_ECC_ERROR		= 4,
	TQP_INT_ECC_ERROR	= 5,
	PF_ABNORMAL_INT_ERROR	= 6,
	MPF_ABNORMAL_INT_ERROR	= 7,
	COMMON_ERROR		= 8,
	PORT_ERROR		= 9,
	ETS_ERROR		= 10,
	NCSI_ERROR		= 11,
	GLB_ERROR		= 12,
	/* append new NIC error types above this line, in order */

	ROCEE_NORMAL_ERR	= 40,
	ROCEE_OVF_ERR		= 41,
	/* append new RoCEE error types above this line, in order */
};
struct hclge_hw_blk { struct hclge_hw_blk {
u32 msk; u32 msk;
const char *name; const char *name;
@ -126,11 +182,44 @@ struct hclge_hw_error {
enum hnae3_reset_type reset_level; enum hnae3_reset_type reset_level;
}; };
/* One row of the module-name lookup table: a module id and the text logged
 * for it.
 */
struct hclge_hw_module_id {
enum hclge_mod_name_list module_id;
const char *msg;
};
/* One row of the error-type lookup table: an error type id and the text
 * logged for it.
 */
struct hclge_hw_type_id {
enum hclge_err_type_list type_id;
const char *msg;
};
/* Summary header of an error report; the parser overlays it on the first
 * 32-bit word of the report buffer, so the field layout must not change.
 */
struct hclge_sum_err_info {
/* when non-zero and not HNAE3_NONE_RESET, used as the bit number set in
 * the device's hw_err_reset_req
 */
u8 reset_type;
/* number of module records the parser walks after this header */
u8 mod_num;
u8 rsv[2];
};
/* Per-module header inside an error report; the parser overlays it on one
 * 32-bit word of the report buffer, so the field layout must not change.
 */
struct hclge_mod_err_info {
/* looked up in hclge_hw_module_id_st to obtain the module name */
u8 mod_id;
/* number of error records the parser walks after this header */
u8 err_num;
u8 rsv[2];
};
/* One error record inside an error report. The parser overlays the header
 * fields on one 32-bit word of the report buffer and treats the words that
 * follow as hclge_reg[], so the field layout must not change.
 */
struct hclge_type_reg_err_info {
/* low 7 bits: error type id; bit 7: set for a RAS error, clear for MSI-X */
u8 type_id;
/* number of hclge_reg[] words that are logged and then skipped */
u8 reg_num;
u8 rsv[2];
/* register values; only the first reg_num entries are read */
u32 hclge_reg[HCLGE_REG_NUM_MAX];
};
int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en); int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state); int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en); int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev); void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
bool hclge_find_error_source(struct hclge_dev *hdev);
void hclge_handle_occurred_error(struct hclge_dev *hdev);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev); pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
int hclge_handle_hw_msix_error(struct hclge_dev *hdev, int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests); unsigned long *reset_requests);
int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
int hclge_handle_mac_tnl(struct hclge_dev *hdev);
#endif #endif

View File

@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
{ {
u32 cmdq_src_reg, msix_src_reg; u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
/* fetch the events from their corresponding regs */ /* fetch the events from their corresponding regs */
cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG); cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
hw_err_src_reg = hclge_read_dev(&hdev->hw,
HCLGE_RAS_PF_OTHER_INT_STS_REG);
/* Assumption: If by any chance reset and mailbox events are reported /* Assumption: If by any chance reset and mailbox events are reported
* together then we will only process reset event in this go and will * together then we will only process reset event in this go and will
@ -3339,11 +3341,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
return HCLGE_VECTOR0_EVENT_RST; return HCLGE_VECTOR0_EVENT_RST;
} }
/* check for vector0 msix event source */ /* check for vector0 msix event and hardware error event source */
if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK ||
*clearval = msix_src_reg; hw_err_src_reg & HCLGE_RAS_REG_ERR_MASK)
return HCLGE_VECTOR0_EVENT_ERR; return HCLGE_VECTOR0_EVENT_ERR;
}
/* check for vector0 mailbox(=CMDQ RX) event source */ /* check for vector0 mailbox(=CMDQ RX) event source */
if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) { if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
@ -3354,9 +3355,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
/* print other vector0 event source */ /* print other vector0 event source */
dev_info(&hdev->pdev->dev, dev_info(&hdev->pdev->dev,
"CMDQ INT status:0x%x, other INT status:0x%x\n", "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n",
cmdq_src_reg, msix_src_reg); cmdq_src_reg, hw_err_src_reg, msix_src_reg);
*clearval = msix_src_reg;
return HCLGE_VECTOR0_EVENT_OTHER; return HCLGE_VECTOR0_EVENT_OTHER;
} }
@ -3427,15 +3427,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
hclge_clear_event_cause(hdev, event_cause, clearval); hclge_clear_event_cause(hdev, event_cause, clearval);
/* Enable interrupt if it is not cause by reset. And when /* Enable interrupt if it is not caused by reset event or error event */
* clearval equal to 0, it means interrupt status may be if (event_cause == HCLGE_VECTOR0_EVENT_MBX ||
* cleared by hardware before driver reads status register. event_cause == HCLGE_VECTOR0_EVENT_OTHER)
* For this case, vector0 interrupt also should be enabled.
*/
if (!clearval ||
event_cause == HCLGE_VECTOR0_EVENT_MBX) {
hclge_enable_vector(&hdev->misc_vector, true); hclge_enable_vector(&hdev->misc_vector, true);
}
return IRQ_HANDLED; return IRQ_HANDLED;
} }
@ -4240,6 +4235,38 @@ static void hclge_reset_subtask(struct hclge_dev *hdev)
hdev->reset_type = HNAE3_NONE_RESET; hdev->reset_type = HNAE3_NONE_RESET;
} }
static void hclge_handle_err_reset_request(struct hclge_dev *hdev)
{
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
enum hnae3_reset_type reset_type;
if (ae_dev->hw_err_reset_req) {
reset_type = hclge_get_reset_level(ae_dev,
&ae_dev->hw_err_reset_req);
hclge_set_def_reset_request(ae_dev, reset_type);
}
if (hdev->default_reset_request && ae_dev->ops->reset_event)
ae_dev->ops->reset_event(hdev->pdev, NULL);
/* enable interrupt after error handling complete */
hclge_enable_vector(&hdev->misc_vector, true);
}
static void hclge_handle_err_recovery(struct hclge_dev *hdev)
{
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
ae_dev->hw_err_reset_req = 0;
if (hclge_find_error_source(hdev)) {
hclge_handle_error_info_log(ae_dev);
hclge_handle_mac_tnl(hdev);
}
hclge_handle_err_reset_request(hdev);
}
static void hclge_misc_err_recovery(struct hclge_dev *hdev) static void hclge_misc_err_recovery(struct hclge_dev *hdev)
{ {
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
@ -4247,19 +4274,16 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev)
u32 msix_sts_reg; u32 msix_sts_reg;
msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
if (hclge_handle_hw_msix_error(hdev, if (hclge_handle_hw_msix_error
&hdev->default_reset_request)) (hdev, &hdev->default_reset_request))
dev_info(dev, "received msix interrupt 0x%x\n", dev_info(dev, "received msix interrupt 0x%x\n",
msix_sts_reg); msix_sts_reg);
if (hdev->default_reset_request)
if (ae_dev->ops->reset_event)
ae_dev->ops->reset_event(hdev->pdev, NULL);
} }
hclge_enable_vector(&hdev->misc_vector, true); hclge_handle_hw_ras_error(ae_dev);
hclge_handle_err_reset_request(hdev);
} }
static void hclge_errhand_service_task(struct hclge_dev *hdev) static void hclge_errhand_service_task(struct hclge_dev *hdev)
@ -4267,7 +4291,10 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev)
if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state)) if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
return; return;
hclge_misc_err_recovery(hdev); if (hnae3_dev_ras_imp_supported(hdev))
hclge_handle_err_recovery(hdev);
else
hclge_misc_err_recovery(hdev);
} }
static void hclge_reset_service_task(struct hclge_dev *hdev) static void hclge_reset_service_task(struct hclge_dev *hdev)
@ -11524,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_clear_resetting_state(hdev); hclge_clear_resetting_state(hdev);
/* Log and clear the hw errors those already occurred */ /* Log and clear the hw errors those already occurred */
hclge_handle_all_hns_hw_errors(ae_dev); if (hnae3_dev_ras_imp_supported(hdev))
hclge_handle_occurred_error(hdev);
else
hclge_handle_all_hns_hw_errors(ae_dev);
/* request delayed reset for the error recovery because an immediate /* request delayed reset for the error recovery because an immediate
* global reset on a PF affecting pending initialization of other PFs * global reset on a PF affecting pending initialization of other PFs
@ -11877,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
} }
/* Log and clear the hw errors those already occurred */ /* Log and clear the hw errors those already occurred */
hclge_handle_all_hns_hw_errors(ae_dev); if (hnae3_dev_ras_imp_supported(hdev))
hclge_handle_occurred_error(hdev);
else
hclge_handle_all_hns_hw_errors(ae_dev);
/* Re-enable the hw error interrupts because /* Re-enable the hw error interrupts because
* the interrupts get disabled on global reset. * the interrupts get disabled on global reset.

View File

@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE {
#define HCLGE_VECTOR0_IMP_RESET_INT_B 1 #define HCLGE_VECTOR0_IMP_RESET_INT_B 1
#define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 4U #define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 4U
#define HCLGE_VECTOR0_IMP_RD_POISON_B 5U #define HCLGE_VECTOR0_IMP_RD_POISON_B 5U
#define HCLGE_VECTOR0_ALL_MSIX_ERR_B 6U
#define HCLGE_MAC_DEFAULT_FRAME \ #define HCLGE_MAC_DEFAULT_FRAME \
(ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN) (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)