net: hns3: add handling of hw errors reported through MSIX
This patch adds handling for HNS3 hardware errors (non-standard) which are reported through MSIX interrupts rather than through the PCIe AER channel. These MSIX-reported hardware errors are handled using the common misc interrupt handler. The hardware error related registers cannot be cleared in the context of the received interrupt, as clearing them requires *heavy* access to hardware using IMP (Integrated Management Processor) commands. Hence, we defer the clearing of such error events until a later time. Since the exact identification of the errors is deferred, the level of recovery/reset which might be required must be deferred as well. Hence, a new reset type, UNKNOWN reset, has been introduced, which effectively defers the assertion of the reset until we get hold of the kind of errors at a later time.

Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit f6162d4412
parent 8bb147927c
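Before the diff itself, a minimal userspace C sketch of the deferral pattern described above may help. Everything in it is an illustrative stand-in (irq_top_half, reset_task, and the bit helpers are hypothetical names); in the driver, the corresponding pieces are the misc vector 0 interrupt handler, the reset service task, and the IMP query/clear commands:

#include <stdio.h>

/* Hypothetical reset levels, loosely mirroring enum hnae3_reset_type */
enum reset_type { RESET_FUNC, RESET_GLOBAL, RESET_UNKNOWN, RESET_NONE };

static unsigned long reset_request;     /* bitmap of pending reset requests */

static void set_reset(enum reset_type t)   { reset_request |= 1UL << t; }
static int  test_reset(enum reset_type t)  { return !!(reset_request & (1UL << t)); }
static void clear_reset(enum reset_type t) { reset_request &= ~(1UL << t); }

/* "Top half": runs in interrupt context, so it must not issue the slow
 * IMP commands. It only records that *some* reset is needed and defers.
 */
static void irq_top_half(void)
{
        set_reset(RESET_UNKNOWN);       /* exact level is unknown here */
        /* in the driver: hclge_reset_task_schedule(hdev) */
}

/* "Bottom half": runs in task context, where the slow query/clear commands
 * are allowed. Decoding the errors resolves UNKNOWN into a concrete level.
 */
static void reset_task(void)
{
        if (test_reset(RESET_UNKNOWN)) {
                /* in the driver: hclge_handle_hw_msix_error(hdev, addr) */
                set_reset(RESET_FUNC);  /* pretend the decode chose FUNC */
                clear_reset(RESET_UNKNOWN);
        }
        if (test_reset(RESET_FUNC))
                printf("asserting function-level reset\n");
}

int main(void)
{
        irq_top_half();   /* hardware raises the MSIX error interrupt */
        reset_task();     /* deferred work decodes errors, asserts the reset */
        return 0;
}

The point of the UNKNOWN bit is that the top half can record "some reset is needed" without touching the hardware; the expensive decode happens later in task context.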
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -136,6 +136,7 @@ enum hnae3_reset_type {
 	HNAE3_CORE_RESET,
 	HNAE3_GLOBAL_RESET,
 	HNAE3_IMP_RESET,
+	HNAE3_UNKNOWN_RESET,
 	HNAE3_NONE_RESET,
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -220,6 +220,9 @@ enum hclge_opcode_type {
 	HCLGE_QUERY_RAS_INT_STS_BD_NUM = 0x1510,
 	HCLGE_QUERY_CLEAR_MPF_RAS_INT = 0x1511,
 	HCLGE_QUERY_CLEAR_PF_RAS_INT = 0x1512,
+	HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
+	HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
+	HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
 	HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
 	HCLGE_IGU_COMMON_INT_EN = 0x1806,
 	HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -727,3 +727,95 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
 
 	return PCI_ERS_RESULT_RECOVERED;
 }
+
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+			       unsigned long *reset_requests)
+{
+	struct device *dev = &hdev->pdev->dev;
+	u32 mpf_bd_num, pf_bd_num, bd_num;
+	struct hclge_desc desc_bd;
+	struct hclge_desc *desc;
+	int ret = 0;
+
+	/* set default handling */
+	set_bit(HNAE3_FUNC_RESET, reset_requests);
+
+	/* query the number of bds for the MSIx int status */
+	hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_MSIX_INT_STS_BD_NUM,
+				   true);
+	ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1);
+	if (ret) {
+		dev_err(dev, "fail(%d) to query msix int status bd num\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		return ret;
+	}
+
+	mpf_bd_num = le32_to_cpu(desc_bd.data[0]);
+	pf_bd_num = le32_to_cpu(desc_bd.data[1]);
+	bd_num = max_t(u32, mpf_bd_num, pf_bd_num);
+
+	desc = kcalloc(bd_num, sizeof(struct hclge_desc), GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* query all main PF MSIx errors */
+	hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
+				   true);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+	if (ret) {
+		dev_err(dev, "query all mpf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* clear all main PF MSIx errors */
+	hclge_cmd_reuse_desc(&desc[0], false);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+	if (ret) {
+		dev_err(dev, "clear all mpf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* query all PF MSIx errors */
+	memset(desc, 0, bd_num * sizeof(struct hclge_desc));
+	hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
+				   true);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+	if (ret) {
+		dev_err(dev, "query all pf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* clear all PF MSIx errors */
+	hclge_cmd_reuse_desc(&desc[0], false);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+	if (ret) {
+		dev_err(dev, "clear all pf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+	}
+
+msi_error:
+	kfree(desc);
+out:
+	return ret;
+}
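The function added above follows a fixed sequence for each of the two error banks: query the buffer-descriptor (BD) count once, size a single descriptor array to the larger bank, then issue a query command followed by a clear command per bank. Below is a compressed userspace sketch of that sequence, with hypothetical cmd_send and struct desc stand-ins for hclge_cmd_send and struct hclge_desc, and with the separate query and clear sends folded into one call per bank for brevity:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for struct hclge_desc and hclge_cmd_send() */
struct desc { unsigned int opcode; unsigned int data[6]; };

static int cmd_send(struct desc *d, unsigned int num)
{
        (void)num;      /* the stub ignores the BD count */
        /* pretend firmware reports 2 main-PF BDs and 1 per-PF BD */
        if (d->opcode == 0x1513) {      /* QUERY_MSIX_INT_STS_BD_NUM */
                d->data[0] = 2;
                d->data[1] = 1;
        }
        return 0;       /* success */
}

static int handle_msix_error(void)
{
        struct desc bd = { .opcode = 0x1513 };
        struct desc *desc;
        unsigned int mpf, pf, num;
        int ret;

        ret = cmd_send(&bd, 1);         /* 1. how many BDs per bank? */
        if (ret)
                return ret;
        mpf = bd.data[0];
        pf = bd.data[1];
        num = mpf > pf ? mpf : pf;      /* one buffer serves both banks */

        desc = calloc(num, sizeof(*desc));
        if (!desc)
                return -1;

        desc[0].opcode = 0x1514;        /* 2. query+clear main-PF bank */
        ret = cmd_send(desc, mpf);
        if (!ret) {
                memset(desc, 0, num * sizeof(*desc));
                desc[0].opcode = 0x1515;        /* 3. query+clear per-PF bank */
                ret = cmd_send(desc, pf);
        }
        free(desc);
        return ret;
}

int main(void)
{
        printf("handle_msix_error() = %d\n", handle_msix_error());
        return 0;
}

Sizing the array with max_t(u32, mpf_bd_num, pf_bd_num) lets one allocation serve both banks, which is why the real function memsets the array between the main-PF and per-PF rounds.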
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -9,6 +9,9 @@
 #define HCLGE_RAS_PF_OTHER_INT_STS_REG	0x20B00
 #define HCLGE_RAS_REG_NFE_MASK	0xFF00
 
+#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG	0x20800
+#define HCLGE_VECTOR0_REG_MSIX_MASK	0x1FF00
+
 #define HCLGE_IMP_TCM_ECC_ERR_INT_EN	0xFFFF0000
 #define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK	0xFFFF0000
 #define HCLGE_IMP_ITCM4_ECC_ERR_INT_EN	0x300
@@ -69,4 +72,6 @@ struct hclge_hw_error {
 
 int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
 pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+			       unsigned long *reset_requests);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2200,12 +2200,13 @@ static void hclge_service_complete(struct hclge_dev *hdev)
 
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
-	u32 rst_src_reg;
-	u32 cmdq_src_reg;
+	u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
 
 	/* fetch the events from their corresponding regs */
 	rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
 	cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
+	msix_src_reg = hclge_read_dev(&hdev->hw,
+				      HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
 
 	/* Assumption: If by any chance reset and mailbox events are reported
 	 * together then we will only process reset event in this go and will
@@ -2239,6 +2240,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 		return HCLGE_VECTOR0_EVENT_RST;
 	}
 
+	/* check for vector0 msix event source */
+	if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK)
+		return HCLGE_VECTOR0_EVENT_ERR;
+
 	/* check for vector0 mailbox(=CMDQ RX) event source */
 	if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
 		cmdq_src_reg &= ~BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B);
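The two hunks above give hclge_check_event_cause() a strict priority order: reset events are reported first, then MSIX error events, then mailbox events. A small self-contained C sketch of that demultiplexing (the reset mask and CMDQ bit are placeholders; only the MSIX mask value is taken from this patch):

#include <stdio.h>

#define RST_MASK     0x7U       /* stand-in for the reset-source bits */
#define MSIX_MASK    0x1FF00U   /* HCLGE_VECTOR0_REG_MSIX_MASK from this patch */
#define CMDQ_RX_BIT  (1U << 1)  /* stand-in for HCLGE_VECTOR0_RX_CMDQ_INT_B */

/* same ordering as the driver's enum hclge_evt_cause */
enum evt { EVT_RST, EVT_MBX, EVT_ERR, EVT_OTHER };

/* reset beats error beats mailbox, as in hclge_check_event_cause() */
static enum evt check_event_cause(unsigned int rst, unsigned int msix,
                                  unsigned int cmdq)
{
        if (rst & RST_MASK)
                return EVT_RST;
        if (msix & MSIX_MASK)
                return EVT_ERR;
        if (cmdq & CMDQ_RX_BIT)
                return EVT_MBX;
        return EVT_OTHER;
}

int main(void)
{
        /* an MSIX error bit set, no reset bit: classified EVT_ERR (prints 2) */
        printf("%d\n", check_event_cause(0, 0x100, 0));
        return 0;
}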
@@ -2289,6 +2294,19 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 
 	/* vector 0 interrupt is shared with reset and mailbox source events.*/
 	switch (event_cause) {
+	case HCLGE_VECTOR0_EVENT_ERR:
+		/* we do not know what type of reset is required now. This could
+		 * only be decided after we fetch the type of errors which
+		 * caused this event. Therefore, we will do the below for now:
+		 * 1. Assert the HNAE3_UNKNOWN_RESET type of reset. This means
+		 *    the type of reset to be used is deferred.
+		 * 2. Schedule the reset service task.
+		 * 3. When the service task receives the HNAE3_UNKNOWN_RESET
+		 *    type it will fetch the correct type of reset. This would
+		 *    be done by first decoding the types of errors.
+		 */
+		set_bit(HNAE3_UNKNOWN_RESET, &hdev->reset_request);
+		/* fall through */
 	case HCLGE_VECTOR0_EVENT_RST:
 		hclge_reset_task_schedule(hdev);
 		break;
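The new ERR case relies on C's case fall-through so that an error event does everything a reset event does, plus recording the deferred UNKNOWN request first; the /* fall through */ comment documents that the missing break is intentional. A trivial standalone illustration of the idiom:

#include <stdio.h>

enum evt { EVT_ERR, EVT_RST, EVT_OTHER };

static void handle(enum evt e)
{
        switch (e) {
        case EVT_ERR:
                printf("record deferred (UNKNOWN) reset request\n");
                /* fall through: an error event also needs the reset task */
        case EVT_RST:
                printf("schedule reset service task\n");
                break;
        default:
                break;
        }
}

int main(void)
{
        handle(EVT_ERR);  /* prints both lines */
        handle(EVT_RST);  /* prints only the schedule line */
        return 0;
}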
@@ -2593,6 +2611,23 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
 {
 	enum hnae3_reset_type rst_level = HNAE3_NONE_RESET;
 
+	/* first, resolve any unknown reset type to the known type(s) */
+	if (test_bit(HNAE3_UNKNOWN_RESET, addr)) {
+		/* we will intentionally ignore any errors from this function
+		 * as we will end up in *some* reset request in any case
+		 */
+		hclge_handle_hw_msix_error(hdev, addr);
+		clear_bit(HNAE3_UNKNOWN_RESET, addr);
+		/* We deferred the clearing of the error event which caused
+		 * the interrupt since it was not possible to do that in
+		 * interrupt context (and this is the reason we introduced the
+		 * new UNKNOWN reset type). Now that the errors have been
+		 * handled and cleared in hardware, we can safely re-enable
+		 * interrupts. This is an exception to the norm.
+		 */
+		hclge_enable_vector(&hdev->misc_vector, true);
+	}
+
 	/* return the highest priority reset level amongst all */
 	if (test_bit(HNAE3_IMP_RESET, addr)) {
 		rst_level = HNAE3_IMP_RESET;
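After the UNKNOWN bit is resolved, hclge_get_reset_level() continues by picking the most severe pending level, as the trailing context shows for HNAE3_IMP_RESET. A hedged sketch of that highest-priority-wins scan (the level names and the severity ordering follow this patch's enum; the loop itself is an illustration, not the driver's exact code):

#include <stdio.h>

enum reset_type { IMP_RESET, GLOBAL_RESET, CORE_RESET, FUNC_RESET, NONE_RESET };

/* scan pending-reset bits from most to least severe; first hit wins */
static enum reset_type get_reset_level(unsigned long pending)
{
        static const enum reset_type order[] = {
                IMP_RESET, GLOBAL_RESET, CORE_RESET, FUNC_RESET,
        };

        for (unsigned int i = 0; i < sizeof(order) / sizeof(order[0]); i++)
                if (pending & (1UL << order[i]))
                        return order[i];
        return NONE_RESET;
}

int main(void)
{
        unsigned long pending = (1UL << FUNC_RESET) | (1UL << GLOBAL_RESET);

        /* GLOBAL outranks FUNC, so the resolved level is GLOBAL (prints 1) */
        printf("%d\n", get_reset_level(pending));
        return 0;
}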
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -205,6 +205,7 @@ enum HCLGE_DEV_STATE {
 
 enum hclge_evt_cause {
 	HCLGE_VECTOR0_EVENT_RST,
 	HCLGE_VECTOR0_EVENT_MBX,
+	HCLGE_VECTOR0_EVENT_ERR,
 	HCLGE_VECTOR0_EVENT_OTHER,
 };