habanalabs/gaudi: handle axi errors from NIC engines

Various AXI errors can occur in the NIC engines and are reported to
the driver by the f/w. Add code to print the errors and ack them to
the f/w.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Oded Gabbay 2022-02-17 16:07:03 +02:00
parent f23f280277
commit 26ef1c000b
1 changed files with 48 additions and 0 deletions

View File

@ -7665,6 +7665,48 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
fw_alive->thread_id, fw_alive->uptime_seconds); fw_alive->thread_id, fw_alive->uptime_seconds);
} }
static void gaudi_print_nic_axi_irq_info(struct hl_device *hdev, u16 event_type,
void *data)
{
char desc[64] = "", *type;
struct eq_nic_sei_event *eq_nic_sei = data;
u16 nic_id = event_type - GAUDI_EVENT_NIC_SEI_0;
switch (eq_nic_sei->axi_error_cause) {
case RXB:
type = "RXB";
break;
case RXE:
type = "RXE";
break;
case TXS:
type = "TXS";
break;
case TXE:
type = "TXE";
break;
case QPC_RESP:
type = "QPC_RESP";
break;
case NON_AXI_ERR:
type = "NON_AXI_ERR";
break;
case TMR:
type = "TMR";
break;
default:
dev_err(hdev->dev, "unknown NIC AXI cause %d\n",
eq_nic_sei->axi_error_cause);
type = "N/A";
break;
}
snprintf(desc, sizeof(desc), "NIC%d_%s%d", nic_id, type,
eq_nic_sei->id);
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
}
static int gaudi_non_hard_reset_late_init(struct hl_device *hdev) static int gaudi_non_hard_reset_late_init(struct hl_device *hdev)
{ {
/* GAUDI doesn't support any reset except hard-reset */ /* GAUDI doesn't support any reset except hard-reset */
@ -7898,6 +7940,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
struct hl_eq_entry *eq_entry) struct hl_eq_entry *eq_entry)
{ {
struct gaudi_device *gaudi = hdev->asic_specific; struct gaudi_device *gaudi = hdev->asic_specific;
u64 data = le64_to_cpu(eq_entry->data[0]);
u32 ctl = le32_to_cpu(eq_entry->hdr.ctl); u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
u32 fw_fatal_err_flag = 0; u32 fw_fatal_err_flag = 0;
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
@ -8095,6 +8138,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
hl_fw_unmask_irq(hdev, event_type); hl_fw_unmask_irq(hdev, event_type);
break; break;
case GAUDI_EVENT_NIC_SEI_0 ... GAUDI_EVENT_NIC_SEI_4:
gaudi_print_nic_axi_irq_info(hdev, event_type, &data);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3: case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
gaudi_print_irq_info(hdev, event_type, false); gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_sm_sei_info(hdev, event_type, gaudi_print_sm_sei_info(hdev, event_type,