net/mlx5: Extend health buffer dump
Enhance health buffer to include: - assert_var5: expose the 6'th assert variable. - time: error's time-stamp in seconds (epoch time). - rfr: Recovery Flow Requiered. When set, indicates that the error cannot be recovered without flow involving reset. - severity: error's severity value, ranging from emergency to debug. Expose them in the health buffer dump (dmesg and devlink fw reporter). Health buffer in dmesg: mlx5_core 0000:08:00.0: print_health_info:425:(pid 912): Health issue observed, firmware internal error, severity(3) ERROR: mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[0] 0x08040700 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[1] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[2] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[3] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[4] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[5] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:432:(pid 912): assert_exit_ptr 0x00aaf800 mlx5_core 0000:08:00.0: print_health_info:434:(pid 912): assert_callra 0x00aaf70c mlx5_core 0000:08:00.0: print_health_info:436:(pid 912): fw_ver 16.32.492 mlx5_core 0000:08:00.0: print_health_info:437:(pid 912): time 1634819758 mlx5_core 0000:08:00.0: print_health_info:438:(pid 912): hw_id 0x0000020d mlx5_core 0000:08:00.0: print_health_info:439:(pid 912): rfr 0 mlx5_core 0000:08:00.0: print_health_info:440:(pid 912): severity 3 (ERROR) mlx5_core 0000:08:00.0: print_health_info:441:(pid 912): irisc_index 9 mlx5_core 0000:08:00.0: print_health_info:442:(pid 912): synd 0x1: firmware internal error mlx5_core 0000:08:00.0: print_health_info:444:(pid 912): ext_synd 0x802b mlx5_core 0000:08:00.0: print_health_info:445:(pid 912): raw fw_ver 0x102001ec Signed-off-by: Aya Levin <ayal@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
parent
2fdeb4f4c2
commit
cb464ba53c
|
@ -36,6 +36,7 @@
|
|||
#include <linux/vmalloc.h>
|
||||
#include <linux/hardirq.h>
|
||||
#include <linux/mlx5/driver.h>
|
||||
#include <linux/kern_levels.h>
|
||||
#include "mlx5_core.h"
|
||||
#include "lib/eq.h"
|
||||
#include "lib/mlx5.h"
|
||||
|
@ -74,6 +75,11 @@ enum {
|
|||
MLX5_SENSOR_FW_SYND_RFR = 5,
|
||||
};
|
||||
|
||||
enum {
|
||||
MLX5_SEVERITY_MASK = 0x7,
|
||||
MLX5_SEVERITY_VALID_MASK = 0x8,
|
||||
};
|
||||
|
||||
u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
|
||||
{
|
||||
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
|
||||
|
@ -98,12 +104,19 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
|
|||
return (ioread32be(&h->fw_ver) == 0xffffffff);
|
||||
}
|
||||
|
||||
static int mlx5_health_get_rfr(u8 rfr_severity)
|
||||
{
|
||||
return rfr_severity >> MLX5_RFR_BIT_OFFSET;
|
||||
}
|
||||
|
||||
static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_core_health *health = &dev->priv.health;
|
||||
struct health_buffer __iomem *h = health->health;
|
||||
u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
|
||||
u8 synd = ioread8(&h->synd);
|
||||
u8 rfr;
|
||||
|
||||
rfr = mlx5_health_get_rfr(ioread8(&h->rfr_severity));
|
||||
|
||||
if (rfr && synd)
|
||||
mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
|
||||
|
@ -366,18 +379,52 @@ static const char *hsynd_str(u8 synd)
|
|||
}
|
||||
}
|
||||
|
||||
static const char *mlx5_loglevel_str(int level)
|
||||
{
|
||||
switch (level) {
|
||||
case LOGLEVEL_EMERG:
|
||||
return "EMERGENCY";
|
||||
case LOGLEVEL_ALERT:
|
||||
return "ALERT";
|
||||
case LOGLEVEL_CRIT:
|
||||
return "CRITICAL";
|
||||
case LOGLEVEL_ERR:
|
||||
return "ERROR";
|
||||
case LOGLEVEL_WARNING:
|
||||
return "WARNING";
|
||||
case LOGLEVEL_NOTICE:
|
||||
return "NOTICE";
|
||||
case LOGLEVEL_INFO:
|
||||
return "INFO";
|
||||
case LOGLEVEL_DEBUG:
|
||||
return "DEBUG";
|
||||
}
|
||||
return "Unknown log level";
|
||||
}
|
||||
|
||||
static int mlx5_health_get_severity(u8 rfr_severity)
|
||||
{
|
||||
return rfr_severity & MLX5_SEVERITY_VALID_MASK ?
|
||||
rfr_severity & MLX5_SEVERITY_MASK : LOGLEVEL_ERR;
|
||||
}
|
||||
|
||||
static void print_health_info(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_core_health *health = &dev->priv.health;
|
||||
struct health_buffer __iomem *h = health->health;
|
||||
char fw_str[18];
|
||||
u32 fw;
|
||||
u8 rfr_severity;
|
||||
int severity;
|
||||
int i;
|
||||
|
||||
/* If the syndrome is 0, the device is OK and no need to print buffer */
|
||||
if (!ioread8(&h->synd))
|
||||
return;
|
||||
|
||||
rfr_severity = ioread8(&h->rfr_severity);
|
||||
severity = mlx5_health_get_severity(rfr_severity);
|
||||
mlx5_core_err(dev, "Health issue observed, %s, severity(%d) %s:\n",
|
||||
hsynd_str(ioread8(&h->synd)), severity, mlx5_loglevel_str(severity));
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
|
||||
mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i,
|
||||
ioread32be(h->assert_var + i));
|
||||
|
@ -386,15 +433,16 @@ static void print_health_info(struct mlx5_core_dev *dev)
|
|||
ioread32be(&h->assert_exit_ptr));
|
||||
mlx5_core_err(dev, "assert_callra 0x%08x\n",
|
||||
ioread32be(&h->assert_callra));
|
||||
sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
|
||||
mlx5_core_err(dev, "fw_ver %s\n", fw_str);
|
||||
mlx5_core_err(dev, "fw_ver %d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
|
||||
mlx5_core_err(dev, "time %u\n", ioread32be(&h->time));
|
||||
mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
|
||||
mlx5_core_err(dev, "rfr %d\n", mlx5_health_get_rfr(rfr_severity));
|
||||
mlx5_core_err(dev, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity));
|
||||
mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
|
||||
mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd),
|
||||
hsynd_str(ioread8(&h->synd)));
|
||||
mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
|
||||
fw = ioread32be(&h->fw_ver);
|
||||
mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
|
||||
mlx5_core_err(dev, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver));
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -443,6 +491,7 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
|
|||
{
|
||||
struct mlx5_core_health *health = &dev->priv.health;
|
||||
struct health_buffer __iomem *h = health->health;
|
||||
u8 rfr_severity;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
|
@ -473,9 +522,19 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
|
|||
return err;
|
||||
err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra",
|
||||
ioread32be(&h->assert_callra));
|
||||
if (err)
|
||||
return err;
|
||||
err = devlink_fmsg_u32_pair_put(fmsg, "time", ioread32be(&h->time));
|
||||
if (err)
|
||||
return err;
|
||||
err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id));
|
||||
if (err)
|
||||
return err;
|
||||
rfr_severity = ioread8(&h->rfr_severity);
|
||||
err = devlink_fmsg_u8_pair_put(fmsg, "rfr", mlx5_health_get_rfr(rfr_severity));
|
||||
if (err)
|
||||
return err;
|
||||
err = devlink_fmsg_u8_pair_put(fmsg, "severity", mlx5_health_get_severity(rfr_severity));
|
||||
if (err)
|
||||
return err;
|
||||
err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index",
|
||||
|
|
|
@ -541,19 +541,21 @@ struct mlx5_cmd_layout {
|
|||
u8 status_own;
|
||||
};
|
||||
|
||||
enum mlx5_fatal_assert_bit_offsets {
|
||||
MLX5_RFR_OFFSET = 31,
|
||||
enum mlx5_rfr_severity_bit_offsets {
|
||||
MLX5_RFR_BIT_OFFSET = 0x7,
|
||||
};
|
||||
|
||||
struct health_buffer {
|
||||
__be32 assert_var[5];
|
||||
__be32 rsvd0[3];
|
||||
__be32 assert_var[6];
|
||||
__be32 rsvd0[2];
|
||||
__be32 assert_exit_ptr;
|
||||
__be32 assert_callra;
|
||||
__be32 rsvd1[2];
|
||||
__be32 rsvd1[1];
|
||||
__be32 time;
|
||||
__be32 fw_ver;
|
||||
__be32 hw_id;
|
||||
__be32 rfr;
|
||||
u8 rfr_severity;
|
||||
u8 rsvd2[3];
|
||||
u8 irisc_index;
|
||||
u8 synd;
|
||||
__be16 ext_synd;
|
||||
|
|
|
@ -4149,13 +4149,19 @@ struct mlx5_ifc_health_buffer_bits {
|
|||
|
||||
u8 assert_callra[0x20];
|
||||
|
||||
u8 reserved_at_140[0x40];
|
||||
u8 reserved_at_140[0x20];
|
||||
|
||||
u8 time[0x20];
|
||||
|
||||
u8 fw_version[0x20];
|
||||
|
||||
u8 hw_id[0x20];
|
||||
|
||||
u8 reserved_at_1c0[0x20];
|
||||
u8 rfr[0x1];
|
||||
u8 reserved_at_1c1[0x3];
|
||||
u8 valid[0x1];
|
||||
u8 severity[0x3];
|
||||
u8 reserved_at_1c8[0x18];
|
||||
|
||||
u8 irisc_index[0x8];
|
||||
u8 synd[0x8];
|
||||
|
|
Loading…
Reference in New Issue