net/mlx5: Add vnic devlink health reporter to PFs/VFs

Create a vnic devlink health reporter for PFs/VFs interfaces.
The reporter's diagnose callback displays the values of vNIC/vport
transport debug counters of PFs/VFs, as follows:

$ devlink health diagnose pci/0000:08:00.0 reporter vnic
 vNIC env counters:
    total_error_queues: 0 send_queue_priority_update_flow: 0
    comp_eq_overrun: 0 async_eq_overrun: 0 cq_overrun: 0
    invalid_command: 0 quota_exceeded_command: 0
    nic_receive_steering_discard: 0

Moreover, add documentation on the reporter functionality and the
counters description.

While at it, expose the vNIC counters diagnose function to be used by
the downstream patch, which will reveal the counters for representor
interfaces.

Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
Maher Sanalla 2023-03-21 00:10:16 +02:00 committed by Saeed Mahameed
parent 0a431418f6
commit b0bc615df4
6 changed files with 177 additions and 1 deletions

View File

@ -257,3 +257,33 @@ User commands examples:
$ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
NOTE: This command can run only on PF.
vnic reporter
-------------
The vnic reporter implements only the `diagnose` callback.
It is responsible for querying the vnic diagnostic counters from fw and displaying
them in realtime.
Description of the vnic counters:
total_q_under_processor_handle: number of queues in an error state due to
an async error or errored command.
send_queue_priority_update_flow: number of QP/SQ priority/SL update
events.
cq_overrun: number of times CQ entered an error state due to an
overflow.
async_eq_overrun: number of times an EQ mapped to async events was
overrun.
comp_eq_overrun: number of times an EQ mapped to completion events was
overrun.
quota_exceeded_command: number of commands issued and failed due to quota
exceeded.
invalid_command: number of commands issued and failed dues to any reason
other than quota exceeded.
nic_receive_steering_discard: number of packets that completed RX flow
steering but were discarded due to a mismatch in flow table.
User commands examples:
- Diagnose PF/VF vnic counters
$ devlink health diagnose pci/0000:82:00.1 reporter vnic
NOTE: This command can run only on PF/VF ports.

View File

@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
fw_reset.o qos.o lib/tout.o lib/aso.o
#

View File

@ -0,0 +1,125 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
#include "reporter_vnic.h"
#include "devlink.h"
#define VNIC_ENV_GET64(vnic_env_stats, c) \
MLX5_GET64(query_vnic_env_out, (vnic_env_stats)->query_vnic_env_out, \
vport_env.c)
struct mlx5_vnic_diag_stats {
__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
};
int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
struct devlink_fmsg *fmsg,
u16 vport_num, bool other_vport)
{
u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
struct mlx5_vnic_diag_stats vnic;
int err;
MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV);
MLX5_SET(query_vnic_env_in, in, vport_number, vport_num);
MLX5_SET(query_vnic_env_in, in, other_vport, !!other_vport);
err = mlx5_cmd_exec_inout(dev, query_vnic_env, in, &vnic.query_vnic_env_out);
if (err)
return err;
err = devlink_fmsg_pair_nest_start(fmsg, "vNIC env counters");
if (err)
return err;
err = devlink_fmsg_obj_nest_start(fmsg);
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
VNIC_ENV_GET64(&vnic, total_error_queues));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
VNIC_ENV_GET64(&vnic, comp_eq_overrun));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
VNIC_ENV_GET64(&vnic, async_eq_overrun));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
VNIC_ENV_GET64(&vnic, cq_overrun));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
VNIC_ENV_GET64(&vnic, invalid_command));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
VNIC_ENV_GET64(&vnic, quota_exceeded_command));
if (err)
return err;
err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
if (err)
return err;
err = devlink_fmsg_obj_nest_end(fmsg);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
static int mlx5_reporter_vnic_diagnose(struct devlink_health_reporter *reporter,
struct devlink_fmsg *fmsg,
struct netlink_ext_ack *extack)
{
struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
return mlx5_reporter_vnic_diagnose_counters(dev, fmsg, 0, false);
}
static const struct devlink_health_reporter_ops mlx5_reporter_vnic_ops = {
.name = "vnic",
.diagnose = mlx5_reporter_vnic_diagnose,
};
void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
struct devlink *devlink = priv_to_devlink(dev);
health->vnic_reporter =
devlink_health_reporter_create(devlink,
&mlx5_reporter_vnic_ops,
0, dev);
if (IS_ERR(health->vnic_reporter))
mlx5_core_warn(dev,
"Failed to create vnic reporter, err = %ld\n",
PTR_ERR(health->vnic_reporter));
}
void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
if (!IS_ERR_OR_NULL(health->vnic_reporter))
devlink_health_reporter_destroy(health->vnic_reporter);
}

View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
*/
#ifndef __MLX5_REPORTER_VNIC_H
#define __MLX5_REPORTER_VNIC_H
#include "mlx5_core.h"
void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev);
void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev);
int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
struct devlink_fmsg *fmsg,
u16 vport_num, bool other_vport);
#endif /* __MLX5_REPORTER_VNIC_H */

View File

@ -42,6 +42,7 @@
#include "lib/pci_vsc.h"
#include "lib/tout.h"
#include "diag/fw_tracer.h"
#include "diag/reporter_vnic.h"
enum {
MAX_MISSES = 3,
@ -898,6 +899,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
cancel_delayed_work_sync(&health->update_fw_log_ts_work);
destroy_workqueue(health->wq);
mlx5_reporter_vnic_destroy(dev);
mlx5_fw_reporters_destroy(dev);
}
@ -907,6 +909,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
char *name;
mlx5_fw_reporters_create(dev);
mlx5_reporter_vnic_create(dev);
health = &dev->priv.health;
name = kmalloc(64, GFP_KERNEL);
@ -926,6 +929,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
return 0;
out_err:
mlx5_reporter_vnic_destroy(dev);
mlx5_fw_reporters_destroy(dev);
return -ENOMEM;
}

View File

@ -439,6 +439,7 @@ struct mlx5_core_health {
struct work_struct report_work;
struct devlink_health_reporter *fw_reporter;
struct devlink_health_reporter *fw_fatal_reporter;
struct devlink_health_reporter *vnic_reporter;
struct delayed_work update_fw_log_ts_work;
};