net/mlx5: Fix health error state handling
Currently, when we discover a fatal error, we queue a work item that
must wait for a lock before it can put the device into error state.
Meanwhile, FW commands are still being processed and time out.
This can block the driver for a few minutes before the work manages
to take the lock and enter error state.
Set the device to error state before queueing the health work, in order
to avoid FW commands being processed while the work is waiting for the
lock.
Fixes: c1d4d2e92a ("net/mlx5: Avoid calling sleeping function by the health poll thread")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
parent
65ba8594a2
commit
51d138c261
|
@ -190,6 +190,16 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||||
|
{
|
||||||
|
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
|
||||||
|
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||||
|
mlx5_cmd_flush(dev);
|
||||||
|
}
|
||||||
|
|
||||||
|
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
|
||||||
|
}
|
||||||
|
|
||||||
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||||
{
|
{
|
||||||
bool err_detected = false;
|
bool err_detected = false;
|
||||||
|
@ -208,12 +218,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||||
goto unlock;
|
goto unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
|
enter_error_state(dev, force);
|
||||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
|
||||||
mlx5_cmd_flush(dev);
|
|
||||||
}
|
|
||||||
|
|
||||||
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
|
|
||||||
unlock:
|
unlock:
|
||||||
mutex_unlock(&dev->intf_state_mutex);
|
mutex_unlock(&dev->intf_state_mutex);
|
||||||
}
|
}
|
||||||
|
@ -613,7 +618,7 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
|
||||||
priv = container_of(health, struct mlx5_priv, health);
|
priv = container_of(health, struct mlx5_priv, health);
|
||||||
dev = container_of(priv, struct mlx5_core_dev, priv);
|
dev = container_of(priv, struct mlx5_core_dev, priv);
|
||||||
|
|
||||||
mlx5_enter_error_state(dev, false);
|
enter_error_state(dev, false);
|
||||||
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
|
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
|
||||||
if (mlx5_health_try_recover(dev))
|
if (mlx5_health_try_recover(dev))
|
||||||
mlx5_core_err(dev, "health recovery failed\n");
|
mlx5_core_err(dev, "health recovery failed\n");
|
||||||
|
@ -707,8 +712,9 @@ static void poll_health(struct timer_list *t)
|
||||||
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
|
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
|
||||||
dev->priv.health.fatal_error = fatal_error;
|
dev->priv.health.fatal_error = fatal_error;
|
||||||
print_health_info(dev);
|
print_health_info(dev);
|
||||||
|
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||||
mlx5_trigger_health_work(dev);
|
mlx5_trigger_health_work(dev);
|
||||||
goto out;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = ioread32be(health->health_counter);
|
count = ioread32be(health->health_counter);
|
||||||
|
|
Loading…
Reference in New Issue