Commit e20204dc authored by Mohamad Haj Yahia, committed by Greg Kroah-Hartman

net/mlx5: Cancel delayed recovery work when unloading the driver

commit 2a0165a0 upstream.

Draining the health workqueue ignores all future health work, including
the work that reports a hardware failure, so the driver can no longer
enter the error state. Instead, cancel the recovery flow and make sure
that only new recovery work, not all health work, is blocked from being
scheduled.

Fixes: 5e44fca5 ('net/mlx5: Only cancel recovery work when cleaning up device')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 06732807
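
The hunks below follow a common drain pattern: take the workqueue lock, set a
"drop new work" flag, release the lock, then cancel any work that is already
queued, so nothing can re-arm itself afterwards. As a rough illustration only,
here is a minimal userspace analogue, not the mlx5 code: the names
drop_new_recovery_work, schedule_recovery() and drain_recovery() are invented
for this sketch, a pthread mutex stands in for health->wq_lock, and
pthread_join() stands in for cancel_delayed_work_sync().

/* Userspace sketch only -- all names below are invented for illustration. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER;
static bool drop_new_recovery_work;	/* analogue of MLX5_DROP_NEW_RECOVERY_WORK */
static bool recovery_scheduled;
static pthread_t recovery_thread;

static void *recovery_worker(void *arg)
{
	(void)arg;
	printf("recovery: running recovery flow\n");
	sleep(1);			/* stand-in for the actual recovery work */
	return NULL;
}

/* Schedule the recovery flow unless draining has already started. */
static void schedule_recovery(void)
{
	pthread_mutex_lock(&wq_lock);
	if (!drop_new_recovery_work && !recovery_scheduled) {
		pthread_create(&recovery_thread, NULL, recovery_worker, NULL);
		recovery_scheduled = true;
	} else {
		printf("recovery: new recovery work dropped\n");
	}
	pthread_mutex_unlock(&wq_lock);
}

/* Rough analogue of the new mlx5_drain_health_recovery(): block new
 * recovery work under the lock, then wait for anything already queued. */
static void drain_recovery(void)
{
	pthread_mutex_lock(&wq_lock);
	drop_new_recovery_work = true;
	pthread_mutex_unlock(&wq_lock);

	if (recovery_scheduled)
		pthread_join(recovery_thread, NULL);	/* like cancel_delayed_work_sync() */
}

int main(void)
{
	schedule_recovery();	/* health poll decided recovery is needed */
	drain_recovery();	/* driver unload: stop only the recovery flow */
	schedule_recovery();	/* late attempt: dropped because the flag is set */
	return 0;
}

In the driver itself the same idea is split across two flags, so that
mlx5_drain_health_recovery() stops only the recovery re-arm while
mlx5_drain_health_wq() keeps stopping all health work.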
@@ -67,6 +67,7 @@ enum {
 
 enum {
 	MLX5_DROP_NEW_HEALTH_WORK,
+	MLX5_DROP_NEW_RECOVERY_WORK,
 };
 
 static u8 get_nic_state(struct mlx5_core_dev *dev)
@@ -193,7 +194,7 @@ static void health_care(struct work_struct *work)
 	mlx5_handle_bad_state(dev);
 
 	spin_lock(&health->wq_lock);
-	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
 		schedule_delayed_work(&health->recover_work, recover_delay);
 	else
 		dev_err(&dev->pdev->dev,
@@ -328,6 +329,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 	init_timer(&health->timer);
 	health->sick = 0;
 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	health->health = &dev->iseg->health;
 	health->health_counter = &dev->iseg->health_counter;
@@ -350,11 +352,22 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 
 	spin_lock(&health->wq_lock);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	spin_unlock(&health->wq_lock);
 	cancel_delayed_work_sync(&health->recover_work);
 	cancel_work_sync(&health->work);
 }
 
+void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	spin_lock(&health->wq_lock);
+	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+	spin_unlock(&health->wq_lock);
+	cancel_delayed_work_sync(&dev->priv.health.recover_work);
+}
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -1169,7 +1169,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	int err = 0;
 
 	if (cleanup)
-		mlx5_drain_health_wq(dev);
+		mlx5_drain_health_recovery(dev);
 
 	mutex_lock(&dev->intf_state_mutex);
 	if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {
@@ -788,6 +788,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
+void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);