summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMoshe Shemesh <moshe@nvidia.com>2022-04-11 20:31:06 +0200
committerSaeed Mahameed <saeedm@nvidia.com>2022-05-04 09:00:06 +0200
commitcb7786a76ea39f394f0a059787fe24fa8e340fb6 (patch)
treeb6033260843a95fce8fb31da02c99ada8895c847
parentnet/mlx5e: Fix trust state reset in reload (diff)
downloadlinux-cb7786a76ea39f394f0a059787fe24fa8e340fb6.tar.xz
linux-cb7786a76ea39f394f0a059787fe24fa8e340fb6.zip
net/mlx5: Fix deadlock in sync reset flow
The sync reset flow can lead to the following deadlock when poll_sync_reset() is called by timer softirq and waiting on del_timer_sync() for the same timer. Fix that by moving the part of the flow that waits for the timer to reset_reload_work. It fixes the following kernel Trace: RIP: 0010:del_timer_sync+0x32/0x40 ... Call Trace: <IRQ> mlx5_sync_reset_clear_reset_requested+0x26/0x50 [mlx5_core] poll_sync_reset.cold+0x36/0x52 [mlx5_core] call_timer_fn+0x32/0x130 __run_timers.part.0+0x180/0x280 ? tick_sched_handle+0x33/0x60 ? tick_sched_timer+0x3d/0x80 ? ktime_get+0x3e/0xa0 run_timer_softirq+0x2a/0x50 __do_softirq+0xe1/0x2d6 ? hrtimer_interrupt+0x136/0x220 irq_exit+0xae/0xb0 smp_apic_timer_interrupt+0x7b/0x140 apic_timer_interrupt+0xf/0x20 </IRQ> Fixes: 3c5193a87b0f ("net/mlx5: Use del_timer_sync in fw reset flow of halting poll") Signed-off-by: Moshe Shemesh <moshe@nvidia.com> Reviewed-by: Maher Sanalla <msanalla@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c34
1 files changed, 17 insertions, 17 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 4aa22dce9b77..ec18d4ccbc11 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -155,22 +155,6 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
}
}
-static void mlx5_sync_reset_reload_work(struct work_struct *work)
-{
- struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
- reset_reload_work);
- struct mlx5_core_dev *dev = fw_reset->dev;
- int err;
-
- mlx5_enter_error_state(dev, true);
- mlx5_unload_one(dev);
- err = mlx5_health_wait_pci_up(dev);
- if (err)
- mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
- fw_reset->ret = err;
- mlx5_fw_reset_complete_reload(dev);
-}
-
static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
{
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
@@ -188,6 +172,23 @@ static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, boo
mlx5_start_health_poll(dev);
}
+static void mlx5_sync_reset_reload_work(struct work_struct *work)
+{
+ struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
+ reset_reload_work);
+ struct mlx5_core_dev *dev = fw_reset->dev;
+ int err;
+
+ mlx5_sync_reset_clear_reset_requested(dev, false);
+ mlx5_enter_error_state(dev, true);
+ mlx5_unload_one(dev);
+ err = mlx5_health_wait_pci_up(dev);
+ if (err)
+ mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
+ fw_reset->ret = err;
+ mlx5_fw_reset_complete_reload(dev);
+}
+
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
static void poll_sync_reset(struct timer_list *t)
{
@@ -202,7 +203,6 @@ static void poll_sync_reset(struct timer_list *t)
if (fatal_error) {
mlx5_core_warn(dev, "Got Device Reset\n");
- mlx5_sync_reset_clear_reset_requested(dev, false);
queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
return;
}