summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/hfi1/chip.c
diff options
context:
space:
mode:
authorMichael J. Ruhl <michael.j.ruhl@intel.com>2018-09-20 21:59:14 +0200
committerJason Gunthorpe <jgg@mellanox.com>2018-09-21 03:24:51 +0200
commitb4a4957d3d1c328b733fce783b7264996f866ad2 (patch)
tree030e57d01483ab424cf5d1e712077f8282cc5c7b /drivers/infiniband/hw/hfi1/chip.c
parentIB/hfi1: Fix context recovery when PBC has an UnsupportedVL (diff)
downloadlinux-b4a4957d3d1c328b733fce783b7264996f866ad2.tar.xz
linux-b4a4957d3d1c328b733fce783b7264996f866ad2.zip
IB/hfi1: Fix destroy_qp hang after a link down
rvt_destroy_qp() cannot complete until all in process packets have been released from the underlying hardware. If a link down event occurs, an application can hang with a kernel stack similar to: cat /proc/<app PID>/stack quiesce_qp+0x178/0x250 [hfi1] rvt_reset_qp+0x23d/0x400 [rdmavt] rvt_destroy_qp+0x69/0x210 [rdmavt] ib_destroy_qp+0xba/0x1c0 [ib_core] nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma] nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma] nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma] nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma] process_one_work+0x17a/0x440 worker_thread+0x126/0x3c0 kthread+0xcf/0xe0 ret_from_fork+0x58/0x90 0xffffffffffffffff quiesce_qp() waits until all outstanding packets have been freed. This wait should be momentary. During a link down event, the cleanup handling does not ensure that all packets caught by the link down are flushed properly. This is caused by the fact that the freeze path and the link down event is handled the same. This is not correct. The freeze path waits until the HFI is unfrozen and then restarts PIO. A link down is not a freeze event. The link down path cannot restart the PIO until link is restored. If the PIO path is restarted before the link comes up, the application (QP) using the PIO path will hang (until link is restored). Fix by separating the linkdown path from the freeze path and use the link down path for link down events. Close a race condition sc_disable() by acquiring both the progress and release locks. Close a race condition in sc_stop() by moving the setting of the flag bits under the alloc lock. Cc: <stable@vger.kernel.org> # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'drivers/infiniband/hw/hfi1/chip.c')
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c6
1 files changed, 5 insertions, 1 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 2c19bf772451..e1668bcc2d13 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -6733,6 +6733,7 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
struct hfi1_devdata *dd = ppd->dd;
struct send_context *sc;
int i;
+ int sc_flags;
if (flags & FREEZE_SELF)
write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
@@ -6743,11 +6744,13 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
/* notify all SDMA engines that they are going into a freeze */
sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+ sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ?
+ SCF_LINK_DOWN : 0);
/* do halt pre-handling on all enabled send contexts */
for (i = 0; i < dd->num_send_contexts; i++) {
sc = dd->send_contexts[i].sc;
if (sc && (sc->flags & SCF_ENABLED))
- sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+ sc_stop(sc, sc_flags);
}
/* Send context are frozen. Notify user space */
@@ -10674,6 +10677,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
handle_linkup_change(dd, 1);
+ pio_kernel_linkup(dd);
/*
* After link up, a new link width will have been set.