IB/hfi1: Fix destroy_qp hang after a link down

rvt_destroy_qp() cannot complete until all in process packets have been released from the underlying hardware. If a link down event occurs, an application can hang with a kernel stack similar to: cat /proc/<app PID>/stack quiesce_qp+0x178/0x250 [hfi1] rvt_reset_qp+0x23d/0x400 [rdmavt] rvt_destroy_qp+0x69/0x210 [rdmavt] ib_destroy_qp+0xba/0x1c0 [ib_core] nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma] nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma] nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma] nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma] process_one_work+0x17a/0x440 worker_thread+0x126/0x3c0 kthread+0xcf/0xe0 ret_from_fork+0x58/0x90 0xffffffffffffffff quiesce_qp() waits until all outstanding packets have been freed. This wait should be momentary. During a link down event, the cleanup handling does not ensure that all packets caught by the link down are flushed properly. This is caused by the fact that the freeze path and the link down event is handled the same. This is not correct. The freeze path waits until the HFI is unfrozen and then restarts PIO. A link down is not a freeze event. The link down path cannot restart the PIO until link is restored. If the PIO path is restarted before the link comes up, the application (QP) using the PIO path will hang (until link is restored). Fix by separating the linkdown path from the freeze path and use the link down path for link down events. Close a race condition sc_disable() by acquiring both the progress and release locks. Close a race condition in sc_stop() by moving the setting of the flag bits under the alloc lock. Cc: <stable@vger.kernel.org> # 4.9.x+ Fixes: 77241056 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

IB/hfi1: Fix destroy_qp hang after a link down
rvt_destroy_qp() cannot complete until all in process packets have been released from the underlying hardware. If a link down event occurs, an application can hang with a kernel stack similar to: cat /proc/<app PID>/stack quiesce_qp+0x178/0x250 [hfi1] rvt_reset_qp+0x23d/0x400 [rdmavt] rvt_destroy_qp+0x69/0x210 [rdmavt] ib_destroy_qp+0xba/0x1c0 [ib_core] nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma] nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma] nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma] nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma] process_one_work+0x17a/0x440 worker_thread+0x126/0x3c0 kthread+0xcf/0xe0 ret_from_fork+0x58/0x90 0xffffffffffffffff quiesce_qp() waits until all outstanding packets have been freed. This wait should be momentary. During a link down event, the cleanup handling does not ensure that all packets caught by the link down are flushed properly. This is caused by the fact that the freeze path and the link down event is handled the same. This is not correct. The freeze path waits until the HFI is unfrozen and then restarts PIO. A link down is not a freeze event. The link down path cannot restart the PIO until link is restored. If the PIO path is restarted before the link comes up, the application (QP) using the PIO path will hang (until link is restored). Fix by separating the linkdown path from the freeze path and use the link down path for link down events. Close a race condition sc_disable() by acquiring both the progress and release locks. Close a race condition in sc_stop() by moving the setting of the flag bits under the alloc lock. Cc: <stable@vger.kernel.org> # 4.9.x+ Fixes: 77241056 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
b4a4957d · Michael J. Ruhl · Jason Gunthorpe · d623500b · b4a4957d · b4a4957d
Commit b4a4957d authored Sep 20, 2018 by Michael J. Ruhl Committed by Jason Gunthorpe Sep 20, 2018
3 changed files
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -6733,6 +6733,7 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
 	struct hfi1_devdata *dd = ppd->dd;
 	struct send_context *sc;
 	int i;
+	int sc_flags;

 	if (flags & FREEZE_SELF)
 		write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
@@ -6743,11 +6744,13 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
 	/* notify all SDMA engines that they are going into a freeze */
 	sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));

+	sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ?
+					      SCF_LINK_DOWN : 0);
 	/* do halt pre-handling on all enabled send contexts */
 	for (i = 0; i < dd->num_send_contexts; i++) {
 		sc = dd->send_contexts[i].sc;
 		if (sc && (sc->flags & SCF_ENABLED))
-			sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+			sc_stop(sc, sc_flags);
 	}

 	/* Send context are frozen. Notify user space */
@@ -10674,6 +10677,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);

 		handle_linkup_change(dd, 1);
+		pio_kernel_linkup(dd);

 		/*
 		 * After link up, a new link width will have been set.

--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -926,20 +926,18 @@ void sc_free(struct send_context *sc)
 void sc_disable(struct send_context *sc)
 {
 	u64 reg;
-	unsigned long flags;
 	struct pio_buf *pbuf;

 	if (!sc)
 		return;

 	/* do all steps, even if already disabled */
-	spin_lock_irqsave(&sc->alloc_lock, flags);
+	spin_lock_irq(&sc->alloc_lock);
 	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
 	reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
 	sc->flags &= ~SCF_ENABLED;
 	sc_wait_for_packet_egress(sc, 1);
 	write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
-	spin_unlock_irqrestore(&sc->alloc_lock, flags);

 	/*
 	 * Flush any waiters.  Once the context is disabled,
@@ -949,7 +947,7 @@ void sc_disable(struct send_context *sc)
 	 * proceed with the flush.
 	 */
 	udelay(1);
-	spin_lock_irqsave(&sc->release_lock, flags);
+	spin_lock(&sc->release_lock);
 	if (sc->sr) {	/* this context has a shadow ring */
 		while (sc->sr_tail != sc->sr_head) {
 			pbuf = &sc->sr[sc->sr_tail].pbuf;
@@ -960,7 +958,8 @@ void sc_disable(struct send_context *sc)
 				sc->sr_tail = 0;
 		}
 	}
-	spin_unlock_irqrestore(&sc->release_lock, flags);
+	spin_unlock(&sc->release_lock);
+	spin_unlock_irq(&sc->alloc_lock);
 }

 /* return SendEgressCtxtStatus.PacketOccupancy */
@@ -1183,11 +1182,39 @@ void pio_kernel_unfreeze(struct hfi1_devdata *dd)
 		sc = dd->send_contexts[i].sc;
 		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
 			continue;
+		if (sc->flags & SCF_LINK_DOWN)
+			continue;

 		sc_enable(sc);	/* will clear the sc frozen flag */
 	}
 }

+/**
+ * pio_kernel_linkup() - Re-enable send contexts after linkup event
+ * @dd: valid devive data
+ *
+ * When the link goes down, the freeze path is taken.  However, a link down
+ * event is different from a freeze because if the send context is re-enabled
+ * whowever is sending data will start sending data again, which will hang
+ * any QP that is sending data.
+ *
+ * The freeze path now looks at the type of event that occurs and takes this
+ * path for link down event.
+ */
+void pio_kernel_linkup(struct hfi1_devdata *dd)
+{
+	struct send_context *sc;
+	int i;
+
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		sc = dd->send_contexts[i].sc;
+		if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER)
+			continue;
+
+		sc_enable(sc);	/* will clear the sc link down flag */
+	}
+}
+
 /*
 * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
 * Returns:
@@ -1387,11 +1414,10 @@ void sc_stop(struct send_context *sc, int flag)
 {
 	unsigned long flags;

-	/* mark the context */
-	sc->flags |= flag;
-
 	/* stop buffer allocations */
 	spin_lock_irqsave(&sc->alloc_lock, flags);
+	/* mark the context */
+	sc->flags |= flag;
 	sc->flags &= ~SCF_ENABLED;
 	spin_unlock_irqrestore(&sc->alloc_lock, flags);
 	wake_up(&sc->halt_wait);

--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -139,6 +139,7 @@ struct send_context {
 #define SCF_IN_FREE 0x02
 #define SCF_HALTED  0x04
 #define SCF_FROZEN  0x08
+#define SCF_LINK_DOWN 0x10

 struct send_context_info {
 	struct send_context *sc;	/* allocated working context */
@@ -306,6 +307,7 @@ void set_pio_integrity(struct send_context *sc);
 void pio_reset_all(struct hfi1_devdata *dd);
 void pio_freeze(struct hfi1_devdata *dd);
 void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+void pio_kernel_linkup(struct hfi1_devdata *dd);

 /* global PIO send control operations */
 #define PSC_GLOBAL_ENABLE 0