Commit dd1ed108 authored by Mike Marciniszyn, committed by Doug Ledford

IB/hfi1: Fix yield logic in send engine

When there are many RC QPs and an RDMA READ request is sent,
timeouts occur on the requester side because of the fairness
arbitration among the RC QPs sharing an SDMA engine on the
responder side.  RDMA WRITE and SEND traffic is also affected,
but to a lesser extent.

Complicating the issue, the current code does check whether the
workqueue is congested before scheduling other QPs; however, that
check is based on the maximum number of active entries allowed in
the workqueue, which was found to be too large for
workqueue_congested() to ever report congestion.

Fix by reducing the maximum number of active entries from the
default of num_sdma to HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, a value
arrived at through experimentation; retry counts were monitored
to determine the correct value.
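
For context on the mechanism: workqueue_congested() reports congestion
on a CPU only once the number of in-flight work items on that CPU's
pool has reached the workqueue's max_active limit and further items
are being delayed.  With max_active set to num_sdma, that limit was
effectively never reached, so the yield check never fired.  Below is
a minimal sketch of the relationship, not part of the patch; demo_wq,
demo_init(), and demo_should_yield() are illustrative names:

    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *demo_wq;

    static int demo_init(void)
    {
            /*
             * A small max_active (5, mirroring
             * HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES) keeps the limit
             * reachable under load, so congestion can actually be
             * observed.
             */
            demo_wq = alloc_workqueue("demo_wq",
                                      WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
                                      5);
            return demo_wq ? 0 : -ENOMEM;
    }

    static bool demo_should_yield(int cpu)
    {
            /*
             * True only while work items are being delayed on this
             * CPU's pool because max_active is exhausted.  With
             * max_active == num_sdma this effectively never
             * returned true.
             */
            return workqueue_congested(cpu, demo_wq);
    }

In the patch itself, schedule_send_yield() applies this same check
(together with the in_thread test) to decide between requeueing the
QP and a simple cond_resched().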

Tracepoints (hfi1_rc_do_send and hfi1_rc_expired_time_slice) are
also added to help investigate any future issues.
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent 688f21c0
@@ -70,6 +70,7 @@
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
 
 /*
  * min buffers we want to have per context, after driver
  */
@@ -623,7 +624,7 @@ static int create_workqueues(struct hfi1_devdata *dd)
                     alloc_workqueue(
                         "hfi%d_%d",
                         WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
-                        dd->num_sdma,
+                        HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
                         dd->unit, pidx);
                 if (!ppd->hfi1_wq)
                         goto wq_error;
...
@@ -800,6 +800,43 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 /* when sending, force a reschedule every one of these periods */
 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
 
+/**
+ * schedule_send_yield - test for a yield required for QP send engine
+ * @timeout: Final time for timeout slice for jiffies
+ * @qp: a pointer to QP
+ * @ps: a pointer to a structure with commonly looked-up values for
+ *      the send engine progress
+ *
+ * This routine checks if the time slice for the QP has expired
+ * for RC QPs; if so, an additional work entry is queued.  At this
+ * point, other QPs have an opportunity to be scheduled.  It
+ * returns true if a yield is required, otherwise false
+ * is returned.
+ */
+static bool schedule_send_yield(struct rvt_qp *qp,
+                                struct hfi1_pkt_state *ps)
+{
+        if (unlikely(time_after(jiffies, ps->timeout))) {
+                if (!ps->in_thread ||
+                    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
+                        spin_lock_irqsave(&qp->s_lock, ps->flags);
+                        qp->s_flags &= ~RVT_S_BUSY;
+                        hfi1_schedule_send(qp);
+                        spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+                        this_cpu_inc(*ps->ppd->dd->send_schedule);
+                        trace_hfi1_rc_expired_time_slice(qp, true);
+                        return true;
+                }
+                cond_resched();
+                this_cpu_inc(*ps->ppd->dd->send_schedule);
+                ps->timeout = jiffies + ps->timeout_int;
+        }
+        trace_hfi1_rc_expired_time_slice(qp, false);
+        return false;
+}
+
 void hfi1_do_send_from_rvt(struct rvt_qp *qp)
 {
         hfi1_do_send(qp, false);
@@ -827,13 +864,13 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
         struct hfi1_pkt_state ps;
         struct hfi1_qp_priv *priv = qp->priv;
         int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-        unsigned long timeout;
-        unsigned long timeout_int;
-        int cpu;
 
         ps.dev = to_idev(qp->ibqp.device);
         ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
         ps.ppd = ppd_from_ibp(ps.ibp);
+        ps.in_thread = in_thread;
+
+        trace_hfi1_rc_do_send(qp, in_thread);
 
         switch (qp->ibqp.qp_type) {
         case IB_QPT_RC:
@@ -844,7 +881,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
                         return;
                 }
                 make_req = hfi1_make_rc_req;
-                timeout_int = (qp->timeout_jiffies);
+                ps.timeout_int = qp->timeout_jiffies;
                 break;
         case IB_QPT_UC:
                 if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
@@ -854,11 +891,11 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
                         return;
                 }
                 make_req = hfi1_make_uc_req;
-                timeout_int = SEND_RESCHED_TIMEOUT;
+                ps.timeout_int = SEND_RESCHED_TIMEOUT;
                 break;
         default:
                 make_req = hfi1_make_ud_req;
-                timeout_int = SEND_RESCHED_TIMEOUT;
+                ps.timeout_int = SEND_RESCHED_TIMEOUT;
         }
 
         spin_lock_irqsave(&qp->s_lock, ps.flags);
@@ -871,9 +908,11 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 
         qp->s_flags |= RVT_S_BUSY;
 
-        timeout = jiffies + (timeout_int) / 8;
-        cpu = priv->s_sde ? priv->s_sde->cpu :
+        ps.timeout_int = ps.timeout_int / 8;
+        ps.timeout = jiffies + ps.timeout_int;
+        ps.cpu = priv->s_sde ? priv->s_sde->cpu :
                 cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+
         /* insure a pre-built packet is handled */
         ps.s_txreq = get_waiting_verbs_txreq(qp);
         do {
@@ -889,28 +928,9 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
                         /* Record that s_ahg is empty. */
                         qp->s_hdrwords = 0;
                         /* allow other tasks to run */
-                        if (unlikely(time_after(jiffies, timeout))) {
-                                if (!in_thread ||
-                                    workqueue_congested(
-                                                cpu,
-                                                ps.ppd->hfi1_wq)) {
-                                        spin_lock_irqsave(
-                                                &qp->s_lock,
-                                                ps.flags);
-                                        qp->s_flags &= ~RVT_S_BUSY;
-                                        hfi1_schedule_send(qp);
-                                        spin_unlock_irqrestore(
-                                                &qp->s_lock,
-                                                ps.flags);
-                                        this_cpu_inc(
-                                                *ps.ppd->dd->send_schedule);
-                                        return;
-                                }
-                                cond_resched();
-                                this_cpu_inc(
-                                        *ps.ppd->dd->send_schedule);
-                                timeout = jiffies + (timeout_int) / 8;
-                        }
+                        if (schedule_send_yield(qp, &ps))
+                                return;
+
                         spin_lock_irqsave(&qp->s_lock, ps.flags);
                 }
         } while (make_req(qp, &ps));
...
@@ -676,6 +676,40 @@ TRACE_EVENT(
         )
 );
 
+DECLARE_EVENT_CLASS(
+        hfi1_do_send_template,
+        TP_PROTO(struct rvt_qp *qp, bool flag),
+        TP_ARGS(qp, flag),
+        TP_STRUCT__entry(
+                DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                __field(u32, qpn)
+                __field(bool, flag)
+        ),
+        TP_fast_assign(
+                DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                __entry->qpn = qp->ibqp.qp_num;
+                __entry->flag = flag;
+        ),
+        TP_printk(
+                "[%s] qpn %x flag %d",
+                __get_str(dev),
+                __entry->qpn,
+                __entry->flag
+        )
+);
+
+DEFINE_EVENT(
+        hfi1_do_send_template, hfi1_rc_do_send,
+        TP_PROTO(struct rvt_qp *qp, bool flag),
+        TP_ARGS(qp, flag)
+);
+
+DEFINE_EVENT(
+        hfi1_do_send_template, hfi1_rc_expired_time_slice,
+        TP_PROTO(struct rvt_qp *qp, bool flag),
+        TP_ARGS(qp, flag)
+);
+
 #endif /* __HFI1_TRACE_TX_H */
 
 #undef TRACE_INCLUDE_PATH
...
@@ -139,6 +139,10 @@ struct hfi1_pkt_state {
         struct hfi1_pportdata *ppd;
         struct verbs_txreq *s_txreq;
         unsigned long flags;
+        unsigned long timeout;
+        unsigned long timeout_int;
+        int cpu;
+        bool in_thread;
 };
 
 #define HFI1_PSN_CREDIT 16
...