Commit 14553ca1 authored by Mike Marciniszyn, committed by Doug Ledford

staging/rdma/hfi1: Adaptive PIO for short messages

The change requires a new pio_busy field in the iowait structure to
track the number of outstanding pios.  The new counter, together
with the sdma counter, serves as the basis for a packet-by-packet
decision as to which egress mechanism to use.  Since packets given to
different egress mechanisms are not ordered with respect to each other,
the two counters are used to ensure one mechanism has drained before
the other is chosen, which preserves packet order.

The iowait drain/wait mechanisms are extended for the pio case.  An
additional qp wait flag, RVT_S_WAIT_PIO_DRAIN, is added for the PIO
drain wait case.
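At qp quiesce time the new pio drain runs alongside the existing sdma
drain.  Condensed from qp_pio_drain() in the diff below, the wait loop
arms the send context's buffer-available (credit return) interrupt so
the outstanding credits are guaranteed to come back:

    /* condensed from qp_pio_drain() in this patch */
    while (iowait_pio_pending(&priv->s_iowait)) {
            write_seqlock_irq(&dev->iowait_lock);
            hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1); /* arm */
            write_sequnlock_irq(&dev->iowait_lock);
            iowait_pio_drain(&priv->s_iowait); /* waits up to HZ */
            write_seqlock_irq(&dev->iowait_lock);
            hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0); /* disarm */
            write_sequnlock_irq(&dev->iowait_lock);
    }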

Currently the only pio wait is for buffers, so the no_bufs_available()
routine is renamed to pio_wait() and an additional argument is passed
with one of the two pio wait flags to generalize the routine.  A module
parameter, piothreshold, is added to hold the configurable threshold.
For now, the parameter defaults to zero.
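Since the parameter is read-only at runtime (S_IRUGO), a non-zero
threshold would be supplied at module load time, for example (the value
256 here is purely illustrative):

    modprobe hfi1 piothreshold=256

The current value can be read back from
/sys/module/hfi1/parameters/piothreshold.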

A heuristic routine, get_send_routine(), is added to return a function
pointer to the proper egress routine to use.

The heuristic is as follows:
- SMI always uses pio
- GSI, UD qps <= threshold use pio
- UD qps > threshold use sdma
  o No coordination with sdma is required because order is not required
    and the qp pio count is not maintained for UD
- RC/UC ONLY packets <= threshold choose as follows:
  o If sdmas are pending, use SDMA
  o Otherwise use pio and enable the pio tracking count at
    the time the pio buffer is allocated
- RC/UC ONLY packets > threshold use SDMA
  o If pios are pending, pio_wait() is called with the new wait flag
    to delay until the pios drain

For RC/UC, the threshold is additionally capped at the QP's mtu; with
piothreshold=256 and a pmtu of 128, for example, the effective cutoff
is min(256, 128) = 128 bytes.  The resulting per-packet decision is
sketched below.
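A condensed sketch of the decision implemented by get_send_routine()
in the diff below (RC shown; UC is identical with uc_only_opcode):

    /* condensed from get_send_routine() in this patch */
    if (!(dd->flags & HFI1_HAS_SEND_DMA))
            return dd->process_pio_send;    /* no sdma available */
    switch (qp->ibqp.qp_type) {
    case IB_QPT_SMI:
            return dd->process_pio_send;    /* SMI always uses pio */
    case IB_QPT_GSI:
    case IB_QPT_UD:
            if (piothreshold && qp->s_cur_size <= piothreshold)
                    return dd->process_pio_send;
            break;
    case IB_QPT_RC:
            if (piothreshold &&
                qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
                (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
                iowait_sdma_pending(&priv->s_iowait) == 0)
                    return dd->process_pio_send;
            break;
    default:
            break;
    }
    return dd->process_dma_send;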

sc_buffer_alloc() gains two additional arguments (a callback and a
void *), which the RC/UC cases use to pass the new completion routine
and a qp pointer.

When the shadow ring completes the credit associated with a packet,
the new completion routine is called.  verbs_pio_complete() then
decrements the busy count and wakes any drain waiters in qp destroy
or reset.
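Condensed, the pio_busy accounting pairs up across the send and
completion paths (taken from hfi1_verbs_send_pio() and
verbs_pio_complete() in the diff below); the count is incremented
before the allocation and undone immediately if the allocation fails:

    /* send side, hfi1_verbs_send_pio(); cb is set only for RC/UC */
    if (cb)
            iowait_pio_inc(&priv->s_iowait);
    pbuf = sc_buffer_alloc(sc, plen, cb, qp);
    if (unlikely(pbuf == NULL) && cb)
            verbs_pio_complete(qp, 0);  /* undo the count on failure */

    /* completion side, run when the shadow ring returns the credit */
    static void verbs_pio_complete(void *arg, int code)
    {
            struct rvt_qp *qp = (struct rvt_qp *)arg;
            struct hfi1_qp_priv *priv = qp->priv;

            if (iowait_pio_dec(&priv->s_iowait))
                    iowait_drain_wakeup(&priv->s_iowait);
    }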
Reviewed-by: Jubin John <jubin.john@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent 4f8cc5c0
@@ -1588,6 +1588,14 @@ static u64 access_sw_pio_wait(const struct cntr_entry *entry,
 	return dd->verbs_dev.n_piowait;
 }
 
+static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+			       void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return dd->verbs_dev.n_piodrain;
+}
+
 static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
 			      void *context, int vl, int mode, u64 data)
 {
@@ -4129,6 +4137,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
 			    access_sw_vtx_wait),
 [C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
 			    access_sw_pio_wait),
+[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+			    access_sw_pio_drain),
 [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
 			    access_sw_kmem_wait),
 [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
...
@@ -800,6 +800,7 @@ enum {
 	C_SW_CPU_RCV_LIM,
 	C_SW_VTX_WAIT,
 	C_SW_PIO_WAIT,
+	C_SW_PIO_DRAIN,
 	C_SW_KMEM_WAIT,
 	C_SW_SEND_SCHED,
 	C_SDMA_DESC_FETCHED_CNT,
...
@@ -811,6 +811,7 @@ struct sdma_vl_map;
 #define BOARD_VERS_MAX	  96 /* how long the version string can be */
 #define SERIAL_MAX	  16 /* length of the serial number */
 
+typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
 struct hfi1_devdata {
 	struct hfi1_ibdev verbs_dev;	/* must be first */
 	struct list_head list;
@@ -1121,10 +1122,8 @@ struct hfi1_devdata {
 	 * Handlers for outgoing data so that snoop/capture does not
 	 * have to have its hooks in the send path
 	 */
-	int (*process_pio_send)(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-				u64 pbc);
-	int (*process_dma_send)(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-				u64 pbc);
+	send_routine process_pio_send;
+	send_routine process_dma_send;
 	void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
 				u64 pbc, const void *from, size_t count);
...
@@ -55,6 +55,7 @@
 #include <linux/sched.h>
+
 #include "sdma_txreq.h"
 
 /*
  * typedef (*restart_t)() - restart callback
  * @work: pointer to work structure
@@ -71,6 +72,7 @@ struct sdma_engine;
  * @wakeup: space callback
  * @iowork: workqueue overhead
  * @wait_dma: wait for sdma_busy == 0
+ * @wait_pio: wait for pio_busy == 0
  * @sdma_busy: # of packets in flight
  * @count: total number of descriptors in tx_head'ed list
  * @tx_limit: limit for overflow queuing
@@ -104,7 +106,9 @@ struct iowait {
 	void (*wakeup)(struct iowait *wait, int reason);
 	struct work_struct iowork;
 	wait_queue_head_t wait_dma;
+	wait_queue_head_t wait_pio;
 	atomic_t sdma_busy;
+	atomic_t pio_busy;
 	u32 count;
 	u32 tx_limit;
 	u32 tx_count;
@@ -141,7 +145,9 @@ static inline void iowait_init(
 	INIT_LIST_HEAD(&wait->tx_head);
 	INIT_WORK(&wait->iowork, func);
 	init_waitqueue_head(&wait->wait_dma);
+	init_waitqueue_head(&wait->wait_pio);
 	atomic_set(&wait->sdma_busy, 0);
+	atomic_set(&wait->pio_busy, 0);
 	wait->tx_limit = tx_limit;
 	wait->sleep = sleep;
 	wait->wakeup = wakeup;
@@ -174,6 +180,88 @@ static inline void iowait_sdma_drain(struct iowait *wait)
 	wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
 }
 
+/**
+ * iowait_sdma_pending() - return sdma pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_sdma_pending(struct iowait *wait)
+{
+	return atomic_read(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_inc - note sdma io pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_inc(struct iowait *wait)
+{
+	atomic_inc(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_add - add count to pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_add(struct iowait *wait, int count)
+{
+	atomic_add(count, &wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_dec - note sdma complete
+ * @wait: iowait structure
+ */
+static inline int iowait_sdma_dec(struct iowait *wait)
+{
+	return atomic_dec_and_test(&wait->sdma_busy);
+}
+
+/**
+ * iowait_pio_drain() - wait for pios to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait pios have
+ * completed.
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+	wait_event_timeout(wait->wait_pio,
+			   !atomic_read(&wait->pio_busy),
+			   HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+	return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+	atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_dec - note pio complete
+ * @wait: iowait structure
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+	return atomic_dec_and_test(&wait->pio_busy);
+}
+
 /**
  * iowait_drain_wakeup() - trigger iowait_drain() waiter
  *
@@ -184,6 +272,7 @@ static inline void iowait_sdma_drain(struct iowait *wait)
 static inline void iowait_drain_wakeup(struct iowait *wait)
 {
 	wake_up(&wait->wait_dma);
+	wake_up(&wait->wait_pio);
 }
 
 /**
...
@@ -1564,7 +1564,8 @@ static void sc_piobufavail(struct send_context *sc)
 	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 
 	for (i = 0; i < n; i++)
-		hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO);
+		hfi1_qp_wakeup(qps[i],
+			       RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
 }
 
 /* translate a send credit update to a bit code of reasons */
...
@@ -359,6 +359,25 @@ void _hfi1_schedule_send(struct rvt_qp *qp)
 			cpumask_first(cpumask_of_node(dd->node)));
 }
 
+static void qp_pio_drain(struct rvt_qp *qp)
+{
+	struct hfi1_ibdev *dev;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (!priv->s_sendcontext)
+		return;
+	dev = to_idev(qp->ibqp.device);
+	while (iowait_pio_pending(&priv->s_iowait)) {
+		write_seqlock_irq(&dev->iowait_lock);
+		hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
+		write_sequnlock_irq(&dev->iowait_lock);
+		iowait_pio_drain(&priv->s_iowait);
+		write_seqlock_irq(&dev->iowait_lock);
+		hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
+		write_sequnlock_irq(&dev->iowait_lock);
+	}
+}
+
 /**
  * hfi1_schedule_send - schedule progress
  * @qp: the QP
@@ -620,7 +639,7 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
 	wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 	send_context = qp_to_send_context(qp, priv->s_sc);
 	seq_printf(s,
-		   "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) QP%u LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p\n",
+		   "N %d %s QP%x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) QP%x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p\n",
 		   iter->n,
 		   qp_idle(qp) ? "I" : "B",
 		   qp->ibqp.qp_num,
@@ -630,7 +649,8 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
 		   wqe ? wqe->wr.opcode : 0,
 		   qp->s_hdrwords,
 		   qp->s_flags,
-		   atomic_read(&priv->s_iowait.sdma_busy),
+		   iowait_sdma_pending(&priv->s_iowait),
+		   iowait_pio_pending(&priv->s_iowait),
 		   !list_empty(&priv->s_iowait.list),
 		   qp->timeout,
 		   wqe ? wqe->ssn : 0,
@@ -739,6 +759,7 @@ void quiesce_qp(struct rvt_qp *qp)
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	iowait_sdma_drain(&priv->s_iowait);
+	qp_pio_drain(qp);
 	flush_tx_list(qp);
 }
...
@@ -181,6 +181,18 @@ void hfi1_del_timers_sync(struct rvt_qp *qp)
 	del_timer_sync(&priv->s_rnr_timer);
 }
 
+/* only opcode mask for adaptive pio */
+const u32 rc_only_opcode =
+	BIT(OP(SEND_ONLY) & 0x1f) |
+	BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+	BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+	BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) |
+	BIT(OP(RDMA_READ_REQUEST & 0x1f)) |
+	BIT(OP(ACKNOWLEDGE & 0x1f)) |
+	BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) |
+	BIT(OP(COMPARE_SWAP & 0x1f)) |
+	BIT(OP(FETCH_ADD & 0x1f));
+
 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 		       u32 psn, u32 pmtu)
 {
@@ -217,6 +229,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	u32 bth2;
 	int middle = 0;
 	u32 pmtu = qp->pmtu;
+	struct hfi1_qp_priv *priv = qp->priv;
 
 	/* Don't send an ACK if we aren't supposed to. */
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
@@ -350,6 +363,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	qp->s_hdrwords = hwords;
 	/* pbc */
 	ps->s_txreq->hdr_dwords = hwords + 2;
+	ps->s_txreq->sde = priv->s_sde;
 	qp->s_cur_size = len;
 	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
 	return 1;
@@ -413,7 +427,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		if (qp->s_last == ACCESS_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
-		if (atomic_read(&priv->s_iowait.sdma_busy)) {
+		if (iowait_sdma_pending(&priv->s_iowait)) {
 			qp->s_flags |= RVT_S_WAIT_DMA;
 			goto bail;
 		}
@@ -754,6 +768,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	qp->s_hdrwords = hwords;
 	/* pbc */
 	ps->s_txreq->hdr_dwords = hwords + 2;
+	ps->s_txreq->sde = priv->s_sde;
 	qp->s_cur_sge = ss;
 	qp->s_cur_size = len;
 	hfi1_make_ruc_header(
...
@@ -410,7 +410,7 @@ static void sdma_flush(struct sdma_engine *sde)
 #endif
 		sdma_txclean(sde->dd, txp);
 		if (wait)
-			drained = atomic_dec_and_test(&wait->sdma_busy);
+			drained = iowait_sdma_dec(wait);
 		if (txp->complete)
 			(*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
 		if (wait && drained)
@@ -584,7 +584,7 @@ static void sdma_flush_descq(struct sdma_engine *sde)
 			/* remove from list */
 			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
 			if (wait)
-				drained = atomic_dec_and_test(&wait->sdma_busy);
+				drained = iowait_sdma_dec(wait);
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 			trace_hfi1_sdma_out_sn(sde, txp->sn);
 			if (WARN_ON_ONCE(sde->head_sn != txp->sn))
@@ -1498,7 +1498,7 @@ static void sdma_make_progress(struct sdma_engine *sde, u64 status)
 			/* remove from list */
 			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
 			if (wait)
-				drained = atomic_dec_and_test(&wait->sdma_busy);
+				drained = iowait_sdma_dec(wait);
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 			trace_hfi1_sdma_out_sn(sde, txp->sn);
 			if (WARN_ON_ONCE(sde->head_sn != txp->sn))
@@ -2092,14 +2092,14 @@ int sdma_send_txreq(struct sdma_engine *sde,
 		goto nodesc;
 	tail = submit_tx(sde, tx);
 	if (wait)
-		atomic_inc(&wait->sdma_busy);
+		iowait_sdma_inc(wait);
 	sdma_update_tail(sde, tail);
 unlock:
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
 	return ret;
 unlock_noconn:
 	if (wait)
-		atomic_inc(&wait->sdma_busy);
+		iowait_sdma_inc(wait);
 	tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 	tx->sn = sde->tail_sn++;
@@ -2181,7 +2181,7 @@ int sdma_send_txlist(struct sdma_engine *sde,
 	}
 update_tail:
 	if (wait)
-		atomic_add(count, &wait->sdma_busy);
+		iowait_sdma_add(wait, count);
 	if (tail != INVALID_TAIL)
 		sdma_update_tail(sde, tail);
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
@@ -2192,7 +2192,7 @@ int sdma_send_txlist(struct sdma_engine *sde,
 		tx->wait = wait;
 		list_del_init(&tx->list);
 		if (wait)
-			atomic_inc(&wait->sdma_busy);
+			iowait_sdma_inc(wait);
 		tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 		tx->sn = sde->tail_sn++;
...
@@ -55,6 +55,13 @@
 /* cut down ridiculously long IB macro names */
 #define OP(x) IB_OPCODE_UC_##x
 
+/* only opcode mask for adaptive pio */
+const u32 uc_only_opcode =
+	BIT(OP(SEND_ONLY) & 0x1f) |
+	BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+	BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+	BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
+
 /**
  * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
  * @qp: a pointer to the QP
@@ -86,7 +93,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		if (qp->s_last == ACCESS_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
-		if (atomic_read(&priv->s_iowait.sdma_busy)) {
+		if (iowait_sdma_pending(&priv->s_iowait)) {
 			qp->s_flags |= RVT_S_WAIT_DMA;
 			goto bail;
 		}
@@ -237,6 +244,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	qp->s_hdrwords = hwords;
 	/* pbc */
 	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+	ps->s_txreq->sde = priv->s_sde;
 	qp->s_cur_sge = &qp->s_sge;
 	qp->s_cur_size = len;
 	hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
...
@@ -294,7 +294,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	if (qp->s_last == ACCESS_ONCE(qp->s_head))
 		goto bail;
 	/* If DMAs are in progress, we can't flush immediately. */
-	if (atomic_read(&priv->s_iowait.sdma_busy)) {
+	if (iowait_sdma_pending(&priv->s_iowait)) {
 		qp->s_flags |= RVT_S_WAIT_DMA;
 		goto bail;
 	}
@@ -331,7 +331,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		 * Instead of waiting, we could queue a
 		 * zero length descriptor so we get a callback.
 		 */
-		if (atomic_read(&priv->s_iowait.sdma_busy)) {
+		if (iowait_sdma_pending(&priv->s_iowait)) {
 			qp->s_flags |= RVT_S_WAIT_DMA;
 			goto bail;
 		}
...
@@ -124,11 +124,20 @@ unsigned int hfi1_max_srq_wrs = 0x1FFFF;
 module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
 
+unsigned short piothreshold;
+module_param(piothreshold, ushort, S_IRUGO);
+MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+
 static void verbs_sdma_complete(
 	struct sdma_txreq *cookie,
 	int status,
 	int drained);
 
+static int pio_wait(struct rvt_qp *qp,
+		    struct send_context *sc,
+		    struct hfi1_pkt_state *ps,
+		    u32 flag);
+
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
@@ -742,9 +751,10 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
  * If we are now in the error state, return zero to flush the
  * send work request.
  */
-static int no_bufs_available(struct rvt_qp *qp,
-			     struct send_context *sc,
-			     struct hfi1_pkt_state *ps)
+static int pio_wait(struct rvt_qp *qp,
+		    struct send_context *sc,
+		    struct hfi1_pkt_state *ps,
+		    u32 flag)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_devdata *dd = sc->dd;
@@ -767,8 +777,10 @@ static int no_bufs_available(struct rvt_qp *qp,
 		struct hfi1_ibdev *dev = &dd->verbs_dev;
 		int was_empty;
 
-		dev->n_piowait++;
-		qp->s_flags |= RVT_S_WAIT_PIO;
+		dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
+		dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
+		qp->s_flags |= flag;
 		was_empty = list_empty(&sc->piowait);
 		list_add_tail(&priv->s_iowait.list, &sc->piowait);
 		trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
@@ -797,6 +809,15 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
 	return dd->vld[vl].sc;
 }
 
+static void verbs_pio_complete(void *arg, int code)
+{
+	struct rvt_qp *qp = (struct rvt_qp *)arg;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (iowait_pio_dec(&priv->s_iowait))
+		iowait_drain_wakeup(&priv->s_iowait);
+}
+
 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc)
 {
@@ -815,6 +836,17 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	struct pio_buf *pbuf;
 	int wc_status = IB_WC_SUCCESS;
 	int ret = 0;
+	pio_release_cb cb = NULL;
+
+	/* only RC/UC use complete */
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+		cb = verbs_pio_complete;
+		break;
+	default:
+		break;
+	}
 
 	/* vl15 special case taken care of in ud.c */
 	sc5 = priv->s_sc;
@@ -830,8 +862,12 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 		pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
 		pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	}
-	pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+	if (cb)
+		iowait_pio_inc(&priv->s_iowait);
+	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
 	if (unlikely(pbuf == NULL)) {
+		if (cb)
+			verbs_pio_complete(qp, 0);
 		if (ppd->host_link_state != HLS_UP_ACTIVE) {
 			/*
 			 * If we have filled the PIO buffers to capacity and are
@@ -851,8 +887,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			 * so lets continue to queue the request.
 			 */
 			hfi1_cdbg(PIO, "alloc failed. state active, queuing");
-			ret = no_bufs_available(qp, sc, ps);
+			ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
 			if (!ret)
+				/* txreq not queued - free */
 				goto bail;
 			/* tx consumed in wait */
 			return ret;
@@ -984,6 +1021,48 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
 	return 1;
 }
 
+/**
+ * get_send_routine - choose an egress routine
+ *
+ * Choose an egress routine based on QP type
+ * and size
+ */
+static inline send_routine get_send_routine(struct rvt_qp *qp,
+					    struct hfi1_ib_header *h)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
+		return dd->process_pio_send;
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_SMI:
+		return dd->process_pio_send;
+	case IB_QPT_GSI:
+	case IB_QPT_UD:
+		if (piothreshold && qp->s_cur_size <= piothreshold)
+			return dd->process_pio_send;
+		break;
+	case IB_QPT_RC:
+		if (piothreshold &&
+		    qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+		    (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
+		    iowait_sdma_pending(&priv->s_iowait) == 0)
+			return dd->process_pio_send;
+		break;
+	case IB_QPT_UC:
+		if (piothreshold &&
+		    qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+		    (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
+		    iowait_sdma_pending(&priv->s_iowait) == 0)
+			return dd->process_pio_send;
+		break;
+	default:
+		break;
+	}
+	return dd->process_dma_send;
+}
+
 /**
  * hfi1_verbs_send - send a packet
  * @qp: the QP to send on
@@ -995,19 +1074,10 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	send_routine sr;
 	int ret;
-	int pio = 0;
-	unsigned long flags = 0;
-
-	/*
-	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
-	 * can defer SDMA restart until link goes ACTIVE without
-	 * worrying about just how we got there.
-	 */
-	if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
-	    !(dd->flags & HFI1_HAS_SEND_DMA))
-		pio = 1;
 
+	sr = get_send_routine(qp, &ps->s_txreq->phdr.hdr);
 	ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
 	if (unlikely(ret)) {
 		/*
@@ -1018,7 +1088,9 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		 * mechanism for handling the errors. So for SDMA we can just
 		 * return.
 		 */
-		if (pio) {
+		if (sr == dd->process_pio_send) {
+			unsigned long flags;
+
 			hfi1_cdbg(PIO, "%s() Failed. Completing with err",
 				  __func__);
 			spin_lock_irqsave(&qp->s_lock, flags);
@@ -1027,20 +1099,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		}
 		return -EINVAL;
 	}
-
-	if (pio) {
-		ret = dd->process_pio_send(qp, ps, 0);
-	} else {
-#ifdef CONFIG_SDMA_VERBOSITY
-		dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
-			   slashstrip(__FILE__), __LINE__, __func__);
-		dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", qp->s_hdrwords,
-			   qp->s_cur_size);
-#endif
-		ret = dd->process_dma_send(qp, ps, 0);
-	}
-	return ret;
+	return sr(qp, ps, 0);
 }
 
 /**
...
@@ -265,6 +265,7 @@ struct hfi1_ibdev {
 	struct timer_list mem_timer;
 
 	u64 n_piowait;
+	u64 n_piodrain;
 	u64 n_txwait;
 	u64 n_kmem_wait;
@@ -425,6 +426,19 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
+extern const u32 rc_only_opcode;
+extern const u32 uc_only_opcode;
+
+static inline u8 get_opcode(struct hfi1_ib_header *h)
+{
+	u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
+
+	if (lnh == IB_LNH_IBA_LOCAL)
+		return be32_to_cpu(h->u.oth.bth[0]) >> 24;
+	else
+		return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
+}
+
 int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
 		       int has_grh, struct rvt_qp *qp, u32 bth0);
@@ -494,6 +508,8 @@ extern unsigned int hfi1_max_srq_sges;
 extern unsigned int hfi1_max_srq_wrs;
 
+extern unsigned short piothreshold;
+
 extern const u32 ib_hfi1_rnr_table[];
 
 #endif				/* HFI1_VERBS_H */
@@ -93,6 +93,11 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
 	return tx;
 }
 
+static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
+{
+	return &tx->txreq;
+}
+
 static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
 {
 	struct sdma_txreq *stx;
...
@@ -82,6 +82,7 @@
  * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating
  *                  next send completion entry not via send DMA
  * RVT_S_WAIT_PIO - waiting for a send buffer to be available
+ * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets
  * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available
  * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
  * RVT_S_WAIT_KMEM - waiting for kernel memory to be available
@@ -101,16 +102,17 @@
 #define RVT_S_WAIT_SSN_CREDIT	0x0100
 #define RVT_S_WAIT_DMA		0x0200
 #define RVT_S_WAIT_PIO		0x0400
-#define RVT_S_WAIT_TX		0x0800
-#define RVT_S_WAIT_DMA_DESC	0x1000
-#define RVT_S_WAIT_KMEM		0x2000
-#define RVT_S_WAIT_PSN		0x4000
-#define RVT_S_WAIT_ACK		0x8000
-#define RVT_S_SEND_ONE		0x10000
-#define RVT_S_UNLIMITED_CREDIT	0x20000
-#define RVT_S_AHG_VALID		0x40000
-#define RVT_S_AHG_CLEAR		0x80000
-#define RVT_S_ECN		0x100000
+#define RVT_S_WAIT_PIO_DRAIN	0x0800
+#define RVT_S_WAIT_TX		0x1000
+#define RVT_S_WAIT_DMA_DESC	0x2000
+#define RVT_S_WAIT_KMEM		0x4000
+#define RVT_S_WAIT_PSN		0x8000
+#define RVT_S_WAIT_ACK		0x10000
+#define RVT_S_SEND_ONE		0x20000
+#define RVT_S_UNLIMITED_CREDIT	0x40000
+#define RVT_S_AHG_VALID		0x80000
+#define RVT_S_AHG_CLEAR		0x100000
+#define RVT_S_ECN		0x200000
 
 /*
  * Wait flags that would prevent any packet type from being sent.
...