Commit 82771f20 authored by Doug Ledford's avatar Doug Ledford

Merge branch 'wip/dl-for-next' into for-next

Due to concurrent work by myself and Jason, a normal fast forward merge
was not possible.  This brings in a number of hfi1 changes, mainly the
hfi1 TID RDMA support (roughly 10,000 LOC change), which was reviewed
and integrated over a period of days.
Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parents ecb53feb 416fbc1b
......@@ -24,6 +24,7 @@ hfi1-y := \
mad.o \
mmu_rb.o \
msix.o \
opfn.o \
pcie.o \
pio.o \
pio_copy.o \
......
......@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
......@@ -5222,6 +5224,17 @@ int is_bx(struct hfi1_devdata *dd)
return (chip_rev_minor & 0xF0) == 0x10;
}
/* return true is kernel urg disabled for rcd */
bool is_urg_masked(struct hfi1_ctxtdata *rcd)
{
u64 mask;
u32 is = IS_RCVURGENT_START + rcd->ctxt;
u8 bit = is % 64;
mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
return !(mask & BIT_ULL(bit));
}
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.
......
#ifndef _CHIP_H
#define _CHIP_H
/*
* Copyright(c) 2015 - 2017 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
......@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
......@@ -926,6 +927,7 @@ enum {
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
......
......@@ -340,6 +340,10 @@ struct diag_pkt {
#define HFI1_PSM_IOC_BASE_SEQ 0x0
/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define HFI1_KDETH_BTH_SEQ_SHIFT 11
#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
......
......@@ -1575,25 +1575,32 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet)
return -EINVAL;
}
void handle_eflags(struct hfi1_packet *packet)
static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
dd_dev_err(rcd->dd,
"receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
rcd->ctxt, packet->rhf,
packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
packet->rhf & RHF_DC_ERR ? "dc " : "",
packet->rhf & RHF_TID_ERR ? "tid " : "",
packet->rhf & RHF_LEN_ERR ? "len " : "",
packet->rhf & RHF_ECC_ERR ? "ecc " : "",
packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
rte);
}
void handle_eflags(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
dd_dev_err(rcd->dd,
"receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
rcd->ctxt, packet->rhf,
packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
packet->rhf & RHF_DC_ERR ? "dc " : "",
packet->rhf & RHF_TID_ERR ? "tid " : "",
packet->rhf & RHF_LEN_ERR ? "len " : "",
packet->rhf & RHF_ECC_ERR ? "ecc " : "",
packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
rte);
show_eflags_errs(packet);
}
/*
......@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
if (unlikely(rhf_err_flags(packet->rhf)))
handle_eflags(packet);
if (unlikely(rhf_err_flags(packet->rhf))) {
struct hfi1_ctxtdata *rcd = packet->rcd;
dd_dev_err(packet->rcd->dd,
"Unhandled expected packet received. Dropping.\n");
if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
return RHF_RCV_CONTINUE;
}
hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
......@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
if (unlikely(rhf_err_flags(packet->rhf)))
handle_eflags(packet);
dd_dev_err(packet->rcd->dd,
"Unhandled eager packet received. Dropping.\n");
trace_hfi1_rcvhdr(packet);
if (unlikely(rhf_err_flags(packet->rhf))) {
struct hfi1_ctxtdata *rcd = packet->rcd;
show_eflags_errs(packet);
if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
return RHF_RCV_CONTINUE;
}
hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
......
......@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
......@@ -98,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1
#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \
......@@ -195,6 +198,14 @@ struct exp_tid_set {
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
struct tid_queue {
struct list_head queue_head;
/* queue head for QP TID resource waiters */
u32 enqueue; /* count of tid enqueues */
u32 dequeue; /* count of tid dequeues */
};
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
......@@ -288,6 +299,12 @@ struct hfi1_ctxtdata {
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
/* lock protecting all Expected TID data of kernel contexts */
spinlock_t exp_lock;
/* Queue for QP's waiting for HW TID flows */
struct tid_queue flow_queue;
/* Queue for QP's waiting for HW receive array entries */
struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
......@@ -320,6 +337,9 @@ struct hfi1_ctxtdata {
*/
u8 subctxt_cnt;
/* Bit mask to track free TID RDMA HW flows */
unsigned long flow_mask;
struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
......@@ -2100,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
else
else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
......
......@@ -72,7 +72,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
......@@ -371,6 +370,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
spin_lock_init(&rcd->exp_lock);
INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
......@@ -473,6 +475,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
/* Initialize TID flow generations for the context */
hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
......@@ -772,6 +777,8 @@ static void enable_chip(struct hfi1_devdata *dd)
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
if (HFI1_CAP_IS_KSET(TID_RDMA))
rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
......@@ -927,6 +934,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
if (!lastfail)
lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
......@@ -1497,6 +1506,13 @@ static int __init hfi1_mod_init(void)
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
ret = opfn_init();
if (ret < 0) {
pr_err("Failed to allocate opfn_wq");
goto bail_dev;
}
hfi1_compute_tid_rdma_flow_wt();
/*
* These must be called before the driver is registered with
* the PCI subsystem.
......@@ -1527,6 +1543,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
opfn_exit();
node_affinity_destroy_all();
hfi1_dbg_exit();
......@@ -1581,7 +1598,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
hfi1_clear_tids(rcd);
hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
......
......@@ -6,6 +6,9 @@
#include "iowait.h"
#include "trace_iowait.h"
/* 1 priority == 16 starve_cnt */
#define IOWAIT_PRIORITY_STARVE_SHIFT 4
void iowait_set_flag(struct iowait *wait, u32 flag)
{
trace_hfi1_iowait_set(wait, flag);
......@@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
void (*sdma_drained)(struct iowait *wait))
void (*sdma_drained)(struct iowait *wait),
void (*init_priority)(struct iowait *wait))
{
int i;
......@@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
wait->init_priority = init_priority;
wait->flags = 0;
for (i = 0; i < IOWAIT_SES; i++) {
wait->wait[i].iow = wait;
......@@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w)
iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
return IOWAIT_TID_SE;
}
/**
* iowait_priority_update_top - update the top priority entry
* @w: the iowait struct
* @top: a pointer to the top priority entry
* @idx: the index of the current iowait in an array
* @top_idx: the array index for the iowait entry that has the top priority
*
* This function is called to compare the priority of a given
* iowait with the given top priority entry. The top index will
* be returned.
*/
uint iowait_priority_update_top(struct iowait *w,
struct iowait *top,
uint idx, uint top_idx)
{
u8 cnt, tcnt;
/* Convert priority into starve_cnt and compare the total.*/
cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
top->starved_cnt;
if (cnt > tcnt)
return idx;
else
return top_idx;
}
......@@ -100,6 +100,7 @@ struct iowait_work {
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
* @init_priority: callback to manipulate priority
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
......@@ -109,7 +110,7 @@ struct iowait_work {
* @tx_limit: limit for overflow queuing
* @tx_count: number of tx entry's in tx_head'ed list
* @flags: wait flags (one per QP)
* @wait: SE array
* @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
......@@ -120,10 +121,13 @@ struct iowait_work {
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
* when a resource shortage like SDMA ring space is seen.
* when a resource shortage like SDMA ring space
* or PIO credit space is seen.
*
* Both potentially have locks help
* so sleeping is not allowed.
* so sleeping is not allowed and it is not
* supported to submit txreqs from the wakeup
* call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
......@@ -143,6 +147,7 @@ struct iowait {
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
......@@ -152,6 +157,7 @@ struct iowait {
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
u8 priority;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
};
......@@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
void (*sdma_drained)(struct iowait *wait));
void (*sdma_drained)(struct iowait *wait),
void (*init_priority)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
......@@ -185,6 +192,18 @@ static inline bool iowait_schedule(struct iowait *wait,
return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
}
/**
* iowait_tid_schedule - schedule the tid SE
* @wait: the iowait structure
* @wq: the work queue
* @cpu: the cpu
*/
static inline bool iowait_tid_schedule(struct iowait *wait,
struct workqueue_struct *wq, int cpu)
{
return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
}
/**
* iowait_sdma_drain() - wait for DMAs to drain
*
......@@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w)
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
num_desc = tx->num_desc;
if (tx->flags & SDMA_TXREQ_F_VIP)
w->iow->priority++;
}
return num_desc;
}
......@@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w)
return num_desc;
}
static inline void iowait_update_priority(struct iowait_work *w)
{
struct sdma_txreq *tx = NULL;
if (!list_empty(&w->tx_head)) {
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
if (tx->flags & SDMA_TXREQ_F_VIP)
w->iow->priority++;
}
}
static inline void iowait_update_all_priority(struct iowait *w)
{
iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
}
static inline void iowait_init_priority(struct iowait *w)
{
w->priority = 0;
if (w->init_priority)
w->init_priority(w);
}
static inline void iowait_get_priority(struct iowait *w)
{
iowait_init_priority(w);
iowait_update_all_priority(w);
}
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
......@@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w,
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
* However, if it has priority packets to send, also put it at the
* head.
*/
if (pkts_sent) {
list_add_tail(&w->list, wait_head);
if (pkts_sent)
w->starved_cnt = 0;
} else {
list_add(&w->list, wait_head);
else
w->starved_cnt++;
}
if (w->priority > 0 || !pkts_sent)
list_add(&w->list, wait_head);
else
list_add_tail(&w->list, wait_head);
}
/**
......@@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
w->starved_cnt = 0;
}
/**
* iowait_starve_find_max - Find the maximum of the starve count
* @w: the iowait struct
* @max: a variable containing the max starve count
* @idx: the index of the current iowait in an array
* @max_idx: a variable containing the array index for the
* iowait entry that has the max starve count
*
* This function is called to compare the starve count of a
* given iowait with the given max starve count. The max starve
* count and the index will be updated if the iowait's start
* count is larger.
*/
static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
uint idx, uint *max_idx)
{
if (w->starved_cnt > *max) {
*max = w->starved_cnt;
*max_idx = idx;
}
}
/* Update the top priority index */
uint iowait_priority_update_top(struct iowait *w,
struct iowait *top,
uint idx, uint top_idx);
/**
* iowait_packet_queued() - determine if a packet is queued
......
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#include "hfi.h"
#include "trace.h"
#include "qp.h"
#include "opfn.h"
#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
#define OPFN_CODE(code) BIT((code) - 1)
#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
struct hfi1_opfn_type {
bool (*request)(struct rvt_qp *qp, u64 *data);
bool (*response)(struct rvt_qp *qp, u64 *data);
bool (*reply)(struct rvt_qp *qp, u64 data);
void (*error)(struct rvt_qp *qp);
};
static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
[STL_VERBS_EXTD_TID_RDMA] = {
.request = tid_rdma_conn_req,
.response = tid_rdma_conn_resp,
.reply = tid_rdma_conn_reply,
.error = tid_rdma_conn_error,
},
};
static struct workqueue_struct *opfn_wq;
static void opfn_schedule_conn_request(struct rvt_qp *qp);
static bool hfi1_opfn_extended(u32 bth1)
{
return !!(bth1 & IB_BTHE_E);
}
static void opfn_conn_request(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct ib_atomic_wr wr;
u16 mask, capcode;
struct hfi1_opfn_type *extd;
u64 data;
unsigned long flags;
int ret = 0;
trace_hfi1_opfn_state_conn_request(qp);
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* Exit if the extended bit is not set, or if nothing is requested, or
* if we have completed all requests, or if a previous request is in
* progress
*/
if (!priv->opfn.extended || !priv->opfn.requested ||
priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
goto done;
mask = priv->opfn.requested & ~priv->opfn.completed;
capcode = ilog2(mask & ~(mask - 1)) + 1;
if (capcode >= STL_VERBS_EXTD_MAX) {
priv->opfn.completed |= OPFN_CODE(capcode);
goto done;
}
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->request || !extd->request(qp, &data)) {
/*
* Either there is no handler for this capability or the request
* packet could not be generated. Either way, mark it as done so
* we don't keep attempting to complete it.
*/
priv->opfn.completed |= OPFN_CODE(capcode);
goto done;
}
trace_hfi1_opfn_data_conn_request(qp, capcode, data);
data = (data & ~0xf) | capcode;
memset(&wr, 0, sizeof(wr));
wr.wr.opcode = IB_WR_OPFN;
wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
wr.compare_add = data;
priv->opfn.curr = capcode; /* A new request is now in progress */
/* Drop opfn.lock before calling ib_post_send() */
spin_unlock_irqrestore(&priv->opfn.lock, flags);
ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
if (ret)
goto err;
trace_hfi1_opfn_state_conn_request(qp);
return;
err:
trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ",
(u64)ret);
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* In case of an unexpected error return from ib_post_send
* clear opfn.curr and reschedule to try again
*/
priv->opfn.curr = STL_VERBS_EXTD_NONE;
opfn_schedule_conn_request(qp);
done:
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_send_conn_request(struct work_struct *work)
{
struct hfi1_opfn_data *od;
struct hfi1_qp_priv *qpriv;
od = container_of(work, struct hfi1_opfn_data, opfn_work);
qpriv = container_of(od, struct hfi1_qp_priv, opfn);
opfn_conn_request(qpriv->owner);
}
/*
* When QP s_lock is held in the caller, the OPFN request must be scheduled
* to a different workqueue to avoid double locking QP s_lock in call to
* ib_post_send in opfn_conn_request
*/
static void opfn_schedule_conn_request(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
trace_hfi1_opfn_state_sched_conn_request(qp);
queue_work(opfn_wq, &priv->opfn.opfn_work);
}
void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
struct ib_atomic_eth *ateth)
{
struct hfi1_qp_priv *priv = qp->priv;
u64 data = be64_to_cpu(ateth->compare_data);
struct hfi1_opfn_type *extd;
u8 capcode;
unsigned long flags;
trace_hfi1_opfn_state_conn_response(qp);
capcode = data & 0xf;
trace_hfi1_opfn_data_conn_response(qp, capcode, data);
if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
return;
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->response) {
e->atomic_data = capcode;
return;
}
spin_lock_irqsave(&priv->opfn.lock, flags);
if (priv->opfn.completed & OPFN_CODE(capcode)) {
/*
* We are receiving a request for a feature that has already
* been negotiated. This may mean that the other side has reset
*/
priv->opfn.completed &= ~OPFN_CODE(capcode);
if (extd->error)
extd->error(qp);
}
if (extd->response(qp, &data))
priv->opfn.completed |= OPFN_CODE(capcode);
e->atomic_data = (data & ~0xf) | capcode;
trace_hfi1_opfn_state_conn_response(qp);
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_conn_reply(struct rvt_qp *qp, u64 data)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_opfn_type *extd;
u8 capcode;
unsigned long flags;
trace_hfi1_opfn_state_conn_reply(qp);
capcode = data & 0xf;
trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
return;
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* Either there is no previous request or the reply is not for the
* current request
*/
if (!priv->opfn.curr || capcode != priv->opfn.curr)
goto done;
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->reply)
goto clear;
if (extd->reply(qp, data))
priv->opfn.completed |= OPFN_CODE(capcode);
clear:
/*
* Clear opfn.curr to indicate that the previous request is no longer in
* progress
*/
priv->opfn.curr = STL_VERBS_EXTD_NONE;
trace_hfi1_opfn_state_conn_reply(qp);
done:
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_conn_error(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_opfn_type *extd = NULL;
unsigned long flags;
u16 capcode;
trace_hfi1_opfn_state_conn_error(qp);
trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
/*
* The QP has gone into the Error state. We have to invalidate all
* negotiated feature, including the one in progress (if any). The RC
* QP handling will clean the WQE for the connection request.
*/
spin_lock_irqsave(&priv->opfn.lock, flags);
while (priv->opfn.completed) {
capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
if (extd->error)
extd->error(qp);
priv->opfn.completed &= ~OPFN_CODE(capcode);
}
priv->opfn.extended = 0;
priv->opfn.requested = 0;
priv->opfn.curr = STL_VERBS_EXTD_NONE;
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
{
struct ib_qp *ibqp = &qp->ibqp;
struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
if (attr_mask & IB_QP_RETRY_CNT)
priv->s_retry = attr->retry_cnt;
spin_lock_irqsave(&priv->opfn.lock, flags);
if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
struct tid_rdma_params *local = &priv->tid_rdma.local;
if (attr_mask & IB_QP_TIMEOUT)
priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
tid_rdma_opfn_init(qp, local);
/*
* We only want to set the OPFN requested bit when the
* QP transitions to RTS.
*/
if (attr_mask & IB_QP_STATE &&
attr->qp_state == IB_QPS_RTS) {
priv->opfn.requested |= OPFN_MASK(TID_RDMA);
/*
* If the QP is transitioning to RTS and the
* opfn.completed for TID RDMA has already been
* set, the QP is being moved *back* into RTS.
* We can now renegotiate the TID RDMA
* parameters.
*/
if (priv->opfn.completed &
OPFN_MASK(TID_RDMA)) {
priv->opfn.completed &=
~OPFN_MASK(TID_RDMA);
/*
* Since the opfn.completed bit was
* already set, it is safe to assume
* that the opfn.extended is also set.
*/
opfn_schedule_conn_request(qp);
}
}
} else {
memset(local, 0, sizeof(*local));
}
}
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
{
struct hfi1_qp_priv *priv = qp->priv;
if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
HFI1_CAP_IS_KSET(OPFN)) {
priv->opfn.extended = 1;
if (qp->state == IB_QPS_RTS)
opfn_conn_request(qp);
}
}
int opfn_init(void)
{
opfn_wq = alloc_workqueue("hfi_opfn",
WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
WQ_MEM_RECLAIM,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
if (!opfn_wq)
return -ENOMEM;
return 0;
}
void opfn_exit(void)
{
if (opfn_wq) {
destroy_workqueue(opfn_wq);
opfn_wq = NULL;
}
}
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#ifndef _HFI1_OPFN_H
#define _HFI1_OPFN_H
/**
* DOC: Omni Path Feature Negotion (OPFN)
*
* OPFN is a discovery protocol for Intel Omni-Path fabric that
* allows two RC QPs to negotiate a common feature that both QPs
* can support. Currently, the only OPA feature that OPFN
* supports is TID RDMA.
*
* Architecture
*
* OPFN involves the communication between two QPs on the HFI
* level on an Omni-Path fabric, and ULPs have no knowledge of
* OPFN at all.
*
* Implementation
*
* OPFN extends the existing IB RC protocol with the following
* changes:
* -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
* Header (BTH1) to indicate that the RC QP supports OPFN;
* -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
* the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
* request; The 64-bit data carried with the request/response
* contains the parameters for negotiation and will be
* defined in tid_rdma.c file;
* -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
*
* The OPFN communication will be triggered when an RC QP
* receives a request with Bit 24 of BTH1 set. The responder QP
* will then post send an OPFN request with its local
* parameters, which will be sent to the requester QP once all
* existing requests on the responder QP side have been sent.
* Once the requester QP receives the OPFN request, it will
* keep a copy of the responder QP's parameters, and return a
* response packet with its own local parameters. The responder
* QP receives the response packet and keeps a copy of the requester
* QP's parameters. After this exchange, each side has the parameters
* for both sides and therefore can select the right parameters
* for future transactions
*/
/* STL Verbs Extended */
#define IB_BTHE_E_SHIFT 24
#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
struct ib_atomic_eth;
enum hfi1_opfn_codes {
STL_VERBS_EXTD_NONE = 0,
STL_VERBS_EXTD_TID_RDMA,
STL_VERBS_EXTD_MAX
};
struct hfi1_opfn_data {
u8 extended;
u16 requested;
u16 completed;
enum hfi1_opfn_codes curr;
/* serialize opfn function calls */
spinlock_t lock;
struct work_struct opfn_work;
};
/* WR opcode for OPFN */
#define IB_WR_OPFN IB_WR_RESERVED3
void opfn_send_conn_request(struct work_struct *work);
void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
struct ib_atomic_eth *ateth);
void opfn_conn_reply(struct rvt_qp *qp, u64 data);
void opfn_conn_error(struct rvt_qp *qp);
void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
int opfn_init(void);
void opfn_exit(void);
#endif /* _HFI1_OPFN_H */
......@@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc)
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
uint i, n = 0, max_idx = 0;
u8 max_starved_cnt = 0;
uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
......@@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc)
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
if (n) {
priv = qps[top_idx]->priv;
top_idx = iowait_priority_update_top(wait,
&priv->s_iowait,
n, top_idx);
}
/* refcount held until actual wake up */
qps[n++] = qp;
}
......@@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc)
}
write_sequnlock_irqrestore(&sc->waitlock, flags);
/* Wake up the most starved one first */
/* Wake up the top-priority one first */
if (n)
hfi1_qp_wakeup(qps[max_idx],
hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
if (i != max_idx)
if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
......
......@@ -132,6 +132,18 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.qpt_support = BIT(IB_QPT_RC),
},
[IB_WR_OPFN] = {
.length = sizeof(struct ib_atomic_wr),
.qpt_support = BIT(IB_QPT_RC),
.flags = RVT_OPERATION_USE_RESERVE,
},
[IB_WR_TID_RDMA_WRITE] = {
.length = sizeof(struct ib_rdma_wr),
.qpt_support = BIT(IB_QPT_RC),
.flags = RVT_OPERATION_IGN_RNR_CNT,
},
};
static void flush_list_head(struct list_head *l)
......@@ -285,6 +297,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
opfn_qp_init(qp, attr, attr_mask);
}
/**
......@@ -311,6 +325,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
hfi1_setup_tid_rdma_wqe(qp, wqe);
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
......@@ -422,6 +437,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
ret = hfi1_schedule_tid_send(qp);
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
}
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
......@@ -441,8 +461,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
struct hfi1_qp_priv *priv = qp->priv;
if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
/*
* If we are sending a first-leg packet from the second leg,
* we need to clear the busy flag from priv->s_flags to
* avoid a race condition when the qp wakes up before
* the call to hfi1_verbs_send() returns to the second
* leg. In that case, the second leg will terminate without
* being re-scheduled, resulting in failure to send TID RDMA
* WRITE DATA and TID RDMA ACK packets.
*/
if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
RVT_S_BUSY);
iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
}
} else {
priv->s_flags &= ~RVT_S_BUSY;
}
}
static int iowait_sleep(
......@@ -479,6 +518,7 @@ static int iowait_sleep(
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
priv->s_iowait.lock = &sde->waitlock;
......@@ -528,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait)
spin_unlock_irqrestore(&qp->s_lock, flags);
}
static void hfi1_init_priority(struct iowait *w)
{
struct rvt_qp *qp = iowait_to_qp(w);
struct hfi1_qp_priv *priv = qp->priv;
if (qp->s_flags & RVT_S_ACK_PENDING)
w->priority++;
if (priv->s_flags & RVT_S_ACK_PENDING)
w->priority++;
}
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
......@@ -685,10 +736,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
&priv->s_iowait,
1,
_hfi1_do_send,
NULL,
_hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
iowait_sdma_drained);
iowait_sdma_drained,
hfi1_init_priority);
return priv;
}
......@@ -696,6 +748,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
......@@ -729,6 +782,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
......@@ -736,12 +790,16 @@ void stop_send_queue(struct rvt_qp *qp)
struct hfi1_qp_priv *priv = qp->priv;
iowait_cancel_work(&priv->s_iowait);
if (cancel_work_sync(&priv->tid_rdma.trigger_work))
rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
hfi1_del_tid_reap_timer(qp);
hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
......@@ -749,8 +807,13 @@ void quiesce_qp(struct rvt_qp *qp)
void notify_qp_reset(struct rvt_qp *qp)
{
hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
/* Clear any OPFN state */
if (qp->ibqp.qp_type == IB_QPT_RC)
opfn_conn_error(qp);
}
/*
......@@ -832,7 +895,8 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
!(qp->s_flags & RVT_S_BUSY)) {
!(qp->s_flags & RVT_S_BUSY) &&
!(priv->s_flags & RVT_S_BUSY)) {
qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
......@@ -841,7 +905,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
if (!(qp->s_flags & RVT_S_BUSY)) {
if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
......
......@@ -63,11 +63,17 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
* HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
* HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
* HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
#define HFI1_S_WAIT_TID_SPACE 0x10000000
#define HFI1_S_WAIT_TID_RESP 0x08000000
#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
......@@ -76,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
......
This diff is collapsed.
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#ifndef HFI1_RC_H
#define HFI1_RC_H
/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
{
unsigned int next;
next = n + 1;
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
qp->s_tail_ack_queue = next;
qp->s_acked_ack_queue = next;
qp->s_ack_state = OP(ACKNOWLEDGE);
}
static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
struct rvt_qp *qp)
{
if (list_empty(&qp->rspwait)) {
qp->r_flags |= RVT_R_RSP_NAK;
rvt_get_qp(qp);
list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
}
}
static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 psn, u32 pmtu)
{
u32 len;
len = delta_psn(psn, wqe->psn) * pmtu;
return rvt_restart_sge(ss, wqe, len);
}
struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
u8 *prev_ack, bool *scheduled);
int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
struct hfi1_ctxtdata *rcd);
struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
struct hfi1_ibport *ibp);
#endif /* HFI1_RC_H */
......@@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
......@@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth2, int middle,
u32 bth0, u32 bth1, u32 bth2,
int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
......@@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth2, int middle,
u32 bth0, u32 bth1, u32 bth2,
int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
......@@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth2, int middle,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
/* We support only two types - 9B and 16B for now */
......@@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = {
};
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
u32 bth0, u32 bth2, int middle,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
......@@ -446,18 +445,21 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
priv->s_ahg->ahgidx = 0;
/* Make the appropriate header */
hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
ps);
}
/* when sending, force a reschedule every one of these periods */
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
* schedule_send_yield - test for a yield required for QP send engine
* hfi1_schedule_send_yield - test for a yield required for QP
* send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
* @ps: a pointer to a structure with commonly lookup values for
* the the send engine progress
* @tid - true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
......@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
static bool schedule_send_yield(struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
bool tid)
{
ps->pkts_sent = true;
......@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
qp->s_flags &= ~RVT_S_BUSY;
hfi1_schedule_send(qp);
if (!tid) {
qp->s_flags &= ~RVT_S_BUSY;
hfi1_schedule_send(qp);
} else {
struct hfi1_qp_priv *priv = qp->priv;
if (priv->s_flags &
HFI1_S_TID_BUSY_SET) {
qp->s_flags &= ~RVT_S_BUSY;
priv->s_flags &=
~(HFI1_S_TID_BUSY_SET |
RVT_S_BUSY);
} else {
priv->s_flags &= ~RVT_S_BUSY;
}
hfi1_schedule_tid_send(qp);
}
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
......@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
if (priv->s_flags & HFI1_S_TID_BUSY_SET)
qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
......@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
if (schedule_send_yield(qp, &ps))
if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
......
......@@ -1747,10 +1747,9 @@ static inline u16 sdma_gethead(struct sdma_engine *sde)
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
struct iowait *wait, *nw;
struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
uint i, n = 0, seq, max_idx = 0;
u8 max_starved_cnt = 0;
uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
......@@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
continue;
if (n == ARRAY_SIZE(waits))
break;
iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
/* Find the most starved wait memeber */
iowait_starve_find_max(wait, &max_starved_cnt,
n, &max_idx);
/* Find the top-priority wait memeber */
if (n) {
twait = waits[tidx];
tidx =
iowait_priority_update_top(wait,
twait,
n,
tidx);
}
list_del_init(&wait->list);
waits[n++] = wait;
}
......@@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
}
} while (read_seqretry(&sde->waitlock, seq));
/* Schedule the most starved one first */
/* Schedule the top-priority entry first */
if (n)
waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
if (i != max_idx)
if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
......
......@@ -91,6 +91,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
......
This diff is collapsed.
This diff is collapsed.
......@@ -46,6 +46,7 @@
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
#include "exp_rcv.h"
static u8 __get_ib_hdr_len(struct ib_header *hdr)
{
......@@ -128,6 +129,15 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
#define IETH_PRN "ieth rkey:0x%.8x"
#define ATOMICACKETH_PRN "origdata:%llx"
#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
#define TID_READ_RSP_PRN "verbs_qp 0x%x"
#define TID_WRITE_REQ_PRN "original_qp 0x%x"
#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
#define TID_RESYNC_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
......@@ -322,6 +332,99 @@ const char *parse_everbs_hdrs(
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
case OP(TID_RDMA, WRITE_REQ):
trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
TID_WRITE_REQ_PRN,
le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
be32_to_cpu(eh->tid_rdma.w_req.reth.length),
be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
break;
case OP(TID_RDMA, WRITE_RESP):
trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
TID_WRITE_RSP_PRN,
le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
parse_syndrome(/* aeth */
be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
>> 24),
(be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
IB_MSN_MASK),
be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
break;
case OP(TID_RDMA, WRITE_DATA_LAST):
case OP(TID_RDMA, WRITE_DATA):
trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
break;
case OP(TID_RDMA, READ_REQ):
trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
TID_READ_REQ_PRN,
le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
be32_to_cpu(eh->tid_rdma.r_req.reth.length),
be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
break;
case OP(TID_RDMA, READ_RESP):
trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
TID_READ_RSP_PRN,
le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
parse_syndrome(/* aeth */
be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
>> 24),
(be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
IB_MSN_MASK),
be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
break;
case OP(TID_RDMA, ACK):
trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
TID_ACK_PRN,
le32_to_cpu(eh->tid_rdma.ack.kdeth0),
le32_to_cpu(eh->tid_rdma.ack.kdeth1),
be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
parse_syndrome(/* aeth */
be32_to_cpu(eh->tid_rdma.ack.aeth)
>> 24),
(be32_to_cpu(eh->tid_rdma.ack.aeth) &
IB_MSN_MASK),
be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
break;
case OP(TID_RDMA, RESYNC):
trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
le32_to_cpu(eh->tid_rdma.resync.kdeth0),
le32_to_cpu(eh->tid_rdma.resync.kdeth1),
be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
......@@ -394,6 +497,21 @@ const char *print_u32_array(
return ret;
}
u8 hfi1_trace_get_tid_ctrl(u32 ent)
{
return EXP_TID_GET(ent, CTRL);
}
u16 hfi1_trace_get_tid_len(u32 ent)
{
return EXP_TID_GET(ent, LEN);
}
u16 hfi1_trace_get_tid_idx(u32 ent)
{
return EXP_TID_GET(ent, IDX);
}
__hfi1_trace_fn(AFFINITY);
__hfi1_trace_fn(PKT);
__hfi1_trace_fn(PROC);
......
......@@ -63,3 +63,4 @@ __print_symbolic(etype, \
#include "trace_tx.h"
#include "trace_mmu.h"
#include "trace_iowait.h"
#include "trace_tid.h"
......@@ -79,6 +79,14 @@ __print_symbolic(opcode, \
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
ib_opcode_name(TID_RDMA_WRITE_REQ), \
ib_opcode_name(TID_RDMA_WRITE_RESP), \
ib_opcode_name(TID_RDMA_WRITE_DATA), \
ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \
ib_opcode_name(TID_RDMA_READ_REQ), \
ib_opcode_name(TID_RDMA_READ_RESP), \
ib_opcode_name(TID_RDMA_RESYNC), \
ib_opcode_name(TID_RDMA_ACK), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
......
......@@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
TP_ARGS(qp, psn)
);
DEFINE_EVENT(/* event */
hfi1_rc_template, hfi1_rc_completion,
TP_PROTO(struct rvt_qp *qp, u32 psn),
TP_ARGS(qp, psn)
);
DECLARE_EVENT_CLASS(/* rc_ack */
hfi1_rc_ack_template,
TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
struct rvt_swqe *wqe),
TP_ARGS(qp, aeth, psn, wqe),
TP_STRUCT__entry(/* entry */
DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
__field(u32, qpn)
__field(u32, aeth)
__field(u32, psn)
__field(u8, opcode)
__field(u32, spsn)
__field(u32, lpsn)
),
TP_fast_assign(/* assign */
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
__entry->qpn = qp->ibqp.qp_num;
__entry->aeth = aeth;
__entry->psn = psn;
__entry->opcode = wqe->wr.opcode;
__entry->spsn = wqe->psn;
__entry->lpsn = wqe->lpsn;
),
TP_printk(/* print */
"[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
__get_str(dev),
__entry->qpn,
__entry->aeth,
__entry->psn,
__entry->opcode,
__entry->spsn,
__entry->lpsn
)
);
DEFINE_EVENT(/* do_rc_ack */
hfi1_rc_ack_template, hfi1_rc_ack_do,
TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
struct rvt_swqe *wqe),
TP_ARGS(qp, aeth, psn, wqe)
);
#endif /* __HFI1_TRACE_RC_H */
#undef TRACE_INCLUDE_PATH
......
/*
* Copyright(c) 2015 - 2017 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
......@@ -128,111 +128,6 @@ TRACE_EVENT(hfi1_receive_interrupt,
)
);
DECLARE_EVENT_CLASS(
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
u32 npages, unsigned long va, unsigned long pa,
dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
TP_STRUCT__entry(
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(u32, rarr)
__field(u32, npages)
__field(unsigned long, va)
__field(unsigned long, pa)
__field(dma_addr_t, dma)
),
TP_fast_assign(
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->va = va;
__entry->pa = pa;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->pa,
__entry->va,
__entry->dma
)
);
DEFINE_EVENT(
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
DEFINE_EVENT(
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
TRACE_EVENT(
hfi1_put_tid,
TP_PROTO(struct hfi1_devdata *dd,
u32 index, u32 type, unsigned long pa, u16 order),
TP_ARGS(dd, index, type, pa, order),
TP_STRUCT__entry(
DD_DEV_ENTRY(dd)
__field(unsigned long, pa);
__field(u32, index);
__field(u32, type);
__field(u16, order);
),
TP_fast_assign(
DD_DEV_ASSIGN(dd);
__entry->pa = pa;
__entry->index = index;
__entry->type = type;
__entry->order = order;
),
TP_printk("[%s] type %s pa %lx index %u order %u",
__get_str(dev),
show_tidtype(__entry->type),
__entry->pa,
__entry->index,
__entry->order
)
);
TRACE_EVENT(hfi1_exp_tid_inval,
TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
u32 npages, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
TP_STRUCT__entry(
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(unsigned long, va)
__field(u32, rarr)
__field(u32, npages)
__field(dma_addr_t, dma)
),
TP_fast_assign(
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->va = va;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->va,
__entry->dma
)
);
TRACE_EVENT(hfi1_mmu_invalidate,
TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
unsigned long start, unsigned long end),
......
This diff is collapsed.
......@@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
__field(u32, qpn)
__field(u32, flags)
__field(u32, s_flags)
__field(u32, ps_flags)
__field(unsigned long, iow_flags)
),
TP_fast_assign(
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
__entry->flags = flags;
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
__entry->ps_flags =
((struct hfi1_qp_priv *)qp->priv)->s_flags;
__entry->iow_flags =
((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
),
TP_printk(
"[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
"[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
__get_str(dev),
__entry->qpn,
__entry->flags,
__entry->s_flags
__entry->s_flags,
__entry->ps_flags,
__entry->iow_flags
)
);
......@@ -838,6 +846,12 @@ DEFINE_EVENT(
TP_ARGS(qp, flag)
);
DEFINE_EVENT(/* event */
hfi1_do_send_template, hfi1_rc_do_tid_send,
TP_PROTO(struct rvt_qp *qp, bool flag),
TP_ARGS(qp, flag)
);
DEFINE_EVENT(
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
......
......@@ -271,7 +271,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
ps->s_txreq->ss = &qp->s_sge;
ps->s_txreq->s_cur_size = len;
hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
mask_psn(qp->s_psn++), middle, ps);
qp->remote_qpn, mask_psn(qp->s_psn++),
middle, ps);
return 1;
done_free_tx:
......
......@@ -48,7 +48,6 @@
*/
#include "hfi.h"
#include "exp_rcv.h"
struct tid_pageset {
......
......@@ -144,8 +144,10 @@ static int defer_packet_queue(
*/
xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
write_seqlock(&sde->waitlock);
if (list_empty(&pq->busy.list))
if (list_empty(&pq->busy.list)) {
iowait_get_priority(&pq->busy);
iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
}
write_sequnlock(&sde->waitlock);
return -EBUSY;
eagain:
......@@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
pq->mm = fd->mm;
iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
activate_packet_queue, NULL);
activate_packet_queue, NULL, NULL);
pq->reqidx = 0;
pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
......@@ -1126,7 +1128,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
0xffffffull),
psn = val & mask;
if (expct)
psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
else
psn = psn + frags;
return psn & mask;
......
This diff is collapsed.
......@@ -72,6 +72,7 @@ struct hfi1_packet;
#include "iowait.h"
#include "tid_rdma.h"
#include "opfn.h"
#define HFI1_MAX_RDMA_ATOMIC 16
......@@ -158,10 +159,68 @@ struct hfi1_qp_priv {
struct sdma_engine *s_sde; /* current sde */
struct send_context *s_sendcontext; /* current sendcontext */
struct hfi1_ctxtdata *rcd; /* QP's receive context */
struct page **pages; /* for TID page scan */
u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
struct timer_list s_tid_timer; /* for timing tid wait */
struct timer_list s_tid_retry_timer; /* for timing tid ack */
struct list_head tid_wait; /* for queueing tid space */
struct hfi1_opfn_data opfn;
struct tid_flow_state flow_state;
struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
atomic_t n_requests; /* # of TID RDMA requests in the */
/* queue */
atomic_t n_tid_requests; /* # of sent TID RDMA requests */
unsigned long tid_timer_timeout_jiffies;
unsigned long tid_retry_timeout_jiffies;
/* variables for the TID RDMA SE state machine */
u8 s_state;
u8 s_retry;
u8 rnr_nak_state; /* RNR NAK state */
u8 s_nak_state;
u32 s_nak_psn;
u32 s_flags;
u32 s_tid_cur;
u32 s_tid_head;
u32 s_tid_tail;
u32 r_tid_head; /* Most recently added TID RDMA request */
u32 r_tid_tail; /* the last completed TID RDMA request */
u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */
u32 r_tid_alloc; /* Request for which we are allocating resources */
u32 pending_tid_w_segs; /* Num of pending tid write segments */
u32 pending_tid_w_resp; /* Num of pending tid write responses */
u32 alloc_w_segs; /* Number of segments for which write */
/* resources have been allocated for this QP */
/* For TID RDMA READ */
u32 tid_r_reqs; /* Num of tid reads requested */
u32 tid_r_comp; /* Num of tid reads completed */
u32 pending_tid_r_segs; /* Num of pending tid read segments */
u16 pkts_ps; /* packets per segment */
u8 timeout_shift; /* account for number of packets per segment */
u32 r_next_psn_kdeth;
u32 r_next_psn_kdeth_save;
u32 s_resync_psn;
u8 sync_pt; /* Set when QP reaches sync point */
u8 resync;
};
#define HFI1_QP_WQE_INVALID ((u32)-1)
struct hfi1_swqe_priv {
struct tid_rdma_request tid_req;
struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
};
struct hfi1_ack_priv {
struct rvt_sge_state ss; /* used for TID WRITE RESP */
struct tid_rdma_request tid_req;
};
/*
......@@ -225,6 +284,7 @@ struct hfi1_ibdev {
struct kmem_cache *verbs_txreq_cache;
u64 n_txwait;
u64 n_kmem_wait;
u64 n_tidwait;
/* protect iowait lists */
seqlock_t iowait_lock ____cacheline_aligned_in_smp;
......@@ -312,6 +372,31 @@ static inline u32 delta_psn(u32 a, u32 b)
return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}
static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
{
return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
}
static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
{
return &((struct hfi1_ack_priv *)e->priv)->tid_req;
}
/*
* Look through all the active flows for a TID RDMA request and find
* the one (if it exists) that contains the specified PSN.
*/
static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
{
return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
(psn & HFI1_KDETH_BTH_SEQ_MASK));
}
static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
{
return __full_flow_psn(&flow->flow_state, psn);
}
struct verbs_txreq;
void hfi1_put_txreq(struct verbs_txreq *tx);
......@@ -356,9 +441,12 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
const struct ib_global_route *grh, u32 hwords, u32 nwords);
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
u32 bth0, u32 bth2, int middle,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
bool tid);
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);
......@@ -377,6 +465,10 @@ int hfi1_register_ib_device(struct hfi1_devdata *);
void hfi1_unregister_ib_device(struct hfi1_devdata *);
void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
void hfi1_ib_rcv(struct hfi1_packet *packet);
void hfi1_16B_rcv(struct hfi1_packet *packet);
......@@ -394,6 +486,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
}
void hfi1_wait_kmem(struct rvt_qp *qp);
static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
struct rvt_swqe *wqe,
enum ib_wc_status status)
{
trdma_clean_swqe(qp, wqe);
rvt_send_complete(qp, wqe, status);
}
extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
extern const u8 hdr_len_by_opcode[];
......
......@@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
tx->txreq.num_desc = 0;
/* Set the header type */
tx->phdr.hdr.hdr_type = priv->hdr_type;
tx->txreq.flags = 0;
return tx;
}
......
......@@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
}
vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
if (list_empty(&vnic_sdma->wait.list))
if (list_empty(&vnic_sdma->wait.list)) {
iowait_get_priority(wait->iow);
iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
}
write_sequnlock(&sde->waitlock);
return -EBUSY;
}
......@@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
hfi1_vnic_sdma_sleep,
hfi1_vnic_sdma_wakeup, NULL);
hfi1_vnic_sdma_wakeup, NULL, NULL);
vnic_sdma->sde = &vinfo->dd->per_sdma[i];
vnic_sdma->dd = vinfo->dd;
vnic_sdma->vinfo = vinfo;
......
......@@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 len;
len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
ss->sge = wqe->sg_list[0];
ss->sg_list = wqe->sg_list + 1;
ss->num_sge = wqe->wr.num_sge;
ss->total_len = wqe->length;
rvt_skip_sge(ss, len, false);
return wqe->length - len;
return rvt_restart_sge(ss, wqe, len);
}
/**
......
......@@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qp->s_mig_state = IB_MIG_MIGRATED;
qp->r_head_ack_queue = 0;
qp->s_tail_ack_queue = 0;
qp->s_acked_ack_queue = 0;
qp->s_num_rd_atomic = 0;
if (qp->r_rq.wq) {
qp->r_rq.wq->head = 0;
......@@ -1642,11 +1643,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
kref_put(&qp->ip->ref, rvt_release_mmap_info);
else
vfree(qp->r_rq.wq);
vfree(qp->s_wq);
rdi->driver_f.qp_priv_free(rdi, qp);
kfree(qp->s_ack_queue);
rdma_destroy_ah_attr(&qp->remote_ah_attr);
rdma_destroy_ah_attr(&qp->alt_ah_attr);
vfree(qp->s_wq);
kfree(qp);
return 0;
}
......@@ -2393,11 +2394,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth)
}
/*
* rvt_add_retry_timer - add/start a retry timer
* rvt_add_retry_timer_ext - add/start a retry timer
* @qp - the QP
* @shift - timeout shift to wait for multiple packets
* add a retry timer on the QP
*/
void rvt_add_retry_timer(struct rvt_qp *qp)
void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
{
struct ib_qp *ibqp = &qp->ibqp;
struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
......@@ -2405,11 +2407,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp)
lockdep_assert_held(&qp->s_lock);
qp->s_flags |= RVT_S_TIMER;
/* 4.096 usec. * (1 << qp->timeout) */
qp->s_timer.expires = jiffies + qp->timeout_jiffies +
rdi->busy_jiffies;
qp->s_timer.expires = jiffies + rdi->busy_jiffies +
(qp->timeout_jiffies << shift);
add_timer(&qp->s_timer);
}
EXPORT_SYMBOL(rvt_add_retry_timer);
EXPORT_SYMBOL(rvt_add_retry_timer_ext);
/**
* rvt_add_rnr_timer - add/start an rnr timer
......
......@@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
}
}
EXPORT_SYMBOL(rvt_get_credit);
/* rvt_restart_sge - rewind the sge state for a wqe */
u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
{
ss->sge = wqe->sg_list[0];
ss->sg_list = wqe->sg_list + 1;
ss->num_sge = wqe->wr.num_sge;
ss->total_len = wqe->length;
rvt_skip_sge(ss, len, false);
return wqe->length - len;
}
EXPORT_SYMBOL(rvt_restart_sge);
/*
* Copyright(c) 2016 Intel Corporation.
* Copyright(c) 2016 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
......@@ -100,6 +100,8 @@ struct ib_atomic_eth {
__be64 compare_data; /* potentially unaligned */
} __packed;
#include <rdma/tid_rdma_defs.h>
union ib_ehdrs {
struct {
__be32 deth[2];
......@@ -117,6 +119,16 @@ union ib_ehdrs {
__be32 aeth;
__be32 ieth;
struct ib_atomic_eth atomic_eth;
/* TID RDMA headers */
union {
struct tid_rdma_read_req r_req;
struct tid_rdma_read_resp r_rsp;
struct tid_rdma_write_req w_req;
struct tid_rdma_write_resp w_rsp;
struct tid_rdma_write_data w_data;
struct tid_rdma_resync resync;
struct tid_rdma_ack ack;
} tid_rdma;
} __packed;
struct ib_other_headers {
......
......@@ -182,6 +182,7 @@ struct rvt_driver_params {
u32 max_mad_size;
u8 qos_shift;
u8 max_rdma_atomic;
u8 extra_rdma_atomic;
u8 reserved_operations;
};
......@@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
*/
static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
{
return rdi->dparms.max_rdma_atomic + 1;
return rdi->dparms.max_rdma_atomic +
rdi->dparms.extra_rdma_atomic + 1;
}
static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi)
{
return rdi->dparms.max_rdma_atomic +
rdi->dparms.extra_rdma_atomic;
}
/*
......@@ -566,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
/**
* rvt_mod_retry_timer - mod a retry timer
* @qp - the QP
* @shift - timeout shift to wait for multiple packets
* Modify a potentially already running retry timer
*/
static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
{
struct ib_qp *ibqp = &qp->ibqp;
struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
......@@ -576,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
lockdep_assert_held(&qp->s_lock);
qp->s_flags |= RVT_S_TIMER;
/* 4.096 usec. * (1 << qp->timeout) */
mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
rdi->busy_jiffies);
mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
(qp->timeout_jiffies << shift));
}
static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
{
return rvt_mod_retry_timer_ext(qp, 0);
}
struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
......
......@@ -174,6 +174,7 @@ struct rvt_swqe {
u32 lpsn; /* last packet sequence number */
u32 ssn; /* send sequence number */
u32 length; /* total length of data in sg_list */
void *priv; /* driver dependent field */
struct rvt_sge sg_list[0];
};
......@@ -235,6 +236,7 @@ struct rvt_ack_entry {
u32 lpsn;
u8 opcode;
u8 sent;
void *priv;
};
#define RC_QP_SCALING_INTERVAL 5
......@@ -244,6 +246,7 @@ struct rvt_ack_entry {
#define RVT_OPERATION_ATOMIC_SGE 0x00000004
#define RVT_OPERATION_LOCAL 0x00000008
#define RVT_OPERATION_USE_RESERVE 0x00000010
#define RVT_OPERATION_IGN_RNR_CNT 0x00000020
#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
......@@ -373,6 +376,7 @@ struct rvt_qp {
u8 s_rnr_retry; /* requester RNR retry counter */
u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
u8 s_tail_ack_queue; /* index into s_ack_queue[] */
u8 s_acked_ack_queue; /* index into s_ack_queue[] */
struct rvt_sge_state s_ack_rdma_sge;
struct timer_list s_timer;
......@@ -628,6 +632,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp);
*/
void rvt_get_credit(struct rvt_qp *qp, u32 aeth);
/**
* rvt_restart_sge - rewind the sge state for a wqe
* @ss: the sge state pointer
* @wqe: the wqe to rewind
* @len: the data length from the start of the wqe in bytes
*
* Returns the remaining data length.
*/
u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
/**
* @qp - the qp pair
* @len - the length
......@@ -676,7 +690,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth);
void rvt_del_timers_sync(struct rvt_qp *qp);
void rvt_stop_rc_timers(struct rvt_qp *qp);
void rvt_add_retry_timer(struct rvt_qp *qp);
void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift);
static inline void rvt_add_retry_timer(struct rvt_qp *qp)
{
rvt_add_retry_timer_ext(qp, 0);
}
void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
void *data, u32 length,
......
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#ifndef TID_RDMA_DEFS_H
#define TID_RDMA_DEFS_H
#include <rdma/ib_pack.h>
struct tid_rdma_read_req {
__le32 kdeth0;
__le32 kdeth1;
struct ib_reth reth;
__be32 tid_flow_psn;
__be32 tid_flow_qp;
__be32 verbs_qp;
};
struct tid_rdma_read_resp {
__le32 kdeth0;
__le32 kdeth1;
__be32 aeth;
__be32 reserved[4];
__be32 verbs_psn;
__be32 verbs_qp;
};
struct tid_rdma_write_req {
__le32 kdeth0;
__le32 kdeth1;
struct ib_reth reth;
__be32 reserved[2];
__be32 verbs_qp;
};
struct tid_rdma_write_resp {
__le32 kdeth0;
__le32 kdeth1;
__be32 aeth;
__be32 reserved[3];
__be32 tid_flow_psn;
__be32 tid_flow_qp;
__be32 verbs_qp;
};
struct tid_rdma_write_data {
__le32 kdeth0;
__le32 kdeth1;
__be32 reserved[6];
__be32 verbs_qp;
};
struct tid_rdma_resync {
__le32 kdeth0;
__le32 kdeth1;
__be32 reserved[6];
__be32 verbs_qp;
};
struct tid_rdma_ack {
__le32 kdeth0;
__le32 kdeth1;
__be32 aeth;
__be32 reserved[2];
__be32 tid_flow_psn;
__be32 verbs_psn;
__be32 tid_flow_qp;
__be32 verbs_qp;
};
/*
* TID RDMA Opcodes
*/
#define IB_OPCODE_TID_RDMA 0xe0
enum {
IB_OPCODE_WRITE_REQ = 0x0,
IB_OPCODE_WRITE_RESP = 0x1,
IB_OPCODE_WRITE_DATA = 0x2,
IB_OPCODE_WRITE_DATA_LAST = 0x3,
IB_OPCODE_READ_REQ = 0x4,
IB_OPCODE_READ_RESP = 0x5,
IB_OPCODE_RESYNC = 0x6,
IB_OPCODE_ACK = 0x7,
IB_OPCODE(TID_RDMA, WRITE_REQ),
IB_OPCODE(TID_RDMA, WRITE_RESP),
IB_OPCODE(TID_RDMA, WRITE_DATA),
IB_OPCODE(TID_RDMA, WRITE_DATA_LAST),
IB_OPCODE(TID_RDMA, READ_REQ),
IB_OPCODE(TID_RDMA, READ_RESP),
IB_OPCODE(TID_RDMA, RESYNC),
IB_OPCODE(TID_RDMA, ACK),
};
#define TID_OP(x) IB_OPCODE_TID_RDMA_##x
/*
* Define TID RDMA specific WR opcodes. The ib_wr_opcode
* enum already provides some reserved values for use by
* low level drivers. Two of those are used but renamed
* to be more descriptive.
*/
#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1
#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2
#endif /* TID_RDMA_DEFS_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment