Commit 7b160cfd authored by Trond Myklebust's avatar Trond Myklebust

Merge tag 'nfs-rdma-3.16' of git://git.linux-nfs.org/projects/anna/nfs-rdma into linux-next

Pull NFS client side changes for RDMA from Anna Schumaker

These patches are mostly cleanups and bugfixes for using RDMA as an
over-the-wire transport.

Highlights include:

- Remove obsolete memory registration modes.
- Removing BUG_ON()s to keep client's running.
- Fix deadlocks, NULL-pointer dereferences, and memory leaks.

* tag 'nfs-rdma-3.16' of git://git.linux-nfs.org/projects/anna/nfs-rdma: (24 commits)
  xprtrdma: Disconnect on registration failure
  xprtrdma: Remove BUG_ON() call sites
  xprtrdma: Avoid deadlock when credit window is reset
  SUNRPC: Move congestion window constants to header file
  xprtrdma: Reset connection timeout after successful reconnect
  xprtrdma: Use macros for reconnection timeout constants
  xprtrdma: Allocate missing pagelist
  xprtrdma: Remove Tavor MTU setting
  xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
  xprtrdma: Reduce the number of hardway buffer allocations
  xprtrdma: Limit work done by completion handler
  xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
  xprtrmda: Reduce lock contention in completion handlers
  xprtrdma: Split the completion queue
  xprtrdma: Make rpcrdma_ep_destroy() return void
  xprtrdma: Simplify rpcrdma_deregister_external() synopsis
  xprtrdma: mount reports "Invalid mount option" if memreg mode not supported
  xprtrdma: Fall back to MTHCAFMR when FRMR is not supported
  xprtrdma: Remove REGISTER memory registration mode
  xprtrdma: Remove MEMWINDOWS registration modes
  ...
parents b5968725 c93c6223
......@@ -24,6 +24,12 @@
#define RPC_MAX_SLOT_TABLE_LIMIT (65536U)
#define RPC_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE_LIMIT
#define RPC_CWNDSHIFT (8U)
#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT)
#define RPC_INITCWND RPC_CWNDSCALE
#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT)
#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
/*
* This describes a timeout strategy
*/
......
......@@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
/*
* The transport code maintains an estimate on the maximum number of out-
* standing RPC requests, using a smoothed version of the congestion
* avoidance implemented in 44BSD. This is basically the Van Jacobson
* congestion algorithm: If a retransmit occurs, the congestion window is
* halved; otherwise, it is incremented by 1/cwnd when
*
* - a reply is received and
* - a full number of requests are outstanding and
* - the congestion window hasn't been updated recently.
*/
#define RPC_CWNDSHIFT (8U)
#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT)
#define RPC_INITCWND RPC_CWNDSCALE
#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT)
#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
/**
* xprt_register_transport - register a transport implementation
* @transport: transport to register
......@@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
* @task: recently completed RPC request used to adjust window
* @result: result code of completed RPC request
*
* We use a time-smoothed congestion estimator to avoid heavy oscillation.
* The transport code maintains an estimate on the maximum number of out-
* standing RPC requests, using a smoothed version of the congestion
* avoidance implemented in 44BSD. This is basically the Van Jacobson
* congestion algorithm: If a retransmit occurs, the congestion window is
* halved; otherwise, it is incremented by 1/cwnd when
*
* - a reply is received and
* - a full number of requests are outstanding and
* - the congestion window hasn't been updated recently.
*/
void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)
{
......
......@@ -78,8 +78,7 @@ static const char transfertypes[][12] = {
* elements. Segments are then coalesced when registered, if possible
* within the selected memreg mode.
*
* Note, this routine is never called if the connection's memory
* registration strategy is 0 (bounce buffers).
* Returns positive number of segments converted, or a negative errno.
*/
static int
......@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
while (len && n < nsegs) {
if (!ppages[p]) {
/* alloc the pagelist for receiving buffer */
ppages[p] = alloc_page(GFP_ATOMIC);
if (!ppages[p])
return -ENOMEM;
}
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
BUG_ON(seg[n].mr_len > PAGE_SIZE);
if (seg[n].mr_len > PAGE_SIZE)
return -EIO;
len -= seg[n].mr_len;
++n;
++p;
......@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
/* Message overflows the seg array */
if (len && n == nsegs)
return 0;
return -EIO;
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
......@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
if (n == nsegs)
/* Tail remains, but we're out of segments */
return 0;
return -EIO;
seg[n].mr_page = NULL;
seg[n].mr_offset = xdrbuf->tail[0].iov_base;
seg[n].mr_len = xdrbuf->tail[0].iov_len;
......@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
* Returns positive RPC/RDMA header size, or negative errno.
*/
static unsigned int
static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
int nsegs, nchunks = 0;
int n, nsegs, nchunks = 0;
unsigned int pos;
struct rpcrdma_mr_seg *seg = req->rl_segments;
struct rpcrdma_read_chunk *cur_rchunk = NULL;
......@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
pos = target->head[0].iov_len;
nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
if (nsegs == 0)
return 0;
if (nsegs < 0)
return nsegs;
do {
/* bind/register the memory, then build chunk from result. */
int n = rpcrdma_register_external(seg, nsegs,
n = rpcrdma_register_external(seg, nsegs,
cur_wchunk != NULL, r_xprt);
if (n <= 0)
goto out;
......@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
/* success. all failures return above */
req->rl_nchunks = nchunks;
BUG_ON(nchunks == 0);
BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
&& (nchunks > 3));
/*
* finish off header. If write, marshal discrim and nchunks.
*/
......@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
out:
for (pos = 0; nchunks--;)
pos += rpcrdma_deregister_external(
&req->rl_segments[pos], r_xprt, NULL);
return 0;
&req->rl_segments[pos], r_xprt);
return n;
}
/*
......@@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
* [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
* [2] -- optional padding.
* [3] -- if padded, header only in [1] and data here.
*
* Returns zero on success, otherwise a negative errno.
*/
int
......@@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
size_t hdrlen, rpclen, padlen;
size_t rpclen, padlen;
ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
......@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
wtype = rpcrdma_noch;
BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
(rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
/* forced to "pure inline"? */
dprintk("RPC: %s: too much data (%d/%d) for inline\n",
__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
return -1;
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
hdrlen = 28; /*sizeof *headerp;*/
......@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
BUG_ON(wtype != rpcrdma_noch);
if (wtype != rpcrdma_noch) {
dprintk("RPC: %s: invalid chunk list\n",
__func__);
return -EIO;
}
} else {
headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
......@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
if (wtype == rpcrdma_noch &&
r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
if (wtype == rpcrdma_noch)
wtype = rpcrdma_replych;
}
}
......@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
hdrlen = rpcrdma_create_chunks(rqst,
&rqst->rq_rcv_buf, headerp, wtype);
}
if (hdrlen == 0)
return -1;
if (hdrlen < 0)
return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
......@@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf = rqst->rq_rcv_buf;
}
/*
* This function is called when an async event is posted to
* the connection which changes the connection state. All it
* does at this point is mark the connection up/down, the rpc
* timers do the rest.
*/
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
rpcrdma_connect_worker(struct work_struct *work)
{
struct rpcrdma_ep *ep =
container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
struct rpc_xprt *xprt = ep->rep_xprt;
spin_lock_bh(&xprt->transport_lock);
......@@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
}
/*
* This function is called when memory window unbind which we are waiting
* for completes. Just use rr_func (zeroed by upcall) to signal completion.
* This function is called when an async event is posted to
* the connection which changes the connection state. All it
* does at this point is mark the connection up/down, the rpc
* timers do the rest.
*/
static void
rpcrdma_unbind_func(struct rpcrdma_rep *rep)
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
wake_up(&rep->rr_unbind);
schedule_delayed_work(&ep->rep_connect_worker, 0);
}
/*
......@@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpc_xprt *xprt = rep->rr_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
__be32 *iptr;
int i, rdmalen, status;
int rdmalen, status;
unsigned long cwnd;
/* Check status. If bad, signal disconnect and return rep to pool */
if (rep->rr_len == ~0U) {
......@@ -783,6 +785,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
/* from here on, the reply is no longer an orphan */
req->rl_reply = rep;
xprt->reestablish_timeout = 0;
/* check for expected message types */
/* The order of some of these tests is important. */
......@@ -857,26 +860,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
break;
}
/* If using mw bind, start the deregister process now. */
/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
case RPCRDMA_MEMWINDOWS:
for (i = 0; req->rl_nchunks-- > 1;)
i += rpcrdma_deregister_external(
&req->rl_segments[i], r_xprt, NULL);
/* Optionally wait (not here) for unbinds to complete */
rep->rr_func = rpcrdma_unbind_func;
(void) rpcrdma_deregister_external(&req->rl_segments[i],
r_xprt, rep);
break;
case RPCRDMA_MEMWINDOWS_ASYNC:
for (i = 0; req->rl_nchunks--;)
i += rpcrdma_deregister_external(&req->rl_segments[i],
r_xprt, NULL);
break;
default:
break;
}
cwnd = xprt->cwnd;
xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
......
......@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = {
#endif
#define RPCRDMA_BIND_TO (60U * HZ)
#define RPCRDMA_INIT_REEST_TO (5U * HZ)
#define RPCRDMA_MAX_REEST_TO (30U * HZ)
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
static void
......@@ -229,7 +234,6 @@ static void
xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc;
dprintk("RPC: %s: called\n", __func__);
......@@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
xprt_clear_connected(xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
if (rc)
dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
__func__, rc);
rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
......@@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
/* 60 second timeout, no retries */
xprt->timeout = &xprt_rdma_default_timeout;
xprt->bind_timeout = (60U * HZ);
xprt->reestablish_timeout = (5U * HZ);
xprt->idle_timeout = (5U * 60 * HZ);
xprt->bind_timeout = RPCRDMA_BIND_TO;
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
xprt->resvport = 0; /* privileged port not needed */
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
......@@ -391,7 +392,7 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_free_addresses(xprt);
rc = -EINVAL;
out3:
(void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
out2:
rpcrdma_ia_close(&new_xprt->rx_ia);
out1:
......@@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
schedule_delayed_work(&r_xprt->rdma_connect,
xprt->reestablish_timeout);
xprt->reestablish_timeout <<= 1;
if (xprt->reestablish_timeout > (30 * HZ))
xprt->reestablish_timeout = (30 * HZ);
else if (xprt->reestablish_timeout < (5 * HZ))
xprt->reestablish_timeout = (5 * HZ);
if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
} else {
schedule_delayed_work(&r_xprt->rdma_connect, 0);
if (!RPC_IS_ASYNC(task))
......@@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
}
}
static int
xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
/* == RPC_CWNDSCALE @ init, but *after* setup */
if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
dprintk("RPC: %s: cwndscale %lu\n", __func__,
r_xprt->rx_buf.rb_cwndscale);
BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
}
xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
return xprt_reserve_xprt_cong(xprt, task);
}
/*
* The RDMA allocate/free functions need the task structure as a place
* to hide the struct rpcrdma_req, which is necessary for the actual send/recv
......@@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
struct rpcrdma_req *req, *nreq;
req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
BUG_ON(NULL == req);
if (req == NULL)
return NULL;
if (size > req->rl_size) {
dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
......@@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
* If the allocation or registration fails, the RPC framework
* will (doggedly) retry.
*/
if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
RPCRDMA_BOUNCEBUFFERS) {
/* forced to "pure inline" */
dprintk("RPC: %s: too much data (%zd) for inline "
"(r/w max %d/%d)\n", __func__, size,
rpcx_to_rdmad(xprt).inline_rsize,
rpcx_to_rdmad(xprt).inline_wsize);
size = req->rl_size;
rpc_exit(task, -EIO); /* fail the operation */
rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
goto out;
}
if (task->tk_flags & RPC_TASK_SWAPPER)
nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
else
......@@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
req = nreq;
}
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
out:
req->rl_connect_cookie = 0; /* our reserved value */
return req->rl_xdr_buf;
......@@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer)
__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
/*
* Finish the deregistration. When using mw bind, this was
* begun in rpcrdma_reply_handler(). In all other modes, we
* do it here, in thread context. The process is considered
* Finish the deregistration. The process is considered
* complete when the rr_func vector becomes NULL - this
* was put in place during rpcrdma_reply_handler() - the wait
* call below will not block if the dereg is "done". If
......@@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer)
for (i = 0; req->rl_nchunks;) {
--req->rl_nchunks;
i += rpcrdma_deregister_external(
&req->rl_segments[i], r_xprt, NULL);
}
if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
rep->rr_func = NULL; /* abandon the callback */
req->rl_reply = NULL;
&req->rl_segments[i], r_xprt);
}
if (req->rl_iov.length == 0) { /* see allocate above */
......@@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc;
/* marshal the send itself */
if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
__func__);
return -EIO;
if (req->rl_niovs == 0) {
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
goto failed_marshal;
}
if (req->rl_reply == NULL) /* e.g. reconnection */
......@@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
rqst->rq_bytes_sent = 0;
return 0;
failed_marshal:
r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
__func__, rc);
if (rc == -EIO)
return -EIO;
drop_connection:
xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */
......@@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
*/
static struct rpc_xprt_ops xprt_rdma_procs = {
.reserve_xprt = xprt_rdma_reserve_xprt,
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_alloc_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
......
This diff is collapsed.
......@@ -43,6 +43,7 @@
#include <linux/wait.h> /* wait_queue_head_t, etc */
#include <linux/spinlock.h> /* spinlock_t, etc */
#include <linux/atomic.h> /* atomic_t, etc */
#include <linux/workqueue.h> /* struct work_struct */
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
......@@ -66,18 +67,21 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
enum rpcrdma_memreg ri_memreg_strategy;
unsigned int ri_max_frmr_depth;
};
/*
* RDMA Endpoint -- one per transport instance
*/
#define RPCRDMA_WC_BUDGET (128)
#define RPCRDMA_POLLSIZE (16)
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
int rep_connected;
struct rpcrdma_ia *rep_ia;
struct ib_cq *rep_cq;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
struct ib_sge rep_pad; /* holds zeroed pad */
......@@ -86,6 +90,9 @@ struct rpcrdma_ep {
struct rpc_xprt *rep_xprt; /* for rep_func */
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
......@@ -124,7 +131,6 @@ struct rpcrdma_rep {
struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
struct list_head rr_list; /* tasklet list */
wait_queue_head_t rr_unbind; /* optional unbind wait */
struct ib_sge rr_iov; /* for posting */
struct ib_mr *rr_handle; /* handle for mem in rr_iov */
char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
......@@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
struct ib_mr *rl_mr; /* if registered directly */
struct rpcrdma_mw { /* if registered from region */
union {
struct ib_mw *mw;
struct ib_fmr *fmr;
struct {
struct ib_fast_reg_page_list *fr_pgl;
......@@ -207,7 +212,6 @@ struct rpcrdma_req {
struct rpcrdma_buffer {
spinlock_t rb_lock; /* protects indexes */
atomic_t rb_credits; /* most recent server credits */
unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
int rb_max_requests;/* client max requests */
struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
int rb_send_index;
......@@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
*/
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
......@@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
int rpcrdma_register_external(struct rpcrdma_mr_seg *,
int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
struct rpcrdma_xprt *, void *);
struct rpcrdma_xprt *);
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
void rpcrdma_connect_worker(struct work_struct *);
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment