Commit f7976a64 authored by Linus Torvalds

Merge tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux

Pull nfsd updates from Chuck Lever:

 - Clean-ups in the READ path in anticipation of MSG_SPLICE_PAGES

 - Better NUMA awareness when allocating pages and other objects (see the sketch after this list)

 - A number of minor clean-ups to XDR encoding

 - Elimination of a race when accepting a TCP socket

 - Numerous observability enhancements
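
A minimal sketch of the NUMA-awareness pattern used throughout the svcrdma and sunrpc hunks below: resolve the memory node closest to the device or pool doing the I/O, then hand that node to the node-aware allocators (kmalloc_node(), alloc_pages_bulk_array_node()). The wrapper names here are hypothetical; only the allocator calls are taken from the changes below.

    #include <linux/gfp.h>
    #include <linux/slab.h>
    #include <rdma/ib_verbs.h>

    /* Hypothetical wrapper: allocate a per-transport object on the memory
     * node closest to the RDMA device; ibdev_to_node() resolves the
     * device's NUMA node (or NUMA_NO_NODE when it has no affinity).
     */
    static void *alloc_ctxt_near_device(struct ib_device *device, size_t size)
    {
            return kmalloc_node(size, GFP_KERNEL, ibdev_to_node(device));
    }

    /* Hypothetical wrapper: bulk-fill a page array from a specific node,
     * mirroring svc_init_buffer() and svc_alloc_arg() in the hunks below.
     * Returns true only if every requested page was allocated.
     */
    static bool fill_page_array_on_node(int node, unsigned long count,
                                        struct page **pages)
    {
            return alloc_pages_bulk_array_node(GFP_KERNEL, node, count,
                                               pages) == count;
    }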

* tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux: (46 commits)
  nfsd: remove redundant assignments to variable len
  svcrdma: Fix stale comment
  NFSD: Distinguish per-net namespace initialization
  nfsd: move init of percpu reply_cache_stats counters back to nfsd_init_net
  SUNRPC: Address RCU warning in net/sunrpc/svc.c
  SUNRPC: Use sysfs_emit in place of strlcpy/sprintf
  SUNRPC: Remove transport class dprintk call sites
  SUNRPC: Fix comments for transport class registration
  svcrdma: Remove an unused argument from __svc_rdma_put_rw_ctxt()
  svcrdma: trace cc_release calls
  svcrdma: Convert "might sleep" comment into a code annotation
  NFSD: Add an nfsd4_encode_nfstime4() helper
  SUNRPC: Move initialization of rq_stime
  SUNRPC: Optimize page release in svc_rdma_sendto()
  svcrdma: Prevent page release when nothing was received
  svcrdma: Revert 2a1e4f21 ("svcrdma: Normalize Send page handling")
  SUNRPC: Revert 57990067 ("svcrdma: Remove unused sc_pages field")
  SUNRPC: Revert cc93ce95 ("svcrdma: Retain the page backing rq_res.head[0].iov_base")
  NFSD: add encoding of op_recall flag for write delegation
  NFSD: Add "official" reviewers for this subsystem
  ...
parents c0a572d9 75bfb704
......@@ -183,6 +183,8 @@ Henrik Rydberg <rydberg@bitmath.org>
Herbert Xu <herbert@gondor.apana.org.au>
Huacai Chen <chenhuacai@kernel.org> <chenhc@lemote.com>
Huacai Chen <chenhuacai@kernel.org> <chenhuacai@loongson.cn>
J. Bruce Fields <bfields@fieldses.org> <bfields@redhat.com>
J. Bruce Fields <bfields@fieldses.org> <bfields@citi.umich.edu>
Jacob Shin <Jacob.Shin@amd.com>
Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com>
Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com>
......
......@@ -11275,6 +11275,10 @@ W: http://kernelnewbies.org/KernelJanitors
KERNEL NFSD, SUNRPC, AND LOCKD SERVERS
M: Chuck Lever <chuck.lever@oracle.com>
M: Jeff Layton <jlayton@kernel.org>
R: Neil Brown <neilb@suse.de>
R: Olga Kornievskaia <kolga@netapp.com>
R: Dai Ngo <Dai.Ngo@oracle.com>
R: Tom Talpey <tom@talpey.com>
L: linux-nfs@vger.kernel.org
S: Supported
W: http://nfs.sourceforge.net/
......
......@@ -355,7 +355,6 @@ static int lockd_get(void)
int error;
if (nlmsvc_serv) {
svc_get(nlmsvc_serv);
nlmsvc_users++;
return 0;
}
......
......@@ -80,6 +80,8 @@ enum {
int nfsd_drc_slab_create(void);
void nfsd_drc_slab_free(void);
int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *);
......
......@@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
goto out;
err = -EINVAL;
if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
err = -ENOENT;
......@@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
dprintk("found domain %s\n", buf);
err = -EINVAL;
if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
fsidtype = simple_strtoul(buf, &ep, 10);
if (*ep)
......@@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
{
/* client path expiry [flags anonuid anongid fsid] */
char *buf;
int len;
int err;
struct auth_domain *dom = NULL;
struct svc_export exp = {}, *expp;
......@@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
/* client */
err = -EINVAL;
len = qword_get(&mesg, buf, PAGE_SIZE);
if (len <= 0)
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
err = -ENOENT;
......@@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
/* path */
err = -EINVAL;
if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out1;
err = kern_path(buf, 0, &exp.ex_path);
......@@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
goto out3;
exp.ex_fsid = an_int;
while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
while (qword_get(&mesg, buf, PAGE_SIZE) > 0) {
if (strcmp(buf, "fsloc") == 0)
err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
else if (strcmp(buf, "uuid") == 0)
......
......@@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
unsigned int len;
int v;
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
......@@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
if (argp->offset + argp->count > (u64)OFFSET_MAX)
argp->count = (u64)OFFSET_MAX - argp->offset;
v = 0;
len = argp->count;
resp->pages = rqstp->rq_next_page;
while (len > 0) {
struct page *page = *(rqstp->rq_next_page++);
rqstp->rq_vec[v].iov_base = page_address(page);
rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
len -= rqstp->rq_vec[v].iov_len;
v++;
}
/* Obtain buffer pointer for payload.
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
......@@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
rqstp->rq_vec, v, &resp->count, &resp->eof);
&resp->count, &resp->eof);
return rpc_success;
}
......
......@@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
return false;
xdr_write_pages(xdr, resp->pages, 0, resp->len);
svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0,
resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
return false;
break;
......@@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
return false;
break;
......@@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
return false;
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
return false;
......
......@@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void)
kmem_cache_destroy(drc_slab);
}
static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
/**
* nfsd_net_reply_cache_init - per net namespace reply cache set-up
* @nn: nfsd_net being initialized
*
* Returns zero on success; otherwise a negative errno is returned.
*/
int nfsd_net_reply_cache_init(struct nfsd_net *nn)
{
return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
}
static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
/**
* nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
* @nn: nfsd_net being freed
*
*/
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
{
nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
}
......@@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
status = nfsd_reply_cache_stats_init(nn);
if (status)
goto out_nomem;
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
"nfsd-reply:%s", nn->nfsd_name);
if (status)
goto out_stats_destroy;
return status;
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
......@@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
out_stats_destroy:
nfsd_reply_cache_stats_destroy(nn);
out_nomem:
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
}
......@@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
rp, nn);
}
}
nfsd_reply_cache_stats_destroy(nn);
kvfree(nn->drc_hashtbl);
nn->drc_hashtbl = NULL;
......
......@@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
inode = d_inode(fhp->fh_dentry);
err = fh_getattr(fhp, &stat);
if (err) {
/* Grab the times from inode anyway */
stat.mtime = inode->i_mtime;
stat.ctime = inode->i_ctime;
stat.size = inode->i_size;
if (v4 && IS_I_VERSION(inode)) {
stat.change_cookie = inode_query_iversion(inode);
stat.result_mask |= STATX_CHANGE_COOKIE;
}
}
if (err)
return;
if (v4)
fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
......@@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
printk("nfsd: inode locked twice during operation.\n");
err = fh_getattr(fhp, &fhp->fh_post_attr);
if (err) {
fhp->fh_post_saved = false;
fhp->fh_post_attr.ctime = inode->i_ctime;
if (v4 && IS_I_VERSION(inode)) {
fhp->fh_post_attr.change_cookie = inode_query_iversion(inode);
fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE;
}
} else
fhp->fh_post_saved = true;
if (err)
return;
fhp->fh_post_saved = true;
if (v4)
fhp->fh_post_change =
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
......
......@@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
{
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
unsigned int len;
u32 eof;
int v;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
......@@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
v = 0;
len = argp->count;
resp->pages = rqstp->rq_next_page;
while (len > 0) {
struct page *page = *(rqstp->rq_next_page++);
rqstp->rq_vec[v].iov_base = page_address(page);
rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
len -= rqstp->rq_vec[v].iov_len;
v++;
}
/* Obtain buffer pointer for payload. 19 is 1 word for
* status, 17 words for fattr, and 1 word for the byte count.
......@@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
resp->count = argp->count;
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
rqstp->rq_vec, v, &resp->count, &eof);
&resp->count, &eof);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
......
......@@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn)
write_sequnlock(&nn->writeverf_lock);
}
/*
* Crank up a set of per-namespace resources for a new NFSD instance,
* including lockd, a duplicate reply cache, an open file cache
* instance, and a cache of NFSv4 state objects.
*/
static int nfsd_startup_net(struct net *net, const struct cred *cred)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
......
......@@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
case nfs_ok:
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
return false;
xdr_write_pages(xdr, &resp->page, 0, resp->len);
svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0,
resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
return false;
break;
......@@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
return false;
break;
......@@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
switch (resp->status) {
case nfs_ok:
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
return false;
......
......@@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done,
)
);
TRACE_EVENT(nfsd_ctl_unlock_ip,
TP_PROTO(
const struct net *net,
const char *address
),
TP_ARGS(net, address),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__string(address, address)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__assign_str(address, address);
),
TP_printk("address=%s",
__get_str(address)
)
);
TRACE_EVENT(nfsd_ctl_unlock_fs,
TP_PROTO(
const struct net *net,
const char *path
),
TP_ARGS(net, path),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__string(path, path)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__assign_str(path, path);
),
TP_printk("path=%s",
__get_str(path)
)
);
TRACE_EVENT(nfsd_ctl_filehandle,
TP_PROTO(
const struct net *net,
const char *domain,
const char *path,
int maxsize
),
TP_ARGS(net, domain, path, maxsize),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, maxsize)
__string(domain, domain)
__string(path, path)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->maxsize = maxsize;
__assign_str(domain, domain);
__assign_str(path, path);
),
TP_printk("domain=%s path=%s maxsize=%d",
__get_str(domain), __get_str(path), __entry->maxsize
)
);
TRACE_EVENT(nfsd_ctl_threads,
TP_PROTO(
const struct net *net,
int newthreads
),
TP_ARGS(net, newthreads),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, newthreads)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->newthreads = newthreads;
),
TP_printk("newthreads=%d",
__entry->newthreads
)
);
TRACE_EVENT(nfsd_ctl_pool_threads,
TP_PROTO(
const struct net *net,
int pool,
int nrthreads
),
TP_ARGS(net, pool, nrthreads),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, pool)
__field(int, nrthreads)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->pool = pool;
__entry->nrthreads = nrthreads;
),
TP_printk("pool=%d nrthreads=%d",
__entry->pool, __entry->nrthreads
)
);
TRACE_EVENT(nfsd_ctl_version,
TP_PROTO(
const struct net *net,
const char *mesg
),
TP_ARGS(net, mesg),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__string(mesg, mesg)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__assign_str(mesg, mesg);
),
TP_printk("%s",
__get_str(mesg)
)
);
TRACE_EVENT(nfsd_ctl_ports_addfd,
TP_PROTO(
const struct net *net,
int fd
),
TP_ARGS(net, fd),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, fd)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->fd = fd;
),
TP_printk("fd=%d",
__entry->fd
)
);
TRACE_EVENT(nfsd_ctl_ports_addxprt,
TP_PROTO(
const struct net *net,
const char *transport,
int port
),
TP_ARGS(net, transport, port),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, port)
__string(transport, transport)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->port = port;
__assign_str(transport, transport);
),
TP_printk("transport=%s port=%d",
__get_str(transport), __entry->port
)
);
TRACE_EVENT(nfsd_ctl_maxblksize,
TP_PROTO(
const struct net *net,
int bsize
),
TP_ARGS(net, bsize),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, bsize)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->bsize = bsize;
),
TP_printk("bsize=%d",
__entry->bsize
)
);
TRACE_EVENT(nfsd_ctl_maxconn,
TP_PROTO(
const struct net *net,
int maxconn
),
TP_ARGS(net, maxconn),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, maxconn)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->maxconn = maxconn;
),
TP_printk("maxconn=%d",
__entry->maxconn
)
);
TRACE_EVENT(nfsd_ctl_time,
TP_PROTO(
const struct net *net,
const char *name,
size_t namelen,
int time
),
TP_ARGS(net, name, namelen, time),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(int, time)
__string_len(name, name, namelen)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->time = time;
__assign_str_len(name, name, namelen);
),
TP_printk("file=%s time=%d\n",
__get_str(name), __entry->time
)
);
TRACE_EVENT(nfsd_ctl_recoverydir,
TP_PROTO(
const struct net *net,
const char *recdir
),
TP_ARGS(net, recdir),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__string(recdir, recdir)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__assign_str(recdir, recdir);
),
TP_printk("recdir=%s",
__get_str(recdir)
)
);
TRACE_EVENT(nfsd_end_grace,
TP_PROTO(
const struct net *net
),
TP_ARGS(net),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
),
TP_printk("nn=%d", __entry->netns_ino
)
);
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
......
......@@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
iap->ia_mode &= ~S_ISGID;
} else {
/* set ATTR_KILL_* bits and let VFS handle it */
iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
iap->ia_valid |= ATTR_KILL_SUID;
iap->ia_valid |=
setattr_should_drop_sgid(&nop_mnt_idmap, inode);
}
}
}
......@@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
}
/**
* nfsd_splice_read - Perform a VFS read using a splice pipe
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
* @file: opened struct file of file to be read
* @offset: starting byte offset
* @count: IN: requested number of bytes; OUT: number of bytes read
* @eof: OUT: set non-zero if operation reached the end of the file
*
* Returns nfs_ok on success, otherwise an nfserr stat value is
* returned.
*/
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, unsigned long *count,
u32 *eof)
......@@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_splice(rqstp, fhp, offset, *count);
rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
struct kvec *vec, int vlen, unsigned long *count,
u32 *eof)
/**
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
* @file: opened struct file of file to be read
* @offset: starting byte offset
* @count: IN: requested number of bytes; OUT: number of bytes read
* @base: offset in first page of read buffer
* @eof: OUT: set non-zero if operation reached the end of the file
*
* Some filesystems or situations cannot use nfsd_splice_read. This
* function is the slightly less-performant fallback for those cases.
*
* Returns nfs_ok on success, otherwise an nfserr stat value is
* returned.
*/
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, unsigned long *count,
unsigned int base, u32 *eof)
{
unsigned long v, total;
struct iov_iter iter;
loff_t ppos = offset;
struct page *page;
ssize_t host_err;
v = 0;
total = *count;
while (total) {
page = *(rqstp->rq_next_page++);
rqstp->rq_vec[v].iov_base = page_address(page) + base;
rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
total -= rqstp->rq_vec[v].iov_len;
++v;
base = 0;
}
WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
......@@ -1159,14 +1201,24 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
return nfserr;
}
/*
* Read data from a file. count must contain the requested read count
* on entry. On return, *count contains the number of bytes actually read.
/**
* nfsd_read - Read data from a file
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
* @offset: starting byte offset
* @count: IN: requested number of bytes; OUT: number of bytes read
* @eof: OUT: set non-zero if operation reached the end of the file
*
* The caller must verify that there is enough space in @rqstp.rq_res
* to perform this operation.
*
* N.B. After this call fhp needs an fh_put
*
* Returns nfs_ok on success, otherwise an nfserr stat value is
* returned.
*/
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
u32 *eof)
loff_t offset, unsigned long *count, u32 *eof)
{
struct nfsd_file *nf;
struct file *file;
......@@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
nfsd_file_put(nf);
trace_nfsd_read_done(rqstp, fhp, offset, *count);
return err;
}
......
......@@ -110,13 +110,12 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count,
u32 *eof);
__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
struct kvec *vec, int vlen,
unsigned long *count,
unsigned long *count, unsigned int base,
u32 *eof);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *,
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long *count,
u32 *eof);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
struct kvec *, int, unsigned long *,
......
......@@ -508,6 +508,27 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
xdr->rqst = NULL;
}
/**
* svcxdr_encode_opaque_pages - Insert pages into an xdr_stream
* @rqstp: RPC transaction context
* @xdr: xdr_stream to be updated
* @pages: array of pages to insert
* @base: starting offset of first data byte in @pages
* @len: number of data bytes in @pages to insert
*
* After the @pages are added, the tail iovec is instantiated pointing
* to the end of the head buffer, and the stream is set up to encode
* subsequent items into the tail.
*/
static inline void svcxdr_encode_opaque_pages(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
struct page **pages,
unsigned int base,
unsigned int len)
{
xdr_write_pages(xdr, pages, base, len);
xdr->page_ptr = rqstp->rq_next_page - 1;
}
/**
* svcxdr_set_auth_slack -
* @rqstp: RPC transaction
......
......@@ -135,7 +135,6 @@ struct svc_rdma_recv_ctxt {
struct ib_sge rc_recv_sge;
void *rc_recv_buf;
struct xdr_stream rc_stream;
bool rc_temp;
u32 rc_byte_len;
unsigned int rc_page_count;
u32 rc_inv_rkey;
......@@ -155,12 +154,12 @@ struct svc_rdma_send_ctxt {
struct ib_send_wr sc_send_wr;
struct ib_cqe sc_cqe;
struct completion sc_done;
struct xdr_buf sc_hdrbuf;
struct xdr_stream sc_stream;
void *sc_xprt_buf;
int sc_page_count;
int sc_cur_sge_no;
struct page *sc_pages[RPCSVC_MAXPAGES];
struct ib_sge sc_sges[];
};
......
......@@ -242,8 +242,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
struct page **pages, struct rpc_rqst *rqst);
extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec,
size_t nbytes);
extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes);
extern void __xdr_commit_encode(struct xdr_stream *xdr);
extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len);
......
......@@ -2112,6 +2112,14 @@ DEFINE_POST_CHUNK_EVENT(read);
DEFINE_POST_CHUNK_EVENT(write);
DEFINE_POST_CHUNK_EVENT(reply);
DEFINE_EVENT(svcrdma_post_chunk_class, svcrdma_cc_release,
TP_PROTO(
const struct rpc_rdma_cid *cid,
int sqecount
),
TP_ARGS(cid, sqecount)
);
TRACE_EVENT(svcrdma_wc_read,
TP_PROTO(
const struct ib_wc *wc,
......
......@@ -2104,31 +2104,46 @@ DEFINE_SVC_DEFERRED_EVENT(drop);
DEFINE_SVC_DEFERRED_EVENT(queue);
DEFINE_SVC_DEFERRED_EVENT(recv);
TRACE_EVENT(svcsock_new_socket,
DECLARE_EVENT_CLASS(svcsock_lifetime_class,
TP_PROTO(
const void *svsk,
const struct socket *socket
),
TP_ARGS(socket),
TP_ARGS(svsk, socket),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__field(const void *, svsk)
__field(const void *, sk)
__field(unsigned long, type)
__field(unsigned long, family)
__field(bool, listener)
__field(unsigned long, state)
),
TP_fast_assign(
struct sock *sk = socket->sk;
__entry->netns_ino = sock_net(sk)->ns.inum;
__entry->svsk = svsk;
__entry->sk = sk;
__entry->type = socket->type;
__entry->family = socket->sk->sk_family;
__entry->listener = (socket->sk->sk_state == TCP_LISTEN);
__entry->family = sk->sk_family;
__entry->state = sk->sk_state;
),
TP_printk("type=%s family=%s%s",
show_socket_type(__entry->type),
TP_printk("svsk=%p type=%s family=%s%s",
__entry->svsk, show_socket_type(__entry->type),
rpc_show_address_family(__entry->family),
__entry->listener ? " (listener)" : ""
__entry->state == TCP_LISTEN ? " (listener)" : ""
)
);
#define DEFINE_SVCSOCK_LIFETIME_EVENT(name) \
DEFINE_EVENT(svcsock_lifetime_class, name, \
TP_PROTO( \
const void *svsk, \
const struct socket *socket \
), \
TP_ARGS(svsk, socket))
DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_new);
DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_free);
TRACE_EVENT(svcsock_marker,
TP_PROTO(
......
......@@ -109,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
switch (*ip)
{
case SVC_POOL_AUTO:
return strlcpy(buf, "auto\n", 20);
return sysfs_emit(buf, "auto\n");
case SVC_POOL_GLOBAL:
return strlcpy(buf, "global\n", 20);
return sysfs_emit(buf, "global\n");
case SVC_POOL_PERCPU:
return strlcpy(buf, "percpu\n", 20);
return sysfs_emit(buf, "percpu\n");
case SVC_POOL_PERNODE:
return strlcpy(buf, "pernode\n", 20);
return sysfs_emit(buf, "pernode\n");
default:
return sprintf(buf, "%d\n", *ip);
return sysfs_emit(buf, "%d\n", *ip);
}
}
......@@ -597,34 +597,25 @@ svc_destroy(struct kref *ref)
}
EXPORT_SYMBOL_GPL(svc_destroy);
/*
* Allocate an RPC server's buffer space.
* We allocate pages and place them in rq_pages.
*/
static int
static bool
svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
{
unsigned int pages, arghi;
unsigned long pages, ret;
/* bc_xprt uses fore channel allocated buffers */
if (svc_is_backchannel(rqstp))
return 1;
return true;
pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply.
* We assume one is at most one page
*/
arghi = 0;
WARN_ON_ONCE(pages > RPCSVC_MAXPAGES);
if (pages > RPCSVC_MAXPAGES)
pages = RPCSVC_MAXPAGES;
while (pages) {
struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
if (!p)
break;
rqstp->rq_pages[arghi++] = p;
pages--;
}
return pages == 0;
ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages,
rqstp->rq_pages);
return ret == pages;
}
/*
......@@ -1173,6 +1164,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi
*/
static void svc_unregister(const struct svc_serv *serv, struct net *net)
{
struct sighand_struct *sighand;
struct svc_program *progp;
unsigned long flags;
unsigned int i;
......@@ -1189,9 +1181,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
}
}
spin_lock_irqsave(&current->sighand->siglock, flags);
rcu_read_lock();
sighand = rcu_dereference(current->sighand);
spin_lock_irqsave(&sighand->siglock, flags);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, flags);
spin_unlock_irqrestore(&sighand->siglock, flags);
rcu_read_unlock();
}
/*
......
......@@ -74,13 +74,18 @@ static LIST_HEAD(svc_xprt_class_list);
* that no other thread will be using the transport or will
* try to set XPT_DEAD.
*/
/**
* svc_reg_xprt_class - Register a server-side RPC transport class
* @xcl: New transport class to be registered
*
* Returns zero on success; otherwise a negative errno is returned.
*/
int svc_reg_xprt_class(struct svc_xprt_class *xcl)
{
struct svc_xprt_class *cl;
int res = -EEXIST;
dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
INIT_LIST_HEAD(&xcl->xcl_list);
spin_lock(&svc_xprt_class_lock);
/* Make sure there isn't already a class with the same name */
......@@ -96,9 +101,13 @@ int svc_reg_xprt_class(struct svc_xprt_class *xcl)
}
EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
/**
* svc_unreg_xprt_class - Unregister a server-side RPC transport class
* @xcl: Transport class to be unregistered
*
*/
void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
{
dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
spin_lock(&svc_xprt_class_lock);
list_del_init(&xcl->xcl_list);
spin_unlock(&svc_xprt_class_lock);
......@@ -685,8 +694,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
}
for (filled = 0; filled < pages; filled = ret) {
ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
rqstp->rq_pages);
ret = alloc_pages_bulk_array_node(GFP_KERNEL,
rqstp->rq_pool->sp_id,
pages, rqstp->rq_pages);
if (ret > filled)
/* Made progress, don't sleep yet */
continue;
......@@ -843,15 +853,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
svc_xprt_received(xprt);
} else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
kref_read(&xprt->xpt_ref));
rqstp->rq_deferred = svc_deferred_dequeue(xprt);
if (rqstp->rq_deferred)
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
} else
......@@ -894,6 +900,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
err = -EAGAIN;
if (len <= 0)
goto out_release;
trace_svc_xdr_recvfrom(&rqstp->rq_arg);
clear_bit(XPT_OLD, &xprt->xpt_flags);
......@@ -902,6 +909,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
if (serv->sv_stats)
serv->sv_stats->netcnt++;
rqstp->rq_stime = ktime_get();
return len;
out_release:
rqstp->rq_res.len = 0;
......
......@@ -826,12 +826,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
trace_sk_data_ready(sk);
if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_odata(sk);
}
/*
* This callback may be called twice when a new connection
* is established as a child socket inherits everything
......@@ -840,13 +834,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
* when one of child sockets become ESTABLISHED.
* 2) data_ready method of the child socket may be called
* when it receives data before the socket is accepted.
* In case of 2, we should ignore it silently.
* In case of 2, we should ignore it silently and DO NOT
* dereference svsk.
*/
if (sk->sk_state == TCP_LISTEN) {
if (svsk) {
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
if (sk->sk_state != TCP_LISTEN)
return;
if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_odata(sk);
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
}
......@@ -887,13 +886,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
err = kernel_accept(sock, &newsock, O_NONBLOCK);
if (err < 0) {
if (err == -ENOMEM)
printk(KERN_WARNING "%s: no more sockets!\n",
serv->sv_name);
else if (err != -EAGAIN)
net_warn_ratelimited("%s: accept failed (err %d)!\n",
serv->sv_name, -err);
trace_svcsock_accept_err(xprt, serv->sv_name, err);
if (err != -EAGAIN)
trace_svcsock_accept_err(xprt, serv->sv_name, err);
return NULL;
}
if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL)))
......@@ -1464,7 +1458,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
svsk->sk_owspace = inet->sk_write_space;
/*
* This barrier is necessary in order to prevent race condition
* with svc_data_ready(), svc_listen_data_ready() and others
* with svc_data_ready(), svc_tcp_listen_data_ready(), and others
* when calling callbacks above.
*/
wmb();
......@@ -1476,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
trace_svcsock_new_socket(sock);
trace_svcsock_new(svsk, sock);
return svsk;
}
......@@ -1657,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt)
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct socket *sock = svsk->sk_sock;
trace_svcsock_free(svsk, sock);
tls_handshake_cancel(sock->sk);
if (sock->file)
sockfd_put(sock);
......
......@@ -1070,22 +1070,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
/**
* xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
* @xdr: pointer to xdr_stream
* @vec: pointer to a kvec array
* @nbytes: number of bytes to reserve
*
* Reserves enough buffer space to encode 'nbytes' of data and stores the
* pointers in 'vec'. The size argument passed to xdr_reserve_space() is
* determined based on the number of bytes remaining in the current page to
* avoid invalidating iov_base pointers when xdr_commit_encode() is called.
* The size argument passed to xdr_reserve_space() is determined based
* on the number of bytes remaining in the current page to avoid
* invalidating iov_base pointers when xdr_commit_encode() is called.
*
* Return values:
* %0: success
* %-EMSGSIZE: not enough space is available in @xdr
*/
int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes)
{
int thislen;
int v = 0;
size_t thislen;
__be32 *p;
/*
......@@ -1097,21 +1097,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte
xdr->end = xdr->p;
}
/* XXX: Let's find a way to make this more efficient */
while (nbytes) {
thislen = xdr->buf->page_len % PAGE_SIZE;
thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
p = xdr_reserve_space(xdr, thislen);
if (!p)
return -EIO;
return -EMSGSIZE;
vec[v].iov_base = p;
vec[v].iov_len = thislen;
v++;
nbytes -= thislen;
}
return v;
return 0;
}
EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
......
......@@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
*/
get_page(virt_to_page(rqst->rq_buffer));
sctxt->sc_send_wr.opcode = IB_WR_SEND;
ret = svc_rdma_send(rdma, sctxt);
if (ret < 0)
return ret;
ret = wait_for_completion_killable(&sctxt->sc_done);
svc_rdma_send_ctxt_put(rdma, sctxt);
return ret;
return svc_rdma_send(rdma, sctxt);
}
/* Server-side transport endpoint wants a whole page for its send
......
......@@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_recv_ctxt *ctxt;
dma_addr_t addr;
void *buffer;
ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
if (!ctxt)
goto fail0;
buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
......@@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
ctxt->rc_recv_buf = buffer;
ctxt->rc_temp = false;
return ctxt;
fail2:
......@@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
pcl_free(&ctxt->rc_write_pcl);
pcl_free(&ctxt->rc_reply_pcl);
if (!ctxt->rc_temp)
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
else
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}
/**
......@@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
}
static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
unsigned int wanted, bool temp)
unsigned int wanted)
{
const struct ib_recv_wr *bad_wr = NULL;
struct svc_rdma_recv_ctxt *ctxt;
......@@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
break;
trace_svcrdma_post_recv(ctxt);
ctxt->rc_temp = temp;
ctxt->rc_recv_wr.next = recv_chain;
recv_chain = &ctxt->rc_recv_wr;
rdma->sc_pending_recvs++;
......@@ -309,7 +305,7 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
}
/**
......@@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* client reconnects.
*/
if (rdma->sc_pending_recvs < rdma->sc_max_requests)
if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
goto dropped;
/* All wc fields are now known to be valid */
......@@ -775,9 +771,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
*
* The next ctxt is removed from the "receive" lists.
*
* - If the ctxt completes a Read, then finish assembling the Call
* message and return the number of bytes in the message.
*
* - If the ctxt completes a Receive, then construct the Call
* message from the contents of the Receive buffer.
*
......@@ -786,7 +779,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
* in the message.
*
* - If there are Read chunks in this message, post Read WRs to
* pull that payload and return 0.
* pull that payload. When the Read WRs complete, build the
* full message and return the number of bytes in it.
*/
int svc_rdma_recvfrom(struct svc_rqst *rqstp)
{
......@@ -796,6 +790,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svc_rdma_recv_ctxt *ctxt;
int ret;
/* Prevent svc_xprt_release() from releasing pages in rq_pages
* when returning 0 or an error.
*/
rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
rqstp->rq_xprt_ctxt = NULL;
ctxt = NULL;
......@@ -819,12 +819,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
DMA_FROM_DEVICE);
svc_rdma_build_arg_xdr(rqstp, ctxt);
/* Prevent svc_xprt_release from releasing pages in rq_pages
* if we return 0 or an error.
*/
rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
......
......@@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
if (node) {
ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
GFP_KERNEL);
ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device));
if (!ctxt)
goto out_noctx;
......@@ -84,8 +84,7 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
return NULL;
}
static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt,
static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
struct llist_head *list)
{
sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
......@@ -95,7 +94,7 @@ static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt)
{
__svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts);
__svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts);
}
/**
......@@ -191,6 +190,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
struct svc_rdma_rw_ctxt *ctxt;
LLIST_HEAD(free);
trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
first = last = NULL;
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
......@@ -198,7 +199,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
ctxt->rw_nents, dir);
__svc_rdma_put_rw_ctxt(rdma, ctxt, &free);
__svc_rdma_put_rw_ctxt(ctxt, &free);
ctxt->rw_node.next = first;
first = &ctxt->rw_node;
......@@ -234,7 +235,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info;
info = kmalloc(sizeof(*info), GFP_KERNEL);
info = kmalloc_node(sizeof(*info), GFP_KERNEL,
ibdev_to_node(rdma->sc_cm_id->device));
if (!info)
return info;
......@@ -304,7 +306,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
{
struct svc_rdma_read_info *info;
info = kmalloc(sizeof(*info), GFP_KERNEL);
info = kmalloc_node(sizeof(*info), GFP_KERNEL,
ibdev_to_node(rdma->sc_cm_id->device));
if (!info)
return info;
......@@ -351,8 +354,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
/* This function sleeps when the transport's Send Queue is congested.
*
/*
* Assumptions:
* - If ib_post_send() succeeds, only one completion is expected,
* even if one or more WRs are flushed. This is true when posting
......@@ -367,6 +369,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
struct ib_cqe *cqe;
int ret;
might_sleep();
if (cc->cc_sqecount > rdma->sc_sq_depth)
return -EINVAL;
......
......@@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_send_ctxt *ctxt;
dma_addr_t addr;
void *buffer;
size_t size;
int i;
size = sizeof(*ctxt);
size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
ctxt = kmalloc(size, GFP_KERNEL);
ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
GFP_KERNEL, node);
if (!ctxt)
goto fail0;
buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
......@@ -148,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
init_completion(&ctxt->sc_done);
ctxt->sc_cqe.done = svc_rdma_wc_send;
ctxt->sc_xprt_buf = buffer;
xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
......@@ -214,6 +212,7 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.num_sge = 0;
ctxt->sc_cur_sge_no = 0;
ctxt->sc_page_count = 0;
return ctxt;
out_empty:
......@@ -228,6 +227,8 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
* svc_rdma_send_ctxt_put - Return send_ctxt to free list
* @rdma: controlling svcxprt_rdma
* @ctxt: object to return to the free list
*
* Pages left in sc_pages are DMA unmapped and released.
*/
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
......@@ -235,6 +236,9 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
if (ctxt->sc_page_count)
release_pages(ctxt->sc_pages, ctxt->sc_page_count);
/* The first SGE contains the transport header, which
* remains mapped until @ctxt is destroyed.
*/
......@@ -281,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
svc_rdma_wake_send_waiters(rdma, 1);
complete(&ctxt->sc_done);
if (unlikely(wc->status != IB_WC_SUCCESS))
goto flushed;
trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
svc_rdma_send_ctxt_put(rdma, ctxt);
return;
flushed:
......@@ -294,6 +298,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid);
else
trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid);
svc_rdma_send_ctxt_put(rdma, ctxt);
svc_xprt_deferred_close(&rdma->sc_xprt);
}
......@@ -310,7 +315,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
struct ib_send_wr *wr = &ctxt->sc_send_wr;
int ret;
reinit_completion(&ctxt->sc_done);
might_sleep();
/* Sync the transport header buffer */
ib_dma_sync_single_for_device(rdma->sc_pd->device,
......@@ -799,6 +804,25 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
svc_rdma_xb_dma_map, &args);
}
/* The svc_rqst and all resources it owns are released as soon as
* svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
* so they are released by the Send completion handler.
*/
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
struct svc_rdma_send_ctxt *ctxt)
{
int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
ctxt->sc_page_count += pages;
for (i = 0; i < pages; i++) {
ctxt->sc_pages[i] = rqstp->rq_respages[i];
rqstp->rq_respages[i] = NULL;
}
/* Prevent svc_xprt_release from releasing pages in rq_pages */
rqstp->rq_next_page = rqstp->rq_respages;
}
/* Prepare the portion of the RPC Reply that will be transmitted
* via RDMA Send. The RPC-over-RDMA transport header is prepared
* in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
......@@ -828,6 +852,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
if (ret < 0)
return ret;
svc_rdma_save_io_pages(rqstp, sctxt);
if (rctxt->rc_inv_rkey) {
sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
......@@ -835,13 +861,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
sctxt->sc_send_wr.opcode = IB_WR_SEND;
}
ret = svc_rdma_send(rdma, sctxt);
if (ret < 0)
return ret;
ret = wait_for_completion_killable(&sctxt->sc_done);
svc_rdma_send_ctxt_put(rdma, sctxt);
return ret;
return svc_rdma_send(rdma, sctxt);
}
/**
......@@ -907,8 +927,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
if (svc_rdma_send(rdma, sctxt))
goto put_ctxt;
wait_for_completion_killable(&sctxt->sc_done);
return;
put_ctxt:
svc_rdma_send_ctxt_put(rdma, sctxt);
......@@ -976,17 +995,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
goto put_ctxt;
/* Prevent svc_xprt_release() from releasing the page backing
* rq_res.head[0].iov_base. It's no longer being accessed by
* the I/O device. */
rqstp->rq_respages++;
return 0;
reply_chunk:
if (ret != -E2BIG && ret != -EINVAL)
goto put_ctxt;
/* Send completion releases payload pages that were part
* of previously posted RDMA Writes.
*/
svc_rdma_save_io_pages(rqstp, sctxt);
svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
return 0;
......
......@@ -64,7 +64,7 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
struct net *net);
struct net *net, int node);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
......@@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context)
}
static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
struct net *net)
struct net *net, int node)
{
struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
struct svcxprt_rdma *cma_xprt;
if (!cma_xprt) {
dprintk("svcrdma: failed to create new transport\n");
cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
if (!cma_xprt)
return NULL;
}
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
......@@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
struct svcxprt_rdma *newxprt;
struct sockaddr *sa;
/* Create a new transport */
newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
listen_xprt->sc_xprt.xpt_net);
listen_xprt->sc_xprt.xpt_net,
ibdev_to_node(new_cma_id->device));
if (!newxprt)
return;
newxprt->sc_cm_id = new_cma_id;
......@@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
return ERR_PTR(-EAFNOSUPPORT);
cma_xprt = svc_rdma_create_xprt(serv, net);
cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE);
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
......