Commit 8acd3a60 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-2.6.28' of git://linux-nfs.org/~bfields/linux

* 'for-2.6.28' of git://linux-nfs.org/~bfields/linux: (59 commits)
  svcrdma: Fix IRD/ORD polarity
  svcrdma: Update svc_rdma_send_error to use DMA LKEY
  svcrdma: Modify the RPC reply path to use FRMR when available
  svcrdma: Modify the RPC recv path to use FRMR when available
  svcrdma: Add support to svc_rdma_send to handle chained WR
  svcrdma: Modify post recv path to use local dma key
  svcrdma: Add a service to register a Fast Reg MR with the device
  svcrdma: Query device for Fast Reg support during connection setup
  svcrdma: Add FRMR get/put services
  NLM: Remove unused argument from svc_addsock() function
  NLM: Remove "proto" argument from lockd_up()
  NLM: Always start both UDP and TCP listeners
  lockd: Remove unused fields in the nlm_reboot structure
  lockd: Add helper to sanity check incoming NOTIFY requests
  lockd: change nlmclnt_grant() to take a "struct sockaddr *"
  lockd: Adjust nlmsvc_lookup_host() to accomodate AF_INET6 addresses
  lockd: Adjust nlmclnt_lookup_host() signature to accomodate non-AF_INET
  lockd: Support non-AF_INET addresses in nlm_lookup_host()
  NLM: Convert nlm_lookup_host() to use a single argument
  svcrdma: Add Fast Reg MR Data Types
  ...
parents c269bc00 107e0008
......@@ -433,6 +433,14 @@ config FS_POSIX_ACL
bool
default n
config FILE_LOCKING
bool "Enable POSIX file locking API" if EMBEDDED
default y
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
......@@ -1779,6 +1787,28 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
config SUNRPC_REGISTER_V4
bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
default n
help
Sun added support for registering RPC services at an IPv6
address by creating two new versions of the rpcbind protocol
(RFC 1833).
This option enables support in the kernel RPC server for
registering kernel RPC services via version 4 of the rpcbind
protocol. If you enable this option, you must run a portmapper
daemon that supports rpcbind protocol version 4.
Serving NFS over IPv6 from knfsd (the kernel's NFS server)
requires that you enable this option and use a portmapper that
supports rpcbind version 4.
If unsure, say N to get traditional behavior (register kernel
RPC services using only rpcbind version 2). Distributions
using the legacy Linux portmapper daemon must say N here.
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
......
......@@ -7,7 +7,7 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
......@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
......
......@@ -5,6 +5,6 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
svcproc.o svcsubs.o mon.o xdr.o
svcproc.o svcsubs.o mon.o xdr.o grace.o
lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
lockd-objs := $(lockd-objs-y)
......@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
int status;
status = lockd_up(nlm_init->protocol);
status = lockd_up();
if (status < 0)
return ERR_PTR(status);
host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
nlm_init->hostname,
strlen(nlm_init->hostname));
nlm_init->hostname);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
......@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/*
* The server lockd has called us back to tell us the lock was granted
*/
__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
{
const struct file_lock *fl = &lock->fl;
const struct nfs_fh *fh = &lock->fh;
......@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
......@@ -216,7 +215,7 @@ reclaimer(void *ptr)
/* This one ensures that our parent doesn't terminate while the
* reclaim is in progress */
lock_kernel();
lockd_up(0); /* note: this cannot fail as lockd is already running */
lockd_up(); /* note: this cannot fail as lockd is already running */
dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
......
/*
* Common code for control of lockd and nfsv4 grace periods.
*/
#include <linux/module.h>
#include <linux/lockd/bind.h>
static LIST_HEAD(grace_list);
static DEFINE_SPINLOCK(grace_lock);
/**
* locks_start_grace
* @lm: who this grace period is for
*
* A grace period is a period during which locks should not be given
* out. Currently grace periods are only enforced by the two lock
* managers (lockd and nfsd), using the locks_in_grace() function to
* check when they are in a grace period.
*
* This function is called to start a grace period.
*/
void locks_start_grace(struct lock_manager *lm)
{
spin_lock(&grace_lock);
list_add(&lm->list, &grace_list);
spin_unlock(&grace_lock);
}
EXPORT_SYMBOL_GPL(locks_start_grace);
/**
* locks_end_grace
* @lm: who this grace period is for
*
* Call this function to state that the given lock manager is ready to
* resume regular locking. The grace period will not end until all lock
* managers that called locks_start_grace() also call locks_end_grace().
* Note that callers count on it being safe to call this more than once,
* and the second call should be a no-op.
*/
void locks_end_grace(struct lock_manager *lm)
{
spin_lock(&grace_lock);
list_del_init(&lm->list);
spin_unlock(&grace_lock);
}
EXPORT_SYMBOL_GPL(locks_end_grace);
/**
* locks_in_grace
*
* Lock managers call this function to determine when it is OK for them
* to answer ordinary lock requests, and when they should accept only
* lock reclaims.
*/
int locks_in_grace(void)
{
return !list_empty(&grace_list);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
......@@ -11,16 +11,17 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/sm_inter.h>
#include <linux/mutex.h>
#include <net/ipv6.h>
#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
#define NLM_HOST_NRHASH 32
#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
#define NLM_HOST_REBIND (60 * HZ)
#define NLM_HOST_EXPIRE (300 * HZ)
#define NLM_HOST_COLLECT (120 * HZ)
......@@ -30,42 +31,115 @@ static unsigned long next_gc;
static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
static void nlm_gc_hosts(void);
static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
const char *, unsigned int, int);
static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
const char *hostname,
unsigned int hostname_len);
static struct nsm_handle *nsm_find(const struct sockaddr *sap,
const size_t salen,
const char *hostname,
const size_t hostname_len,
const int create);
struct nlm_lookup_host_info {
const int server; /* search for server|client */
const struct sockaddr *sap; /* address to search for */
const size_t salen; /* it's length */
const unsigned short protocol; /* transport to search for*/
const u32 version; /* NLM version to search for */
const char *hostname; /* remote's hostname */
const size_t hostname_len; /* it's length */
const struct sockaddr *src_sap; /* our address (optional) */
const size_t src_len; /* it's length */
};
/*
* Hash function must work well on big- and little-endian platforms
*/
static unsigned int __nlm_hash32(const __be32 n)
{
unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
return hash ^ (hash >> 8);
}
static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
{
const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
return __nlm_hash32(sin->sin_addr.s_addr);
}
static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
{
const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
const struct in6_addr addr = sin6->sin6_addr;
return __nlm_hash32(addr.s6_addr32[0]) ^
__nlm_hash32(addr.s6_addr32[1]) ^
__nlm_hash32(addr.s6_addr32[2]) ^
__nlm_hash32(addr.s6_addr32[3]);
}
static unsigned int nlm_hash_address(const struct sockaddr *sap)
{
unsigned int hash;
switch (sap->sa_family) {
case AF_INET:
hash = __nlm_hash_addr4(sap);
break;
case AF_INET6:
hash = __nlm_hash_addr6(sap);
break;
default:
hash = 0;
}
return hash & (NLM_HOST_NRHASH - 1);
}
static void nlm_clear_port(struct sockaddr *sap)
{
switch (sap->sa_family) {
case AF_INET:
((struct sockaddr_in *)sap)->sin_port = 0;
break;
case AF_INET6:
((struct sockaddr_in6 *)sap)->sin6_port = 0;
break;
}
}
static void nlm_display_address(const struct sockaddr *sap,
char *buf, const size_t len)
{
const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
switch (sap->sa_family) {
case AF_UNSPEC:
snprintf(buf, len, "unspecified");
break;
case AF_INET:
snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
break;
case AF_INET6:
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
snprintf(buf, len, NIPQUAD_FMT,
NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
else
snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
break;
default:
snprintf(buf, len, "unsupported address family");
break;
}
}
/*
* Common host lookup routine for server & client
*/
static struct nlm_host *nlm_lookup_host(int server,
const struct sockaddr_in *sin,
int proto, u32 version,
const char *hostname,
unsigned int hostname_len,
const struct sockaddr_in *ssin)
static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
{
struct hlist_head *chain;
struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
int hash;
dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
", p=%d, v=%u, my role=%s, name=%.*s)\n",
NIPQUAD(ssin->sin_addr.s_addr),
NIPQUAD(sin->sin_addr.s_addr), proto, version,
server? "server" : "client",
hostname_len,
hostname? hostname : "<none>");
hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
/* Lock hash table */
mutex_lock(&nlm_host_mutex);
if (time_after_eq(jiffies, next_gc))
......@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
* different NLM rpc_clients into one single nlm_host object.
* This would allow us to have one nlm_host per address.
*/
chain = &nlm_hosts[hash];
chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
if (!nlm_cmp_addr(&host->h_addr, sin))
if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
if (!nsm)
nsm = host->h_nsmhandle;
if (host->h_proto != proto)
if (host->h_proto != ni->protocol)
continue;
if (host->h_version != version)
if (host->h_version != ni->version)
continue;
if (host->h_server != server)
if (host->h_server != ni->server)
continue;
if (!nlm_cmp_addr(&host->h_saddr, ssin))
if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
......@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
hlist_add_head(&host->h_hash, chain);
nlm_get_host(host);
dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
host->h_name, host->h_addrbuf);
goto out;
}
if (nsm)
atomic_inc(&nsm->sm_count);
host = NULL;
/* Sadly, the host isn't in our hash table yet. See if
* we have an NSM handle for it. If not, create one.
/*
* The host wasn't in our hash table. If we don't
* have an NSM handle for it yet, create one.
*/
if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
goto out;
if (nsm)
atomic_inc(&nsm->sm_count);
else {
host = NULL;
nsm = nsm_find(ni->sap, ni->salen,
ni->hostname, ni->hostname_len, 1);
if (!nsm) {
dprintk("lockd: nlm_lookup_host failed; "
"no nsm handle\n");
goto out;
}
}
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host) {
nsm_release(nsm);
dprintk("lockd: nlm_lookup_host failed; no memory\n");
goto out;
}
host->h_name = nsm->sm_name;
host->h_addr = *sin;
host->h_addr.sin_port = 0; /* ouch! */
host->h_saddr = *ssin;
host->h_version = version;
host->h_proto = proto;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
nlm_clear_port(nlm_addr(host));
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
host->h_version = ni->version;
host->h_proto = ni->protocol;
host->h_rpcclnt = NULL;
mutex_init(&host->h_mutex);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
......@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
host->h_state = 0; /* pseudo NSM state */
host->h_nsmstate = 0; /* real NSM state */
host->h_nsmhandle = nsm;
host->h_server = server;
host->h_server = ni->server;
hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
......@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
INIT_LIST_HEAD(&host->h_reclaim);
nrhosts++;
nlm_display_address((struct sockaddr *)&host->h_addr,
host->h_addrbuf, sizeof(host->h_addrbuf));
nlm_display_address((struct sockaddr *)&host->h_srcaddr,
host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
dprintk("lockd: nlm_lookup_host created host %s\n",
host->h_name);
out:
mutex_unlock(&nlm_host_mutex);
return host;
......@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
kfree(host);
}
/*
* Find an NLM server handle in the cache. If there is none, create it.
/**
* nlmclnt_lookup_host - Find an NLM host handle matching a remote server
* @sap: network address of server
* @salen: length of server address
* @protocol: transport protocol to use
* @version: NLM protocol version
* @hostname: '\0'-terminated hostname of server
*
* Returns an nlm_host structure that matches the passed-in
* [server address, transport protocol, NLM version, server hostname].
* If one doesn't already exist in the host cache, a new handle is
* created and returned.
*/
struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
int proto, u32 version,
const char *hostname,
unsigned int hostname_len)
struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
const size_t salen,
const unsigned short protocol,
const u32 version, const char *hostname)
{
struct sockaddr_in ssin = {0};
return nlm_lookup_host(0, sin, proto, version,
hostname, hostname_len, &ssin);
const struct sockaddr source = {
.sa_family = AF_UNSPEC,
};
struct nlm_lookup_host_info ni = {
.server = 0,
.sap = sap,
.salen = salen,
.protocol = protocol,
.version = version,
.hostname = hostname,
.hostname_len = strlen(hostname),
.src_sap = &source,
.src_len = sizeof(source),
};
dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
(hostname ? hostname : "<none>"), version,
(protocol == IPPROTO_UDP ? "udp" : "tcp"));
return nlm_lookup_host(&ni);
}
/*
* Find an NLM client handle in the cache. If there is none, create it.
/**
* nlmsvc_lookup_host - Find an NLM host handle matching a remote client
* @rqstp: incoming NLM request
* @hostname: name of client host
* @hostname_len: length of client hostname
*
* Returns an nlm_host structure that matches the [client address,
* transport protocol, NLM version, client hostname] of the passed-in
* NLM request. If one doesn't already exist in the host cache, a
* new handle is created and returned.
*
* Before possibly creating a new nlm_host, construct a sockaddr
* for a specific source address in case the local system has
* multiple network addresses. The family of the address in
* rq_daddr is guaranteed to be the same as the family of the
* address in rq_addr, so it's safe to use the same family for
* the source address.
*/
struct nlm_host *
nlmsvc_lookup_host(struct svc_rqst *rqstp,
const char *hostname, unsigned int hostname_len)
struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
const char *hostname,
const size_t hostname_len)
{
struct sockaddr_in ssin = {0};
struct sockaddr_in sin = {
.sin_family = AF_INET,
};
struct sockaddr_in6 sin6 = {
.sin6_family = AF_INET6,
};
struct nlm_lookup_host_info ni = {
.server = 1,
.sap = svc_addr(rqstp),
.salen = rqstp->rq_addrlen,
.protocol = rqstp->rq_prot,
.version = rqstp->rq_vers,
.hostname = hostname,
.hostname_len = hostname_len,
.src_len = rqstp->rq_addrlen,
};
dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
(int)hostname_len, hostname, rqstp->rq_vers,
(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
switch (ni.sap->sa_family) {
case AF_INET:
sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
ni.src_sap = (struct sockaddr *)&sin;
break;
case AF_INET6:
ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
ni.src_sap = (struct sockaddr *)&sin6;
break;
default:
return NULL;
}
ssin.sin_addr = rqstp->rq_daddr.addr;
return nlm_lookup_host(1, svc_addr_in(rqstp),
rqstp->rq_prot, rqstp->rq_vers,
hostname, hostname_len, &ssin);
return nlm_lookup_host(&ni);
}
/*
......@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
{
struct rpc_clnt *clnt;
dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
NIPQUAD(host->h_saddr.sin_addr),
NIPQUAD(host->h_addr.sin_addr));
dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
/* Lock host handle */
mutex_lock(&host->h_mutex);
......@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
if (time_after_eq(jiffies, host->h_nextrebind)) {
rpc_force_rebind(clnt);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
dprintk("lockd: next rebind in %ld jiffies\n",
dprintk("lockd: next rebind in %lu jiffies\n",
host->h_nextrebind - jiffies);
}
} else {
......@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
};
struct rpc_create_args args = {
.protocol = host->h_proto,
.address = (struct sockaddr *)&host->h_addr,
.addrsize = sizeof(host->h_addr),
.saddress = (struct sockaddr *)&host->h_saddr,
.address = nlm_addr(host),
.addrsize = host->h_addrlen,
.saddress = nlm_srcaddr(host),
.timeout = &timeparms,
.servername = host->h_name,
.program = &nlm_program,
......@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
struct nsm_handle *nsm;
struct nlm_host *host;
dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
hostname, NIPQUAD(sin->sin_addr));
/* Find the NSM handle for this peer */
if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
hostname, hostname_len, 0);
if (nsm == NULL) {
dprintk("lockd: never saw rebooted peer '%.*s' before\n",
hostname_len, hostname);
return;
}
dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
hostname_len, hostname, nsm->sm_addrbuf);
/* When reclaiming locks on this peer, make sure that
* we set up a new notification */
......@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
static struct nsm_handle *
__nsm_find(const struct sockaddr_in *sin,
const char *hostname, unsigned int hostname_len,
int create)
static struct nsm_handle *nsm_find(const struct sockaddr *sap,
const size_t salen,
const char *hostname,
const size_t hostname_len,
const int create)
{
struct nsm_handle *nsm = NULL;
struct nsm_handle *pos;
if (!sin)
if (!sap)
return NULL;
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
printk(KERN_WARNING "Invalid hostname \"%.*s\" "
"in NFS lock request\n",
hostname_len, hostname);
(int)hostname_len, hostname);
}
return NULL;
}
......@@ -489,7 +657,7 @@ __nsm_find(const struct sockaddr_in *sin,
if (strlen(pos->sm_name) != hostname_len
|| memcmp(pos->sm_name, hostname, hostname_len))
continue;
} else if (!nlm_cmp_addr(&pos->sm_addr, sin))
} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
continue;
atomic_inc(&pos->sm_count);
kfree(nsm);
......@@ -509,10 +677,13 @@ __nsm_find(const struct sockaddr_in *sin,
if (nsm == NULL)
return NULL;
nsm->sm_addr = *sin;
memcpy(nsm_addr(nsm), sap, salen);
nsm->sm_addrlen = salen;
nsm->sm_name = (char *) (nsm + 1);
memcpy(nsm->sm_name, hostname, hostname_len);
nsm->sm_name[hostname_len] = '\0';
nlm_display_address((struct sockaddr *)&nsm->sm_addr,
nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
atomic_set(&nsm->sm_count, 1);
goto retry;
......@@ -521,13 +692,6 @@ __nsm_find(const struct sockaddr_in *sin,
return nsm;
}
static struct nsm_handle *
nsm_find(const struct sockaddr_in *sin, const char *hostname,
unsigned int hostname_len)
{
return __nsm_find(sin, hostname, hostname_len, 1);
}
/*
* Release an NSM handle
*/
......
......@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
memset(&args, 0, sizeof(args));
args.mon_name = nsm->sm_name;
args.addr = nsm->sm_addr.sin_addr.s_addr;
args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
args.prog = NLM_PROGRAM;
args.vers = 3;
args.proc = NLMPROC_NSM_NOTIFY;
......
......@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
int nlmsvc_grace_period;
unsigned long nlmsvc_timeout;
/*
......@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
return nlm_timeout * 5 * HZ;
}
unsigned long get_nfs_grace_period(void)
{
unsigned long lockdgrace = get_lockd_grace_period();
unsigned long nfsdgrace = 0;
if (nlmsvc_ops)
nfsdgrace = nlmsvc_ops->get_grace_period();
return max(lockdgrace, nfsdgrace);
}
EXPORT_SYMBOL(get_nfs_grace_period);
static struct lock_manager lockd_manager = {
};
static unsigned long set_grace_period(void)
static void grace_ender(struct work_struct *not_used)
{
nlmsvc_grace_period = 1;
return get_nfs_grace_period() + jiffies;
locks_end_grace(&lockd_manager);
}
static inline void clear_grace_period(void)
static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
static void set_grace_period(void)
{
nlmsvc_grace_period = 0;
unsigned long grace_period = get_lockd_grace_period();
locks_start_grace(&lockd_manager);
cancel_delayed_work_sync(&grace_period_end);
schedule_delayed_work(&grace_period_end, grace_period);
}
/*
......@@ -116,7 +111,6 @@ lockd(void *vrqstp)
{
int err = 0, preverr = 0;
struct svc_rqst *rqstp = vrqstp;
unsigned long grace_period_expire;
/* try_to_freeze() is called from svc_recv() */
set_freezable();
......@@ -139,7 +133,7 @@ lockd(void *vrqstp)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
grace_period_expire = set_grace_period();
set_grace_period();
/*
* The main request loop. We don't terminate until the last
......@@ -153,21 +147,12 @@ lockd(void *vrqstp)
flush_signals(current);
if (nlmsvc_ops) {
nlmsvc_invalidate_all();
grace_period_expire = set_grace_period();
set_grace_period();
}
continue;
}
/*
* Retry any blocked locks that have been notified by
* the VFS. Don't do this during grace period.
* (Theoretically, there shouldn't even be blocked locks
* during grace period).
*/
if (!nlmsvc_grace_period) {
timeout = nlmsvc_retry_blocked();
} else if (time_before(grace_period_expire, jiffies))
clear_grace_period();
timeout = nlmsvc_retry_blocked();
/*
* Find a socket with data available and call its
......@@ -195,6 +180,7 @@ lockd(void *vrqstp)
svc_process(rqstp);
}
flush_signals(current);
cancel_delayed_work_sync(&grace_period_end);
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
......@@ -203,25 +189,28 @@ lockd(void *vrqstp)
}
/*
* Make any sockets that are needed but not present.
* If nlm_udpport or nlm_tcpport were set as module
* options, make those sockets unconditionally
* Ensure there are active UDP and TCP listeners for lockd.
*
* Even if we have only TCP NFS mounts and/or TCP NFSDs, some
* local services (such as rpc.statd) still require UDP, and
* some NFS servers do not yet support NLM over TCP.
*
* Returns zero if all listeners are available; otherwise a
* negative errno value is returned.
*/
static int make_socks(struct svc_serv *serv, int proto)
static int make_socks(struct svc_serv *serv)
{
static int warned;
struct svc_xprt *xprt;
int err = 0;
if (proto == IPPROTO_UDP || nlm_udpport) {
xprt = svc_find_xprt(serv, "udp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "udp", nlm_udpport,
SVC_SOCK_DEFAULTS);
else
svc_xprt_put(xprt);
}
if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
xprt = svc_find_xprt(serv, "udp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "udp", nlm_udpport,
SVC_SOCK_DEFAULTS);
else
svc_xprt_put(xprt);
if (err >= 0) {
xprt = svc_find_xprt(serv, "tcp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "tcp", nlm_tcpport,
......@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
/*
* Bring up the lockd process if it's not already up.
*/
int
lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
int lockd_up(void)
{
struct svc_serv *serv;
int error = 0;
......@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
/*
* Check whether we're already up and running.
*/
if (nlmsvc_rqst) {
if (proto)
error = make_socks(nlmsvc_rqst->rq_server, proto);
if (nlmsvc_rqst)
goto out;
}
/*
* Sanity check: if there's no pid,
......@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
}
if ((error = make_socks(serv, proto)) < 0)
error = make_socks(serv);
if (error < 0)
goto destroy_and_out;
/*
......
......@@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST4 called\n");
resp->cookie = argp->cookie;
/* Don't accept test requests during grace period */
if (nlmsvc_grace_period) {
resp->status = nlm_lck_denied_grace_period;
return rc;
}
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
......@@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rc;
}
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
......@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
argp->block, &argp->cookie);
argp->block, &argp->cookie,
argp->reclaim);
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
......@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
......@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
dprintk("lockd: SM_NOTIFY called\n");
if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
|| ntohs(saddr.sin_port) >= 1024) {
if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
......
......@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
__be32
nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie)
struct nlm_cookie *cookie, int reclaim)
{
struct nlm_block *block = NULL;
int error;
......@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
if (locks_in_grace() && !reclaim) {
ret = nlm_lck_denied_grace_period;
goto out;
}
if (reclaim && !locks_in_grace()) {
ret = nlm_lck_denied_grace_period;
goto out;
}
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
......@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
if (locks_in_grace()) {
ret = nlm_lck_denied_grace_period;
goto out;
}
error = vfs_test_lock(file->f_file, &lock->fl);
if (error == FILE_LOCK_DEFERRED) {
ret = nlmsvc_defer_lock_rqst(rqstp, block);
......@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
if (locks_in_grace())
return nlm_lck_denied_grace_period;
mutex_lock(&file->f_mutex);
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
......
......@@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST called\n");
resp->cookie = argp->cookie;
/* Don't accept test requests during grace period */
if (nlmsvc_grace_period) {
resp->status = nlm_lck_denied_grace_period;
return rc;
}
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
......@@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rc;
}
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
......@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
argp->block, &argp->cookie));
argp->block, &argp->cookie,
argp->reclaim));
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
......@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
......@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
if (nlmsvc_grace_period) {
if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
......@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
dprintk("lockd: SM_NOTIFY called\n");
if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
|| ntohs(saddr.sin_port) >= 1024) {
if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
......
......@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
return nlm_cmp_addr(&host->h_saddr, datap);
return nlm_cmp_addr(nlm_srcaddr(host), datap);
}
/**
......
......@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
argp->vers = *p++;
argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
......
......@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
argp->vers = *p++;
argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
......
......@@ -105,7 +105,8 @@ int nfs_callback_up(void)
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
AF_INET, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
......
......@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
static struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
.get_grace_period = get_nfs4_grace_period,
};
void
......
......@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
nfserr = fh_verify(rqstp, &resp->fh, 0,
NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
if (nfserr)
RETURN_STATUS(nfserr);
......@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
RETURN_STATUS(nfserr);
}
......@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
nfserr = fh_verify(rqstp, &argp->fh, 0,
NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
/* Check special features of the file system. May request
* different read/write sizes for file systems known to have
......
......@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
WRITE32(OP_CB_RECALL);
WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
WRITE32(cb_rec->cbr_stateid.si_generation);
WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
WRITE32(cb_rec->cbr_trunc);
WRITE32(len);
WRITEMEM(cb_rec->cbr_fhval, len);
......@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
.addrsize = sizeof(addr),
.timeout = &timeparms,
.program = &cb_program,
.prognumber = cb->cb_prog,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
......@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
addr.sin_port = htons(cb->cb_port);
addr.sin_addr.s_addr = htonl(cb->cb_addr);
/* Initialize rpc_stat */
memset(args.program->stats, 0, sizeof(struct rpc_stat));
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
......
......@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
......@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
if (nfs4_in_grace())
if (locks_in_grace())
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
......@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
& NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
......
......@@ -61,7 +61,6 @@
static time_t lease_time = 90; /* default lease time */
static time_t user_lease_time = 90;
static time_t boot_time;
static int in_grace = 1;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
static u32 current_delegid = 1;
......@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
case NFS4_OPEN_CLAIM_NULL:
/* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs.... */
if (nfs4_in_grace())
if (locks_in_grace())
goto out;
if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
goto out;
......@@ -1816,12 +1815,15 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
struct lock_manager nfsd4_manager = {
};
static void
end_grace(void)
nfsd4_end_grace(void)
{
dprintk("NFSD: end of grace period\n");
nfsd4_recdir_purge_old();
in_grace = 0;
locks_end_grace(&nfsd4_manager);
}
static time_t
......@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
if (in_grace)
end_grace();
if (locks_in_grace())
nfsd4_end_grace();
list_for_each_safe(pos, next, &client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
......@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
return nfserr_bad_stateid;
else if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
else if (nfs4_in_grace()) {
else if (locks_in_grace()) {
/* Answer in remaining cases depends on existance of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
......@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
static inline int
io_during_grace_disallowed(struct inode *inode, int flags)
{
return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
&& mandatory_lock(inode);
}
......@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
filp = lock_stp->st_vfs_file;
status = nfserr_grace;
if (nfs4_in_grace() && !lock->lk_reclaim)
if (locks_in_grace() && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
if (!nfs4_in_grace() && lock->lk_reclaim)
if (!locks_in_grace() && lock->lk_reclaim)
goto out;
locks_init_lock(&file_lock);
......@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int error;
__be32 status;
if (nfs4_in_grace())
if (locks_in_grace())
return nfserr_grace;
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
......@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
unsigned long grace_time;
boot_time = get_seconds();
grace_time = get_nfs_grace_period();
grace_time = get_nfs4_grace_period();
lease_time = user_lease_time;
in_grace = 1;
locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
......@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
return;
}
int
nfs4_in_grace(void)
{
return in_grace;
}
time_t
nfs4_lease_time(void)
{
......
......@@ -412,6 +412,18 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
goto out;
}
static __be32
nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
{
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
READ32(sid->si_generation);
COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
}
static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
{
......@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
close->cl_stateowner = NULL;
READ_BUF(4 + sizeof(stateid_t));
READ_BUF(4);
READ32(close->cl_seqid);
READ32(close->cl_stateid.si_generation);
COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
}
......@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
READ32(dr->dr_stateid.si_generation);
COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
return nfsd4_decode_stateid(argp, &dr->dr_stateid);
}
static inline __be32
......@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
READ32(lock->lk_is_new);
if (lock->lk_is_new) {
READ_BUF(36);
READ_BUF(4);
READ32(lock->lk_new_open_seqid);
READ32(lock->lk_new_open_stateid.si_generation);
COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
if (status)
return status;
READ_BUF(8 + sizeof(clientid_t));
READ32(lock->lk_new_lock_seqid);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
READ32(lock->lk_new_owner.len);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
READ_BUF(20);
READ32(lock->lk_old_lock_stateid.si_generation);
COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
if (status)
return status;
READ_BUF(4);
READ32(lock->lk_old_lock_seqid);
}
......@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
locku->lu_stateowner = NULL;
READ_BUF(24 + sizeof(stateid_t));
READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
READ32(locku->lu_seqid);
READ32(locku->lu_stateid.si_generation);
COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
if (status)
return status;
READ_BUF(16);
READ64(locku->lu_offset);
READ64(locku->lu_length);
......@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_delegate_type);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
READ_BUF(sizeof(stateid_t) + 4);
COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
if (status)
return status;
READ_BUF(4);
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
......@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
DECODE_HEAD;
open_conf->oc_stateowner = NULL;
READ_BUF(4 + sizeof(stateid_t));
READ32(open_conf->oc_req_stateid.si_generation);
COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
if (status)
return status;
READ_BUF(4);
READ32(open_conf->oc_seqid);
DECODE_TAIL;
......@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
DECODE_HEAD;
open_down->od_stateowner = NULL;
READ_BUF(12 + sizeof(stateid_t));
READ32(open_down->od_stateid.si_generation);
COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
if (status)
return status;
READ_BUF(12);
READ32(open_down->od_seqid);
READ32(open_down->od_share_access);
READ32(open_down->od_share_deny);
......@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
READ_BUF(sizeof(stateid_t) + 12);
READ32(read->rd_stateid.si_generation);
COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &read->rd_stateid);
if (status)
return status;
READ_BUF(12);
READ64(read->rd_offset);
READ32(read->rd_length);
......@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
READ32(setattr->sa_stateid.si_generation);
COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
goto out;
__be32 status;
DECODE_TAIL;
status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
if (status)
return status;
return nfsd4_decode_fattr(argp, setattr->sa_bmval,
&setattr->sa_iattr, &setattr->sa_acl);
}
static __be32
......@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
int len;
DECODE_HEAD;
READ_BUF(sizeof(stateid_opaque_t) + 20);
READ32(write->wr_stateid.si_generation);
COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
status = nfsd4_decode_stateid(argp, &write->wr_stateid);
if (status)
return status;
READ_BUF(16);
READ64(write->wr_offset);
READ32(write->wr_stable_how);
if (write->wr_stable_how > 2)
......@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* Header routine to setup seqid operation replay cache
*/
#define ENCODE_SEQID_OP_HEAD \
__be32 *p; \
__be32 *save; \
\
save = resp->p;
......@@ -1950,6 +1962,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return -EINVAL;
}
static void
nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
{
ENCODE_HEAD;
RESERVE_SPACE(sizeof(stateid_t));
WRITE32(sid->si_generation);
WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
}
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
......@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
{
ENCODE_SEQID_OP_HEAD;
if (!nfserr) {
RESERVE_SPACE(sizeof(stateid_t));
WRITE32(close->cl_stateid.si_generation);
WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
}
if (!nfserr)
nfsd4_encode_stateid(resp, &close->cl_stateid);
ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
return nfserr;
}
......@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
{
ENCODE_SEQID_OP_HEAD;
if (!nfserr) {
RESERVE_SPACE(4 + sizeof(stateid_t));
WRITE32(lock->lk_resp_stateid.si_generation);
WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
} else if (nfserr == nfserr_denied)
if (!nfserr)
nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
......@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
{
ENCODE_SEQID_OP_HEAD;
if (!nfserr) {
RESERVE_SPACE(sizeof(stateid_t));
WRITE32(locku->lu_stateid.si_generation);
WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
}
if (!nfserr)
nfsd4_encode_stateid(resp, &locku->lu_stateid);
ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
return nfserr;
}
......@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
ENCODE_HEAD;
ENCODE_SEQID_OP_HEAD;
if (nfserr)
goto out;
RESERVE_SPACE(36 + sizeof(stateid_t));
WRITE32(open->op_stateid.si_generation);
WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
nfsd4_encode_stateid(resp, &open->op_stateid);
RESERVE_SPACE(40);
WRITECINFO(open->op_cinfo);
WRITE32(open->op_rflags);
WRITE32(2);
......@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
RESERVE_SPACE(20 + sizeof(stateid_t));
WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
RESERVE_SPACE(20);
WRITE32(open->op_recall);
/*
......@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
ADJUST_ARGS();
break;
case NFS4_OPEN_DELEGATE_WRITE:
RESERVE_SPACE(32 + sizeof(stateid_t));
WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
RESERVE_SPACE(32);
WRITE32(0);
/*
......@@ -2195,13 +2208,9 @@ static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
ENCODE_SEQID_OP_HEAD;
if (!nfserr) {
RESERVE_SPACE(sizeof(stateid_t));
WRITE32(oc->oc_resp_stateid.si_generation);
WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
}
if (!nfserr)
nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
return nfserr;
......@@ -2211,13 +2220,9 @@ static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
ENCODE_SEQID_OP_HEAD;
if (!nfserr) {
RESERVE_SPACE(sizeof(stateid_t));
WRITE32(od->od_stateid.si_generation);
WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
ADJUST_ARGS();
}
if (!nfserr)
nfsd4_encode_stateid(resp, &od->od_stateid);
ENCODE_SEQID_OP_TAIL(od->od_stateowner);
return nfserr;
......
......@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
return -EINVAL;
err = nfsd_create_serv();
if (!err) {
int proto = 0;
err = svc_addsock(nfsd_serv, fd, buf, &proto);
err = svc_addsock(nfsd_serv, fd, buf);
if (err >= 0) {
err = lockd_up(proto);
err = lockd_up();
if (err < 0)
svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
}
......
......@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
if (error)
goto out;
if (!(access & NFSD_MAY_LOCK)) {
/*
* pseudoflavor restrictions are not enforced on NLM,
* which clients virtually always use auth_sys for,
* even while using RPCSEC_GSS for NFS.
*/
error = check_nfsd_access(exp, rqstp);
if (error)
goto out;
}
/*
* pseudoflavor restrictions are not enforced on NLM,
* which clients virtually always use auth_sys for,
* even while using RPCSEC_GSS for NFS.
*/
if (access & NFSD_MAY_LOCK)
goto skip_pseudoflavor_check;
/*
* Clients may expect to be able to use auth_sys during mount,
* even if they use gss for everything else; see section 2.3.2
* of rfc 2623.
*/
if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
&& exp->ex_path.dentry == dentry)
goto skip_pseudoflavor_check;
error = check_nfsd_access(exp, rqstp);
if (error)
goto out;
skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
......
......@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
nfserr = fh_verify(rqstp, &resp->fh, 0,
NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
return nfsd_return_attrs(nfserr, resp);
}
......@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
return nfserr;
}
......
......@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
AF_INET,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
......@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
error = lockd_up(IPPROTO_UDP);
if (error >= 0) {
error = svc_create_xprt(nfsd_serv, "udp", port,
error = svc_create_xprt(nfsd_serv, "udp", port,
SVC_SOCK_DEFAULTS);
if (error < 0)
lockd_down();
}
if (error < 0)
return error;
error = lockd_up(IPPROTO_TCP);
if (error >= 0) {
error = svc_create_xprt(nfsd_serv, "tcp", port,
error = svc_create_xprt(nfsd_serv, "tcp", port,
SVC_SOCK_DEFAULTS);
if (error < 0)
lockd_down();
}
if (error < 0)
return error;
error = lockd_up();
if (error < 0)
return error;
return 0;
}
......
......@@ -83,7 +83,6 @@ struct raparm_hbucket {
spinlock_t pb_lock;
} ____cacheline_aligned_in_smp;
static struct raparms * raparml;
#define RAPARM_HASH_BITS 4
#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
......@@ -1866,9 +1865,9 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
* N.B. After this call fhp needs an fh_put
*/
__be32
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{
__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err && vfs_statfs(fhp->fh_dentry,stat))
err = nfserr_io;
return err;
......@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
void
nfsd_racache_shutdown(void)
{
if (!raparml)
return;
struct raparms *raparm, *last_raparm;
unsigned int i;
dprintk("nfsd: freeing readahead buffers.\n");
kfree(raparml);
raparml = NULL;
for (i = 0; i < RAPARM_HASH_SIZE; i++) {
raparm = raparm_hash[i].pb_head;
while(raparm) {
last_raparm = raparm;
raparm = raparm->p_next;
kfree(last_raparm);
}
raparm_hash[i].pb_head = NULL;
}
}
/*
* Initialize readahead param cache
......@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
int i;
int j = 0;
int nperbucket;
struct raparms **raparm = NULL;
if (raparml)
if (raparm_hash[0].pb_head)
return 0;
if (cache_size < 2*RAPARM_HASH_SIZE)
cache_size = 2*RAPARM_HASH_SIZE;
raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
if (!raparml) {
printk(KERN_WARNING
"nfsd: Could not allocate memory read-ahead cache.\n");
return -ENOMEM;
}
nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
if (nperbucket < 2)
nperbucket = 2;
cache_size = nperbucket * RAPARM_HASH_SIZE;
dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
raparm_hash[i].pb_head = NULL;
for (i = 0; i < RAPARM_HASH_SIZE; i++) {
spin_lock_init(&raparm_hash[i].pb_lock);
}
nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
for (i = 0; i < cache_size - 1; i++) {
if (i % nperbucket == 0)
raparm_hash[j++].pb_head = raparml + i;
if (i % nperbucket < nperbucket-1)
raparml[i].p_next = raparml + i + 1;
raparm = &raparm_hash[i].pb_head;
for (j = 0; j < nperbucket; j++) {
*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
if (!*raparm)
goto out_nomem;
raparm = &(*raparm)->p_next;
}
*raparm = NULL;
}
nfsdstats.ra_size = cache_size;
return 0;
out_nomem:
dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
nfsd_racache_shutdown();
return -ENOMEM;
}
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
......
......@@ -683,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
#ifdef CONFIG_FILE_LOCKING
static int locks_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &locks_seq_operations);
......@@ -694,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
.llseek = seq_lseek,
.release = seq_release,
};
#endif /* CONFIG_FILE_LOCKING */
static int execdomains_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
......@@ -887,7 +889,9 @@ void __init proc_misc_init(void)
#ifdef CONFIG_PRINTK
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
#endif
#ifdef CONFIG_FILE_LOCKING
proc_create("locks", 0, NULL, &proc_locks_operations);
#endif
proc_create("devices", 0, NULL, &proc_devinfo_operations);
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
#ifdef CONFIG_BLOCK
......
......@@ -947,6 +947,14 @@ struct lock_manager_operations {
int (*fl_change)(struct file_lock **, int);
};
struct lock_manager {
struct list_head list;
};
void locks_start_grace(struct lock_manager *);
void locks_end_grace(struct lock_manager *);
int locks_in_grace(void);
/* that will die - we need it for nfs_lock_info */
#include <linux/nfs_fs_i.h>
......@@ -988,6 +996,13 @@ struct file_lock {
#include <linux/fcntl.h>
extern void send_sigio(struct fown_struct *fown, int fd, int band);
/* fs/sync.c */
extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
loff_t endbyte, unsigned int flags);
#ifdef CONFIG_FILE_LOCKING
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
struct flock __user *);
......@@ -998,14 +1013,9 @@ extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 __user *);
#endif
extern void send_sigio(struct fown_struct *fown, int fd, int band);
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);
/* fs/sync.c */
extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
loff_t endbyte, unsigned int flags);
/* fs/locks.c */
extern void locks_init_lock(struct file_lock *);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
......@@ -1028,6 +1038,37 @@ extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
extern struct seq_operations locks_seq_operations;
#else /* !CONFIG_FILE_LOCKING */
#define fcntl_getlk(a, b) ({ -EINVAL; })
#define fcntl_setlk(a, b, c, d) ({ -EACCES; })
#if BITS_PER_LONG == 32
#define fcntl_getlk64(a, b) ({ -EINVAL; })
#define fcntl_setlk64(a, b, c, d) ({ -EACCES; })
#endif
#define fcntl_setlease(a, b, c) ({ 0; })
#define fcntl_getlease(a) ({ 0; })
#define locks_init_lock(a) ({ })
#define __locks_copy_lock(a, b) ({ })
#define locks_copy_lock(a, b) ({ })
#define locks_remove_posix(a, b) ({ })
#define locks_remove_flock(a) ({ })
#define posix_test_lock(a, b) ({ 0; })
#define posix_lock_file(a, b, c) ({ -ENOLCK; })
#define posix_lock_file_wait(a, b) ({ -ENOLCK; })
#define posix_unblock_lock(a, b) (-ENOENT)
#define vfs_test_lock(a, b) ({ 0; })
#define vfs_lock_file(a, b, c, d) (-ENOLCK)
#define vfs_cancel_lock(a, b) ({ 0; })
#define flock_lock_file_wait(a, b) ({ -ENOLCK; })
#define __break_lease(a, b) ({ 0; })
#define lease_get_mtime(a, b) ({ })
#define generic_setlease(a, b, c) ({ -EINVAL; })
#define vfs_setlease(a, b, c) ({ -EINVAL; })
#define lease_modify(a, b) ({ -EINVAL; })
#define lock_may_read(a, b, c) ({ 1; })
#define lock_may_write(a, b, c) ({ 1; })
#endif /* !CONFIG_FILE_LOCKING */
struct fasync_struct {
int magic;
......@@ -1575,9 +1616,12 @@ extern int vfs_statfs(struct dentry *, struct kstatfs *);
/* /sys/fs */
extern struct kobject *fs_kobj;
extern int rw_verify_area(int, struct file *, loff_t *, size_t);
#define FLOCK_VERIFY_READ 1
#define FLOCK_VERIFY_WRITE 2
#ifdef CONFIG_FILE_LOCKING
extern int locks_mandatory_locked(struct inode *);
extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
......@@ -1608,8 +1652,6 @@ static inline int locks_verify_locked(struct inode *inode)
return 0;
}
extern int rw_verify_area(int, struct file *, loff_t *, size_t);
static inline int locks_verify_truncate(struct inode *inode,
struct file *filp,
loff_t size)
......@@ -1630,6 +1672,15 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
return __break_lease(inode, mode);
return 0;
}
#else /* !CONFIG_FILE_LOCKING */
#define locks_mandatory_locked(a) ({ 0; })
#define locks_mandatory_area(a, b, c, d, e) ({ 0; })
#define __mandatory_lock(a) ({ 0; })
#define mandatory_lock(a) ({ 0; })
#define locks_verify_locked(a) ({ 0; })
#define locks_verify_truncate(a, b, c) ({ 0; })
#define break_lease(a, b) ({ 0; })
#endif /* CONFIG_FILE_LOCKING */
/* fs/open.c */
......
......@@ -27,7 +27,6 @@ struct nlmsvc_binding {
struct nfs_fh *,
struct file **);
void (*fclose)(struct file *);
unsigned long (*get_grace_period)(void);
};
extern struct nlmsvc_binding * nlmsvc_ops;
......@@ -53,15 +52,7 @@ extern void nlmclnt_done(struct nlm_host *host);
extern int nlmclnt_proc(struct nlm_host *host, int cmd,
struct file_lock *fl);
extern int lockd_up(int proto);
extern int lockd_up(void);
extern void lockd_down(void);
unsigned long get_nfs_grace_period(void);
#ifdef CONFIG_NFSD_V4
unsigned long get_nfs4_grace_period(void);
#else
static inline unsigned long get_nfs4_grace_period(void) {return 0;}
#endif
#endif /* LINUX_LOCKD_BIND_H */
......@@ -12,6 +12,8 @@
#ifdef __KERNEL__
#include <linux/in.h>
#include <linux/in6.h>
#include <net/ipv6.h>
#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/utsname.h>
......@@ -38,8 +40,9 @@
*/
struct nlm_host {
struct hlist_node h_hash; /* doubly linked list */
struct sockaddr_in h_addr; /* peer address */
struct sockaddr_in h_saddr; /* our address (optional) */
struct sockaddr_storage h_addr; /* peer address */
size_t h_addrlen;
struct sockaddr_storage h_srcaddr; /* our address (optional) */
struct rpc_clnt * h_rpcclnt; /* RPC client to talk to peer */
char * h_name; /* remote hostname */
u32 h_version; /* interface version */
......@@ -61,17 +64,55 @@ struct nlm_host {
struct list_head h_granted; /* Locks in GRANTED state */
struct list_head h_reclaim; /* Locks in RECLAIM state */
struct nsm_handle * h_nsmhandle; /* NSM status handle */
char h_addrbuf[48], /* address eyecatchers */
h_srcaddrbuf[48];
};
struct nsm_handle {
struct list_head sm_link;
atomic_t sm_count;
char * sm_name;
struct sockaddr_in sm_addr;
struct sockaddr_storage sm_addr;
size_t sm_addrlen;
unsigned int sm_monitored : 1,
sm_sticky : 1; /* don't unmonitor */
char sm_addrbuf[48]; /* address eyecatcher */
};
/*
* Rigorous type checking on sockaddr type conversions
*/
static inline struct sockaddr_in *nlm_addr_in(const struct nlm_host *host)
{
return (struct sockaddr_in *)&host->h_addr;
}
static inline struct sockaddr *nlm_addr(const struct nlm_host *host)
{
return (struct sockaddr *)&host->h_addr;
}
static inline struct sockaddr_in *nlm_srcaddr_in(const struct nlm_host *host)
{
return (struct sockaddr_in *)&host->h_srcaddr;
}
static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host)
{
return (struct sockaddr *)&host->h_srcaddr;
}
static inline struct sockaddr_in *nsm_addr_in(const struct nsm_handle *handle)
{
return (struct sockaddr_in *)&handle->sm_addr;
}
static inline struct sockaddr *nsm_addr(const struct nsm_handle *handle)
{
return (struct sockaddr *)&handle->sm_addr;
}
/*
* Map an fl_owner_t into a unique 32-bit "pid"
*/
......@@ -166,7 +207,8 @@ int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *);
struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
void nlmclnt_finish_block(struct nlm_wait *block);
int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *);
__be32 nlmclnt_grant(const struct sockaddr *addr,
const struct nlm_lock *lock);
void nlmclnt_recovery(struct nlm_host *);
int nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
void nlmclnt_next_cookie(struct nlm_cookie *);
......@@ -174,12 +216,14 @@ void nlmclnt_next_cookie(struct nlm_cookie *);
/*
* Host cache
*/
struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
int proto, u32 version,
struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
const size_t salen,
const unsigned short protocol,
const u32 version,
const char *hostname);
struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
const char *hostname,
unsigned int hostname_len);
struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *,
unsigned int);
const size_t hostname_len);
struct rpc_clnt * nlm_bind_host(struct nlm_host *);
void nlm_rebind_host(struct nlm_host *);
struct nlm_host * nlm_get_host(struct nlm_host *);
......@@ -201,7 +245,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref);
*/
__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
struct nlm_host *, struct nlm_lock *, int,
struct nlm_cookie *);
struct nlm_cookie *, int);
__be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *);
__be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
struct nlm_host *, struct nlm_lock *,
......@@ -233,15 +277,82 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
return file->f_file->f_path.dentry->d_inode;
}
static inline int __nlm_privileged_request4(const struct sockaddr *sap)
{
const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
return (sin->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) &&
(ntohs(sin->sin_port) < 1024);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static inline int __nlm_privileged_request6(const struct sockaddr *sap)
{
const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
return (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK) &&
(ntohs(sin6->sin6_port) < 1024);
}
#else /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
static inline int __nlm_privileged_request6(const struct sockaddr *sap)
{
return 0;
}
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
/*
* Compare two host addresses (needs modifying for ipv6)
* Ensure incoming requests are from local privileged callers.
*
* Return TRUE if sender is local and is connecting via a privileged port;
* otherwise return FALSE.
*/
static inline int nlm_cmp_addr(const struct sockaddr_in *sin1,
const struct sockaddr_in *sin2)
static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
{
const struct sockaddr *sap = svc_addr(rqstp);
switch (sap->sa_family) {
case AF_INET:
return __nlm_privileged_request4(sap);
case AF_INET6:
return __nlm_privileged_request6(sap);
default:
return 0;
}
}
static inline int __nlm_cmp_addr4(const struct sockaddr *sap1,
const struct sockaddr *sap2)
{
const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
}
static inline int __nlm_cmp_addr6(const struct sockaddr *sap1,
const struct sockaddr *sap2)
{
const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
}
/*
* Compare two host addresses
*
* Return TRUE if the addresses are the same; otherwise FALSE.
*/
static inline int nlm_cmp_addr(const struct sockaddr *sap1,
const struct sockaddr *sap2)
{
if (sap1->sa_family == sap2->sa_family) {
switch (sap1->sa_family) {
case AF_INET:
return __nlm_cmp_addr4(sap1, sap2);
case AF_INET6:
return __nlm_cmp_addr6(sap1, sap2);
}
}
return 0;
}
/*
* Compare two NLM locks.
* When the second lock is of type F_UNLCK, this acts like a wildcard.
......
......@@ -81,8 +81,6 @@ struct nlm_reboot {
unsigned int len;
u32 state;
__be32 addr;
__be32 vers;
__be32 proto;
};
/*
......
......@@ -38,6 +38,7 @@
#define NFSD_MAY_LOCK 32
#define NFSD_MAY_OWNER_OVERRIDE 64
#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
......@@ -125,7 +126,7 @@ int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t *, struct readdir_cd *, filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *);
struct kstatfs *, int access);
int nfsd_notify_change(struct inode *, struct iattr *);
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
......
......@@ -104,6 +104,7 @@ struct rpc_create_args {
const struct rpc_timeout *timeout;
char *servername;
struct rpc_program *program;
u32 prognumber; /* overrides program->number */
u32 version;
rpc_authflavor_t authflavor;
unsigned long flags;
......@@ -124,10 +125,10 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
void rpc_shutdown_client(struct rpc_clnt *);
void rpc_release_client(struct rpc_clnt *);
int rpcb_register(u32, u32, int, unsigned short, int *);
int rpcb_register(u32, u32, int, unsigned short);
int rpcb_v4_register(const u32 program, const u32 version,
const struct sockaddr *address,
const char *netid, int *result);
const char *netid);
int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int);
void rpcb_getport_async(struct rpc_task *);
......
......@@ -66,6 +66,7 @@ struct svc_serv {
struct list_head sv_tempsocks; /* all temporary sockets */
int sv_tmpcnt; /* count of temporary sockets */
struct timer_list sv_temptimer; /* timer for aging temporary sockets */
sa_family_t sv_family; /* listener's address family */
char * sv_name; /* service name */
......@@ -265,17 +266,17 @@ struct svc_rqst {
/*
* Rigorous type checking on sockaddr type conversions
*/
static inline struct sockaddr_in *svc_addr_in(struct svc_rqst *rqst)
static inline struct sockaddr_in *svc_addr_in(const struct svc_rqst *rqst)
{
return (struct sockaddr_in *) &rqst->rq_addr;
}
static inline struct sockaddr_in6 *svc_addr_in6(struct svc_rqst *rqst)
static inline struct sockaddr_in6 *svc_addr_in6(const struct svc_rqst *rqst)
{
return (struct sockaddr_in6 *) &rqst->rq_addr;
}
static inline struct sockaddr *svc_addr(struct svc_rqst *rqst)
static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst)
{
return (struct sockaddr *) &rqst->rq_addr;
}
......@@ -381,18 +382,20 @@ struct svc_procedure {
/*
* Function prototypes.
*/
struct svc_serv * svc_create(struct svc_program *, unsigned int,
void (*shutdown)(struct svc_serv*));
struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t,
void (*shutdown)(struct svc_serv *));
struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
struct svc_pool *pool);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
void (*shutdown)(struct svc_serv*), svc_thread_fn,
struct module *);
sa_family_t, void (*shutdown)(struct svc_serv *),
svc_thread_fn, struct module *);
int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
void svc_destroy(struct svc_serv *);
int svc_process(struct svc_rqst *);
int svc_register(struct svc_serv *, int, unsigned short);
int svc_register(const struct svc_serv *, const unsigned short,
const unsigned short);
void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space);
struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
......
......@@ -72,6 +72,7 @@ extern atomic_t rdma_stat_sq_prod;
*/
struct svc_rdma_op_ctxt {
struct svc_rdma_op_ctxt *read_hdr;
struct svc_rdma_fastreg_mr *frmr;
int hdr_count;
struct xdr_buf arg;
struct list_head dto_q;
......@@ -103,16 +104,30 @@ struct svc_rdma_chunk_sge {
int start; /* sge no for this chunk */
int count; /* sge count for this chunk */
};
struct svc_rdma_fastreg_mr {
struct ib_mr *mr;
void *kva;
struct ib_fast_reg_page_list *page_list;
int page_list_len;
unsigned long access_flags;
unsigned long map_len;
enum dma_data_direction direction;
struct list_head frmr_list;
};
struct svc_rdma_req_map {
struct svc_rdma_fastreg_mr *frmr;
unsigned long count;
union {
struct kvec sge[RPCSVC_MAXPAGES];
struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
};
};
#define RDMACTXT_F_FAST_UNREG 1
#define RDMACTXT_F_LAST_CTXT 2
#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */
#define SVCRDMA_DEVCAP_READ_W_INV 2 /* read w/ invalidate */
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
......@@ -136,6 +151,11 @@ struct svcxprt_rdma {
struct ib_cq *sc_rq_cq;
struct ib_cq *sc_sq_cq;
struct ib_mr *sc_phys_mr; /* MR for server memory */
u32 sc_dev_caps; /* distilled device caps */
u32 sc_dma_lkey; /* local dma key */
unsigned int sc_frmr_pg_list_len;
struct list_head sc_frmr_q;
spinlock_t sc_frmr_q_lock;
spinlock_t sc_lock; /* transport lock */
......@@ -192,8 +212,13 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *);
extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *);
extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
struct svc_rdma_fastreg_mr *);
extern void svc_sq_reap(struct svcxprt_rdma *);
extern void svc_rq_reap(struct svcxprt_rdma *);
extern struct svc_xprt_class svc_rdma_class;
......
......@@ -39,10 +39,7 @@ int svc_send(struct svc_rqst *);
void svc_drop(struct svc_rqst *);
void svc_sock_update_bufs(struct svc_serv *serv);
int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
int svc_addsock(struct svc_serv *serv,
int fd,
char *name_return,
int *proto);
int svc_addsock(struct svc_serv *serv, int fd, char *name_return);
void svc_init_xprt_sock(void);
void svc_cleanup_xprt_sock(void);
......
......@@ -125,6 +125,7 @@ cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
cond_syscall(sys_flock);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
......
......@@ -96,7 +96,7 @@ static int sixty = 60;
static int neg_one = -1;
#endif
#ifdef CONFIG_MMU
#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
static int two = 2;
#endif
......@@ -1248,6 +1248,7 @@ static struct ctl_table fs_table[] = {
.extra1 = &minolduid,
.extra2 = &maxolduid,
},
#ifdef CONFIG_FILE_LOCKING
{
.ctl_name = FS_LEASES,
.procname = "leases-enable",
......@@ -1256,6 +1257,7 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_DNOTIFY
{
.ctl_name = FS_DIR_NOTIFY,
......@@ -1267,6 +1269,7 @@ static struct ctl_table fs_table[] = {
},
#endif
#ifdef CONFIG_MMU
#ifdef CONFIG_FILE_LOCKING
{
.ctl_name = FS_LEASE_TIME,
.procname = "lease-break-time",
......@@ -1278,6 +1281,7 @@ static struct ctl_table fs_table[] = {
.extra1 = &zero,
.extra2 = &two,
},
#endif
{
.procname = "aio-nr",
.data = &aio_nr,
......
......@@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
clnt->cl_procinfo = version->procs;
clnt->cl_maxproc = version->nrprocs;
clnt->cl_protname = program->name;
clnt->cl_prog = program->number;
clnt->cl_prog = args->prognumber ? : program->number;
clnt->cl_vers = version->number;
clnt->cl_stats = program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
......
......@@ -20,6 +20,7 @@
#include <linux/in6.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ipv6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
......@@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
}
static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
u32 version, struct rpc_message *msg,
int *result)
u32 version, struct rpc_message *msg)
{
struct rpc_clnt *rpcb_clnt;
int error = 0;
int result, error = 0;
*result = 0;
msg->rpc_resp = &result;
rpcb_clnt = rpcb_create_local(addr, addrlen, version);
if (!IS_ERR(rpcb_clnt)) {
......@@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
} else
error = PTR_ERR(rpcb_clnt);
if (error < 0)
if (error < 0) {
printk(KERN_WARNING "RPC: failed to contact local rpcbind "
"server (errno %d).\n", -error);
dprintk("RPC: registration status %d/%d\n", error, *result);
return error;
}
return error;
if (!result)
return -EACCES;
return 0;
}
/**
......@@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* @vers: RPC version number to bind
* @prot: transport protocol to register
* @port: port value to register
* @okay: OUT: result code
*
* Returns zero if the registration request was dispatched successfully
* and the rpcbind daemon returned success. Otherwise, returns an errno
* value that reflects the nature of the error (request could not be
* dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
......@@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* all registered transports for [program, version] from the local
* rpcbind database.
*
* Returns zero if the registration request was dispatched
* successfully and a reply was received. The rpcbind daemon's
* boolean result code is stored in *okay.
*
* Returns an errno value and sets *result to zero if there was
* some problem that prevented the rpcbind request from being
* dispatched, or if the rpcbind daemon did not respond within
* the timeout.
*
* This function uses rpcbind protocol version 2 to contact the
* local rpcbind daemon.
*
......@@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
* addresses).
*/
int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
{
struct rpcbind_args map = {
.r_prog = prog,
......@@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
};
struct rpc_message msg = {
.rpc_argp = &map,
.rpc_resp = okay,
};
dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
......@@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
RPCBVERS_2, &msg, okay);
RPCBVERS_2, &msg);
}
/*
......@@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
RPCBVERS_4, msg, msg->rpc_resp);
RPCBVERS_4, msg);
}
/*
......@@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
char buf[64];
/* Construct AF_INET6 universal address */
snprintf(buf, sizeof(buf),
NIP6_FMT".%u.%u",
NIP6(address_to_register->sin6_addr),
port >> 8, port & 0xff);
if (ipv6_addr_any(&address_to_register->sin6_addr))
snprintf(buf, sizeof(buf), "::.%u.%u",
port >> 8, port & 0xff);
else
snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u",
NIP6(address_to_register->sin6_addr),
port >> 8, port & 0xff);
map->r_addr = buf;
dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
......@@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
sizeof(rpcb_in6addr_loopback),
RPCBVERS_4, msg, msg->rpc_resp);
RPCBVERS_4, msg);
}
/**
......@@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* @version: RPC version number of service to (un)register
* @address: address family, IP address, and port to (un)register
* @netid: netid of transport protocol to (un)register
* @result: result code from rpcbind RPC call
*
* Returns zero if the registration request was dispatched successfully
* and the rpcbind daemon returned success. Otherwise, returns an errno
* value that reflects the nature of the error (request could not be
* dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
......@@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* to zero. Callers pass a netid of "" to unregister all
* transport netids associated with [program, version, address].
*
* Returns zero if the registration request was dispatched
* successfully and a reply was received. The rpcbind daemon's
* result code is stored in *result.
*
* Returns an errno value and sets *result to zero if there was
* some problem that prevented the rpcbind request from being
* dispatched, or if the rpcbind daemon did not respond within
* the timeout.
*
* This function uses rpcbind protocol version 4 to contact the
* local rpcbind daemon. The local rpcbind daemon must support
* version 4 of the rpcbind protocol in order for these functions
......@@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* advertises the service on all IPv4 and IPv6 addresses.
*/
int rpcb_v4_register(const u32 program, const u32 version,
const struct sockaddr *address, const char *netid,
int *result)
const struct sockaddr *address, const char *netid)
{
struct rpcbind_args map = {
.r_prog = program,
......@@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version,
};
struct rpc_message msg = {
.rpc_argp = &map,
.rpc_resp = result,
};
*result = 0;
switch (address->sa_family) {
case AF_INET:
return rpcb_register_netid4((struct sockaddr_in *)address,
......@@ -633,7 +624,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n",
dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
......@@ -648,7 +639,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p,
unsigned short *portp)
{
*portp = (unsigned short) ntohl(*p++);
dprintk("RPC: rpcb_decode_getport result %u\n",
dprintk("RPC: rpcb getport result: %u\n",
*portp);
return 0;
}
......@@ -657,7 +648,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
unsigned int *boolp)
{
*boolp = (unsigned int) ntohl(*p++);
dprintk("RPC: rpcb_decode_set: call %s\n",
dprintk("RPC: rpcb set/unset call %s\n",
(*boolp ? "succeeded" : "failed"));
return 0;
}
......@@ -665,7 +656,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n",
dprintk("RPC: encoding rpcb request (%u, %u, %s)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_addr);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
......
......@@ -28,6 +28,8 @@
#define RPCDBG_FACILITY RPCDBG_SVCDSP
static void svc_unregister(const struct svc_serv *serv);
#define svc_serv_is_pooled(serv) ((serv)->sv_function)
/*
......@@ -357,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
void (*shutdown)(struct svc_serv *serv))
sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
struct svc_serv *serv;
unsigned int vers;
......@@ -366,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
serv->sv_family = family;
serv->sv_name = prog->pg_name;
serv->sv_program = prog;
serv->sv_nrthreads = 1;
......@@ -416,30 +419,29 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
spin_lock_init(&pool->sp_lock);
}
/* Remove any stale portmap registrations */
svc_register(serv, 0, 0);
svc_unregister(serv);
return serv;
}
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
void (*shutdown)(struct svc_serv *serv))
sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
return __svc_create(prog, bufsize, /*npools*/1, shutdown);
return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
}
EXPORT_SYMBOL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
void (*shutdown)(struct svc_serv *serv),
sa_family_t family, void (*shutdown)(struct svc_serv *serv),
svc_thread_fn func, struct module *mod)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
serv = __svc_create(prog, bufsize, npools, shutdown);
serv = __svc_create(prog, bufsize, npools, family, shutdown);
if (serv != NULL) {
serv->sv_function = func;
......@@ -486,8 +488,7 @@ svc_destroy(struct svc_serv *serv)
if (svc_serv_is_pooled(serv))
svc_pool_map_put();
/* Unregister service with the portmapper */
svc_register(serv, 0, 0);
svc_unregister(serv);
kfree(serv->sv_pools);
kfree(serv);
}
......@@ -718,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL(svc_exit_thread);
#ifdef CONFIG_SUNRPC_REGISTER_V4
/*
* Register an RPC service with the local portmapper.
* To unregister a service, call this routine with
* proto and port == 0.
* Register an "inet" protocol family netid with the local
* rpcbind daemon via an rpcbind v4 SET request.
*
* No netconfig infrastructure is available in the kernel, so
* we map IP_ protocol numbers to netids by hand.
*
* Returns zero on success; a negative errno value is returned
* if any error occurs.
*/
int
svc_register(struct svc_serv *serv, int proto, unsigned short port)
static int __svc_rpcb_register4(const u32 program, const u32 version,
const unsigned short protocol,
const unsigned short port)
{
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
char *netid;
switch (protocol) {
case IPPROTO_UDP:
netid = RPCBIND_NETID_UDP;
break;
case IPPROTO_TCP:
netid = RPCBIND_NETID_TCP;
break;
default:
return -EPROTONOSUPPORT;
}
return rpcb_v4_register(program, version,
(struct sockaddr *)&sin, netid);
}
/*
* Register an "inet6" protocol family netid with the local
* rpcbind daemon via an rpcbind v4 SET request.
*
* No netconfig infrastructure is available in the kernel, so
* we map IP_ protocol numbers to netids by hand.
*
* Returns zero on success; a negative errno value is returned
* if any error occurs.
*/
static int __svc_rpcb_register6(const u32 program, const u32 version,
const unsigned short protocol,
const unsigned short port)
{
struct sockaddr_in6 sin6 = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = htons(port),
};
char *netid;
switch (protocol) {
case IPPROTO_UDP:
netid = RPCBIND_NETID_UDP6;
break;
case IPPROTO_TCP:
netid = RPCBIND_NETID_TCP6;
break;
default:
return -EPROTONOSUPPORT;
}
return rpcb_v4_register(program, version,
(struct sockaddr *)&sin6, netid);
}
/*
* Register a kernel RPC service via rpcbind version 4.
*
* Returns zero on success; a negative errno value is returned
* if any error occurs.
*/
static int __svc_register(const u32 program, const u32 version,
const sa_family_t family,
const unsigned short protocol,
const unsigned short port)
{
int error;
switch (family) {
case AF_INET:
return __svc_rpcb_register4(program, version,
protocol, port);
case AF_INET6:
error = __svc_rpcb_register6(program, version,
protocol, port);
if (error < 0)
return error;
/*
* Work around bug in some versions of Linux rpcbind
* which don't allow registration of both inet and
* inet6 netids.
*
* Error return ignored for now.
*/
__svc_rpcb_register4(program, version,
protocol, port);
return 0;
}
return -EAFNOSUPPORT;
}
#else /* CONFIG_SUNRPC_REGISTER_V4 */
/*
* Register a kernel RPC service via rpcbind version 2.
*
* Returns zero on success; a negative errno value is returned
* if any error occurs.
*/
static int __svc_register(const u32 program, const u32 version,
sa_family_t family,
const unsigned short protocol,
const unsigned short port)
{
if (family != AF_INET)
return -EAFNOSUPPORT;
return rpcb_register(program, version, protocol, port);
}
#endif /* CONFIG_SUNRPC_REGISTER_V4 */
/**
* svc_register - register an RPC service with the local portmapper
* @serv: svc_serv struct for the service to register
* @proto: transport protocol number to advertise
* @port: port to advertise
*
* Service is registered for any address in serv's address family
*/
int svc_register(const struct svc_serv *serv, const unsigned short proto,
const unsigned short port)
{
struct svc_program *progp;
unsigned long flags;
unsigned int i;
int error = 0, dummy;
int error = 0;
if (!port)
clear_thread_flag(TIF_SIGPENDING);
BUG_ON(proto == 0 && port == 0);
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
dprintk("svc: svc_register(%s, %s, %d, %d)%s\n",
dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
progp->pg_name,
i,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
i,
serv->sv_family,
progp->pg_vers[i]->vs_hidden?
" (but not telling portmap)" : "");
if (progp->pg_vers[i]->vs_hidden)
continue;
error = rpcb_register(progp->pg_prog, i, proto, port, &dummy);
error = __svc_register(progp->pg_prog, i,
serv->sv_family, proto, port);
if (error < 0)
break;
if (port && !dummy) {
error = -EACCES;
break;
}
}
}
if (!port) {
spin_lock_irqsave(&current->sighand->siglock, flags);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, flags);
return error;
}
#ifdef CONFIG_SUNRPC_REGISTER_V4
static void __svc_unregister(const u32 program, const u32 version,
const char *progname)
{
struct sockaddr_in6 sin6 = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = 0,
};
int error;
error = rpcb_v4_register(program, version,
(struct sockaddr *)&sin6, "");
dprintk("svc: %s(%sv%u), error %d\n",
__func__, progname, version, error);
}
#else /* CONFIG_SUNRPC_REGISTER_V4 */
static void __svc_unregister(const u32 program, const u32 version,
const char *progname)
{
int error;
error = rpcb_register(program, version, 0, 0);
dprintk("svc: %s(%sv%u), error %d\n",
__func__, progname, version, error);
}
#endif /* CONFIG_SUNRPC_REGISTER_V4 */
/*
* All netids, bind addresses and ports registered for [program, version]
* are removed from the local rpcbind database (if the service is not
* hidden) to make way for a new instance of the service.
*
* The result of unregistration is reported via dprintk for those who want
* verification of the result, but is otherwise not important.
*/
static void svc_unregister(const struct svc_serv *serv)
{
struct svc_program *progp;
unsigned long flags;
unsigned int i;
clear_thread_flag(TIF_SIGPENDING);
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
if (progp->pg_vers[i]->vs_hidden)
continue;
__svc_unregister(progp->pg_prog, i, progp->pg_name);
}
}
return error;
spin_lock_irqsave(&current->sighand->siglock, flags);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
/*
......
......@@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
int flags)
static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
struct svc_serv *serv,
unsigned short port, int flags)
{
struct svc_xprt_class *xcl;
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
struct sockaddr_in6 sin6 = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = htons(port),
};
struct sockaddr *sap;
size_t len;
switch (serv->sv_family) {
case AF_INET:
sap = (struct sockaddr *)&sin;
len = sizeof(sin);
break;
case AF_INET6:
sap = (struct sockaddr *)&sin6;
len = sizeof(sin6);
break;
default:
return ERR_PTR(-EAFNOSUPPORT);
}
return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
}
int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
int flags)
{
struct svc_xprt_class *xcl;
dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
......@@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
goto err;
spin_unlock(&svc_xprt_class_lock);
newxprt = xcl->xcl_ops->
xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
flags);
newxprt = __svc_xpo_create(xcl, serv, port, flags);
if (IS_ERR(newxprt)) {
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
......
......@@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
int val;
dprintk("svc: svc_setup_socket %p\n", sock);
if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
......@@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
/*
* We start one listener per sv_serv. We want AF_INET
* requests to be automatically shunted to our AF_INET6
* listener using a mapped IPv4 address. Make sure
* no-one starts an equivalent IPv4 listener, which
* would steal our incoming connections.
*/
val = 0;
if (serv->sv_family == AF_INET6)
kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
(char *)&val, sizeof(val));
dprintk("svc: svc_setup_socket created %p (inet %p)\n",
svsk, svsk->sk_sk);
......@@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
int svc_addsock(struct svc_serv *serv,
int fd,
char *name_return,
int *proto)
char *name_return)
{
int err = 0;
struct socket *so = sockfd_lookup(fd, &err);
......@@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv,
sockfd_put(so);
return err;
}
if (proto) *proto = so->sk->sk_protocol;
return one_sock_name(name_return, svsk);
}
EXPORT_SYMBOL_GPL(svc_addsock);
......
......@@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
*
* Assumptions:
* - chunk[0]->position points to pages[0] at an offset of 0
* - pages[] is not physically or virtually contigous and consists of
* - pages[] is not physically or virtually contiguous and consists of
* PAGE_SIZE elements.
*
* Output:
......@@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
* chunk in the read list
*
*/
static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
static int map_read_chunks(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
struct rpcrdma_msg *rmsgp,
......@@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
return sge_no;
}
static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *ctxt,
struct kvec *vec,
u64 *sgl_offset,
int count)
/* Map a read-chunk-list to an XDR and fast register the page-list.
*
* Assumptions:
* - chunk[0] position points to pages[0] at an offset of 0
* - pages[] will be made physically contiguous by creating a one-off memory
* region using the fastreg verb.
* - byte_count is # of bytes in read-chunk-list
* - ch_count is # of chunks in read-chunk-list
*
* Output:
* - sge array pointing into pages[] array.
* - chunk_sge array specifying sge index and count for each
* chunk in the read list
*/
static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
struct rpcrdma_msg *rmsgp,
struct svc_rdma_req_map *rpl_map,
struct svc_rdma_req_map *chl_map,
int ch_count,
int byte_count)
{
int page_no;
int ch_no;
u32 offset;
struct rpcrdma_read_chunk *ch;
struct svc_rdma_fastreg_mr *frmr;
int ret = 0;
frmr = svc_rdma_get_frmr(xprt);
if (IS_ERR(frmr))
return -ENOMEM;
head->frmr = frmr;
head->arg.head[0] = rqstp->rq_arg.head[0];
head->arg.tail[0] = rqstp->rq_arg.tail[0];
head->arg.pages = &head->pages[head->count];
head->hdr_count = head->count; /* save count of hdr pages */
head->arg.page_base = 0;
head->arg.page_len = byte_count;
head->arg.len = rqstp->rq_arg.len + byte_count;
head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
/* Fast register the page list */
frmr->kva = page_address(rqstp->rq_arg.pages[0]);
frmr->direction = DMA_FROM_DEVICE;
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
frmr->map_len = byte_count;
frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
frmr->page_list->page_list[page_no] =
ib_dma_map_single(xprt->sc_cm_id->device,
page_address(rqstp->rq_arg.pages[page_no]),
PAGE_SIZE, DMA_TO_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
frmr->page_list->page_list[page_no]))
goto fatal_err;
atomic_inc(&xprt->sc_dma_used);
head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
}
head->count += page_no;
/* rq_respages points one past arg pages */
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
/* Create the reply and chunk maps */
offset = 0;
ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
for (ch_no = 0; ch_no < ch_count; ch_no++) {
rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
chl_map->ch[ch_no].count = 1;
chl_map->ch[ch_no].start = ch_no;
offset += ch->rc_target.rs_length;
ch++;
}
ret = svc_rdma_fastreg(xprt, frmr);
if (ret)
goto fatal_err;
return ch_no;
fatal_err:
printk("svcrdma: error fast registering xdr for xprt %p", xprt);
svc_rdma_put_frmr(xprt, frmr);
return -EIO;
}
static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *ctxt,
struct svc_rdma_fastreg_mr *frmr,
struct kvec *vec,
u64 *sgl_offset,
int count)
{
int i;
ctxt->count = count;
ctxt->direction = DMA_FROM_DEVICE;
for (i = 0; i < count; i++) {
atomic_inc(&xprt->sc_dma_used);
ctxt->sge[i].addr =
ib_dma_map_single(xprt->sc_cm_id->device,
vec[i].iov_base, vec[i].iov_len,
DMA_FROM_DEVICE);
ctxt->sge[i].length = 0; /* in case map fails */
if (!frmr) {
ctxt->sge[i].addr =
ib_dma_map_single(xprt->sc_cm_id->device,
vec[i].iov_base,
vec[i].iov_len,
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
ctxt->sge[i].addr))
return -EINVAL;
ctxt->sge[i].lkey = xprt->sc_dma_lkey;
atomic_inc(&xprt->sc_dma_used);
} else {
ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
ctxt->sge[i].lkey = frmr->mr->lkey;
}
ctxt->sge[i].length = vec[i].iov_len;
ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
*sgl_offset = *sgl_offset + vec[i].iov_len;
}
return 0;
}
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
......@@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *hdr_ctxt)
{
struct ib_send_wr read_wr;
struct ib_send_wr inv_wr;
int err = 0;
int ch_no;
int ch_count;
......@@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
if (ch_count > RPCSVC_MAXPAGES)
return -EINVAL;
sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
rpl_map, chl_map,
ch_count, byte_count);
if (!xprt->sc_frmr_pg_list_len)
sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
rpl_map, chl_map, ch_count,
byte_count);
else
sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
rpl_map, chl_map, ch_count,
byte_count);
if (sge_count < 0) {
err = -EIO;
goto out;
}
sgl_offset = 0;
ch_no = 0;
......@@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
next_sge:
ctxt = svc_rdma_get_context(xprt);
ctxt->direction = DMA_FROM_DEVICE;
ctxt->frmr = hdr_ctxt->frmr;
ctxt->read_hdr = NULL;
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare READ WR */
memset(&read_wr, 0, sizeof read_wr);
ctxt->wr_op = IB_WR_RDMA_READ;
read_wr.wr_id = (unsigned long)ctxt;
read_wr.opcode = IB_WR_RDMA_READ;
ctxt->wr_op = read_wr.opcode;
read_wr.send_flags = IB_SEND_SIGNALED;
read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
read_wr.wr.rdma.remote_addr =
......@@ -327,10 +444,15 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
read_wr.sg_list = ctxt->sge;
read_wr.num_sge =
rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
rdma_set_ctxt_sge(xprt, ctxt,
&rpl_map->sge[chl_map->ch[ch_no].start],
&sgl_offset,
read_wr.num_sge);
err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
&rpl_map->sge[chl_map->ch[ch_no].start],
&sgl_offset,
read_wr.num_sge);
if (err) {
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 0);
goto out;
}
if (((ch+1)->rc_discrim == 0) &&
(read_wr.num_sge == chl_map->ch[ch_no].count)) {
/*
......@@ -339,6 +461,29 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
* the client and the RPC needs to be enqueued.
*/
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
if (hdr_ctxt->frmr) {
set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/*
* Invalidate the local MR used to map the data
* sink.
*/
if (xprt->sc_dev_caps &
SVCRDMA_DEVCAP_READ_W_INV) {
read_wr.opcode =
IB_WR_RDMA_READ_WITH_INV;
ctxt->wr_op = read_wr.opcode;
read_wr.ex.invalidate_rkey =
ctxt->frmr->mr->lkey;
} else {
/* Prepare INVALIDATE WR */
memset(&inv_wr, 0, sizeof inv_wr);
inv_wr.opcode = IB_WR_LOCAL_INV;
inv_wr.send_flags = IB_SEND_SIGNALED;
inv_wr.ex.invalidate_rkey =
hdr_ctxt->frmr->mr->lkey;
read_wr.next = &inv_wr;
}
}
ctxt->read_hdr = hdr_ctxt;
}
/* Post the read */
......
......@@ -69,9 +69,127 @@
* array is only concerned with the reply we are assured that we have
* on extra page for the RPCRMDA header.
*/
static void xdr_to_sge(struct svcxprt_rdma *xprt,
struct xdr_buf *xdr,
struct svc_rdma_req_map *vec)
int fast_reg_xdr(struct svcxprt_rdma *xprt,
struct xdr_buf *xdr,
struct svc_rdma_req_map *vec)
{
int sge_no;
u32 sge_bytes;
u32 page_bytes;
u32 page_off;
int page_no = 0;
u8 *frva;
struct svc_rdma_fastreg_mr *frmr;
frmr = svc_rdma_get_frmr(xprt);
if (IS_ERR(frmr))
return -ENOMEM;
vec->frmr = frmr;
/* Skip the RPCRDMA header */
sge_no = 1;
/* Map the head. */
frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
vec->count = 2;
sge_no++;
/* Build the FRMR */
frmr->kva = frva;
frmr->direction = DMA_TO_DEVICE;
frmr->access_flags = 0;
frmr->map_len = PAGE_SIZE;
frmr->page_list_len = 1;
frmr->page_list->page_list[page_no] =
ib_dma_map_single(xprt->sc_cm_id->device,
(void *)xdr->head[0].iov_base,
PAGE_SIZE, DMA_TO_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
frmr->page_list->page_list[page_no]))
goto fatal_err;
atomic_inc(&xprt->sc_dma_used);
page_off = xdr->page_base;
page_bytes = xdr->page_len + page_off;
if (!page_bytes)
goto encode_tail;
/* Map the pages */
vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
vec->sge[sge_no].iov_len = page_bytes;
sge_no++;
while (page_bytes) {
struct page *page;
page = xdr->pages[page_no++];
sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
page_bytes -= sge_bytes;
frmr->page_list->page_list[page_no] =
ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
PAGE_SIZE, DMA_TO_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
frmr->page_list->page_list[page_no]))
goto fatal_err;
atomic_inc(&xprt->sc_dma_used);
page_off = 0; /* reset for next time through loop */
frmr->map_len += PAGE_SIZE;
frmr->page_list_len++;
}
vec->count++;
encode_tail:
/* Map tail */
if (0 == xdr->tail[0].iov_len)
goto done;
vec->count++;
vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
/*
* If head and tail use the same page, we don't need
* to map it again.
*/
vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
} else {
void *va;
/* Map another page for the tail */
page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
frmr->page_list->page_list[page_no] =
ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
frmr->page_list->page_list[page_no]))
goto fatal_err;
atomic_inc(&xprt->sc_dma_used);
frmr->map_len += PAGE_SIZE;
frmr->page_list_len++;
}
done:
if (svc_rdma_fastreg(xprt, frmr))
goto fatal_err;
return 0;
fatal_err:
printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
svc_rdma_put_frmr(xprt, frmr);
return -EIO;
}
static int map_xdr(struct svcxprt_rdma *xprt,
struct xdr_buf *xdr,
struct svc_rdma_req_map *vec)
{
int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
int sge_no;
......@@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
BUG_ON(xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
if (xprt->sc_frmr_pg_list_len)
return fast_reg_xdr(xprt, xdr, vec);
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
......@@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
BUG_ON(sge_no > sge_max);
vec->count = sge_no;
return 0;
}
/* Assumptions:
* - We are using FRMR
* - or -
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
......@@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_no = 0;
/* Copy the remaining SGE */
while (bc != 0 && xdr_sge_no < vec->count) {
sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
sge_bytes = min((size_t)bc,
(size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
while (bc != 0) {
sge_bytes = min_t(size_t,
bc, vec->sge[xdr_sge_no].iov_len-sge_off);
sge[sge_no].length = sge_bytes;
atomic_inc(&xprt->sc_dma_used);
sge[sge_no].addr =
ib_dma_map_single(xprt->sc_cm_id->device,
(void *)
vec->sge[xdr_sge_no].iov_base + sge_off,
sge_bytes, DMA_TO_DEVICE);
if (dma_mapping_error(xprt->sc_cm_id->device->dma_device,
sge[sge_no].addr))
goto err;
if (!vec->frmr) {
sge[sge_no].addr =
ib_dma_map_single(xprt->sc_cm_id->device,
(void *)
vec->sge[xdr_sge_no].iov_base + sge_off,
sge_bytes, DMA_TO_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
sge[sge_no].addr))
goto err;
atomic_inc(&xprt->sc_dma_used);
sge[sge_no].lkey = xprt->sc_dma_lkey;
} else {
sge[sge_no].addr = (unsigned long)
vec->sge[xdr_sge_no].iov_base + sge_off;
sge[sge_no].lkey = vec->frmr->mr->lkey;
}
ctxt->count++;
ctxt->frmr = vec->frmr;
sge_off = 0;
sge_no++;
ctxt->count++;
xdr_sge_no++;
BUG_ON(xdr_sge_no > vec->count);
bc -= sge_bytes;
}
BUG_ON(bc != 0);
BUG_ON(xdr_sge_no > vec->count);
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
ctxt->wr_op = IB_WR_RDMA_WRITE;
......@@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
max_write = xprt->sc_max_sge * PAGE_SIZE;
if (vec->frmr)
max_write = vec->frmr->map_len;
else
max_write = xprt->sc_max_sge * PAGE_SIZE;
/* Write chunks start at the pagelist */
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
......@@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
max_write = xprt->sc_max_sge * PAGE_SIZE;
if (vec->frmr)
max_write = vec->frmr->map_len;
else
max_write = xprt->sc_max_sge * PAGE_SIZE;
/* xdr offset starts at RPC message */
for (xdr_off = 0, chunk_no = 0;
......@@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
ch = &arg_ary->wc_array[chunk_no].wc_target;
write_len = min(xfer_len, ch->rs_length);
/* Prepare the reply chunk given the length actually
* written */
rs_offset = get_unaligned(&(ch->rs_offset));
......@@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int byte_count)
{
struct ib_send_wr send_wr;
struct ib_send_wr inv_wr;
int sge_no;
int sge_bytes;
int page_no;
......@@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the context */
ctxt->pages[0] = page;
ctxt->count = 1;
ctxt->frmr = vec->frmr;
if (vec->frmr)
set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
else
clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare the SGE for the RPCRDMA Header */
atomic_inc(&rdma->sc_dma_used);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device,
page, 0, PAGE_SIZE, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
ctxt->direction = DMA_TO_DEVICE;
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
ctxt->sge[0].lkey = rdma->sc_dma_lkey;
/* Determine how many of our SGE are to be transmitted */
for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
atomic_inc(&rdma->sc_dma_used);
ctxt->sge[sge_no].addr =
ib_dma_map_single(rdma->sc_cm_id->device,
vec->sge[sge_no].iov_base,
sge_bytes, DMA_TO_DEVICE);
if (!vec->frmr) {
ctxt->sge[sge_no].addr =
ib_dma_map_single(rdma->sc_cm_id->device,
vec->sge[sge_no].iov_base,
sge_bytes, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device,
ctxt->sge[sge_no].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
} else {
ctxt->sge[sge_no].addr = (unsigned long)
vec->sge[sge_no].iov_base;
ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
}
ctxt->sge[sge_no].length = sge_bytes;
ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
}
BUG_ON(byte_count != 0);
......@@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
/* If there are more pages than SGE, terminate SGE list */
/*
* If there are more pages than SGE, terminate SGE
* list so that svc_rdma_unmap_dma doesn't attempt to
* unmap garbage.
*/
if (page_no+1 >= sge_no)
ctxt->sge[page_no+1].length = 0;
}
BUG_ON(sge_no > rdma->sc_max_sge);
BUG_ON(sge_no > ctxt->count);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
send_wr.wr_id = (unsigned long)ctxt;
......@@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.num_sge = sge_no;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
if (vec->frmr) {
/* Prepare INVALIDATE WR */
memset(&inv_wr, 0, sizeof inv_wr);
inv_wr.opcode = IB_WR_LOCAL_INV;
inv_wr.send_flags = IB_SEND_SIGNALED;
inv_wr.ex.invalidate_rkey =
vec->frmr->mr->lkey;
send_wr.next = &inv_wr;
}
ret = svc_rdma_send(rdma, &send_wr);
if (ret)
svc_rdma_put_context(ctxt, 1);
goto err;
return ret;
return 0;
err:
svc_rdma_put_frmr(rdma, vec->frmr);
svc_rdma_put_context(ctxt, 1);
return -EIO;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
......@@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
vec = svc_rdma_get_req_map();
xdr_to_sge(rdma, &rqstp->rq_res, vec);
ret = map_xdr(rdma, &rqstp->rq_res, vec);
if (ret)
goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
......@@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
ret);
goto error;
goto err1;
}
inline_bytes -= ret;
......@@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
ret);
goto error;
goto err1;
}
inline_bytes -= ret;
......@@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
svc_rdma_put_req_map(vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
error:
err1:
put_page(res_page);
err0:
svc_rdma_put_req_map(vec);
svc_rdma_put_context(ctxt, 0);
put_page(res_page);
return ret;
}
......@@ -100,20 +100,29 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
ctxt->xprt = xprt;
INIT_LIST_HEAD(&ctxt->dto_q);
ctxt->count = 0;
ctxt->frmr = NULL;
atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
}
static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_single(xprt->sc_cm_id->device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the sc_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_single(xprt->sc_cm_id->device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
}
}
}
......@@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
schedule_timeout_uninterruptible(msecs_to_jiffies(500));
}
map->count = 0;
map->frmr = NULL;
return map;
}
......@@ -315,6 +325,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
svc_xprt_enqueue(&xprt->sc_xprt);
}
/*
* Processs a completion context
*/
static void process_context(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *ctxt)
{
svc_rdma_unmap_dma(ctxt);
switch (ctxt->wr_op) {
case IB_WR_SEND:
if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
svc_rdma_put_frmr(xprt, ctxt->frmr);
svc_rdma_put_context(ctxt, 1);
break;
case IB_WR_RDMA_WRITE:
svc_rdma_put_context(ctxt, 0);
break;
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
BUG_ON(!read_hdr);
if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
svc_rdma_put_frmr(xprt, ctxt->frmr);
spin_lock_bh(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
list_add_tail(&read_hdr->dto_q,
&xprt->sc_read_complete_q);
spin_unlock_bh(&xprt->sc_rq_dto_lock);
svc_xprt_enqueue(&xprt->sc_xprt);
}
svc_rdma_put_context(ctxt, 0);
break;
default:
printk(KERN_ERR "svcrdma: unexpected completion type, "
"opcode=%d\n",
ctxt->wr_op);
break;
}
}
/*
* Send Queue Completion Handler - potentially called on interrupt context.
*
......@@ -327,17 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
struct ib_cq *cq = xprt->sc_sq_cq;
int ret;
if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
return;
ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_sq_poll);
while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
xprt = ctxt->xprt;
svc_rdma_unmap_dma(ctxt);
if (wc.status != IB_WC_SUCCESS)
/* Close the transport */
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
......@@ -346,35 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
atomic_dec(&xprt->sc_sq_count);
wake_up(&xprt->sc_send_wait);
switch (ctxt->wr_op) {
case IB_WR_SEND:
svc_rdma_put_context(ctxt, 1);
break;
case IB_WR_RDMA_WRITE:
svc_rdma_put_context(ctxt, 0);
break;
case IB_WR_RDMA_READ:
if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
BUG_ON(!read_hdr);
spin_lock_bh(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
list_add_tail(&read_hdr->dto_q,
&xprt->sc_read_complete_q);
spin_unlock_bh(&xprt->sc_rq_dto_lock);
svc_xprt_enqueue(&xprt->sc_xprt);
}
svc_rdma_put_context(ctxt, 0);
break;
ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
if (ctxt)
process_context(xprt, ctxt);
default:
printk(KERN_ERR "svcrdma: unexpected completion type, "
"opcode=%d, status=%d\n",
wc.opcode, wc.status);
break;
}
svc_xprt_put(&xprt->sc_xprt);
}
......@@ -425,10 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
cma_xprt->sc_ord = svcrdma_ord;
......@@ -462,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
struct page *page;
unsigned long pa;
dma_addr_t pa;
int sge_no;
int buflen;
int ret;
......@@ -474,13 +500,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
BUG_ON(sge_no >= xprt->sc_max_sge);
page = svc_rdma_get_page();
ctxt->pages[sge_no] = page;
atomic_inc(&xprt->sc_dma_used);
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
goto err_put_ctxt;
atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
buflen += PAGE_SIZE;
}
ctxt->count = sge_no;
......@@ -496,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
svc_rdma_put_context(ctxt, 1);
}
return ret;
err_put_ctxt:
svc_rdma_put_context(ctxt, 1);
return -ENOMEM;
}
/*
......@@ -566,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event=%d\n", cma_id, cma_id->context, event->event);
handle_connect_req(cma_id,
event->param.conn.responder_resources);
event->param.conn.initiator_depth);
break;
case RDMA_CM_EVENT_ESTABLISHED:
......@@ -686,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
return ERR_PTR(ret);
}
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
struct ib_mr *mr;
struct ib_fast_reg_page_list *pl;
struct svc_rdma_fastreg_mr *frmr;
frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
if (!frmr)
goto err;
mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
if (!mr)
goto err_free_frmr;
pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
RPCSVC_MAXPAGES);
if (!pl)
goto err_free_mr;
frmr->mr = mr;
frmr->page_list = pl;
INIT_LIST_HEAD(&frmr->frmr_list);
return frmr;
err_free_mr:
ib_dereg_mr(mr);
err_free_frmr:
kfree(frmr);
err:
return ERR_PTR(-ENOMEM);
}
static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
{
struct svc_rdma_fastreg_mr *frmr;
while (!list_empty(&xprt->sc_frmr_q)) {
frmr = list_entry(xprt->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
ib_dereg_mr(frmr->mr);
ib_free_fast_reg_page_list(frmr->page_list);
kfree(frmr);
}
}
struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
{
struct svc_rdma_fastreg_mr *frmr = NULL;
spin_lock_bh(&rdma->sc_frmr_q_lock);
if (!list_empty(&rdma->sc_frmr_q)) {
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
frmr->map_len = 0;
frmr->page_list_len = 0;
}
spin_unlock_bh(&rdma->sc_frmr_q_lock);
if (frmr)
return frmr;
return rdma_alloc_frmr(rdma);
}
static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
struct svc_rdma_fastreg_mr *frmr)
{
int page_no;
for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
dma_addr_t addr = frmr->page_list->page_list[page_no];
if (ib_dma_mapping_error(frmr->mr->device, addr))
continue;
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE,
frmr->direction);
}
}
void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
struct svc_rdma_fastreg_mr *frmr)
{
if (frmr) {
frmr_unmap_dma(rdma, frmr);
spin_lock_bh(&rdma->sc_frmr_q_lock);
BUG_ON(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
spin_unlock_bh(&rdma->sc_frmr_q_lock);
}
}
/*
* This is the xpo_recvfrom function for listening endpoints. Its
* purpose is to accept incoming connections. The CMA callback handler
......@@ -704,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
int dma_mr_acc;
int need_dma_mr;
int ret;
int i;
......@@ -819,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
/* Register all of physical memory */
newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE);
if (IS_ERR(newxprt->sc_phys_mr)) {
dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
/*
* Use the most secure set of MR resources based on the
* transport type and available memory management features in
* the device. Here's the table implemented below:
*
* Fast Global DMA Remote WR
* Reg LKEY MR Access
* Sup'd Sup'd Needed Needed
*
* IWARP N N Y Y
* N Y Y Y
* Y N Y N
* Y Y N -
*
* IB N N Y N
* N Y N -
* Y N Y N
* Y Y N -
*
* NB: iWARP requires remote write access for the data sink
* of an RDMA_READ. IB does not.
*/
if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
newxprt->sc_frmr_pg_list_len =
devattr.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
}
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
case RDMA_TRANSPORT_IWARP:
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
need_dma_mr = 1;
dma_mr_acc =
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE);
} else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
need_dma_mr = 1;
dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
} else
need_dma_mr = 0;
break;
case RDMA_TRANSPORT_IB:
if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
need_dma_mr = 1;
dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
} else
need_dma_mr = 0;
break;
default:
goto errout;
}
/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
if (need_dma_mr) {
/* Register all of physical memory */
newxprt->sc_phys_mr =
ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
if (IS_ERR(newxprt->sc_phys_mr)) {
dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
ret);
goto errout;
}
newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
} else
newxprt->sc_dma_lkey =
newxprt->sc_cm_id->device->local_dma_lkey;
/* Post receive buffers */
for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt);
......@@ -961,6 +1148,9 @@ static void __svc_rdma_free(struct work_struct *work)
WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
/* De-allocate fastreg mr */
rdma_dealloc_frmr_q(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_destroy_qp(rdma->sc_qp);
......@@ -1014,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
return 1;
}
/*
* Attempt to register the kvec representing the RPC memory with the
* device.
*
* Returns:
* NULL : The device does not support fastreg or there were no more
* fastreg mr.
* frmr : The kvec register request was successfully posted.
* <0 : An error was encountered attempting to register the kvec.
*/
int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
struct svc_rdma_fastreg_mr *frmr)
{
struct ib_send_wr fastreg_wr;
u8 key;
/* Bump the key */
key = (u8)(frmr->mr->lkey & 0x000000FF);
ib_update_fast_reg_key(frmr->mr, ++key);
/* Prepare FASTREG WR */
memset(&fastreg_wr, 0, sizeof fastreg_wr);
fastreg_wr.opcode = IB_WR_FAST_REG_MR;
fastreg_wr.send_flags = IB_SEND_SIGNALED;
fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
fastreg_wr.wr.fast_reg.length = frmr->map_len;
fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
return svc_rdma_send(xprt, &fastreg_wr);
}
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr;
struct ib_send_wr *bad_wr, *n_wr;
int wr_count;
int i;
int ret;
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
wr->opcode);
wr_count = 1;
for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
wr_count++;
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
spin_lock_bh(&xprt->sc_lock);
if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
spin_unlock_bh(&xprt->sc_lock);
atomic_inc(&rdma_stat_sq_starve);
......@@ -1043,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
return 0;
continue;
}
/* Bumped used SQ WR count and post */
svc_xprt_get(&xprt->sc_xprt);
/* Take a transport ref for each WR posted */
for (i = 0; i < wr_count; i++)
svc_xprt_get(&xprt->sc_xprt);
/* Bump used SQ WR count and post */
atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (!ret)
atomic_inc(&xprt->sc_sq_count);
else {
svc_xprt_put(&xprt->sc_xprt);
if (ret) {
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
atomic_sub(wr_count, &xprt->sc_sq_count);
for (i = 0; i < wr_count; i ++)
svc_xprt_put(&xprt->sc_xprt);
dprintk("svcrdma: failed to post SQ WR rc=%d, "
"sc_sq_count=%d, sc_sq_depth=%d\n",
ret, atomic_read(&xprt->sc_sq_count),
xprt->sc_sq_depth);
}
spin_unlock_bh(&xprt->sc_lock);
if (ret)
wake_up(&xprt->sc_send_wait);
break;
}
return ret;
......@@ -1079,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
/* Prepare SGE for local address */
atomic_inc(&xprt->sc_dma_used);
sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
sge.lkey = xprt->sc_phys_mr->lkey;
if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
put_page(p);
return;
}
atomic_inc(&xprt->sc_dma_used);
sge.lkey = xprt->sc_dma_lkey;
sge.length = length;
ctxt = svc_rdma_get_context(xprt);
......@@ -1103,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
if (ret) {
dprintk("svcrdma: Error %d posting send for protocol error\n",
ret);
ib_dma_unmap_page(xprt->sc_cm_id->device,
sge.addr, PAGE_SIZE,
DMA_FROM_DEVICE);
svc_rdma_put_context(ctxt, 1);
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment