Commit 8b306a2e authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'nfsd-4.6-1' of git://linux-nfs.org/~bfields/linux

Pull more nfsd updates from Bruce Fields:
 "Apologies for the previous request, which omitted the top 8 commits
  from my for-next branch (including the SCSI layout commits).  Thanks
  to Trond for spotting my error!"

This actually includes the new layout types, so here's that part of
the pull message repeated:

 "Support for a new pnfs layout type from Christoph Hellwig.  The new
  layout type is a variant of the block layout which uses SCSI features
  to offer improved fencing and device identification.

  Note this pull request also includes the client side of SCSI layout,
  with Trond's permission"

* tag 'nfsd-4.6-1' of git://linux-nfs.org/~bfields/linux:
  nfsd: use short read as well as i_size to set eof
  nfsd: better layoutupdate bounds-checking
  nfsd: block and scsi layout drivers need to depend on CONFIG_BLOCK
  nfsd: add SCSI layout support
  nfsd: move some blocklayout code
  nfsd: add a new config option for the block layout driver
  nfs/blocklayout: add SCSI layout support
  nfs4.h: add SCSI layout definitions
parents 70c5eb84 ac503e4a
pNFS SCSI layout server user guide
==================================
This document describes support for pNFS SCSI layouts in the Linux NFS server.
With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
which in addition to handling all the metadata access to the NFS export,
also hands out layouts to the clients so that they can directly access the
underlying SCSI LUNs that are shared with the client.
To use pNFS SCSI layouts with with the Linux NFS server, the exported file
system needs to support the pNFS SCSI layouts (currently just XFS), and the
file system must sit on a SCSI LUN that is accessible to the clients in
addition to the MDS. As of now the file system needs to sit directly on the
exported LUN, striping or concatenation of LUNs on the MDS and clients
is not supported yet.
On a server built with CONFIG_NFSD_SCSI, the pNFS SCSI volume support is
automatically enabled if the file system is exported using the "pnfs"
option and the underlying SCSI device support persistent reservations.
On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
enabled, and the file system is mounted using the NFSv4.1 protocol
version (mount -o vers=4.1).
......@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
kfree(bl);
}
static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
gfp_t gfp_flags)
static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
gfp_t gfp_flags, bool is_scsi_layout)
{
struct pnfs_block_layout *bl;
......@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
bl->bl_scsi_layout = is_scsi_layout;
return &bl->bl_layout;
}
static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
gfp_t gfp_flags)
{
return __bl_alloc_layout_hdr(inode, gfp_flags, false);
}
static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
gfp_t gfp_flags)
{
return __bl_alloc_layout_hdr(inode, gfp_flags, true);
}
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
dprintk("%s enter\n", __func__);
......@@ -889,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.sync = pnfs_generic_sync,
};
static struct pnfs_layoutdriver_type scsilayout_type = {
.id = LAYOUT_SCSI,
.name = "LAYOUT_SCSI",
.owner = THIS_MODULE,
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_READ_WHOLE_PAGE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = sl_alloc_layout_hdr,
.free_layout_hdr = bl_free_layout_hdr,
.alloc_lseg = bl_alloc_lseg,
.free_lseg = bl_free_lseg,
.return_range = bl_return_range,
.prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
.alloc_deviceid_node = bl_alloc_deviceid_node,
.free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
.sync = pnfs_generic_sync,
};
static int __init nfs4blocklayout_init(void)
{
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
ret = pnfs_register_layoutdriver(&blocklayout_type);
ret = bl_init_pipefs();
if (ret)
goto out;
ret = bl_init_pipefs();
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out_unregister;
goto out_cleanup_pipe;
ret = pnfs_register_layoutdriver(&scsilayout_type);
if (ret)
goto out_unregister_block;
return 0;
out_unregister:
out_unregister_block:
pnfs_unregister_layoutdriver(&blocklayout_type);
out_cleanup_pipe:
bl_cleanup_pipefs();
out:
return ret;
}
......@@ -914,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&scsilayout_type);
pnfs_unregister_layoutdriver(&blocklayout_type);
bl_cleanup_pipefs();
}
MODULE_ALIAS("nfs-layouttype4-3");
......
......@@ -55,7 +55,6 @@ struct pnfs_block_dev;
*/
#define PNFS_BLOCK_UUID_LEN 128
struct pnfs_block_volume {
enum pnfs_block_volume_type type;
union {
......@@ -82,6 +81,13 @@ struct pnfs_block_volume {
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} stripe;
struct {
enum scsi_code_set code_set;
enum scsi_designator_type designator_type;
int designator_len;
u8 designator[256];
u64 pr_key;
} scsi;
};
};
......@@ -106,6 +112,9 @@ struct pnfs_block_dev {
struct block_device *bdev;
u64 disk_offset;
u64 pr_key;
bool pr_registered;
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
......@@ -131,6 +140,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_rw;
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
bool bl_scsi_layout;
};
static inline struct pnfs_block_layout *
......@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
dev_t bl_resolve_deviceid(struct nfs_server *server,
struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
void __exit bl_cleanup_pipefs(void);
void bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
/*
* Copyright (c) 2014 Christoph Hellwig.
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/blkdev.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_xdr.h>
#include <linux/pr.h>
#include "blocklayout.h"
......@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
dev->bdev->bd_disk->fops->pr_ops;
int error;
error = ops->pr_register(dev->bdev, dev->pr_key, 0,
false);
if (error)
pr_err("failed to unregister PR key.\n");
}
if (dev->bdev)
blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
......@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
for (i = 0; i < b->stripe.volumes_count; i++)
b->stripe.volumes[i] = be32_to_cpup(p++);
break;
case PNFS_BLOCK_VOLUME_SCSI:
p = xdr_inline_decode(xdr, 4 + 4 + 4);
if (!p)
return -EIO;
b->scsi.code_set = be32_to_cpup(p++);
b->scsi.designator_type = be32_to_cpup(p++);
b->scsi.designator_len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->scsi.designator_len);
if (!p)
return -EIO;
if (b->scsi.designator_len > 256)
return -EIO;
memcpy(&b->scsi.designator, p, b->scsi.designator_len);
p = xdr_inline_decode(xdr, 8);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->scsi.pr_key);
break;
default:
dprintk("unknown volume type!\n");
return -EIO;
......@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
}
static bool
bl_validate_designator(struct pnfs_block_volume *v)
{
switch (v->scsi.designator_type) {
case PS_DESIGNATOR_EUI64:
if (v->scsi.code_set != PS_CODE_SET_BINARY)
return false;
if (v->scsi.designator_len != 8 &&
v->scsi.designator_len != 10 &&
v->scsi.designator_len != 16)
return false;
return true;
case PS_DESIGNATOR_NAA:
if (v->scsi.code_set != PS_CODE_SET_BINARY)
return false;
if (v->scsi.designator_len != 8 &&
v->scsi.designator_len != 16)
return false;
return true;
case PS_DESIGNATOR_T10:
case PS_DESIGNATOR_NAME:
pr_err("pNFS: unsupported designator "
"(code set %d, type %d, len %d.\n",
v->scsi.code_set,
v->scsi.designator_type,
v->scsi.designator_len);
return false;
default:
pr_err("pNFS: invalid designator "
"(code set %d, type %d, len %d.\n",
v->scsi.code_set,
v->scsi.designator_type,
v->scsi.designator_len);
return false;
}
}
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
const struct pr_ops *ops;
const char *devname;
int error;
if (!bl_validate_designator(v))
return -EINVAL;
switch (v->scsi.designator_len) {
case 8:
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
v->scsi.designator);
break;
case 12:
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
v->scsi.designator);
break;
case 16:
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
v->scsi.designator);
break;
default:
return -EINVAL;
}
d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
if (IS_ERR(d->bdev)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
devname, PTR_ERR(d->bdev));
kfree(devname);
return PTR_ERR(d->bdev);
}
kfree(devname);
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
d->bdev->bd_disk->disk_name, d->pr_key);
ops = d->bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
d->bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
error = ops->pr_register(d->bdev, 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
d->bdev->bd_disk->disk_name);
goto out_blkdev_put;
}
d->pr_registered = true;
return 0;
out_blkdev_put:
blkdev_put(d->bdev, FMODE_READ);
return error;
}
static int
bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
......@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
return bl_parse_concat(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_STRIPE:
return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_SCSI:
return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
default:
dprintk("unsupported volume type: %d\n", volumes[idx].type);
return -EIO;
......
/*
* Copyright (c) 2014 Christoph Hellwig.
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/vmalloc.h>
......@@ -462,10 +462,12 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
return err;
}
static size_t ext_tree_layoutupdate_size(size_t count)
static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
return sizeof(__be32) /* number of entries */ +
PNFS_BLOCK_EXTENT_SIZE * count;
if (bl->bl_scsi_layout)
return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
else
return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
......@@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
}
}
static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
NFS4_DEVICEID4_SIZE);
p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
p = xdr_encode_hyper(p, 0LL);
*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
return p;
}
static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
size_t buffer_size, size_t *count)
{
......@@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
if (ext_tree_layoutupdate_size(*count) > buffer_size) {
if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
}
p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
NFS4_DEVICEID4_SIZE);
p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
p = xdr_encode_hyper(p, 0LL);
*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
if (bl->bl_scsi_layout)
p = encode_scsi_range(be, p);
else
p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
}
spin_unlock(&bl->bl_ext_lock);
......@@ -537,7 +553,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
buffer_size = ext_tree_layoutupdate_size(count);
buffer_size = ext_tree_layoutupdate_size(bl, count);
count = 0;
arg->layoutupdate_pages =
......@@ -556,7 +572,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
}
*start_p = cpu_to_be32(count);
arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
void *p = start_p, *end = p + arg->layoutupdate_len;
......
......@@ -281,7 +281,7 @@ int __init bl_init_pipefs(void)
return ret;
}
void __exit bl_cleanup_pipefs(void)
void bl_cleanup_pipefs(void)
{
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
unregister_pernet_subsys(&nfs4blocklayout_net_ops);
......
......@@ -84,12 +84,30 @@ config NFSD_V4
If unsure, say N.
config NFSD_PNFS
bool "NFSv4.1 server support for Parallel NFS (pNFS)"
depends on NFSD_V4
bool
config NFSD_BLOCKLAYOUT
bool "NFSv4.1 server support for pNFS block layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
help
This option enables support for the exporting pNFS block layouts
in the kernel's NFS server. The pNFS block layout enables NFS
clients to directly perform I/O to block devices accesible to both
the server and the clients. See RFC 5663 for more details.
If unsure, say N.
config NFSD_SCSILAYOUT
bool "NFSv4.1 server support for pNFS SCSI layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
help
This option enables support for the parallel NFS features of the
minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
server.
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
clients to directly perform I/O to SCSI devices accesible to both
the server and the clients. See draft-ietf-nfsv4-scsi-layout for
more details.
If unsure, say N.
......
......@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
/*
* Copyright (c) 2014 Christoph Hellwig.
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
#include <linux/nfsd/debug.h>
#include <scsi/scsi_proto.h>
#include <scsi/scsi_common.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
......@@ -13,37 +16,6 @@
#define NFSDDBG_FACILITY NFSDDBG_PNFS
static int
nfsd4_block_get_device_info_simple(struct super_block *sb,
struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
if (!dev)
return -ENOMEM;
gdp->gd_device = dev;
dev->nr_volumes = 1;
b = &dev->volumes[0];
b->type = PNFS_BLOCK_VOLUME_SIMPLE;
b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
&b->simple.offset);
}
static __be32
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
struct nfsd4_getdeviceinfo *gdp)
{
if (sb->s_bdev != sb->s_bdev->bd_contains)
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
}
static __be32
nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
struct nfsd4_layoutget *args)
......@@ -141,20 +113,13 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
}
static __be32
nfsd4_block_proc_layoutcommit(struct inode *inode,
struct nfsd4_layoutcommit *lcp)
nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
struct iomap *iomaps, int nr_iomaps)
{
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
struct iomap *iomaps;
int nr_iomaps;
int error;
nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
lcp->lc_mtime = current_fs_time(inode->i_sb);
......@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
return nfserrno(error);
}
#ifdef CONFIG_NFSD_BLOCKLAYOUT
static int
nfsd4_block_get_device_info_simple(struct super_block *sb,
struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
if (!dev)
return -ENOMEM;
gdp->gd_device = dev;
dev->nr_volumes = 1;
b = &dev->volumes[0];
b->type = PNFS_BLOCK_VOLUME_SIMPLE;
b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
&b->simple.offset);
}
static __be32
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
if (sb->s_bdev != sb->s_bdev->bd_contains)
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
}
static __be32
nfsd4_block_proc_layoutcommit(struct inode *inode,
struct nfsd4_layoutcommit *lcp)
{
struct iomap *iomaps;
int nr_iomaps;
nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
const struct nfsd4_layout_ops bl_layout_ops = {
/*
* Pretend that we send notification to the client. This is a blatant
......@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
.encode_layoutget = nfsd4_block_encode_layoutget,
.proc_layoutcommit = nfsd4_block_proc_layoutcommit,
};
#endif /* CONFIG_NFSD_BLOCKLAYOUT */
#ifdef CONFIG_NFSD_SCSILAYOUT
static int nfsd4_scsi_identify_device(struct block_device *bdev,
struct pnfs_block_volume *b)
{
struct request_queue *q = bdev->bd_disk->queue;
struct request *rq;
size_t bufflen = 252, len, id_len;
u8 *buf, *d, type, assoc;
int error;
buf = kzalloc(bufflen, GFP_KERNEL);
if (!buf)
return -ENOMEM;
rq = blk_get_request(q, READ, GFP_KERNEL);
if (IS_ERR(rq)) {
error = -ENOMEM;
goto out_free_buf;
}
blk_rq_set_block_pc(rq);
error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
if (error)
goto out_put_request;
rq->cmd[0] = INQUIRY;
rq->cmd[1] = 1;
rq->cmd[2] = 0x83;
rq->cmd[3] = bufflen >> 8;
rq->cmd[4] = bufflen & 0xff;
rq->cmd_len = COMMAND_SIZE(INQUIRY);
error = blk_execute_rq(rq->q, NULL, rq, 1);
if (error) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
rq->errors);
goto out_put_request;
}
len = (buf[2] << 8) + buf[3] + 4;
if (len > bufflen) {
pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
len);
goto out_put_request;
}
d = buf + 4;
for (d = buf + 4; d < buf + len; d += id_len + 4) {
id_len = d[3];
type = d[1] & 0xf;
assoc = (d[1] >> 4) & 0x3;
/*
* We only care about a EUI-64 and NAA designator types
* with LU association.
*/
if (assoc != 0x00)
continue;
if (type != 0x02 && type != 0x03)
continue;
if (id_len != 8 && id_len != 12 && id_len != 16)
continue;
b->scsi.code_set = PS_CODE_SET_BINARY;
b->scsi.designator_type = type == 0x02 ?
PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
b->scsi.designator_len = id_len;
memcpy(b->scsi.designator, d + 4, id_len);
/*
* If we found a 8 or 12 byte descriptor continue on to
* see if a 16 byte one is available. If we find a
* 16 byte descriptor we're done.
*/
if (id_len == 16)
break;
}
out_put_request:
blk_put_request(rq);
out_free_buf:
kfree(buf);
return error;
}
#define NFSD_MDS_PR_KEY 0x0100000000000000
/*
* We use the client ID as a unique key for the reservations.
* This allows us to easily fence a client when recalls fail.
*/
static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
{
return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}
static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
const struct pr_ops *ops;
int error;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
if (!dev)
return -ENOMEM;
gdp->gd_device = dev;
dev->nr_volumes = 1;
b = &dev->volumes[0];
b->type = PNFS_BLOCK_VOLUME_SCSI;
b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
error = nfsd4_scsi_identify_device(sb->s_bdev, b);
if (error)
return error;
ops = sb->s_bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: device %s does not support PRs.\n",
sb->s_id);
return -EINVAL;
}
error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
if (error) {
pr_err("pNFS: failed to register key for device %s.\n",
sb->s_id);
return -EINVAL;
}
error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
if (error) {
pr_err("pNFS: failed to reserve device %s.\n",
sb->s_id);
return -EINVAL;
}
return 0;
}
static __be32
nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
if (sb->s_bdev != sb->s_bdev->bd_contains)
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
}
static __be32
nfsd4_scsi_proc_layoutcommit(struct inode *inode,
struct nfsd4_layoutcommit *lcp)
{
struct iomap *iomaps;
int nr_iomaps;
nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp), 0, true);
}
const struct nfsd4_layout_ops scsi_layout_ops = {
/*
* Pretend that we send notification to the client. This is a blatant
* lie to force recent Linux clients to cache our device IDs.
* We rarely ever change the device ID, so the harm of leaking deviceids
* for a while isn't too bad. Unfortunately RFC5661 is a complete mess
* in this regard, but I filed errata 4119 for this a while ago, and
* hopefully the Linux client will eventually start caching deviceids
* without this again.
*/
.notify_types =
NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
.proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo,
.encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
.proc_layoutget = nfsd4_block_proc_layoutget,
.encode_layoutget = nfsd4_block_encode_layoutget,
.proc_layoutcommit = nfsd4_scsi_proc_layoutcommit,
.fence_client = nfsd4_scsi_fence_client,
};
#endif /* CONFIG_NFSD_SCSILAYOUT */
/*
* Copyright (c) 2014 Christoph Hellwig.
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
......@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_encode_hyper(p, b->simple.offset);
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
break;
case PNFS_BLOCK_VOLUME_SCSI:
len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
p = xdr_reserve_space(xdr, len);
if (!p)
return -ETOOSMALL;
*p++ = cpu_to_be32(b->type);
*p++ = cpu_to_be32(b->scsi.code_set);
*p++ = cpu_to_be32(b->scsi.designator_type);
p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
p = xdr_encode_hyper(p, b->scsi.pr_key);
break;
default:
return -ENOTSUPP;
}
......@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, expected, i;
u32 nr_iomaps, i;
if (len < sizeof(u32)) {
dprintk("%s: extent array too small: %u\n", __func__, len);
return -EINVAL;
}
len -= sizeof(u32);
if (len % PNFS_BLOCK_EXTENT_SIZE) {
dprintk("%s: extent array invalid: %u\n", __func__, len);
return -EINVAL;
}
nr_iomaps = be32_to_cpup(p++);
expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
if (len != expected) {
if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
dprintk("%s: extent array size mismatch: %u/%u\n",
__func__, len, expected);
__func__, len, nr_iomaps);
return -EINVAL;
}
......@@ -155,3 +171,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
kfree(iomaps);
return -EINVAL;
}
int
nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, expected, i;
if (len < sizeof(u32)) {
dprintk("%s: extent array too small: %u\n", __func__, len);
return -EINVAL;
}
nr_iomaps = be32_to_cpup(p++);
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
__func__, len, expected);
return -EINVAL;
}
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
if (!iomaps) {
dprintk("%s: failed to allocate extent array\n", __func__);
return -ENOMEM;
}
for (i = 0; i < nr_iomaps; i++) {
u64 val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].offset = val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
dprintk("%s: unaligned length 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].length = val;
}
*iomapp = iomaps;
return nr_iomaps;
fail:
kfree(iomaps);
return -EINVAL;
}
......@@ -15,6 +15,11 @@ struct pnfs_block_extent {
enum pnfs_block_extent_state es;
};
struct pnfs_block_range {
u64 foff;
u64 len;
};
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
* Not actually limited by the protocol.
......@@ -29,6 +34,13 @@ struct pnfs_block_volume {
u32 sig_len;
u8 sig[PNFS_BLOCK_UUID_LEN];
} simple;
struct {
enum scsi_code_set code_set;
enum scsi_designator_type designator_type;
int designator_len;
u8 designator[256];
u64 pr_key;
} scsi;
};
};
......@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
......@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
{
__be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
unsigned long cnt = min(argp->count, max_blocksize);
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
......@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
* + 1 (xdr opaque byte count) = 26
*/
resp->count = min(argp->count, max_blocksize);
resp->count = cnt;
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
......@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
&resp->count);
if (nfserr == 0) {
struct inode *inode = d_inode(resp->fh.fh_dentry);
resp->eof = (argp->offset + resp->count) >= inode->i_size;
resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
inode->i_size);
}
RETURN_STATUS(nfserr);
......
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/blkdev.h>
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
......@@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_BLOCKLAYOUT
[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
[LAYOUT_SCSI] = &scsi_layout_ops,
#endif
};
/* pNFS device ID to export fsid mapping */
......@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
if (!(exp->ex_flags & NFSEXP_PNFS))
return;
/*
* Check if the file system supports exporting a block-like layout.
* If the block device supports reservations prefer the SCSI layout,
* otherwise advertise the block layout.
*/
#ifdef CONFIG_NFSD_BLOCKLAYOUT
if (sb->s_export_op->get_uuid &&
sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks)
exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
/* overwrite block layout selection if needed */
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
exp->ex_layout_type = LAYOUT_SCSI;
#endif
}
static void
......@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
" Fencing..\n", addr_str);
......@@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
container_of(cb, struct nfs4_layout_stateid, ls_recall);
struct nfsd_net *nn;
ktime_t now, cutoff;
const struct nfsd4_layout_ops *ops;
LIST_HEAD(reaplist);
......@@ -661,6 +680,12 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
/*
* Unknown error or non-responding client, we'll need to fence.
*/
trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
ops = nfsd4_layout_ops[ls->ls_layout_type];
if (ops->fence_client)
ops->fence_client(ls);
else
nfsd4_cb_layout_fail(ls);
return -1;
}
......
......@@ -1268,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
goto out;
nfserr = nfs_ok;
if (gdp->gd_maxcount != 0)
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
if (gdp->gd_maxcount != 0) {
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
cstate->session->se_client, gdp);
}
gdp->gd_notify_types &= ops->notify_types;
out:
......
......@@ -3365,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
u32 eof;
long len;
int space_left;
__be32 nfserr;
__be32 *p = xdr->p - 2;
......@@ -3373,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
if (xdr->end - xdr->p < 1)
return nfserr_resource;
len = maxcount;
nfserr = nfsd_splice_read(read->rd_rqstp, file,
read->rd_offset, &maxcount);
if (nfserr) {
......@@ -3385,7 +3387,7 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}
eof = (read->rd_offset + maxcount >=
eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
d_inode(read->rd_fhp->fh_dentry)->i_size);
*(p++) = htonl(eof);
......@@ -3456,13 +3458,14 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
}
read->rd_vlen = v;
len = maxcount;
nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
read->rd_vlen, &maxcount);
if (nfserr)
return nfserr;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
eof = (read->rd_offset + maxcount >=
eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
d_inode(read->rd_fhp->fh_dentry)->i_size);
tmp = htonl(eof);
......
......@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
u32 notify_types;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
struct nfsd4_getdeviceinfo *gdevp);
......@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
void (*fence_client)(struct nfs4_layout_stateid *ls);
};
extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
#ifdef CONFIG_NFSD_BLOCKLAYOUT
extern const struct nfsd4_layout_ops bl_layout_ops;
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
extern const struct nfsd4_layout_ops scsi_layout_ops;
#endif
__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, stateid_t *stateid,
......
......@@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
|| createmode == NFS4_CREATE_EXCLUSIVE4_1;
}
static inline bool nfsd_eof_on_read(long requested, long read,
loff_t offset, loff_t size)
{
/* We assume a short read means eof: */
if (requested > read)
return true;
/*
* A non-short read might also reach end of file. The spec
* still requires us to set eof in that case.
*
* Further operations may have modified the file size since
* the read, so the following check is not atomic with the read.
* We've only seen that cause a problem for a client in the case
* where the read returned a count of 0 without setting eof.
* That case was fixed by the addition of the above check.
*/
return (offset + read >= size);
}
#endif /* LINUX_NFSD_VFS_H */
......@@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
......@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
#ifdef CONFIG_NFSD_PNFS
#ifdef CONFIG_NFSD_BLOCKLAYOUT
.get_uuid = xfs_fs_get_uuid,
.map_blocks = xfs_fs_map_blocks,
.commit_blocks = xfs_fs_commit_blocks,
......
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
#ifdef CONFIG_NFSD_PNFS
#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
struct iomap *iomap, bool write, u32 *device_generation);
......
......@@ -529,6 +529,7 @@ enum pnfs_layouttype {
LAYOUT_OSD2_OBJECTS = 2,
LAYOUT_BLOCK_VOLUME = 3,
LAYOUT_FLEX_FILES = 4,
LAYOUT_SCSI = 5,
LAYOUT_TYPE_MAX
};
......@@ -555,6 +556,7 @@ enum pnfs_block_volume_type {
PNFS_BLOCK_VOLUME_SLICE = 1,
PNFS_BLOCK_VOLUME_CONCAT = 2,
PNFS_BLOCK_VOLUME_STRIPE = 3,
PNFS_BLOCK_VOLUME_SCSI = 4,
};
enum pnfs_block_extent_state {
......@@ -568,6 +570,23 @@ enum pnfs_block_extent_state {
#define PNFS_BLOCK_EXTENT_SIZE \
(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
/* on the wire size of a scsi commit range */
#define PNFS_SCSI_RANGE_SIZE \
(4 * sizeof(__be32))
enum scsi_code_set {
PS_CODE_SET_BINARY = 1,
PS_CODE_SET_ASCII = 2,
PS_CODE_SET_UTF8 = 3
};
enum scsi_designator_type {
PS_DESIGNATOR_T10 = 1,
PS_DESIGNATOR_EUI64 = 2,
PS_DESIGNATOR_NAA = 3,
PS_DESIGNATOR_NAME = 8
};
#define NFL4_UFLG_MASK 0x0000003F
#define NFL4_UFLG_DENSE 0x00000001
#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment