Commit 0c76c6ba authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "We have a pile of bug fixes from Ilya, including a few patches that
  sync up the CRUSH code with the latest from userspace.

  There is also a long series from Zheng that fixes various issues with
  snapshots, inline data, and directory fsync, some simplification and
  improvement in the cap release code, and a rework of the caching of
  directory contents.

  To top it off, there are a few small fixes and cleanups from Benoit and
  Hong"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits)
  rbd: use GFP_NOIO in rbd_obj_request_create()
  crush: fix a bug in tree bucket decode
  libceph: Fix ceph_tcp_sendpage()'s more boolean usage
  libceph: Remove spurious kunmap() of the zero page
  rbd: queue_depth map option
  rbd: store rbd_options in rbd_device
  rbd: terminate rbd_opts_tokens with Opt_err
  ceph: fix ceph_writepages_start()
  rbd: bump queue_max_segments
  ceph: rework dcache readdir
  crush: sync up with userspace
  crush: fix crash from invalid 'take' argument
  ceph: switch some GFP_NOFS memory allocation to GFP_KERNEL
  ceph: pre-allocate data structure that tracks caps flushing
  ceph: re-send flushing caps (which are revoked) in reconnect stage
  ceph: send TID of the oldest pending caps flush to MDS
  ceph: track pending caps flushing globally
  ceph: track pending caps flushing accurately
  libceph: fix wrong name "Ceph filesystem for Linux"
  ceph: fix directory fsync
  ...
parents 8688d954 5a60e876
......@@ -346,6 +346,7 @@ struct rbd_device {
struct rbd_image_header header;
unsigned long flags; /* possibly lock protected */
struct rbd_spec *spec;
struct rbd_options *opts;
char *header_name;
......@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
}
/*
* mount options
* (Per device) rbd map options
*/
enum {
Opt_queue_depth,
Opt_last_int,
/* int args above */
Opt_last_string,
/* string args above */
Opt_read_only,
Opt_read_write,
/* Boolean args above */
Opt_last_bool,
Opt_err
};
static match_table_t rbd_opts_tokens = {
{Opt_queue_depth, "queue_depth=%d"},
/* int args above */
/* string args above */
{Opt_read_only, "read_only"},
{Opt_read_only, "ro"}, /* Alternate spelling */
{Opt_read_write, "read_write"},
{Opt_read_write, "rw"}, /* Alternate spelling */
/* Boolean args above */
{-1, NULL}
{Opt_err, NULL}
};
struct rbd_options {
int queue_depth;
bool read_only;
};
#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT false
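For context, a minimal sketch (not part of the diff; the helper name is hypothetical) of the defaults these macros establish. BLKDEV_MAX_RQ is 128 in this tree, so the old fixed queue depth is preserved unless queue_depth=N is given at map time:

static void rbd_opts_defaults(struct rbd_options *opts)
{
	opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;	/* BLKDEV_MAX_RQ == 128 */
	opts->read_only = RBD_READ_ONLY_DEFAULT;	/* false */
}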
static int parse_rbd_opts_token(char *c, void *private)
......@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private)
int token, intval, ret;
token = match_token(c, rbd_opts_tokens, argstr);
if (token < 0)
return -EINVAL;
if (token < Opt_last_int) {
ret = match_int(&argstr[0], &intval);
if (ret < 0) {
pr_err("bad mount option arg (not int) "
"at '%s'\n", c);
pr_err("bad mount option arg (not int) at '%s'\n", c);
return ret;
}
dout("got int token %d val %d\n", token, intval);
} else if (token > Opt_last_int && token < Opt_last_string) {
dout("got string token %d val %s\n", token,
argstr[0].from);
} else if (token > Opt_last_string && token < Opt_last_bool) {
dout("got Boolean token %d\n", token);
dout("got string token %d val %s\n", token, argstr[0].from);
} else {
dout("got token %d\n", token);
}
switch (token) {
case Opt_queue_depth:
if (intval < 1) {
pr_err("queue_depth out of range\n");
return -EINVAL;
}
rbd_opts->queue_depth = intval;
break;
case Opt_read_only:
rbd_opts->read_only = true;
break;
......@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private)
rbd_opts->read_only = false;
break;
default:
rbd_assert(false);
break;
/* libceph prints "bad option" msg */
return -EINVAL;
}
return 0;
}
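The sentinel change from {-1, NULL} to {Opt_err, NULL} works because match_token() returns the token of the terminating NULL-pattern entry when nothing matches; with a named Opt_err value, unknown options fall into the switch's default case and return -EINVAL (libceph then prints the "bad option" message), replacing both the old token < 0 check and the rbd_assert(false). A minimal sketch of the pattern, using a hypothetical table:

#include <linux/parser.h>

enum { Opt_foo, Opt_err };

static const match_table_t demo_tokens = {
	{Opt_foo, "foo=%d"},
	{Opt_err, NULL}		/* any unmatched option maps here */
};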
......@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
/*
* Wait for an object request to complete. If interrupted, cancel the
* underlying osd request.
*
* @timeout: in jiffies, 0 means "wait forever"
*/
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
int ret;
long ret;
dout("%s %p\n", __func__, obj_request);
ret = wait_for_completion_interruptible(&obj_request->completion);
if (ret < 0) {
dout("%s %p interrupted\n", __func__, obj_request);
ret = wait_for_completion_interruptible_timeout(
&obj_request->completion,
ceph_timeout_jiffies(timeout));
if (ret <= 0) {
if (ret == 0)
ret = -ETIMEDOUT;
rbd_obj_request_end(obj_request);
return ret;
} else {
ret = 0;
}
dout("%s %p done\n", __func__, obj_request);
return 0;
dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
return ret;
}
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
return __rbd_obj_request_wait(obj_request, 0);
}
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
return __rbd_obj_request_wait(obj_request, timeout);
}
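For reference, wait_for_completion_interruptible_timeout() returns a negative errno when interrupted, 0 on timeout, and the remaining jiffies (positive) on success; the helper above folds that into the usual 0/-errno convention. A sketch of the mapping, assuming only the standard return values:

static int map_wait_ret(long ret)
{
	if (ret < 0)
		return ret;		/* interrupted (-ERESTARTSYS) */
	if (ret == 0)
		return -ETIMEDOUT;	/* timer expired */
	return 0;			/* completed with time to spare */
}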
static void rbd_img_request_complete(struct rbd_img_request *img_request)
......@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
rbd_assert(obj_request_type_valid(type));
size = strlen(object_name) + 1;
name = kmalloc(size, GFP_KERNEL);
name = kmalloc(size, GFP_NOIO);
if (!name)
return NULL;
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
if (!obj_request) {
kfree(name);
return NULL;
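GFP_NOIO matters here because rbd_obj_request_create() sits on the block I/O path: a GFP_KERNEL allocation could enter direct reclaim, issue new I/O that depends on this very request completing, and deadlock. GFP_NOIO forbids reclaim from initiating any I/O.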
......@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
}
if (opcode == CEPH_OSD_OP_DELETE)
osd_req_op_init(osd_request, num_ops, opcode);
osd_req_op_init(osd_request, num_ops, opcode, 0);
else
osd_req_op_extent_init(osd_request, num_ops, opcode,
offset, length, 0, 0);
......@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
goto out;
stat_request->callback = rbd_img_obj_exists_callback;
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
false, false);
rbd_osd_req_format_read(stat_request);
......@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
bool watch)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_options *opts = osdc->client->options;
struct rbd_obj_request *obj_request;
int ret;
......@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
if (ret)
goto out;
ret = rbd_obj_request_wait(obj_request);
ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
if (ret)
goto out;
......@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags =
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
......@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
/* set io sizes to object size */
segment_size = rbd_obj_bytes(&rbd_dev->header);
blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
blk_queue_max_segment_size(q, segment_size);
blk_queue_io_min(q, segment_size);
blk_queue_io_opt(q, segment_size);
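For scale, with the default 4 MiB RBD object size this sets both max_hw_sectors and max_segments to 4 MiB / 512 = 8192, so a single request can now carry as many segments as an object has sectors (figures assume the default object size).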
......@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref)
}
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
struct rbd_spec *spec)
struct rbd_spec *spec,
struct rbd_options *opts)
{
struct rbd_device *rbd_dev;
......@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem);
rbd_dev->spec = spec;
rbd_dev->rbd_client = rbdc;
rbd_dev->spec = spec;
rbd_dev->opts = opts;
/* Initialize the layout used for all rbd requests */
......@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts);
kfree(rbd_dev);
}
......@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
goto out_mem;
rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
copts = ceph_parse_options(options, mon_addrs,
mon_addrs + mon_addrs_size - 1,
......@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf,
*/
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{
struct ceph_options *opts = rbdc->client->options;
u64 newest_epoch;
unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
int tries = 0;
int ret;
......@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
ceph_monc_request_next_osdmap(&rbdc->client->monc);
(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
newest_epoch, timeout);
newest_epoch,
opts->mount_timeout);
goto again;
} else {
/* the osdmap we have is new enough */
......@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
rbdc = __rbd_get_client(rbd_dev->rbd_client);
ret = -ENOMEM;
parent = rbd_dev_create(rbdc, parent_spec);
parent = rbd_dev_create(rbdc, parent_spec, NULL);
if (!parent)
goto out_err;
......@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
if (rc < 0)
goto err_out_module;
read_only = rbd_opts->read_only;
kfree(rbd_opts);
rbd_opts = NULL; /* done with this */
rbdc = rbd_get_client(ceph_opts);
if (IS_ERR(rbdc)) {
......@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
goto err_out_client;
}
rbd_dev = rbd_dev_create(rbdc, spec);
rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
if (!rbd_dev)
goto err_out_client;
rbdc = NULL; /* rbd_dev now owns this */
spec = NULL; /* rbd_dev now owns this */
rbd_opts = NULL; /* rbd_dev now owns this */
rc = rbd_dev_image_probe(rbd_dev, true);
if (rc < 0)
......@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
/* If we are mapping a snapshot it must be marked read-only */
read_only = rbd_dev->opts->read_only;
if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
read_only = true;
rbd_dev->mapping.read_only = read_only;
......@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rbd_put_client(rbdc);
err_out_args:
rbd_spec_put(spec);
kfree(rbd_opts);
err_out_module:
module_put(THIS_MODULE);
......
......@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM;
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
if (!pagelist)
goto out_err;
ceph_pagelist_init(pagelist);
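The GFP_NOFS to GFP_KERNEL switches in this series are limited to call sites that are not reachable from reclaim and hold no locks the writeback path needs, so letting the allocator recurse into the filesystem is safe there and gives reclaim more freedom.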
......
......@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
}
cf->fmode = fmode;
cf->next_offset = 2;
cf->readdir_cache_idx = -1;
file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release);
break;
......@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name);
kfree(cf->dir_info);
dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
......@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
}
} else {
num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
......@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.)
*/
static ssize_t
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
......@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t start;
ssize_t n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0,
......@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break;
}
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
......@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
* objects, rollback on failure, etc.)
*/
static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
......@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t left;
int n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1,
......@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
......@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct page *page = NULL;
loff_t i_size;
if (retry_op == READ_INLINE) {
page = __page_cache_alloc(GFP_NOFS);
page = __page_cache_alloc(GFP_KERNEL);
if (!page)
return -ENOMEM;
}
......@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
loff_t pos;
......@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
......@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
struct ceph_cap_snap,
ci_item);
snapc = ceph_get_snap_context(capsnap->context);
} else {
BUG_ON(!ci->i_head_snapc);
snapc = ceph_get_snap_context(ci->i_head_snapc);
}
spin_unlock(&ci->i_ceph_lock);
/* we might need to revert back to that point */
data = *from;
if (iocb->ki_flags & IOCB_DIRECT)
written = ceph_sync_direct_write(iocb, &data, pos);
written = ceph_sync_direct_write(iocb, &data, pos,
snapc);
else
written = ceph_sync_write(iocb, &data, pos);
written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
......@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else {
loff_t old_size = inode->i_size;
/*
......@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
......@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
out:
mutex_unlock(&inode->i_mutex);
out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
return written ? written : err;
}
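The snapc is now captured under i_ceph_lock before the sync write starts: if a cap snap is pending, the write is tagged with that older context, otherwise with i_head_snapc. Passing it down to ceph_sync_write()/ceph_sync_direct_write() replaces the old pattern of re-reading ci->i_snap_realm->cached_context inside the write loop, which could otherwise race with a snapshot taken mid-write.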
......@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
int ret = 0;
......@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
......@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
......@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got);
unlock:
mutex_unlock(&inode->i_mutex);
ceph_free_cap_flush(prealloc_cf);
return ret;
}
......
......@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
ci->i_ordered_count = 0;
atomic_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0);
atomic64_set(&ci->i_ordered_count, 1);
atomic64_set(&ci->i_release_count, 1);
atomic64_set(&ci->i_complete_seq[0], 0);
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
......@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_cap_flush_seq = 0;
ci->i_cap_flush_last_tid = 0;
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
ci->i_prealloc_cap_flush = NULL;
ci->i_cap_flush_tree = RB_ROOT;
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
......@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size),
......@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
atomic_read(&ci->i_release_count),
ci->i_ordered_count);
atomic64_read(&ci->i_release_count),
atomic64_read(&ci->i_ordered_count));
}
wake = true;
......@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry,
......@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
......@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err;
}
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
if (ctl->page) {
kunmap(ctl->page);
page_cache_release(ctl->page);
ctl->page = NULL;
}
}
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
struct ceph_inode_info *ci = ceph_inode(dir);
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl);
ctl->page = grab_cache_page(&dir->i_data, pgoff);
if (!ctl->page) {
ctl->index = -1;
return -ENOMEM;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
}
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
dout("readdir cache dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
dout("disable readdir cache\n");
ctl->index = -1;
}
return 0;
}
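Assuming 4 KiB pages and 8-byte pointers (illustrative figures), nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *) = 512, so cache index i maps to page i / 512, slot i % 512, which is exactly the pgoff/idx split computed above.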
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
......@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
......@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag))
r_readdir_offset = 2;
req->r_readdir_offset = 2;
else
r_readdir_offset = 0;
req->r_readdir_offset = 0;
}
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir);
......@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
}
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
......@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
d_delete(dn);
dput(dn);
goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
}
/* inode */
......@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
}
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn))
iput(in);
d_drop(dn);
err = ret;
goto next_item;
}
......@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
di = dn->d_fsdata;
di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session,
req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
err = ret;
}
next_item:
if (dn)
dput(dn);
}
if (err == 0)
req->r_did_prepopulate = true;
out:
if (err == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
if (snapdir) {
iput(snapdir);
dput(parent);
......@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
......@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS);
if (IS_ERR(req))
if (IS_ERR(req)) {
ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req);
}
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
if (!ci->i_head_snapc &&
(issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
}
}
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) {
......@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = CURRENT_TIME;
}
release &= issued;
spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
......@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode);
ceph_free_cap_flush(prealloc_cf);
return err;
out_put:
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
return err;
}
......
......@@ -139,7 +139,6 @@ struct ceph_mds_session {
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
/* protected by mutex */
......@@ -228,7 +227,7 @@ struct ceph_mds_request {
int r_err;
bool r_aborted;
unsigned long r_timeout; /* optional. jiffies */
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
......@@ -254,12 +253,21 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
int r_num_caps;
};
struct ceph_pool_perm {
struct rb_node node;
u32 pool;
int perm;
};
/*
* mds client state
*/
......@@ -284,12 +292,15 @@ struct ceph_mds_client {
* references (implying they contain no inodes with caps) that
* should be destroyed.
*/
u64 last_snap_seq;
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
u64 oldest_tid; /* oldest incomplete mds request,
excluding setfilelock requests */
struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
......@@ -298,7 +309,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
u64 cap_flush_seq;
u64 last_cap_flush_tid;
struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
......@@ -328,6 +340,9 @@ struct ceph_mds_client {
spinlock_t dentry_lru_lock;
struct list_head dentry_lru;
int num_dentry;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
};
extern const char *ceph_mds_op_name(int op);
......@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
......
......@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
}
static struct ceph_snap_context *empty_snapc;
struct ceph_snap_context *ceph_empty_snapc;
/*
* build the snap context for a given realm.
......@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
if (num == 0 && realm->seq == empty_snapc->seq) {
ceph_get_snap_context(empty_snapc);
snapc = empty_snapc;
if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
ceph_get_snap_context(ceph_empty_snapc);
snapc = ceph_empty_snapc;
goto done;
}
......@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
return 0;
}
static bool has_new_snaps(struct ceph_snap_context *o,
struct ceph_snap_context *n)
{
if (n->num_snaps == 0)
return false;
/* snaps are in descending order */
return n->snaps[0] > o->seq;
}
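Worked example: if the old context has seq 5 and the new context's snaps are {7, 3, 2} (descending), then n->snaps[0] = 7 > 5, so a snapshot has been taken since the old context was captured.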
/*
* When a snapshot is applied, the size/mtime inode metadata is queued
......@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
......@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
old_snapc = ci->i_head_snapc;
new_snapc = ci->i_snap_realm->cached_context;
/*
* If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish
......@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
writes in progress now were started before the previous
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap);
} else if (ci->i_snap_realm->cached_context == empty_snapc) {
dout("queue_cap_snap %p empty snapc\n", inode);
kfree(capsnap);
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
struct ceph_snap_context *snapc = ci->i_head_snapc;
/*
* if we are a sync write, we may need to go to the snaprealm
* to get the current snapc.
*/
if (!snapc)
snapc = ci->i_snap_realm->cached_context;
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
dout("queue_cap_snap %p nothing dirty|writing\n", inode);
goto update_snapc;
}
dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
inode, capsnap, snapc, ceph_cap_string(dirty));
ihold(inode);
BUG_ON(!old_snapc);
atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
capsnap->mode = inode->i_mode;
capsnap->uid = inode->i_uid;
capsnap->gid = inode->i_gid;
if (dirty & CEPH_CAP_XATTR_EXCL) {
__ceph_build_xattrs_blob(ci);
capsnap->xattr_blob =
ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
} else {
capsnap->xattr_blob = NULL;
capsnap->xattr_version = 0;
/*
* There is no need to send FLUSHSNAP message to MDS if there is
* no new snapshot. But when there is dirty pages or on-going
* writes, we still need to create cap_snap. cap_snap is needed
* by the write path and page writeback path.
*
* also see ceph_try_drop_cap_snap()
*/
if (has_new_snaps(old_snapc, new_snapc)) {
if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
capsnap->need_flush = true;
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
dout("queue_cap_snap %p "
"no new_snap|dirty_page|writing\n", inode);
goto update_snapc;
}
}
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
/* dirty page count moved from _head to this cap_snap;
all subsequent writes page dirties occur _after_ this
snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0;
capsnap->context = snapc;
ci->i_head_snapc =
ceph_get_snap_context(ci->i_snap_realm->cached_context);
dout(" new snapc is %p\n", ci->i_head_snapc);
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
" seq %llu used WR, now pending\n", inode,
capsnap, snapc, snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
inode, capsnap, old_snapc, ceph_cap_string(dirty),
capsnap->need_flush ? "" : "no_flush");
ihold(inode);
atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
capsnap->mode = inode->i_mode;
capsnap->uid = inode->i_uid;
capsnap->gid = inode->i_gid;
if (dirty & CEPH_CAP_XATTR_EXCL) {
__ceph_build_xattrs_blob(ci);
capsnap->xattr_blob =
ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
} else {
dout("queue_cap_snap %p nothing dirty|writing\n", inode);
kfree(capsnap);
capsnap->xattr_blob = NULL;
capsnap->xattr_version = 0;
}
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
/* dirty page count moved from _head to this cap_snap;
all subsequent writes page dirties occur _after_ this
snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
" seq %llu used WR, now pending\n", inode,
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
update_snapc:
if (ci->i_head_snapc) {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
/*
......@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
/* queue realm for cap_snap creation */
list_add(&realm->dirty_item, &dirty_realms);
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
invalidate = 1;
} else if (!realm->cached_context) {
......@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
int __init ceph_snap_init(void)
{
empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
if (!empty_snapc)
ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
if (!ceph_empty_snapc)
return -ENOMEM;
empty_snapc->seq = 1;
ceph_empty_snapc->seq = 1;
return 0;
}
void ceph_snap_exit(void)
{
ceph_put_snap_context(empty_snapc);
ceph_put_snap_context(ceph_empty_snapc);
}
......@@ -134,10 +134,12 @@ enum {
Opt_noino32,
Opt_fscache,
Opt_nofscache,
Opt_poolperm,
Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
Opt_acl,
#endif
Opt_noacl
Opt_noacl,
};
static match_table_t fsopt_tokens = {
......@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"},
{Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"},
{Opt_poolperm, "poolperm"},
{Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL
{Opt_acl, "acl"},
#endif
......@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break;
case Opt_poolperm:
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
break;
case Opt_nopoolperm:
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= MS_POSIXACL;
......@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
seq_puts(m, ",fsc");
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm");
#ifdef CONFIG_CEPH_FS_POSIX_ACL
if (fsopt->sb_flags & MS_POSIXACL)
......@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
......@@ -634,6 +648,10 @@ static int __init init_caches(void)
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_cachep == NULL)
goto bad_cap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_flush_cachep == NULL)
goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
......@@ -652,6 +670,8 @@ static int __init init_caches(void)
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
......@@ -668,6 +688,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
......@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started;
req->r_timeout = fsc->client->options->mount_timeout * HZ;
req->r_timeout = fsc->client->options->mount_timeout;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
......
......@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued;
int err;
int dirty = 0;
......@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
......@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr)
goto out;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
goto out;
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
......@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto out;
goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
......@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
flags, value ? 1 : -1, &xattr);
if (!err) {
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
}
spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
ceph_free_cap_flush(prealloc_cf);
kfree(newname);
kfree(newval);
kfree(xattr);
......@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued;
int err;
int required_blob_size;
int dirty;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
......@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, 0, 0);
......@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto out;
goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
......@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
err = __remove_xattr_by_name(ceph_inode(inode), name);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
ceph_free_cap_flush(prealloc_cf);
err = ceph_send_removexattr(dentry, name);
out:
return err;
}
......
......@@ -43,9 +43,9 @@ struct ceph_options {
int flags;
struct ceph_fsid fsid;
struct ceph_entity_addr my_addr;
int mount_timeout;
int osd_idle_ttl;
int osd_keepalive_timeout;
unsigned long mount_timeout; /* jiffies */
unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */
/*
* any type that can't be simply compared or doesn't need
......@@ -63,9 +63,9 @@ struct ceph_options {
/*
* defaults
*/
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
#define CEPH_OSD_KEEPALIVE_DEFAULT 5
#define CEPH_OSD_IDLE_TTL_DEFAULT 60
#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
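The user-visible defaults are unchanged (60 s, 5 s and 60 s respectively); they are simply stored in jiffies now, matching the new unit of the mount_timeout, osd_idle_ttl and osd_keepalive_timeout fields.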
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
......@@ -93,13 +93,9 @@ enum {
CEPH_MOUNT_SHUTDOWN,
};
/*
* subtract jiffies
*/
static inline unsigned long time_sub(unsigned long a, unsigned long b)
static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
{
BUG_ON(time_after(b, a));
return (long)a - (long)b;
return timeout ?: MAX_SCHEDULE_TIMEOUT;
}
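The ?: here is the GNU shorthand for timeout ? timeout : MAX_SCHEDULE_TIMEOUT, so a stored timeout of 0 ("wait forever") becomes the sentinel that the schedule_timeout() family treats as an infinite wait.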
struct ceph_mds_client;
......@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
......
......@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
extern void osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode);
unsigned int which, u16 opcode, u32 flags);
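The new u32 flags argument lets callers set per-op flags (e.g. the CEPH_OSD_OP_FLAG_* fadvise hints) at init time; every call site converted in this series simply passes 0.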
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
unsigned int which,
......
#ifndef CEPH_CRUSH_CRUSH_H
#define CEPH_CRUSH_CRUSH_H
#include <linux/types.h>
#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
/*
* CRUSH is a pseudo-random data distribution algorithm that
......@@ -20,7 +24,11 @@
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */
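For context, CRUSH weights are 16.16 fixed-point values, so 0x10000 represents a weight of 1.0; the new caps therefore correspond to 100.0 per device and 65535.0 per bucket.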
......@@ -108,6 +116,15 @@ enum {
};
extern const char *crush_bucket_alg_name(int alg);
/*
* although tree is a legacy algorithm, it has proven buggy, so
* exclude it.
*/
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
(1 << CRUSH_BUCKET_UNIFORM) | \
(1 << CRUSH_BUCKET_LIST) | \
(1 << CRUSH_BUCKET_STRAW))
struct crush_bucket {
__s32 id; /* this'll be negative */
__u16 type; /* non-zero; type=0 is reserved for devices */
......@@ -174,7 +191,7 @@ struct crush_map {
/* choose local attempts using a fallback permutation before
* re-descent */
__u32 choose_local_fallback_tries;
/* choose attempts before giving up */
__u32 choose_total_tries;
/* attempt chooseleaf inner descent once for firstn mode; on
* reject retry outer descent. Note that this does *not*
......@@ -187,6 +204,25 @@ struct crush_map {
* that want to limit reshuffling, a value of 3 or 4 will make the
* mappings line up a bit better with previous mappings. */
__u8 chooseleaf_vary_r;
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
* fixes a few of them.
*/
__u8 straw_calc_version;
/*
* allowed bucket algs is a bitmask, here the bit positions
* are CRUSH_BUCKET_*. note that these are *bits* and
* CRUSH_BUCKET_* values are not, so we need to or together (1
* << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
* minimize confusion (bucket type values start at 1).
*/
__u32 allowed_bucket_algs;
__u32 *choose_tries;
#endif
};
......
#ifndef CEPH_CRUSH_HASH_H
#define CEPH_CRUSH_HASH_H
#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
#define CRUSH_HASH_RJENKINS1 0
#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
......
......@@ -8,7 +8,7 @@
* LGPL2
*/
#include <linux/crush/crush.h>
#include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
extern int crush_do_rule(const struct crush_map *map,
......
......@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
/* start with defaults */
opt->flags = CEPH_OPT_DEFAULT;
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
......@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
pr_warn("ignoring deprecated osdtimeout option\n");
break;
case Opt_osdkeepalivetimeout:
opt->osd_keepalive_timeout = intval;
/* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osdkeepalive out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_keepalive_timeout =
msecs_to_jiffies(intval * 1000);
break;
case Opt_osd_idle_ttl:
opt->osd_idle_ttl = intval;
/* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osd_idle_ttl out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
break;
case Opt_mount_timeout:
opt->mount_timeout = intval;
/* 0 is "wait forever" (i.e. infinite timeout) */
if (intval < 0 || intval > INT_MAX / 1000) {
pr_err("mount_timeout out of range\n");
err = -EINVAL;
goto out;
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break;
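All three options keep seconds as their user-visible unit; the intval > INT_MAX / 1000 guards ensure the seconds-to-milliseconds multiplication cannot overflow an int before msecs_to_jiffies() converts it.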
case Opt_share:
......@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "notcp_nodelay,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
seq_printf(m, "mount_timeout=%d,",
jiffies_to_msecs(opt->mount_timeout) / 1000);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
seq_printf(m, "osd_idle_ttl=%d,",
jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,",
opt->osd_keepalive_timeout);
jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
/* drop redundant comma */
if (m->count != pos)
......@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
*/
int __ceph_open_session(struct ceph_client *client, unsigned long started)
{
int err;
unsigned long timeout = client->options->mount_timeout * HZ;
unsigned long timeout = client->options->mount_timeout;
long err;
/* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc);
......@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return err;
while (!have_mon_and_osd_map(client)) {
err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout))
return err;
return -ETIMEDOUT;
/* wait */
dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq,
have_mon_and_osd_map(client) || (client->auth_err < 0),
timeout);
if (err == -EINTR || err == -ERESTARTSYS)
ceph_timeout_jiffies(timeout));
if (err < 0)
return err;
if (client->auth_err < 0)
return client->auth_err;
......@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib);
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux");
MODULE_DESCRIPTION("Ceph core library");
MODULE_LICENSE("GPL");
#ifdef __KERNEL__
# include <linux/slab.h>
# include <linux/crush/crush.h>
#else
# include <stdlib.h>
# include <assert.h>
# define kfree(x) do { if (x) free(x); } while (0)
# define BUG_ON(x) assert(!(x))
# include "crush_compat.h"
# include "crush.h"
#endif
#include <linux/crush/crush.h>
const char *crush_bucket_alg_name(int alg)
{
switch (alg) {
......@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
kfree(map->rules);
}
#ifndef __KERNEL__
kfree(map->choose_tries);
#endif
kfree(map);
}
......
#include <linux/types.h>
#include <linux/crush/hash.h>
#ifdef __KERNEL__
# include <linux/crush/hash.h>
#else
# include "hash.h"
#endif
/*
* Robert Jenkins' function for mixing 32-bit values
......