Commit 0c76c6ba authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "We have a pile of bug fixes from Ilya, including a few patches that
  sync up the CRUSH code with the latest from userspace.

  There is also a long series from Zheng that fixes various issues with
  snapshots, inline data, and directory fsync, some simplification and
  improvement in the cap release code, and a rework of the caching of
  directory contents.

  To top it off there are a few small fixes and cleanups from Benoit and
  Hong"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits)
  rbd: use GFP_NOIO in rbd_obj_request_create()
  crush: fix a bug in tree bucket decode
  libceph: Fix ceph_tcp_sendpage()'s more boolean usage
  libceph: Remove spurious kunmap() of the zero page
  rbd: queue_depth map option
  rbd: store rbd_options in rbd_device
  rbd: terminate rbd_opts_tokens with Opt_err
  ceph: fix ceph_writepages_start()
  rbd: bump queue_max_segments
  ceph: rework dcache readdir
  crush: sync up with userspace
  crush: fix crash from invalid 'take' argument
  ceph: switch some GFP_NOFS memory allocation to GFP_KERNEL
  ceph: pre-allocate data structure that tracks caps flushing
  ceph: re-send flushing caps (which are revoked) in reconnect stage
  ceph: send TID of the oldest pending caps flush to MDS
  ceph: track pending caps flushing globally
  ceph: track pending caps flushing accurately
  libceph: fix wrong name "Ceph filesystem for Linux"
  ceph: fix directory fsync
  ...
parents 8688d954 5a60e876
...@@ -346,6 +346,7 @@ struct rbd_device { ...@@ -346,6 +346,7 @@ struct rbd_device {
struct rbd_image_header header; struct rbd_image_header header;
unsigned long flags; /* possibly lock protected */ unsigned long flags; /* possibly lock protected */
struct rbd_spec *spec; struct rbd_spec *spec;
struct rbd_options *opts;
char *header_name; char *header_name;
...@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) ...@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
} }
/* /*
* mount options * (Per device) rbd map options
*/ */
enum { enum {
Opt_queue_depth,
Opt_last_int, Opt_last_int,
/* int args above */ /* int args above */
Opt_last_string, Opt_last_string,
/* string args above */ /* string args above */
Opt_read_only, Opt_read_only,
Opt_read_write, Opt_read_write,
/* Boolean args above */ Opt_err
Opt_last_bool,
}; };
static match_table_t rbd_opts_tokens = { static match_table_t rbd_opts_tokens = {
{Opt_queue_depth, "queue_depth=%d"},
/* int args above */ /* int args above */
/* string args above */ /* string args above */
{Opt_read_only, "read_only"}, {Opt_read_only, "read_only"},
{Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_only, "ro"}, /* Alternate spelling */
{Opt_read_write, "read_write"}, {Opt_read_write, "read_write"},
{Opt_read_write, "rw"}, /* Alternate spelling */ {Opt_read_write, "rw"}, /* Alternate spelling */
/* Boolean args above */ {Opt_err, NULL}
{-1, NULL}
}; };
struct rbd_options { struct rbd_options {
int queue_depth;
bool read_only; bool read_only;
}; };
#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT false #define RBD_READ_ONLY_DEFAULT false
static int parse_rbd_opts_token(char *c, void *private) static int parse_rbd_opts_token(char *c, void *private)
...@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private) ...@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private)
int token, intval, ret; int token, intval, ret;
token = match_token(c, rbd_opts_tokens, argstr); token = match_token(c, rbd_opts_tokens, argstr);
if (token < 0)
return -EINVAL;
if (token < Opt_last_int) { if (token < Opt_last_int) {
ret = match_int(&argstr[0], &intval); ret = match_int(&argstr[0], &intval);
if (ret < 0) { if (ret < 0) {
pr_err("bad mount option arg (not int) " pr_err("bad mount option arg (not int) at '%s'\n", c);
"at '%s'\n", c);
return ret; return ret;
} }
dout("got int token %d val %d\n", token, intval); dout("got int token %d val %d\n", token, intval);
} else if (token > Opt_last_int && token < Opt_last_string) { } else if (token > Opt_last_int && token < Opt_last_string) {
dout("got string token %d val %s\n", token, dout("got string token %d val %s\n", token, argstr[0].from);
argstr[0].from);
} else if (token > Opt_last_string && token < Opt_last_bool) {
dout("got Boolean token %d\n", token);
} else { } else {
dout("got token %d\n", token); dout("got token %d\n", token);
} }
switch (token) { switch (token) {
case Opt_queue_depth:
if (intval < 1) {
pr_err("queue_depth out of range\n");
return -EINVAL;
}
rbd_opts->queue_depth = intval;
break;
case Opt_read_only: case Opt_read_only:
rbd_opts->read_only = true; rbd_opts->read_only = true;
break; break;
...@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private) ...@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private)
rbd_opts->read_only = false; rbd_opts->read_only = false;
break; break;
default: default:
rbd_assert(false); /* libceph prints "bad option" msg */
break; return -EINVAL;
} }
return 0; return 0;
} }
...@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request) ...@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
/* /*
* Wait for an object request to complete. If interrupted, cancel the * Wait for an object request to complete. If interrupted, cancel the
* underlying osd request. * underlying osd request.
*
* @timeout: in jiffies, 0 means "wait forever"
*/ */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
unsigned long timeout)
{ {
int ret; long ret;
dout("%s %p\n", __func__, obj_request); dout("%s %p\n", __func__, obj_request);
ret = wait_for_completion_interruptible_timeout(
ret = wait_for_completion_interruptible(&obj_request->completion); &obj_request->completion,
if (ret < 0) { ceph_timeout_jiffies(timeout));
dout("%s %p interrupted\n", __func__, obj_request); if (ret <= 0) {
if (ret == 0)
ret = -ETIMEDOUT;
rbd_obj_request_end(obj_request); rbd_obj_request_end(obj_request);
return ret; } else {
ret = 0;
} }
dout("%s %p done\n", __func__, obj_request); dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
return 0; return ret;
}
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
return __rbd_obj_request_wait(obj_request, 0);
}
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
return __rbd_obj_request_wait(obj_request, timeout);
} }
static void rbd_img_request_complete(struct rbd_img_request *img_request) static void rbd_img_request_complete(struct rbd_img_request *img_request)
...@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, ...@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
rbd_assert(obj_request_type_valid(type)); rbd_assert(obj_request_type_valid(type));
size = strlen(object_name) + 1; size = strlen(object_name) + 1;
name = kmalloc(size, GFP_KERNEL); name = kmalloc(size, GFP_NOIO);
if (!name) if (!name)
return NULL; return NULL;
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
if (!obj_request) { if (!obj_request) {
kfree(name); kfree(name);
return NULL; return NULL;
...@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, ...@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
} }
if (opcode == CEPH_OSD_OP_DELETE) if (opcode == CEPH_OSD_OP_DELETE)
osd_req_op_init(osd_request, num_ops, opcode); osd_req_op_init(osd_request, num_ops, opcode, 0);
else else
osd_req_op_extent_init(osd_request, num_ops, opcode, osd_req_op_extent_init(osd_request, num_ops, opcode,
offset, length, 0, 0); offset, length, 0, 0);
...@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) ...@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
goto out; goto out;
stat_request->callback = rbd_img_obj_exists_callback; stat_request->callback = rbd_img_obj_exists_callback;
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
false, false); false, false);
rbd_osd_req_format_read(stat_request); rbd_osd_req_format_read(stat_request);
...@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( ...@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
bool watch) bool watch)
{ {
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_options *opts = osdc->client->options;
struct rbd_obj_request *obj_request; struct rbd_obj_request *obj_request;
int ret; int ret;
...@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( ...@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
if (ret) if (ret)
goto out; goto out;
ret = rbd_obj_request_wait(obj_request); ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
if (ret) if (ret)
goto out; goto out;
...@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE; rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
...@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
/* set io sizes to object size */ /* set io sizes to object size */
segment_size = rbd_obj_bytes(&rbd_dev->header); segment_size = rbd_obj_bytes(&rbd_dev->header);
blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
blk_queue_max_segment_size(q, segment_size); blk_queue_max_segment_size(q, segment_size);
blk_queue_io_min(q, segment_size); blk_queue_io_min(q, segment_size);
blk_queue_io_opt(q, segment_size); blk_queue_io_opt(q, segment_size);
...@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref) ...@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref)
} }
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
struct rbd_spec *spec) struct rbd_spec *spec,
struct rbd_options *opts)
{ {
struct rbd_device *rbd_dev; struct rbd_device *rbd_dev;
...@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, ...@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem); init_rwsem(&rbd_dev->header_rwsem);
rbd_dev->spec = spec;
rbd_dev->rbd_client = rbdc; rbd_dev->rbd_client = rbdc;
rbd_dev->spec = spec;
rbd_dev->opts = opts;
/* Initialize the layout used for all rbd requests */ /* Initialize the layout used for all rbd requests */
...@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) ...@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{ {
rbd_put_client(rbd_dev->rbd_client); rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec); rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts);
kfree(rbd_dev); kfree(rbd_dev);
} }
...@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf, ...@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
goto out_mem; goto out_mem;
rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
copts = ceph_parse_options(options, mon_addrs, copts = ceph_parse_options(options, mon_addrs,
mon_addrs + mon_addrs_size - 1, mon_addrs + mon_addrs_size - 1,
...@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf, ...@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf,
*/ */
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{ {
struct ceph_options *opts = rbdc->client->options;
u64 newest_epoch; u64 newest_epoch;
unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
int tries = 0; int tries = 0;
int ret; int ret;
...@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) ...@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
ceph_monc_request_next_osdmap(&rbdc->client->monc); ceph_monc_request_next_osdmap(&rbdc->client->monc);
(void) ceph_monc_wait_osdmap(&rbdc->client->monc, (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
newest_epoch, timeout); newest_epoch,
opts->mount_timeout);
goto again; goto again;
} else { } else {
/* the osdmap we have is new enough */ /* the osdmap we have is new enough */
...@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) ...@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
rbdc = __rbd_get_client(rbd_dev->rbd_client); rbdc = __rbd_get_client(rbd_dev->rbd_client);
ret = -ENOMEM; ret = -ENOMEM;
parent = rbd_dev_create(rbdc, parent_spec); parent = rbd_dev_create(rbdc, parent_spec, NULL);
if (!parent) if (!parent)
goto out_err; goto out_err;
...@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
if (rc < 0) if (rc < 0)
goto err_out_module; goto err_out_module;
read_only = rbd_opts->read_only;
kfree(rbd_opts);
rbd_opts = NULL; /* done with this */
rbdc = rbd_get_client(ceph_opts); rbdc = rbd_get_client(ceph_opts);
if (IS_ERR(rbdc)) { if (IS_ERR(rbdc)) {
...@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
goto err_out_client; goto err_out_client;
} }
rbd_dev = rbd_dev_create(rbdc, spec); rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
if (!rbd_dev) if (!rbd_dev)
goto err_out_client; goto err_out_client;
rbdc = NULL; /* rbd_dev now owns this */ rbdc = NULL; /* rbd_dev now owns this */
spec = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */
rbd_opts = NULL; /* rbd_dev now owns this */
rc = rbd_dev_image_probe(rbd_dev, true); rc = rbd_dev_image_probe(rbd_dev, true);
if (rc < 0) if (rc < 0)
...@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
/* If we are mapping a snapshot it must be marked read-only */ /* If we are mapping a snapshot it must be marked read-only */
read_only = rbd_dev->opts->read_only;
if (rbd_dev->spec->snap_id != CEPH_NOSNAP) if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
read_only = true; read_only = true;
rbd_dev->mapping.read_only = read_only; rbd_dev->mapping.read_only = read_only;
...@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rbd_put_client(rbdc); rbd_put_client(rbdc);
err_out_args: err_out_args:
rbd_spec_put(spec); rbd_spec_put(spec);
kfree(rbd_opts);
err_out_module: err_out_module:
module_put(THIS_MODULE); module_put(THIS_MODULE);
......
...@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ...@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
val_size2 = posix_acl_xattr_size(default_acl->a_count); val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM; err = -ENOMEM;
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf) if (!tmp_buf)
goto out_err; goto out_err;
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
if (!pagelist) if (!pagelist)
goto out_err; goto out_err;
ceph_pagelist_init(pagelist); ceph_pagelist_init(pagelist);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ...@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR: case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file, dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode); inode->i_mode);
cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) { if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM; return -ENOMEM;
} }
cf->fmode = fmode; cf->fmode = fmode;
cf->next_offset = 2; cf->next_offset = 2;
cf->readdir_cache_idx = -1;
file->private_data = cf; file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release); BUG_ON(inode->i_fop->release != ceph_release);
break; break;
...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir); ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name); kfree(cf->last_name);
kfree(cf->dir_info); kfree(cf->dir_info);
dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf); kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */ /* wake up anyone waiting for caps on this inode */
...@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, ...@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
} }
} else { } else {
num_pages = calc_pages_for(off, len); num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) if (IS_ERR(pages))
return PTR_ERR(pages); return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages, ret = striped_read(inode, off, len, pages,
...@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) ...@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.) * objects, rollback on failure, etc.)
*/ */
static ssize_t static ssize_t
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct page **pages; struct page **pages;
...@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t start; size_t start;
ssize_t n; ssize_t n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode); vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, vino, pos, &len, 0,
...@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break; break;
} }
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start); n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) { if (unlikely(n < 0)) {
...@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
* objects, rollback on failure, etc.) * objects, rollback on failure, etc.)
*/ */
static ssize_t static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct page **pages; struct page **pages;
...@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t left; size_t left;
int n; int n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode); vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1, vino, pos, &len, 0, 1,
...@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/ */
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) { if (IS_ERR(pages)) {
ret = PTR_ERR(pages); ret = PTR_ERR(pages);
goto out; goto out;
...@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) ...@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct page *page = NULL; struct page *page = NULL;
loff_t i_size; loff_t i_size;
if (retry_op == READ_INLINE) { if (retry_op == READ_INLINE) {
page = __page_cache_alloc(GFP_NOFS); page = __page_cache_alloc(GFP_KERNEL);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
} }
...@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc; &ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0; ssize_t count, written = 0;
int err, want, got; int err, want, got;
loff_t pos; loff_t pos;
...@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ceph_snap(inode) != CEPH_NOSNAP) if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */ /* We can write back this queue in page reclaim */
...@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
struct ceph_snap_context *snapc;
struct iov_iter data; struct iov_iter data;
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
struct ceph_cap_snap,
ci_item);
snapc = ceph_get_snap_context(capsnap->context);
} else {
BUG_ON(!ci->i_head_snapc);
snapc = ceph_get_snap_context(ci->i_head_snapc);
}
spin_unlock(&ci->i_ceph_lock);
/* we might need to revert back to that point */ /* we might need to revert back to that point */
data = *from; data = *from;
if (iocb->ki_flags & IOCB_DIRECT) if (iocb->ki_flags & IOCB_DIRECT)
written = ceph_sync_direct_write(iocb, &data, pos); written = ceph_sync_direct_write(iocb, &data, pos,
snapc);
else else
written = ceph_sync_write(iocb, &data, pos); written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) { if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u" dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n", "got EOLDSNAPC, retrying\n",
...@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
} }
if (written > 0) if (written > 0)
iov_iter_advance(from, written); iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else { } else {
loff_t old_size = inode->i_size; loff_t old_size = inode->i_size;
/* /*
...@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
int dirty; int dirty;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE; ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
...@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
out: out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
out_unlocked: out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL; current->backing_dev_info = NULL;
return written ? written : err; return written ? written : err;
} }
...@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc; &ceph_inode_to_client(inode)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
int want, got = 0; int want, got = 0;
int dirty; int dirty;
int ret = 0; int ret = 0;
...@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode)) if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP; return -EOPNOTSUPP;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) { if (ceph_snap(inode) != CEPH_NOSNAP) {
...@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) { if (!ret) {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE; ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
...@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got); ceph_put_cap_refs(ci, got);
unlock: unlock:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
ceph_free_cap_flush(prealloc_cf);
return ret; return ret;
} }
......
...@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0; ci->i_inline_version = 0;
ci->i_time_warp_seq = 0; ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0; ci->i_ceph_flags = 0;
ci->i_ordered_count = 0; atomic64_set(&ci->i_ordered_count, 1);
atomic_set(&ci->i_release_count, 1); atomic64_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0); atomic64_set(&ci->i_complete_seq[0], 0);
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL; ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
...@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0; ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item); INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_cap_flush_seq = 0; ci->i_prealloc_cap_flush = NULL;
ci->i_cap_flush_last_tid = 0; ci->i_cap_flush_tree = RB_ROOT;
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
init_waitqueue_head(&ci->i_cap_wq); init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0; ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0; ci->i_hold_caps_max = 0;
...@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version || if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout; ci->i_layout = info->layout;
queue_trunc = ceph_fill_file_size(inode, issued, queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq), le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size), le64_to_cpu(info->truncate_size),
...@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 && (issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) { !__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode); dout(" marking %p complete (empty)\n", inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci, __ceph_dir_set_complete(ci,
atomic_read(&ci->i_release_count), atomic64_read(&ci->i_release_count),
ci->i_ordered_count); atomic64_read(&ci->i_ordered_count));
} }
wake = true; wake = true;
...@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout("fill_trace doing d_move %p -> %p\n", dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn); req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn); d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n", dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry, req->r_old_dentry,
...@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn); ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry, dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset);
...@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, ...@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err; return err;
} }
/*
 * Let go of the page currently held by a readdir cache control, if any:
 * unmap it, drop the page reference, and clear ctl->page so that calling
 * this again is a no-op.
 */
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
	struct page *held = ctl->page;

	if (!held)
		return;

	kunmap(held);
	page_cache_release(held);
	ctl->page = NULL;
}
/*
 * Record dentry @dn in the readdir cache of directory @dir at position
 * ctl->index.
 *
 * The cache is an array of dentry pointers kept in the pages of
 * dir->i_data.  ctl->index selects the page (index / pointers-per-page)
 * and the slot inside that page (index % pointers-per-page).  The page
 * holding the slot is grabbed and mapped on demand and stays attached to
 * @ctl until a different page is needed.
 *
 * The dentry is stored, and ctl->index advanced, only while the
 * release/ordered counts saved in @req still equal the inode's current
 * counts.  Otherwise caching is turned off by setting ctl->index to -1.
 *
 * Returns 0 on success (including the "cache disabled" case), or -ENOMEM
 * when the cache page cannot be obtained; ctl->index is set to -1 in that
 * case as well.
 */
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
			      struct ceph_readdir_cache_control *ctl,
			      struct ceph_mds_request *req)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	unsigned per_page = PAGE_CACHE_SIZE / sizeof(struct dentry*);
	unsigned slot = ctl->index % per_page;
	pgoff_t wanted = ctl->index / per_page;

	/* switch pages unless the one we hold already covers this slot */
	if (!(ctl->page && wanted == page_index(ctl->page))) {
		ceph_readdir_cache_release(ctl);
		ctl->page = grab_cache_page(&dir->i_data, wanted);
		if (!ctl->page) {
			ctl->index = -1;
			return -ENOMEM;
		}
		/* reading/filling the cache are serialized by
		 * i_mutex, no need to use page lock */
		unlock_page(ctl->page);
		ctl->dentries = kmap(ctl->page);
	}

	/* counts moved since the request sampled them: stop caching */
	if (req->r_dir_release_cnt != atomic64_read(&ci->i_release_count) ||
	    req->r_dir_ordered_cnt != atomic64_read(&ci->i_ordered_count)) {
		dout("disable readdir cache\n");
		ctl->index = -1;
		return 0;
	}

	dout("readdir cache dn %p idx %d\n", dn, ctl->index);
	ctl->dentries[slot] = dn;
	ctl->index++;
	return 0;
}
int ceph_readdir_prepopulate(struct ceph_mds_request *req, int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
...@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL; struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir && if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) { le32_to_cpu(rinfo->dir_dir->frag) != frag) {
...@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag)); frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag)) if (ceph_frag_is_leftmost(frag))
r_readdir_offset = 2; req->r_readdir_offset = 2;
else else
r_readdir_offset = 0; req->r_readdir_offset = 0;
} }
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent)); snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir); parent = d_find_alias(snapdir);
...@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
} }
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
/* FIXME: release caps/leases if error occurs */ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino; struct ceph_vino vino;
...@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
d_delete(dn); d_delete(dn);
dput(dn); dput(dn);
goto retry_lookup; goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
} }
/* inode */ /* inode */
...@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
} }
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1, req->r_request_started, -1,
&req->r_caps_reservation) < 0) { &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in); pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) if (d_really_is_negative(dn))
iput(in); iput(in);
d_drop(dn); d_drop(dn);
err = ret;
goto next_item; goto next_item;
} }
...@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
di = dn->d_fsdata; di = dn->d_fsdata;
di->offset = ceph_make_fpos(frag, i + r_readdir_offset); di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i], update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session, req->r_session,
req->r_request_started); req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
err = ret;
}
next_item: next_item:
if (dn) if (dn)
dput(dn); dput(dn);
} }
if (err == 0)
req->r_did_prepopulate = true;
out: out:
if (err == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
if (snapdir) { if (snapdir) {
iput(snapdir); iput(snapdir);
dput(parent); dput(parent);
...@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
const unsigned int ia_valid = attr->ia_valid; const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
int issued; int issued;
int release = 0, dirtied = 0; int release = 0, dirtied = 0;
int mask = 0; int mask = 0;
int err = 0; int err = 0;
int inode_dirty_flags = 0; int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP) if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
...@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0) if (err != 0)
return err; return err;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS); USE_AUTH_MDS);
if (IS_ERR(req)) if (IS_ERR(req)) {
ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req); return PTR_ERR(req);
}
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
if (!ci->i_head_snapc &&
(issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
}
}
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) { if (ia_valid & ATTR_UID) {
...@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode); dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) { if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
} }
release &= issued; release &= issued;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (inode_dirty_flags) if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags); __mark_inode_dirty(inode, inode_dirty_flags);
...@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE) if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode); __ceph_do_pending_vmtruncate(inode);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
out_put: out_put:
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
} }
......
This diff is collapsed.
...@@ -139,7 +139,6 @@ struct ceph_mds_session { ...@@ -139,7 +139,6 @@ struct ceph_mds_session {
int s_cap_reconnect; int s_cap_reconnect;
int s_readonly; int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator; struct ceph_cap *s_cap_iterator;
/* protected by mutex */ /* protected by mutex */
...@@ -228,7 +227,7 @@ struct ceph_mds_request { ...@@ -228,7 +227,7 @@ struct ceph_mds_request {
int r_err; int r_err;
bool r_aborted; bool r_aborted;
unsigned long r_timeout; /* optional. jiffies */ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */ unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only, unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */ used to measure lease durations */
...@@ -254,12 +253,21 @@ struct ceph_mds_request { ...@@ -254,12 +253,21 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result; bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate; bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset; u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation; struct ceph_cap_reservation r_caps_reservation;
int r_num_caps; int r_num_caps;
}; };
/*
 * One per-pool permission record, keyed by pool id and linkable into a
 * red-black tree.
 * NOTE(review): presumably these live in ceph_mds_client.pool_perm_tree
 * and are guarded by pool_perm_rwsem -- confirm against the users.
 */
struct ceph_pool_perm {
/* rb-tree linkage */
struct rb_node node;
/* pool id this record describes */
u32 pool;
/* permission value for the pool; encoding is not visible here -- TODO confirm */
int perm;
};
/* /*
* mds client state * mds client state
*/ */
...@@ -284,12 +292,15 @@ struct ceph_mds_client { ...@@ -284,12 +292,15 @@ struct ceph_mds_client {
* references (implying they contain no inodes with caps) that * references (implying they contain no inodes with caps) that
* should be destroyed. * should be destroyed.
*/ */
u64 last_snap_seq;
struct rw_semaphore snap_rwsem; struct rw_semaphore snap_rwsem;
struct rb_root snap_realms; struct rb_root snap_realms;
struct list_head snap_empty; struct list_head snap_empty;
spinlock_t snap_empty_lock; /* protect snap_empty */ spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */ u64 last_tid; /* most recent mds request */
u64 oldest_tid; /* oldest incomplete mds request,
excluding setfilelock requests */
struct rb_root request_tree; /* pending mds requests */ struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */ struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */ unsigned long last_renew_caps; /* last time we renewed our caps */
...@@ -298,7 +309,8 @@ struct ceph_mds_client { ...@@ -298,7 +309,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */ struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock; spinlock_t snap_flush_lock;
u64 cap_flush_seq; u64 last_cap_flush_tid;
struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */ struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */ int num_cap_flushing; /* # caps we are flushing */
...@@ -328,6 +340,9 @@ struct ceph_mds_client { ...@@ -328,6 +340,9 @@ struct ceph_mds_client {
spinlock_t dentry_lru_lock; spinlock_t dentry_lru_lock;
struct list_head dentry_lru; struct list_head dentry_lru;
int num_dentry; int num_dentry;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
}; };
extern const char *ceph_mds_op_name(int op); extern const char *ceph_mds_op_name(int op);
...@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) ...@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request); kref_put(&req->r_kref, ceph_mdsc_release_request);
} }
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session); struct ceph_mds_session *session);
......
...@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) ...@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
} }
static struct ceph_snap_context *empty_snapc; struct ceph_snap_context *ceph_empty_snapc;
/* /*
* build the snap context for a given realm. * build the snap context for a given realm.
...@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0; return 0;
} }
if (num == 0 && realm->seq == empty_snapc->seq) { if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
ceph_get_snap_context(empty_snapc); ceph_get_snap_context(ceph_empty_snapc);
snapc = empty_snapc; snapc = ceph_empty_snapc;
goto done; goto done;
} }
...@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) ...@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
return 0; return 0;
} }
/*
 * Tell whether snap context @n holds a snapshot newer than the sequence
 * number of context @o.  Snaps are stored in descending order, so
 * n->snaps[0] is the only entry that needs checking; an empty @n has no
 * snapshots and therefore nothing new.
 */
static bool has_new_snaps(struct ceph_snap_context *o,
			  struct ceph_snap_context *n)
{
	return n->num_snaps != 0 && n->snaps[0] > o->seq;
}
/* /*
* When a snapshot is applied, the size/mtime inode metadata is queued * When a snapshot is applied, the size/mtime inode metadata is queued
...@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{ {
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap; struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
int used, dirty; int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
...@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
used = __ceph_caps_used(ci); used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci); dirty = __ceph_caps_dirty(ci);
old_snapc = ci->i_head_snapc;
new_snapc = ci->i_snap_realm->cached_context;
/* /*
* If there is a write in progress, treat that as a dirty Fw, * If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish * even though it hasn't completed yet; by the time we finish
...@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
writes in progress now were started before the previous writes in progress now were started before the previous
cap_snap. lucky us. */ cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode); dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap); goto update_snapc;
} else if (ci->i_snap_realm->cached_context == empty_snapc) { }
dout("queue_cap_snap %p empty snapc\n", inode); if (ci->i_wrbuffer_ref_head == 0 &&
kfree(capsnap); !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| dout("queue_cap_snap %p nothing dirty|writing\n", inode);
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { goto update_snapc;
struct ceph_snap_context *snapc = ci->i_head_snapc; }
/*
* if we are a sync write, we may need to go to the snaprealm
* to get the current snapc.
*/
if (!snapc)
snapc = ci->i_snap_realm->cached_context;
dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", BUG_ON(!old_snapc);
inode, capsnap, snapc, ceph_cap_string(dirty));
ihold(inode);
atomic_set(&capsnap->nref, 1); /*
capsnap->ci = ci; * There is no need to send FLUSHSNAP message to MDS if there is
INIT_LIST_HEAD(&capsnap->ci_item); * no new snapshot. But when there is dirty pages or on-going
INIT_LIST_HEAD(&capsnap->flushing_item); * writes, we still need to create cap_snap. cap_snap is needed
* by the write path and page writeback path.
capsnap->follows = snapc->seq; *
capsnap->issued = __ceph_caps_issued(ci, NULL); * also see ceph_try_drop_cap_snap()
capsnap->dirty = dirty; */
if (has_new_snaps(old_snapc, new_snapc)) {
capsnap->mode = inode->i_mode; if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
capsnap->uid = inode->i_uid; capsnap->need_flush = true;
capsnap->gid = inode->i_gid; } else {
if (!(used & CEPH_CAP_FILE_WR) &&
if (dirty & CEPH_CAP_XATTR_EXCL) { ci->i_wrbuffer_ref_head == 0) {
__ceph_build_xattrs_blob(ci); dout("queue_cap_snap %p "
capsnap->xattr_blob = "no new_snap|dirty_page|writing\n", inode);
ceph_buffer_get(ci->i_xattrs.blob); goto update_snapc;
capsnap->xattr_version = ci->i_xattrs.version;
} else {
capsnap->xattr_blob = NULL;
capsnap->xattr_version = 0;
} }
}
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
inode, capsnap, old_snapc, ceph_cap_string(dirty),
/* dirty page count moved from _head to this cap_snap; capsnap->need_flush ? "" : "no_flush");
all subsequent writes page dirties occur _after_ this ihold(inode);
snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head; atomic_set(&capsnap->nref, 1);
ci->i_wrbuffer_ref_head = 0; capsnap->ci = ci;
capsnap->context = snapc; INIT_LIST_HEAD(&capsnap->ci_item);
ci->i_head_snapc = INIT_LIST_HEAD(&capsnap->flushing_item);
ceph_get_snap_context(ci->i_snap_realm->cached_context);
dout(" new snapc is %p\n", ci->i_head_snapc); capsnap->follows = old_snapc->seq;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p" capsnap->mode = inode->i_mode;
" seq %llu used WR, now pending\n", inode, capsnap->uid = inode->i_uid;
capsnap, snapc, snapc->seq); capsnap->gid = inode->i_gid;
capsnap->writing = 1;
} else { if (dirty & CEPH_CAP_XATTR_EXCL) {
/* note mtime, size NOW. */ __ceph_build_xattrs_blob(ci);
__ceph_finish_cap_snap(ci, capsnap); capsnap->xattr_blob =
} ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
} else { } else {
dout("queue_cap_snap %p nothing dirty|writing\n", inode); capsnap->xattr_blob = NULL;
kfree(capsnap); capsnap->xattr_version = 0;
} }
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
/* dirty page count moved from _head to this cap_snap;
all subsequent writes page dirties occur _after_ this
snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
" seq %llu used WR, now pending\n", inode,
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
update_snapc:
if (ci->i_head_snapc) {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
kfree(capsnap);
ceph_put_snap_context(old_snapc);
} }
/* /*
...@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, ...@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
/* queue realm for cap_snap creation */ /* queue realm for cap_snap creation */
list_add(&realm->dirty_item, &dirty_realms); list_add(&realm->dirty_item, &dirty_realms);
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
invalidate = 1; invalidate = 1;
} else if (!realm->cached_context) { } else if (!realm->cached_context) {
...@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ...@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
int __init ceph_snap_init(void) int __init ceph_snap_init(void)
{ {
empty_snapc = ceph_create_snap_context(0, GFP_NOFS); ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
if (!empty_snapc) if (!ceph_empty_snapc)
return -ENOMEM; return -ENOMEM;
empty_snapc->seq = 1; ceph_empty_snapc->seq = 1;
return 0; return 0;
} }
void ceph_snap_exit(void) void ceph_snap_exit(void)
{ {
ceph_put_snap_context(empty_snapc); ceph_put_snap_context(ceph_empty_snapc);
} }
...@@ -134,10 +134,12 @@ enum { ...@@ -134,10 +134,12 @@ enum {
Opt_noino32, Opt_noino32,
Opt_fscache, Opt_fscache,
Opt_nofscache, Opt_nofscache,
Opt_poolperm,
Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
Opt_acl, Opt_acl,
#endif #endif
Opt_noacl Opt_noacl,
}; };
static match_table_t fsopt_tokens = { static match_table_t fsopt_tokens = {
...@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { ...@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"}, {Opt_noino32, "noino32"},
{Opt_fscache, "fsc"}, {Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"}, {Opt_nofscache, "nofsc"},
{Opt_poolperm, "poolperm"},
{Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
{Opt_acl, "acl"}, {Opt_acl, "acl"},
#endif #endif
...@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) ...@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache: case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break; break;
case Opt_poolperm:
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
printk ("pool perm");
break;
case Opt_nopoolperm:
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl: case Opt_acl:
fsopt->sb_flags |= MS_POSIXACL; fsopt->sb_flags |= MS_POSIXACL;
...@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nodcache"); seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
seq_puts(m, ",fsc"); seq_puts(m, ",fsc");
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm");
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
if (fsopt->sb_flags & MS_POSIXACL) if (fsopt->sb_flags & MS_POSIXACL)
...@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ...@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/ */
struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep; struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_file_cachep;
...@@ -634,6 +648,10 @@ static int __init init_caches(void) ...@@ -634,6 +648,10 @@ static int __init init_caches(void)
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_cachep == NULL) if (ceph_cap_cachep == NULL)
goto bad_cap; goto bad_cap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_flush_cachep == NULL)
goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
...@@ -652,6 +670,8 @@ static int __init init_caches(void) ...@@ -652,6 +670,8 @@ static int __init init_caches(void)
bad_file: bad_file:
kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry: bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_cap_cachep);
bad_cap: bad_cap:
kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_inode_cachep);
...@@ -668,6 +688,7 @@ static void destroy_caches(void) ...@@ -668,6 +688,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_file_cachep);
...@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, ...@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP; req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started; req->r_started = started;
req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_timeout = fsc->client->options->mount_timeout;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2; req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
......
This diff is collapsed.
...@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr; struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued; int issued;
int err; int err;
int dirty = 0; int dirty = 0;
...@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
char *newval = NULL; char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL; struct ceph_inode_xattr *xattr = NULL;
int required_blob_size; int required_blob_size;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr) if (!xattr)
goto out; goto out;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
goto out;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
retry: retry:
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync; goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode); __build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len); required_blob_size = __get_required_blob_size(ci, name_len, val_len);
...@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
dout(" preaallocating new blob size=%d\n", required_blob_size); dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS); blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob) if (!blob)
goto out; goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob) if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_buffer_put(ci->i_xattrs.prealloc_blob);
...@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
flags, value ? 1 : -1, &xattr); flags, value ? 1 : -1, &xattr);
if (!err) { if (!err) {
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked: do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
err = ceph_sync_setxattr(dentry, name, value, size, flags); err = ceph_sync_setxattr(dentry, name, value, size, flags);
out: out:
ceph_free_cap_flush(prealloc_cf);
kfree(newname); kfree(newname);
kfree(newval); kfree(newval);
kfree(xattr); kfree(xattr);
...@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr; struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued; int issued;
int err; int err;
int required_blob_size; int required_blob_size;
int dirty; int dirty;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked; goto do_sync_unlocked;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
err = -ENOMEM; err = -ENOMEM;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
retry: retry:
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync; goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode); __build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, 0, 0); required_blob_size = __get_required_blob_size(ci, 0, 0);
...@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
dout(" preaallocating new blob size=%d\n", required_blob_size); dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS); blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob) if (!blob)
goto out; goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob) if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_buffer_put(ci->i_xattrs.prealloc_blob);
...@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
err = __remove_xattr_by_name(ceph_inode(inode), name); err = __remove_xattr_by_name(ceph_inode(inode), name);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked: do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
ceph_free_cap_flush(prealloc_cf);
err = ceph_send_removexattr(dentry, name); err = ceph_send_removexattr(dentry, name);
out:
return err; return err;
} }
......
...@@ -43,9 +43,9 @@ struct ceph_options { ...@@ -43,9 +43,9 @@ struct ceph_options {
int flags; int flags;
struct ceph_fsid fsid; struct ceph_fsid fsid;
struct ceph_entity_addr my_addr; struct ceph_entity_addr my_addr;
int mount_timeout; unsigned long mount_timeout; /* jiffies */
int osd_idle_ttl; unsigned long osd_idle_ttl; /* jiffies */
int osd_keepalive_timeout; unsigned long osd_keepalive_timeout; /* jiffies */
/* /*
* any type that can't be simply compared or doesn't need * any type that can't be simply compared or doesn't need
...@@ -63,9 +63,9 @@ struct ceph_options { ...@@ -63,9 +63,9 @@ struct ceph_options {
/* /*
* defaults * defaults
*/ */
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
...@@ -93,13 +93,9 @@ enum { ...@@ -93,13 +93,9 @@ enum {
CEPH_MOUNT_SHUTDOWN, CEPH_MOUNT_SHUTDOWN,
}; };
/* static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
* subtract jiffies
*/
static inline unsigned long time_sub(unsigned long a, unsigned long b)
{ {
BUG_ON(time_after(b, a)); return timeout ?: MAX_SCHEDULE_TIMEOUT;
return (long)a - (long)b;
} }
struct ceph_mds_client; struct ceph_mds_client;
...@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len) ...@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep; extern struct kmem_cache *ceph_file_cachep;
......
...@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, ...@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg); struct ceph_msg *msg);
extern void osd_req_op_init(struct ceph_osd_request *osd_req, extern void osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode); unsigned int which, u16 opcode, u32 flags);
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
unsigned int which, unsigned int which,
......
#ifndef CEPH_CRUSH_CRUSH_H #ifndef CEPH_CRUSH_CRUSH_H
#define CEPH_CRUSH_CRUSH_H #define CEPH_CRUSH_CRUSH_H
#include <linux/types.h> #ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
/* /*
* CRUSH is a pseudo-random data distribution algorithm that * CRUSH is a pseudo-random data distribution algorithm that
...@@ -20,7 +24,11 @@ ...@@ -20,7 +24,11 @@
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */ #define CRUSH_ITEM_NONE 0x7fffffff /* no result */
...@@ -108,6 +116,15 @@ enum { ...@@ -108,6 +116,15 @@ enum {
}; };
extern const char *crush_bucket_alg_name(int alg); extern const char *crush_bucket_alg_name(int alg);
/*
* although tree was a legacy algorithm, it has been buggy, so
* exclude it.
*/
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
(1 << CRUSH_BUCKET_UNIFORM) | \
(1 << CRUSH_BUCKET_LIST) | \
(1 << CRUSH_BUCKET_STRAW))
struct crush_bucket { struct crush_bucket {
__s32 id; /* this'll be negative */ __s32 id; /* this'll be negative */
__u16 type; /* non-zero; type=0 is reserved for devices */ __u16 type; /* non-zero; type=0 is reserved for devices */
...@@ -174,7 +191,7 @@ struct crush_map { ...@@ -174,7 +191,7 @@ struct crush_map {
/* choose local attempts using a fallback permutation before /* choose local attempts using a fallback permutation before
* re-descent */ * re-descent */
__u32 choose_local_fallback_tries; __u32 choose_local_fallback_tries;
/* choose attempts before giving up */ /* choose attempts before giving up */
__u32 choose_total_tries; __u32 choose_total_tries;
/* attempt chooseleaf inner descent once for firstn mode; on /* attempt chooseleaf inner descent once for firstn mode; on
* reject retry outer descent. Note that this does *not* * reject retry outer descent. Note that this does *not*
...@@ -187,6 +204,25 @@ struct crush_map { ...@@ -187,6 +204,25 @@ struct crush_map {
* that want to limit reshuffling, a value of 3 or 4 will make the * that want to limit reshuffling, a value of 3 or 4 will make the
* mappings line up a bit better with previous mappings. */ * mappings line up a bit better with previous mappings. */
__u8 chooseleaf_vary_r; __u8 chooseleaf_vary_r;
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
* fixes a few of them.
*/
__u8 straw_calc_version;
/*
* allowed bucket algs is a bitmask, here the bit positions
* are CRUSH_BUCKET_*. note that these are *bits* and
* CRUSH_BUCKET_* values are not, so we need to or together (1
* << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
* minimize confusion (bucket type values start at 1).
*/
__u32 allowed_bucket_algs;
__u32 *choose_tries;
#endif
}; };
......
#ifndef CEPH_CRUSH_HASH_H #ifndef CEPH_CRUSH_HASH_H
#define CEPH_CRUSH_HASH_H #define CEPH_CRUSH_HASH_H
#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
#define CRUSH_HASH_RJENKINS1 0 #define CRUSH_HASH_RJENKINS1 0
#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* LGPL2 * LGPL2
*/ */
#include <linux/crush/crush.h> #include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
extern int crush_do_rule(const struct crush_map *map, extern int crush_do_rule(const struct crush_map *map,
......
...@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name, ...@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
/* start with defaults */ /* start with defaults */
opt->flags = CEPH_OPT_DEFAULT; opt->flags = CEPH_OPT_DEFAULT;
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
/* get mon ip(s) */ /* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */ /* ip1[:port1][,ip2[:port2]...] */
...@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name, ...@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
pr_warn("ignoring deprecated osdtimeout option\n"); pr_warn("ignoring deprecated osdtimeout option\n");
break; break;
case Opt_osdkeepalivetimeout: case Opt_osdkeepalivetimeout:
opt->osd_keepalive_timeout = intval; /* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osdkeepalive out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_keepalive_timeout =
msecs_to_jiffies(intval * 1000);
break; break;
case Opt_osd_idle_ttl: case Opt_osd_idle_ttl:
opt->osd_idle_ttl = intval; /* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osd_idle_ttl out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
break; break;
case Opt_mount_timeout: case Opt_mount_timeout:
opt->mount_timeout = intval; /* 0 is "wait forever" (i.e. infinite timeout) */
if (intval < 0 || intval > INT_MAX / 1000) {
pr_err("mount_timeout out of range\n");
err = -EINVAL;
goto out;
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break; break;
case Opt_share: case Opt_share:
...@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) ...@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "notcp_nodelay,"); seq_puts(m, "notcp_nodelay,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); seq_printf(m, "mount_timeout=%d,",
jiffies_to_msecs(opt->mount_timeout) / 1000);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); seq_printf(m, "osd_idle_ttl=%d,",
jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,", seq_printf(m, "osdkeepalivetimeout=%d,",
opt->osd_keepalive_timeout); jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
/* drop redundant comma */ /* drop redundant comma */
if (m->count != pos) if (m->count != pos)
...@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client) ...@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
*/ */
int __ceph_open_session(struct ceph_client *client, unsigned long started) int __ceph_open_session(struct ceph_client *client, unsigned long started)
{ {
int err; unsigned long timeout = client->options->mount_timeout;
unsigned long timeout = client->options->mount_timeout * HZ; long err;
/* open session, and wait for mon and osd maps */ /* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc); err = ceph_monc_open_session(&client->monc);
...@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) ...@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return err; return err;
while (!have_mon_and_osd_map(client)) { while (!have_mon_and_osd_map(client)) {
err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout)) if (timeout && time_after_eq(jiffies, started + timeout))
return err; return -ETIMEDOUT;
/* wait */ /* wait */
dout("mount waiting for mon_map\n"); dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq, err = wait_event_interruptible_timeout(client->auth_wq,
have_mon_and_osd_map(client) || (client->auth_err < 0), have_mon_and_osd_map(client) || (client->auth_err < 0),
timeout); ceph_timeout_jiffies(timeout));
if (err == -EINTR || err == -ERESTARTSYS) if (err < 0)
return err; return err;
if (client->auth_err < 0) if (client->auth_err < 0)
return client->auth_err; return client->auth_err;
...@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib); ...@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib);
MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux"); MODULE_DESCRIPTION("Ceph core library");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
#ifdef __KERNEL__ #ifdef __KERNEL__
# include <linux/slab.h> # include <linux/slab.h>
# include <linux/crush/crush.h>
#else #else
# include <stdlib.h> # include "crush_compat.h"
# include <assert.h> # include "crush.h"
# define kfree(x) do { if (x) free(x); } while (0)
# define BUG_ON(x) assert(!(x))
#endif #endif
#include <linux/crush/crush.h>
const char *crush_bucket_alg_name(int alg) const char *crush_bucket_alg_name(int alg)
{ {
switch (alg) { switch (alg) {
...@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map) ...@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
kfree(map->rules); kfree(map->rules);
} }
#ifndef __KERNEL__
kfree(map->choose_tries);
#endif
kfree(map); kfree(map);
} }
......
This diff is collapsed.
#ifdef __KERNEL__
#include <linux/types.h> # include <linux/crush/hash.h>
#include <linux/crush/hash.h> #else
# include "hash.h"
#endif
/* /*
* Robert Jenkins' function for mixing 32-bit values * Robert Jenkins' function for mixing 32-bit values
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment