Commit 0c76c6ba authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "We have a pile of bug fixes from Ilya, including a few patches that
  sync up the CRUSH code with the latest from userspace.

  There is also a long series from Zheng that fixes various issues with
  snapshots, inline data, and directory fsync, some simplification and
  improvement in the cap release code, and a rework of the caching of
  directory contents.

  To top it off there are a few small fixes and cleanups from Benoit and
  Hong"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits)
  rbd: use GFP_NOIO in rbd_obj_request_create()
  crush: fix a bug in tree bucket decode
  libceph: Fix ceph_tcp_sendpage()'s more boolean usage
  libceph: Remove spurious kunmap() of the zero page
  rbd: queue_depth map option
  rbd: store rbd_options in rbd_device
  rbd: terminate rbd_opts_tokens with Opt_err
  ceph: fix ceph_writepages_start()
  rbd: bump queue_max_segments
  ceph: rework dcache readdir
  crush: sync up with userspace
  crush: fix crash from invalid 'take' argument
  ceph: switch some GFP_NOFS memory allocation to GFP_KERNEL
  ceph: pre-allocate data structure that tracks caps flushing
  ceph: re-send flushing caps (which are revoked) in reconnect stage
  ceph: send TID of the oldest pending caps flush to MDS
  ceph: track pending caps flushing globally
  ceph: track pending caps flushing accurately
  libceph: fix wrong name "Ceph filesystem for Linux"
  ceph: fix directory fsync
  ...
parents 8688d954 5a60e876
...@@ -346,6 +346,7 @@ struct rbd_device { ...@@ -346,6 +346,7 @@ struct rbd_device {
struct rbd_image_header header; struct rbd_image_header header;
unsigned long flags; /* possibly lock protected */ unsigned long flags; /* possibly lock protected */
struct rbd_spec *spec; struct rbd_spec *spec;
struct rbd_options *opts;
char *header_name; char *header_name;
...@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) ...@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
} }
/* /*
* mount options * (Per device) rbd map options
*/ */
enum { enum {
Opt_queue_depth,
Opt_last_int, Opt_last_int,
/* int args above */ /* int args above */
Opt_last_string, Opt_last_string,
/* string args above */ /* string args above */
Opt_read_only, Opt_read_only,
Opt_read_write, Opt_read_write,
/* Boolean args above */ Opt_err
Opt_last_bool,
}; };
static match_table_t rbd_opts_tokens = { static match_table_t rbd_opts_tokens = {
{Opt_queue_depth, "queue_depth=%d"},
/* int args above */ /* int args above */
/* string args above */ /* string args above */
{Opt_read_only, "read_only"}, {Opt_read_only, "read_only"},
{Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_only, "ro"}, /* Alternate spelling */
{Opt_read_write, "read_write"}, {Opt_read_write, "read_write"},
{Opt_read_write, "rw"}, /* Alternate spelling */ {Opt_read_write, "rw"}, /* Alternate spelling */
/* Boolean args above */ {Opt_err, NULL}
{-1, NULL}
}; };
struct rbd_options { struct rbd_options {
int queue_depth;
bool read_only; bool read_only;
}; };
#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT false #define RBD_READ_ONLY_DEFAULT false
static int parse_rbd_opts_token(char *c, void *private) static int parse_rbd_opts_token(char *c, void *private)
...@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private) ...@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private)
int token, intval, ret; int token, intval, ret;
token = match_token(c, rbd_opts_tokens, argstr); token = match_token(c, rbd_opts_tokens, argstr);
if (token < 0)
return -EINVAL;
if (token < Opt_last_int) { if (token < Opt_last_int) {
ret = match_int(&argstr[0], &intval); ret = match_int(&argstr[0], &intval);
if (ret < 0) { if (ret < 0) {
pr_err("bad mount option arg (not int) " pr_err("bad mount option arg (not int) at '%s'\n", c);
"at '%s'\n", c);
return ret; return ret;
} }
dout("got int token %d val %d\n", token, intval); dout("got int token %d val %d\n", token, intval);
} else if (token > Opt_last_int && token < Opt_last_string) { } else if (token > Opt_last_int && token < Opt_last_string) {
dout("got string token %d val %s\n", token, dout("got string token %d val %s\n", token, argstr[0].from);
argstr[0].from);
} else if (token > Opt_last_string && token < Opt_last_bool) {
dout("got Boolean token %d\n", token);
} else { } else {
dout("got token %d\n", token); dout("got token %d\n", token);
} }
switch (token) { switch (token) {
case Opt_queue_depth:
if (intval < 1) {
pr_err("queue_depth out of range\n");
return -EINVAL;
}
rbd_opts->queue_depth = intval;
break;
case Opt_read_only: case Opt_read_only:
rbd_opts->read_only = true; rbd_opts->read_only = true;
break; break;
...@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private) ...@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private)
rbd_opts->read_only = false; rbd_opts->read_only = false;
break; break;
default: default:
rbd_assert(false); /* libceph prints "bad option" msg */
break; return -EINVAL;
} }
return 0; return 0;
} }
...@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request) ...@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
/* /*
* Wait for an object request to complete. If interrupted, cancel the * Wait for an object request to complete. If interrupted, cancel the
* underlying osd request. * underlying osd request.
*
* @timeout: in jiffies, 0 means "wait forever"
*/ */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
unsigned long timeout)
{ {
int ret; long ret;
dout("%s %p\n", __func__, obj_request); dout("%s %p\n", __func__, obj_request);
ret = wait_for_completion_interruptible_timeout(
ret = wait_for_completion_interruptible(&obj_request->completion); &obj_request->completion,
if (ret < 0) { ceph_timeout_jiffies(timeout));
dout("%s %p interrupted\n", __func__, obj_request); if (ret <= 0) {
if (ret == 0)
ret = -ETIMEDOUT;
rbd_obj_request_end(obj_request); rbd_obj_request_end(obj_request);
return ret; } else {
ret = 0;
} }
dout("%s %p done\n", __func__, obj_request); dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
return 0; return ret;
}
/*
 * Wait for an object request to complete with no time limit.
 *
 * Thin wrapper that forwards to __rbd_obj_request_wait() with a timeout
 * of 0, which that helper documents as "wait forever".  Returns whatever
 * the helper returns.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
return __rbd_obj_request_wait(obj_request, 0);
}
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
return __rbd_obj_request_wait(obj_request, timeout);
} }
static void rbd_img_request_complete(struct rbd_img_request *img_request) static void rbd_img_request_complete(struct rbd_img_request *img_request)
...@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, ...@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
rbd_assert(obj_request_type_valid(type)); rbd_assert(obj_request_type_valid(type));
size = strlen(object_name) + 1; size = strlen(object_name) + 1;
name = kmalloc(size, GFP_KERNEL); name = kmalloc(size, GFP_NOIO);
if (!name) if (!name)
return NULL; return NULL;
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
if (!obj_request) { if (!obj_request) {
kfree(name); kfree(name);
return NULL; return NULL;
...@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, ...@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
} }
if (opcode == CEPH_OSD_OP_DELETE) if (opcode == CEPH_OSD_OP_DELETE)
osd_req_op_init(osd_request, num_ops, opcode); osd_req_op_init(osd_request, num_ops, opcode, 0);
else else
osd_req_op_extent_init(osd_request, num_ops, opcode, osd_req_op_extent_init(osd_request, num_ops, opcode,
offset, length, 0, 0); offset, length, 0, 0);
...@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) ...@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
goto out; goto out;
stat_request->callback = rbd_img_obj_exists_callback; stat_request->callback = rbd_img_obj_exists_callback;
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
false, false); false, false);
rbd_osd_req_format_read(stat_request); rbd_osd_req_format_read(stat_request);
...@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( ...@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
bool watch) bool watch)
{ {
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_options *opts = osdc->client->options;
struct rbd_obj_request *obj_request; struct rbd_obj_request *obj_request;
int ret; int ret;
...@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( ...@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
if (ret) if (ret)
goto out; goto out;
ret = rbd_obj_request_wait(obj_request); ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
if (ret) if (ret)
goto out; goto out;
...@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE; rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
...@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
/* set io sizes to object size */ /* set io sizes to object size */
segment_size = rbd_obj_bytes(&rbd_dev->header); segment_size = rbd_obj_bytes(&rbd_dev->header);
blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
blk_queue_max_segment_size(q, segment_size); blk_queue_max_segment_size(q, segment_size);
blk_queue_io_min(q, segment_size); blk_queue_io_min(q, segment_size);
blk_queue_io_opt(q, segment_size); blk_queue_io_opt(q, segment_size);
...@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref) ...@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref)
} }
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
struct rbd_spec *spec) struct rbd_spec *spec,
struct rbd_options *opts)
{ {
struct rbd_device *rbd_dev; struct rbd_device *rbd_dev;
...@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, ...@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem); init_rwsem(&rbd_dev->header_rwsem);
rbd_dev->spec = spec;
rbd_dev->rbd_client = rbdc; rbd_dev->rbd_client = rbdc;
rbd_dev->spec = spec;
rbd_dev->opts = opts;
/* Initialize the layout used for all rbd requests */ /* Initialize the layout used for all rbd requests */
...@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) ...@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{ {
rbd_put_client(rbd_dev->rbd_client); rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec); rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts);
kfree(rbd_dev); kfree(rbd_dev);
} }
...@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf, ...@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
goto out_mem; goto out_mem;
rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
copts = ceph_parse_options(options, mon_addrs, copts = ceph_parse_options(options, mon_addrs,
mon_addrs + mon_addrs_size - 1, mon_addrs + mon_addrs_size - 1,
...@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf, ...@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf,
*/ */
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{ {
struct ceph_options *opts = rbdc->client->options;
u64 newest_epoch; u64 newest_epoch;
unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
int tries = 0; int tries = 0;
int ret; int ret;
...@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) ...@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
ceph_monc_request_next_osdmap(&rbdc->client->monc); ceph_monc_request_next_osdmap(&rbdc->client->monc);
(void) ceph_monc_wait_osdmap(&rbdc->client->monc, (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
newest_epoch, timeout); newest_epoch,
opts->mount_timeout);
goto again; goto again;
} else { } else {
/* the osdmap we have is new enough */ /* the osdmap we have is new enough */
...@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) ...@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
rbdc = __rbd_get_client(rbd_dev->rbd_client); rbdc = __rbd_get_client(rbd_dev->rbd_client);
ret = -ENOMEM; ret = -ENOMEM;
parent = rbd_dev_create(rbdc, parent_spec); parent = rbd_dev_create(rbdc, parent_spec, NULL);
if (!parent) if (!parent)
goto out_err; goto out_err;
...@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
if (rc < 0) if (rc < 0)
goto err_out_module; goto err_out_module;
read_only = rbd_opts->read_only;
kfree(rbd_opts);
rbd_opts = NULL; /* done with this */
rbdc = rbd_get_client(ceph_opts); rbdc = rbd_get_client(ceph_opts);
if (IS_ERR(rbdc)) { if (IS_ERR(rbdc)) {
...@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
goto err_out_client; goto err_out_client;
} }
rbd_dev = rbd_dev_create(rbdc, spec); rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
if (!rbd_dev) if (!rbd_dev)
goto err_out_client; goto err_out_client;
rbdc = NULL; /* rbd_dev now owns this */ rbdc = NULL; /* rbd_dev now owns this */
spec = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */
rbd_opts = NULL; /* rbd_dev now owns this */
rc = rbd_dev_image_probe(rbd_dev, true); rc = rbd_dev_image_probe(rbd_dev, true);
if (rc < 0) if (rc < 0)
...@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
/* If we are mapping a snapshot it must be marked read-only */ /* If we are mapping a snapshot it must be marked read-only */
read_only = rbd_dev->opts->read_only;
if (rbd_dev->spec->snap_id != CEPH_NOSNAP) if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
read_only = true; read_only = true;
rbd_dev->mapping.read_only = read_only; rbd_dev->mapping.read_only = read_only;
...@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, ...@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
rbd_put_client(rbdc); rbd_put_client(rbdc);
err_out_args: err_out_args:
rbd_spec_put(spec); rbd_spec_put(spec);
kfree(rbd_opts);
err_out_module: err_out_module:
module_put(THIS_MODULE); module_put(THIS_MODULE);
......
...@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ...@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
val_size2 = posix_acl_xattr_size(default_acl->a_count); val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM; err = -ENOMEM;
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf) if (!tmp_buf)
goto out_err; goto out_err;
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
if (!pagelist) if (!pagelist)
goto out_err; goto out_err;
ceph_pagelist_init(pagelist); ceph_pagelist_init(pagelist);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ...@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR: case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file, dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode); inode->i_mode);
cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) { if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM; return -ENOMEM;
} }
cf->fmode = fmode; cf->fmode = fmode;
cf->next_offset = 2; cf->next_offset = 2;
cf->readdir_cache_idx = -1;
file->private_data = cf; file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release); BUG_ON(inode->i_fop->release != ceph_release);
break; break;
...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir); ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name); kfree(cf->last_name);
kfree(cf->dir_info); kfree(cf->dir_info);
dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf); kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */ /* wake up anyone waiting for caps on this inode */
...@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, ...@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
} }
} else { } else {
num_pages = calc_pages_for(off, len); num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) if (IS_ERR(pages))
return PTR_ERR(pages); return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages, ret = striped_read(inode, off, len, pages,
...@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) ...@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.) * objects, rollback on failure, etc.)
*/ */
static ssize_t static ssize_t
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct page **pages; struct page **pages;
...@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t start; size_t start;
ssize_t n; ssize_t n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode); vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, vino, pos, &len, 0,
...@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break; break;
} }
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start); n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) { if (unlikely(n < 0)) {
...@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
* objects, rollback on failure, etc.) * objects, rollback on failure, etc.)
*/ */
static ssize_t static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct page **pages; struct page **pages;
...@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t left; size_t left;
int n; int n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode); vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1, vino, pos, &len, 0, 1,
...@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) ...@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/ */
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) { if (IS_ERR(pages)) {
ret = PTR_ERR(pages); ret = PTR_ERR(pages);
goto out; goto out;
...@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) ...@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct page *page = NULL; struct page *page = NULL;
loff_t i_size; loff_t i_size;
if (retry_op == READ_INLINE) { if (retry_op == READ_INLINE) {
page = __page_cache_alloc(GFP_NOFS); page = __page_cache_alloc(GFP_KERNEL);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
} }
...@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc; &ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0; ssize_t count, written = 0;
int err, want, got; int err, want, got;
loff_t pos; loff_t pos;
...@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ceph_snap(inode) != CEPH_NOSNAP) if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */ /* We can write back this queue in page reclaim */
...@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
struct ceph_snap_context *snapc;
struct iov_iter data; struct iov_iter data;
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
struct ceph_cap_snap,
ci_item);
snapc = ceph_get_snap_context(capsnap->context);
} else {
BUG_ON(!ci->i_head_snapc);
snapc = ceph_get_snap_context(ci->i_head_snapc);
}
spin_unlock(&ci->i_ceph_lock);
/* we might need to revert back to that point */ /* we might need to revert back to that point */
data = *from; data = *from;
if (iocb->ki_flags & IOCB_DIRECT) if (iocb->ki_flags & IOCB_DIRECT)
written = ceph_sync_direct_write(iocb, &data, pos); written = ceph_sync_direct_write(iocb, &data, pos,
snapc);
else else
written = ceph_sync_write(iocb, &data, pos); written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) { if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u" dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n", "got EOLDSNAPC, retrying\n",
...@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
} }
if (written > 0) if (written > 0)
iov_iter_advance(from, written); iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else { } else {
loff_t old_size = inode->i_size; loff_t old_size = inode->i_size;
/* /*
...@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
int dirty; int dirty;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE; ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
...@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
out: out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
out_unlocked: out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL; current->backing_dev_info = NULL;
return written ? written : err; return written ? written : err;
} }
...@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc; &ceph_inode_to_client(inode)->client->osdc;
struct ceph_cap_flush *prealloc_cf;
int want, got = 0; int want, got = 0;
int dirty; int dirty;
int ret = 0; int ret = 0;
...@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode)) if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP; return -EOPNOTSUPP;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) { if (ceph_snap(inode) != CEPH_NOSNAP) {
...@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) { if (!ret) {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE; ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
...@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ...@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got); ceph_put_cap_refs(ci, got);
unlock: unlock:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
ceph_free_cap_flush(prealloc_cf);
return ret; return ret;
} }
......
...@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0; ci->i_inline_version = 0;
ci->i_time_warp_seq = 0; ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0; ci->i_ceph_flags = 0;
ci->i_ordered_count = 0; atomic64_set(&ci->i_ordered_count, 1);
atomic_set(&ci->i_release_count, 1); atomic64_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0); atomic64_set(&ci->i_complete_seq[0], 0);
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL; ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
...@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0; ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item); INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_cap_flush_seq = 0; ci->i_prealloc_cap_flush = NULL;
ci->i_cap_flush_last_tid = 0; ci->i_cap_flush_tree = RB_ROOT;
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
init_waitqueue_head(&ci->i_cap_wq); init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0; ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0; ci->i_hold_caps_max = 0;
...@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version || if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout; ci->i_layout = info->layout;
queue_trunc = ceph_fill_file_size(inode, issued, queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq), le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size), le64_to_cpu(info->truncate_size),
...@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 && (issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) { !__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode); dout(" marking %p complete (empty)\n", inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci, __ceph_dir_set_complete(ci,
atomic_read(&ci->i_release_count), atomic64_read(&ci->i_release_count),
ci->i_ordered_count); atomic64_read(&ci->i_ordered_count));
} }
wake = true; wake = true;
...@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout("fill_trace doing d_move %p -> %p\n", dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn); req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn); d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n", dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry, req->r_old_dentry,
...@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn); ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry, dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset);
...@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, ...@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err; return err;
} }
/*
 * Let go of the page a readdir cache control is currently holding.
 *
 * If @ctl holds a page, it is unmapped, its reference is dropped and
 * ctl->page is reset to NULL.  Calling this with no page held is a no-op,
 * so it is safe to call repeatedly.
 */
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
	struct page *held = ctl->page;

	if (!held)
		return;

	kunmap(held);
	page_cache_release(held);
	ctl->page = NULL;
}
/*
 * Store @dn in the readdir cache of @dir at position ctl->index.
 *
 * The cache is an array of dentry pointers kept in the pages of
 * dir->i_data; each page holds PAGE_CACHE_SIZE / sizeof(struct dentry *)
 * entries.  @ctl remembers the currently mapped page so that consecutive
 * calls hitting the same page do not have to grab and map it again.
 *
 * The entry is only stored while the release/ordered counts noted in @req
 * still match the directory's current counters; otherwise ctl->index is
 * set to -1.
 *
 * Returns 0 on success (including the case where the index was set to -1
 * because the counters changed), or -ENOMEM if the cache page could not
 * be obtained, in which case ctl->index is also set to -1.
 */
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
			      struct ceph_readdir_cache_control *ctl,
			      struct ceph_mds_request *req)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	unsigned per_page = PAGE_CACHE_SIZE / sizeof(struct dentry*);
	unsigned slot = ctl->index % per_page;
	pgoff_t want = ctl->index / per_page;

	/* switch pages when none is held or the held one is not the target */
	if (!(ctl->page && want == page_index(ctl->page))) {
		ceph_readdir_cache_release(ctl);

		ctl->page = grab_cache_page(&dir->i_data, want);
		if (!ctl->page) {
			ctl->index = -1;
			return -ENOMEM;
		}

		/*
		 * Readers and fillers of the cache are serialized by
		 * i_mutex, so the page lock is not needed here.
		 */
		unlock_page(ctl->page);
		ctl->dentries = kmap(ctl->page);
	}

	/* the directory changed since the request noted its counters */
	if (req->r_dir_release_cnt != atomic64_read(&ci->i_release_count) ||
	    req->r_dir_ordered_cnt != atomic64_read(&ci->i_ordered_count)) {
		dout("disable readdir cache\n");
		ctl->index = -1;
		return 0;
	}

	dout("readdir cache dn %p idx %d\n", dn, ctl->index);
	ctl->dentries[slot] = dn;
	ctl->index++;
	return 0;
}
int ceph_readdir_prepopulate(struct ceph_mds_request *req, int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
...@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL; struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir && if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) { le32_to_cpu(rinfo->dir_dir->frag) != frag) {
...@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag)); frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag)) if (ceph_frag_is_leftmost(frag))
r_readdir_offset = 2; req->r_readdir_offset = 2;
else else
r_readdir_offset = 0; req->r_readdir_offset = 0;
} }
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent)); snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir); parent = d_find_alias(snapdir);
...@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
} }
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
/* FIXME: release caps/leases if error occurs */ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino; struct ceph_vino vino;
...@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
d_delete(dn); d_delete(dn);
dput(dn); dput(dn);
goto retry_lookup; goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
} }
/* inode */ /* inode */
...@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
} }
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1, req->r_request_started, -1,
&req->r_caps_reservation) < 0) { &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in); pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) if (d_really_is_negative(dn))
iput(in); iput(in);
d_drop(dn); d_drop(dn);
err = ret;
goto next_item; goto next_item;
} }
...@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
di = dn->d_fsdata; di = dn->d_fsdata;
di->offset = ceph_make_fpos(frag, i + r_readdir_offset); di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i], update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session, req->r_session,
req->r_request_started); req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
err = ret;
}
next_item: next_item:
if (dn) if (dn)
dput(dn); dput(dn);
} }
if (err == 0)
req->r_did_prepopulate = true;
out: out:
if (err == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
if (snapdir) { if (snapdir) {
iput(snapdir); iput(snapdir);
dput(parent); dput(parent);
...@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
const unsigned int ia_valid = attr->ia_valid; const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
int issued; int issued;
int release = 0, dirtied = 0; int release = 0, dirtied = 0;
int mask = 0; int mask = 0;
int err = 0; int err = 0;
int inode_dirty_flags = 0; int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP) if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
...@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0) if (err != 0)
return err; return err;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS); USE_AUTH_MDS);
if (IS_ERR(req)) if (IS_ERR(req)) {
ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req); return PTR_ERR(req);
}
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
if (!ci->i_head_snapc &&
(issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
}
}
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) { if (ia_valid & ATTR_UID) {
...@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode); dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) { if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
} }
release &= issued; release &= issued;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (inode_dirty_flags) if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags); __mark_inode_dirty(inode, inode_dirty_flags);
...@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE) if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode); __ceph_do_pending_vmtruncate(inode);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
out_put: out_put:
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
} }
......
This diff is collapsed.
...@@ -139,7 +139,6 @@ struct ceph_mds_session { ...@@ -139,7 +139,6 @@ struct ceph_mds_session {
int s_cap_reconnect; int s_cap_reconnect;
int s_readonly; int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator; struct ceph_cap *s_cap_iterator;
/* protected by mutex */ /* protected by mutex */
...@@ -228,7 +227,7 @@ struct ceph_mds_request { ...@@ -228,7 +227,7 @@ struct ceph_mds_request {
int r_err; int r_err;
bool r_aborted; bool r_aborted;
unsigned long r_timeout; /* optional. jiffies */ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */ unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only, unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */ used to measure lease durations */
...@@ -254,12 +253,21 @@ struct ceph_mds_request { ...@@ -254,12 +253,21 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result; bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate; bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset; u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation; struct ceph_cap_reservation r_caps_reservation;
int r_num_caps; int r_num_caps;
}; };
/*
 * One pool permission record, linkable into an rbtree through @node.
 * NOTE(review): presumably stored in ceph_mds_client.pool_perm_tree and
 * looked up by @pool -- confirm against the code that inserts/searches it.
 */
struct ceph_pool_perm {
struct rb_node node; /* rbtree linkage */
u32 pool; /* pool this record describes */
int perm; /* NOTE(review): presumably the access bits found for @pool -- confirm */
};
/* /*
* mds client state * mds client state
*/ */
...@@ -284,12 +292,15 @@ struct ceph_mds_client { ...@@ -284,12 +292,15 @@ struct ceph_mds_client {
* references (implying they contain no inodes with caps) that * references (implying they contain no inodes with caps) that
* should be destroyed. * should be destroyed.
*/ */
u64 last_snap_seq;
struct rw_semaphore snap_rwsem; struct rw_semaphore snap_rwsem;
struct rb_root snap_realms; struct rb_root snap_realms;
struct list_head snap_empty; struct list_head snap_empty;
spinlock_t snap_empty_lock; /* protect snap_empty */ spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */ u64 last_tid; /* most recent mds request */
u64 oldest_tid; /* oldest incomplete mds request,
excluding setfilelock requests */
struct rb_root request_tree; /* pending mds requests */ struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */ struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */ unsigned long last_renew_caps; /* last time we renewed our caps */
...@@ -298,7 +309,8 @@ struct ceph_mds_client { ...@@ -298,7 +309,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */ struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock; spinlock_t snap_flush_lock;
u64 cap_flush_seq; u64 last_cap_flush_tid;
struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */ struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */ int num_cap_flushing; /* # caps we are flushing */
...@@ -328,6 +340,9 @@ struct ceph_mds_client { ...@@ -328,6 +340,9 @@ struct ceph_mds_client {
spinlock_t dentry_lru_lock; spinlock_t dentry_lru_lock;
struct list_head dentry_lru; struct list_head dentry_lru;
int num_dentry; int num_dentry;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
}; };
extern const char *ceph_mds_op_name(int op); extern const char *ceph_mds_op_name(int op);
...@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) ...@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request); kref_put(&req->r_kref, ceph_mdsc_release_request);
} }
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session); struct ceph_mds_session *session);
......
...@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) ...@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
} }
static struct ceph_snap_context *empty_snapc; struct ceph_snap_context *ceph_empty_snapc;
/* /*
* build the snap context for a given realm. * build the snap context for a given realm.
...@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0; return 0;
} }
if (num == 0 && realm->seq == empty_snapc->seq) { if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
ceph_get_snap_context(empty_snapc); ceph_get_snap_context(ceph_empty_snapc);
snapc = empty_snapc; snapc = ceph_empty_snapc;
goto done; goto done;
} }
...@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) ...@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
return 0; return 0;
} }
/*
 * Tell whether snap context @n contains a snapshot that is newer than
 * the sequence number of snap context @o.
 *
 * Returns false when @n has no snaps at all; otherwise compares the first
 * entry of n->snaps against o->seq.
 */
static bool has_new_snaps(struct ceph_snap_context *o,
			  struct ceph_snap_context *n)
{
	/* snaps[] is kept in descending order, so snaps[0] is the largest */
	return n->num_snaps != 0 && n->snaps[0] > o->seq;
}
/* /*
* When a snapshot is applied, the size/mtime inode metadata is queued * When a snapshot is applied, the size/mtime inode metadata is queued
...@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{ {
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap; struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
int used, dirty; int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
...@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
used = __ceph_caps_used(ci); used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci); dirty = __ceph_caps_dirty(ci);
old_snapc = ci->i_head_snapc;
new_snapc = ci->i_snap_realm->cached_context;
/* /*
* If there is a write in progress, treat that as a dirty Fw, * If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish * even though it hasn't completed yet; by the time we finish
...@@ -481,23 +493,39 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -481,23 +493,39 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
writes in progress now were started before the previous writes in progress now were started before the previous
cap_snap. lucky us. */ cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode); dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap); goto update_snapc;
} else if (ci->i_snap_realm->cached_context == empty_snapc) { }
dout("queue_cap_snap %p empty snapc\n", inode); if (ci->i_wrbuffer_ref_head == 0 &&
kfree(capsnap); !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| dout("queue_cap_snap %p nothing dirty|writing\n", inode);
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { goto update_snapc;
struct ceph_snap_context *snapc = ci->i_head_snapc; }
BUG_ON(!old_snapc);
/* /*
* if we are a sync write, we may need to go to the snaprealm * There is no need to send FLUSHSNAP message to MDS if there is
* to get the current snapc. * no new snapshot. But when there is dirty pages or on-going
* writes, we still need to create cap_snap. cap_snap is needed
* by the write path and page writeback path.
*
* also see ceph_try_drop_cap_snap()
*/ */
if (!snapc) if (has_new_snaps(old_snapc, new_snapc)) {
snapc = ci->i_snap_realm->cached_context; if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
capsnap->need_flush = true;
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
dout("queue_cap_snap %p "
"no new_snap|dirty_page|writing\n", inode);
goto update_snapc;
}
}
dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
inode, capsnap, snapc, ceph_cap_string(dirty)); inode, capsnap, old_snapc, ceph_cap_string(dirty),
capsnap->need_flush ? "" : "no_flush");
ihold(inode); ihold(inode);
atomic_set(&capsnap->nref, 1); atomic_set(&capsnap->nref, 1);
...@@ -505,7 +533,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -505,7 +533,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
INIT_LIST_HEAD(&capsnap->ci_item); INIT_LIST_HEAD(&capsnap->ci_item);
INIT_LIST_HEAD(&capsnap->flushing_item); INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = snapc->seq; capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty; capsnap->dirty = dirty;
...@@ -530,27 +558,30 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ...@@ -530,27 +558,30 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
snapshot. */ snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head; capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0; ci->i_wrbuffer_ref_head = 0;
capsnap->context = snapc; capsnap->context = old_snapc;
ci->i_head_snapc =
ceph_get_snap_context(ci->i_snap_realm->cached_context);
dout(" new snapc is %p\n", ci->i_head_snapc);
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) { if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p" dout("queue_cap_snap %p cap_snap %p snapc %p"
" seq %llu used WR, now pending\n", inode, " seq %llu used WR, now pending\n", inode,
capsnap, snapc, snapc->seq); capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1; capsnap->writing = 1;
} else { } else {
/* note mtime, size NOW. */ /* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap); __ceph_finish_cap_snap(ci, capsnap);
} }
} else { capsnap = NULL;
dout("queue_cap_snap %p nothing dirty|writing\n", inode);
kfree(capsnap);
}
update_snapc:
if (ci->i_head_snapc) {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
kfree(capsnap);
ceph_put_snap_context(old_snapc);
} }
/* /*
...@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, ...@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
/* queue realm for cap_snap creation */ /* queue realm for cap_snap creation */
list_add(&realm->dirty_item, &dirty_realms); list_add(&realm->dirty_item, &dirty_realms);
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
invalidate = 1; invalidate = 1;
} else if (!realm->cached_context) { } else if (!realm->cached_context) {
...@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ...@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
int __init ceph_snap_init(void) int __init ceph_snap_init(void)
{ {
empty_snapc = ceph_create_snap_context(0, GFP_NOFS); ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
if (!empty_snapc) if (!ceph_empty_snapc)
return -ENOMEM; return -ENOMEM;
empty_snapc->seq = 1; ceph_empty_snapc->seq = 1;
return 0; return 0;
} }
void ceph_snap_exit(void) void ceph_snap_exit(void)
{ {
ceph_put_snap_context(empty_snapc); ceph_put_snap_context(ceph_empty_snapc);
} }
...@@ -134,10 +134,12 @@ enum { ...@@ -134,10 +134,12 @@ enum {
Opt_noino32, Opt_noino32,
Opt_fscache, Opt_fscache,
Opt_nofscache, Opt_nofscache,
Opt_poolperm,
Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
Opt_acl, Opt_acl,
#endif #endif
Opt_noacl Opt_noacl,
}; };
static match_table_t fsopt_tokens = { static match_table_t fsopt_tokens = {
...@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { ...@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"}, {Opt_noino32, "noino32"},
{Opt_fscache, "fsc"}, {Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"}, {Opt_nofscache, "nofsc"},
{Opt_poolperm, "poolperm"},
{Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
{Opt_acl, "acl"}, {Opt_acl, "acl"},
#endif #endif
...@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) ...@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache: case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break; break;
case Opt_poolperm:
	/* "poolperm": turn the pool permission check back on */
	fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
	break;
case Opt_nopoolperm:
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl: case Opt_acl:
fsopt->sb_flags |= MS_POSIXACL; fsopt->sb_flags |= MS_POSIXACL;
...@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nodcache"); seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
seq_puts(m, ",fsc"); seq_puts(m, ",fsc");
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm");
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
if (fsopt->sb_flags & MS_POSIXACL) if (fsopt->sb_flags & MS_POSIXACL)
...@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ...@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/ */
struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep; struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_file_cachep;
...@@ -634,6 +648,10 @@ static int __init init_caches(void) ...@@ -634,6 +648,10 @@ static int __init init_caches(void)
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_cachep == NULL) if (ceph_cap_cachep == NULL)
goto bad_cap; goto bad_cap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_flush_cachep == NULL)
goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
...@@ -652,6 +670,8 @@ static int __init init_caches(void) ...@@ -652,6 +670,8 @@ static int __init init_caches(void)
bad_file: bad_file:
kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry: bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_cap_cachep);
bad_cap: bad_cap:
kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_inode_cachep);
...@@ -668,6 +688,7 @@ static void destroy_caches(void) ...@@ -668,6 +688,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_file_cachep);
...@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, ...@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP; req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started; req->r_started = started;
req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_timeout = fsc->client->options->mount_timeout;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2; req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
CEPH_MOUNT_OPT_DCACHE) CEPH_MOUNT_OPT_DCACHE)
...@@ -121,11 +122,21 @@ struct ceph_cap { ...@@ -121,11 +122,21 @@ struct ceph_cap {
struct rb_node ci_node; /* per-ci cap tree */ struct rb_node ci_node; /* per-ci cap tree */
struct ceph_mds_session *session; struct ceph_mds_session *session;
struct list_head session_caps; /* per-session caplist */ struct list_head session_caps; /* per-session caplist */
int mds;
u64 cap_id; /* unique cap id (mds provided) */ u64 cap_id; /* unique cap id (mds provided) */
union {
/* in-use caps */
struct {
int issued; /* latest, from the mds */ int issued; /* latest, from the mds */
int implemented; /* implemented superset of issued (for revocation) */ int implemented; /* implemented superset of
int mds_wanted; issued (for revocation) */
int mds, mds_wanted;
};
/* caps to release */
struct {
u64 cap_ino;
int queue_release;
};
};
u32 seq, issue_seq, mseq; u32 seq, issue_seq, mseq;
u32 cap_gen; /* active/stale cycle */ u32 cap_gen; /* active/stale cycle */
unsigned long last_used; unsigned long last_used;
...@@ -163,6 +174,7 @@ struct ceph_cap_snap { ...@@ -163,6 +174,7 @@ struct ceph_cap_snap {
int writing; /* a sync write is still in progress */ int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */ int dirty_pages; /* dirty pages awaiting writeback */
bool inline_data; bool inline_data;
bool need_flush;
}; };
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
...@@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) ...@@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
} }
} }
/*
 * Bookkeeping for one cap flush.  An instance can be linked into two
 * rbtrees at once (g_node and i_node); i_node shares its storage with a
 * plain list linkage, so only one of those two can be in use at a time.
 * NOTE(review): presumably g_node goes into ceph_mds_client.cap_flush_tree
 * and i_node into ceph_inode_info.i_cap_flush_tree -- confirm in caps.c.
 */
struct ceph_cap_flush {
u64 tid; /* NOTE(review): presumably the flush tid (cf. last_cap_flush_tid) -- confirm */
int caps; /* NOTE(review): presumably the mask of cap bits being flushed -- confirm */
bool kick; /* NOTE(review): meaning not visible here -- see users in caps.c */
struct rb_node g_node; /* linkage in the global tree */
union {
struct rb_node i_node; /* linkage in the per-inode tree */
struct list_head list; /* list linkage, overlays i_node */
};
};
/* /*
* The frag tree describes how a directory is fragmented, potentially across * The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where * multiple metadata servers. It is also used to indicate points where
...@@ -259,9 +282,9 @@ struct ceph_inode_info { ...@@ -259,9 +282,9 @@ struct ceph_inode_info {
u32 i_time_warp_seq; u32 i_time_warp_seq;
unsigned i_ceph_flags; unsigned i_ceph_flags;
int i_ordered_count; atomic64_t i_release_count;
atomic_t i_release_count; atomic64_t i_ordered_count;
atomic_t i_complete_count; atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout; struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout; struct ceph_file_layout i_layout;
...@@ -283,11 +306,11 @@ struct ceph_inode_info { ...@@ -283,11 +306,11 @@ struct ceph_inode_info {
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
struct list_head i_dirty_item, i_flushing_item; struct list_head i_dirty_item, i_flushing_item;
u64 i_cap_flush_seq;
/* we need to track cap writeback on a per-cap-bit basis, to allow /* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably * overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */ * reduce the tid to 8 bits if we're concerned about inode size. */
u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; struct ceph_cap_flush *i_prealloc_cap_flush;
struct rb_root i_cap_flush_tree;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */
...@@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, ...@@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/* /*
* Ceph inode. * Ceph inode.
*/ */
#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
#define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ #define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
int release_count, int ordered_count) long long release_count,
long long ordered_count)
{ {
atomic_set(&ci->i_complete_count, release_count); smp_mb__before_atomic();
if (ci->i_ordered_count == ordered_count) atomic64_set(&ci->i_complete_seq[0], release_count);
ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; atomic64_set(&ci->i_complete_seq[1], ordered_count);
else
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
} }
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{ {
atomic_inc(&ci->i_release_count); atomic64_inc(&ci->i_release_count);
}
static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
{
atomic64_inc(&ci->i_ordered_count);
} }
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{ {
return atomic_read(&ci->i_complete_count) == return atomic64_read(&ci->i_complete_seq[0]) ==
atomic_read(&ci->i_release_count); atomic64_read(&ci->i_release_count);
} }
static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
{ {
return __ceph_dir_is_complete(ci) && return atomic64_read(&ci->i_complete_seq[0]) ==
(ci->i_ceph_flags & CEPH_I_DIR_ORDERED); atomic64_read(&ci->i_release_count) &&
atomic64_read(&ci->i_complete_seq[1]) ==
atomic64_read(&ci->i_ordered_count);
} }
static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_complete(struct inode *inode)
...@@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) ...@@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
static inline void ceph_dir_clear_ordered(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); __ceph_dir_clear_ordered(ceph_inode(inode));
spin_lock(&ci->i_ceph_lock);
ci->i_ordered_count++;
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
spin_unlock(&ci->i_ceph_lock);
} }
static inline bool ceph_dir_is_complete_ordered(struct inode *inode) static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
bool ret; smp_rmb();
spin_lock(&ci->i_ceph_lock);
ret = __ceph_dir_is_complete_ordered(ci);
spin_unlock(&ci->i_ceph_lock);
return ret; return ret;
} }
...@@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) ...@@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{ {
return ci->i_dirty_caps | ci->i_flushing_caps; return ci->i_dirty_caps | ci->i_flushing_caps;
} }
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf);
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
struct ceph_cap *ocap, int mask); struct ceph_cap *ocap, int mask);
...@@ -606,16 +635,20 @@ struct ceph_file_info { ...@@ -606,16 +635,20 @@ struct ceph_file_info {
unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */ char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */ long long dir_release_count;
int dir_release_count; long long dir_ordered_count;
int dir_ordered_count; int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */ /* used for -o dirstat read() on directory thing */
char *dir_info; char *dir_info;
int dir_info_len; int dir_info_len;
}; };
struct ceph_readdir_cache_control {
struct page *page;
struct dentry **dentries;
int index;
};
/* /*
* A "snap realm" describes a subset of the file hierarchy sharing * A "snap realm" describes a subset of the file hierarchy sharing
...@@ -687,6 +720,7 @@ static inline int default_congestion_kb(void) ...@@ -687,6 +720,7 @@ static inline int default_congestion_kb(void)
/* snap.c */ /* snap.c */
extern struct ceph_snap_context *ceph_empty_snapc;
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino); u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
...@@ -713,7 +747,7 @@ extern void ceph_snap_exit(void); ...@@ -713,7 +747,7 @@ extern void ceph_snap_exit(void);
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{ {
return !list_empty(&ci->i_cap_snaps) && return !list_empty(&ci->i_cap_snaps) &&
list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
ci_item)->writing; ci_item)->writing;
} }
...@@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, ...@@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap); struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode); extern int ceph_is_any_caps(struct inode *inode);
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
u64 cap_id, u32 migrate_seq, u32 issue_seq);
extern void ceph_queue_caps_release(struct inode *inode); extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end, extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync); int datasync);
extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session); struct ceph_mds_session *session);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
...@@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); ...@@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */ /* addr.c */
extern const struct address_space_operations ceph_aops; extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */ /* file.c */
extern const struct file_operations ceph_file_fops; extern const struct file_operations ceph_file_fops;
...@@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp); extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len); char *data, size_t len);
int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */ /* dir.c */
extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops; extern const struct file_operations ceph_snapdir_fops;
...@@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); ...@@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/* /*
* our d_ops vary depending on whether the inode is live, * our d_ops vary depending on whether the inode is live,
......
...@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr; struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued; int issued;
int err; int err;
int dirty = 0; int dirty = 0;
...@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
char *newval = NULL; char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL; struct ceph_inode_xattr *xattr = NULL;
int required_blob_size; int required_blob_size;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr) if (!xattr)
goto out; goto out;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
goto out;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
retry: retry:
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync; goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode); __build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len); required_blob_size = __get_required_blob_size(ci, name_len, val_len);
...@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
dout(" preaallocating new blob size=%d\n", required_blob_size); dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS); blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob) if (!blob)
goto out; goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob) if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_buffer_put(ci->i_xattrs.prealloc_blob);
...@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
flags, value ? 1 : -1, &xattr); flags, value ? 1 : -1, &xattr);
if (!err) { if (!err) {
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked: do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
err = ceph_sync_setxattr(dentry, name, value, size, flags); err = ceph_sync_setxattr(dentry, name, value, size, flags);
out: out:
ceph_free_cap_flush(prealloc_cf);
kfree(newname); kfree(newname);
kfree(newval); kfree(newval);
kfree(xattr); kfree(xattr);
...@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr; struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued; int issued;
int err; int err;
int required_blob_size; int required_blob_size;
int dirty; int dirty;
bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked; goto do_sync_unlocked;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
err = -ENOMEM; err = -ENOMEM;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
retry: retry:
issued = __ceph_caps_issued(ci, NULL); issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync; goto do_sync;
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
if (!down_read_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
goto retry;
}
}
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode); __build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, 0, 0); required_blob_size = __get_required_blob_size(ci, 0, 0);
...@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
dout(" preaallocating new blob size=%d\n", required_blob_size); dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS); blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob) if (!blob)
goto out; goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob) if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_buffer_put(ci->i_xattrs.prealloc_blob);
...@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
err = __remove_xattr_by_name(ceph_inode(inode), name); err = __remove_xattr_by_name(ceph_inode(inode), name);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = CURRENT_TIME;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty) if (dirty)
__mark_inode_dirty(inode, dirty); __mark_inode_dirty(inode, dirty);
ceph_free_cap_flush(prealloc_cf);
return err; return err;
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked: do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
ceph_free_cap_flush(prealloc_cf);
err = ceph_send_removexattr(dentry, name); err = ceph_send_removexattr(dentry, name);
out:
return err; return err;
} }
......
...@@ -43,9 +43,9 @@ struct ceph_options { ...@@ -43,9 +43,9 @@ struct ceph_options {
int flags; int flags;
struct ceph_fsid fsid; struct ceph_fsid fsid;
struct ceph_entity_addr my_addr; struct ceph_entity_addr my_addr;
int mount_timeout; unsigned long mount_timeout; /* jiffies */
int osd_idle_ttl; unsigned long osd_idle_ttl; /* jiffies */
int osd_keepalive_timeout; unsigned long osd_keepalive_timeout; /* jiffies */
/* /*
* any type that can't be simply compared or doesn't need * any type that can't be simply compared or doesn't need
...@@ -63,9 +63,9 @@ struct ceph_options { ...@@ -63,9 +63,9 @@ struct ceph_options {
/* /*
* defaults * defaults
*/ */
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
...@@ -93,13 +93,9 @@ enum { ...@@ -93,13 +93,9 @@ enum {
CEPH_MOUNT_SHUTDOWN, CEPH_MOUNT_SHUTDOWN,
}; };
/* static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
* subtract jiffies
*/
static inline unsigned long time_sub(unsigned long a, unsigned long b)
{ {
BUG_ON(time_after(b, a)); return timeout ?: MAX_SCHEDULE_TIMEOUT;
return (long)a - (long)b;
} }
struct ceph_mds_client; struct ceph_mds_client;
...@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len) ...@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep; extern struct kmem_cache *ceph_file_cachep;
......
...@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, ...@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg); struct ceph_msg *msg);
extern void osd_req_op_init(struct ceph_osd_request *osd_req, extern void osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode); unsigned int which, u16 opcode, u32 flags);
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
unsigned int which, unsigned int which,
......
#ifndef CEPH_CRUSH_CRUSH_H #ifndef CEPH_CRUSH_CRUSH_H
#define CEPH_CRUSH_CRUSH_H #define CEPH_CRUSH_CRUSH_H
#include <linux/types.h> #ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
/* /*
* CRUSH is a pseudo-random data distribution algorithm that * CRUSH is a pseudo-random data distribution algorithm that
...@@ -20,7 +24,11 @@ ...@@ -20,7 +24,11 @@
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */ #define CRUSH_ITEM_NONE 0x7fffffff /* no result */
...@@ -108,6 +116,15 @@ enum { ...@@ -108,6 +116,15 @@ enum {
}; };
extern const char *crush_bucket_alg_name(int alg); extern const char *crush_bucket_alg_name(int alg);
/*
* although tree was a legacy algorithm, it has been buggy, so
* exclude it.
*/
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
(1 << CRUSH_BUCKET_UNIFORM) | \
(1 << CRUSH_BUCKET_LIST) | \
(1 << CRUSH_BUCKET_STRAW))
struct crush_bucket { struct crush_bucket {
__s32 id; /* this'll be negative */ __s32 id; /* this'll be negative */
__u16 type; /* non-zero; type=0 is reserved for devices */ __u16 type; /* non-zero; type=0 is reserved for devices */
...@@ -187,6 +204,25 @@ struct crush_map { ...@@ -187,6 +204,25 @@ struct crush_map {
* that want to limit reshuffling, a value of 3 or 4 will make the * that want to limit reshuffling, a value of 3 or 4 will make the
* mappings line up a bit better with previous mappings. */ * mappings line up a bit better with previous mappings. */
__u8 chooseleaf_vary_r; __u8 chooseleaf_vary_r;
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
* fixes a few of them.
*/
__u8 straw_calc_version;
/*
* allowed bucket algs is a bitmask, here the bit positions
* are CRUSH_BUCKET_*. note that these are *bits* and
* CRUSH_BUCKET_* values are not, so we need to or together (1
* << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
* minimize confusion (bucket type values start at 1).
*/
__u32 allowed_bucket_algs;
__u32 *choose_tries;
#endif
}; };
......
#ifndef CEPH_CRUSH_HASH_H #ifndef CEPH_CRUSH_HASH_H
#define CEPH_CRUSH_HASH_H #define CEPH_CRUSH_HASH_H
#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
#define CRUSH_HASH_RJENKINS1 0 #define CRUSH_HASH_RJENKINS1 0
#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* LGPL2 * LGPL2
*/ */
#include <linux/crush/crush.h> #include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
extern int crush_do_rule(const struct crush_map *map, extern int crush_do_rule(const struct crush_map *map,
......
...@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name, ...@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
/* start with defaults */ /* start with defaults */
opt->flags = CEPH_OPT_DEFAULT; opt->flags = CEPH_OPT_DEFAULT;
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
/* get mon ip(s) */ /* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */ /* ip1[:port1][,ip2[:port2]...] */
...@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name, ...@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
pr_warn("ignoring deprecated osdtimeout option\n"); pr_warn("ignoring deprecated osdtimeout option\n");
break; break;
case Opt_osdkeepalivetimeout: case Opt_osdkeepalivetimeout:
opt->osd_keepalive_timeout = intval; /* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osdkeepalive out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_keepalive_timeout =
msecs_to_jiffies(intval * 1000);
break; break;
case Opt_osd_idle_ttl: case Opt_osd_idle_ttl:
opt->osd_idle_ttl = intval; /* 0 isn't well defined right now, reject it */
if (intval < 1 || intval > INT_MAX / 1000) {
pr_err("osd_idle_ttl out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
break; break;
case Opt_mount_timeout: case Opt_mount_timeout:
opt->mount_timeout = intval; /* 0 is "wait forever" (i.e. infinite timeout) */
if (intval < 0 || intval > INT_MAX / 1000) {
pr_err("mount_timeout out of range\n");
err = -EINVAL;
goto out;
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break; break;
case Opt_share: case Opt_share:
...@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) ...@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "notcp_nodelay,"); seq_puts(m, "notcp_nodelay,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); seq_printf(m, "mount_timeout=%d,",
jiffies_to_msecs(opt->mount_timeout) / 1000);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); seq_printf(m, "osd_idle_ttl=%d,",
jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,", seq_printf(m, "osdkeepalivetimeout=%d,",
opt->osd_keepalive_timeout); jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
/* drop redundant comma */ /* drop redundant comma */
if (m->count != pos) if (m->count != pos)
...@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client) ...@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
*/ */
int __ceph_open_session(struct ceph_client *client, unsigned long started) int __ceph_open_session(struct ceph_client *client, unsigned long started)
{ {
int err; unsigned long timeout = client->options->mount_timeout;
unsigned long timeout = client->options->mount_timeout * HZ; long err;
/* open session, and wait for mon and osd maps */ /* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc); err = ceph_monc_open_session(&client->monc);
...@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) ...@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return err; return err;
while (!have_mon_and_osd_map(client)) { while (!have_mon_and_osd_map(client)) {
err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout)) if (timeout && time_after_eq(jiffies, started + timeout))
return err; return -ETIMEDOUT;
/* wait */ /* wait */
dout("mount waiting for mon_map\n"); dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq, err = wait_event_interruptible_timeout(client->auth_wq,
have_mon_and_osd_map(client) || (client->auth_err < 0), have_mon_and_osd_map(client) || (client->auth_err < 0),
timeout); ceph_timeout_jiffies(timeout));
if (err == -EINTR || err == -ERESTARTSYS) if (err < 0)
return err; return err;
if (client->auth_err < 0) if (client->auth_err < 0)
return client->auth_err; return client->auth_err;
...@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib); ...@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib);
MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux"); MODULE_DESCRIPTION("Ceph core library");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
#ifdef __KERNEL__ #ifdef __KERNEL__
# include <linux/slab.h> # include <linux/slab.h>
# include <linux/crush/crush.h>
#else #else
# include <stdlib.h> # include "crush_compat.h"
# include <assert.h> # include "crush.h"
# define kfree(x) do { if (x) free(x); } while (0)
# define BUG_ON(x) assert(!(x))
#endif #endif
#include <linux/crush/crush.h>
const char *crush_bucket_alg_name(int alg) const char *crush_bucket_alg_name(int alg)
{ {
switch (alg) { switch (alg) {
...@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map) ...@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
kfree(map->rules); kfree(map->rules);
} }
#ifndef __KERNEL__
kfree(map->choose_tries);
#endif
kfree(map); kfree(map);
} }
......
...@@ -10,20 +10,20 @@ ...@@ -10,20 +10,20 @@
* *
*/ */
#if defined(__linux__)
#include <linux/types.h>
#elif defined(__FreeBSD__)
#include <sys/types.h>
#endif
#ifndef CEPH_CRUSH_LN_H #ifndef CEPH_CRUSH_LN_H
#define CEPH_CRUSH_LN_H #define CEPH_CRUSH_LN_H
#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif
// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) /*
// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
* RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
static int64_t __RH_LH_tbl[128*2+2] = { */
static __s64 __RH_LH_tbl[128*2+2] = {
0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
...@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = { ...@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
0x0000800000000000ll, 0x0000ffff00000000ll, 0x0000800000000000ll, 0x0000ffff00000000ll,
}; };
// LL_tbl[k] = 2^48*log2(1.0+k/2^15); /*
static int64_t __LL_tbl[256] = { * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
*/
static __s64 __LL_tbl[256] = {
0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
...@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = { ...@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
}; };
#endif #endif
#ifdef __KERNEL__
#include <linux/types.h> # include <linux/crush/hash.h>
#include <linux/crush/hash.h> #else
# include "hash.h"
#endif
/* /*
* Robert Jenkins' function for mixing 32-bit values * Robert Jenkins' function for mixing 32-bit values
......
This diff is collapsed.
...@@ -278,7 +278,6 @@ static void _ceph_msgr_exit(void) ...@@ -278,7 +278,6 @@ static void _ceph_msgr_exit(void)
ceph_msgr_slab_exit(); ceph_msgr_slab_exit();
BUG_ON(zero_page == NULL); BUG_ON(zero_page == NULL);
kunmap(zero_page);
page_cache_release(zero_page); page_cache_release(zero_page);
zero_page = NULL; zero_page = NULL;
} }
...@@ -1545,7 +1544,7 @@ static int write_partial_message_data(struct ceph_connection *con) ...@@ -1545,7 +1544,7 @@ static int write_partial_message_data(struct ceph_connection *con)
page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
&last_piece); &last_piece);
ret = ceph_tcp_sendpage(con->sock, page, page_offset, ret = ceph_tcp_sendpage(con->sock, page, page_offset,
length, last_piece); length, !last_piece);
if (ret <= 0) { if (ret <= 0) {
if (do_datacrc) if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc); msg->footer.data_crc = cpu_to_le32(crc);
......
...@@ -298,21 +298,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) ...@@ -298,21 +298,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
} }
EXPORT_SYMBOL(ceph_monc_request_next_osdmap); EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
/*
* Wait for an osdmap with a given epoch.
*
* @epoch: epoch to wait for
* @timeout: in jiffies, 0 means "wait forever"
*/
int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
unsigned long timeout) unsigned long timeout)
{ {
unsigned long started = jiffies; unsigned long started = jiffies;
int ret; long ret;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
while (monc->have_osdmap < epoch) { while (monc->have_osdmap < epoch) {
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (timeout != 0 && time_after_eq(jiffies, started + timeout)) if (timeout && time_after_eq(jiffies, started + timeout))
return -ETIMEDOUT; return -ETIMEDOUT;
ret = wait_event_interruptible_timeout(monc->client->auth_wq, ret = wait_event_interruptible_timeout(monc->client->auth_wq,
monc->have_osdmap >= epoch, timeout); monc->have_osdmap >= epoch,
ceph_timeout_jiffies(timeout));
if (ret < 0) if (ret < 0)
return ret; return ret;
......
This diff is collapsed.
...@@ -89,7 +89,7 @@ static int crush_decode_tree_bucket(void **p, void *end, ...@@ -89,7 +89,7 @@ static int crush_decode_tree_bucket(void **p, void *end,
{ {
int j; int j;
dout("crush_decode_tree_bucket %p to %p\n", *p, end); dout("crush_decode_tree_bucket %p to %p\n", *p, end);
ceph_decode_32_safe(p, end, b->num_nodes, bad); ceph_decode_8_safe(p, end, b->num_nodes, bad);
b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
if (b->node_weights == NULL) if (b->node_weights == NULL)
return -ENOMEM; return -ENOMEM;
......
...@@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) ...@@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
set_page_dirty_lock(pages[i]); set_page_dirty_lock(pages[i]);
put_page(pages[i]); put_page(pages[i]);
} }
if (is_vmalloc_addr(pages)) kvfree(pages);
vfree(pages);
else
kfree(pages);
} }
EXPORT_SYMBOL(ceph_put_page_vector); EXPORT_SYMBOL(ceph_put_page_vector);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment