Commit a10c38a4 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "This changeset has a few main parts:

   - Ilya has finished a huge refactoring effort to sync up the
     client-side logic in libceph with the user-space client code, which
     has evolved significantly over the last couple of years, with lots
     of additional behaviors (e.g., how requests are handled when the
     cluster is full and transitions from full to non-full).

     The structure of the code is now more closely aligned with
     userspace, so it will be much easier to maintain going forward when
     behavior changes take place.  There are some locking improvements
     bundled in as well.

   - Zheng has added multi-filesystem support (multiple namespaces within
     the same Ceph cluster).

   - Zheng has changed the readdir offsets and directory enumeration so
     that dentry offsets are hash-based and therefore stable across
     directory fragmentation events on the MDS.

   - Zheng has a smorgasbord of bug fixes across fs/ceph"
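
As a companion to the readdir item above, here is a minimal, self-contained
sketch of the new hash-order f_pos encoding, restated from the helpers this
series adds to fs/ceph/dir.c (ceph_make_fpos() and friends appear in the diff
further down).  The standalone names and plain C99 integer types used here
are illustrative stand-ins for the kernel's loff_t-based versions, not a
separate specification.

/*
 * readdir f_pos layout, as documented in the patch:
 *   hash order:       (0xff << 52) | (24-bit name hash << 28) | nth collision
 *   frag+name order:  (frag value << 28) | nth entry within the frag
 */
#include <stdbool.h>
#include <stdint.h>

#define OFFSET_BITS 28
#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
#define HASH_ORDER  (0xffull << (OFFSET_BITS + 24))

static inline int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
{
    int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;

    if (hash_order)
        fpos |= HASH_ORDER;
    return fpos;
}

static inline bool is_hash_order(int64_t p)
{
    return (p & HASH_ORDER) == HASH_ORDER;
}

static inline unsigned fpos_off(int64_t p)
{
    return p & OFFSET_MASK; /* nth entry within the frag or hash bucket */
}

Because the high bits now come from a stable name hash rather than from the
frag an entry currently lives in, an offset such as make_fpos(hash, n, true)
keeps pointing at the same position even if the MDS splits or merges dirfrags
between getdents calls; offsets 0 and 1 remain reserved for "." and "..".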

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
  ceph: fix wake_up_session_cb()
  ceph: don't use truncate_pagecache() to invalidate read cache
  ceph: SetPageError() for writeback pages if writepages fails
  ceph: handle interrupted ceph_writepage()
  ceph: make ceph_update_writeable_page() uninterruptible
  libceph: make ceph_osdc_wait_request() uninterruptible
  ceph: handle -EAGAIN returned by ceph_update_writeable_page()
  ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM
  ceph: block non-fatal signals for fault/page_mkwrite
  ceph: make logical calculation functions return bool
  ceph: tolerate bad i_size for symlink inode
  ceph: improve fragtree change detection
  ceph: keep leaf frag when updating fragtree
  ceph: fix dir_auth check in ceph_fill_dirfrag()
  ceph: don't assume frag tree splits in mds reply are sorted
  ceph: fix inode reference leak
  ceph: using hash value to compose dentry offset
  ceph: don't forbid marking directory complete after forward seek
  ceph: record 'offset' for each entry of readdir result
  ceph: define 'end/complete' in readdir reply as bit flags
  ...
parents ea8ea737 e5360309
@@ -350,12 +350,12 @@ struct rbd_device {
struct rbd_spec *spec; struct rbd_spec *spec;
struct rbd_options *opts; struct rbd_options *opts;
char *header_name; struct ceph_object_id header_oid;
struct ceph_object_locator header_oloc;
struct ceph_file_layout layout; struct ceph_file_layout layout;
struct ceph_osd_event *watch_event; struct ceph_osd_linger_request *watch_handle;
struct rbd_obj_request *watch_request;
struct rbd_spec *parent_spec; struct rbd_spec *parent_spec;
u64 parent_overlap; u64 parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
return __rbd_obj_request_wait(obj_request, 0); return __rbd_obj_request_wait(obj_request, 0);
} }
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
return __rbd_obj_request_wait(obj_request, timeout);
}
static void rbd_img_request_complete(struct rbd_img_request *img_request) static void rbd_img_request_complete(struct rbd_img_request *img_request)
{ {
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
complete_all(&obj_request->completion); complete_all(&obj_request->completion);
} }
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p\n", __func__, obj_request);
obj_request_done_set(obj_request);
}
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{ {
struct rbd_img_request *img_request = NULL; struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
obj_request_done_set(obj_request); obj_request_done_set(obj_request);
} }
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
struct ceph_msg *msg)
{ {
struct rbd_obj_request *obj_request = osd_req->r_priv; struct rbd_obj_request *obj_request = osd_req->r_priv;
u16 opcode; u16 opcode;
dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); dout("%s: osd_req %p\n", __func__, osd_req);
rbd_assert(osd_req == obj_request->osd_req); rbd_assert(osd_req == obj_request->osd_req);
if (obj_request_img_data_test(obj_request)) { if (obj_request_img_data_test(obj_request)) {
rbd_assert(obj_request->img_request); rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_CALL:
rbd_osd_call_callback(obj_request); rbd_osd_call_callback(obj_request);
break; break;
case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH:
rbd_osd_trivial_callback(obj_request);
break;
default: default:
rbd_warn(NULL, "%s: unsupported op %hu", rbd_warn(NULL, "%s: unsupported op %hu",
obj_request->object_name, (unsigned short) opcode); obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{ {
struct rbd_img_request *img_request = obj_request->img_request; struct rbd_img_request *img_request = obj_request->img_request;
struct ceph_osd_request *osd_req = obj_request->osd_req; struct ceph_osd_request *osd_req = obj_request->osd_req;
u64 snap_id;
rbd_assert(osd_req != NULL); if (img_request)
osd_req->r_snapid = img_request->snap_id;
snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
ceph_osdc_build_request(osd_req, obj_request->offset,
NULL, snap_id, NULL);
} }
static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{ {
struct rbd_img_request *img_request = obj_request->img_request;
struct ceph_osd_request *osd_req = obj_request->osd_req; struct ceph_osd_request *osd_req = obj_request->osd_req;
struct ceph_snap_context *snapc;
struct timespec mtime = CURRENT_TIME;
rbd_assert(osd_req != NULL); osd_req->r_mtime = CURRENT_TIME;
osd_req->r_data_offset = obj_request->offset;
snapc = img_request ? img_request->snapc : NULL;
ceph_osdc_build_request(osd_req, obj_request->offset,
snapc, CEPH_NOSNAP, &mtime);
} }
/* /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
GFP_NOIO); GFP_NOIO);
if (!osd_req) if (!osd_req)
return NULL; /* ENOMEM */ goto fail;
if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@
osd_req->r_priv = obj_request; osd_req->r_priv = obj_request;
osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
goto fail;
return osd_req; return osd_req;
fail:
ceph_osdc_put_request(osd_req);
return NULL;
} }
/* /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
false, GFP_NOIO); false, GFP_NOIO);
if (!osd_req) if (!osd_req)
return NULL; /* ENOMEM */ goto fail;
osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
osd_req->r_callback = rbd_osd_req_callback; osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request; osd_req->r_priv = obj_request;
osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
goto fail;
return osd_req; return osd_req;
fail:
ceph_osdc_put_request(osd_req);
return NULL;
} }
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
{ {
struct rbd_obj_request *obj_request; struct rbd_obj_request *obj_request;
struct rbd_obj_request *next_obj_request; struct rbd_obj_request *next_obj_request;
int ret = 0;
dout("%s: img %p\n", __func__, img_request); dout("%s: img %p\n", __func__, img_request);
for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
int ret;
rbd_img_request_get(img_request);
for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
ret = rbd_img_obj_request_submit(obj_request); ret = rbd_img_obj_request_submit(obj_request);
if (ret) if (ret)
return ret; goto out_put_ireq;
} }
return 0; out_put_ireq:
rbd_img_request_put(img_request);
return ret;
} }
static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
obj_request_done_set(obj_request); obj_request_done_set(obj_request);
} }
static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
{ static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
struct rbd_obj_request *obj_request;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
int ret;
obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
OBJ_REQUEST_NODATA);
if (!obj_request)
return -ENOMEM;
ret = -ENOMEM;
obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
obj_request);
if (!obj_request->osd_req)
goto out;
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
notify_id, 0, 0);
rbd_osd_req_format_read(obj_request);
ret = rbd_obj_request_submit(osdc, obj_request); static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
if (ret) u64 notifier_id, void *data, size_t data_len)
goto out;
ret = rbd_obj_request_wait(obj_request);
out:
rbd_obj_request_put(obj_request);
return ret;
}
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{ {
struct rbd_device *rbd_dev = (struct rbd_device *)data; struct rbd_device *rbd_dev = arg;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
int ret; int ret;
dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
rbd_dev->header_name, (unsigned long long)notify_id, cookie, notify_id);
(unsigned int)opcode);
/* /*
* Until adequate refresh error handling is in place, there is * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
if (ret) if (ret)
rbd_warn(rbd_dev, "refresh failed: %d", ret); rbd_warn(rbd_dev, "refresh failed: %d", ret);
ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, notify_id, cookie,
NULL, 0);
if (ret) if (ret)
rbd_warn(rbd_dev, "notify_ack ret %d", ret); rbd_warn(rbd_dev, "notify_ack ret %d", ret);
} }
/* static void rbd_watch_errcb(void *arg, u64 cookie, int err)
* Send a (un)watch request and wait for the ack. Return a request
* with a ref held on success or error.
*/
static struct rbd_obj_request *rbd_obj_watch_request_helper(
struct rbd_device *rbd_dev,
bool watch)
{ {
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_device *rbd_dev = arg;
struct ceph_options *opts = osdc->client->options;
struct rbd_obj_request *obj_request;
int ret; int ret;
obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, rbd_warn(rbd_dev, "encountered watch error: %d", err);
OBJ_REQUEST_NODATA);
if (!obj_request)
return ERR_PTR(-ENOMEM);
obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
obj_request);
if (!obj_request->osd_req) {
ret = -ENOMEM;
goto out;
}
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
rbd_dev->watch_event->cookie, 0, watch);
rbd_osd_req_format_write(obj_request);
if (watch) __rbd_dev_header_unwatch_sync(rbd_dev);
ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
ret = rbd_obj_request_submit(osdc, obj_request); ret = rbd_dev_header_watch_sync(rbd_dev);
if (ret)
goto out;
ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
if (ret)
goto out;
ret = obj_request->result;
if (ret) { if (ret) {
if (watch) rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
rbd_obj_request_end(obj_request); return;
goto out;
} }
return obj_request; ret = rbd_dev_refresh(rbd_dev);
if (ret)
out: rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
rbd_obj_request_put(obj_request);
return ERR_PTR(ret);
} }
/* /*
@@ -3205,35 +3140,33 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{ {
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct rbd_obj_request *obj_request; struct ceph_osd_linger_request *handle;
int ret;
rbd_assert(!rbd_dev->watch_event); rbd_assert(!rbd_dev->watch_handle);
rbd_assert(!rbd_dev->watch_request);
ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
&rbd_dev->watch_event); &rbd_dev->header_oloc, rbd_watch_cb,
if (ret < 0) rbd_watch_errcb, rbd_dev);
return ret; if (IS_ERR(handle))
return PTR_ERR(handle);
obj_request = rbd_obj_watch_request_helper(rbd_dev, true); rbd_dev->watch_handle = handle;
if (IS_ERR(obj_request)) { return 0;
ceph_osdc_cancel_event(rbd_dev->watch_event); }
rbd_dev->watch_event = NULL;
return PTR_ERR(obj_request);
}
/* static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
* A watch request is set to linger, so the underlying osd {
* request won't go away until we unregister it. We retain struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
* a pointer to the object request during that time (in int ret;
* rbd_dev->watch_request), so we'll keep a reference to it.
* We'll drop that reference after we've unregistered it in
* rbd_dev_header_unwatch_sync().
*/
rbd_dev->watch_request = obj_request;
return 0; if (!rbd_dev->watch_handle)
return;
ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
if (ret)
rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
rbd_dev->watch_handle = NULL;
} }
/* /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
*/ */
static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{ {
struct rbd_obj_request *obj_request; __rbd_dev_header_unwatch_sync(rbd_dev);
rbd_assert(rbd_dev->watch_event);
rbd_assert(rbd_dev->watch_request);
rbd_obj_request_end(rbd_dev->watch_request);
rbd_obj_request_put(rbd_dev->watch_request);
rbd_dev->watch_request = NULL;
obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
if (!IS_ERR(obj_request))
rbd_obj_request_put(obj_request);
else
rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
PTR_ERR(obj_request));
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
dout("%s flushing notifies\n", __func__); dout("%s flushing notifies\n", __func__);
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
if (!ondisk) if (!ondisk)
return -ENOMEM; return -ENOMEM;
ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
0, size, ondisk); 0, size, ondisk);
if (ret < 0) if (ret < 0)
goto out; goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
bool need_put = !!rbd_dev->opts; bool need_put = !!rbd_dev->opts;
ceph_oid_destroy(&rbd_dev->header_oid);
rbd_put_client(rbd_dev->rbd_client); rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec); rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts); kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem); init_rwsem(&rbd_dev->header_rwsem);
ceph_oid_init(&rbd_dev->header_oid);
ceph_oloc_init(&rbd_dev->header_oloc);
rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.bus = &rbd_bus_type;
rbd_dev->dev.type = &rbd_device_type; rbd_dev->dev.type = &rbd_device_type;
rbd_dev->dev.parent = &rbd_root_dev; rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
__le64 size; __le64 size;
} __attribute__ ((packed)) size_buf = { 0 }; } __attribute__ ((packed)) size_buf = { 0 };
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_size", "rbd", "get_size",
&snapid, sizeof (snapid), &snapid, sizeof (snapid),
&size_buf, sizeof (size_buf)); &size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
if (!reply_buf) if (!reply_buf)
return -ENOMEM; return -ENOMEM;
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_object_prefix", NULL, 0, "rbd", "get_object_prefix", NULL, 0,
reply_buf, RBD_OBJ_PREFIX_LEN_MAX); reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
u64 unsup; u64 unsup;
int ret; int ret;
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_features", "rbd", "get_features",
&snapid, sizeof (snapid), &snapid, sizeof (snapid),
&features_buf, sizeof (features_buf)); &features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
} }
snapid = cpu_to_le64(rbd_dev->spec->snap_id); snapid = cpu_to_le64(rbd_dev->spec->snap_id);
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_parent", "rbd", "get_parent",
&snapid, sizeof (snapid), &snapid, sizeof (snapid),
reply_buf, size); reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
u64 stripe_count; u64 stripe_count;
int ret; int ret;
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_stripe_unit_count", NULL, 0, "rbd", "get_stripe_unit_count", NULL, 0,
(char *)&striping_info_buf, size); (char *)&striping_info_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
if (!reply_buf) if (!reply_buf)
return -ENOMEM; return -ENOMEM;
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_snapcontext", NULL, 0, "rbd", "get_snapcontext", NULL, 0,
reply_buf, size); reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
snapid = cpu_to_le64(snap_id); snapid = cpu_to_le64(snap_id);
ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
"rbd", "get_snapshot_name", "rbd", "get_snapshot_name",
&snapid, sizeof (snapid), &snapid, sizeof (snapid),
reply_buf, size); reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
again: again:
ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
if (ret == -ENOENT && tries++ < 1) { if (ret == -ENOENT && tries++ < 1) {
ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
&newest_epoch); &newest_epoch);
if (ret < 0) if (ret < 0)
return ret; return ret;
if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
ceph_monc_request_next_osdmap(&rbdc->client->monc); ceph_osdc_maybe_request_map(&rbdc->client->osdc);
(void) ceph_monc_wait_osdmap(&rbdc->client->monc, (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
newest_epoch, newest_epoch,
opts->mount_timeout); opts->mount_timeout);
@@ -5260,35 +5181,26 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
static int rbd_dev_header_name(struct rbd_device *rbd_dev) static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{ {
struct rbd_spec *spec = rbd_dev->spec; struct rbd_spec *spec = rbd_dev->spec;
size_t size; int ret;
/* Record the header object name for this rbd image. */ /* Record the header object name for this rbd image. */
rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
if (rbd_dev->image_format == 1) if (rbd_dev->image_format == 1)
size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
else
size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
if (!rbd_dev->header_name)
return -ENOMEM;
if (rbd_dev->image_format == 1)
sprintf(rbd_dev->header_name, "%s%s",
spec->image_name, RBD_SUFFIX); spec->image_name, RBD_SUFFIX);
else else
sprintf(rbd_dev->header_name, "%s%s", ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
RBD_HEADER_PREFIX, spec->image_id); RBD_HEADER_PREFIX, spec->image_id);
return 0;
return ret;
} }
static void rbd_dev_image_release(struct rbd_device *rbd_dev) static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{ {
rbd_dev_unprobe(rbd_dev); rbd_dev_unprobe(rbd_dev);
kfree(rbd_dev->header_name);
rbd_dev->header_name = NULL;
rbd_dev->image_format = 0; rbd_dev->image_format = 0;
kfree(rbd_dev->spec->image_id); kfree(rbd_dev->spec->image_id);
rbd_dev->spec->image_id = NULL; rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
pr_info("image %s/%s does not exist\n", pr_info("image %s/%s does not exist\n",
rbd_dev->spec->pool_name, rbd_dev->spec->pool_name,
rbd_dev->spec->image_name); rbd_dev->spec->image_name);
goto out_header_name; goto err_out_format;
} }
} }
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
goto err_out_probe; goto err_out_probe;
dout("discovered format %u image, header name is %s\n", dout("discovered format %u image, header name is %s\n",
rbd_dev->image_format, rbd_dev->header_name); rbd_dev->image_format, rbd_dev->header_oid.name);
return 0; return 0;
err_out_probe: err_out_probe:
@@ -5381,9 +5293,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
err_out_watch: err_out_watch:
if (!depth) if (!depth)
rbd_dev_header_unwatch_sync(rbd_dev); rbd_dev_header_unwatch_sync(rbd_dev);
out_header_name:
kfree(rbd_dev->header_name);
rbd_dev->header_name = NULL;
err_out_format: err_out_format:
rbd_dev->image_format = 0; rbd_dev->image_format = 0;
kfree(rbd_dev->spec->image_id); kfree(rbd_dev->spec->image_id);
......
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
/* /*
* Finish an async read(ahead) op. * Finish an async read(ahead) op.
*/ */
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) static void finish_read(struct ceph_osd_request *req)
{ {
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_osd_data *osd_data; struct ceph_osd_data *osd_data;
int rc = req->r_result; int rc = req->r_result <= 0 ? req->r_result : 0;
int bytes = le32_to_cpu(msg->hdr.data_len); int bytes = req->r_result >= 0 ? req->r_result : 0;
int num_pages; int num_pages;
int i; int i;
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
req->r_callback = finish_read; req->r_callback = finish_read;
req->r_inode = inode; req->r_inode = inode;
ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
ret = ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0) if (ret < 0)
@@ -546,10 +544,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
truncate_seq, truncate_size, truncate_seq, truncate_size,
&inode->i_mtime, &page, 1); &inode->i_mtime, &page, 1);
if (err < 0) { if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page); struct writeback_control tmp_wbc;
if (!wbc)
wbc = &tmp_wbc;
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
dout("writepage interrupted page %p\n", page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
goto out;
}
dout("writepage setting page/mapping error %d %p\n",
err, page);
SetPageError(page); SetPageError(page);
mapping_set_error(&inode->i_data, err); mapping_set_error(&inode->i_data, err);
if (wbc)
wbc->pages_skipped++; wbc->pages_skipped++;
} else { } else {
dout("writepage cleaned page %p\n", page); dout("writepage cleaned page %p\n", page);
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode); BUG_ON(!inode);
ihold(inode); ihold(inode);
err = writepage_nounlock(page, wbc); err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
* to prevent caller from setting mapping/page error */
err = 0;
}
unlock_page(page); unlock_page(page);
iput(inode); iput(inode);
return err; return err;
} }
/* /*
* lame release_pages helper. release_pages() isn't exported to * lame release_pages helper. release_pages() isn't exported to
* modules. * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
* If we get an error, set the mapping error bit, but not the individual * If we get an error, set the mapping error bit, but not the individual
* page error bits. * page error bits.
*/ */
static void writepages_finish(struct ceph_osd_request *req, static void writepages_finish(struct ceph_osd_request *req)
struct ceph_msg *msg)
{ {
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
bool remove_page; bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc); dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0) if (rc < 0)
mapping_set_error(mapping, rc); mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
clear_bdi_congested(&fsc->backing_dev_info, clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC); BLK_RW_ASYNC);
if (rc < 0)
SetPageError(page);
ceph_put_snap_context(page_snap_context(page)); ceph_put_snap_context(page_snap_context(page));
page->private = 0; page->private = 0;
ClearPagePrivate(page); ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn("writepage_start %p on forced umount\n", inode); if (ci->i_wrbuffer_ref > 0) {
truncate_pagecache(inode, 0); pr_warn_ratelimited(
"writepage_start %p %lld forced umount\n",
inode, ceph_ino(inode));
}
mapping_set_error(mapping, -EIO); mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */ return -EIO; /* we're in a forced umount, don't write! */
} }
@@ -1063,10 +1079,7 @@ static int ceph_writepages_start(struct address_space *mapping,
pages = NULL; pages = NULL;
} }
vino = ceph_vino(inode); req->r_mtime = inode->i_mtime;
ceph_osdc_build_request(req, offset, snapc, vino.snap,
&inode->i_mtime);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
BUG_ON(rc); BUG_ON(rc);
req = NULL; req = NULL;
@@ -1099,7 +1112,6 @@ static int ceph_writepages_start(struct address_space *mapping,
mapping->writeback_index = index; mapping->writeback_index = index;
out: out:
if (req)
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
ceph_put_snap_context(snapc); ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc); dout("writepages done, rc = %d\n", rc);
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
struct page *page) struct page *page)
{ {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
loff_t page_off = pos & PAGE_MASK; loff_t page_off = pos & PAGE_MASK;
int pos_in_page = pos & ~PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
int r; int r;
struct ceph_snap_context *snapc, *oldest; struct ceph_snap_context *snapc, *oldest;
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
unlock_page(page);
return -EIO;
}
retry_locked: retry_locked:
/* writepages currently holds page lock, but if we change that later, */ /* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page); wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ static int ceph_update_writeable_page(struct file *file,
snapc = ceph_get_snap_context(snapc); snapc = ceph_get_snap_context(snapc);
unlock_page(page); unlock_page(page);
ceph_queue_writeback(inode); ceph_queue_writeback(inode);
r = wait_event_interruptible(ci->i_cap_wq, r = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc)); context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc); ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS) if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
.direct_IO = ceph_direct_io, .direct_IO = ceph_direct_io,
}; };
static void ceph_block_sigs(sigset_t *oldset)
{
sigset_t mask;
siginitsetinv(&mask, sigmask(SIGKILL));
sigprocmask(SIG_BLOCK, &mask, oldset);
}
static void ceph_restore_sigs(sigset_t *oldset)
{
sigprocmask(SIG_SETMASK, oldset, NULL);
}
/* /*
* vm ops * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *pinned_page = NULL; struct page *pinned_page = NULL;
loff_t off = vmf->pgoff << PAGE_SHIFT; loff_t off = vmf->pgoff << PAGE_SHIFT;
int want, got, ret; int want, got, ret;
sigset_t oldset;
ceph_block_sigs(&oldset);
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else else
want = CEPH_CAP_FILE_CACHE; want = CEPH_CAP_FILE_CACHE;
while (1) {
got = 0; got = 0;
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
-1, &got, &pinned_page); if (ret < 0)
if (ret == 0) goto out_restore;
break;
if (ret != -ERESTARTSYS) {
WARN_ON(1);
return VM_FAULT_SIGBUS;
}
}
dout("filemap_fault %p %llu~%zd got cap refs on %s\n", dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ceph_put_cap_refs(ci, got); ceph_put_cap_refs(ci, got);
if (ret != -EAGAIN) if (ret != -EAGAIN)
return ret; goto out_restore;
/* read inline data */ /* read inline data */
if (off >= PAGE_SIZE) { if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
~__GFP_FS)); ~__GFP_FS));
if (!page) { if (!page) {
ret = VM_FAULT_OOM; ret = VM_FAULT_OOM;
goto out; goto out_inline;
} }
ret1 = __ceph_do_getattr(inode, page, ret1 = __ceph_do_getattr(inode, page,
CEPH_STAT_CAP_INLINE_DATA, true); CEPH_STAT_CAP_INLINE_DATA, true);
if (ret1 < 0 || off >= i_size_read(inode)) { if (ret1 < 0 || off >= i_size_read(inode)) {
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
if (ret1 < 0)
ret = ret1;
else
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
goto out; goto out_inline;
} }
if (ret1 < PAGE_SIZE) if (ret1 < PAGE_SIZE)
zero_user_segment(page, ret1, PAGE_SIZE); zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
SetPageUptodate(page); SetPageUptodate(page);
vmf->page = page; vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
} out_inline:
out:
dout("filemap_fault %p %llu~%zd read inline data ret %d\n", dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
inode, off, (size_t)PAGE_SIZE, ret); inode, off, (size_t)PAGE_SIZE, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
if (ret < 0)
ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
return ret; return ret;
} }
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size = i_size_read(inode); loff_t size = i_size_read(inode);
size_t len; size_t len;
int want, got, ret; int want, got, ret;
sigset_t oldset;
prealloc_cf = ceph_alloc_cap_flush(); prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf) if (!prealloc_cf)
return VM_FAULT_SIGBUS; return VM_FAULT_OOM;
ceph_block_sigs(&oldset);
if (ci->i_inline_version != CEPH_INLINE_NONE) { if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL; struct page *locked_page = NULL;
...@@ -1423,11 +1462,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1423,11 +1462,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ceph_uninline_data(vma->vm_file, locked_page); ret = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page) if (locked_page)
unlock_page(locked_page); unlock_page(locked_page);
if (ret < 0) { if (ret < 0)
ret = VM_FAULT_SIGBUS;
goto out_free; goto out_free;
} }
}
if (off + PAGE_SIZE <= size) if (off + PAGE_SIZE <= size)
len = PAGE_SIZE; len = PAGE_SIZE;
@@ -1440,31 +1477,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else else
want = CEPH_CAP_FILE_BUFFER; want = CEPH_CAP_FILE_BUFFER;
while (1) {
got = 0; got = 0;
ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
&got, NULL); &got, NULL);
if (ret == 0) if (ret < 0)
break;
if (ret != -ERESTARTSYS) {
WARN_ON(1);
ret = VM_FAULT_SIGBUS;
goto out_free; goto out_free;
}
}
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
inode, off, len, ceph_cap_string(got)); inode, off, len, ceph_cap_string(got));
/* Update time before taking page lock */ /* Update time before taking page lock */
file_update_time(vma->vm_file); file_update_time(vma->vm_file);
do {
lock_page(page); lock_page(page);
ret = VM_FAULT_NOPAGE; if ((off > size) || (page->mapping != inode->i_mapping)) {
if ((off > size) ||
(page->mapping != inode->i_mapping)) {
unlock_page(page); unlock_page(page);
goto out; ret = VM_FAULT_NOPAGE;
break;
} }
ret = ceph_update_writeable_page(vma->vm_file, off, len, page); ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
@@ -1472,13 +1504,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* success. we'll keep the page locked. */ /* success. we'll keep the page locked. */
set_page_dirty(page); set_page_dirty(page);
ret = VM_FAULT_LOCKED; ret = VM_FAULT_LOCKED;
} else {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_SIGBUS;
} }
out: } while (ret == -EAGAIN);
if (ret == VM_FAULT_LOCKED || if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) { ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty; int dirty;
@@ -1495,8 +1523,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
inode, off, len, ceph_cap_string(got), ret); inode, off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs(ci, got); ceph_put_cap_refs(ci, got);
out_free: out_free:
ceph_restore_sigs(&oldset);
ceph_free_cap_flush(prealloc_cf); ceph_free_cap_flush(prealloc_cf);
if (ret < 0)
ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
return ret; return ret;
} }
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out; goto out;
} }
ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false); err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err) if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out_put; goto out_put;
} }
ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false); err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err) if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ; rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool; rd_req->r_base_oloc.pool = pool;
snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
"%llx.00000000", ci->i_vino.ino);
rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
if (err)
goto out_unlock;
wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS); 1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock; goto out_unlock;
} }
wr_req->r_flags = CEPH_OSD_FLAG_WRITE | wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
wr_req->r_base_oloc.pool = pool; ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
wr_req->r_base_oid = rd_req->r_base_oid; ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
if (err)
goto out_unlock;
/* one page should be large enough for STAT data */ /* one page should be large enough for STAT data */
pages = ceph_alloc_page_vector(1, GFP_KERNEL); pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true); 0, false, true);
ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
&ci->vfs_inode.i_mtime);
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, wr_req->r_mtime = ci->vfs_inode.i_mtime;
&ci->vfs_inode.i_mtime);
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err) if (!err)
@@ -1823,9 +1855,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
out_unlock: out_unlock:
up_write(&mdsc->pool_perm_rwsem); up_write(&mdsc->pool_perm_rwsem);
if (rd_req)
ceph_osdc_put_request(rd_req); ceph_osdc_put_request(rd_req);
if (wr_req)
ceph_osdc_put_request(wr_req); ceph_osdc_put_request(wr_req);
out: out:
if (!err) if (!err)
......
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
unlock_page(page); unlock_page(page);
} }
static inline int cache_valid(struct ceph_inode_info *ci) static inline bool cache_valid(struct ceph_inode_info *ci)
{ {
return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
(ci->i_fscache_gen == ci->i_rdcache_gen)); (ci->i_fscache_gen == ci->i_rdcache_gen));
......
@@ -1656,7 +1656,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
*/ */
if ((!is_delayed || mdsc->stopping) && if ((!is_delayed || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */ !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */ inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE| (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
@@ -1698,8 +1698,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
revoking = cap->implemented & ~cap->issued; revoking = cap->implemented & ~cap->issued;
dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued), cap->mds, cap, ceph_cap_string(cap_used),
ceph_cap_string(cap_used), ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented), ceph_cap_string(cap->implemented),
ceph_cap_string(revoking)); ceph_cap_string(revoking));
@@ -2317,7 +2317,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* make sure file is actually open */ /* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci); file_wanted = __ceph_caps_file_wanted(ci);
if ((file_wanted & need) == 0) { if ((file_wanted & need) != need) {
dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
ceph_cap_string(need), ceph_cap_string(file_wanted)); ceph_cap_string(need), ceph_cap_string(file_wanted));
*err = -EBADF; *err = -EBADF;
@@ -2412,13 +2412,27 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out_unlock; goto out_unlock;
} }
if (!__ceph_is_any_caps(ci) && if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { int mds_wanted;
if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
CEPH_MOUNT_SHUTDOWN) {
dout("get_cap_refs %p forced umount\n", inode); dout("get_cap_refs %p forced umount\n", inode);
*err = -EIO; *err = -EIO;
ret = 1; ret = 1;
goto out_unlock; goto out_unlock;
} }
mds_wanted = __ceph_caps_mds_wanted(ci);
if ((mds_wanted & need) != need) {
dout("get_cap_refs %p caps were dropped"
" (session killed?)\n", inode);
*err = -ESTALE;
ret = 1;
goto out_unlock;
}
if ((mds_wanted & file_wanted) ==
(file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
}
dout("get_cap_refs %p have %s needed %s\n", inode, dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need)); ceph_cap_string(have), ceph_cap_string(need));
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
if (err == -EAGAIN) if (err == -EAGAIN)
continue; continue;
if (err < 0) if (err < 0)
return err; ret = err;
} else { } else {
ret = wait_event_interruptible(ci->i_cap_wq, ret = wait_event_interruptible(ci->i_cap_wq,
try_get_cap_refs(ci, need, want, endoff, try_get_cap_refs(ci, need, want, endoff,
@@ -2496,7 +2510,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
continue; continue;
if (err < 0) if (err < 0)
ret = err; ret = err;
if (ret < 0) }
if (ret < 0) {
if (err == -ESTALE) {
/* session was killed, try renew caps */
ret = ceph_renew_caps(&ci->vfs_inode);
if (ret == 0)
continue;
}
return ret; return ret;
} }
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) { !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
if (try_nonblocking_invalidate(inode)) { if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later /* there were locked pages.. invalidate later
in a separate thread. */ in a separate thread. */
@@ -3226,6 +3247,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
if (target < 0) { if (target < 0) {
__ceph_remove_cap(cap, false); __ceph_remove_cap(cap, false);
if (!ci->i_auth_cap)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
goto out_unlock; goto out_unlock;
} }
......
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
path ? path : ""); path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock); spin_unlock(&req->r_old_dentry->d_lock);
kfree(path); kfree(path);
} else if (req->r_path2) { } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino) if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino, seq_printf(s, " #%llx/%s", req->r_ino2.ino,
req->r_path2); req->r_path2);
......
@@ -70,16 +70,42 @@ int ceph_init_dentry(struct dentry *dentry)
} }
/* /*
* for readdir, we encode the directory frag and offset within that * for f_pos for readdir:
* frag into f_pos. * - hash order:
* (0xff << 52) | ((24 bits hash) << 28) |
* (the nth entry has hash collision);
* - frag+name order;
* ((frag value) << 28) | (the nth entry in frag);
*/ */
#define OFFSET_BITS 28
#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
if (hash_order)
fpos |= HASH_ORDER;
return fpos;
}
static bool is_hash_order(loff_t p)
{
return (p & HASH_ORDER) == HASH_ORDER;
}
static unsigned fpos_frag(loff_t p) static unsigned fpos_frag(loff_t p)
{ {
return p >> 32; return p >> OFFSET_BITS;
} }
static unsigned fpos_hash(loff_t p)
{
return ceph_frag_value(fpos_frag(p));
}
static unsigned fpos_off(loff_t p) static unsigned fpos_off(loff_t p)
{ {
return p & 0xffffffff; return p & OFFSET_MASK;
} }
static int fpos_cmp(loff_t l, loff_t r) static int fpos_cmp(loff_t l, loff_t r)
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0; return 0;
} }
static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
struct ceph_readdir_cache_control *cache_ctl)
{
struct inode *dir = d_inode(parent);
struct dentry *dentry;
unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
loff_t ptr_pos = idx * sizeof(struct dentry *);
pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
if (ptr_pos >= i_size_read(dir))
return NULL;
if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
ceph_readdir_cache_release(cache_ctl);
cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
if (!cache_ctl->page) {
dout(" page %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
i_mutex, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
cache_ctl->index = idx & idx_mask;
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
* marked as complete while not holding the i_mutex. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
dentry = NULL;
spin_unlock(&parent->d_lock);
if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
dentry = NULL;
rcu_read_unlock();
return dentry ? : ERR_PTR(-EAGAIN);
}
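
The page/slot arithmetic above can be seen in isolation with a small sketch (illustrative only; 4 KiB pages and 8-byte dentry pointers are assumed, and the ex_/EX_ names are invented):

/* Sketch only: locate the idx-th cached dentry pointer in the directory
 * inode's page cache, mirroring __dcache_find_get_entry(). */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)
#define EX_PTR_SIZE   sizeof(void *)

int main(void)
{
	uint64_t idx = 1000;                        /* cached dentry index */
	uint64_t ptr_pos = idx * EX_PTR_SIZE;       /* byte position of the pointer */
	uint64_t pgoff = ptr_pos >> EX_PAGE_SHIFT;  /* which pagecache page */
	uint64_t slot = idx & (EX_PAGE_SIZE / EX_PTR_SIZE - 1); /* slot in that page */

	printf("idx %llu -> page %llu, slot %llu\n",
	       (unsigned long long)idx,
	       (unsigned long long)pgoff,
	       (unsigned long long)slot);
	return 0;
}
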
/* /*
* When possible, we try to satisfy a readdir by peeking at the * When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on * dcache. We make this work by carefully ordering dentries on
...@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, ...@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct inode *dir = d_inode(parent); struct inode *dir = d_inode(parent);
struct dentry *dentry, *last = NULL; struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
int err = 0;
loff_t ptr_pos = 0;
struct ceph_readdir_cache_control cache_ctl = {}; struct ceph_readdir_cache_control cache_ctl = {};
u64 idx = 0;
int err = 0;
dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
/* search start position */
if (ctx->pos > 2) {
u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
while (count > 0) {
u64 step = count >> 1;
dentry = __dcache_find_get_entry(parent, idx + step,
&cache_ctl);
if (!dentry) {
/* use linear search */
idx = 0;
break;
}
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out;
}
di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
if (fpos_cmp(di->offset, ctx->pos) < 0) {
idx += step + 1;
count -= step + 1;
} else {
count = step;
}
spin_unlock(&dentry->d_lock);
dput(dentry);
}
/* we can calculate cache index for the first dirfrag */ dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
cache_ctl.index = fpos_off(ctx->pos) - 2;
BUG_ON(cache_ctl.index < 0);
ptr_pos = cache_ctl.index * sizeof(struct dentry *);
} }
while (true) {
pgoff_t pgoff;
bool emit_dentry;
if (ptr_pos >= i_size_read(dir)) { for (;;) {
bool emit_dentry = false;
dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
if (!dentry) {
fi->flags |= CEPH_F_ATEND; fi->flags |= CEPH_F_ATEND;
err = 0; err = 0;
break; break;
} }
if (IS_ERR(dentry)) {
err = -EAGAIN; err = PTR_ERR(dentry);
pgoff = ptr_pos >> PAGE_SHIFT; goto out;
if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
ceph_readdir_cache_release(&cache_ctl);
cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
if (!cache_ctl.page) {
dout(" page %lu not found\n", pgoff);
break;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(cache_ctl.page);
cache_ctl.dentries = kmap(cache_ctl.page);
} }
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
* marked as complete while not holding the i_mutex. */
if (ceph_dir_is_complete_ordered(dir) &&
ptr_pos < i_size_read(dir))
dentry = cache_ctl.dentries[cache_ctl.index % nsize];
else
dentry = NULL;
spin_unlock(&parent->d_lock);
if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
dentry = NULL;
rcu_read_unlock();
if (!dentry)
break;
emit_dentry = false;
di = ceph_dentry(dentry); di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
if (di->lease_shared_gen == shared_gen && if (di->lease_shared_gen == shared_gen &&
d_really_is_positive(dentry) && d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0) { fpos_cmp(ctx->pos, di->offset) <= 0) {
emit_dentry = true; emit_dentry = true;
} }
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
if (emit_dentry) { if (emit_dentry) {
dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, dout(" %llx dentry %p %pd %p\n", di->offset,
dentry, dentry, d_inode(dentry)); dentry, dentry, d_inode(dentry));
ctx->pos = di->offset; ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name, if (!dir_emit(ctx, dentry->d_name.name,
...@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, ...@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
} else { } else {
dput(dentry); dput(dentry);
} }
cache_ctl.index++;
ptr_pos += sizeof(struct dentry *);
} }
out:
ceph_readdir_cache_release(&cache_ctl); ceph_readdir_cache_release(&cache_ctl);
if (last) { if (last) {
int ret; int ret;
...@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, ...@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
return err; return err;
} }
static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
{
if (!fi->last_readdir)
return true;
if (is_hash_order(pos))
return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
else
return fi->frag != fpos_frag(pos);
}
static int ceph_readdir(struct file *file, struct dir_context *ctx) static int ceph_readdir(struct file *file, struct dir_context *ctx)
{ {
struct ceph_file_info *fi = file->private_data; struct ceph_file_info *fi = file->private_data;
...@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned frag = fpos_frag(ctx->pos); int i;
int off = fpos_off(ctx->pos);
int err; int err;
u32 ftype; u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo; struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
if (fi->flags & CEPH_F_ATEND) if (fi->flags & CEPH_F_ATEND)
return 0; return 0;
...@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12)) inode->i_mode >> 12))
return 0; return 0;
ctx->pos = 1; ctx->pos = 1;
off = 1;
} }
if (ctx->pos == 1) { if (ctx->pos == 1) {
ino_t ino = parent_ino(file->f_path.dentry); ino_t ino = parent_ino(file->f_path.dentry);
...@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12)) inode->i_mode >> 12))
return 0; return 0;
ctx->pos = 2; ctx->pos = 2;
off = 2;
} }
/* can we use the dcache? */ /* can we use the dcache? */
...@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
err = __dcache_readdir(file, ctx, shared_gen); err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN) if (err != -EAGAIN)
return err; return err;
frag = fpos_frag(ctx->pos);
off = fpos_off(ctx->pos);
} else { } else {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
} }
...@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* proceed with a normal readdir */ /* proceed with a normal readdir */
more: more:
/* do we have the correct frag content buffered? */ /* do we have the correct frag content buffered? */
if (fi->frag != frag || fi->last_readdir == NULL) { if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req; struct ceph_mds_request *req;
unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ? int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
...@@ -305,6 +372,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -305,6 +372,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
fi->last_readdir = NULL; fi->last_readdir = NULL;
} }
if (is_hash_order(ctx->pos)) {
frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
NULL, NULL);
} else {
frag = fpos_frag(ctx->pos);
}
dout("readdir fetching %llx.%llx frag %x offset '%s'\n", dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name); ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
...@@ -331,6 +405,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -331,6 +405,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset; req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.frag = cpu_to_le32(frag);
req->r_args.readdir.flags =
cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
req->r_inode = inode; req->r_inode = inode;
ihold(inode); ihold(inode);
...@@ -340,22 +416,26 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -340,22 +416,26 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
return err; return err;
} }
dout("readdir got and parsed readdir result=%d" dout("readdir got and parsed readdir result=%d on "
" on frag %x, end=%d, complete=%d\n", err, frag, "frag %x, end=%d, complete=%d, hash_order=%d\n",
err, frag,
(int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_end,
(int)req->r_reply_info.dir_complete); (int)req->r_reply_info.dir_complete,
(int)req->r_reply_info.hash_order);
/* note next offset and last dentry name */
rinfo = &req->r_reply_info; rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
off = req->r_readdir_offset; if (!rinfo->hash_order) {
fi->next_offset = off; fi->next_offset = req->r_readdir_offset;
/* adjust ctx->pos to beginning of frag */
ctx->pos = ceph_make_fpos(frag,
fi->next_offset,
false);
}
} }
fi->frag = frag; fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req; fi->last_readdir = req;
if (req->r_did_prepopulate) { if (req->r_did_prepopulate) {
...@@ -363,7 +443,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -363,7 +443,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
if (fi->readdir_cache_idx < 0) { if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */ /* preclude from marking dir ordered */
fi->dir_ordered_count = 0; fi->dir_ordered_count = 0;
} else if (ceph_frag_is_leftmost(frag) && off == 2) { } else if (ceph_frag_is_leftmost(frag) &&
fi->next_offset == 2) {
/* note dir version at start of readdir so /* note dir version at start of readdir so
* we can tell if any dentries get dropped */ * we can tell if any dentries get dropped */
fi->dir_release_count = req->r_dir_release_cnt; fi->dir_release_count = req->r_dir_release_cnt;
...@@ -377,65 +458,87 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -377,65 +458,87 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
fi->dir_release_count = 0; fi->dir_release_count = 0;
} }
if (req->r_reply_info.dir_end) { /* note next offset and last dentry name */
kfree(fi->last_name); if (rinfo->dir_nr > 0) {
fi->last_name = NULL; struct ceph_mds_reply_dir_entry *rde =
if (ceph_frag_is_rightmost(frag)) rinfo->dir_entries + (rinfo->dir_nr-1);
fi->next_offset = 2; unsigned next_offset = req->r_reply_info.dir_end ?
else 2 : (fpos_off(rde->offset) + 1);
fi->next_offset = 0; err = note_last_dentry(fi, rde->name, rde->name_len,
} else { next_offset);
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
rinfo->dir_dname_len[rinfo->dir_nr-1],
fi->next_offset + rinfo->dir_nr);
if (err) if (err)
return err; return err;
} else if (req->r_reply_info.dir_end) {
fi->next_offset = 2;
/* keep last name */
} }
} }
rinfo = &fi->last_readdir->r_reply_info; rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag, dout("readdir frag %x num %d pos %llx chunk first %llx\n",
rinfo->dir_nr, off, fi->offset); fi->frag, rinfo->dir_nr, ctx->pos,
rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
ctx->pos = ceph_make_fpos(frag, off);
while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { i = 0;
struct ceph_mds_reply_inode *in = /* search start position */
rinfo->dir_in[off - fi->offset].in; if (rinfo->dir_nr > 0) {
int step, nr = rinfo->dir_nr;
while (nr > 0) {
step = nr >> 1;
if (rinfo->dir_entries[i + step].offset < ctx->pos) {
i += step + 1;
nr -= step + 1;
} else {
nr = step;
}
}
}
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino; struct ceph_vino vino;
ino_t ino; ino_t ino;
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", BUG_ON(rde->offset < ctx->pos);
off, off - fi->offset, rinfo->dir_nr, ctx->pos,
rinfo->dir_dname_len[off - fi->offset], ctx->pos = rde->offset;
rinfo->dir_dname[off - fi->offset], in); dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
BUG_ON(!in); i, rinfo->dir_nr, ctx->pos,
ftype = le32_to_cpu(in->mode) >> 12; rde->name_len, rde->name, &rde->inode.in);
vino.ino = le64_to_cpu(in->ino);
vino.snap = le64_to_cpu(in->snapid); BUG_ON(!rde->inode.in);
ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rde->inode.in->snapid);
ino = ceph_vino_to_ino(vino); ino = ceph_vino_to_ino(vino);
if (!dir_emit(ctx,
rinfo->dir_dname[off - fi->offset], if (!dir_emit(ctx, rde->name, rde->name_len,
rinfo->dir_dname_len[off - fi->offset],
ceph_translate_ino(inode->i_sb, ino), ftype)) { ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n"); dout("filldir stopping us...\n");
return 0; return 0;
} }
off++;
ctx->pos++; ctx->pos++;
} }
if (fi->last_name) { if (fi->next_offset > 2) {
ceph_mdsc_put_request(fi->last_readdir); ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL; fi->last_readdir = NULL;
goto more; goto more;
} }
/* more frags? */ /* more frags? */
if (!ceph_frag_is_rightmost(frag)) { if (!ceph_frag_is_rightmost(fi->frag)) {
frag = ceph_frag_next(frag); unsigned frag = ceph_frag_next(fi->frag);
off = 0; if (is_hash_order(ctx->pos)) {
ctx->pos = ceph_make_fpos(frag, off); loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
fi->next_offset, true);
if (new_pos > ctx->pos)
ctx->pos = new_pos;
/* keep last_name */
} else {
ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
kfree(fi->last_name);
fi->last_name = NULL;
}
dout("readdir next frag is %x\n", frag); dout("readdir next frag is %x\n", frag);
goto more; goto more;
} }
...@@ -467,7 +570,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -467,7 +570,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
return 0; return 0;
} }
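
The start-position scan in the loop above is a plain lower-bound binary search over the offset-sorted reply entries; a standalone sketch (illustrative only, with invented names):

/* Sketch only: first index whose offset is >= pos, mirroring the
 * step/nr loop used by ceph_readdir() to find its starting entry. */
#include <stdio.h>
#include <stdint.h>

static int ex_lower_bound(const uint64_t *offsets, int nr, uint64_t pos)
{
	int i = 0;

	while (nr > 0) {
		int step = nr >> 1;

		if (offsets[i + step] < pos) {
			i += step + 1;
			nr -= step + 1;
		} else {
			nr = step;
		}
	}
	return i;
}

int main(void)
{
	uint64_t offsets[] = { 2, 3, 4, 7, 9, 12 };

	/* prints 3: offsets[3] == 7 is the first entry at or after pos 5 */
	printf("start index for pos 5: %d\n", ex_lower_bound(offsets, 6, 5));
	return 0;
}
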
static void reset_readdir(struct ceph_file_info *fi, unsigned frag) static void reset_readdir(struct ceph_file_info *fi)
{ {
if (fi->last_readdir) { if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir); ceph_mdsc_put_request(fi->last_readdir);
...@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) ...@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
fi->last_name = NULL; fi->last_name = NULL;
fi->dir_release_count = 0; fi->dir_release_count = 0;
fi->readdir_cache_idx = -1; fi->readdir_cache_idx = -1;
if (ceph_frag_is_leftmost(frag))
fi->next_offset = 2; /* compensate for . and .. */ fi->next_offset = 2; /* compensate for . and .. */
else
fi->next_offset = 0;
fi->flags &= ~CEPH_F_ATEND; fi->flags &= ~CEPH_F_ATEND;
} }
/*
* discard buffered readdir content on seekdir(0), or seek to new frag,
* or seek prior to current chunk
*/
static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
{
struct ceph_mds_reply_info_parsed *rinfo;
loff_t chunk_offset;
if (new_pos == 0)
return true;
if (is_hash_order(new_pos)) {
/* no need to reset last_name for a forward seek when
* dentries are sorted in hash order */
} else if (fi->frag != fpos_frag(new_pos)) {
return true;
}
rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
if (!rinfo || !rinfo->dir_nr)
return true;
chunk_offset = rinfo->dir_entries[0].offset;
return new_pos < chunk_offset ||
is_hash_order(new_pos) != is_hash_order(chunk_offset);
}
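
A standalone sketch of this reset decision (illustrative only; it reuses the offset layout defined earlier in this file and invented ex_/EX_ names rather than the kernel helpers):

/* Sketch only: when a seekdir() target forces the buffered readdir
 * chunk to be dropped.  cur_frag/chunk_off describe the buffered chunk. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define EX_OFFSET_BITS 28
#define EX_HASH_ORDER  (0xffULL << (EX_OFFSET_BITS + 24))

static bool ex_is_hash_order(uint64_t p) { return (p & EX_HASH_ORDER) == EX_HASH_ORDER; }
static uint64_t ex_fpos_frag(uint64_t p) { return p >> EX_OFFSET_BITS; }

static bool ex_need_reset(uint64_t cur_frag, uint64_t chunk_off, uint64_t new_pos)
{
	if (new_pos == 0)
		return true;                               /* seekdir(0) */
	if (!ex_is_hash_order(new_pos) && ex_fpos_frag(new_pos) != cur_frag)
		return true;                               /* seek to another frag */
	return new_pos < chunk_off ||                      /* seek before the chunk */
	       ex_is_hash_order(new_pos) != ex_is_hash_order(chunk_off);
}

int main(void)
{
	uint64_t frag = 0x1800000;                         /* "1*" frag */
	uint64_t chunk = (frag << EX_OFFSET_BITS) | 2;     /* first buffered entry */

	printf("rewind to 0:       %d\n", ex_need_reset(frag, chunk, 0));          /* 1 */
	printf("forward in chunk:  %d\n", ex_need_reset(frag, chunk, chunk + 10)); /* 0 */
	printf("seek before chunk: %d\n", ex_need_reset(frag, chunk, chunk - 1));  /* 1 */
	return 0;
}
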
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{ {
struct ceph_file_info *fi = file->private_data; struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval; loff_t retval;
inode_lock(inode); inode_lock(inode);
...@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) ...@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
} }
if (offset >= 0) { if (offset >= 0) {
if (need_reset_readdir(fi, offset)) {
dout("dir_llseek dropping %p content\n", file);
reset_readdir(fi);
} else if (is_hash_order(offset) && offset > file->f_pos) {
/* for hash offset, we don't know if a forward seek
* is within same frag */
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
}
if (offset != file->f_pos) { if (offset != file->f_pos) {
file->f_pos = offset; file->f_pos = offset;
file->f_version = 0; file->f_version = 0;
fi->flags &= ~CEPH_F_ATEND; fi->flags &= ~CEPH_F_ATEND;
} }
retval = offset; retval = offset;
if (offset == 0 ||
fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
/* discard buffered readdir content on seekdir(0), or
* seek to new frag, or seek prior to current chunk */
dout("dir_llseek dropping %p content\n", file);
reset_readdir(fi, fpos_frag(offset));
} else if (fpos_cmp(offset, old_offset) > 0) {
/* reset dir_release_count if we did a forward seek */
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
}
} }
out: out:
inode_unlock(inode); inode_unlock(inode);
...@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, ...@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
return dentry; return dentry;
} }
static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{ {
return ceph_ino(inode) == CEPH_INO_ROOT && return ceph_ino(inode) == CEPH_INO_ROOT &&
strncmp(dentry->d_name.name, ".ceph", 5) == 0; strncmp(dentry->d_name.name, ".ceph", 5) == 0;
......
...@@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ...@@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
return ret; return ret;
} }
/*
* try renew caps after session gets killed.
*/
int ceph_renew_caps(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
inode, ceph_cap_string(wanted), ceph_cap_string(issued));
ceph_check_caps(ci, 0, NULL);
return 0;
}
spin_unlock(&ci->i_ceph_lock);
flags = 0;
if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
flags = O_RDWR;
else if (wanted & CEPH_CAP_FILE_RD)
flags = O_RDONLY;
else if (wanted & CEPH_CAP_FILE_WR)
flags = O_WRONLY;
#ifdef O_LAZY
if (wanted & CEPH_CAP_FILE_LAZYIO)
flags |= O_LAZY;
#endif
req = prepare_open_request(inode->i_sb, flags, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
}
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
out:
dout("renew caps %p open result=%d\n", inode, err);
return err < 0 ? err : 0;
}
/* /*
* If we already have the requisite capabilities, we can satisfy * If we already have the requisite capabilities, we can satisfy
* the open request locally (no need to request new caps from the * the open request locally (no need to request new caps from the
...@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode, ...@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
kfree(aio_req); kfree(aio_req);
} }
static void ceph_aio_complete_req(struct ceph_osd_request *req, static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct ceph_msg *msg)
{ {
int rc = req->r_result; int rc = req->r_result;
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
...@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE; CEPH_OSD_FLAG_WRITE;
req->r_base_oloc = orig_req->r_base_oloc; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
req->r_base_oid = orig_req->r_base_oid; ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
if (ret) {
ceph_osdc_put_request(req);
req = orig_req;
goto out;
}
req->r_ops[0] = orig_req->r_ops[0]; req->r_ops[0] = orig_req->r_ops[0];
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
ceph_osdc_build_request(req, req->r_ops[0].extent.offset, req->r_mtime = aio_req->mtime;
snapc, CEPH_NOSNAP, &aio_req->mtime); req->r_data_offset = req->r_ops[0].extent.offset;
ceph_osdc_put_request(orig_req); ceph_osdc_put_request(orig_req);
...@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
out: out:
if (ret < 0) { if (ret < 0) {
req->r_result = ret; req->r_result = ret;
ceph_aio_complete_req(req, NULL); ceph_aio_complete_req(req);
} }
ceph_put_snap_context(snapc); ceph_put_snap_context(snapc);
...@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) ...@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
list_add_tail(&req->r_unsafe_item, list_add_tail(&req->r_unsafe_item,
&ci->i_unsafe_writes); &ci->i_unsafe_writes);
spin_unlock(&ci->i_unsafe_lock); spin_unlock(&ci->i_unsafe_lock);
complete_all(&req->r_completion);
} else { } else {
spin_lock(&ci->i_unsafe_lock); spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_item); list_del_init(&req->r_unsafe_item);
...@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(pos+len) | (PAGE_SIZE - 1)); (pos+len) | (PAGE_SIZE - 1));
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
req->r_mtime = mtime;
} }
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
false, false); false, false);
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
if (aio_req) { if (aio_req) {
aio_req->total_len += len; aio_req->total_len += len;
aio_req->num_reqs++; aio_req->num_reqs++;
...@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req, false); req, false);
if (ret < 0) { if (ret < 0) {
req->r_result = ret; req->r_result = ret;
ceph_aio_complete_req(req, NULL); ceph_aio_complete_req(req);
} }
} }
return -EIOCBQUEUED; return -EIOCBQUEUED;
...@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
false, true); false, true);
/* BUG_ON(vino.snap != CEPH_NOSNAP); */ req->r_mtime = mtime;
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
...@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode, ...@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out; goto out;
} }
ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, req->r_mtime = inode->i_mtime;
&inode->i_mtime);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) { if (!ret) {
ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <linux/xattr.h> #include <linux/xattr.h>
#include <linux/posix_acl.h> #include <linux/posix_acl.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/sort.h>
#include "super.h" #include "super.h"
#include "mds_client.h" #include "mds_client.h"
...@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, ...@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
diri_auth = ci->i_auth_cap->mds; diri_auth = ci->i_auth_cap->mds;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (mds == -1) /* CDIR_AUTH_PARENT */
mds = diri_auth;
mutex_lock(&ci->i_fragtree_mutex); mutex_lock(&ci->i_fragtree_mutex);
if (ndist == 0 && mds == diri_auth) { if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */ /* no delegation info needed. */
...@@ -300,20 +304,38 @@ static int ceph_fill_dirfrag(struct inode *inode, ...@@ -300,20 +304,38 @@ static int ceph_fill_dirfrag(struct inode *inode,
return err; return err;
} }
static int frag_tree_split_cmp(const void *l, const void *r)
{
struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
return ceph_frag_compare(ls->frag, rs->frag);
}
static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
{
if (!frag)
return f == ceph_frag_make(0, 0);
if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
return false;
return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
}
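
is_frag_child() leans on the ceph frag encoding, (bits << 24) | value with the value left-aligned in the 24-bit hash space; a standalone sketch of that encoding (illustrative only, invented ex_ names):

/* Sketch only: the frag_t packing and the containment test that
 * is_frag_child() builds on. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint32_t ex_frag_make(uint32_t b, uint32_t v)
{
	return (b << 24) | (v & ((0xffffffu << (24 - b)) & 0xffffffu));
}
static uint32_t ex_frag_bits(uint32_t f)  { return f >> 24; }
static uint32_t ex_frag_value(uint32_t f) { return f & 0xffffffu; }
static uint32_t ex_frag_mask(uint32_t f)
{
	return (0xffffffu << (24 - ex_frag_bits(f))) & 0xffffffu;
}
static bool ex_frag_contains(uint32_t f, uint32_t v)
{
	return (v & ex_frag_mask(f)) == ex_frag_value(f);
}

int main(void)
{
	uint32_t parent = ex_frag_make(1, 0x800000);  /* "1*": upper half */
	uint32_t child  = ex_frag_make(2, 0xc00000);  /* "11": upper quarter */

	/* a child of a frag split by 1 has bits+1 and a value inside the parent */
	printf("child bits ok:   %d\n",
	       ex_frag_bits(child) == ex_frag_bits(parent) + 1);
	printf("value contained: %d\n",
	       ex_frag_contains(parent, ex_frag_value(child)));
	return 0;
}
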
static int ceph_fill_fragtree(struct inode *inode, static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree, struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo) struct ceph_mds_reply_dirfrag *dirinfo)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag; struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node; struct rb_node *rb_node;
int i; unsigned i, split_by, nsplits;
u32 id, nsplits; u32 id;
bool update = false; bool update = false;
mutex_lock(&ci->i_fragtree_mutex); mutex_lock(&ci->i_fragtree_mutex);
nsplits = le32_to_cpu(fragtree->nsplits); nsplits = le32_to_cpu(fragtree->nsplits);
if (nsplits) { if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
i = prandom_u32() % nsplits; i = prandom_u32() % nsplits;
id = le32_to_cpu(fragtree->splits[i].frag); id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id)) if (!__ceph_find_frag(ci, id))
...@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, ...@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
if (!update) if (!update)
goto out_unlock; goto out_unlock;
if (nsplits > 1) {
sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
frag_tree_split_cmp, NULL);
}
dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree); rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) { for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag); id = le32_to_cpu(fragtree->splits[i].frag);
split_by = le32_to_cpu(fragtree->splits[i].by);
if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
"frag %x split by %d\n", ceph_vinop(inode),
i, nsplits, id, split_by);
continue;
}
frag = NULL; frag = NULL;
while (rb_node) { while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node); frag = rb_entry(rb_node, struct ceph_inode_frag, node);
...@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, ...@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
break; break;
} }
rb_node = rb_next(rb_node); rb_node = rb_next(rb_node);
/* delete stale split/leaf node */
if (frag->split_by > 0 ||
!is_frag_child(frag->frag, prev_frag)) {
rb_erase(&frag->node, &ci->i_fragtree); rb_erase(&frag->node, &ci->i_fragtree);
if (frag->split_by > 0)
ci->i_fragtree_nsplits--;
kfree(frag); kfree(frag);
}
frag = NULL; frag = NULL;
} }
if (!frag) { if (!frag) {
...@@ -356,15 +396,24 @@ static int ceph_fill_fragtree(struct inode *inode, ...@@ -356,15 +396,24 @@ static int ceph_fill_fragtree(struct inode *inode,
if (IS_ERR(frag)) if (IS_ERR(frag))
continue; continue;
} }
frag->split_by = le32_to_cpu(fragtree->splits[i].by); if (frag->split_by == 0)
ci->i_fragtree_nsplits++;
frag->split_by = split_by;
dout(" frag %x split by %d\n", frag->frag, frag->split_by); dout(" frag %x split by %d\n", frag->frag, frag->split_by);
prev_frag = frag;
} }
while (rb_node) { while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node); frag = rb_entry(rb_node, struct ceph_inode_frag, node);
rb_node = rb_next(rb_node); rb_node = rb_next(rb_node);
/* delete stale split/leaf node */
if (frag->split_by > 0 ||
!is_frag_child(frag->frag, prev_frag)) {
rb_erase(&frag->node, &ci->i_fragtree); rb_erase(&frag->node, &ci->i_fragtree);
if (frag->split_by > 0)
ci->i_fragtree_nsplits--;
kfree(frag); kfree(frag);
} }
}
out_unlock: out_unlock:
mutex_unlock(&ci->i_fragtree_mutex); mutex_unlock(&ci->i_fragtree_mutex);
return 0; return 0;
...@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) ...@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
rb_erase(n, &ci->i_fragtree); rb_erase(n, &ci->i_fragtree);
kfree(frag); kfree(frag);
} }
ci->i_fragtree_nsplits = 0;
__ceph_destroy_xattrs(ci); __ceph_destroy_xattrs(ci);
if (ci->i_xattrs.blob) if (ci->i_xattrs.blob)
...@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) ...@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
return 1; return 1;
} }
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
}
/* /*
* Helpers to fill in size, ctime, mtime, and atime. We have to be * Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date * careful because either the client or MDS may have more up to date
...@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ...@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
size = 0; size = 0;
} }
i_size_write(inode, size); i_size_write(inode, size);
inode->i_blocks = (size + (1<<9) - 1) >> 9; inode->i_blocks = calc_inode_blocks(size);
ci->i_reported_size = size; ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) { if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n", dout("truncate_seq %u -> %u\n",
...@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
err = -EINVAL; if (symlen != i_size_read(inode)) {
if (WARN_ON(symlen != i_size_read(inode))) pr_err("fill_inode %llx.%llx BAD symlink "
goto out; "size %lld\n", ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
}
err = -ENOMEM; err = -ENOMEM;
sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
...@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, ...@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
int i, err = 0; int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino; struct ceph_vino vino;
struct inode *in; struct inode *in;
int rc; int rc;
vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); vino.snap = le64_to_cpu(rde->inode.in->snapid);
in = ceph_get_inode(req->r_dentry->d_sb, vino); in = ceph_get_inode(req->r_dentry->d_sb, vino);
if (IS_ERR(in)) { if (IS_ERR(in)) {
...@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, ...@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err); dout("new_inode badness got %d\n", err);
continue; continue;
} }
rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, rc = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1, req->r_request_started, -1,
&req->r_caps_reservation); &req->r_caps_reservation);
if (rc < 0) { if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc); pr_err("fill_inode badness on %p got %d\n", in, rc);
err = rc; err = rc;
continue;
} }
iput(in);
} }
return err; return err;
...@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
struct dentry *parent = req->r_dentry; struct dentry *parent = req->r_dentry;
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct qstr dname; struct qstr dname;
struct dentry *dn; struct dentry *dn;
...@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
int err = 0, skipped = 0, ret, i; int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL; struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 frag = le32_to_cpu(rhead->args.readdir.frag);
u32 last_hash = 0;
u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {}; struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted) if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session); return readdir_prepopulate_inodes_only(req, session);
if (rinfo->hash_order && req->r_path2) {
last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
req->r_path2, strlen(req->r_path2));
last_hash = ceph_frag_value(last_hash);
}
if (rinfo->dir_dir && if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) { le32_to_cpu(rinfo->dir_dir->frag) != frag) {
dout("readdir_prepopulate got new frag %x -> %x\n", dout("readdir_prepopulate got new frag %x -> %x\n",
frag, le32_to_cpu(rinfo->dir_dir->frag)); frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag)) if (!rinfo->hash_order)
req->r_readdir_offset = 2; req->r_readdir_offset = 2;
else
req->r_readdir_offset = 0;
} }
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
...@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell /* note dir version at start of readdir so we can tell
* if any dentries get dropped */ * if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0; req->r_readdir_cache_idx = 0;
} }
cache_ctl.index = req->r_readdir_cache_idx; cache_ctl.index = req->r_readdir_cache_idx;
fpos_offset = req->r_readdir_offset;
/* FIXME: release caps/leases if error occurs */ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino; struct ceph_vino vino;
dname.name = rinfo->dir_dname[i]; dname.name = rde->name;
dname.len = rinfo->dir_dname_len[i]; dname.len = rde->name_len;
dname.hash = full_name_hash(dname.name, dname.len); dname.hash = full_name_hash(dname.name, dname.len);
vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); vino.snap = le64_to_cpu(rde->inode.in->snapid);
if (rinfo->hash_order) {
u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
rde->name, rde->name_len);
hash = ceph_frag_value(hash);
if (hash != last_hash)
fpos_offset = 2;
last_hash = hash;
rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
} else {
rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
}
retry_lookup: retry_lookup:
dn = d_lookup(parent, &dname); dn = d_lookup(parent, &dname);
...@@ -1490,7 +1569,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1490,7 +1569,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
} }
ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, ret = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1, req->r_request_started, -1,
&req->r_caps_reservation); &req->r_caps_reservation);
if (ret < 0) { if (ret < 0) {
...@@ -1523,11 +1602,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1523,11 +1602,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
dn = realdn; dn = realdn;
} }
di = dn->d_fsdata; ceph_dentry(dn)->offset = rde->offset;
di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i], update_dentry_lease(dn, rde->lease, req->r_session,
req->r_session,
req->r_request_started); req->r_request_started);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
...@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) ...@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
i_size_write(inode, size); i_size_write(inode, size);
inode->i_blocks = (size + (1 << 9) - 1) >> 9; inode->i_blocks = calc_inode_blocks(size);
/* tell the MDS if we are approaching max_size */ /* tell the MDS if we are approaching max_size */
if ((size << 1) >= ci->i_max_size && if ((size << 1) >= ci->i_max_size &&
...@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) ...@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_pg_inv_work); i_pg_inv_work);
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen; u32 orig_gen;
int check = 0; int check = 0;
mutex_lock(&ci->i_truncate_mutex); mutex_lock(&ci->i_truncate_mutex);
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
inode, ceph_ino(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
goto out;
}
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode, dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking); ci->i_rdcache_gen, ci->i_rdcache_revoking);
...@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) ...@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen; orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
truncate_pagecache(inode, 0); if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_pages %p fails\n", inode);
}
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen && if (orig_gen == ci->i_rdcache_gen &&
...@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ...@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if ((issued & CEPH_CAP_FILE_EXCL) && if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) { attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size); i_size_write(inode, attr->ia_size);
inode->i_blocks = inode->i_blocks = calc_inode_blocks(attr->ia_size);
(attr->ia_size + (1 << 9) - 1) >> 9;
inode->i_ctime = attr->ia_ctime; inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size; ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL; dirtied |= CEPH_CAP_FILE_EXCL;
......
...@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
if (copy_from_user(&dl, arg, sizeof(dl))) if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT; return -EFAULT;
down_read(&osdc->map_sem); down_read(&osdc->lock);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset, &dl.object_no, &dl.object_offset,
&olen); &olen);
if (r < 0) { if (r < 0) {
up_read(&osdc->map_sem); up_read(&osdc->lock);
return -EIO; return -EIO;
} }
dl.file_offset -= dl.object_offset; dl.file_offset -= dl.object_offset;
...@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
ceph_ino(inode), dl.object_no); ceph_ino(inode), dl.object_no);
oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
ceph_oid_set_name(&oid, dl.object_name); ceph_oid_printf(&oid, "%s", dl.object_name);
r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
if (r < 0) { if (r < 0) {
up_read(&osdc->map_sem); up_read(&osdc->lock);
return r; return r;
} }
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
if (dl.osd >= 0) { if (dl.osd >= 0) {
struct ceph_entity_addr *a = struct ceph_entity_addr *a =
ceph_osd_addr(osdc->osdmap, dl.osd); ceph_osd_addr(osdc->osdmap, dl.osd);
...@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
} else { } else {
memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
} }
up_read(&osdc->map_sem); up_read(&osdc->lock);
/* send result back to user */ /* send result back to user */
if (copy_to_user(arg, &dl, sizeof(dl))) if (copy_to_user(arg, &dl, sizeof(dl)))
......
...@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, ...@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
ceph_decode_need(p, end, sizeof(num) + 2, bad); ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p); num = ceph_decode_32(p);
info->dir_end = ceph_decode_8(p); {
info->dir_complete = ceph_decode_8(p); u16 flags = ceph_decode_16(p);
info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
}
if (num == 0) if (num == 0)
goto done; goto done;
BUG_ON(!info->dir_in); BUG_ON(!info->dir_entries);
info->dir_dname = (void *)(info->dir_in + num); if ((unsigned long)(info->dir_entries + num) >
info->dir_dname_len = (void *)(info->dir_dname + num); (unsigned long)info->dir_entries + info->dir_buf_size) {
info->dir_dlease = (void *)(info->dir_dname_len + num);
if ((unsigned long)(info->dir_dlease + num) >
(unsigned long)info->dir_in + info->dir_buf_size) {
pr_err("dir contents are larger than expected\n"); pr_err("dir contents are larger than expected\n");
WARN_ON(1); WARN_ON(1);
goto bad; goto bad;
...@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, ...@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_nr = num; info->dir_nr = num;
while (num) { while (num) {
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */ /* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad); ceph_decode_need(p, end, sizeof(u32)*2, bad);
info->dir_dname_len[i] = ceph_decode_32(p); rde->name_len = ceph_decode_32(p);
ceph_decode_need(p, end, info->dir_dname_len[i], bad); ceph_decode_need(p, end, rde->name_len, bad);
info->dir_dname[i] = *p; rde->name = *p;
*p += info->dir_dname_len[i]; *p += rde->name_len;
dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
info->dir_dname[i]); rde->lease = *p;
info->dir_dlease[i] = *p;
*p += sizeof(struct ceph_mds_reply_lease); *p += sizeof(struct ceph_mds_reply_lease);
/* inode */ /* inode */
err = parse_reply_info_in(p, end, &info->dir_in[i], features); err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
/* ceph_readdir_prepopulate() will update it */
rde->offset = 0;
i++; i++;
num--; num--;
} }
...@@ -345,9 +348,9 @@ static int parse_reply_info(struct ceph_msg *msg, ...@@ -345,9 +348,9 @@ static int parse_reply_info(struct ceph_msg *msg,
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{ {
if (!info->dir_in) if (!info->dir_entries)
return; return;
free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
} }
...@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) ...@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req); kfree(req);
} }
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
/* /*
* lookup session, bump ref if found. * lookup session, bump ref if found.
* *
* called under mdsc->mutex. * called under mdsc->mutex.
*/ */
static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, static struct ceph_mds_request *
u64 tid) lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{ {
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct rb_node *n = mdsc->request_tree.rb_node;
while (n) {
req = rb_entry(n, struct ceph_mds_request, r_node);
if (tid < req->r_tid)
n = n->rb_left;
else if (tid > req->r_tid)
n = n->rb_right;
else {
ceph_mdsc_get_request(req);
return req;
}
}
return NULL;
}
static void __insert_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *new)
{
struct rb_node **p = &mdsc->request_tree.rb_node;
struct rb_node *parent = NULL;
struct ceph_mds_request *req = NULL;
while (*p) { req = lookup_request(&mdsc->request_tree, tid);
parent = *p; if (req)
req = rb_entry(parent, struct ceph_mds_request, r_node); ceph_mdsc_get_request(req);
if (new->r_tid < req->r_tid)
p = &(*p)->rb_left;
else if (new->r_tid > req->r_tid)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->r_node, parent, p); return req;
rb_insert_color(&new->r_node, &mdsc->request_tree);
} }
/* /*
...@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, ...@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
req->r_num_caps); req->r_num_caps);
dout("__register_request %p tid %lld\n", req, req->r_tid); dout("__register_request %p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req); ceph_mdsc_get_request(req);
__insert_request(mdsc, req); insert_request(&mdsc->request_tree, req);
req->r_uid = current_fsuid(); req->r_uid = current_fsuid();
req->r_gid = current_fsgid(); req->r_gid = current_fsgid();
...@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, ...@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
} }
} }
rb_erase(&req->r_node, &mdsc->request_tree); erase_request(&mdsc->request_tree, req);
RB_CLEAR_NODE(&req->r_node);
if (req->r_unsafe_dir && req->r_got_unsafe) { if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
...@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 ...@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_bytes = 0; int metadata_bytes = 0;
int metadata_key_count = 0; int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
void *p; void *p;
const char* metadata[][2] = { const char* metadata[][2] = {
{"hostname", utsname()->nodename}, {"hostname", utsname()->nodename},
{"kernel_version", utsname()->release}, {"kernel_version", utsname()->release},
{"entity_id", opt->name ? opt->name : ""}, {"entity_id", opt->name ? : ""},
{"root", fsopt->server_path ? : "/"},
{NULL, NULL} {NULL, NULL}
}; };
...@@ -1149,9 +1125,11 @@ static int iterate_session_caps(struct ceph_mds_session *session, ...@@ -1149,9 +1125,11 @@ static int iterate_session_caps(struct ceph_mds_session *session,
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg) void *arg)
{ {
struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
LIST_HEAD(to_remove); LIST_HEAD(to_remove);
int drop = 0; bool drop = false;
bool invalidate = false;
dout("removing cap %p, ci is %p, inode is %p\n", dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode); cap, ci, &ci->vfs_inode);
...@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap, false); __ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) { if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf; struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc = struct ceph_mds_client *mdsc = fsc->mdsc;
ceph_sb_to_client(inode->i_sb)->mdsc;
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
if (ci->i_wrbuffer_ref > 0 &&
ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
while (true) { while (true) {
struct rb_node *n = rb_first(&ci->i_cap_flush_tree); struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
...@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
inode, ceph_ino(inode)); inode, ceph_ino(inode));
ci->i_dirty_caps = 0; ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item); list_del_init(&ci->i_dirty_item);
drop = 1; drop = true;
} }
if (!list_empty(&ci->i_flushing_item)) { if (!list_empty(&ci->i_flushing_item)) {
pr_warn_ratelimited( pr_warn_ratelimited(
...@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_flushing_caps = 0; ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item); list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--; mdsc->num_cap_flushing--;
drop = 1; drop = true;
} }
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
...@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_del(&cf->list); list_del(&cf->list);
ceph_free_cap_flush(cf); ceph_free_cap_flush(cf);
} }
while (drop--)
wake_up_all(&ci->i_cap_wq);
if (invalidate)
ceph_queue_invalidate(inode);
if (drop)
iput(inode); iput(inode);
return 0; return 0;
} }
...@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
*/ */
static void remove_session_caps(struct ceph_mds_session *session) static void remove_session_caps(struct ceph_mds_session *session)
{ {
struct ceph_fs_client *fsc = session->s_mdsc->fsc;
struct super_block *sb = fsc->sb;
dout("remove_session_caps on %p\n", session); dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, NULL); iterate_session_caps(session, remove_session_caps_cb, fsc);
spin_lock(&session->s_cap_lock); spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) { if (session->s_nr_caps > 0) {
struct super_block *sb = session->s_mdsc->fsc->sb;
struct inode *inode; struct inode *inode;
struct ceph_cap *cap, *prev = NULL; struct ceph_cap *cap, *prev = NULL;
struct ceph_vino vino; struct ceph_vino vino;
...@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
wake_up_all(&ci->i_cap_wq);
if (arg) { if (arg) {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0; ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0; ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
} }
wake_up_all(&ci->i_cap_wq);
return 0; return 0;
} }
...@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, ...@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + size_t size = sizeof(struct ceph_mds_reply_dir_entry);
sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
int order, num_entries; int order, num_entries;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
...@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, ...@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries); order = get_order(size * num_entries);
while (order >= 0) { while (order >= 0) {
rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
__GFP_NOWARN, __GFP_NOWARN,
order); order);
if (rinfo->dir_in) if (rinfo->dir_entries)
break; break;
order--; order--;
} }
if (!rinfo->dir_in) if (!rinfo->dir_entries)
return -ENOMEM; return -ENOMEM;
num_entries = (PAGE_SIZE << order) / size; num_entries = (PAGE_SIZE << order) / size;
...@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) ...@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_target_item); INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1; req->r_fmode = -1;
kref_init(&req->r_kref); kref_init(&req->r_kref);
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait); INIT_LIST_HEAD(&req->r_wait);
init_completion(&req->r_completion); init_completion(&req->r_completion);
init_completion(&req->r_safe_completion); init_completion(&req->r_safe_completion);
...@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* get request, session */ /* get request, session */
tid = le64_to_cpu(msg->hdr.tid); tid = le64_to_cpu(msg->hdr.tid);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
req = __lookup_request(mdsc, tid); req = lookup_get_request(mdsc, tid);
if (!req) { if (!req) {
dout("handle_reply on unknown tid %llu\n", tid); dout("handle_reply on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
...@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, ...@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
fwd_seq = ceph_decode_32(&p); fwd_seq = ceph_decode_32(&p);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
req = __lookup_request(mdsc, tid); req = lookup_get_request(mdsc, tid);
if (!req) { if (!req) {
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
goto out; /* dup reply? */ goto out; /* dup reply? */
......
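__lookup_request() is renamed to lookup_get_request() in the two hunks above; the new name reflects that the helper is also expected to take a reference on the request before mdsc->mutex is dropped. A minimal sketch of such a helper, assuming the request tree keeps its r_tid key and the existing ceph_mdsc_get_request() ref helper (the lookup itself would come from the DEFINE_RB_FUNCS() machinery added later in this series):

	static struct ceph_mds_request *
	lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
	{
		struct ceph_mds_request *req;

		req = lookup_request(&mdsc->request_tree, tid);	/* rb-tree lookup by tid */
		if (req)
			ceph_mdsc_get_request(req);	/* hold a ref for the caller */
		return req;
	}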
...@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { ...@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
u32 pool_ns_len; u32 pool_ns_len;
}; };
struct ceph_mds_reply_dir_entry {
char *name;
u32 name_len;
struct ceph_mds_reply_lease *lease;
struct ceph_mds_reply_info_in inode;
loff_t offset;
};
/* /*
* parsed info about an mds reply, including information about * parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry, * either: 1) the target inode and/or its parent directory and dentry,
...@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { ...@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir; struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size; size_t dir_buf_size;
int dir_nr; int dir_nr;
char **dir_dname; bool dir_complete;
u32 *dir_dname_len; bool dir_end;
struct ceph_mds_reply_lease **dir_dlease; bool hash_order;
struct ceph_mds_reply_info_in *dir_in; struct ceph_mds_reply_dir_entry *dir_entries;
u8 dir_complete, dir_end;
}; };
/* for create results */ /* for create results */
......
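The four parallel readdir result arrays (dir_dname, dir_dname_len, dir_dlease, dir_in) collapse into a single dir_entries array, and each entry now carries its own readdir offset. A minimal consumer sketch using only the fields declared above (illustrative, not code from this commit):

	static void dump_readdir_entries(struct ceph_mds_reply_info_parsed *rinfo)
	{
		int i;

		for (i = 0; i < rinfo->dir_nr; i++) {
			struct ceph_mds_reply_dir_entry *rde = &rinfo->dir_entries[i];

			pr_debug("%d: %.*s at offset %lld\n", i,
				 (int)rde->name_len, rde->name, rde->offset);
		}
	}

Because each entry records its own offset, readdir can resume from a stable, hash-ordered position without the request-wide 'offset' field that is removed from ceph_file_info further below.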
...@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
const void *start = *p; const void *start = *p;
int i, j, n; int i, j, n;
int err = -EINVAL; int err = -EINVAL;
u16 version; u8 mdsmap_v, mdsmap_cv;
m = kzalloc(sizeof(*m), GFP_NOFS); m = kzalloc(sizeof(*m), GFP_NOFS);
if (m == NULL) if (m == NULL)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad); ceph_decode_need(p, end, 1 + 1, bad);
if (version > 3) { mdsmap_v = ceph_decode_8(p);
pr_warn("got mdsmap version %d > 3, failing", version); mdsmap_cv = ceph_decode_8(p);
if (mdsmap_v >= 4) {
u32 mdsmap_len;
ceph_decode_32_safe(p, end, mdsmap_len, bad);
if (end < *p + mdsmap_len)
goto bad; goto bad;
end = *p + mdsmap_len;
} }
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
...@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 namelen; u32 namelen;
s32 mds, inc, state; s32 mds, inc, state;
u64 state_seq; u64 state_seq;
u8 infoversion; u8 info_v;
void *info_end = NULL;
struct ceph_entity_addr addr; struct ceph_entity_addr addr;
u32 num_export_targets; u32 num_export_targets;
void *pexport_targets = NULL; void *pexport_targets = NULL;
struct ceph_timespec laggy_since; struct ceph_timespec laggy_since;
struct ceph_mds_info *info; struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p); global_id = ceph_decode_64(p);
infoversion = ceph_decode_8(p); info_v = ceph_decode_8(p);
if (info_v >= 4) {
u32 info_len;
u8 info_cv;
ceph_decode_need(p, end, 1 + sizeof(u32), bad);
info_cv = ceph_decode_8(p);
info_len = ceph_decode_32(p);
info_end = *p + info_len;
if (info_end > end)
goto bad;
}
ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
*p += sizeof(u64); *p += sizeof(u64);
namelen = ceph_decode_32(p); /* skip mds name */ namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen; *p += namelen;
...@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
*p += sizeof(u32); *p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad); ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen; *p += namelen;
if (infoversion >= 2) { if (info_v >= 2) {
ceph_decode_32_safe(p, end, num_export_targets, bad); ceph_decode_32_safe(p, end, num_export_targets, bad);
pexport_targets = *p; pexport_targets = *p;
*p += num_export_targets * sizeof(u32); *p += num_export_targets * sizeof(u32);
...@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
num_export_targets = 0; num_export_targets = 0;
} }
if (info_end && *p != info_end) {
if (*p > info_end)
goto bad;
*p = info_end;
}
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
i+1, n, global_id, mds, inc, i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr), ceph_pr_addr(&addr.in_addr),
...@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_cas_pg_pool = ceph_decode_64(p); m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */ /* ok, we don't care about the rest. */
*p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch); dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m; return m;
......
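From version 4 on, both the mdsmap header and each per-MDS info block are wrapped in a (version, compat version, length) envelope, so the decoder can jump over trailing fields it does not understand instead of failing. The same pattern as a standalone sketch (hypothetical helper name; the decode and bounds helpers are the ones used above):

	static int skip_versioned_struct(void **p, void *end)
	{
		u8 struct_v;
		u32 struct_len;

		if (!ceph_has_room(p, end, 1 + 1 + sizeof(u32)))
			return -EINVAL;
		struct_v = ceph_decode_8(p);	/* encoding version */
		ceph_decode_8(p);		/* compat version, ignored here */
		struct_len = ceph_decode_32(p);
		if (!ceph_has_room(p, end, struct_len))
			return -EINVAL;
		pr_debug("skipping struct v%d, %u bytes\n", struct_v, struct_len);
		*p += struct_len;		/* land exactly on the recorded end */
		return 0;
	}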
...@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) ...@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
* mount options * mount options
*/ */
enum { enum {
Opt_mds_namespace,
Opt_wsize, Opt_wsize,
Opt_rsize, Opt_rsize,
Opt_rasize, Opt_rasize,
...@@ -143,6 +144,7 @@ enum { ...@@ -143,6 +144,7 @@ enum {
}; };
static match_table_t fsopt_tokens = { static match_table_t fsopt_tokens = {
{Opt_mds_namespace, "mds_namespace=%d"},
{Opt_wsize, "wsize=%d"}, {Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"}, {Opt_rsize, "rsize=%d"},
{Opt_rasize, "rasize=%d"}, {Opt_rasize, "rasize=%d"},
...@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) ...@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
break; break;
/* misc */ /* misc */
case Opt_mds_namespace:
fsopt->mds_namespace = intval;
break;
case Opt_wsize: case Opt_wsize:
fsopt->wsize = intval; fsopt->wsize = intval;
break; break;
...@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) ...@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{ {
dout("destroy_mount_options %p\n", args); dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name); kfree(args->snapdir_name);
kfree(args->server_path);
kfree(args); kfree(args);
} }
...@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ...@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret) if (ret)
return ret; return ret;
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
return ceph_compare_options(new_opt, fsc->client); return ceph_compare_options(new_opt, fsc->client);
} }
static int parse_mount_options(struct ceph_mount_options **pfsopt, static int parse_mount_options(struct ceph_mount_options **pfsopt,
struct ceph_options **popt, struct ceph_options **popt,
int flags, char *options, int flags, char *options,
const char *dev_name, const char *dev_name)
const char **path)
{ {
struct ceph_mount_options *fsopt; struct ceph_mount_options *fsopt;
const char *dev_name_end; const char *dev_name_end;
...@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, ...@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb(); fsopt->congestion_kb = default_congestion_kb();
fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
/* /*
* Distinguish the server list from the path in "dev_name". * Distinguish the server list from the path in "dev_name".
...@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, ...@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*/ */
dev_name_end = strchr(dev_name, '/'); dev_name_end = strchr(dev_name, '/');
if (dev_name_end) { if (dev_name_end) {
/* skip over leading '/' for path */ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
*path = dev_name_end + 1; if (!fsopt->server_path) {
err = -ENOMEM;
goto out;
}
} else { } else {
/* path is empty */
dev_name_end = dev_name + strlen(dev_name); dev_name_end = dev_name + strlen(dev_name);
*path = dev_name_end;
} }
err = -EINVAL; err = -EINVAL;
dev_name_end--; /* back up to ':' separator */ dev_name_end--; /* back up to ':' separator */
...@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, ...@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
goto out; goto out;
} }
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
dout("server path '%s'\n", *path); if (fsopt->server_path)
dout("server path '%s'\n", fsopt->server_path);
*popt = ceph_parse_options(options, dev_name, dev_name_end, *popt = ceph_parse_options(options, dev_name, dev_name_end,
parse_fsopt_token, (void *)fsopt); parse_fsopt_token, (void *)fsopt);
...@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl"); seq_puts(m, ",noacl");
#endif #endif
if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
if (fsopt->wsize) if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize); seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT) if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
...@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, ...@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
{ {
struct ceph_fs_client *fsc; struct ceph_fs_client *fsc;
const u64 supported_features = const u64 supported_features =
CEPH_FEATURE_FLOCK | CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
CEPH_FEATURE_DIRLAYOUTHASH | CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
CEPH_FEATURE_MDS_INLINE_DATA;
const u64 required_features = 0; const u64 required_features = 0;
int page_count; int page_count;
size_t size; size_t size;
...@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, ...@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail; goto fail;
} }
fsc->client->extra_mon_dispatch = extra_mon_dispatch; fsc->client->extra_mon_dispatch = extra_mon_dispatch;
fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
fsc->mount_options = fsopt; fsc->mount_options = fsopt;
...@@ -785,8 +799,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, ...@@ -785,8 +799,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
/* /*
* mount: join the ceph cluster, and open root directory. * mount: join the ceph cluster, and open root directory.
*/ */
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
const char *path)
{ {
int err; int err;
unsigned long started = jiffies; /* note the start time */ unsigned long started = jiffies; /* note the start time */
...@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, ...@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto fail; goto fail;
} }
if (path[0] == 0) { if (!fsc->mount_options->server_path) {
root = fsc->sb->s_root; root = fsc->sb->s_root;
dget(root); dget(root);
} else { } else {
dout("mount opening base mountpoint\n"); const char *path = fsc->mount_options->server_path + 1;
dout("mount opening path %s\n", path);
root = open_root_dentry(fsc, path, started); root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) { if (IS_ERR(root)) {
err = PTR_ERR(root); err = PTR_ERR(root);
...@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, ...@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct dentry *res; struct dentry *res;
int err; int err;
int (*compare_super)(struct super_block *, void *) = ceph_compare_super; int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
const char *path = NULL;
struct ceph_mount_options *fsopt = NULL; struct ceph_mount_options *fsopt = NULL;
struct ceph_options *opt = NULL; struct ceph_options *opt = NULL;
...@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, ...@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
flags |= MS_POSIXACL; flags |= MS_POSIXACL;
#endif #endif
err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
if (err < 0) { if (err < 0) {
res = ERR_PTR(err); res = ERR_PTR(err);
goto out_final; goto out_final;
...@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, ...@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
} }
} }
res = ceph_real_mount(fsc, path); res = ceph_real_mount(fsc);
if (IS_ERR(res)) if (IS_ERR(res))
goto out_splat; goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", res, dout("root %p inode %p ino %llx.%llx\n", res,
......
...@@ -62,6 +62,7 @@ struct ceph_mount_options { ...@@ -62,6 +62,7 @@ struct ceph_mount_options {
int cap_release_safety; int cap_release_safety;
int max_readdir; /* max readdir result (entries) */ int max_readdir; /* max readdir result (entries) */
int max_readdir_bytes; /* max readdir result (bytes) */ int max_readdir_bytes; /* max readdir result (bytes) */
int mds_namespace;
/* /*
* everything above this point can be memcmp'd; everything below * everything above this point can be memcmp'd; everything below
...@@ -69,6 +70,7 @@ struct ceph_mount_options { ...@@ -69,6 +70,7 @@ struct ceph_mount_options {
*/ */
char *snapdir_name; /* default ".snap" */ char *snapdir_name; /* default ".snap" */
char *server_path; /* default "/" */
}; };
struct ceph_fs_client { struct ceph_fs_client {
...@@ -295,6 +297,7 @@ struct ceph_inode_info { ...@@ -295,6 +297,7 @@ struct ceph_inode_info {
u64 i_files, i_subdirs; u64 i_files, i_subdirs;
struct rb_root i_fragtree; struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex; struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs; struct ceph_inode_xattrs_info i_xattrs;
...@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, ...@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count, long long release_count,
...@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) ...@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
return (struct ceph_dentry_info *)dentry->d_fsdata; return (struct ceph_dentry_info *)dentry->d_fsdata;
} }
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
return ((loff_t)frag << 32) | (loff_t)off;
}
/* /*
* caps helpers * caps helpers
*/ */
...@@ -632,7 +631,6 @@ struct ceph_file_info { ...@@ -632,7 +631,6 @@ struct ceph_file_info {
struct ceph_mds_request *last_readdir; struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */ /* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */ char *last_name; /* last entry in previous chunk */
long long dir_release_count; long long dir_release_count;
...@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); ...@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */ /* file.c */
extern const struct file_operations ceph_file_fops; extern const struct file_operations ceph_file_fops;
extern int ceph_renew_caps(struct inode *inode);
extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode, struct file *file, unsigned flags, umode_t mode,
...@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops; ...@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops; ceph_snapdir_dentry_ops;
extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
extern int ceph_handle_snapdir(struct ceph_mds_request *req, extern int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err); struct dentry *dentry, int err);
......
...@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ...@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
char buf[128]; char buf[128];
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->map_sem); down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) { if (pool_name) {
size_t len = strlen(pool_name); size_t len = strlen(pool_name);
...@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ...@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
ret = -ERANGE; ret = -ERANGE;
} }
} }
up_read(&osdc->map_sem); up_read(&osdc->lock);
return ret; return ret;
} }
...@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, ...@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
s64 pool = ceph_file_layout_pg_pool(ci->i_layout); s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
const char *pool_name; const char *pool_name;
down_read(&osdc->map_sem); down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) if (pool_name)
ret = snprintf(val, size, "%s", pool_name); ret = snprintf(val, size, "%s", pool_name);
else else
ret = snprintf(val, size, "%lld", (unsigned long long)pool); ret = snprintf(val, size, "%lld", (unsigned long long)pool);
up_read(&osdc->map_sem); up_read(&osdc->lock);
return ret; return ret;
} }
...@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, ...@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_pagelist *pagelist = NULL; struct ceph_pagelist *pagelist = NULL;
int op = CEPH_MDS_OP_SETXATTR;
int err; int err;
if (size > 0) { if (size > 0) {
...@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, ...@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (err) if (err)
goto out; goto out;
} else if (!value) { } else if (!value) {
if (flags & CEPH_XATTR_REPLACE)
op = CEPH_MDS_OP_RMXATTR;
else
flags |= CEPH_XATTR_REMOVE; flags |= CEPH_XATTR_REMOVE;
} }
dout("setxattr value=%.*s\n", (int)size, value); dout("setxattr value=%.*s\n", (int)size, value);
/* do request */ /* do request */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
USE_AUTH_MDS);
if (IS_ERR(req)) { if (IS_ERR(req)) {
err = PTR_ERR(req); err = PTR_ERR(req);
goto out; goto out;
} }
req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_path2 = kstrdup(name, GFP_NOFS); req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) { if (!req->r_path2) {
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
...@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, ...@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
goto out; goto out;
} }
if (op == CEPH_MDS_OP_SETXATTR) {
req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_pagelist = pagelist; req->r_pagelist = pagelist;
pagelist = NULL; pagelist = NULL;
}
req->r_inode = inode; req->r_inode = inode;
ihold(inode); ihold(inode);
......
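For a NULL value, the old code always sent CEPH_MDS_OP_SETXATTR with the CEPH_XATTR_REMOVE flag; now a NULL value combined with CEPH_XATTR_REPLACE is sent as an explicit CEPH_MDS_OP_RMXATTR, and the setxattr-specific flags and pagelist are attached only in the SETXATTR case. The op selection, restated as a standalone sketch:

	static int xattr_null_value_op(int flags)
	{
		if (flags & CEPH_XATTR_REPLACE)
			return CEPH_MDS_OP_RMXATTR;	/* removal that must find the xattr */

		return CEPH_MDS_OP_SETXATTR;		/* removal carried by CEPH_XATTR_REMOVE */
	}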
...@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) ...@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
return ceph_frag_make(newbits, return ceph_frag_make(newbits,
ceph_frag_value(f) | (i << (24 - newbits))); ceph_frag_value(f) | (i << (24 - newbits)));
} }
static inline int ceph_frag_is_leftmost(__u32 f) static inline bool ceph_frag_is_leftmost(__u32 f)
{ {
return ceph_frag_value(f) == 0; return ceph_frag_value(f) == 0;
} }
static inline int ceph_frag_is_rightmost(__u32 f) static inline bool ceph_frag_is_rightmost(__u32 f)
{ {
return ceph_frag_value(f) == ceph_frag_mask(f); return ceph_frag_value(f) == ceph_frag_mask(f);
} }
......
...@@ -153,8 +153,9 @@ struct ceph_dir_layout { ...@@ -153,8 +153,9 @@ struct ceph_dir_layout {
/* watch-notify operations */ /* watch-notify operations */
enum { enum {
WATCH_NOTIFY = 1, /* notifying watcher */ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
}; };
...@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack { ...@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
struct ceph_fsid fsid; struct ceph_fsid fsid;
} __attribute__ ((packed)); } __attribute__ ((packed));
#define CEPH_FS_CLUSTER_ID_NONE -1
/* /*
* mdsmap flags * mdsmap flags
*/ */
...@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op); ...@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REPLACE (1 << 1)
#define CEPH_XATTR_REMOVE (1 << 31) #define CEPH_XATTR_REMOVE (1 << 31)
/*
* readdir request flags;
*/
#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
/*
* readdir reply flags.
*/
#define CEPH_READDIR_FRAG_END (1<<0)
#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
#define CEPH_READDIR_HASH_ORDER (1<<9)
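A readdir request now carries a 16-bit flags word (CEPH_READDIR_REPLY_BITFLAGS asks the MDS for the extended reply format), and the reply reports per-fragment state through the bits above. A sketch of how those bits would map onto the bool fields added to ceph_mds_reply_info_parsed earlier (illustrative; the actual decoding happens in the MDS reply parser):

	static void apply_readdir_reply_flags(struct ceph_mds_reply_info_parsed *rinfo,
					      u16 flags)
	{
		rinfo->dir_end = !!(flags & CEPH_READDIR_FRAG_END);		/* no more entries in this frag */
		rinfo->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);	/* whole frag was returned */
		rinfo->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);	/* entries sorted by name hash */
	}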
union ceph_mds_request_args { union ceph_mds_request_args {
struct { struct {
__le32 mask; /* CEPH_CAP_* */ __le32 mask; /* CEPH_CAP_* */
...@@ -361,6 +376,7 @@ union ceph_mds_request_args { ...@@ -361,6 +376,7 @@ union ceph_mds_request_args {
__le32 frag; /* which dir fragment */ __le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */ __le32 max_entries; /* how many dentries to grab */
__le32 max_bytes; __le32 max_bytes;
__le16 flags;
} __attribute__ ((packed)) readdir; } __attribute__ ((packed)) readdir;
struct { struct {
__le32 mode; __le32 mode;
......
...@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) ...@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
/* /*
* bounds check input. * bounds check input.
*/ */
static inline int ceph_has_room(void **p, void *end, size_t n) static inline bool ceph_has_room(void **p, void *end, size_t n)
{ {
return end >= *p && n <= end - *p; return end >= *p && n <= end - *p;
} }
......
...@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) ...@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
(off >> PAGE_SHIFT); (off >> PAGE_SHIFT);
} }
/*
* These are not meant to be generic - an integer key is assumed.
*/
#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
static void insert_##name(struct rb_root *root, type *t) \
{ \
struct rb_node **n = &root->rb_node; \
struct rb_node *parent = NULL; \
\
BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
\
while (*n) { \
type *cur = rb_entry(*n, type, nodefld); \
\
parent = *n; \
if (t->keyfld < cur->keyfld) \
n = &(*n)->rb_left; \
else if (t->keyfld > cur->keyfld) \
n = &(*n)->rb_right; \
else \
BUG(); \
} \
\
rb_link_node(&t->nodefld, parent, n); \
rb_insert_color(&t->nodefld, root); \
} \
static void erase_##name(struct rb_root *root, type *t) \
{ \
BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
rb_erase(&t->nodefld, root); \
RB_CLEAR_NODE(&t->nodefld); \
}
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
static type *lookup_##name(struct rb_root *root, \
typeof(((type *)0)->keyfld) key) \
{ \
struct rb_node *n = root->rb_node; \
\
while (n) { \
type *cur = rb_entry(n, type, nodefld); \
\
if (key < cur->keyfld) \
n = n->rb_left; \
else if (key > cur->keyfld) \
n = n->rb_right; \
else \
return cur; \
} \
\
return NULL; \
}
#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep; extern struct kmem_cache *ceph_cap_flush_cachep;
......
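DEFINE_RB_FUNCS(name, type, keyfld, nodefld) expands into static insert_<name>/erase_<name>/lookup_<name> helpers for an rb-tree keyed by a single integer field; the node must be kept in the RB_EMPTY_NODE state while not inserted (hence the RB_CLEAR_NODE(&req->r_node) added to ceph_mdsc_create_request above). A hypothetical usage sketch, with an illustrative type rather than one from this commit:

	struct demo_item {
		u64 id;
		struct rb_node node;
	};

	DEFINE_RB_FUNCS(demo_item, struct demo_item, id, node)

	static void demo(struct rb_root *root, struct demo_item *item)
	{
		RB_CLEAR_NODE(&item->node);		/* required before insert_*() */
		insert_demo_item(root, item);		/* BUGs on a duplicate id */
		WARN_ON(lookup_demo_item(root, item->id) != item);
		erase_demo_item(root, item);		/* leaves the node cleared again */
	}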
...@@ -39,20 +39,31 @@ struct ceph_mon_request { ...@@ -39,20 +39,31 @@ struct ceph_mon_request {
ceph_monc_request_func_t do_request; ceph_monc_request_func_t do_request;
}; };
typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
/* /*
* ceph_mon_generic_request is being used for the statfs and * ceph_mon_generic_request is being used for the statfs and
* mon_get_version requests which are being done a bit differently * mon_get_version requests which are being done a bit differently
* because we need to get data back to the caller * because we need to get data back to the caller
*/ */
struct ceph_mon_generic_request { struct ceph_mon_generic_request {
struct ceph_mon_client *monc;
struct kref kref; struct kref kref;
u64 tid; u64 tid;
struct rb_node node; struct rb_node node;
int result; int result;
void *buf;
struct completion completion; struct completion completion;
ceph_monc_callback_t complete_cb;
u64 private_data; /* r_tid/linger_id */
struct ceph_msg *request; /* original request */ struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */ struct ceph_msg *reply; /* and reply */
union {
struct ceph_statfs *st;
u64 newest;
} u;
}; };
struct ceph_mon_client { struct ceph_mon_client {
...@@ -77,7 +88,6 @@ struct ceph_mon_client { ...@@ -77,7 +88,6 @@ struct ceph_mon_client {
/* pending generic requests */ /* pending generic requests */
struct rb_root generic_request_tree; struct rb_root generic_request_tree;
int num_generic_requests;
u64 last_tid; u64 last_tid;
/* subs, indexed with CEPH_SUB_* */ /* subs, indexed with CEPH_SUB_* */
...@@ -86,6 +96,7 @@ struct ceph_mon_client { ...@@ -86,6 +96,7 @@ struct ceph_mon_client {
bool want; bool want;
u32 have; /* epoch */ u32 have; /* epoch */
} subs[3]; } subs[3];
int fs_cluster_id; /* "mdsmap.<id>" sub */
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file; struct dentry *debugfs_file;
...@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[]; ...@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
bool continuous); bool continuous);
void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
void ceph_monc_renew_subs(struct ceph_mon_client *monc);
extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
unsigned long timeout); unsigned long timeout);
extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
struct ceph_statfs *buf); struct ceph_statfs *buf);
extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
const char *what, u64 *newest); u64 *newest);
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
ceph_monc_callback_t cb, u64 private_data);
extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_open_session(struct ceph_mon_client *monc);
......
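ceph_monc_do_get_version() becomes ceph_monc_get_version(), and a non-blocking variant is added that completes through the new complete_cb/private_data fields of ceph_mon_generic_request. A hedged usage sketch of the async form (the callback body is illustrative):

	static void got_newest_osdmap(struct ceph_mon_generic_request *req)
	{
		if (req->result)
			pr_warn("get_version failed: %d\n", req->result);
		else
			pr_info("newest osdmap version %llu\n", req->u.newest);
	}

	static int request_newest_osdmap(struct ceph_mon_client *monc)
	{
		/* the final 0 is the caller's private_data (e.g. an r_tid/linger_id) */
		return ceph_monc_get_version_async(monc, "osdmap", got_newest_osdmap, 0);
	}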
...@@ -20,10 +20,11 @@ struct ceph_osd_client; ...@@ -20,10 +20,11 @@ struct ceph_osd_client;
/* /*
* completion callback for async writepages * completion callback for async writepages
*/ */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
struct ceph_msg *);
typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
#define CEPH_HOMELESS_OSD -1
/* a given osd we're communicating with */ /* a given osd we're communicating with */
struct ceph_osd { struct ceph_osd {
atomic_t o_ref; atomic_t o_ref;
...@@ -32,16 +33,15 @@ struct ceph_osd { ...@@ -32,16 +33,15 @@ struct ceph_osd {
int o_incarnation; int o_incarnation;
struct rb_node o_node; struct rb_node o_node;
struct ceph_connection o_con; struct ceph_connection o_con;
struct list_head o_requests; struct rb_root o_requests;
struct list_head o_linger_requests; struct rb_root o_linger_requests;
struct list_head o_osd_lru; struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth; struct ceph_auth_handshake o_auth;
unsigned long lru_ttl; unsigned long lru_ttl;
int o_marked_for_keepalive;
struct list_head o_keepalive_item; struct list_head o_keepalive_item;
struct mutex lock;
}; };
#define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_SLAB_OPS 2
#define CEPH_OSD_MAX_OPS 16 #define CEPH_OSD_MAX_OPS 16
...@@ -104,15 +104,21 @@ struct ceph_osd_req_op { ...@@ -104,15 +104,21 @@ struct ceph_osd_req_op {
struct ceph_osd_data response_data; struct ceph_osd_data response_data;
__u8 class_len; __u8 class_len;
__u8 method_len; __u8 method_len;
__u8 argc; u32 indata_len;
} cls; } cls;
struct { struct {
u64 cookie; u64 cookie;
u64 ver; __u8 op; /* CEPH_OSD_WATCH_OP_ */
u32 prot_ver; u32 gen;
u32 timeout;
__u8 flag;
} watch; } watch;
struct {
struct ceph_osd_data request_data;
} notify_ack;
struct {
u64 cookie;
struct ceph_osd_data request_data;
struct ceph_osd_data response_data;
} notify;
struct { struct {
u64 expected_object_size; u64 expected_object_size;
u64 expected_write_size; u64 expected_write_size;
...@@ -120,60 +126,73 @@ struct ceph_osd_req_op { ...@@ -120,60 +126,73 @@ struct ceph_osd_req_op {
}; };
}; };
struct ceph_osd_request_target {
struct ceph_object_id base_oid;
struct ceph_object_locator base_oloc;
struct ceph_object_id target_oid;
struct ceph_object_locator target_oloc;
struct ceph_pg pgid;
u32 pg_num;
u32 pg_num_mask;
struct ceph_osds acting;
struct ceph_osds up;
int size;
int min_size;
bool sort_bitwise;
unsigned int flags; /* CEPH_OSD_FLAG_* */
bool paused;
int osd;
};
/* an in-flight request */ /* an in-flight request */
struct ceph_osd_request { struct ceph_osd_request {
u64 r_tid; /* unique for this client */ u64 r_tid; /* unique for this client */
struct rb_node r_node; struct rb_node r_node;
struct list_head r_req_lru_item; struct rb_node r_mc_node; /* map check */
struct list_head r_osd_item;
struct list_head r_linger_item;
struct list_head r_linger_osd_item;
struct ceph_osd *r_osd; struct ceph_osd *r_osd;
struct ceph_pg r_pgid;
int r_pg_osds[CEPH_PG_MAX_SIZE]; struct ceph_osd_request_target r_t;
int r_num_pg_osds; #define r_base_oid r_t.base_oid
#define r_base_oloc r_t.base_oloc
#define r_flags r_t.flags
struct ceph_msg *r_request, *r_reply; struct ceph_msg *r_request, *r_reply;
int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */ u32 r_sent; /* >0 if r_request is sending/sent */
/* request osd ops array */ /* request osd ops array */
unsigned int r_num_ops; unsigned int r_num_ops;
/* these are updated on each send */
__le32 *r_request_osdmap_epoch;
__le32 *r_request_flags;
__le64 *r_request_pool;
void *r_request_pgid;
__le32 *r_request_attempts;
bool r_paused;
struct ceph_eversion *r_request_reassert_version;
int r_result; int r_result;
int r_got_reply; bool r_got_reply;
int r_linger;
struct ceph_osd_client *r_osdc; struct ceph_osd_client *r_osdc;
struct kref r_kref; struct kref r_kref;
bool r_mempool; bool r_mempool;
struct completion r_completion, r_safe_completion; struct completion r_completion;
struct completion r_safe_completion; /* fsync waiter */
ceph_osdc_callback_t r_callback; ceph_osdc_callback_t r_callback;
ceph_osdc_unsafe_callback_t r_unsafe_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback;
struct ceph_eversion r_reassert_version;
struct list_head r_unsafe_item; struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */ struct inode *r_inode; /* for use by callbacks */
void *r_priv; /* ditto */ void *r_priv; /* ditto */
struct ceph_object_locator r_base_oloc; /* set by submitter */
struct ceph_object_id r_base_oid; u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
struct ceph_object_locator r_target_oloc; struct ceph_snap_context *r_snapc; /* for writes */
struct ceph_object_id r_target_oid; struct timespec r_mtime; /* ditto */
u64 r_data_offset; /* ditto */
u64 r_snapid; bool r_linger; /* don't resend on failure */
unsigned long r_stamp; /* send OR check time */
struct ceph_snap_context *r_snapc; /* snap context for writes */ /* internal */
unsigned long r_stamp; /* jiffies, send or check time */
int r_attempts;
struct ceph_eversion r_replay_version; /* aka reassert_version */
u32 r_last_force_resend;
u32 r_map_dne_bound;
struct ceph_osd_req_op r_ops[]; struct ceph_osd_req_op r_ops[];
}; };
...@@ -182,44 +201,70 @@ struct ceph_request_redirect { ...@@ -182,44 +201,70 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc; struct ceph_object_locator oloc;
}; };
struct ceph_osd_event { typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 cookie; u64 notifier_id, void *data, size_t data_len);
int one_shot; typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
struct ceph_osd_linger_request {
struct ceph_osd_client *osdc; struct ceph_osd_client *osdc;
void (*cb)(u64, u64, u8, void *); u64 linger_id;
void *data; bool committed;
struct rb_node node; bool is_watch; /* watch or notify */
struct list_head osd_node;
struct kref kref; struct ceph_osd *osd;
}; struct ceph_osd_request *reg_req;
struct ceph_osd_request *ping_req;
unsigned long ping_sent;
unsigned long watch_valid_thru;
struct list_head pending_lworks;
struct ceph_osd_request_target t;
u32 last_force_resend;
u32 map_dne_bound;
struct timespec mtime;
struct ceph_osd_event_work { struct kref kref;
struct work_struct work; struct mutex lock;
struct ceph_osd_event *event; struct rb_node node; /* osd */
u64 ver; struct rb_node osdc_node; /* osdc */
struct rb_node mc_node; /* map check */
struct list_head scan_item;
struct completion reg_commit_wait;
struct completion notify_finish_wait;
int reg_commit_error;
int notify_finish_error;
int last_error;
u32 register_gen;
u64 notify_id; u64 notify_id;
u8 opcode;
rados_watchcb2_t wcb;
rados_watcherrcb_t errcb;
void *data;
struct page ***preply_pages;
size_t *preply_len;
}; };
struct ceph_osd_client { struct ceph_osd_client {
struct ceph_client *client; struct ceph_client *client;
struct ceph_osdmap *osdmap; /* current map */ struct ceph_osdmap *osdmap; /* current map */
struct rw_semaphore map_sem; struct rw_semaphore lock;
struct completion map_waiters;
u64 last_requested_map;
struct mutex request_mutex;
struct rb_root osds; /* osds */ struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */ struct list_head osd_lru; /* idle osds */
u64 timeout_tid; /* tid of timeout triggering rq */ spinlock_t osd_lru_lock;
u64 last_tid; /* tid of last request */ struct ceph_osd homeless_osd;
struct rb_root requests; /* pending requests */ atomic64_t last_tid; /* tid of last request */
struct list_head req_lru; /* in-flight lru */ u64 last_linger_id;
struct list_head req_unsent; /* unsent/need-resend queue */ struct rb_root linger_requests; /* lingering requests */
struct list_head req_notarget; /* map to no osd */ struct rb_root map_checks;
struct list_head req_linger; /* lingering requests */ struct rb_root linger_map_checks;
int num_requests; atomic_t num_requests;
atomic_t num_homeless;
struct delayed_work timeout_work; struct delayed_work timeout_work;
struct delayed_work osds_timeout_work; struct delayed_work osds_timeout_work;
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
...@@ -231,10 +276,6 @@ struct ceph_osd_client { ...@@ -231,10 +276,6 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply; struct ceph_msgpool msgpool_op_reply;
spinlock_t event_lock;
struct rb_root event_tree;
u64 event_count;
struct workqueue_struct *notify_wq; struct workqueue_struct *notify_wq;
}; };
...@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, ...@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
extern struct ceph_osd_data *osd_req_op_extent_osd_data( extern struct ceph_osd_data *osd_req_op_extent_osd_data(
struct ceph_osd_request *osd_req, struct ceph_osd_request *osd_req,
unsigned int which); unsigned int which);
extern struct ceph_osd_data *osd_req_op_cls_response_data(
struct ceph_osd_request *osd_req,
unsigned int which);
extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
unsigned int which, unsigned int which,
...@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, ...@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value, u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode); size_t size, u8 cmp_op, u8 cmp_mode);
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode,
u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which, unsigned int which,
u64 expected_object_size, u64 expected_object_size,
...@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * ...@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
unsigned int num_ops, unsigned int num_ops,
bool use_mempool, bool use_mempool,
gfp_t gfp_flags); gfp_t gfp_flags);
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
struct ceph_snap_context *snapc,
u64 snap_id,
struct timespec *mtime);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout, struct ceph_file_layout *layout,
...@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, ...@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
u32 truncate_seq, u64 truncate_size, u32 truncate_seq, u64 truncate_size,
bool use_mempool); bool use_mempool);
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req);
...@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, ...@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
extern void ceph_osdc_sync(struct ceph_osd_client *osdc); extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino, struct ceph_vino vino,
...@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, ...@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct timespec *mtime, struct timespec *mtime,
struct page **pages, int nr_pages); struct page **pages, int nr_pages);
/* watch/notify events */ /* watch/notify */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, struct ceph_osd_linger_request *
void (*event_cb)(u64, u64, u8, void *), ceph_osdc_watch(struct ceph_osd_client *osdc,
void *data, struct ceph_osd_event **pevent); struct ceph_object_id *oid,
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); struct ceph_object_locator *oloc,
extern void ceph_osdc_put_event(struct ceph_osd_event *event); rados_watchcb2_t wcb,
rados_watcherrcb_t errcb,
void *data);
int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
struct ceph_osd_linger_request *lreq);
int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
u64 notify_id,
u64 cookie,
void *payload,
size_t payload_len);
int ceph_osdc_notify(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
void *payload,
size_t payload_len,
u32 timeout,
struct page ***preply_pages,
size_t *preply_len);
int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
struct ceph_osd_linger_request *lreq);
#endif #endif
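The event-based watch API (ceph_osdc_create_event() and friends) is replaced by linger requests that mirror librados' watch2/notify2 interface: a watch is registered once and kept alive by the OSD client (pinged and reconnected as needed, per reg_req/ping_req above), notifies are delivered to a rados_watchcb2_t, and errors to a rados_watcherrcb_t. A hedged usage sketch of the new entry points (the object, locator, and callback bodies are illustrative):

	static struct ceph_object_id my_oid;		/* set up elsewhere, e.g. via ceph_oid_aprintf() */
	static struct ceph_object_locator my_oloc;

	static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
				u64 notifier_id, void *data, size_t data_len)
	{
		struct ceph_osd_client *osdc = arg;

		/* ack so the notifier's ceph_osdc_notify() can complete */
		ceph_osdc_notify_ack(osdc, &my_oid, &my_oloc, notify_id, cookie,
				     NULL, 0);
	}

	static void my_watch_errcb(void *arg, u64 cookie, int err)
	{
		pr_warn("watch error %d on cookie %llu\n", err, cookie);
	}

	static struct ceph_osd_linger_request *my_watch(struct ceph_osd_client *osdc)
	{
		/* returns the handle to pass to ceph_osdc_unwatch() on teardown */
		return ceph_osdc_watch(osdc, &my_oid, &my_oloc,
				       my_watch_cb, my_watch_errcb, osdc);
	}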
...@@ -24,21 +24,29 @@ struct ceph_pg { ...@@ -24,21 +24,29 @@ struct ceph_pg {
uint32_t seed; uint32_t seed;
}; };
#define CEPH_POOL_FLAG_HASHPSPOOL 1 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
together */
#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
struct ceph_pg_pool_info { struct ceph_pg_pool_info {
struct rb_node node; struct rb_node node;
s64 id; s64 id;
u8 type; u8 type; /* CEPH_POOL_TYPE_* */
u8 size; u8 size;
u8 min_size;
u8 crush_ruleset; u8 crush_ruleset;
u8 object_hash; u8 object_hash;
u32 last_force_request_resend;
u32 pg_num, pgp_num; u32 pg_num, pgp_num;
int pg_num_mask, pgp_num_mask; int pg_num_mask, pgp_num_mask;
s64 read_tier; s64 read_tier;
s64 write_tier; /* wins for read+write ops */ s64 write_tier; /* wins for read+write ops */
u64 flags; u64 flags; /* CEPH_POOL_FLAG_* */
char *name; char *name;
bool was_full; /* for handle_one_map() */
}; };
static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
...@@ -57,6 +65,22 @@ struct ceph_object_locator { ...@@ -57,6 +65,22 @@ struct ceph_object_locator {
s64 pool; s64 pool;
}; };
static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
{
oloc->pool = -1;
}
static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
{
return oloc->pool == -1;
}
static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
const struct ceph_object_locator *src)
{
dest->pool = src->pool;
}
/* /*
* Maximum supported by kernel client object name length * Maximum supported by kernel client object name length
* *
...@@ -64,11 +88,47 @@ struct ceph_object_locator { ...@@ -64,11 +88,47 @@ struct ceph_object_locator {
*/ */
#define CEPH_MAX_OID_NAME_LEN 100 #define CEPH_MAX_OID_NAME_LEN 100
/*
* 51-char inline_name is long enough for all cephfs and all but one
* rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
* arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
* other rbd requests fit into inline_name.
*
* Makes ceph_object_id 64 bytes on 64-bit.
*/
#define CEPH_OID_INLINE_LEN 52
/*
* Both inline and external buffers have space for a NUL-terminator,
* which is carried around. It's not required though - RADOS object
* names don't have to be NUL-terminated and may contain NULs.
*/
struct ceph_object_id { struct ceph_object_id {
char name[CEPH_MAX_OID_NAME_LEN]; char *name;
char inline_name[CEPH_OID_INLINE_LEN];
int name_len; int name_len;
}; };
static inline void ceph_oid_init(struct ceph_object_id *oid)
{
oid->name = oid->inline_name;
oid->name_len = 0;
}
static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
{
return oid->name == oid->inline_name && !oid->name_len;
}
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src);
__printf(2, 3)
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
__printf(3, 4)
int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, ...);
void ceph_oid_destroy(struct ceph_object_id *oid);
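ceph_object_id is no longer a fixed 100-byte buffer: names up to CEPH_OID_INLINE_LEN-1 characters live in the embedded inline_name, longer ones are allocated and pointed to by name. A hedged usage sketch (the "rbd_header." prefix is only an example):

	static int build_header_oid(struct ceph_object_id *oid, const char *image_id)
	{
		ceph_oid_init(oid);	/* name = inline_name, name_len = 0 */
		return ceph_oid_aprintf(oid, GFP_KERNEL, "rbd_header.%s", image_id);
	}

	/* pair with ceph_oid_destroy(oid), which is expected to free only a name
	 * that no longer fit in the inline buffer */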
struct ceph_pg_mapping { struct ceph_pg_mapping {
struct rb_node node; struct rb_node node;
struct ceph_pg pgid; struct ceph_pg pgid;
...@@ -87,7 +147,6 @@ struct ceph_pg_mapping { ...@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
struct ceph_osdmap { struct ceph_osdmap {
struct ceph_fsid fsid; struct ceph_fsid fsid;
u32 epoch; u32 epoch;
u32 mkfs_epoch;
struct ceph_timespec created, modified; struct ceph_timespec created, modified;
u32 flags; /* CEPH_OSDMAP_* */ u32 flags; /* CEPH_OSDMAP_* */
...@@ -113,43 +172,19 @@ struct ceph_osdmap { ...@@ -113,43 +172,19 @@ struct ceph_osdmap {
int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
}; };
static inline void ceph_oid_set_name(struct ceph_object_id *oid, static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
const char *name)
{
int len;
len = strlen(name);
if (len > sizeof(oid->name)) {
WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
name, len, sizeof(oid->name));
len = sizeof(oid->name);
}
memcpy(oid->name, name, len);
oid->name_len = len;
}
static inline void ceph_oid_copy(struct ceph_object_id *dest,
struct ceph_object_id *src)
{
BUG_ON(src->name_len > sizeof(dest->name));
memcpy(dest->name, src->name, src->name_len);
dest->name_len = src->name_len;
}
static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
{ {
return osd >= 0 && osd < map->max_osd && return osd >= 0 && osd < map->max_osd &&
(map->osd_state[osd] & CEPH_OSD_EXISTS); (map->osd_state[osd] & CEPH_OSD_EXISTS);
} }
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{ {
return ceph_osd_exists(map, osd) && return ceph_osd_exists(map, osd) &&
(map->osd_state[osd] & CEPH_OSD_UP); (map->osd_state[osd] & CEPH_OSD_UP);
} }
static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{ {
return !ceph_osd_is_up(map, osd); return !ceph_osd_is_up(map, osd);
} }
...@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) ...@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
return 0; return 0;
} }
struct ceph_osdmap *ceph_osdmap_alloc(void);
extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map, struct ceph_osdmap *map);
struct ceph_messenger *msgr);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
struct ceph_osds {
int osds[CEPH_PG_MAX_SIZE];
int size;
int primary; /* id, NOT index */
};
static inline void ceph_osds_init(struct ceph_osds *set)
{
set->size = 0;
set->primary = -1;
}
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
const struct ceph_osds *old_up,
const struct ceph_osds *new_up,
int old_size,
int new_size,
int old_min_size,
int new_min_size,
u32 old_pg_num,
u32 new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
const struct ceph_pg *pgid);
bool ceph_osds_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
bool any_change);
/* calculate mapping of a file extent to an object */ /* calculate mapping of a file extent to an object */
extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len, u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen); u64 *bno, u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
struct ceph_object_locator *oloc,
struct ceph_object_id *oid, struct ceph_object_id *oid,
struct ceph_pg *pg_out); struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
struct ceph_pg pgid, void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
int *osds, int *primary); const struct ceph_pg *raw_pgid,
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_osds *up,
struct ceph_pg pgid); struct ceph_osds *acting);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid);
extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
u64 id); u64 id);
......
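The old ceph_oloc_oid_to_pg()/ceph_calc_pg_acting()/ceph_calc_pg_primary() trio is replaced by ceph_object_locator_to_pg() plus ceph_pg_to_up_acting_osds()/ceph_pg_to_acting_primary(), which fill the new struct ceph_osds instead of a bare int array. A hedged sketch of mapping an object to its acting primary with the new calls (error handling abbreviated):

	static int object_to_acting_primary(struct ceph_osdmap *map,
					    struct ceph_object_id *oid,
					    struct ceph_object_locator *oloc)
	{
		struct ceph_pg raw_pgid;
		struct ceph_osds up, acting;
		int ret;

		ret = ceph_object_locator_to_pg(map, oid, oloc, &raw_pgid);
		if (ret)
			return ret;		/* e.g. the pool does not exist */

		ceph_pg_to_up_acting_osds(map, &raw_pgid, &up, &acting);
		return acting.primary;		/* OSD id, or -1 if currently unmapped */
	}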
...@@ -114,8 +114,8 @@ struct ceph_object_layout { ...@@ -114,8 +114,8 @@ struct ceph_object_layout {
* compound epoch+version, used by storage layer to serialize mutations * compound epoch+version, used by storage layer to serialize mutations
*/ */
struct ceph_eversion { struct ceph_eversion {
__le32 epoch;
__le64 version; __le64 version;
__le32 epoch;
} __attribute__ ((packed)); } __attribute__ ((packed));
/* /*
...@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); ...@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
/* /*
* The error code to return when an OSD can't handle a write * The error code to return when an OSD can't handle a write
...@@ -389,6 +394,13 @@ enum { ...@@ -389,6 +394,13 @@ enum {
CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
pool uses pool snaps */
CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
}; };
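As a hedged illustration of how the new full-handling flags interact with the osdmap FULL flag (this helper is hypothetical and not part of the patch; CEPH_OSDMAP_FULL and the map's flags field are existing libceph definitions):

/* Hypothetical sketch: honour the cluster full flag unless the op opts out. */
static bool example_should_block_write(struct ceph_osdmap *map, u32 op_flags)
{
        return (map->flags & CEPH_OSDMAP_FULL) &&
               !(op_flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE));
}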
enum { enum {
...@@ -415,7 +427,17 @@ enum { ...@@ -415,7 +427,17 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2 CEPH_OSD_CMPXATTR_MODE_U64 = 2
}; };
#define RADOS_NOTIFY_VER 1 enum {
CEPH_OSD_WATCH_OP_UNWATCH = 0,
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
/* note: use only ODD ids to prevent pre-giant code from
interpreting the op as UNWATCH */
CEPH_OSD_WATCH_OP_WATCH = 3,
CEPH_OSD_WATCH_OP_RECONNECT = 5,
CEPH_OSD_WATCH_OP_PING = 7,
};
const char *ceph_osd_watch_op_name(int o);
/* /*
* an individual object operation. each may be accompanied by some data * an individual object operation. each may be accompanied by some data
...@@ -450,9 +472,13 @@ struct ceph_osd_op { ...@@ -450,9 +472,13 @@ struct ceph_osd_op {
} __attribute__ ((packed)) snap; } __attribute__ ((packed)) snap;
struct { struct {
__le64 cookie; __le64 cookie;
__le64 ver; __le64 ver; /* no longer used */
__u8 flag; /* 0 = unwatch, 1 = watch */ __u8 op; /* CEPH_OSD_WATCH_OP_* */
__le32 gen; /* registration generation */
} __attribute__ ((packed)) watch; } __attribute__ ((packed)) watch;
struct {
__le64 cookie;
} __attribute__ ((packed)) notify;
struct { struct {
__le64 offset, length; __le64 offset, length;
__le64 src_offset; __le64 src_offset;
......
...@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); ...@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
/* /*
* true if we have the mon map (and have thus joined the cluster) * true if we have the mon map (and have thus joined the cluster)
*/ */
static int have_mon_and_osd_map(struct ceph_client *client) static bool have_mon_and_osd_map(struct ceph_client *client)
{ {
return client->monc.monmap && client->monc.monmap->epoch && return client->monc.monmap && client->monc.monmap->epoch &&
client->osdc.osdmap && client->osdc.osdmap->epoch; client->osdc.osdmap && client->osdc.osdmap->epoch;
......
...@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) ...@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
} }
} }
const char *ceph_osd_watch_op_name(int o)
{
switch (o) {
case CEPH_OSD_WATCH_OP_UNWATCH:
return "unwatch";
case CEPH_OSD_WATCH_OP_WATCH:
return "watch";
case CEPH_OSD_WATCH_OP_RECONNECT:
return "reconnect";
case CEPH_OSD_WATCH_OP_PING:
return "ping";
default:
return "???";
}
}
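For context, a minimal sketch of how the two name helpers can be combined when logging an OSD op; the wrapper below is hypothetical, only the ceph_osd_op_name()/ceph_osd_watch_op_name() calls and the op/watch.op fields come from this patch (cf. dump_request() in debugfs.c):

/* Hypothetical logging helper, not part of the patch. */
static void example_log_osd_op(const struct ceph_osd_req_op *op)
{
        if (op->op == CEPH_OSD_OP_WATCH)
                pr_debug("op %s-%s\n", ceph_osd_op_name(op->op),
                         ceph_osd_watch_op_name(op->watch.op));
        else
                pr_debug("op %s\n", ceph_osd_op_name(op->op));
}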
const char *ceph_osd_state_name(int s) const char *ceph_osd_state_name(int s)
{ {
switch (s) { switch (s) {
......
...@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
{ {
int i; int i;
struct ceph_client *client = s->private; struct ceph_client *client = s->private;
struct ceph_osdmap *map = client->osdc.osdmap; struct ceph_osd_client *osdc = &client->osdc;
struct ceph_osdmap *map = osdc->osdmap;
struct rb_node *n; struct rb_node *n;
if (map == NULL) if (map == NULL)
return 0; return 0;
seq_printf(s, "epoch %d\n", map->epoch); down_read(&osdc->lock);
seq_printf(s, "flags%s%s\n", seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
(map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
(map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pool = struct ceph_pg_pool_info *pi =
rb_entry(n, struct ceph_pg_pool_info, node); rb_entry(n, struct ceph_pg_pool_info, node);
seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
pool->id, pool->pg_num, pool->pg_num_mask, pi->id, pi->name, pi->type, pi->size, pi->min_size,
pool->read_tier, pool->write_tier); pi->pg_num, pi->pg_num_mask, pi->flags,
pi->last_force_request_resend, pi->read_tier,
pi->write_tier);
} }
for (i = 0; i < map->max_osd; i++) { for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i]; struct ceph_entity_addr *addr = &map->osd_addr[i];
...@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
pg->pgid.seed, pg->primary_temp.osd); pg->pgid.seed, pg->primary_temp.osd);
} }
up_read(&osdc->lock);
return 0; return 0;
} }
...@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p) ...@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
CEPH_SUBSCRIBE_ONETIME ? "" : "+")); CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n'); seq_putc(s, '\n');
} }
seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op; __u16 op;
...@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p) ...@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
return 0; return 0;
} }
static int osdc_show(struct seq_file *s, void *pp) static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{ {
struct ceph_client *client = s->private; int i;
struct ceph_osd_client *osdc = &client->osdc;
struct rb_node *p;
mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
struct ceph_osd_request *req;
unsigned int i;
int opcode;
req = rb_entry(p, struct ceph_osd_request, r_node); seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
for (i = 0; i < t->up.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
t->target_oid.name_len, t->target_oid.name, t->flags);
if (t->paused)
seq_puts(s, "\tP");
}
seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
req->r_osd ? req->r_osd->o_osd : -1, {
req->r_pgid.pool, req->r_pgid.seed); int i;
seq_printf(s, "%.*s", req->r_base_oid.name_len, seq_printf(s, "%llu\t", req->r_tid);
req->r_base_oid.name); dump_target(s, &req->r_t);
if (req->r_reassert_version.epoch) seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
seq_printf(s, "\t%u'%llu", le32_to_cpu(req->r_replay_version.epoch),
(unsigned int)le32_to_cpu(req->r_reassert_version.epoch), le64_to_cpu(req->r_replay_version.version));
le64_to_cpu(req->r_reassert_version.version));
else
seq_printf(s, "\t");
for (i = 0; i < req->r_num_ops; i++) { for (i = 0; i < req->r_num_ops; i++) {
opcode = req->r_ops[i].op; struct ceph_osd_req_op *op = &req->r_ops[i];
seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
ceph_osd_op_name(opcode)); ceph_osd_op_name(op->op));
if (op->op == CEPH_OSD_OP_WATCH)
seq_printf(s, "-%s",
ceph_osd_watch_op_name(op->watch.op));
} }
seq_printf(s, "\n"); seq_putc(s, '\n');
}
static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
{
struct rb_node *n;
mutex_lock(&osd->lock);
for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
struct ceph_osd_request *req =
rb_entry(n, struct ceph_osd_request, r_node);
dump_request(s, req);
} }
mutex_unlock(&osdc->request_mutex);
mutex_unlock(&osd->lock);
}
static void dump_linger_request(struct seq_file *s,
struct ceph_osd_linger_request *lreq)
{
seq_printf(s, "%llu\t", lreq->linger_id);
dump_target(s, &lreq->t);
seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
lreq->last_error);
}
static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
{
struct rb_node *n;
mutex_lock(&osd->lock);
for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
struct ceph_osd_linger_request *lreq =
rb_entry(n, struct ceph_osd_linger_request, node);
dump_linger_request(s, lreq);
}
mutex_unlock(&osd->lock);
}
static int osdc_show(struct seq_file *s, void *pp)
{
struct ceph_client *client = s->private;
struct ceph_osd_client *osdc = &client->osdc;
struct rb_node *n;
down_read(&osdc->lock);
seq_printf(s, "REQUESTS %d homeless %d\n",
atomic_read(&osdc->num_requests),
atomic_read(&osdc->num_homeless));
for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
dump_requests(s, osd);
}
dump_requests(s, &osdc->homeless_osd);
seq_puts(s, "LINGER REQUESTS\n");
for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
dump_linger_requests(s, osd);
}
dump_linger_requests(s, &osdc->homeless_osd);
up_read(&osdc->lock);
return 0; return 0;
} }
......
...@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc) ...@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
BUG_ON(num < 1); /* monmap sub is always there */ BUG_ON(num < 1); /* monmap sub is always there */
ceph_encode_32(&p, num); ceph_encode_32(&p, num);
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
const char *s = ceph_sub_str[i]; char buf[32];
int len;
if (!monc->subs[i].want) if (!monc->subs[i].want)
continue; continue;
dout("%s %s start %llu flags 0x%x\n", __func__, s, len = sprintf(buf, "%s", ceph_sub_str[i]);
if (i == CEPH_SUB_MDSMAP &&
monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
dout("%s %s start %llu flags 0x%x\n", __func__, buf,
le64_to_cpu(monc->subs[i].item.start), le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags); monc->subs[i].item.flags);
ceph_encode_string(&p, end, s, strlen(s)); ceph_encode_string(&p, end, buf, len);
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item); p += sizeof(monc->subs[i].item);
} }
BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base; msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg); ceph_msg_revoke(msg);
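For example (illustrative value): with fs_cluster_id set to 1, the encoded MDS map subscription name becomes "mdsmap.1" instead of the bare "mdsmap", which is how a mount binds its subscription to a single filesystem within the cluster.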
...@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) ...@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
} }
EXPORT_SYMBOL(ceph_monc_got_map); EXPORT_SYMBOL(ceph_monc_got_map);
/* void ceph_monc_renew_subs(struct ceph_mon_client *monc)
* Register interest in the next osdmap
*/
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{ {
dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
__send_subscribe(monc); __send_subscribe(monc);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
EXPORT_SYMBOL(ceph_monc_request_next_osdmap); EXPORT_SYMBOL(ceph_monc_renew_subs);
/* /*
* Wait for an osdmap with a given epoch. * Wait for an osdmap with a given epoch.
...@@ -478,51 +478,17 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, ...@@ -478,51 +478,17 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
/* /*
* generic requests (currently statfs, mon_get_version) * generic requests (currently statfs, mon_get_version)
*/ */
static struct ceph_mon_generic_request *__lookup_generic_req( DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
struct ceph_mon_client *monc, u64 tid)
{
struct ceph_mon_generic_request *req;
struct rb_node *n = monc->generic_request_tree.rb_node;
while (n) {
req = rb_entry(n, struct ceph_mon_generic_request, node);
if (tid < req->tid)
n = n->rb_left;
else if (tid > req->tid)
n = n->rb_right;
else
return req;
}
return NULL;
}
static void __insert_generic_request(struct ceph_mon_client *monc,
struct ceph_mon_generic_request *new)
{
struct rb_node **p = &monc->generic_request_tree.rb_node;
struct rb_node *parent = NULL;
struct ceph_mon_generic_request *req = NULL;
while (*p) {
parent = *p;
req = rb_entry(parent, struct ceph_mon_generic_request, node);
if (new->tid < req->tid)
p = &(*p)->rb_left;
else if (new->tid > req->tid)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->node, parent, p);
rb_insert_color(&new->node, &monc->generic_request_tree);
}
static void release_generic_request(struct kref *kref) static void release_generic_request(struct kref *kref)
{ {
struct ceph_mon_generic_request *req = struct ceph_mon_generic_request *req =
container_of(kref, struct ceph_mon_generic_request, kref); container_of(kref, struct ceph_mon_generic_request, kref);
dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
req->reply);
WARN_ON(!RB_EMPTY_NODE(&req->node));
if (req->reply) if (req->reply)
ceph_msg_put(req->reply); ceph_msg_put(req->reply);
if (req->request) if (req->request)
...@@ -533,6 +499,7 @@ static void release_generic_request(struct kref *kref) ...@@ -533,6 +499,7 @@ static void release_generic_request(struct kref *kref)
static void put_generic_request(struct ceph_mon_generic_request *req) static void put_generic_request(struct ceph_mon_generic_request *req)
{ {
if (req)
kref_put(&req->kref, release_generic_request); kref_put(&req->kref, release_generic_request);
} }
...@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req) ...@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
kref_get(&req->kref); kref_get(&req->kref);
} }
static struct ceph_mon_generic_request *
alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
{
struct ceph_mon_generic_request *req;
req = kzalloc(sizeof(*req), gfp);
if (!req)
return NULL;
req->monc = monc;
kref_init(&req->kref);
RB_CLEAR_NODE(&req->node);
init_completion(&req->completion);
dout("%s greq %p\n", __func__, req);
return req;
}
static void register_generic_request(struct ceph_mon_generic_request *req)
{
struct ceph_mon_client *monc = req->monc;
WARN_ON(req->tid);
get_generic_request(req);
req->tid = ++monc->last_tid;
insert_generic_request(&monc->generic_request_tree, req);
}
static void send_generic_request(struct ceph_mon_client *monc,
struct ceph_mon_generic_request *req)
{
WARN_ON(!req->tid);
dout("%s greq %p tid %llu\n", __func__, req, req->tid);
req->request->hdr.tid = cpu_to_le64(req->tid);
ceph_con_send(&monc->con, ceph_msg_get(req->request));
}
static void __finish_generic_request(struct ceph_mon_generic_request *req)
{
struct ceph_mon_client *monc = req->monc;
dout("%s greq %p tid %llu\n", __func__, req, req->tid);
erase_generic_request(&monc->generic_request_tree, req);
ceph_msg_revoke(req->request);
ceph_msg_revoke_incoming(req->reply);
}
static void finish_generic_request(struct ceph_mon_generic_request *req)
{
__finish_generic_request(req);
put_generic_request(req);
}
static void complete_generic_request(struct ceph_mon_generic_request *req)
{
if (req->complete_cb)
req->complete_cb(req);
else
complete_all(&req->completion);
put_generic_request(req);
}
void cancel_generic_request(struct ceph_mon_generic_request *req)
{
struct ceph_mon_client *monc = req->monc;
struct ceph_mon_generic_request *lookup_req;
dout("%s greq %p tid %llu\n", __func__, req, req->tid);
mutex_lock(&monc->mutex);
lookup_req = lookup_generic_request(&monc->generic_request_tree,
req->tid);
if (lookup_req) {
WARN_ON(lookup_req != req);
finish_generic_request(req);
}
mutex_unlock(&monc->mutex);
}
static int wait_generic_request(struct ceph_mon_generic_request *req)
{
int ret;
dout("%s greq %p tid %llu\n", __func__, req, req->tid);
ret = wait_for_completion_interruptible(&req->completion);
if (ret)
cancel_generic_request(req);
else
ret = req->result; /* completed */
return ret;
}
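Taken together, the helpers above implement the generic-request lifecycle that ceph_monc_do_statfs() and the get_version paths below follow. A condensed, hypothetical caller might look like the sketch below; the message type and front sizes are placeholders borrowed from the statfs path:

/*
 * Hypothetical sketch of the generic-request lifecycle, mirroring
 * ceph_monc_do_statfs() below.
 */
static int example_generic_call(struct ceph_mon_client *monc)
{
        struct ceph_mon_generic_request *req;
        int ret = -ENOMEM;

        req = alloc_generic_request(monc, GFP_NOFS);
        if (!req)
                goto out;

        req->request = ceph_msg_new(CEPH_MSG_STATFS, 64, GFP_NOFS, true);
        if (!req->request)
                goto out;

        req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
        if (!req->reply)
                goto out;

        mutex_lock(&monc->mutex);
        register_generic_request(req);
        /* ... fill out req->request->front here ... */
        send_generic_request(monc, req);
        mutex_unlock(&monc->mutex);

        ret = wait_generic_request(req);
out:
        put_generic_request(req);       /* NULL-safe, see above */
        return ret;
}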
static struct ceph_msg *get_generic_reply(struct ceph_connection *con, static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr, struct ceph_msg_header *hdr,
int *skip) int *skip)
...@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, ...@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg *m; struct ceph_msg *m;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
req = __lookup_generic_req(monc, tid); req = lookup_generic_request(&monc->generic_request_tree, tid);
if (!req) { if (!req) {
dout("get_generic_reply %lld dne\n", tid); dout("get_generic_reply %lld dne\n", tid);
*skip = 1; *skip = 1;
...@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, ...@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
return m; return m;
} }
static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
struct ceph_mon_generic_request *req)
{
int err;
/* register request */
req->tid = tid != 0 ? tid : ++monc->last_tid;
req->request->hdr.tid = cpu_to_le64(req->tid);
__insert_generic_request(monc, req);
monc->num_generic_requests++;
ceph_con_send(&monc->con, ceph_msg_get(req->request));
mutex_unlock(&monc->mutex);
err = wait_for_completion_interruptible(&req->completion);
mutex_lock(&monc->mutex);
rb_erase(&req->node, &monc->generic_request_tree);
monc->num_generic_requests--;
if (!err)
err = req->result;
return err;
}
static int do_generic_request(struct ceph_mon_client *monc,
struct ceph_mon_generic_request *req)
{
int err;
mutex_lock(&monc->mutex);
err = __do_generic_request(monc, 0, req);
mutex_unlock(&monc->mutex);
return err;
}
/* /*
* statfs * statfs
*/ */
...@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, ...@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_mon_statfs_reply *reply = msg->front.iov_base; struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
u64 tid = le64_to_cpu(msg->hdr.tid); u64 tid = le64_to_cpu(msg->hdr.tid);
dout("%s msg %p tid %llu\n", __func__, msg, tid);
if (msg->front.iov_len != sizeof(*reply)) if (msg->front.iov_len != sizeof(*reply))
goto bad; goto bad;
dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
req = __lookup_generic_req(monc, tid); req = lookup_generic_request(&monc->generic_request_tree, tid);
if (req) { if (!req) {
*(struct ceph_statfs *)req->buf = reply->st;
req->result = 0;
get_generic_request(req);
}
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (req) { return;
complete_all(&req->completion);
put_generic_request(req);
} }
req->result = 0;
*req->u.st = reply->st; /* struct */
__finish_generic_request(req);
mutex_unlock(&monc->mutex);
complete_generic_request(req);
return; return;
bad: bad:
...@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) ...@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{ {
struct ceph_mon_generic_request *req; struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h; struct ceph_mon_statfs *h;
int err; int ret = -ENOMEM;
req = kzalloc(sizeof(*req), GFP_NOFS); req = alloc_generic_request(monc, GFP_NOFS);
if (!req) if (!req)
return -ENOMEM; goto out;
kref_init(&req->kref);
req->buf = buf;
init_completion(&req->completion);
err = -ENOMEM;
req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
true); true);
if (!req->request) if (!req->request)
goto out; goto out;
req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
true); req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
if (!req->reply) if (!req->reply)
goto out; goto out;
req->u.st = buf;
mutex_lock(&monc->mutex);
register_generic_request(req);
/* fill out request */ /* fill out request */
h = req->request->front.iov_base; h = req->request->front.iov_base;
h->monhdr.have_version = 0; h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0; h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid; h->fsid = monc->monmap->fsid;
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
err = do_generic_request(monc, req); ret = wait_generic_request(req);
out: out:
put_generic_request(req); put_generic_request(req);
return err; return ret;
} }
EXPORT_SYMBOL(ceph_monc_do_statfs); EXPORT_SYMBOL(ceph_monc_do_statfs);
...@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, ...@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
void *end = p + msg->front_alloc_len; void *end = p + msg->front_alloc_len;
u64 handle; u64 handle;
dout("%s %p tid %llu\n", __func__, msg, tid); dout("%s msg %p tid %llu\n", __func__, msg, tid);
ceph_decode_need(&p, end, 2*sizeof(u64), bad); ceph_decode_need(&p, end, 2*sizeof(u64), bad);
handle = ceph_decode_64(&p); handle = ceph_decode_64(&p);
...@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, ...@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
goto bad; goto bad;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
req = __lookup_generic_req(monc, handle); req = lookup_generic_request(&monc->generic_request_tree, handle);
if (req) { if (!req) {
*(u64 *)req->buf = ceph_decode_64(&p);
req->result = 0;
get_generic_request(req);
}
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (req) { return;
complete_all(&req->completion);
put_generic_request(req);
} }
req->result = 0;
req->u.newest = ceph_decode_64(&p);
__finish_generic_request(req);
mutex_unlock(&monc->mutex);
complete_generic_request(req);
return; return;
bad: bad:
pr_err("corrupt mon_get_version reply, tid %llu\n", tid); pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
ceph_msg_dump(msg); ceph_msg_dump(msg);
} }
/* static struct ceph_mon_generic_request *
* Send MMonGetVersion and wait for the reply. __ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
* ceph_monc_callback_t cb, u64 private_data)
* @what: one of "mdsmap", "osdmap" or "monmap"
*/
int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
u64 *newest)
{ {
struct ceph_mon_generic_request *req; struct ceph_mon_generic_request *req;
void *p, *end;
u64 tid;
int err;
req = kzalloc(sizeof(*req), GFP_NOFS); req = alloc_generic_request(monc, GFP_NOIO);
if (!req) if (!req)
return -ENOMEM; goto err_put_req;
kref_init(&req->kref);
req->buf = newest;
init_completion(&req->completion);
req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
sizeof(u64) + sizeof(u32) + strlen(what), sizeof(u64) + sizeof(u32) + strlen(what),
GFP_NOFS, true); GFP_NOIO, true);
if (!req->request) { if (!req->request)
err = -ENOMEM; goto err_put_req;
goto out;
}
req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
GFP_NOFS, true); true);
if (!req->reply) { if (!req->reply)
err = -ENOMEM; goto err_put_req;
goto out;
}
p = req->request->front.iov_base; req->complete_cb = cb;
end = p + req->request->front_alloc_len; req->private_data = private_data;
/* fill out request */
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
tid = ++monc->last_tid; register_generic_request(req);
ceph_encode_64(&p, tid); /* handle */ {
void *p = req->request->front.iov_base;
void *const end = p + req->request->front_alloc_len;
ceph_encode_64(&p, req->tid); /* handle */
ceph_encode_string(&p, end, what, strlen(what)); ceph_encode_string(&p, end, what, strlen(what));
WARN_ON(p != end);
}
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
err = __do_generic_request(monc, tid, req); return req;
mutex_unlock(&monc->mutex); err_put_req:
out:
put_generic_request(req); put_generic_request(req);
return err; return ERR_PTR(-ENOMEM);
} }
EXPORT_SYMBOL(ceph_monc_do_get_version);
/*
* Send MMonGetVersion and wait for the reply.
*
* @what: one of "mdsmap", "osdmap" or "monmap"
*/
int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
u64 *newest)
{
struct ceph_mon_generic_request *req;
int ret;
req = __ceph_monc_get_version(monc, what, NULL, 0);
if (IS_ERR(req))
return PTR_ERR(req);
ret = wait_generic_request(req);
if (!ret)
*newest = req->u.newest;
put_generic_request(req);
return ret;
}
EXPORT_SYMBOL(ceph_monc_get_version);
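As a usage note, a caller that only needs the newest map epoch can use the synchronous variant directly; the wrapper below is hypothetical:

/* Hypothetical caller: ask the monitors for the newest osdmap version. */
static int example_newest_osdmap(struct ceph_client *client, u64 *newest)
{
        return ceph_monc_get_version(&client->monc, "osdmap", newest);
}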
/*
* Send MMonGetVersion,
*
* @what: one of "mdsmap", "osdmap" or "monmap"
*/
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
ceph_monc_callback_t cb, u64 private_data)
{
struct ceph_mon_generic_request *req;
req = __ceph_monc_get_version(monc, what, cb, private_data);
if (IS_ERR(req))
return PTR_ERR(req);
put_generic_request(req);
return 0;
}
EXPORT_SYMBOL(ceph_monc_get_version_async);
/* /*
* Resend pending generic requests. * Resend pending generic requests.
...@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_subscribe_ack) if (!monc->m_subscribe_ack)
goto out_auth; goto out_auth;
monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
true); true);
if (!monc->m_subscribe) if (!monc->m_subscribe)
goto out_subscribe_ack; goto out_subscribe_ack;
...@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT; monc->generic_request_tree = RB_ROOT;
monc->num_generic_requests = 0;
monc->last_tid = 0; monc->last_tid = 0;
monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
return 0; return 0;
out_auth_reply: out_auth_reply:
...@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ...@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth); ceph_auth_destroy(monc->auth);
WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
ceph_msg_put(monc->m_auth); ceph_msg_put(monc->m_auth);
ceph_msg_put(monc->m_auth_reply); ceph_msg_put(monc->m_auth_reply);
ceph_msg_put(monc->m_subscribe); ceph_msg_put(monc->m_subscribe);
......
This source diff could not be displayed because it is too large.
...@@ -380,23 +380,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -380,23 +380,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
return ERR_PTR(err); return ERR_PTR(err);
} }
/* int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
*/
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{ {
if (l.pool < r.pool) if (lhs->pool < rhs->pool)
return -1; return -1;
if (l.pool > r.pool) if (lhs->pool > rhs->pool)
return 1; return 1;
if (l.seed < r.seed) if (lhs->seed < rhs->seed)
return -1; return -1;
if (l.seed > r.seed) if (lhs->seed > rhs->seed)
return 1; return 1;
return 0; return 0;
} }
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
*/
static int __insert_pg_mapping(struct ceph_pg_mapping *new, static int __insert_pg_mapping(struct ceph_pg_mapping *new,
struct rb_root *root) struct rb_root *root)
{ {
...@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, ...@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
while (*p) { while (*p) {
parent = *p; parent = *p;
pg = rb_entry(parent, struct ceph_pg_mapping, node); pg = rb_entry(parent, struct ceph_pg_mapping, node);
c = pgid_cmp(new->pgid, pg->pgid); c = ceph_pg_compare(&new->pgid, &pg->pgid);
if (c < 0) if (c < 0)
p = &(*p)->rb_left; p = &(*p)->rb_left;
else if (c > 0) else if (c > 0)
...@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, ...@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
while (n) { while (n) {
pg = rb_entry(n, struct ceph_pg_mapping, node); pg = rb_entry(n, struct ceph_pg_mapping, node);
c = pgid_cmp(pgid, pg->pgid); c = ceph_pg_compare(&pgid, &pg->pgid);
if (c < 0) { if (c < 0) {
n = n->rb_left; n = n->rb_left;
} else if (c > 0) { } else if (c > 0) {
...@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) ...@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += 4; /* skip crash_replay_interval */ *p += 4; /* skip crash_replay_interval */
if (ev >= 7) if (ev >= 7)
*p += 1; /* skip min_size */ pi->min_size = ceph_decode_8(p);
else
pi->min_size = pi->size - pi->size / 2;
if (ev >= 8) if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */ *p += 8 + 8; /* skip quota_max_* */
...@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) ...@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pi->write_tier = -1; pi->write_tier = -1;
} }
if (ev >= 10) {
/* skip properties */
num = ceph_decode_32(p);
while (num--) {
len = ceph_decode_32(p);
*p += len; /* key */
len = ceph_decode_32(p);
*p += len; /* val */
}
}
if (ev >= 11) {
/* skip hit_set_params */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
*p += 4; /* skip hit_set_period */
*p += 4; /* skip hit_set_count */
}
if (ev >= 12)
*p += 4; /* skip stripe_width */
if (ev >= 13) {
*p += 8; /* skip target_max_bytes */
*p += 8; /* skip target_max_objects */
*p += 4; /* skip cache_target_dirty_ratio_micro */
*p += 4; /* skip cache_target_full_ratio_micro */
*p += 4; /* skip cache_min_flush_age */
*p += 4; /* skip cache_min_evict_age */
}
if (ev >= 14) {
/* skip erasure_code_profile */
len = ceph_decode_32(p);
*p += len;
}
if (ev >= 15)
pi->last_force_request_resend = ceph_decode_32(p);
else
pi->last_force_request_resend = 0;
/* ignore the rest */ /* ignore the rest */
*p = pool_end; *p = pool_end;
...@@ -660,6 +707,23 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ...@@ -660,6 +707,23 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
/* /*
* osd map * osd map
*/ */
struct ceph_osdmap *ceph_osdmap_alloc(void)
{
struct ceph_osdmap *map;
map = kzalloc(sizeof(*map), GFP_NOIO);
if (!map)
return NULL;
map->pg_pools = RB_ROOT;
map->pool_max = -1;
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
mutex_init(&map->crush_scratch_mutex);
return map;
}
void ceph_osdmap_destroy(struct ceph_osdmap *map) void ceph_osdmap_destroy(struct ceph_osdmap *map)
{ {
dout("osdmap_destroy %p\n", map); dout("osdmap_destroy %p\n", map);
...@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) ...@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
struct ceph_osdmap *map; struct ceph_osdmap *map;
int ret; int ret;
map = kzalloc(sizeof(*map), GFP_NOFS); map = ceph_osdmap_alloc();
if (!map) if (!map)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
mutex_init(&map->crush_scratch_mutex);
ret = osdmap_decode(p, end, map); ret = osdmap_decode(p, end, map);
if (ret) { if (ret) {
ceph_osdmap_destroy(map); ceph_osdmap_destroy(map);
...@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) ...@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* decode and apply an incremental map update. * decode and apply an incremental map update.
*/ */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map, struct ceph_osdmap *map)
struct ceph_messenger *msgr)
{ {
struct crush_map *newcrush = NULL; struct crush_map *newcrush = NULL;
struct ceph_fsid fsid; struct ceph_fsid fsid;
...@@ -1381,8 +1440,252 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1381,8 +1440,252 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
return ERR_PTR(err); return ERR_PTR(err);
} }
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
WARN_ON(!ceph_oid_empty(dest));
if (src->name != src->inline_name) {
/* very rare, see ceph_object_id definition */
dest->name = kmalloc(src->name_len + 1,
GFP_NOIO | __GFP_NOFAIL);
}
memcpy(dest->name, src->name, src->name_len + 1);
dest->name_len = src->name_len;
}
EXPORT_SYMBOL(ceph_oid_copy);
static __printf(2, 0)
int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
{
int len;
WARN_ON(!ceph_oid_empty(oid));
len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
if (len >= sizeof(oid->inline_name))
return len;
oid->name_len = len;
return 0;
}
/*
* If oid doesn't fit into inline buffer, BUG.
*/
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
BUG_ON(oid_printf_vargs(oid, fmt, ap));
va_end(ap);
}
EXPORT_SYMBOL(ceph_oid_printf);
static __printf(3, 0)
int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, va_list ap)
{
va_list aq;
int len;
va_copy(aq, ap);
len = oid_printf_vargs(oid, fmt, aq);
va_end(aq);
if (len) {
char *external_name;
external_name = kmalloc(len + 1, gfp);
if (!external_name)
return -ENOMEM;
oid->name = external_name;
WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
oid->name_len = len;
}
return 0;
}
/*
* If oid doesn't fit into inline buffer, allocate.
*/
int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
va_end(ap);
return ret;
}
EXPORT_SYMBOL(ceph_oid_aprintf);
void ceph_oid_destroy(struct ceph_object_id *oid)
{
if (oid->name != oid->inline_name)
kfree(oid->name);
}
EXPORT_SYMBOL(ceph_oid_destroy);
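A hedged sketch of the intended ceph_object_id lifecycle with the helpers above; it assumes a ceph_oid_init()-style initializer from the header (not shown in this hunk), and the format string and values are made up:

/* Hypothetical usage of the variable-sized object id helpers. */
static int example_oid_usage(gfp_t gfp)
{
        struct ceph_object_id oid;
        int ret;

        ceph_oid_init(&oid);            /* assumed header helper, not in this hunk */
        ret = ceph_oid_aprintf(&oid, gfp, "rbd_data.%llx.%016llx",
                               0x1234ULL, 42ULL);
        if (ret)
                return ret;

        /* ... use oid as a target name ... */
        ceph_oid_destroy(&oid);
        return 0;
}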
/*
* osds only
*/
static bool __osds_equal(const struct ceph_osds *lhs,
const struct ceph_osds *rhs)
{
if (lhs->size == rhs->size &&
!memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
return true;
return false;
}
/*
* osds + primary
*/
static bool osds_equal(const struct ceph_osds *lhs,
const struct ceph_osds *rhs)
{
if (__osds_equal(lhs, rhs) &&
lhs->primary == rhs->primary)
return true;
return false;
}
static bool osds_valid(const struct ceph_osds *set)
{
/* non-empty set */
if (set->size > 0 && set->primary >= 0)
return true;
/* empty can_shift_osds set */
if (!set->size && set->primary == -1)
return true;
/* empty !can_shift_osds set - all NONE */
if (set->size > 0 && set->primary == -1) {
int i;
for (i = 0; i < set->size; i++) {
if (set->osds[i] != CRUSH_ITEM_NONE)
break;
}
if (i == set->size)
return true;
}
return false;
}
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
{
memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
dest->size = src->size;
dest->primary = src->primary;
}
static bool is_split(const struct ceph_pg *pgid,
u32 old_pg_num,
u32 new_pg_num)
{
int old_bits = calc_bits_of(old_pg_num);
int old_mask = (1 << old_bits) - 1;
int n;
WARN_ON(pgid->seed >= old_pg_num);
if (new_pg_num <= old_pg_num)
return false;
for (n = 1; ; n++) {
int next_bit = n << (old_bits - 1);
u32 s = next_bit | pgid->seed;
if (s < old_pg_num || s == pgid->seed)
continue;
if (s >= new_pg_num)
break;
s = ceph_stable_mod(s, old_pg_num, old_mask);
if (s == pgid->seed)
return true;
}
return false;
}
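As a worked example (not from the patch): if a pool's pg_num grows from 4 to 8, then for a PG with seed 1, old_bits is 3 and old_mask is 7; the first loop iteration tries s = 4 | 1 = 5, which is neither below old_pg_num nor at or above new_pg_num, and ceph_stable_mod(5, 4, 7) folds it back to 1, matching the seed, so is_split() reports a split and the interval is considered changed.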
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
const struct ceph_osds *old_up,
const struct ceph_osds *new_up,
int old_size,
int new_size,
int old_min_size,
int new_min_size,
u32 old_pg_num,
u32 new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
const struct ceph_pg *pgid)
{
return !osds_equal(old_acting, new_acting) ||
!osds_equal(old_up, new_up) ||
old_size != new_size ||
old_min_size != new_min_size ||
is_split(pgid, old_pg_num, new_pg_num) ||
old_sort_bitwise != new_sort_bitwise;
}
static int calc_pg_rank(int osd, const struct ceph_osds *acting)
{
int i;
for (i = 0; i < acting->size; i++) {
if (acting->osds[i] == osd)
return i;
}
return -1;
}
static bool primary_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting)
{
if (!old_acting->size && !new_acting->size)
return false; /* both still empty */
if (!old_acting->size ^ !new_acting->size)
return true; /* was empty, now not, or vice versa */
if (old_acting->primary != new_acting->primary)
return true; /* primary changed */
if (calc_pg_rank(old_acting->primary, old_acting) !=
calc_pg_rank(new_acting->primary, new_acting))
return true;
return false; /* same primary (tho replicas may have changed) */
}
bool ceph_osds_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
bool any_change)
{
if (primary_changed(old_acting, new_acting))
return true;
if (any_change && !__osds_equal(old_acting, new_acting))
return true;
return false;
}
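A hedged sketch of how these interval helpers are meant to be consumed: recompute the mapping from the new map and compare it with the set the request was sent under. The example_target structure and function below are illustrative only, not the real libceph request target:

/* Hypothetical consumer of ceph_pg_to_up_acting_osds()/ceph_osds_changed(). */
struct example_target {
        struct ceph_pg raw_pgid;        /* as returned by ceph_object_locator_to_pg() */
        struct ceph_osds up;
        struct ceph_osds acting;
};

static bool example_needs_resend(struct ceph_osdmap *newmap,
                                 const struct example_target *t,
                                 bool any_change)
{
        struct ceph_osds up, acting;

        ceph_pg_to_up_acting_osds(newmap, &t->raw_pgid, &up, &acting);
        return ceph_osds_changed(&t->acting, &acting, any_change);
}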
/* /*
* calculate file layout from given offset, length. * calculate file layout from given offset, length.
...@@ -1455,30 +1758,71 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, ...@@ -1455,30 +1758,71 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
EXPORT_SYMBOL(ceph_calc_file_object_mapping); EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/* /*
* Calculate mapping of a (oloc, oid) pair to a PG. Should only be * Map an object into a PG.
* called with target's (oloc, oid), since tiering isn't taken into *
* account. * Should only be called with target_oid and target_oloc (as opposed to
* base_oid and base_oloc), since tiering isn't taken into account.
*/ */
int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
struct ceph_object_locator *oloc,
struct ceph_object_id *oid, struct ceph_object_id *oid,
struct ceph_pg *pg_out) struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid)
{ {
struct ceph_pg_pool_info *pi; struct ceph_pg_pool_info *pi;
pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi) if (!pi)
return -EIO; return -ENOENT;
pg_out->pool = oloc->pool; raw_pgid->pool = oloc->pool;
pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
oid->name_len); oid->name_len);
dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
pg_out->pool, pg_out->seed); oid->name, raw_pgid->pool, raw_pgid->seed);
return 0; return 0;
} }
EXPORT_SYMBOL(ceph_oloc_oid_to_pg); EXPORT_SYMBOL(ceph_object_locator_to_pg);
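To tie the renamed entry points together, a minimal hypothetical helper that maps an already-initialized object name and locator to its acting primary; only the two exported functions are from the patch:

/* Hypothetical helper: (oid, oloc) -> acting primary OSD id, -1 if none,
 * or a negative errno if the pool does not exist. */
static int example_acting_primary(struct ceph_osdmap *osdmap,
                                  struct ceph_object_id *oid,
                                  struct ceph_object_locator *oloc)
{
        struct ceph_pg raw_pgid;
        int ret;

        ret = ceph_object_locator_to_pg(osdmap, oid, oloc, &raw_pgid);
        if (ret)
                return ret;

        return ceph_pg_to_acting_primary(osdmap, &raw_pgid);
}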
/*
* Map a raw PG (full precision ps) into an actual PG.
*/
static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_pg *pgid)
{
pgid->pool = raw_pgid->pool;
pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
pi->pg_num_mask);
}
/*
* Map a raw PG (full precision ps) into a placement ps (placement
* seed). Include pool id in that value so that different pools don't
* use the same seeds.
*/
static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid)
{
if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
/* hash pool id and seed so that pool PGs do not overlap */
return crush_hash32_2(CRUSH_HASH_RJENKINS1,
ceph_stable_mod(raw_pgid->seed,
pi->pgp_num,
pi->pgp_num_mask),
raw_pgid->pool);
} else {
/*
* legacy behavior: add ps and pool together. this is
* not a great approach because the PGs from each pool
* will overlap on top of each other: 0.5 == 1.4 ==
* 2.3 == ...
*/
return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
pi->pgp_num_mask) +
(unsigned)raw_pgid->pool;
}
}
static int do_crush(struct ceph_osdmap *map, int ruleno, int x, static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max, int *result, int result_max,
...@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, ...@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
} }
/* /*
* Calculate raw (crush) set for given pgid. * Calculate raw set (CRUSH output) for given PG. The result may
* contain nonexistent OSDs. ->primary is undefined for a raw set.
* *
* Return raw set length, or error. * Placement seed (CRUSH input) is returned through @ppps.
*/ */
static int pg_to_raw_osds(struct ceph_osdmap *osdmap, static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pool, struct ceph_pg_pool_info *pi,
struct ceph_pg pgid, u32 pps, int *osds) const struct ceph_pg *raw_pgid,
struct ceph_osds *raw,
u32 *ppps)
{ {
u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno; int ruleno;
int len; int len;
/* crush */ ceph_osds_init(raw);
ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, if (ppps)
pool->type, pool->size); *ppps = pps;
ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
pi->size);
if (ruleno < 0) { if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
pgid.pool, pool->crush_ruleset, pool->type, pi->id, pi->crush_ruleset, pi->type, pi->size);
pool->size); return;
return -ENOENT;
} }
len = do_crush(osdmap, ruleno, pps, osds, len = do_crush(osdmap, ruleno, pps, raw->osds,
min_t(int, pool->size, CEPH_PG_MAX_SIZE), min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
osdmap->osd_weight, osdmap->max_osd); osdmap->osd_weight, osdmap->max_osd);
if (len < 0) { if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
len, ruleno, pgid.pool, pool->crush_ruleset, len, ruleno, pi->id, pi->crush_ruleset, pi->type,
pool->type, pool->size); pi->size);
return len; return;
} }
return len; raw->size = len;
} }
/* /*
* Given raw set, calculate up set and up primary. * Given raw set, calculate up set and up primary. By definition of an
* up set, the result won't contain nonexistent or down OSDs.
* *
* Return up set length. *primary is set to up primary osd id, or -1 * This is done in-place - on return @set is the up set. If it's
* if up set is empty. * empty, ->primary will remain undefined.
*/ */
static int raw_to_up_osds(struct ceph_osdmap *osdmap, static void raw_to_up_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pool, struct ceph_pg_pool_info *pi,
int *osds, int len, int *primary) struct ceph_osds *set)
{ {
int up_primary = -1;
int i; int i;
if (ceph_can_shift_osds(pool)) { /* ->primary is undefined for a raw set */
BUG_ON(set->primary != -1);
if (ceph_can_shift_osds(pi)) {
int removed = 0; int removed = 0;
for (i = 0; i < len; i++) { /* shift left */
if (ceph_osd_is_down(osdmap, osds[i])) { for (i = 0; i < set->size; i++) {
if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++; removed++;
continue; continue;
} }
if (removed) if (removed)
osds[i - removed] = osds[i]; set->osds[i - removed] = set->osds[i];
} }
set->size -= removed;
len -= removed; if (set->size > 0)
if (len > 0) set->primary = set->osds[0];
up_primary = osds[0];
} else { } else {
for (i = len - 1; i >= 0; i--) { /* set down/dne devices to NONE */
if (ceph_osd_is_down(osdmap, osds[i])) for (i = set->size - 1; i >= 0; i--) {
osds[i] = CRUSH_ITEM_NONE; if (ceph_osd_is_down(osdmap, set->osds[i]))
set->osds[i] = CRUSH_ITEM_NONE;
else else
up_primary = osds[i]; set->primary = set->osds[i];
} }
} }
*primary = up_primary;
return len;
} }
static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, static void apply_primary_affinity(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pool, struct ceph_pg_pool_info *pi,
int *osds, int len, int *primary) u32 pps,
struct ceph_osds *up)
{ {
int i; int i;
int pos = -1; int pos = -1;
...@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, ...@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (!osdmap->osd_primary_affinity) if (!osdmap->osd_primary_affinity)
return; return;
for (i = 0; i < len; i++) { for (i = 0; i < up->size; i++) {
int osd = osds[i]; int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE && if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] != osdmap->osd_primary_affinity[osd] !=
...@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, ...@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
break; break;
} }
} }
if (i == len) if (i == up->size)
return; return;
/* /*
...@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, ...@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
* osd into the hash/rng so that a proportional fraction of an * osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary. * osd's pgs get rejected as primary.
*/ */
for (i = 0; i < len; i++) { for (i = 0; i < up->size; i++) {
int osd = osds[i]; int osd = up->osds[i];
u32 aff; u32 aff;
if (osd == CRUSH_ITEM_NONE) if (osd == CRUSH_ITEM_NONE)
...@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, ...@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (pos < 0) if (pos < 0)
return; return;
*primary = osds[pos]; up->primary = up->osds[pos];
if (ceph_can_shift_osds(pool) && pos > 0) { if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */ /* move the new primary to the front */
for (i = pos; i > 0; i--) for (i = pos; i > 0; i--)
osds[i] = osds[i - 1]; up->osds[i] = up->osds[i - 1];
osds[0] = *primary; up->osds[0] = up->primary;
} }
} }
/* /*
* Given up set, apply pg_temp and primary_temp mappings. * Get pg_temp and primary_temp mappings for given PG.
* *
* Return acting set length. *primary is set to acting primary osd id, * Note that a PG may have none, only pg_temp, only primary_temp or
* or -1 if acting set is empty. * both pg_temp and primary_temp mappings. This means @temp isn't
* always a valid OSD set on return: in the "only primary_temp" case,
* @temp will have its ->primary >= 0 but ->size == 0.
*/ */
static int apply_temps(struct ceph_osdmap *osdmap, static void get_temp_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pool, struct ceph_pg pgid, struct ceph_pg_pool_info *pi,
int *osds, int len, int *primary) const struct ceph_pg *raw_pgid,
struct ceph_osds *temp)
{ {
struct ceph_pg pgid;
struct ceph_pg_mapping *pg; struct ceph_pg_mapping *pg;
int temp_len;
int temp_primary;
int i; int i;
/* raw_pg -> pg */ raw_pg_to_pg(pi, raw_pgid, &pgid);
pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, ceph_osds_init(temp);
pool->pg_num_mask);
/* pg_temp? */ /* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) { if (pg) {
temp_len = 0;
temp_primary = -1;
for (i = 0; i < pg->pg_temp.len; i++) { for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
if (ceph_can_shift_osds(pool)) if (ceph_can_shift_osds(pi))
continue; continue;
else
osds[temp_len++] = CRUSH_ITEM_NONE; temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else { } else {
osds[temp_len++] = pg->pg_temp.osds[i]; temp->osds[temp->size++] = pg->pg_temp.osds[i];
} }
} }
/* apply pg_temp's primary */ /* apply pg_temp's primary */
for (i = 0; i < temp_len; i++) { for (i = 0; i < temp->size; i++) {
if (osds[i] != CRUSH_ITEM_NONE) { if (temp->osds[i] != CRUSH_ITEM_NONE) {
temp_primary = osds[i]; temp->primary = temp->osds[i];
break; break;
} }
} }
} else {
temp_len = len;
temp_primary = *primary;
} }
/* primary_temp? */ /* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg) if (pg)
temp_primary = pg->primary_temp.osd; temp->primary = pg->primary_temp.osd;
*primary = temp_primary;
return temp_len;
} }
/* /*
* Calculate acting set for given pgid. * Map a PG to its acting set as well as its up set.
* *
* Return acting set length, or error. *primary is set to acting * Acting set is used for data mapping purposes, while up set can be
* primary osd id, or -1 if acting set is empty or on error. * recorded for detecting interval changes and deciding whether to
* resend a request.
*/ */
int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
int *osds, int *primary) const struct ceph_pg *raw_pgid,
struct ceph_osds *up,
struct ceph_osds *acting)
{ {
struct ceph_pg_pool_info *pool; struct ceph_pg_pool_info *pi;
u32 pps; u32 pps;
int len;
pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
if (!pool) {
*primary = -1;
return -ENOENT;
}
if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
/* hash pool id and seed so that pool PGs do not overlap */ if (!pi) {
pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, ceph_osds_init(up);
ceph_stable_mod(pgid.seed, pool->pgp_num, ceph_osds_init(acting);
pool->pgp_num_mask), goto out;
pgid.pool); }
} else {
/* pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
* legacy behavior: add ps and pool together. this is raw_to_up_osds(osdmap, pi, up);
* not a great approach because the PGs from each pool apply_primary_affinity(osdmap, pi, pps, up);
* will overlap on top of each other: 0.5 == 1.4 == get_temp_osds(osdmap, pi, raw_pgid, acting);
* 2.3 == ... if (!acting->size) {
*/ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
pps = ceph_stable_mod(pgid.seed, pool->pgp_num, acting->size = up->size;
pool->pgp_num_mask) + if (acting->primary == -1)
(unsigned)pgid.pool; acting->primary = up->primary;
} }
out:
len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); WARN_ON(!osds_valid(up) || !osds_valid(acting));
if (len < 0) {
*primary = -1;
return len;
}
len = raw_to_up_osds(osdmap, pool, osds, len, primary);
apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
len = apply_temps(osdmap, pool, pgid, osds, len, primary);
return len;
} }
/* /*
* Return primary osd for given pgid, or -1 if none. * Return acting primary for given PG, or -1 if none.
*/ */
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid)
{ {
int osds[CEPH_PG_MAX_SIZE]; struct ceph_osds up, acting;
int primary;
ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
return primary; ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
return acting.primary;
} }
EXPORT_SYMBOL(ceph_calc_pg_primary); EXPORT_SYMBOL(ceph_pg_to_acting_primary);