Commit 0c93e1b7 authored by Ilya Dryomov

rbd: round off and ignore discards that are too small

If, after rounding off, the discard request is smaller than alloc_size,
drop it on the floor in __rbd_img_fill_request().

Default alloc_size to 64k.  This should cover both HDD- and SSD-based
bluestore OSDs and somewhat improve things for filestore.  For OSDs on
filestore with filestore_punch_hole = false, alloc_size is best set to
the object size in order to allow deletes and truncates and disallow
the zero op.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jason Dillaman <dillaman@redhat.com>
parent 6484cbe9
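
The core of the change is plain power-of-two rounding: round the start of
the discard up and the end down to the alloc_size boundary, and drop the
request if nothing is left.  A minimal userspace sketch of that arithmetic
(the macros below stand in for the kernel's round_up()/round_down(), and
the sample offsets are hypothetical):

#include <stdio.h>
#include <stdint.h>

#define ALLOC_SIZE	(64 * 1024)	/* mirrors RBD_ALLOC_SIZE_DEFAULT */

/* power-of-2 rounding, equivalent to the kernel's round_up()/round_down() */
#define round_up(x, y)		(((x) + (y) - 1) & ~((uint64_t)(y) - 1))
#define round_down(x, y)	((x) & ~((uint64_t)(y) - 1))

int main(void)
{
	uint64_t off = 70 * 1024;	/* hypothetical discard start */
	uint64_t next_off = 200 * 1024;	/* hypothetical discard end */

	off = round_up(off, ALLOC_SIZE);		/* -> 128k */
	next_off = round_down(next_off, ALLOC_SIZE);	/* -> 192k */

	if (off >= next_off)	/* too small to free up any space */
		printf("discard dropped\n");
	else
		printf("discard aligned to %llu~%llu\n",
		       (unsigned long long)off,
		       (unsigned long long)(next_off - off));
	return 0;
}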
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -733,6 +733,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
  */
 enum {
 	Opt_queue_depth,
+	Opt_alloc_size,
 	Opt_lock_timeout,
 	Opt_last_int,
 	/* int args above */
@@ -749,6 +750,7 @@ enum {
 
 static match_table_t rbd_opts_tokens = {
 	{Opt_queue_depth, "queue_depth=%d"},
+	{Opt_alloc_size, "alloc_size=%d"},
 	{Opt_lock_timeout, "lock_timeout=%d"},
 	/* int args above */
 	{Opt_pool_ns, "_pool_ns=%s"},
@@ -765,6 +767,7 @@ static match_table_t rbd_opts_tokens = {
 
 struct rbd_options {
 	int	queue_depth;
+	int	alloc_size;
 	unsigned long	lock_timeout;
 	bool	read_only;
 	bool	lock_on_read;
@@ -773,6 +776,7 @@ struct rbd_options {
 };
 
 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
+#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT	false
 #define RBD_LOCK_ON_READ_DEFAULT false
@@ -812,6 +816,17 @@ static int parse_rbd_opts_token(char *c, void *private)
 		}
 		pctx->opts->queue_depth = intval;
 		break;
+	case Opt_alloc_size:
+		if (intval < 1) {
+			pr_err("alloc_size out of range\n");
+			return -EINVAL;
+		}
+		if (!is_power_of_2(intval)) {
+			pr_err("alloc_size must be a power of 2\n");
+			return -EINVAL;
+		}
+		pctx->opts->alloc_size = intval;
+		break;
 	case Opt_lock_timeout:
 		/* 0 is "wait forever" (i.e. infinite timeout) */
 		if (intval < 0 || intval > INT_MAX / 1000) {
@@ -1853,8 +1868,27 @@ static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
 
 static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
 {
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u64 off = obj_req->ex.oe_off;
+	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
 	int ret;
 
+	/*
+	 * Align the range to alloc_size boundary and punt on discards
+	 * that are too small to free up any space.
+	 *
+	 * alloc_size == object_size && is_tail() is a special case for
+	 * filestore with filestore_punch_hole = false, needed to allow
+	 * truncate (in addition to delete).
+	 */
+	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
+	    !rbd_obj_is_tail(obj_req)) {
+		off = round_up(off, rbd_dev->opts->alloc_size);
+		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
+		if (off >= next_off)
+			return 1;
+	}
+
 	/* reverse map the entire object onto the parent */
 	ret = rbd_obj_calc_img_extents(obj_req, true);
 	if (ret)
@@ -1867,10 +1901,12 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
 		osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
 	} else {
+		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
+		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
+		     off, next_off - off);
 		osd_req_op_extent_init(obj_req->osd_req, 0,
 				       truncate_or_zero_opcode(obj_req),
-				       obj_req->ex.oe_off, obj_req->ex.oe_len,
-				       0, 0);
+				       off, next_off - off, 0, 0);
 	}
 
 	obj_req->write_state = RBD_OBJ_WRITE_FLAT;
@@ -1953,10 +1989,10 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
  */
 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 {
-	struct rbd_obj_request *obj_req;
+	struct rbd_obj_request *obj_req, *next_obj_req;
 	int ret;
 
-	for_each_obj_request(img_req, obj_req) {
+	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
 		switch (img_req->op_type) {
 		case OBJ_OP_READ:
 			ret = rbd_obj_setup_read(obj_req);
@@ -1973,8 +2009,14 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 		default:
 			rbd_assert(0);
 		}
-		if (ret)
+		if (ret < 0)
 			return ret;
+		if (ret > 0) {
+			img_req->xferred += obj_req->ex.oe_len;
+			img_req->pending_count--;
+			rbd_img_obj_request_del(img_req, obj_req);
+			continue;
+		}
 
 		ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
 		if (ret)
@@ -3764,7 +3806,7 @@ static void rbd_queue_workfn(struct work_struct *work)
 	else
 		result = rbd_img_fill_from_bio(img_request, offset, length,
 					       rq->bio);
-	if (result)
+	if (result || !img_request->pending_count)
 		goto err_img_request;
 
 	rbd_img_request_submit(img_request);
@@ -5425,6 +5467,7 @@ static int rbd_add_parse_args(const char *buf,
 
 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
+	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
@@ -5922,6 +5965,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
 		rbd_dev->opts->read_only = true;
 
+	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
+		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
+			 rbd_dev->layout.object_size);
+		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
+	}
+
 	rc = rbd_dev_device_setup(rbd_dev);
 	if (rc)
 		goto err_out_image_probe;
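
With the kernel side in place, alloc_size becomes an rbd map option.  For
illustration (pool and image names are hypothetical, and exact CLI syntax
may vary by rbd release):

	rbd map -o alloc_size=65536 mypool/myimage

For filestore OSDs with filestore_punch_hole = false, the commit message
recommends alloc_size equal to the object size, e.g. alloc_size=4194304
for the default 4M objects, so that discards are serviced only by delete
and truncate.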