Commit ac428036 authored by Linus Torvalds

Merge tag 'ceph-for-4.17-rc5' of git://github.com/ceph/ceph-client

Pull ceph fixes from Ilya Dryomov:
 "These patches fix two long-standing bugs in the DIO code path, one of
  which is a crash trivially triggerable with splice()"

* tag 'ceph-for-4.17-rc5' of git://github.com/ceph/ceph-client:
  ceph: fix iov_iter issues in ceph_direct_read_write()
  libceph: add osd_req_op_extent_osd_data_bvecs()
  ceph: fix rsize/wsize capping in ceph_direct_read_write()
parents 3f5f8596 fc218544
...@@ -2366,7 +2366,9 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) ...@@ -2366,7 +2366,9 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
"copyup"); "copyup");
osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
obj_req->copyup_bvecs, bytes); obj_req->copyup_bvecs,
obj_req->copyup_bvec_count,
bytes);
switch (obj_req->img_request->op_type) { switch (obj_req->img_request->op_type) {
case OBJ_OP_WRITE: case OBJ_OP_WRITE:
......
...@@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags) ...@@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags)
*/ */
/* /*
* Calculate the length sum of direct io vectors that can * How many pages to get in one call to iov_iter_get_pages(). This
* be combined into one page vector. * determines the size of the on-stack array used as a buffer.
*/ */
static size_t dio_get_pagev_size(const struct iov_iter *it) #define ITER_GET_BVECS_PAGES 64
static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
struct bio_vec *bvecs)
{ {
const struct iovec *iov = it->iov; size_t size = 0;
const struct iovec *iovend = iov + it->nr_segs; int bvec_idx = 0;
size_t size;
if (maxsize > iov_iter_count(iter))
size = iov->iov_len - it->iov_offset; maxsize = iov_iter_count(iter);
/*
* An iov can be page vectored when both the current tail while (size < maxsize) {
* and the next base are page aligned. struct page *pages[ITER_GET_BVECS_PAGES];
*/ ssize_t bytes;
while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && size_t start;
(++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { int idx = 0;
size += iov->iov_len;
} bytes = iov_iter_get_pages(iter, pages, maxsize - size,
dout("dio_get_pagevlen len = %zu\n", size); ITER_GET_BVECS_PAGES, &start);
return size; if (bytes < 0)
return size ?: bytes;
iov_iter_advance(iter, bytes);
size += bytes;
for ( ; bytes; idx++, bvec_idx++) {
struct bio_vec bv = {
.bv_page = pages[idx],
.bv_len = min_t(int, bytes, PAGE_SIZE - start),
.bv_offset = start,
};
bvecs[bvec_idx] = bv;
bytes -= bv.bv_len;
start = 0;
}
}
return size;
} }
/* /*
* Allocate a page vector based on (@it, @nbytes). * iov_iter_get_pages() only considers one iov_iter segment, no matter
* The return value is the tuple describing a page vector, * what maxsize or maxpages are given. For ITER_BVEC that is a single
* that is (@pages, @page_align, @num_pages). * page.
*
* Attempt to get up to @maxsize bytes worth of pages from @iter.
* Return the number of bytes in the created bio_vec array, or an error.
*/ */
static struct page ** static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, struct bio_vec **bvecs, int *num_bvecs)
size_t *page_align, int *num_pages)
{ {
struct iov_iter tmp_it = *it; struct bio_vec *bv;
size_t align; size_t orig_count = iov_iter_count(iter);
struct page **pages; ssize_t bytes;
int ret = 0, idx, npages; int npages;
align = (unsigned long)(it->iov->iov_base + it->iov_offset) & iov_iter_truncate(iter, maxsize);
(PAGE_SIZE - 1); npages = iov_iter_npages(iter, INT_MAX);
npages = calc_pages_for(align, nbytes); iov_iter_reexpand(iter, orig_count);
pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
if (!pages)
return ERR_PTR(-ENOMEM);
for (idx = 0; idx < npages; ) { /*
size_t start; * __iter_get_bvecs() may populate only part of the array -- zero it
ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, * out.
npages - idx, &start); */
if (ret < 0) bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
goto fail; if (!bv)
return -ENOMEM;
iov_iter_advance(&tmp_it, ret); bytes = __iter_get_bvecs(iter, maxsize, bv);
nbytes -= ret; if (bytes < 0) {
idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; /*
* No pages were pinned -- just free the array.
*/
kvfree(bv);
return bytes;
} }
BUG_ON(nbytes != 0); *bvecs = bv;
*num_pages = npages; *num_bvecs = npages;
*page_align = align; return bytes;
dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); }
return pages;
fail: static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
ceph_put_page_vector(pages, idx, false); {
return ERR_PTR(ret); int i;
for (i = 0; i < num_bvecs; i++) {
if (bvecs[i].bv_page) {
if (should_dirty)
set_page_dirty_lock(bvecs[i].bv_page);
put_page(bvecs[i].bv_page);
}
}
kvfree(bvecs);
} }
/* /*
...@@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) ...@@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv; struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
int num_pages = calc_pages_for((u64)osd_data->alignment,
osd_data->length);
dout("ceph_aio_complete_req %p rc %d bytes %llu\n", BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
inode, rc, osd_data->length); BUG_ON(!osd_data->num_bvecs);
dout("ceph_aio_complete_req %p rc %d bytes %u\n",
inode, rc, osd_data->bvec_pos.iter.bi_size);
if (rc == -EOLDSNAPC) { if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work; struct ceph_aio_work *aio_work;
...@@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) ...@@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
} else if (!aio_req->write) { } else if (!aio_req->write) {
if (rc == -ENOENT) if (rc == -ENOENT)
rc = 0; rc = 0;
if (rc >= 0 && osd_data->length > rc) { if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
int zoff = osd_data->alignment + rc; struct iov_iter i;
int zlen = osd_data->length - rc; int zlen = osd_data->bvec_pos.iter.bi_size - rc;
/* /*
* If read is satisfied by single OSD request, * If read is satisfied by single OSD request,
* it can pass EOF. Otherwise read is within * it can pass EOF. Otherwise read is within
...@@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) ...@@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen; aio_req->total_len = rc + zlen;
} }
if (zlen > 0) iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
ceph_zero_page_vector_range(zoff, zlen, osd_data->num_bvecs,
osd_data->pages); osd_data->bvec_pos.iter.bi_size);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
} }
} }
ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
aio_req->should_dirty);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (rc < 0) if (rc < 0)
...@@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct page **pages; struct bio_vec *bvecs;
struct ceph_aio_request *aio_req = NULL; struct ceph_aio_request *aio_req = NULL;
int num_pages = 0; int num_pages = 0;
int flags; int flags;
...@@ -914,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -914,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
} }
while (iov_iter_count(iter) > 0) { while (iov_iter_count(iter) > 0) {
u64 size = dio_get_pagev_size(iter); u64 size = iov_iter_count(iter);
size_t start = 0;
ssize_t len; ssize_t len;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
else
size = min_t(u64, size, fsc->mount_options->rsize);
vino = ceph_vino(inode); vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &size, 0, vino, pos, &size, 0,
...@@ -933,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -933,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
break; break;
} }
if (write) len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
size = min_t(u64, size, fsc->mount_options->wsize); if (len < 0) {
else
size = min_t(u64, size, fsc->mount_options->rsize);
len = size;
pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
ret = PTR_ERR(pages); ret = len;
break; break;
} }
if (len != size)
osd_req_op_extent_update(req, 0, len);
/* /*
* To simplify error handling, allow AIO when IO within i_size * To simplify error handling, allow AIO when IO within i_size
...@@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req->r_mtime = mtime; req->r_mtime = mtime;
} }
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
false, false);
if (aio_req) { if (aio_req) {
aio_req->total_len += len; aio_req->total_len += len;
...@@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
pos += len; pos += len;
iov_iter_advance(iter, len);
continue; continue;
} }
...@@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (ret == -ENOENT) if (ret == -ENOENT)
ret = 0; ret = 0;
if (ret >= 0 && ret < len && pos + ret < size) { if (ret >= 0 && ret < len && pos + ret < size) {
struct iov_iter i;
int zlen = min_t(size_t, len - ret, int zlen = min_t(size_t, len - ret,
size - pos - ret); size - pos - ret);
ceph_zero_page_vector_range(start + ret, zlen,
pages); iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
len);
iov_iter_advance(&i, ret);
iov_iter_zero(zlen, &i);
ret += zlen; ret += zlen;
} }
if (ret >= 0) if (ret >= 0)
len = ret; len = ret;
} }
ceph_put_page_vector(pages, num_pages, should_dirty); put_bvecs(bvecs, num_pages, should_dirty);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (ret < 0) if (ret < 0)
break; break;
pos += len; pos += len;
iov_iter_advance(iter, len);
if (!write && pos >= size) if (!write && pos >= size)
break; break;
......
...@@ -77,7 +77,10 @@ struct ceph_osd_data { ...@@ -77,7 +77,10 @@ struct ceph_osd_data {
u32 bio_length; u32 bio_length;
}; };
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
struct ceph_bvec_iter bvec_pos; struct {
struct ceph_bvec_iter bvec_pos;
u32 num_bvecs;
};
}; };
}; };
...@@ -412,6 +415,10 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, ...@@ -412,6 +415,10 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
struct ceph_bio_iter *bio_pos, struct ceph_bio_iter *bio_pos,
u32 bio_length); u32 bio_length);
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
unsigned int which,
struct bio_vec *bvecs, u32 num_bvecs,
u32 bytes);
void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
unsigned int which, unsigned int which,
struct ceph_bvec_iter *bvec_pos); struct ceph_bvec_iter *bvec_pos);
...@@ -426,7 +433,8 @@ extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, ...@@ -426,7 +433,8 @@ extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
bool own_pages); bool own_pages);
void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
unsigned int which, unsigned int which,
struct bio_vec *bvecs, u32 bytes); struct bio_vec *bvecs, u32 num_bvecs,
u32 bytes);
extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
unsigned int which, unsigned int which,
struct page **pages, u64 length, struct page **pages, u64 length,
......
...@@ -157,10 +157,12 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, ...@@ -157,10 +157,12 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
struct ceph_bvec_iter *bvec_pos) struct ceph_bvec_iter *bvec_pos,
u32 num_bvecs)
{ {
osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
osd_data->bvec_pos = *bvec_pos; osd_data->bvec_pos = *bvec_pos;
osd_data->num_bvecs = num_bvecs;
} }
#define osd_req_op_data(oreq, whch, typ, fld) \ #define osd_req_op_data(oreq, whch, typ, fld) \
...@@ -237,6 +239,22 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, ...@@ -237,6 +239,22 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
/*
 * Attach a bio_vec array as the extent data of op @which in @osd_req.
 *
 * @osd_req:   request whose op data is being set
 * @which:     index of the op, passed through to osd_req_op_data()
 * @bvecs:     bio_vec array; only the pointer is stored in the iterator
 *             built below, the array itself is not copied here
 * @num_bvecs: number of entries in @bvecs, handed to
 *             ceph_osd_data_bvecs_init() alongside the iterator
 * @bytes:     total data length, used as the iterator's bi_size
 *
 * NOTE(review): ownership of @bvecs presumably stays with the caller
 * (the DIO path in this change releases it via put_bvecs()) -- confirm
 * against other callers.
 */
void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
unsigned int which,
struct bio_vec *bvecs, u32 num_bvecs,
u32 bytes)
{
struct ceph_osd_data *osd_data;
/* Iterator starts at the beginning of @bvecs and spans @bytes bytes. */
struct ceph_bvec_iter it = {
.bvecs = bvecs,
.iter = { .bi_size = bytes },
};
osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);
void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
unsigned int which, unsigned int which,
struct ceph_bvec_iter *bvec_pos) struct ceph_bvec_iter *bvec_pos)
...@@ -244,7 +262,7 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, ...@@ -244,7 +262,7 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
struct ceph_osd_data *osd_data; struct ceph_osd_data *osd_data;
osd_data = osd_req_op_data(osd_req, which, extent, osd_data); osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
ceph_osd_data_bvecs_init(osd_data, bvec_pos); ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
} }
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
...@@ -287,7 +305,8 @@ EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); ...@@ -287,7 +305,8 @@ EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
unsigned int which, unsigned int which,
struct bio_vec *bvecs, u32 bytes) struct bio_vec *bvecs, u32 num_bvecs,
u32 bytes)
{ {
struct ceph_osd_data *osd_data; struct ceph_osd_data *osd_data;
struct ceph_bvec_iter it = { struct ceph_bvec_iter it = {
...@@ -296,7 +315,7 @@ void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, ...@@ -296,7 +315,7 @@ void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
}; };
osd_data = osd_req_op_data(osd_req, which, cls, request_data); osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_bvecs_init(osd_data, &it); ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
osd_req->r_ops[which].cls.indata_len += bytes; osd_req->r_ops[which].cls.indata_len += bytes;
osd_req->r_ops[which].indata_len += bytes; osd_req->r_ops[which].indata_len += bytes;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment