Commit 04f83450 authored by Boaz Harrosh's avatar Boaz Harrosh

pnfs-obj: osd raid engine read/write implementation

With the use of the in-kernel osd library. Implement read/write
of data from/to osd-objects according to information specified
in the objects-layout.

Support for stripping over mirrors with a received stripe_unit.
There are however a few constrains which are not supported:
 1. Stripe Unit must be a multiple of PAGE_SIZE
 2. stripe length (stripe_unit * number_of_stripes) can not be
    bigger then 32bit.

Also support raid-groups and partial-layout. Partial-layout is
when not all the groups are received on the line, addressing
only a partial range of the file.

TODO:
  Only raid0! raid 4/5/6 support will come at later stage

A none supported layout will send IO through the MDS

[Important fallout from the last rebase]
Signed-off-by: default avatarBoaz Harrosh <bharrosh@panasas.com>
[gfp_flags]
Signed-off-by: default avatarBenny Halevy <bhalevy@panasas.com>
parent d20581aa
This diff is collapsed.
......@@ -128,6 +128,260 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg)
objio_free_lseg(lseg);
}
/*
* I/O Operations
*/
static inline u64
end_offset(u64 start, u64 len)
{
u64 end;
end = start + len;
return end >= start ? end : NFS4_MAX_UINT64;
}
/* last octet in a range */
static inline u64
last_byte_offset(u64 start, u64 len)
{
u64 end;
BUG_ON(!len);
end = start + len;
return end > start ? end - 1 : NFS4_MAX_UINT64;
}
static struct objlayout_io_state *
objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
struct page **pages,
unsigned pgbase,
loff_t offset,
size_t count,
struct pnfs_layout_segment *lseg,
void *rpcdata,
gfp_t gfp_flags)
{
struct objlayout_io_state *state;
u64 lseg_end_offset;
dprintk("%s: allocating io_state\n", __func__);
if (objio_alloc_io_state(lseg, &state, gfp_flags))
return NULL;
BUG_ON(offset < lseg->pls_range.offset);
lseg_end_offset = end_offset(lseg->pls_range.offset,
lseg->pls_range.length);
BUG_ON(offset >= lseg_end_offset);
if (offset + count > lseg_end_offset) {
count = lseg->pls_range.length -
(offset - lseg->pls_range.offset);
dprintk("%s: truncated count %Zd\n", __func__, count);
}
if (pgbase > PAGE_SIZE) {
pages += pgbase >> PAGE_SHIFT;
pgbase &= ~PAGE_MASK;
}
state->lseg = lseg;
state->rpcdata = rpcdata;
state->pages = pages;
state->pgbase = pgbase;
state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
state->offset = offset;
state->count = count;
state->sync = 0;
return state;
}
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
dprintk("%s: freeing io_state\n", __func__);
if (unlikely(!state))
return;
objio_free_io_state(state);
}
/*
* I/O done common code
*/
static void
objlayout_iodone(struct objlayout_io_state *state)
{
dprintk("%s: state %p status\n", __func__, state);
objlayout_free_io_state(state);
}
/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
* This is because the osd completion is called with ints-off from
* the block layer
*/
static void _rpc_read_complete(struct work_struct *work)
{
struct rpc_task *task;
struct nfs_read_data *rdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
rdata = container_of(task, struct nfs_read_data, task);
pnfs_ld_read_done(rdata);
}
void
objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
{
int eof = state->eof;
struct nfs_read_data *rdata;
state->status = status;
dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
rdata = state->rpcdata;
rdata->task.tk_status = status;
if (status >= 0) {
rdata->res.count = status;
rdata->res.eof = eof;
}
objlayout_iodone(state);
/* must not use state after this point */
if (sync)
pnfs_ld_read_done(rdata);
else {
INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
schedule_work(&rdata->task.u.tk_work);
}
}
/*
* Perform sync or async reads.
*/
enum pnfs_try_status
objlayout_read_pagelist(struct nfs_read_data *rdata)
{
loff_t offset = rdata->args.offset;
size_t count = rdata->args.count;
struct objlayout_io_state *state;
ssize_t status = 0;
loff_t eof;
dprintk("%s: Begin inode %p offset %llu count %d\n",
__func__, rdata->inode, offset, (int)count);
eof = i_size_read(rdata->inode);
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
status = 0;
rdata->res.count = 0;
rdata->res.eof = 1;
goto out;
}
count = eof - offset;
}
state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
rdata->args.pages, rdata->args.pgbase,
offset, count,
rdata->lseg, rdata,
GFP_KERNEL);
if (unlikely(!state)) {
status = -ENOMEM;
goto out;
}
state->eof = state->offset + state->count >= eof;
status = objio_read_pagelist(state);
out:
dprintk("%s: Return status %Zd\n", __func__, status);
rdata->pnfs_error = status;
return PNFS_ATTEMPTED;
}
/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
* This is because the osd completion is called with ints-off from
* the block layer
*/
static void _rpc_write_complete(struct work_struct *work)
{
struct rpc_task *task;
struct nfs_write_data *wdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task);
pnfs_ld_write_done(wdata);
}
void
objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
bool sync)
{
struct nfs_write_data *wdata;
dprintk("%s: Begin\n", __func__);
wdata = state->rpcdata;
state->status = status;
wdata->task.tk_status = status;
if (status >= 0) {
wdata->res.count = status;
wdata->verf.committed = state->committed;
dprintk("%s: Return status %d committed %d\n",
__func__, wdata->task.tk_status,
wdata->verf.committed);
} else
dprintk("%s: Return status %d\n",
__func__, wdata->task.tk_status);
objlayout_iodone(state);
/* must not use state after this point */
if (sync)
pnfs_ld_write_done(wdata);
else {
INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
schedule_work(&wdata->task.u.tk_work);
}
}
/*
* Perform sync or async writes.
*/
enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
struct objlayout_io_state *state;
ssize_t status;
dprintk("%s: Begin inode %p offset %llu count %u\n",
__func__, wdata->inode, wdata->args.offset, wdata->args.count);
state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
wdata->args.pages,
wdata->args.pgbase,
wdata->args.offset,
wdata->args.count,
wdata->lseg, wdata,
GFP_NOFS);
if (unlikely(!state)) {
status = -ENOMEM;
goto out;
}
state->sync = how & FLUSH_SYNC;
status = objio_write_pagelist(state, how & FLUSH_STABLE);
out:
dprintk("%s: Return status %Zd\n", __func__, status);
wdata->pnfs_error = status;
return PNFS_ATTEMPTED;
}
/*
* Get Device Info API for io engines
*/
......
......@@ -58,6 +58,26 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
return container_of(lo, struct objlayout, pnfs_layout);
}
/*
* per-I/O operation state
* embedded in objects provider io_state data structure
*/
struct objlayout_io_state {
struct pnfs_layout_segment *lseg;
struct page **pages;
unsigned pgbase;
unsigned nr_pages;
unsigned long count;
loff_t offset;
bool sync;
void *rpcdata;
int status; /* res */
int eof; /* res */
int committed; /* res */
};
/*
* Raid engine I/O API
*/
......@@ -68,9 +88,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
extern int objio_alloc_io_state(
struct pnfs_layout_segment *lseg,
struct objlayout_io_state **outp,
gfp_t gfp_flags);
extern void objio_free_io_state(struct objlayout_io_state *state);
extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
bool stable);
/*
* callback API
*/
extern void objlayout_read_done(struct objlayout_io_state *state,
ssize_t status, bool sync);
extern void objlayout_write_done(struct objlayout_io_state *state,
ssize_t status, bool sync);
extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
gfp_t gfp_flags);
......@@ -89,4 +124,11 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
gfp_t gfp_flags);
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
extern enum pnfs_try_status objlayout_read_pagelist(
struct nfs_read_data *);
extern enum pnfs_try_status objlayout_write_pagelist(
struct nfs_write_data *,
int how);
#endif /* _OBJLAYOUT_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment