Commit 2ded3703 authored by Song Liu, committed by Shaohua Li

md/r5cache: State machine for raid5-cache write back mode

This patch adds a state machine for raid5-cache. With a log device, the
raid456 array could operate in two different modes (r5c_journal_mode):
  - write-back (R5C_JOURNAL_MODE_WRITE_BACK)
  - write-through (R5C_JOURNAL_MODE_WRITE_THROUGH)

Existing code of raid5-cache only has write-through mode. For write-back
cache, it is necessary to extend the state machine.

With write-back cache, every stripe could operate in two different
phases:
  - caching
  - writing-out

In caching phase, the stripe handles writes as:
  - write to journal
  - return IO

In writing-out phase, the stripe behaves as a stripe in write-through
mode (R5C_JOURNAL_MODE_WRITE_THROUGH).

STRIPE_R5C_CACHING is added to sh->state to differentiate caching and
writing-out phase.

Please note: this is a "no-op" patch for raid5-cache write-through
mode.

The following detailed explanation is copied from the raid5-cache.c:

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *      - caching phase
 *      - writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *      - write to log device
 *      - return IO
 *
 * Stripes in writing-out phase handle writes as:
 *      - calculate parity
 *      - write pending data and parity to journal
 *      - write data and parity to raid disks
 *      - return IO for pending writes
 */
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
parent 937621c3
...@@ -40,6 +40,47 @@ ...@@ -40,6 +40,47 @@
*/ */
#define R5L_POOL_SIZE 4 #define R5L_POOL_SIZE 4
/*
 * Journal operating mode of an array that has a log device.
 *
 * Write-through behaves identically to the pre-existing log-only
 * implementation. Write-back additionally allows a stripe to commit
 * writes to the log device before they are written to the raid disks.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,	/* log only (existing behavior) */
	R5C_JOURNAL_MODE_WRITE_BACK = 1,	/* journal used as write-back cache */
};
/*
* raid5 cache state machine
*
* With the RAID cache, each stripe works in two phases:
* - caching phase
* - writing-out phase
*
* These two phases are controlled by bit STRIPE_R5C_CACHING:
* if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
* if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
*
* When there is no journal, or the journal is in write-through mode,
* the stripe is always in writing-out phase.
*
* For write-back journal, the stripe is sent to caching phase on write
* (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
* the write-out phase by clearing STRIPE_R5C_CACHING.
*
* Stripes in caching phase do not write the raid disks. Instead, all
* writes are committed from the log device. Therefore, a stripe in
* caching phase handles writes as:
* - write to log device
* - return IO
*
* Stripes in writing-out phase handle writes as:
* - calculate parity
* - write pending data and parity to journal
* - write data and parity to raid disks
* - return IO for pending writes
*/
struct r5l_log { struct r5l_log {
struct md_rdev *rdev; struct md_rdev *rdev;
...@@ -96,6 +137,9 @@ struct r5l_log { ...@@ -96,6 +137,9 @@ struct r5l_log {
spinlock_t no_space_stripes_lock; spinlock_t no_space_stripes_lock;
bool need_cache_flush; bool need_cache_flush;
/* for r5c_cache */
enum r5c_journal_mode r5c_journal_mode;
}; };
/* /*
...@@ -133,6 +177,12 @@ enum r5l_io_unit_state { ...@@ -133,6 +177,12 @@ enum r5l_io_unit_state {
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
}; };
/*
 * Report whether the array's journal operates in write-back mode.
 *
 * @log: the array's log, or NULL when the array has no journal.
 *
 * Returns true only when @log is non-NULL and its r5c_journal_mode is
 * R5C_JOURNAL_MODE_WRITE_BACK; a missing log is never write-back.
 */
bool r5c_is_writeback(struct r5l_log *log)
{
	if (!log)
		return false;

	return log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK;
}
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{ {
start += inc; start += inc;
...@@ -168,12 +218,51 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, ...@@ -168,12 +218,51 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
io->state = state; io->state = state;
} }
/*
 * Move a stripe from the caching phase to the writing-out phase by
 * clearing STRIPE_R5C_CACHING in sh->state.
 *
 * Only valid in write-back mode: calling this when the journal is absent
 * or not in write-back mode trips BUG_ON(). A warning is emitted if the
 * stripe was not in the caching phase to begin with; the bit is cleared
 * regardless.
 */
static void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	BUG_ON(!r5c_is_writeback(sh->raid_conf->log));

	/* The stripe is expected to still be in the caching phase here. */
	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);
}
/*
 * Update stripe flags once its data and/or parity has been written (or
 * flushed) to the log device. Per the original note this runs on the
 * r5l_log_endio() / r5l_log_flush_endio() completion path.
 *
 * Only write-through mode is handled so far; any other journal mode hits
 * BUG() until the write-back logic is added.
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode != R5C_JOURNAL_MODE_WRITE_THROUGH)
		BUG();	/* write-back logic in next patch */

	/* A write-through stripe must never be in the caching phase. */
	BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	/*
	 * Flag the parity device dev[pd_idx] as R5_InJournal: this marks
	 * that all data AND parity are in the journal. For RAID 6 there is
	 * no need to flag dev[qd_idx] as well, because both parities are
	 * written out together.
	 */
	set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
}
static void r5l_io_run_stripes(struct r5l_io_unit *io) static void r5l_io_run_stripes(struct r5l_io_unit *io)
{ {
struct stripe_head *sh, *next; struct stripe_head *sh, *next;
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
list_del_init(&sh->log_list); list_del_init(&sh->log_list);
r5c_finish_cache_stripe(sh);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh); raid5_release_stripe(sh);
} }
...@@ -412,18 +501,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, ...@@ -412,18 +501,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
r5l_append_payload_page(log, sh->dev[i].page); r5l_append_payload_page(log, sh->dev[i].page);
} }
if (sh->qd_idx >= 0) { if (parity_pages == 2) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum, sh->sector, sh->dev[sh->pd_idx].log_checksum,
sh->dev[sh->qd_idx].log_checksum, true); sh->dev[sh->qd_idx].log_checksum, true);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
} else { } else if (parity_pages == 1) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum, sh->sector, sh->dev[sh->pd_idx].log_checksum,
0, false); 0, false);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
} } else /* Just writing data, not parity, in caching phase */
BUG_ON(parity_pages != 0);
list_add_tail(&sh->log_list, &io->stripe_list); list_add_tail(&sh->log_list, &io->stripe_list);
atomic_inc(&io->pending_stripe); atomic_inc(&io->pending_stripe);
...@@ -455,6 +545,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) ...@@ -455,6 +545,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
return -EAGAIN; return -EAGAIN;
} }
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
for (i = 0; i < sh->disks; i++) { for (i = 0; i < sh->disks; i++) {
void *addr; void *addr;
...@@ -1112,6 +1204,49 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) ...@@ -1112,6 +1204,49 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
} }
/*
 * Try to handle the stripe's write operations in the caching phase.
 * Must only be called in write-back mode (enforced with BUG_ON).
 *
 * Intended contract:
 *   - returns 0 if all outstanding writes can be handled in caching phase;
 *   - otherwise calls r5c_make_stripe_write_out() and returns -EAGAIN.
 *
 * The caching path is not implemented yet, so for now the stripe is always
 * switched to the writing-out phase and -EAGAIN is returned. @s and @disks
 * are not used yet.
 */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	BUG_ON(!r5c_is_writeback(conf->log));

	/* more write-back logic in next patches */
	r5c_make_stripe_write_out(sh);

	return -EAGAIN;
}
/*
 * Clean up a stripe after it has been committed to the RAID disks: clear
 * R5_InJournal on the parity device dev[pd_idx].
 *
 * Does nothing when the array has no log, or when the parity device is
 * not flagged R5_InJournal. Warns if the stripe is still marked as being
 * in the caching phase. Only write-through mode is supported so far; any
 * other journal mode hits BUG() after the flag is cleared. @s is not
 * used yet.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	if (!conf->log)
		return;

	/* Nothing to clean up unless the parity was marked as journaled. */
	if (!test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode != R5C_JOURNAL_MODE_WRITE_THROUGH)
		BUG();	/* write-back logic in following patches */
}
static int r5l_load_log(struct r5l_log *log) static int r5l_load_log(struct r5l_log *log)
{ {
struct md_rdev *rdev = log->rdev; struct md_rdev *rdev = log->rdev;
...@@ -1249,6 +1384,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) ...@@ -1249,6 +1384,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
INIT_LIST_HEAD(&log->no_space_stripes); INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock); spin_lock_init(&log->no_space_stripes_lock);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
if (r5l_load_log(log)) if (r5l_load_log(log))
goto error; goto error;
......
...@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1; do_recovery = 1;
} }
if (test_bit(R5_InJournal, &dev->flags))
s->injournal++;
} }
if (test_bit(STRIPE_SYNCING, &sh->state)) { if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced, /* If there is a failed device being replaced,
...@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
|| s.expanding) || s.expanding)
handle_stripe_fill(sh, &s, disks); handle_stripe_fill(sh, &s, disks);
/* Now to consider new write requests and what else, if anything /*
* should be read. We do not handle new writes when: * When the stripe finishes full journal write cycle (write to journal
* and raid disk), this is the clean up procedure so it is ready for
* next operation.
*/
r5c_finish_stripe_write_out(conf, sh, &s);
/*
* Now to consider new write requests, cache write back and what else,
* if anything should be read. We do not handle new writes when:
* 1/ A 'write' operation (copy+xor) is already in flight. * 1/ A 'write' operation (copy+xor) is already in flight.
* 2/ A 'check' operation is in flight, as it may clobber the parity * 2/ A 'check' operation is in flight, as it may clobber the parity
* block. * block.
* 3/ A r5c cache log write is in flight.
*/
if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
if (!r5c_is_writeback(conf->log)) {
if (s.to_write)
handle_stripe_dirtying(conf, sh, &s, disks);
} else { /* write back cache */
int ret = 0;
/* First, try handle writes in caching phase */
if (s.to_write)
ret = r5c_try_caching_write(conf, sh, &s,
disks);
/*
* If caching phase failed: ret == -EAGAIN
* OR
* stripe under reclaim: !caching && injournal
*
* fall back to handle_stripe_dirtying()
*/ */
if (s.to_write && !sh->reconstruct_state && !sh->check_state) if (ret == -EAGAIN ||
/* stripe under reclaim: !caching && injournal */
(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
s.injournal > 0))
handle_stripe_dirtying(conf, sh, &s, disks); handle_stripe_dirtying(conf, sh, &s, disks);
}
}
/* maybe we need to check and possibly fix the parity for this stripe /* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough * Any reads will already have been scheduled, so we just see if enough
...@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) ...@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* data on failed drives. * data on failed drives.
*/ */
if (rw == READ && mddev->degraded == 0 && if (rw == READ && mddev->degraded == 0 &&
!r5c_is_writeback(conf->log) &&
mddev->reshape_position == MaxSector) { mddev->reshape_position == MaxSector) {
bi = chunk_aligned_read(mddev, bi); bi = chunk_aligned_read(mddev, bi);
if (!bi) if (!bi)
......
...@@ -264,6 +264,7 @@ struct stripe_head_state { ...@@ -264,6 +264,7 @@ struct stripe_head_state {
int syncing, expanding, expanded, replacing; int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int injournal;
int failed_num[2]; int failed_num[2];
int p_failed, q_failed; int p_failed, q_failed;
int dec_preread_active; int dec_preread_active;
...@@ -313,6 +314,11 @@ enum r5dev_flags { ...@@ -313,6 +314,11 @@ enum r5dev_flags {
*/ */
R5_Discard, /* Discard the stripe */ R5_Discard, /* Discard the stripe */
R5_SkipCopy, /* Don't copy data from bio to stripe cache */ R5_SkipCopy, /* Don't copy data from bio to stripe cache */
R5_InJournal, /* data being written is in the journal device.
* if R5_InJournal is set for parity pd_idx, all the
* data and parity being written are in the journal
* device
*/
}; };
/* /*
...@@ -345,7 +351,23 @@ enum { ...@@ -345,7 +351,23 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet. * to batch yet.
*/ */
STRIPE_LOG_TRAPPED, /* trapped into log */ STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*
* 1. write-out phase
* set in first entry of r5l_write_stripe
* clear in second entry of r5l_write_stripe
* used to bypass logic in handle_stripe
*
* 2. caching phase
* set in r5c_try_caching_write()
* clear when journal write is done
* used to initiate r5c_cache_data()
* also used to bypass logic in handle_stripe
*/
STRIPE_R5C_CACHING, /* the stripe is in caching phase
* see more detail in the raid5-cache.c
*/
}; };
#define STRIPE_EXPAND_SYNC_FLAGS \ #define STRIPE_EXPAND_SYNC_FLAGS \
...@@ -710,4 +732,11 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); ...@@ -710,4 +732,11 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state); extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks);
extern void
r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment