Commit a39f7afd authored by Song Liu's avatar Song Liu Committed by Shaohua Li

md/r5cache: write-out phase and reclaim support

There are two limited resources, stripe cache and journal disk space.
For better performance, we priotize reclaim of full stripe writes.
To free up more journal space, we free earliest data on the journal.

In current implementation, reclaim happens when:
1. Periodically (every R5C_RECLAIM_WAKEUP_INTERVAL, 30 seconds) reclaim
   if there is no reclaim in the past 5 seconds.
2. when there are R5C_FULL_STRIPE_FLUSH_BATCH (256) cached full stripes,
   or cached stripes is enough for a full stripe (chunk size / 4k)
   (r5c_check_cached_full_stripe)
3. when there is pressure on stripe cache (r5c_check_stripe_cache_usage)
4. when there is pressure on journal space (r5l_write_stripe, r5c_cache_data)

r5c_do_reclaim() contains new logic of reclaim.

For stripe cache:

When stripe cache pressure is high (more than 3/4 stripes are cached,
or there is empty inactive lists), flush all full stripe. If fewer
than R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) full stripes
are flushed, flush some paritial stripes. When stripe cache pressure
is moderate (1/2 to 3/4 of stripes are cached), flush all full stripes.

For log space:

To avoid deadlock due to log space, we need to reserve enough space
to flush cached data. The size of required log space depends on total
number of cached stripes (stripe_in_journal_count). In current
implementation, the writing-out phase automatically include pending
data writes with parity writes (similar to write through case).
Therefore, we need up to (conf->raid_disks + 1) pages for each cached
stripe (1 page for meta data, raid_disks pages for all data and
parity). r5c_log_required_to_flush_cache() calculates log space
required to flush cache. In the following, we refer to the space
calculated by r5c_log_required_to_flush_cache() as
reclaim_required_space.

Two flags are added to r5conf->cache_state: R5C_LOG_TIGHT and
R5C_LOG_CRITICAL. R5C_LOG_TIGHT is set when free space on the log
device is less than 3x of reclaim_required_space. R5C_LOG_CRITICAL
is set when free space on the log device is less than 2x of
reclaim_required_space.

r5c_cache keeps all data in cache (not fully committed to RAID) in
a list (stripe_in_journal_list). These stripes are in the order of their
first appearance on the journal. So the log tail (last_checkpoint)
should point to the journal_start of the first item in the list.

When R5C_LOG_TIGHT is set, r5l_reclaim_thread starts flushing out
stripes at the head of stripe_in_journal. When R5C_LOG_CRITICAL is
set, the state machine only writes data that are already in the
log device (in stripe_in_journal_list).

This patch includes a fix to improve performance by
Shaohua Li <shli@fb.com>.
Signed-off-by: default avatarSong Liu <songliubraving@fb.com>
Signed-off-by: default avatarShaohua Li <shli@fb.com>
parent 1e6d690b
This diff is collapsed.
......@@ -228,6 +228,16 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
for (i = sh->disks; i--; )
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
* When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
* data in journal, so they are not released to cached lists
*/
if (conf->quiesce && r5c_is_writeback(conf->log) &&
!test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
}
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
......@@ -268,6 +278,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
atomic_dec(&conf->r5c_cached_partial_stripes);
list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
r5c_check_cached_full_stripe(conf);
} else {
/* partial stripe */
if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
......@@ -639,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
}
if (noblock && sh == NULL)
break;
r5c_check_stripe_cache_usage(conf);
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(
conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
......@@ -1992,7 +2006,9 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
spin_lock_init(&sh->batch_lock);
INIT_LIST_HEAD(&sh->batch_list);
INIT_LIST_HEAD(&sh->lru);
INIT_LIST_HEAD(&sh->r5c);
atomic_set(&sh->count, 1);
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
......@@ -4759,6 +4775,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
return 1;
/* Also checks whether there is pressure on r5cache log space */
if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
return 1;
if (conf->quiesce)
return 1;
if (atomic_read(&conf->empty_inactive_list_nr))
......@@ -7661,6 +7681,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
*/
r5c_flush_cache(conf, INT_MAX);
conf->quiesce = 2;
wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
......
......@@ -226,6 +226,8 @@ struct stripe_head {
struct r5l_io_unit *log_io;
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
......@@ -537,6 +539,27 @@ struct r5worker_group {
int stripes_cnt;
};
enum r5_cache_state {
R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
R5_ALLOC_MORE, /* It might help to allocate another
* stripe.
*/
R5_DID_ALLOC, /* A stripe was allocated, don't allocate
* more until at least one has been
* released. This avoids flooding
* the cache.
*/
R5C_LOG_TIGHT, /* log device space tight, need to
* prioritize stripes at last_checkpoint
*/
R5C_LOG_CRITICAL, /* log device is running out of space,
* only process stripes that are already
* occupying the log
*/
};
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
......@@ -636,17 +659,6 @@ struct r5conf {
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
unsigned long cache_state;
#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
#define R5_ALLOC_MORE 2 /* It might help to allocate another
* stripe.
*/
#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate
* more until at least one has been
* released. This avoids flooding
* the cache.
*/
struct shrinker shrinker;
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
......@@ -752,8 +764,13 @@ extern void
r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s);
extern void r5c_release_extra_page(struct stripe_head *sh);
extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
extern void r5c_handle_cached_data_endio(struct r5conf *conf,
struct stripe_head *sh, int disks, struct bio_list *return_bi);
extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
struct stripe_head_state *s);
extern void r5c_make_stripe_write_out(struct stripe_head *sh);
extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment