Commit 8b4822de authored by Linus Torvalds

Merge tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD fixes from Shaohua Li:

 - Several bug fixes for raid5-cache from Song Liu, mainly handling
   journal disk errors

 - Fix bad block handling when choosing a raid1 disk, from Tomasz
   Majchrzak

 - Simplify external metadata array sysfs handling from Artur
   Paszkiewicz

 - Optimize raid0 discard handling, from me: raid0 now dispatches large
   discard IO directly to the underlying disks (see the sketch after the
   commit list below)

* tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid1: prefer disk without bad blocks
  md/r5cache: handle sync with data in write back cache
  md/r5cache: gracefully handle journal device errors for writeback mode
  md/raid1/10: avoid unnecessary locking
  md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first
  md/md0: optimize raid0 discard handling
  md: don't return -EAGAIN in md_allow_write for external metadata arrays
  md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock
parents 667f867c d82dd0e3
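
A minimal userspace sketch (not kernel code) of the per-disk range arithmetic behind the new raid0_handle_discard() in the raid0.c hunk below. It assumes a single strip zone starting at sector 0; the helper name split_discard and the printf reporting are illustrative stand-ins, not kernel API:

#include <stdio.h>

typedef unsigned long long sector_t;

static void split_discard(sector_t start, sector_t end,
			  unsigned int nb_dev, unsigned int chunk_sectors)
{
	sector_t stripe_size = (sector_t)nb_dev * chunk_sectors;
	sector_t first_stripe = start / stripe_size;
	sector_t last_stripe = end / stripe_size;
	/* disk holding the first/last sector of the discard */
	unsigned int start_disk = (start - first_stripe * stripe_size) / chunk_sectors;
	unsigned int end_disk = (end - last_stripe * stripe_size) / chunk_sectors;
	/* offsets of the first/last sector within those disks */
	sector_t start_off = (start - first_stripe * stripe_size) % chunk_sectors +
			     first_stripe * chunk_sectors;
	sector_t end_off = (end - last_stripe * stripe_size) % chunk_sectors +
			   last_stripe * chunk_sectors;
	unsigned int disk;

	for (disk = 0; disk < nb_dev; disk++) {
		sector_t dev_start, dev_end;

		if (disk < start_disk)
			/* this disk's chunk in the first stripe lies before 'start' */
			dev_start = (first_stripe + 1) * chunk_sectors;
		else if (disk > start_disk)
			/* its chunk in the first stripe is entirely covered */
			dev_start = first_stripe * chunk_sectors;
		else
			dev_start = start_off;

		if (disk < end_disk)
			/* its chunk in the last stripe is entirely covered */
			dev_end = (last_stripe + 1) * chunk_sectors;
		else if (disk > end_disk)
			/* its chunk in the last stripe lies after 'end' */
			dev_end = last_stripe * chunk_sectors;
		else
			dev_end = end_off;

		if (dev_end <= dev_start)
			continue;	/* nothing to discard on this disk */
		printf("disk %u: discard sectors [%llu, %llu)\n",
		       disk, dev_start, dev_end);
	}
}

int main(void)
{
	/* 3 member disks, 128-sector chunks, discard of sectors [100, 1000) */
	split_discard(100, 1000, 3, 128);
	return 0;
}

For example, split_discard(100, 1000, 3, 128) reports [100, 384) on disk 0, [0, 360) on disk 1 and [0, 256) on disk 2, which together cover the original 900 sectors with one discard bio per member disk.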
@@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
  * may proceed without blocking. It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
-		return 0;
+		return;
 	if (mddev->ro)
-		return 0;
+		return;
 	if (!mddev->pers->sync_request)
-		return 0;
+		return;
 
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
@@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
 		spin_unlock(&mddev->lock);
 		md_update_sb(mddev, 0);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
-
-	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
-		return -EAGAIN;
-	else
-		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
...
@@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);
...
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end is the offset in zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
...
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 				break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
@@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
@@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
...
@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		plug = container_of(cb, struct raid10_plug_cb, cb);
 	else
 		plug = NULL;
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (plug) {
 		bio_list_add(&plug->pending, mbio);
 		plug->pending_cnt++;
 	} else {
+		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
-	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	if (!plug)
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
+	}
 }
 
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
...
@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+	/*
+	 * In case of journal device failures, submit_bio will get error
+	 * and calls endio, then active stripes will continue write
+	 * process. Therefore, it is not necessary to check Faulty bit
+	 * of journal device here.
+	 *
+	 * We can't check split_bio after current_bio is submitted. If
+	 * io->split_bio is null, after current_bio is submitted, current_bio
+	 * might already be completed and the io_unit is freed. We submit
+	 * split_bio first to avoid the issue.
+	 */
+	if (io->split_bio) {
+		if (io->has_flush)
+			io->split_bio->bi_opf |= REQ_PREFLUSH;
+		if (io->has_fua)
+			io->split_bio->bi_opf |= REQ_FUA;
+		submit_bio(io->split_bio);
+	}
+
 	if (io->has_flush)
 		io->current_bio->bi_opf |= REQ_PREFLUSH;
 	if (io->has_fua)
 		io->current_bio->bi_opf |= REQ_FUA;
 	submit_bio(io->current_bio);
-
-	if (!io->split_bio)
-		return;
-
-	if (io->has_flush)
-		io->split_bio->bi_opf |= REQ_PREFLUSH;
-	if (io->has_fua)
-		io->split_bio->bi_opf |= REQ_FUA;
-	submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 		return;
 	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 		mdname(mddev));
+
+	/* wait superblock change before suspend */
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
 	mddev_suspend(mddev);
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	mddev_resume(mddev);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 
 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flused to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,7 +2995,7 @@ static int r5l_load_log(struct r5l_log *log)
 	return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
 	if (!log)
 		return;
 
-	if (raid5_calc_degraded(conf) > 0 &&
+	if ((raid5_calc_degraded(conf) > 0 ||
+	     test_bit(Journal, &rdev->flags)) &&
 	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		schedule_work(&log->disable_writeback_work);
 }
...
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
 extern struct dma_async_tx_descriptor *
...
...@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) ...@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{ {
int i; int i;
local_irq_disable(); spin_lock_irq(conf->hash_locks);
spin_lock(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock); spin_lock(&conf->device_lock);
...@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) ...@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{ {
int i; int i;
spin_unlock(&conf->device_lock); spin_unlock(&conf->device_lock);
for (i = NR_STRIPE_HASH_LOCKS; i; i--) for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
spin_unlock(conf->hash_locks + i - 1); spin_unlock(conf->hash_locks + i);
local_irq_enable(); spin_unlock_irq(conf->hash_locks);
} }
/* Find first data disk in a raid6 stripe */ /* Find first data disk in a raid6 stripe */
...@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, ...@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags)) if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++; injournal++;
/* /*
* When quiesce in r5c write back, set STRIPE_HANDLE for stripes with * In the following cases, the stripe cannot be released to cached
* data in journal, so they are not released to cached lists * lists. Therefore, we make the stripe write out and set
* STRIPE_HANDLE:
* 1. when quiesce in r5c write back;
* 2. when resync is requested fot the stripe.
*/ */
if (conf->quiesce && r5c_is_writeback(conf->log) && if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
!test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { (conf->quiesce && r5c_is_writeback(conf->log) &&
!test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state)) if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh); r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
...@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh) ...@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{ {
local_irq_disable();
if (sh1 > sh2) { if (sh1 > sh2) {
spin_lock(&sh2->stripe_lock); spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1); spin_lock_nested(&sh1->stripe_lock, 1);
} else { } else {
spin_lock(&sh1->stripe_lock); spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1); spin_lock_nested(&sh2->stripe_lock, 1);
} }
} }
...@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) ...@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{ {
spin_unlock(&sh1->stripe_lock); spin_unlock(&sh1->stripe_lock);
spin_unlock(&sh2->stripe_lock); spin_unlock_irq(&sh2->stripe_lock);
local_irq_enable();
} }
/* Only freshly new full stripe normal write stripe can be added to a batch list */ /* Only freshly new full stripe normal write stripe can be added to a batch list */
...@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize) ...@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh; struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes); LIST_HEAD(newstripes);
struct disk_info *ndisks; struct disk_info *ndisks;
int err; int err = 0;
struct kmem_cache *sc; struct kmem_cache *sc;
int i; int i;
int hash, cnt; int hash, cnt;
err = md_allow_write(conf->mddev); md_allow_write(conf->mddev);
if (err)
return err;
/* Step 1 */ /* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
...@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) ...@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b), bdevname(rdev->bdev, b),
mdname(mddev), mdname(mddev),
conf->raid_disks - mddev->degraded); conf->raid_disks - mddev->degraded);
r5c_update_on_rdev_error(mddev); r5c_update_on_rdev_error(mddev, rdev);
} }
/* /*
...@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) ...@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* When LOG_CRITICAL, stripes with injournal == 0 will be sent to * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
* no_space_stripes list. * no_space_stripes list.
* *
* 3. during journal failure
* In journal failure, we try to flush all cached data to raid disks
* based on data in stripe cache. The array is read-only to upper
* layers, so we would skip all pending writes.
*
*/ */
static inline bool delay_towrite(struct r5conf *conf, static inline bool delay_towrite(struct r5conf *conf,
struct r5dev *dev, struct r5dev *dev,
...@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf, ...@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
s->injournal > 0) s->injournal > 0)
return true; return true;
/* case 3 above */
if (s->log_failed && s->injournal)
return true;
return false; return false;
} }
...@@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock); spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */ /*
if (!test_bit(STRIPE_DISCARD, &sh->state) && * Cannot process 'sync' concurrently with 'discard'.
* Flush data in r5cache before 'sync'.
*/
if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
!test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state); set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state);
...@@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n", " to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed, s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]); s.failed_num[0], s.failed_num[1]);
/* check if the array has lost more than max_degraded devices and, /*
* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed. * if so, some requests might need to be failed.
*
* When journal device failed (log_failed), we will only process
* the stripe if there is data need write to raid disks
*/ */
if (s.failed > conf->max_degraded || s.log_failed) { if (s.failed > conf->max_degraded ||
(s.log_failed && s.injournal == 0)) {
sh->check_state = 0; sh->check_state = 0;
sh->reconstruct_state = 0; sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0); break_stripe_batch_list(sh, 0);
...@@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) ...@@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
struct stripe_head *sh, *tmp; struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL; struct list_head *handle_list = NULL;
struct r5worker_group *wg; struct r5worker_group *wg;
bool second_try = !r5c_is_writeback(conf->log); bool second_try = !r5c_is_writeback(conf->log) &&
bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); !r5l_log_disk_error(conf);
bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
r5l_log_disk_error(conf);
again: again:
wg = NULL; wg = NULL;
...@@ -6313,7 +6332,6 @@ int ...@@ -6313,7 +6332,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size) raid5_set_cache_size(struct mddev *mddev, int size)
{ {
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
int err;
if (size <= 16 || size > 32768) if (size <= 16 || size > 32768)
return -EINVAL; return -EINVAL;
...@@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) ...@@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
; ;
mutex_unlock(&conf->cache_size_mutex); mutex_unlock(&conf->cache_size_mutex);
md_allow_write(mddev);
err = md_allow_write(mddev);
if (err)
return err;
mutex_lock(&conf->cache_size_mutex); mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes) while (size > conf->max_nr_stripes)
...@@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* neilb: there is no locking about new writes here, * neilb: there is no locking about new writes here,
* so this cannot be safe. * so this cannot be safe.
*/ */
if (atomic_read(&conf->active_stripes)) { if (atomic_read(&conf->active_stripes) ||
atomic_read(&conf->r5c_cached_full_stripes) ||
atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY; return -EBUSY;
} }
log_exit(conf); log_exit(conf);
......