Commit 8d54094e authored by Jens Axboe

Merge branch 'md-next' of https://github.com/liu-song-6/linux into for-5.3/block

Pull MD changes from Song.

* 'md-next' of https://github.com/liu-song-6/linux:
  md: add bitmap_abort label in md_run
  md-bitmap: create and destroy wb_info_pool with the change of bitmap
  md-bitmap: create and destroy wb_info_pool with the change of backlog
  md: introduce mddev_create/destroy_wb_pool for the change of member device
  md/raid1: fix potential data inconsistency issue with write behind device
parents 0ce35379 d494549a
...@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev) ...@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return; return;
md_bitmap_wait_behind_writes(mddev); md_bitmap_wait_behind_writes(mddev);
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
mutex_lock(&mddev->bitmap_info.mutex); mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
...@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev) ...@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
sector_t start = 0; sector_t start = 0;
sector_t sector = 0; sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap; struct bitmap *bitmap = mddev->bitmap;
struct md_rdev *rdev;
if (!bitmap) if (!bitmap)
goto out; goto out;
rdev_for_each(rdev, mddev)
mddev_create_wb_pool(mddev, rdev, true);
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
...@@ -2462,12 +2468,26 @@ static ssize_t ...@@ -2462,12 +2468,26 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len) backlog_store(struct mddev *mddev, const char *buf, size_t len)
{ {
unsigned long backlog; unsigned long backlog;
unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
int rv = kstrtoul(buf, 10, &backlog); int rv = kstrtoul(buf, 10, &backlog);
if (rv) if (rv)
return rv; return rv;
if (backlog > COUNTER_MAX) if (backlog > COUNTER_MAX)
return -EINVAL; return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog; mddev->bitmap_info.max_write_behind = backlog;
if (!backlog && mddev->wb_info_pool) {
/* wb_info_pool is not needed if backlog is zero */
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
} else if (backlog && !mddev->wb_info_pool) {
/* wb_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
mddev_create_wb_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
return len; return len;
} }
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
*/ */
#include <linux/sched/mm.h>
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
...@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev) ...@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max; mddev->sync_speed_max : sysctl_speed_limit_max;
} }
/*
 * Prepare @rdev for write-behind collision tracking.
 *
 * Returns 1 and sets WBCollisionCheck when tracking was armed; returns 0
 * for single-queue devices, which do not take part in collision checking.
 */
static int rdev_init_wb(struct md_rdev *rdev)
{
	if (rdev->bdev->bd_queue->nr_hw_queues == 1)
		return 0;

	/* Per-rdev bookkeeping for in-flight write-behind ranges. */
	INIT_LIST_HEAD(&rdev->wb_list);
	spin_lock_init(&rdev->wb_list_lock);
	init_waitqueue_head(&rdev->wb_io_wait);
	set_bit(WBCollisionCheck, &rdev->flags);

	return 1;
}
/*
 * Create wb_info_pool if rdev is the first multi-queue device flagged
 * with writemostly, also write-behind mode is enabled
 * (bitmap_info.max_write_behind != 0).
 *
 * @is_suspend: true when the caller already holds the array suspended;
 *              otherwise the pool creation is bracketed with
 *              mddev_suspend()/mddev_resume().
 */
void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
			  bool is_suspend)
{
	if (mddev->bitmap_info.max_write_behind == 0)
		return;

	/* Only writemostly multi-queue members need the pool. */
	if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
		return;

	if (mddev->wb_info_pool == NULL) {
		unsigned int noio_flag;

		if (!is_suspend)
			mddev_suspend(mddev);
		/* Allocate in NOIO scope to avoid recursing into I/O. */
		noio_flag = memalloc_noio_save();
		mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
						sizeof(struct wb_info));
		memalloc_noio_restore(noio_flag);
		if (!mddev->wb_info_pool)
			pr_err("can't alloc memory pool for writemostly\n");
		if (!is_suspend)
			mddev_resume(mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
/*
 * destroy wb_info_pool if rdev is the last device flagged with
 * WBCollisionCheck.
 */
static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	/* Clear this rdev's participation; bail if it was never armed. */
	if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
		return;

	if (mddev->wb_info_pool) {
		struct md_rdev *temp;
		int num = 0;

		/*
		 * Check if other rdevs need wb_info_pool.
		 */
		rdev_for_each(temp, mddev)
			if (temp != rdev &&
			    test_bit(WBCollisionCheck, &temp->flags))
				num++;
		if (!num) {
			/*
			 * Suspend the array so no write-behind I/O can be
			 * using the pool while it is torn down.
			 */
			mddev_suspend(rdev->mddev);
			mempool_destroy(mddev->wb_info_pool);
			mddev->wb_info_pool = NULL;
			mddev_resume(rdev->mddev);
		}
	}
}
static struct ctl_table_header *raid_table_header; static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = { static struct ctl_table raid_table[] = {
...@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) ...@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
rdev->mddev = mddev; rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b); pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
mddev_create_wb_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail; goto fail;
...@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) ...@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set); list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL; rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block"); sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_state);
...@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} }
} else if (cmd_match(buf, "writemostly")) { } else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0; err = 0;
} else if (cmd_match(buf, "-writemostly")) { } else if (cmd_match(buf, "-writemostly")) {
mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags); clear_bit(WriteMostly, &rdev->flags);
err = 0; err = 0;
} else if (cmd_match(buf, "blocked")) { } else if (cmd_match(buf, "blocked")) {
...@@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev) ...@@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev)
mddev->bitmap = bitmap; mddev->bitmap = bitmap;
} }
if (err) { if (err)
mddev_detach(mddev); goto bitmap_abort;
if (mddev->private)
pers->free(mddev, mddev->private); if (mddev->bitmap_info.max_write_behind > 0) {
mddev->private = NULL; bool creat_pool = false;
module_put(pers->owner);
md_bitmap_destroy(mddev); rdev_for_each(rdev, mddev) {
goto abort; if (test_bit(WriteMostly, &rdev->flags) &&
rdev_init_wb(rdev))
creat_pool = true;
}
if (creat_pool && mddev->wb_info_pool == NULL) {
mddev->wb_info_pool =
mempool_create_kmalloc_pool(NR_WB_INFOS,
sizeof(struct wb_info));
if (!mddev->wb_info_pool) {
err = -ENOMEM;
goto bitmap_abort;
}
}
} }
if (mddev->queue) { if (mddev->queue) {
bool nonrot = true; bool nonrot = true;
...@@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev) ...@@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev)
sysfs_notify(&mddev->kobj, NULL, "degraded"); sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0; return 0;
bitmap_abort:
mddev_detach(mddev);
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort: abort:
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set); bioset_exit(&mddev->sync_set);
...@@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev) ...@@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
} }
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
} }
void md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev)
......
...@@ -109,6 +109,14 @@ struct md_rdev { ...@@ -109,6 +109,14 @@ struct md_rdev {
* for reporting to userspace and storing * for reporting to userspace and storing
* in superblock. * in superblock.
*/ */
/*
* The members for check collision of write behind IOs.
*/
struct list_head wb_list;
spinlock_t wb_list_lock;
wait_queue_head_t wb_io_wait;
struct work_struct del_work; /* used for delayed sysfs removal */ struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state' struct kernfs_node *sysfs_state; /* handle for 'state'
...@@ -193,6 +201,10 @@ enum flag_bits { ...@@ -193,6 +201,10 @@ enum flag_bits {
* it didn't fail, so don't use FailFast * it didn't fail, so don't use FailFast
* any more for metadata * any more for metadata
*/ */
WBCollisionCheck, /*
* multiqueue device should check if there
* is collision between write behind bios.
*/
}; };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
...@@ -245,6 +257,14 @@ enum mddev_sb_flags { ...@@ -245,6 +257,14 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
}; };
#define NR_WB_INFOS 8
/* record current range of write behind IOs */
struct wb_info {
sector_t lo;
sector_t hi;
struct list_head list;
};
struct mddev { struct mddev {
void *private; void *private;
struct md_personality *pers; struct md_personality *pers;
...@@ -461,6 +481,7 @@ struct mddev { ...@@ -461,6 +481,7 @@ struct mddev {
*/ */
struct work_struct flush_work; struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */ struct work_struct event_work; /* used by dm to report failure event */
mempool_t *wb_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info; struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int good_device_nr; /* good device num within cluster raid */
...@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, ...@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force); extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
......
...@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); ...@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c" #include "raid1-10.c"
/*
 * Record the write-behind range [lo, hi) on @rdev unless it overlaps a
 * range already in flight.
 *
 * Returns 0 on success, -EBUSY when a collision with an existing entry in
 * rdev->wb_list was detected (the caller is expected to wait and retry).
 */
static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct wb_info *wi, *temp_wi;
	unsigned long flags;
	int ret = 0;
	struct mddev *mddev = rdev->mddev;

	/* Allocate before taking the lock: GFP_NOIO may sleep. */
	wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);

	spin_lock_irqsave(&rdev->wb_list_lock, flags);
	list_for_each_entry(temp_wi, &rdev->wb_list, list) {
		/* collision happened */
		if (hi > temp_wi->lo && lo < temp_wi->hi) {
			ret = -EBUSY;
			break;
		}
	}

	if (!ret) {
		wi->lo = lo;
		wi->hi = hi;
		list_add(&wi->list, &rdev->wb_list);
	} else
		mempool_free(wi, mddev->wb_info_pool);
	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
	return ret;
}
/*
 * Drop the write-behind range [lo, hi) previously recorded on @rdev by
 * check_and_add_wb(), then wake any writer waiting for the collision to
 * clear.
 */
static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct wb_info *wi;
	unsigned long flags;
	int found = 0;
	struct mddev *mddev = rdev->mddev;

	spin_lock_irqsave(&rdev->wb_list_lock, flags);
	list_for_each_entry(wi, &rdev->wb_list, list)
		if (hi == wi->hi && lo == wi->lo) {
			list_del(&wi->list);
			mempool_free(wi, mddev->wb_info_pool);
			found = 1;
			break;
		}

	/*
	 * WARN_ON() takes a condition; a string literal is a non-NULL
	 * pointer, so the old WARN_ON("...") always fired but never printed
	 * the message text. pr_warn() logs it as intended.
	 */
	if (!found)
		pr_warn("The write behind IO is not recorded\n");
	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
	wake_up(&rdev->wb_io_wait);
}
/* /*
* for resync bio, r1bio pointer can be retrieved from the per-bio * for resync bio, r1bio pointer can be retrieved from the per-bio
* 'struct resync_pages'. * 'struct resync_pages'.
...@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
} }
if (behind) { if (behind) {
if (test_bit(WBCollisionCheck, &rdev->flags)) {
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
remove_wb(rdev, lo, hi);
}
if (test_bit(WriteMostly, &rdev->flags)) if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining); atomic_dec(&r1_bio->behind_remaining);
...@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, ...@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) { if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) struct md_rdev *rdev = conf->mirrors[i].rdev;
if (test_bit(WBCollisionCheck, &rdev->flags)) {
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
wait_event(rdev->wb_io_wait,
check_and_add_wb(rdev, lo, hi) == 0);
}
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining); atomic_inc(&r1_bio->behind_remaining);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment