Commit 267d7b23 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md-3.4' of git://neil.brown.name/md

Pull md updates for 3.4 from Neil Brown:
 "Mostly tidying up code in preparation for some bigger changes next
  time.

  A few bug fixes tagged for -stable.

  Main functionality change is that some RAID10 arrays can now grow to
  use extra space that may have been made available on the individual
  devices."

Fixed up trivial conflicts with the k[un]map_atomic() cleanups in
drivers/md/bitmap.c.

* tag 'md-3.4' of git://neil.brown.name/md: (22 commits)
  md: Add judgement bb->unacked_exist in function md_ack_all_badblocks().
  md: fix clearing of the 'changed' flags for the bad blocks list.
  md/bitmap: discard CHUNK_BLOCK_SHIFT macro
  md/bitmap: remove unnecessary indirection when allocating.
  md/bitmap: remove some pointless locking.
  md/bitmap: change a 'goto' to a normal 'if' construct.
  md/bitmap: move printing of bitmap status to bitmap.c
  md/bitmap: remove some unused noise from bitmap.h
  md/raid10 - support resizing some RAID10 arrays.
  md/raid1: handle merge_bvec_fn in member devices.
  md/raid10: handle merge_bvec_fn in member devices.
  md: add proper merge_bvec handling to RAID0 and Linear.
  md: tidy up rdev_for_each usage.
  md/raid1,raid10: avoid deadlock during resync/recovery.
  md/bitmap: ensure to load bitmap when creating via sysfs.
  md: don't set md arrays to readonly on shutdown.
  md: allow re-add to failed arrays.
  md/raid5: use atomic_dec_return() instead of atomic_dec() and atomic_read().
  md: Use existed macros instead of numbers
  md/raid5: removed unused 'added_devices' variable.
  ...
parents 28f23d1f ecb178bb
This diff is collapsed.
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
#define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3 #define BITMAP_MAJOR_HOSTENDIAN 3
#define BITMAP_MINOR 39
/* /*
* in-memory bitmap: * in-memory bitmap:
* *
...@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t; ...@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
/* same, except a mask value for more efficient bitops */ /* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9 #define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */ /* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
/* when hijacked, the counters and bits represent even larger "chunks" */
/* there will be 1024 chunks represented by each counter in the page pointers */
#define PAGEPTR_BLOCK_RATIO(bitmap) \
(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
#define PAGEPTR_BLOCK_SHIFT(bitmap) \
(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
#endif #endif
...@@ -181,12 +168,6 @@ struct bitmap_page { ...@@ -181,12 +168,6 @@ struct bitmap_page {
unsigned int count:31; unsigned int count:31;
}; };
/* keep track of bitmap file pages that have pending writes on them */
struct page_list {
struct list_head list;
struct page *page;
};
/* the main bitmap structure - one per mddev */ /* the main bitmap structure - one per mddev */
struct bitmap { struct bitmap {
struct bitmap_page *bp; struct bitmap_page *bp;
...@@ -196,7 +177,7 @@ struct bitmap { ...@@ -196,7 +177,7 @@ struct bitmap {
struct mddev *mddev; /* the md device that the bitmap is for */ struct mddev *mddev; /* the md device that the bitmap is for */
/* bitmap chunksize -- how much data does each bit represent? */ /* bitmap chunksize -- how much data does each bit represent? */
unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */ unsigned long chunks; /* total number of data chunks for the array */
__u64 events_cleared; __u64 events_cleared;
...@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev); ...@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
void bitmap_print_sb(struct bitmap *bitmap); void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap); void bitmap_update_sb(struct bitmap *bitmap);
void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap);
......
...@@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size) ...@@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev) static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct md_rdev *r, *t; struct md_rdev *r;
uint64_t failed_devices; uint64_t failed_devices;
struct dm_raid_superblock *sb; struct dm_raid_superblock *sb;
sb = page_address(rdev->sb_page); sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices); failed_devices = le64_to_cpu(sb->failed_devices);
rdev_for_each(r, t, mddev) rdev_for_each(r, mddev)
if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
failed_devices |= (1ULL << r->raid_disk); failed_devices |= (1ULL << r->raid_disk);
...@@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) ...@@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
struct dm_raid_superblock *sb; struct dm_raid_superblock *sb;
uint32_t new_devs = 0; uint32_t new_devs = 0;
uint32_t rebuilds = 0; uint32_t rebuilds = 0;
struct md_rdev *r, *t; struct md_rdev *r;
struct dm_raid_superblock *sb2; struct dm_raid_superblock *sb2;
sb = page_address(rdev->sb_page); sb = page_address(rdev->sb_page);
...@@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) ...@@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* case the In_sync bit will /not/ be set and * case the In_sync bit will /not/ be set and
* recovery_cp must be MaxSector. * recovery_cp must be MaxSector.
*/ */
rdev_for_each(r, t, mddev) { rdev_for_each(r, mddev) {
if (!test_bit(In_sync, &r->flags)) { if (!test_bit(In_sync, &r->flags)) {
DMINFO("Device %d specified for rebuild: " DMINFO("Device %d specified for rebuild: "
"Clearing superblock", r->raid_disk); "Clearing superblock", r->raid_disk);
...@@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) ...@@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* Now we set the Faulty bit for those devices that are * Now we set the Faulty bit for those devices that are
* recorded in the superblock as failed. * recorded in the superblock as failed.
*/ */
rdev_for_each(r, t, mddev) { rdev_for_each(r, mddev) {
if (!r->sb_page) if (!r->sb_page)
continue; continue;
sb2 = page_address(r->sb_page); sb2 = page_address(r->sb_page);
...@@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{ {
int ret; int ret;
struct md_rdev *rdev, *freshest, *tmp; struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
freshest = NULL; freshest = NULL;
rdev_for_each(rdev, tmp, mddev) { rdev_for_each(rdev, mddev) {
if (!rdev->meta_bdev) if (!rdev->meta_bdev)
continue; continue;
...@@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) ...@@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(mddev, freshest)) if (super_validate(mddev, freshest))
return -EINVAL; return -EINVAL;
rdev_for_each(rdev, tmp, mddev) rdev_for_each(rdev, mddev)
if ((rdev != freshest) && super_validate(mddev, rdev)) if ((rdev != freshest) && super_validate(mddev, rdev))
return -EINVAL; return -EINVAL;
......
...@@ -315,7 +315,7 @@ static int run(struct mddev *mddev) ...@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
} }
conf->nfaults = 0; conf->nfaults = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) rdev_for_each(rdev, mddev)
conf->rdev = rdev; conf->rdev = rdev;
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
......
...@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q, ...@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
struct dev_info *dev0; struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int maxbytes = biovec->bv_len;
struct request_queue *subq;
rcu_read_lock(); rcu_read_lock();
dev0 = which_dev(mddev, sector); dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector; maxsectors = dev0->end_sector - sector;
subq = bdev_get_queue(dev0->rdev->bdev);
if (subq->merge_bvec_fn) {
bvm->bi_bdev = dev0->rdev->bdev;
bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
biovec));
}
rcu_read_unlock(); rcu_read_unlock();
if (maxsectors < bio_sectors) if (maxsectors < bio_sectors)
...@@ -80,11 +89,11 @@ static int linear_mergeable_bvec(struct request_queue *q, ...@@ -80,11 +89,11 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxsectors -= bio_sectors; maxsectors -= bio_sectors;
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
return biovec->bv_len; return maxbytes;
/* The bytes available at this offset could be really big,
* so we cap at 2^31 to avoid overflow */ if (maxsectors > (maxbytes >> 9))
if (maxsectors > (1 << (31-9))) return maxbytes;
return 1<<31; else
return maxsectors << 9; return maxsectors << 9;
} }
...@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
cnt = 0; cnt = 0;
conf->array_sectors = 0; conf->array_sectors = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
int j = rdev->raid_disk; int j = rdev->raid_disk;
struct dev_info *disk = conf->disks + j; struct dev_info *disk = conf->disks + j;
sector_t sectors; sector_t sectors;
...@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit max_segments to 1 lying within
* a single page.
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
conf->array_sectors += rdev->sectors; conf->array_sectors += rdev->sectors;
cnt++; cnt++;
......
This diff is collapsed.
...@@ -128,6 +128,10 @@ struct md_rdev { ...@@ -128,6 +128,10 @@ struct md_rdev {
enum flag_bits { enum flag_bits {
Faulty, /* device is known to have a fault */ Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */ In_sync, /* device is in_sync with rest of array */
Unmerged, /* device is being added to array and should
* be considered for merge_bvec_fn but not
* yet for actual IO
*/
WriteMostly, /* Avoid reading if at all possible */ WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */ AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet Blocked, /* An error occurred but has not yet
...@@ -345,6 +349,10 @@ struct mddev { ...@@ -345,6 +349,10 @@ struct mddev {
int degraded; /* whether md should consider int degraded; /* whether md should consider
* adding a spare * adding a spare
*/ */
int merge_check_needed; /* at least one
* member device
* has a
* merge_bvec_fn */
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
...@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) ...@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
/* /*
* iterates through the 'same array disks' ringlist * iterates through the 'same array disks' ringlist
*/ */
#define rdev_for_each(rdev, tmp, mddev) \ #define rdev_for_each(rdev, mddev) \
list_for_each_entry(rdev, &((mddev)->disks), same_set)
#define rdev_for_each_safe(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
#define rdev_for_each_rcu(rdev, mddev) \ #define rdev_for_each_rcu(rdev, mddev) \
......
...@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev) ...@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
} }
working_disks = 0; working_disks = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk; disk_idx = rdev->raid_disk;
if (disk_idx < 0 || if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks) disk_idx >= mddev->raid_disks)
......
...@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
if (!conf) if (!conf)
return -ENOMEM; return -ENOMEM;
list_for_each_entry(rdev1, &mddev->disks, same_set) { rdev_for_each(rdev1, mddev) {
pr_debug("md/raid0:%s: looking at %s\n", pr_debug("md/raid0:%s: looking at %s\n",
mdname(mddev), mdname(mddev),
bdevname(rdev1->bdev, b)); bdevname(rdev1->bdev, b));
...@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors); sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors; rdev1->sectors = sectors * mddev->chunk_sectors;
list_for_each_entry(rdev2, &mddev->disks, same_set) { rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)" pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n", " with %s(%llu)\n",
mdname(mddev), mdname(mddev),
...@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
smallest = NULL; smallest = NULL;
dev = conf->devlist; dev = conf->devlist;
err = -EINVAL; err = -EINVAL;
list_for_each_entry(rdev1, &mddev->disks, same_set) { rdev_for_each(rdev1, mddev) {
int j = rdev1->raid_disk; int j = rdev1->raid_disk;
if (mddev->level == 10) { if (mddev->level == 10) {
...@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
disk_stack_limits(mddev->gendisk, rdev1->bdev, disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9); rdev1->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_segments to 1, lying within
* a single page.
*/
if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
blk_queue_max_segments(mddev->queue, 1); conf->has_merge_bvec = 1;
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
if (!smallest || (rdev1->sectors < smallest->sectors)) if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1; smallest = rdev1;
cnt++; cnt++;
...@@ -290,8 +284,64 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -290,8 +284,64 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
return err; return err;
} }
/*
 * Locate the strip zone that contains the offset *sectorp.
 *
 * The zones are scanned in order and the first one whose zone_end lies
 * beyond the offset is returned.  For every zone except the first,
 * *sectorp is rewritten relative to the end of the preceding zone; for
 * the first zone it is left untouched.  If no zone covers the offset
 * the function ends in BUG().
 */
static struct strip_zone *find_zone(struct r0conf *conf,
				    sector_t *sectorp)
{
	struct strip_zone *zones = conf->strip_zone;
	sector_t offset = *sectorp;
	int idx = 0;

	while (idx < conf->nr_strip_zones) {
		if (offset < zones[idx].zone_end) {
			if (idx != 0)
				*sectorp = offset - zones[idx - 1].zone_end;
			return &zones[idx];
		}
		idx++;
	}
	BUG();
}
/*
 * Map a sector onto the member device that holds it.
 *
 * @sector is the position used to derive the offset inside a chunk and
 * the device index within @zone; *@sector_offset is the zone-relative
 * offset on entry and is replaced by (chunk * chunk_sects) + the offset
 * inside the chunk on return.  Two flows are kept for the sake of
 * performance: mask/shift arithmetic when the chunk size is a power of
 * two, and a general division-based flow otherwise.
 *
 * NOTE(review): relies on sector_div() dividing its first argument in
 * place and yielding the remainder -- confirm against its definition.
 */
static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
				  sector_t sector, sector_t *sector_offset)
{
	struct r0conf *conf = mddev->private;
	unsigned int chunk_sects = mddev->chunk_sectors;
	int raid_disks = conf->strip_zone[0].nb_dev;
	/* starts as the offset in the zone, ends as the chunk in the device */
	sector_t chunk = *sector_offset;
	unsigned int sect_in_chunk;

	if (!is_power_of_2(chunk_sects)) {
		/* general flow: plain divisions */
		sect_in_chunk = sector_div(sector, chunk_sects);
		sector_div(chunk, chunk_sects * zone->nb_dev);
	} else {
		int chunksect_bits = ffz(~chunk_sects);

		/* sector offset inside the chunk */
		sect_in_chunk = sector & (chunk_sects - 1);
		sector >>= chunksect_bits;
		/* quotient is the chunk in the real device */
		sector_div(chunk, zone->nb_dev << chunksect_bits);
	}

	/*
	 * Position over the real device:
	 * chunk in device + position in the chunk.
	 */
	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;

	return conf->devlist[(zone - conf->strip_zone)*raid_disks
			     + sector_div(sector, zone->nb_dev)];
}
/** /**
* raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue * @q: request queue
* @bvm: properties of new bio * @bvm: properties of new bio
* @biovec: the request that could be merged to it. * @biovec: the request that could be merged to it.
...@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, ...@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
struct bio_vec *biovec) struct bio_vec *biovec)
{ {
struct mddev *mddev = q->queuedata; struct mddev *mddev = q->queuedata;
struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
sector_t sector_offset = sector;
int max; int max;
unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9; unsigned int bio_sectors = bvm->bi_size >> 9;
struct strip_zone *zone;
struct md_rdev *rdev;
struct request_queue *subq;
if (is_power_of_2(chunk_sectors)) if (is_power_of_2(chunk_sectors))
max = (chunk_sectors - ((sector & (chunk_sectors-1)) max = (chunk_sectors - ((sector & (chunk_sectors-1))
...@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q, ...@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
else else
max = (chunk_sectors - (sector_div(sector, chunk_sectors) max = (chunk_sectors - (sector_div(sector, chunk_sectors)
+ bio_sectors)) << 9; + bio_sectors)) << 9;
if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max < 0)
max = 0; /* bio_add cannot handle a negative return */
if (max <= biovec->bv_len && bio_sectors == 0) if (max <= biovec->bv_len && bio_sectors == 0)
return biovec->bv_len; return biovec->bv_len;
else if (max < biovec->bv_len)
/* too small already, no need to check further */
return max;
if (!conf->has_merge_bvec)
return max;
/* May need to check subordinate device */
sector = sector_offset;
zone = find_zone(mddev->private, &sector_offset);
rdev = map_sector(mddev, zone, sector, &sector_offset);
subq = bdev_get_queue(rdev->bdev);
if (subq->merge_bvec_fn) {
bvm->bi_bdev = rdev->bdev;
bvm->bi_sector = sector_offset + zone->dev_start +
rdev->data_offset;
return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
} else
return max; return max;
} }
...@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks ...@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
WARN_ONCE(sectors || raid_disks, WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__); "%s does not support generic reshape\n", __func__);
list_for_each_entry(rdev, &mddev->disks, same_set) rdev_for_each(rdev, mddev)
array_sectors += rdev->sectors; array_sectors += rdev->sectors;
return array_sectors; return array_sectors;
...@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev) ...@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
return 0; return 0;
} }
/*
 * Return the strip zone holding the offset *sectorp.
 *
 * The first zone whose zone_end is greater than the offset wins.  When
 * that zone is not the first one, *sectorp is converted into an offset
 * measured from the end of the previous zone; otherwise it is not
 * modified.  Falling out of the loop means no zone matched and triggers
 * BUG().
 */
static struct strip_zone *find_zone(struct r0conf *conf,
				    sector_t *sectorp)
{
	struct strip_zone *zones = conf->strip_zone;
	sector_t offset = *sectorp;
	int n;

	for (n = 0; n < conf->nr_strip_zones; n++) {
		if (!(offset < zones[n].zone_end))
			continue;
		if (n)
			*sectorp = offset - zones[n - 1].zone_end;
		return zones + n;
	}
	BUG();
}
/*
 * Remap a sector to its target member device.
 *
 * The device index and the offset inside the chunk are derived from
 * @sector; *@sector_offset carries the zone-relative offset in and the
 * value (chunk * chunk_sects) + offset-in-chunk out.  A power-of-two
 * chunk size takes a mask/shift flow, any other size takes the general
 * division flow, for the sake of performance.
 *
 * NOTE(review): assumes sector_div() updates its first argument with
 * the quotient and evaluates to the remainder -- confirm.
 */
static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
				  sector_t sector, sector_t *sector_offset)
{
	struct r0conf *conf = mddev->private;
	unsigned int chunk_sects = mddev->chunk_sectors;
	int raid_disks = conf->strip_zone[0].nb_dev;
	sector_t chunk = *sector_offset;	/* chunk in zone */
	unsigned int sect_in_chunk;

	if (is_power_of_2(chunk_sects)) {
		int chunksect_bits = ffz(~chunk_sects);

		/* find the sector offset inside the chunk */
		sect_in_chunk = sector & (chunk_sects - 1);
		sector >>= chunksect_bits;
		/* quotient is the chunk in the real device */
		sector_div(chunk, zone->nb_dev << chunksect_bits);
	} else {
		sect_in_chunk = sector_div(sector, chunk_sects);
		sector_div(chunk, chunk_sects * zone->nb_dev);
	}

	/* real position = chunk in device + position in the chunk */
	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;

	return conf->devlist[(zone - conf->strip_zone)*raid_disks
			     + sector_div(sector, zone->nb_dev)];
}
/* /*
* Is io distribute over 1 or more chunks ? * Is io distribute over 1 or more chunks ?
*/ */
...@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) ...@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
/* check slot number for a disk */ /* check slot number for a disk */
if (rdev->raid_disk == mddev->raid_disks-1) { if (rdev->raid_disk == mddev->raid_disks-1) {
printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
......
...@@ -9,8 +9,11 @@ struct strip_zone { ...@@ -9,8 +9,11 @@ struct strip_zone {
struct r0conf { struct r0conf {
struct strip_zone *strip_zone; struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones; int nr_strip_zones;
int has_merge_bvec; /* at least one member has
* a merge_bvec_fn */
}; };
#endif #endif
...@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect ...@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev); rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL || rdev == NULL
|| test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags)) || test_bit(Faulty, &rdev->flags))
continue; continue;
if (!test_bit(In_sync, &rdev->flags) && if (!test_bit(In_sync, &rdev->flags) &&
...@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect ...@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk; return best_disk;
} }
/*
 * raid1_mergeable_bvec -- merge_bvec_fn for a RAID1 array.
 * @q:      request queue of the array; q->queuedata is the mddev
 * @bvm:    properties of the bio being built
 * @biovec: the segment that could be added to it
 *
 * Starts from biovec->bv_len and, when mddev->merge_check_needed is set,
 * lowers that to the smallest value returned by the merge_bvec_fn of any
 * non-Faulty member device that has one.  Devices are only skipped for
 * being absent or Faulty, so a member still flagged Unmerged is consulted
 * as well.
 *
 * Note that @bvm is modified: bi_bdev and bi_sector are rewritten to
 * address each member device in turn before its merge_bvec_fn is called,
 * and are not restored afterwards.
 *
 * Returns the resulting limit (at most biovec->bv_len).
 */
static int raid1_mergeable_bvec(struct request_queue *q,
				struct bvec_merge_data *bvm,
				struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	struct r1conf *conf = mddev->private;
	/* array-relative sector, captured before bvm is retargeted below */
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max = biovec->bv_len;

	if (mddev->merge_check_needed) {
		int disk;

		rcu_read_lock();
		for (disk = 0; disk < conf->raid_disks * 2; disk++) {
			struct md_rdev *rdev = rcu_dereference(
				conf->mirrors[disk].rdev);
			/*
			 * Named subq (not q) so it does not shadow the
			 * array's own queue passed in as the parameter.
			 */
			struct request_queue *subq;

			if (!rdev || test_bit(Faulty, &rdev->flags))
				continue;

			subq = bdev_get_queue(rdev->bdev);
			if (!subq->merge_bvec_fn)
				continue;

			/* re-express the request in the member's terms */
			bvm->bi_sector = sector + rdev->data_offset;
			bvm->bi_bdev = rdev->bdev;
			max = min(max, subq->merge_bvec_fn(subq, bvm, biovec));
		}
		rcu_read_unlock();
	}
	return max;
}
int md_raid1_congested(struct mddev *mddev, int bits) int md_raid1_congested(struct mddev *mddev, int bits)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
...@@ -737,7 +771,20 @@ static void wait_barrier(struct r1conf *conf) ...@@ -737,7 +771,20 @@ static void wait_barrier(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
if (conf->barrier) { if (conf->barrier) {
conf->nr_waiting++; conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier, /* Wait for the barrier to drop.
* However if there are already pending
* requests (preventing the barrier from
* rising completely), and the
* pre-process bio queue isn't empty,
* then don't wait, as we need to empty
* that queue to get the nr_pending
* count down.
*/
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
conf->resync_lock, conf->resync_lock,
); );
conf->nr_waiting--; conf->nr_waiting--;
...@@ -1002,7 +1049,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1002,7 +1049,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
break; break;
} }
r1_bio->bios[i] = NULL; r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) { if (!rdev || test_bit(Faulty, &rdev->flags)
|| test_bit(Unmerged, &rdev->flags)) {
if (i < conf->raid_disks) if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state); set_bit(R1BIO_Degraded, &r1_bio->state);
continue; continue;
...@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p; struct mirror_info *p;
int first = 0; int first = 0;
int last = conf->raid_disks - 1; int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled) if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY; return -EBUSY;
...@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk; first = last = rdev->raid_disk;
if (q->merge_bvec_fn) {
set_bit(Unmerged, &rdev->flags);
mddev->merge_check_needed = 1;
}
for (mirror = first; mirror <= last; mirror++) { for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror; p = conf->mirrors+mirror;
if (!p->rdev) { if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must
* never risk violating it, so limit
* ->max_segments to one lying with a single
* page, as a one page request is never in
* violation.
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
p->head_position = 0; p->head_position = 0;
rdev->raid_disk = mirror; rdev->raid_disk = mirror;
...@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break; break;
} }
} }
if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
/* Some requests might not have seen this new
* merge_bvec_fn. We must wait for them to complete
* before merging the device fully.
* First we make sure any code which has tested
* our function has submitted the request, then
* we wait for all outstanding requests to complete.
*/
synchronize_sched();
raise_barrier(conf);
lower_barrier(conf);
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev); md_integrity_add_rdev(rdev, mddev);
print_conf(conf); print_conf(conf);
return err; return err;
...@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL; err = -EINVAL;
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk; int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks if (disk_idx >= mddev->raid_disks
|| disk_idx < 0) || disk_idx < 0)
...@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev) ...@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf)) if (IS_ERR(conf))
return PTR_ERR(conf); return PTR_ERR(conf);
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
if (!mddev->gendisk) if (!mddev->gendisk)
continue; continue;
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_segments to 1 lying within
* a single page, as a one page request is never in violation.
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
} }
mddev->degraded = 0; mddev->degraded = 0;
...@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev) ...@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) { if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_data = mddev;
blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
} }
return md_integrity_register(mddev); return md_integrity_register(mddev);
} }
......
This diff is collapsed.
...@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) ...@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
} else { } else {
BUG_ON(stripe_operations_active(sh)); BUG_ON(stripe_operations_active(sh));
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_dec(&conf->preread_active_stripes); if (atomic_dec_return(&conf->preread_active_stripes)
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
}
atomic_dec(&conf->active_stripes); atomic_dec(&conf->active_stripes);
if (!test_bit(STRIPE_EXPANDING, &sh->state)) { if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list); list_add_tail(&sh->lru, &conf->inactive_list);
...@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) ...@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
pr_debug("raid456: run(%s) called.\n", mdname(mddev)); pr_debug("raid456: run(%s) called.\n", mdname(mddev));
list_for_each_entry(rdev, &mddev->disks, same_set) { rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk; raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks if (raid_disk >= max_disks
|| raid_disk < 0) || raid_disk < 0)
...@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev) ...@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size * blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded)); (conf->raid_disks - conf->max_degraded));
list_for_each_entry(rdev, &mddev->disks, same_set) rdev_for_each(rdev, mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
} }
...@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->recovery_disabled == conf->recovery_disabled) if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY; return -EBUSY;
if (has_failed(conf)) if (rdev->saved_raid_disk < 0 && has_failed(conf))
/* no point adding a device */ /* no point adding a device */
return -EINVAL; return -EINVAL;
...@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev)) if (!check_stripe_cache(mddev))
return -ENOSPC; return -ENOSPC;
list_for_each_entry(rdev, &mddev->disks, same_set) rdev_for_each(rdev, mddev)
if (!test_bit(In_sync, &rdev->flags) if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) && !test_bit(Faulty, &rdev->flags))
spares++; spares++;
...@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
* such devices during the reshape and confusion could result. * such devices during the reshape and confusion could result.
*/ */
if (mddev->delta_disks >= 0) { if (mddev->delta_disks >= 0) {
int added_devices = 0; rdev_for_each(rdev, mddev)
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk < 0 && if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) { !test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev) == 0) { if (raid5_add_disk(mddev, rdev) == 0) {
if (rdev->raid_disk if (rdev->raid_disk
>= conf->previous_raid_disks) { >= conf->previous_raid_disks)
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
added_devices++; else
} else
rdev->recovery_offset = 0; rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev)) if (sysfs_link_rdev(mddev, rdev))
...@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
&& !test_bit(Faulty, &rdev->flags)) { && !test_bit(Faulty, &rdev->flags)) {
/* This is a spare that was manually added */ /* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
added_devices++;
} }
/* When a reshape changes the number of devices, /* When a reshape changes the number of devices,
...@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
conf->reshape_progress = MaxSector; conf->reshape_progress = MaxSector;
mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
return -EAGAIN; return -EAGAIN;
} }
......
...@@ -281,6 +281,10 @@ struct mdp_superblock_1 { ...@@ -281,6 +281,10 @@ struct mdp_superblock_1 {
* active device with same 'role'. * active device with same 'role'.
* 'recovery_offset' is also set. * 'recovery_offset' is also set.
*/ */
#define MD_FEATURE_ALL (1|2|4|8|16) #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
|MD_FEATURE_BAD_BLOCKS \
|MD_FEATURE_REPLACEMENT)
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment