Commit 09b63e46 authored by Neil Brown's avatar Neil Brown Committed by Linus Torvalds

[PATCH] md: Remove per-personality 'operational' and 'write_only' flags

raid1, raid5 and multipath maintain their own
'operational' flag.  This is equivalent to
   !rdev->faulty
and so isn't needed.
Similarly raid1 and raid5 maintain a "write_only" flag
that is equivalent to
   !rdev->in_sync
so it isn't needed either.

As part of implementing this change, we introduce some extra
flag bits in raid5 that are meaningful only inside 'handle_stripe'.
Some of these replace the "action" array which recorded what
actions were required (and would be performed after the stripe
spinlock was released).  This has the advantage of reducing our
dependence on MD_SB_DISKS which personalities shouldn't need
to know about.
parent 0ce3712f
......@@ -365,9 +365,6 @@ static void free_disk_sb(mdk_rdev_t * rdev)
rdev->sb_page = NULL;
rdev->sb_offset = 0;
rdev->size = 0;
} else {
if (!rdev->faulty)
MD_BUG();
}
}
......@@ -586,7 +583,6 @@ static void export_rdev(mdk_rdev_t * rdev)
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
unlock_rdev(rdev);
rdev->faulty = 0;
kfree(rdev);
}
......@@ -671,9 +667,9 @@ static void print_sb(mdp_super_t *sb)
static void print_rdev(mdk_rdev_t *rdev)
{
printk(KERN_INFO "md: rdev %s, SZ:%08ld F:%d DN:%d ",
printk(KERN_INFO "md: rdev %s, SZ:%08ld F:%d S:%d DN:%d ",
bdev_partition_name(rdev->bdev),
rdev->size, rdev->faulty, rdev->desc_nr);
rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr);
if (rdev->sb) {
printk(KERN_INFO "md: rdev superblock:\n");
print_sb(rdev->sb);
......@@ -1006,6 +1002,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
}
rdev->desc_nr = -1;
rdev->faulty = 0;
rdev->in_sync = 0;
atomic_set(&rdev->nr_pending, 0);
size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
......@@ -2182,14 +2179,13 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
mdk_rdev_t *rdev;
int ret;
rdev = find_rdev(mddev, dev);
if (!rdev)
return 0;
ret = md_error(mddev, rdev);
return ret;
md_error(mddev, rdev);
return 1;
}
static int md_ioctl(struct inode *inode, struct file *file,
......@@ -2604,9 +2600,8 @@ static void md_recover_arrays(void)
}
int md_error(mddev_t *mddev, mdk_rdev_t *rdev)
void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
MD_MAJOR,mdidx(mddev),MAJOR(bdev->bd_dev),MINOR(bdev->bd_dev),
__builtin_return_address(0),__builtin_return_address(1),
......@@ -2614,25 +2609,15 @@ int md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!mddev) {
MD_BUG();
return 0;
return;
}
if (!rdev || rdev->faulty)
return 0;
if (!mddev->pers->error_handler
|| mddev->pers->error_handler(mddev,rdev) <= 0) {
rdev->faulty = 1;
rdev->in_sync = 0;
} else
return 1;
/*
* if recovery was running, stop it now.
*/
if (mddev->recovery_running)
mddev->recovery_running = -EIO;
return;
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
md_recover_arrays();
return 0;
}
static int status_unused(char * page)
......@@ -3510,7 +3495,7 @@ static int __init raid_setup(char *str)
return 1;
}
int __init md_run_setup(void)
static int __init md_run_setup(void)
{
if (raid_setup_args.noautodetect)
printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
......
......@@ -70,7 +70,7 @@ static void mp_pool_free(void *mpb, void *data)
kfree(mpb);
}
static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdev)
static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
int i, disks = MD_SB_DISKS;
......@@ -82,10 +82,10 @@ static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdev)
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks; i++) {
if (conf->multipaths[i].operational &&
conf->multipaths[i].rdev) {
*rdev = conf->multipaths[i].rdev;
atomic_inc(&(*rdev)->nr_pending);
mdk_rdev_t *rdev = conf->multipaths[i].rdev;
if (rdev && rdev->in_sync) {
*rdevp = rdev;
atomic_inc(&rdev->nr_pending);
spin_unlock_irq(&conf->device_lock);
return 0;
}
......@@ -158,10 +158,11 @@ static int multipath_read_balance (multipath_conf_t *conf)
{
int disk;
for (disk = 0; disk < MD_SB_DISKS; disk++)
if (conf->multipaths[disk].operational &&
conf->multipaths[disk].rdev)
for (disk = 0; disk < MD_SB_DISKS; disk++) {
mdk_rdev_t *rdev = conf->multipaths[disk].rdev;
if (rdev && rdev->in_sync)
return disk;
}
BUG();
return 0;
}
......@@ -204,7 +205,8 @@ static int multipath_status (char *page, mddev_t *mddev)
conf->working_disks);
for (i = 0; i < conf->raid_disks; i++)
sz += sprintf (page+sz, "%s",
conf->multipaths[i].operational ? "U" : "_");
conf->multipaths[i].rdev &&
conf->multipaths[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]");
return sz;
}
......@@ -219,28 +221,13 @@ static int multipath_status (char *page, mddev_t *mddev)
"multipath: IO failure on %s, disabling IO path. \n" \
" Operation continuing on %d IO paths.\n"
static void mark_disk_bad (mddev_t *mddev, int failed)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct multipath_info *multipath = conf->multipaths+failed;
multipath->operational = 0;
mddev->sb_dirty = 1;
conf->working_disks--;
printk (DISK_FAILED, bdev_partition_name (multipath->rdev->bdev),
conf->working_disks);
}
/*
* Careful, this can execute in IRQ contexts as well!
*/
static int multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct multipath_info * multipaths = conf->multipaths;
int disks = MD_SB_DISKS;
int i;
if (conf->working_disks <= 1) {
/*
......@@ -248,24 +235,21 @@ static int multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
* first check if this is a queued request for a device
* which has just failed.
*/
for (i = 0; i < disks; i++) {
if (multipaths[i].rdev == rdev && !multipaths[i].operational)
return 0;
}
printk (LAST_DISK);
return 1; /* leave it active... it's all we have */
/* leave it active... it's all we have */
} else {
/*
* Mark disk as unusable
*/
for (i = 0; i < disks; i++) {
if (multipaths[i].rdev == rdev && multipaths[i].operational) {
mark_disk_bad(mddev, i);
break;
}
if (!rdev->faulty) {
rdev->in_sync = 0;
rdev->faulty = 1;
mddev->sb_dirty = 1;
conf->working_disks--;
printk (DISK_FAILED, bdev_partition_name (rdev->bdev),
conf->working_disks);
}
}
return 0;
}
#undef LAST_DISK
......@@ -290,7 +274,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
tmp = conf->multipaths + i;
if (tmp->rdev)
printk(" disk%d, o:%d, dev:%s\n",
i,tmp->operational,
i,!tmp->rdev->faulty,
bdev_partition_name(tmp->rdev->bdev));
}
}
......@@ -308,7 +292,6 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
for (path=0; path<mddev->raid_disks; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
p->rdev = rdev;
p->operational = 1;
conf->working_disks++;
rdev->raid_disk = path;
found = 1;
......@@ -329,8 +312,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
spin_lock_irq(&conf->device_lock);
if (p->rdev) {
if (p->operational ||
(p->rdev && atomic_read(&p->rdev->nr_pending))) {
if (p->rdev->in_sync ||
atomic_read(&p->rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified but is still operational!\n", number);
err = -EBUSY;
goto abort;
......@@ -474,19 +457,9 @@ static int multipath_run (mddev_t *mddev)
disk = conf->multipaths + disk_idx;
disk->rdev = rdev;
if (rdev->faulty)
disk->operational = 0;
else {
/*
* Mark all disks as active to start with, there are no
* spares. multipath_read_balance deals with choose
* the "best" operational device.
*/
disk->operational = 1;
if (!rdev->faulty)
conf->working_disks++;
}
}
conf->raid_disks = mddev->raid_disks;
mddev->sb_dirty = 1;
......
......@@ -188,7 +188,7 @@ static inline void put_buf(r1bio_t *r1_bio)
mempool_free(r1_bio, conf->r1buf_pool);
}
static int map(mddev_t *mddev, mdk_rdev_t **rdev)
static int map(mddev_t *mddev, mdk_rdev_t **rdevp)
{
conf_t *conf = mddev_to_conf(mddev);
int i, disks = conf->raid_disks;
......@@ -200,11 +200,10 @@ static int map(mddev_t *mddev, mdk_rdev_t **rdev)
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks; i++) {
if (conf->mirrors[i].operational
&& !conf->mirrors[i].write_only
&& conf->mirrors[i].rdev) {
*rdev = conf->mirrors[i].rdev;
atomic_inc(&(*rdev)->nr_pending);
mdk_rdev_t *rdev = conf->mirrors[i].rdev;
if (rdev && rdev->in_sync) {
*rdevp = rdev;
atomic_inc(&rdev->nr_pending);
spin_unlock_irq(&conf->device_lock);
return 0;
}
......@@ -346,7 +345,9 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
/* make sure that disk is operational */
new_disk = 0;
while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
while (!conf->mirrors[new_disk].rdev ||
!conf->mirrors[new_disk].rdev->in_sync) {
new_disk++;
if (new_disk == conf->raid_disks) {
new_disk = 0;
......@@ -358,7 +359,8 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
/* make sure the disk is operational */
while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
while (!conf->mirrors[new_disk].rdev ||
!conf->mirrors[new_disk].rdev->in_sync) {
if (new_disk <= 0)
new_disk = conf->raid_disks;
new_disk--;
......@@ -387,8 +389,8 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
disk = conf->raid_disks;
disk--;
if (conf->mirrors[disk].write_only ||
!conf->mirrors[disk].operational)
if (!conf->mirrors[disk].rdev ||
!conf->mirrors[disk].rdev->in_sync)
continue;
if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) {
......@@ -509,8 +511,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
*/
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks; i++) {
if (conf->mirrors[i].operational &&
conf->mirrors[i].rdev) {
if (conf->mirrors[i].rdev &&
!conf->mirrors[i].rdev->faulty) {
atomic_inc(&conf->mirrors[i].rdev->nr_pending);
r1_bio->write_bios[i] = bio;
} else
......@@ -573,7 +575,8 @@ static int status(char *page, mddev_t *mddev)
conf->working_disks);
for (i = 0; i < conf->raid_disks; i++)
sz += sprintf(page+sz, "%s",
conf->mirrors[i].operational ? "U" : "_");
conf->mirrors[i].rdev &&
conf->mirrors[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]");
return sz;
}
......@@ -594,49 +597,37 @@ static int status(char *page, mddev_t *mddev)
#define ALREADY_SYNCING KERN_INFO \
"raid1: syncing already in progress.\n"
static void mark_disk_bad(mddev_t *mddev, int failed)
{
conf_t *conf = mddev_to_conf(mddev);
mirror_info_t *mirror = conf->mirrors+failed;
mirror->operational = 0;
if (!mirror->write_only) {
mddev->degraded++;
conf->working_disks--;
}
mddev->sb_dirty = 1;
printk(DISK_FAILED, bdev_partition_name(mirror->rdev->bdev), conf->working_disks);
}
static int error(mddev_t *mddev, mdk_rdev_t *rdev)
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev_to_conf(mddev);
mirror_info_t * mirrors = conf->mirrors;
int disks = conf->raid_disks;
int i;
/*
* Find the drive.
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks, ignore the error, let the
* next level up know.
* else mark the drive as failed
*/
for (i = 0; i < disks; i++)
if (mirrors[i].operational && mirrors[i].rdev == rdev)
break;
if (i == disks)
return 0;
if (mirrors[i].operational && !mirrors[i].write_only
if (rdev->in_sync
&& conf->working_disks == 1)
/*
* Don't fail the drive, act as though we were just a
* normal single drive
*/
return 1;
mark_disk_bad(mddev, i);
return 0;
return;
if (rdev->in_sync) {
mddev->degraded++;
conf->working_disks--;
/*
* if recovery was running, stop it now.
*/
if (mddev->recovery_running)
mddev->recovery_running = -EIO;
}
rdev->in_sync = 0;
rdev->faulty = 1;
mddev->sb_dirty = 1;
printk(DISK_FAILED, bdev_partition_name(rdev->bdev), conf->working_disks);
}
static void print_conf(conf_t *conf)
......@@ -656,7 +647,7 @@ static void print_conf(conf_t *conf)
tmp = conf->mirrors + i;
if (tmp->rdev)
printk(" disk %d, wo:%d, o:%d, dev:%s\n",
i, tmp->write_only, tmp->operational,
i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
bdev_partition_name(tmp->rdev->bdev));
}
}
......@@ -688,12 +679,11 @@ static int raid1_spare_active(mddev_t *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i;
if (tmp->operational && tmp->rdev
if (tmp->rdev
&& !tmp->rdev->faulty
&& tmp->write_only) {
&& !tmp->rdev->in_sync) {
conf->working_disks++;
mddev->degraded--;
tmp->write_only = 0;
tmp->rdev->in_sync = 1;
}
}
......@@ -715,8 +705,6 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
p->rdev = rdev;
p->write_only = 1;
p->operational = 1;
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
......@@ -737,8 +725,8 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
print_conf(conf);
spin_lock_irq(&conf->device_lock);
if (p->rdev) {
if (p->operational ||
(p->rdev && atomic_read(&p->rdev->nr_pending))) {
if (p->rdev->in_sync ||
atomic_read(&p->rdev->nr_pending)) {
err = -EBUSY;
goto abort;
}
......@@ -837,20 +825,19 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks ; i++) {
r1_bio->write_bios[i] = NULL;
if (!conf->mirrors[i].operational)
if (!conf->mirrors[i].rdev ||
conf->mirrors[i].rdev->faulty)
continue;
if (i == conf->last_used)
/*
* we read from here, no need to write
*/
continue;
if (!conf->mirrors[i].write_only && mddev->in_sync)
if (conf->mirrors[i].rdev->in_sync && mddev->in_sync)
/*
* don't need to write this we are just rebuilding
*/
continue;
if (!conf->mirrors[i].rdev)
continue;
atomic_inc(&conf->mirrors[i].rdev->nr_pending);
r1_bio->write_bios[i] = bio;
}
......@@ -1009,9 +996,8 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
disk = conf->last_used;
/* make sure disk is operational */
spin_lock_irq(&conf->device_lock);
while (!conf->mirrors[disk].operational ||
conf->mirrors[disk].write_only ||
!conf->mirrors[disk].rdev) {
while (conf->mirrors[disk].rdev == NULL ||
!conf->mirrors[disk].rdev->in_sync) {
if (disk <= 0)
disk = conf->raid_disks;
disk--;
......@@ -1149,8 +1135,6 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + disk_idx;
disk->rdev = rdev;
disk->operational = ! rdev->faulty;
disk->write_only = ! rdev->in_sync;
disk->head_position = 0;
if (!rdev->faulty && rdev->in_sync)
conf->working_disks++;
......@@ -1174,8 +1158,6 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i;
if (!disk->rdev) {
disk->operational = 0;
disk->write_only = 0;
disk->head_position = 0;
mddev->degraded++;
}
......@@ -1186,8 +1168,8 @@ static int run(mddev_t *mddev)
* to read balancing.
*/
for (j = 0; j < conf->raid_disks &&
(!conf->mirrors[j].operational ||
conf->mirrors[j].write_only) ; j++)
(!conf->mirrors[j].rdev ||
!conf->mirrors[j].rdev->in_sync) ; j++)
/* nothing */;
conf->last_used = j;
......
This diff is collapsed.
......@@ -77,8 +77,7 @@ extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_interrupt_thread (mdk_thread_t *thread);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
extern int md_error (mddev_t *mddev, mdk_rdev_t *rdev);
extern int md_run_setup(void);
extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
extern void md_print_devices (void);
......
......@@ -154,6 +154,16 @@ struct mdk_rdev_s
mdp_super_t *sb;
unsigned long sb_offset;
/* A device can be in one of three states based on two flags:
* Not working: faulty==1 in_sync==0
* Fully working: faulty==0 in_sync==1
* Working, but not
* in sync with array
* faulty==0 in_sync==0
*
* It can never have faulty==1, in_sync==1
* This reduces the burden of testing multiple flags in many cases
*/
int faulty; /* if faulty do not issue IO requests */
int in_sync; /* device is a full member of the array */
......@@ -227,7 +237,10 @@ struct mdk_personality_s
int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev);
int (*status)(char *page, mddev_t *mddev);
int (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
*/
void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_remove_disk) (mddev_t *mddev, int number);
int (*spare_active) (mddev_t *mddev);
......
......@@ -6,11 +6,6 @@
struct multipath_info {
mdk_rdev_t *rdev;
/*
* State bits:
*/
int operational;
};
struct multipath_private_data {
......
......@@ -8,12 +8,6 @@ typedef struct mirror_info mirror_info_t;
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
/*
* State bits:
*/
int operational;
int write_only;
};
typedef struct r1bio_s r1bio_t;
......
......@@ -148,6 +148,11 @@ struct stripe_head {
#define R5_UPTODATE 0 /* page contains current data */
#define R5_LOCKED 1 /* IO has been submitted on "req" */
#define R5_OVERWRITE 2 /* towrite covers whole page */
/* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5
#define R5_Syncio 6 /* this io need to be accounted as resync io */
/*
* Write method
......@@ -193,8 +198,6 @@ struct stripe_head {
struct disk_info {
mdk_rdev_t *rdev;
int operational;
int write_only;
};
struct raid5_private_data {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment