Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/md-merge

into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux

Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/md-merge
into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux
ab6094f9 · Linus Torvalds · 8309f3a8 · 86711d5e · ab6094f9 · ab6094f9
Commit ab6094f9 authored Jun 18, 2002 by Linus Torvalds
10 changed files
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
 /*
   linear.c : Multiple Devices driver for Linux
-              Copyright (C) 1994-96 Marc ZYNGIER
+	      Copyright (C) 1994-96 Marc ZYNGIER
 	      <zyngier@ufr-info-p7.ibp.fr> or
 	      <maz@gloups.fdn.fr>
@@ -33,39 +33,45 @@ static int linear_run (mddev_t *mddev)
 	linear_conf_t *conf;
 	struct linear_hash *table;
 	mdk_rdev_t *rdev;
-	int size, i, j, nb_zone;
+	int size, i, nb_zone, cnt;
 	unsigned int curr_offset;
+	struct list_head *tmp;
 	MOD_INC_USE_COUNT;
 	conf = kmalloc (sizeof (*conf), GFP_KERNEL);
 	if (!conf)
 		goto out;
+	memset(conf, 0, sizeof(*conf));
 	mddev->private = conf;
-	if (md_check_ordering(mddev)) {
-		printk("linear: disks are not ordered, aborting!\n");
-		goto out;
-	}
 	/*
 	 * Find the smallest device.
 	 */
 	conf->smallest = NULL;
-	curr_offset = 0;
+	cnt = 0;
-	ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		int j = rdev->sb->this_disk.raid_disk;
 		dev_info_t *disk = conf->disks + j;
+		if (j < 0 || j > mddev->sb->raid_disks || disk->bdev) {
+			printk("linear: disk numbering problem. Aborting!\n");
+			goto out;
+		}
 		disk->dev = rdev->dev;
 		disk->bdev = rdev->bdev;
 		atomic_inc(&rdev->bdev->bd_count);
 		disk->size = rdev->size;
-		disk->offset = curr_offset;
-		curr_offset += disk->size;
 		if (!conf->smallest || (disk->size < conf->smallest->size))
 			conf->smallest = disk;
+		cnt++;
+	}
+	if (cnt != mddev->sb->raid_disks) {
+		printk("linear: not enough drives present. Aborting!\n");
+		goto out;
 	}
 	nb_zone = conf->nr_zones =
@@ -81,10 +87,13 @@ static int linear_run (mddev_t *mddev)
 	 * Here we generate the linear hash table
 	 */
 	table = conf->hash_table;
-	i = 0;
 	size = 0;
-	for (j = 0; j < mddev->nb_dev; j++) {
+	curr_offset = 0;
-		dev_info_t *disk = conf->disks + j;
+	for (i = 0; i < cnt; i++) {
+		dev_info_t *disk = conf->disks + i;
+		disk->offset = curr_offset;
+		curr_offset += disk->size;
 		if (size < 0) {
 			table[-1].dev1 = disk;
@@ -130,12 +139,13 @@ static int linear_stop (mddev_t *mddev)
 	return 0;
 }
-static int linear_make_request (mddev_t *mddev, int rw, struct bio *bio)
+static int linear_make_request (request_queue_t *q, struct bio *bio)
 {
-        linear_conf_t *conf = mddev_to_conf(mddev);
+	mddev_t *mddev = q->queuedata;
-        struct linear_hash *hash;
+	linear_conf_t *conf = mddev_to_conf(mddev);
-        dev_info_t *tmp_dev;
+	struct linear_hash *hash;
-        long block;
+	dev_info_t *tmp_dev;
+	long block;
 	block = bio->bi_sector >> 1;
 	hash = conf->hash_table + (block / conf->smallest->size);
@@ -186,7 +196,7 @@ static int linear_status (char *page, mddev_t *mddev)
 	}
 	sz += sprintf(page+sz, "\n");
 #endif
-	sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
+	sz += sprintf(page+sz, " %dk rounding", mddev->sb->chunk_size/1024);
 	return sz;
 }

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -244,27 +244,19 @@ static int multipath_read_balance (multipath_conf_t *conf)
 	return 0;
 }
-static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
+static int multipath_make_request (request_queue_t *q, struct bio * bio)
 {
+	mddev_t *mddev = q->queuedata;
 	multipath_conf_t *conf = mddev_to_conf(mddev);
 	struct bio *real_bio;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
-/*
- * make_request() can abort the operation when READA is being
- * used and no empty request is available.
- *
- * Currently, just replace the command with READ/WRITE.
- */
-	if (rw == READA)
-		rw = READ;
 	mp_bh = multipath_alloc_mpbh (conf);
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
-	mp_bh->cmd = rw;
+	mp_bh->cmd = bio_data_dir(bio);
 	/*
 	 * read balancing logic:
@@ -273,7 +265,7 @@ static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
 	real_bio = bio_clone(bio, GFP_NOIO);
 	real_bio->bi_bdev = multipath->bdev;
-	real_bio->bi_rw = rw;
+	real_bio->bi_rw = bio_data_dir(bio);
 	real_bio->bi_end_io = multipath_end_request;
 	real_bio->bi_private = mp_bh;
 	mp_bh->bio = real_bio;
@@ -708,7 +700,6 @@ static void multipathd (void *data)
 		mddev = mp_bh->mddev;
 		if (mddev->sb_dirty) {
 			printk(KERN_INFO "dirty sb detected, updating.\n");
-			mddev->sb_dirty = 0;
 			md_update_sb(mddev);
 		}
 		bio = mp_bh->bio;

--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,21 +29,26 @@
 static int create_strip_zones (mddev_t *mddev)
 {
-	int i, c, j, j1, j2;
+	int i, c, j;
 	unsigned long current_offset, curr_zone_offset;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+	struct list_head *tmp1, *tmp2;
+	struct strip_zone *zone;
+	int cnt;
 	/*
 	 * The number of 'same size groups'
 	 */
 	conf->nr_strip_zones = 0;
-	ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
+	ITERATE_RDEV(mddev,rdev1,tmp1) {
 		printk("raid0: looking at %s\n", partition_name(rdev1->dev));
 		c = 0;
-		ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
+		ITERATE_RDEV(mddev,rdev2,tmp2) {
-			printk("raid0:   comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
+			printk("raid0:   comparing %s(%ld) with %s(%ld)\n",
+			       partition_name(rdev1->dev), rdev1->size,
+			       partition_name(rdev2->dev), rdev2->size);
 			if (rdev2 == rdev1) {
 				printk("raid0:   END\n");
 				break;
@@ -51,7 +56,7 @@ static int create_strip_zones (mddev_t *mddev)
 			if (rdev2->size == rdev1->size)
 			{
 				/*
-				 * Not unique, dont count it as a new
+				 * Not unique, don't count it as a new
 				 * group
 				 */
 				printk("raid0:   EQUAL\n");
@@ -66,29 +71,62 @@ static int create_strip_zones (mddev_t *mddev)
 			printk("raid0: %d zones\n", conf->nr_strip_zones);
 		}
 	}
-		printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
+	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
 	conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones);
 	if (!conf->strip_zone)
 		return 1;
+	memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
+				   conf->nr_strip_zones);
+	/* The first zone must contain all devices, so here we check that
+	 * there is a proper alignment of slots to devices and find them all
+	 */
+	zone = &conf->strip_zone[0];
+	cnt = 0;
+	smallest = NULL;
+	ITERATE_RDEV(mddev, rdev1, tmp1) {
+		int j = rdev1->sb->this_disk.raid_disk;
+		if (j < 0 || j >= mddev->sb->raid_disks) {
+			printk("raid0: bad disk number %d - aborting!\n", j);
+			goto abort;
+		}
+		if (zone->dev[j]) {
+			printk("raid0: multiple devices for %d - aborting!\n", j);
+			goto abort;
+		}
+		zone->dev[j] = rdev1;
+		if (!smallest || (rdev1->size <smallest->size))
+			smallest = rdev1;
+		cnt++;
+	}
+	if (cnt != mddev->sb->raid_disks) {
+		printk("raid0: too few disks (%d of %d) - aborting!\n", cnt, 
+		       mddev->sb->raid_disks);
+		goto abort;
+	}
+	zone->nb_dev = cnt;
+	zone->size = smallest->size * cnt;
+	zone->zone_offset = 0;
-	conf->smallest = NULL;
+	conf->smallest = zone;
-	current_offset = 0;
+	current_offset = smallest->size;
-	curr_zone_offset = 0;
+	curr_zone_offset = zone->size;
-	for (i = 0; i < conf->nr_strip_zones; i++)
+	/* now do the other zones */
+	for (i = 1; i < conf->nr_strip_zones; i++)
 	{
-		struct strip_zone *zone = conf->strip_zone + i;
+		zone = conf->strip_zone + i;
 		printk("raid0: zone %d\n", i);
 		zone->dev_offset = current_offset;
 		smallest = NULL;
 		c = 0;
-		ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+		for (j=0; j<cnt; j++) {
+			rdev = conf->strip_zone[0].dev[j];
 			printk("raid0: checking %s ...", partition_name(rdev->dev));
 			if (rdev->size > current_offset)
 			{
@@ -118,6 +156,9 @@ static int create_strip_zones (mddev_t *mddev)
 	}
 	printk("raid0: done.\n");
 	return 0;
+ abort:
+	vfree(conf->strip_zone);
+	return 1;
 }
 static int raid0_run (mddev_t *mddev)
@@ -132,11 +173,6 @@ static int raid0_run (mddev_t *mddev)
 		goto out;
 	mddev->private = (void *)conf;
-	if (md_check_ordering(mddev)) {
-		printk("raid0: disks are not ordered, aborting!\n");
-		goto out_free_conf;
-	}
 	if (create_strip_zones (mddev)) 
 		goto out_free_conf;
@@ -225,8 +261,9 @@ static int raid0_stop (mddev_t *mddev)
 * Of course, those facts may not be valid anymore (and surely won't...)
 * Hey guys, there's some work out there ;-)
 */
-static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
+static int raid0_make_request (request_queue_t *q, struct bio *bio)
 {
+	mddev_t *mddev = q->queuedata;
 	unsigned int sect_in_chunk, chunksize_bits,  chunk_size;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	struct raid0_hash *hash;
@@ -234,7 +271,7 @@ static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
 	mdk_rdev_t *tmp_dev;
 	unsigned long chunk, block, rsect;
-	chunk_size = mddev->param.chunk_size >> 10;
+	chunk_size = mddev->sb->chunk_size >> 10;
 	chunksize_bits = ffz(~chunk_size);
 	block = bio->bi_sector >> 1;
 	hash = conf->hash_table + block / conf->smallest->size;
@@ -323,7 +360,7 @@ static int raid0_status (char *page, mddev_t *mddev)
 				conf->strip_zone[j].size);
 	}
 #endif
-	sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
+	sz += sprintf(page + sz, " %dk chunks", mddev->sb->chunk_size/1024);
 	return sz;
 }

--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -334,7 +334,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
-	if (conf->resync_mirrors && (this_sector + sectors >= conf->next_resync)) {
+	if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
 		/* make sure that disk is operational */
 		new_disk = 0;
 		while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
@@ -434,8 +434,9 @@ static void resume_device(conf_t *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
-static int make_request(mddev_t *mddev, int rw, struct bio * bio)
+static int make_request(request_queue_t *q, struct bio * bio)
 {
+	mddev_t *mddev = q->queuedata;
 	conf_t *conf = mddev_to_conf(mddev);
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
@@ -456,20 +457,16 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 	 * make_request() can abort the operation when READA is being
 	 * used and no empty request is available.
 	 *
-	 * Currently, just replace the command with READ.
 	 */
-	if (rw == READA)
-		rw = READ;
 	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 	r1_bio->master_bio = bio;
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
-	r1_bio->cmd = rw;
+	r1_bio->cmd = bio_data_dir(bio);
-	if (rw == READ) {
+	if (r1_bio->cmd == READ) {
 		/*
 		 * read balancing logic:
 		 */
@@ -483,7 +480,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 		read_bio->bi_sector = r1_bio->sector;
 		read_bio->bi_bdev = mirror->bdev;
 		read_bio->bi_end_io = end_request;
-		read_bio->bi_rw = rw;
+		read_bio->bi_rw = r1_bio->cmd;
 		read_bio->bi_private = r1_bio;
 		generic_make_request(read_bio);
@@ -507,7 +504,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector;
 		mbio->bi_bdev = conf->mirrors[i].bdev;
 		mbio->bi_end_io	= end_request;
-		mbio->bi_rw = rw;
+		mbio->bi_rw = r1_bio->cmd;
 		mbio->bi_private = r1_bio;
 		sum_bios++;
@@ -656,6 +653,9 @@ static void close_sync(conf_t *conf)
 	if (conf->barrier) BUG();
 	if (waitqueue_active(&conf->wait_idle)) BUG();
 	if (waitqueue_active(&conf->wait_resume)) BUG();
+	mempool_destroy(conf->r1buf_pool);
+	conf->r1buf_pool = NULL;
 }
 static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
@@ -772,7 +772,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	 * Deactivate a spare disk:
 	 */
 	case DISKOP_SPARE_INACTIVE:
-		close_sync(conf);
 		sdisk = conf->mirrors + spare_disk;
 		sdisk->operational = 0;
 		sdisk->write_only = 0;
@@ -785,7 +784,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	 * property)
 	 */
 	case DISKOP_SPARE_ACTIVE:
-		close_sync(conf);
 		sdisk = conf->mirrors + spare_disk;
 		fdisk = conf->mirrors + failed_disk;
@@ -919,10 +917,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	}
 abort:
 	spin_unlock_irq(&conf->device_lock);
-	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) {
-		mempool_destroy(conf->r1buf_pool);
-		conf->r1buf_pool = NULL;
-	}
 	print_conf(conf);
 	return err;
@@ -1012,7 +1006,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			 * we read from here, no need to write
 			 */
 			continue;
-		if (i < conf->raid_disks && !conf->resync_mirrors)
+		if (i < conf->raid_disks && mddev->in_sync)
 			/*
 			 * don't need to write this we are just rebuilding
 			 */
@@ -1088,7 +1082,6 @@ static void raid1d(void *data)
 		conf = mddev_to_conf(mddev);
 		if (mddev->sb_dirty) {
 			printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
-			mddev->sb_dirty = 0;
 			md_update_sb(mddev);
 		}
 		bio = r1_bio->master_bio;
@@ -1118,31 +1111,6 @@ static void raid1d(void *data)
 	spin_unlock_irqrestore(&retry_list_lock, flags);
 }
-/*
- * Private kernel thread to reconstruct mirrors after an unclean
- * shutdown.
- */
-static void raid1syncd(void *data)
-{
-	conf_t *conf = data;
-	mddev_t *mddev = conf->mddev;
-	if (!conf->resync_mirrors)
-		return;
-	if (conf->resync_mirrors == 2)
-		return;
-	down(&mddev->recovery_sem);
-	if (!md_do_sync(mddev, NULL)) {
-		/*
-		 * Only if everything went Ok.
-		 */
-		conf->resync_mirrors = 0;
-	}
-	close_sync(conf);
-	up(&mddev->recovery_sem);
-}
 static int init_resync(conf_t *conf)
 {
@@ -1177,9 +1145,16 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	sector_t max_sector, nr_sectors;
 	int disk, partial;
-	if (!sector_nr)
+	if (sector_nr == 0)
 		if (init_resync(conf))
 			return -ENOMEM;
+	max_sector = mddev->sb->size << 1;
+	if (sector_nr >= max_sector) {
+		close_sync(conf);
+		return 0;
+	}
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
@@ -1216,10 +1191,6 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	r1_bio->sector = sector_nr;
 	r1_bio->cmd = SPECIAL;
-	max_sector = mddev->sb->size << 1;
-	if (sector_nr >= max_sector)
-		BUG();
 	bio = r1_bio->master_bio;
 	nr_sectors = RESYNC_BLOCK_SIZE >> 9;
 	if (max_sector - sector_nr < nr_sectors)
@@ -1302,7 +1273,6 @@ static int run(mddev_t *mddev)
 	mdp_disk_t *descriptor;
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
-	int start_recovery = 0;
 	MOD_INC_USE_COUNT;
@@ -1454,10 +1424,6 @@ static int run(mddev_t *mddev)
 	conf->last_used = j;
-	if (conf->working_disks != sb->raid_disks) {
-		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
-		start_recovery = 1;
-	}
 	{
 		const char * name = "raid1d";
@@ -1469,20 +1435,6 @@ static int run(mddev_t *mddev)
 		}
 	}
-	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
-						(conf->working_disks > 1)) {
-		const char * name = "raid1syncd";
-		conf->resync_thread = md_register_thread(raid1syncd, conf, name);
-		if (!conf->resync_thread) {
-			printk(THREAD_ERROR, mdidx(mddev));
-			goto out_free_conf;
-		}
-		printk(START_RESYNC, mdidx(mddev));
-		conf->resync_mirrors = 1;
-		md_wakeup_thread(conf->resync_thread);
-	}
 	/*
 	 * Regenerate the "device is in sync with the raid set" bit for
@@ -1499,10 +1451,6 @@ static int run(mddev_t *mddev)
 	}
 	sb->active_disks = conf->working_disks;
-	if (start_recovery)
-		md_recover_arrays();
 	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
 	/*
 	 * Ok, everything is just fine now
@@ -1522,47 +1470,12 @@ static int run(mddev_t *mddev)
 	return -EIO;
 }
-static int stop_resync(mddev_t *mddev)
-{
-	conf_t *conf = mddev_to_conf(mddev);
-	if (conf->resync_thread) {
-		if (conf->resync_mirrors) {
-			conf->resync_mirrors = 2;
-			md_interrupt_thread(conf->resync_thread);
-			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
-			return 1;
-		}
-		return 0;
-	}
-	return 0;
-}
-static int restart_resync(mddev_t *mddev)
-{
-	conf_t *conf = mddev_to_conf(mddev);
-	if (conf->resync_mirrors) {
-		if (!conf->resync_thread) {
-			MD_BUG();
-			return 0;
-		}
-		conf->resync_mirrors = 1;
-		md_wakeup_thread(conf->resync_thread);
-		return 1;
-	}
-	return 0;
-}
 static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	int i;
 	md_unregister_thread(conf->thread);
-	if (conf->resync_thread)
-		md_unregister_thread(conf->resync_thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	for (i = 0; i < MD_SB_DISKS; i++)
@@ -1583,8 +1496,6 @@ static mdk_personality_t raid1_personality =
 	status:		status,
 	error_handler:	error,
 	diskop:		diskop,
-	stop_resync:	stop_resync,
-	restart_resync:	restart_resync,
 	sync_request:	sync_request
 };

--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -634,7 +634,6 @@ static void copy_data(int frombio, struct bio *bio,
 		else 
 			page_offset = (signed)(sector - bio->bi_sector) * -512;
 		bio_for_each_segment(bvl, bio, i) {
-			char *ba = __bio_kmap(bio, i);
 			int len = bio_iovec_idx(bio,i)->bv_len;
 			int clen;
 			int b_offset = 0;			
@@ -649,13 +648,16 @@ static void copy_data(int frombio, struct bio *bio,
 				clen = STRIPE_SIZE - page_offset;	
 			else clen = len;
-			if (len > 0) {
+			if (clen > 0) {
+				char *ba = __bio_kmap(bio, i);
 				if (frombio)
 					memcpy(pa+page_offset, ba+b_offset, clen);
 				else
 					memcpy(ba+b_offset, pa+page_offset, clen);
-			}
+				__bio_kunmap(bio, i);
-			__bio_kunmap(bio, i);
+			}	
+			if (clen < len) /* hit end of page */
+				break;
 			page_offset +=  len;
 		}
 	}
@@ -810,6 +812,8 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 	spin_unlock_irq(&conf->device_lock);
 	spin_unlock(&sh->lock);
+	PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
 	if (forwrite) {
 		/* check if page is coverred */
 		sector_t sector = sh->dev[dd_idx].sector;
@@ -823,8 +827,6 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-	PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
 }
@@ -1036,7 +1038,7 @@ static void handle_stripe(struct stripe_head *sh)
 				    ) &&
 			    !test_bit(R5_UPTODATE, &dev->flags)) {
 				if (conf->disks[i].operational 
-/*				    && !(conf->resync_parity && i == sh->pd_idx) */
+/*				    && !(!mddev->in_sync && i == sh->pd_idx) */
 					)
 					rmw++;
 				else rmw += 2*disks;  /* cannot read it */
@@ -1226,14 +1228,15 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 }
 static void raid5_unplug_device(void *data)
 {
-	raid5_conf_t *conf = (raid5_conf_t *)data;
+	request_queue_t *q = data;
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
 	spin_lock_irqsave(&conf->device_lock, flags);
-	raid5_activate_delayed(conf);
+	if (blk_remove_plug(q))
+		raid5_activate_delayed(conf);
-	conf->plugged = 0;
 	md_wakeup_thread(conf->thread);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1242,31 +1245,21 @@ static void raid5_unplug_device(void *data)
 static inline void raid5_plug_device(raid5_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
-	if (list_empty(&conf->delayed_list))
+	blk_plug_device(&conf->mddev->queue);
-		if (!conf->plugged) {
-			conf->plugged = 1;
-			queue_task(&conf->plug_tq, &tq_disk);
-		}
 	spin_unlock_irq(&conf->device_lock);
 }
-static int make_request (mddev_t *mddev, int rw, struct bio * bi)
+static int make_request (request_queue_t *q, struct bio * bi)
 {
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
 	const unsigned int raid_disks = conf->raid_disks;
 	const unsigned int data_disks = raid_disks - 1;
 	unsigned int dd_idx, pd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
-	int read_ahead = 0;
 	struct stripe_head *sh;
-	if (rw == READA) {
-		rw = READ;
-		read_ahead=1;
-	}
 	logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -1281,10 +1274,10 @@ static int make_request (mddev_t *mddev, int rw, struct bio * bi)
 		PRINTK("raid5: make_request, sector %ul logical %ul\n", 
 		       new_sector, logical_sector);
-		sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead);
+		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
-			add_stripe_bio(sh, bi, dd_idx, rw);
+			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
 			raid5_plug_device(conf);
 			handle_stripe(sh);
@@ -1311,6 +1304,10 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
+	if (sector_nr >= mddev->sb->size <<1)
+		/* just being told to finish up .. nothing to do */
+		return 0;
 	first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
 		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
 	sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
@@ -1343,17 +1340,15 @@ static void raid5d (void *data)
 	handled = 0;
-	if (mddev->sb_dirty) {
+	if (mddev->sb_dirty)
-		mddev->sb_dirty = 0;
 		md_update_sb(mddev);
-	}
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct list_head *first;
 		if (list_empty(&conf->handle_list) &&
 		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-		    !conf->plugged &&
+		    !blk_queue_plugged(&mddev->queue) &&
 		    !list_empty(&conf->delayed_list))
 			raid5_activate_delayed(conf);
@@ -1382,31 +1377,6 @@ static void raid5d (void *data)
 	PRINTK("--- raid5d inactive\n");
 }
-/*
- * Private kernel thread for parity reconstruction after an unclean
- * shutdown. Reconstruction on spare drives in case of a failed drive
- * is done by the generic mdsyncd.
- */
-static void raid5syncd (void *data)
-{
-	raid5_conf_t *conf = data;
-	mddev_t *mddev = conf->mddev;
-	if (!conf->resync_parity)
-		return;
-	if (conf->resync_parity == 2)
-		return;
-	down(&mddev->recovery_sem);
-	if (md_do_sync(mddev,NULL)) {
-		up(&mddev->recovery_sem);
-		printk("raid5: resync aborted!\n");
-		return;
-	}
-	conf->resync_parity = 0;
-	up(&mddev->recovery_sem);
-	printk("raid5: resync finished.\n");
-}
 static int run (mddev_t *mddev)
 {
 	raid5_conf_t *conf;
@@ -1416,7 +1386,6 @@ static int run (mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	struct disk_info *disk;
 	struct list_head *tmp;
-	int start_recovery = 0;
 	MOD_INC_USE_COUNT;
@@ -1444,10 +1413,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
-	conf->plugged = 0;
+	mddev->queue.unplug_fn = raid5_unplug_device;
-	conf->plug_tq.sync = 0;
-	conf->plug_tq.routine = &raid5_unplug_device;
-	conf->plug_tq.data = conf;
 	PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
@@ -1571,9 +1537,10 @@ static int run (mddev_t *mddev)
 		goto abort;
 	}
-	if (conf->working_disks != sb->raid_disks) {
+	if (conf->failed_disks == 1 &&
-		printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
+	    !(sb->state & (1<<MD_SB_CLEAN))) {
-		start_recovery = 1;
+		printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
+		goto abort;
 	}
 	{
@@ -1587,10 +1554,11 @@ static int run (mddev_t *mddev)
 	}
 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-		 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	if (grow_stripes(conf, conf->max_nr_stripes)) {
 		printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
 		shrink_stripes(conf);
+		md_unregister_thread(conf->thread);
 		goto abort;
 	} else
 		printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
@@ -1615,23 +1583,6 @@ static int run (mddev_t *mddev)
 	else
 		printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
-	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
-		const char * name = "raid5syncd";
-		conf->resync_thread = md_register_thread(raid5syncd, conf,name);
-		if (!conf->resync_thread) {
-			printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
-			goto abort;
-		}
-		printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
-		conf->resync_parity = 1;
-		md_wakeup_thread(conf->resync_thread);
-	}
-	print_raid5_conf(conf);
-	if (start_recovery)
-		md_recover_arrays();
 	print_raid5_conf(conf);
 	/* Ok, everything is just fine now */
@@ -1650,48 +1601,12 @@ static int run (mddev_t *mddev)
 	return -EIO;
 }
-static int stop_resync (mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-	mdk_thread_t *thread = conf->resync_thread;
-	if (thread) {
-		if (conf->resync_parity) {
-			conf->resync_parity = 2;
-			md_interrupt_thread(thread);
-			printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
-			return 1;
-		}
-		return 0;
-	}
-	return 0;
-}
-static int restart_resync (mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-	if (conf->resync_parity) {
-		if (!conf->resync_thread) {
-			MD_BUG();
-			return 0;
-		}
-		printk("raid5: waking up raid5resync.\n");
-		conf->resync_parity = 1;
-		md_wakeup_thread(conf->resync_thread);
-		return 1;
-	} else
-		printk("raid5: no restart-resync needed.\n");
-	return 0;
-}
 static int stop (mddev_t *mddev)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
-	if (conf->resync_thread)
-		md_unregister_thread(conf->resync_thread);
 	md_unregister_thread(conf->thread);
 	shrink_stripes(conf);
 	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
@@ -2066,8 +1981,6 @@ static mdk_personality_t raid5_personality=
 	status:		status,
 	error_handler:	error,
 	diskop:		diskop,
-	stop_resync:	stop_resync,
-	restart_resync:	restart_resync,
 	sync_request:	sync_request
 };

--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -63,8 +63,6 @@
 extern int md_size[MAX_MD_DEVS];
 extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
-extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
-extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
 extern char * partition_name (kdev_t dev);
 extern inline char * bdev_partition_name (struct block_device *bdev)
 {
@@ -77,14 +75,9 @@ extern mdk_thread_t * md_register_thread (void (*run) (void *data),
 extern void md_unregister_thread (mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_interrupt_thread (mdk_thread_t *thread);
-extern int md_update_sb (mddev_t *mddev);
+extern void md_update_sb (mddev_t *mddev);
-extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(kdev_t dev, unsigned long nr_sectors);
-extern void md_recover_arrays (void);
-extern int md_check_ordering (mddev_t *mddev);
-extern int md_notify_reboot(struct notifier_block *this,
-					unsigned long code, void *x);
 extern int md_error (mddev_t *mddev, struct block_device *bdev);
 extern int md_run_setup(void);

--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -64,24 +64,6 @@ typedef struct mdk_rdev_s mdk_rdev_t;
 #define MAX_MD_DEVS  (1<<MINORBITS)	/* Max number of md dev */
-/*
- * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
- * the personality. (eg. HSM uses this to identify individual LVs)
- */
-typedef struct dev_mapping_s {
-	mddev_t *mddev;
-	void *data;
-} dev_mapping_t;
-extern dev_mapping_t mddev_map [MAX_MD_DEVS];
-static inline mddev_t * kdev_to_mddev (kdev_t dev)
-{
-	if (major(dev) != MD_MAJOR)
-		BUG();
-        return mddev_map[minor(dev)].mddev;
-}
 /*
 * options passed in raidrun:
 */
@@ -196,31 +178,38 @@ struct mddev_s
 	mdk_personality_t		*pers;
 	int				__minor;
 	mdp_super_t			*sb;
-	int				nb_dev;
 	struct list_head 		disks;
 	int				sb_dirty;
-	mdu_param_t			param;
 	int				ro;
+	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
 	unsigned long			curr_resync;	/* blocks scheduled */
 	unsigned long			resync_mark;	/* a recent timestamp */
 	unsigned long			resync_mark_cnt;/* blocks written at resync_mark */
-	char				*name;
+	/* recovery_running is 0 for no recovery/resync,
+	 * 1 for active recovery
+	 * 2 for active resync
+	 * -error for an error (e.g. -EINTR)
+	 * it can only be set > 0 under reconfig_sem
+	 */
 	int				recovery_running;
+	int				in_sync;	/* known to not need resync */
 	struct semaphore		reconfig_sem;
-	struct semaphore		recovery_sem;
-	struct semaphore		resync_sem;
 	atomic_t			active;
+	mdp_disk_t			*spare;
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
+	request_queue_t			queue;	/* for plugging ... */
 	struct list_head		all_mddevs;
 };
 struct mdk_personality_s
 {
 	char *name;
-	int (*make_request)(mddev_t *mddev, int rw, struct bio *bio);
+	int (*make_request)(request_queue_t *q, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	int (*status)(char *page, mddev_t *mddev);
@@ -237,9 +226,6 @@ struct mdk_personality_s
 * SPARE_ACTIVE expects such a change)
 */
 	int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
-	int (*stop_resync)(mddev_t *mddev);
-	int (*restart_resync)(mddev_t *mddev);
 	int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster);
 };
@@ -279,13 +265,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
 #define ITERATE_RDEV(mddev,rdev,tmp)					\
 	ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
-/*
- * Same as above, but assumes that the device has rdev->desc_nr numbered
- * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
- */
-#define ITERATE_RDEV_ORDERED(mddev,rdev,i)				\
-	for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
 /*
 * Iterates through all 'RAID managed disks'
@@ -299,26 +278,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
 #define ITERATE_RDEV_PENDING(rdev,tmp)					\
 	ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
-/*
- * iterates through all used mddevs in the system.
- */
-#define ITERATE_MDDEV(mddev,tmp)					\
-									\
-	for (tmp = all_mddevs.next;					\
-		mddev = list_entry(tmp, mddev_t, all_mddevs),	\
-			tmp = tmp->next, tmp->prev != &all_mddevs	\
-		; )
-static inline int lock_mddev (mddev_t * mddev)
-{
-	return down_interruptible(&mddev->reconfig_sem);
-}
-static inline void unlock_mddev (mddev_t * mddev)
-{
-	up(&mddev->reconfig_sem);
-}
 #define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
 				x = y; y = __tmp; } while (0)

--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -33,8 +33,7 @@ struct r1_private_data_s {
 	int			working_disks;
 	int			last_used;
 	sector_t		next_seq_sect;
-	mdk_thread_t		*thread, *resync_thread;
+	mdk_thread_t		*thread;
-	int			resync_mirrors;
 	mirror_info_t		*spare;
 	spinlock_t		device_lock;

--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -177,7 +177,7 @@ struct stripe_head {
 * is put on a "delayed" queue until there are no stripes currently
 * in a pre-read phase.  Further, if the "delayed" queue is empty when
 * a stripe is put on it then we "plug" the queue and do not process it
- * until an unplg call is made. (the tq_disk list is run).
+ * until an unplug call is made. (blk_run_queues is run).
 *
 * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
 * it to the count of prereading stripes.
@@ -205,12 +205,11 @@ struct disk_info {
 struct raid5_private_data {
 	struct stripe_head	**stripe_hashtbl;
 	mddev_t			*mddev;
-	mdk_thread_t		*thread, *resync_thread;
+	mdk_thread_t		*thread;
 	struct disk_info	disks[MD_SB_DISKS];
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
 	int			raid_disks, working_disks, failed_disks;
-	int			resync_parity;
 	int			max_nr_stripes;
 	struct list_head	handle_list; /* stripes needing handling */
@@ -229,9 +228,6 @@ struct raid5_private_data {
 							 * waiting for 25% to be free
 							 */        
 	spinlock_t		device_lock;
-	int			plugged;
-	struct tq_struct	plug_tq;
 };
 typedef struct raid5_private_data raid5_conf_t;