diff --git a/Makefile b/Makefile
index 787c8a8b9a5866cb7aa6fa5d3bf5cbe70913a392..f2eda9ca6c0353a65de913108c68451c62053c73 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 2
 PATCHLEVEL = 5
-SUBLEVEL = 22
+SUBLEVEL = 23
 EXTRAVERSION =
 
 # We are using a recursive build, so we need to do a little thinking
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index 4ba5641b271fdf00cbff933e98170ee8813413be..55b8fc43a9bfe68358eb9e3b05b47bd4a2ca17d4 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -221,8 +221,6 @@ void iounmap(void *addr)
 		return;
 	} 
 
-	BUG_ON(p->phys_addr == 0);  /* not allocated with ioremap */	
-
 	vmfree_area_pages(VMALLOC_VMADDR(p->addr), p->size);	
 	if (p->flags && p->phys_addr < virt_to_phys(high_memory)) { 
 		change_page_attr(virt_to_page(__va(p->phys_addr)),
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 16abcb3f5481814810691da07bd73dc8b7f0452a..318ff55529fbde2b58898e34952d7079364a0b09 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -161,6 +161,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
 	init_waitqueue_head(&q->queue_wait);
+	INIT_LIST_HEAD(&q->plug_list);
 }
 
 /**
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 44909021aa06e418625ca81ed4031f06ea5c303e..e046885bb67bff284a59cde6e402b99aa90180be 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -129,6 +129,8 @@ struct cardinfo {
 				    */
 	struct bio	*bio, *currentbio, **biotail;
 
+	request_queue_t queue;
+
 	struct mm_page {
 		dma_addr_t		page_dma;
 		struct mm_dma_desc	*desc;
@@ -142,8 +144,6 @@ struct cardinfo {
 	struct tasklet_struct	tasklet;
 	unsigned int dma_status;
 
-	struct tq_struct plug_tq;
-
 	struct {
 		int		good;
 		int		warned;
@@ -293,7 +293,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
  * Whenever IO on the active page completes, the Ready page is activated
  * and the ex-Active page is clean out and made Ready.
  * Otherwise the Ready page is only activated when it becomes full, or
- * when mm_unplug_device is called via run_task_queue(&tq_disk).
+ * when mm_unplug_device is called via blk_run_queues().
  *
  * If a request arrives while both pages a full, it is queued, and b_rdev is
  * overloaded to record whether it was a read or a write.
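
For reference, a minimal sketch of the 2.5 plugging contract that the umem conversion below follows; mydev, mydev_start_io and the field names are illustrative, not part of this patch:

static int mydev_make_request(request_queue_t *q, struct bio *bio)
{
	struct mydev *dev = q->queuedata;

	spin_lock_bh(&dev->lock);
	/* queue the bio on a driver-private list ... */
	blk_plug_device(q);		/* mark the queue plugged */
	spin_unlock_bh(&dev->lock);
	return 0;
}

/* Invoked through q->unplug_fn when blk_run_queues() flushes plugged
 * queues; 'data' is the queue itself, as in mm_unplug_device() below. */
static void mydev_unplug(void *data)
{
	request_queue_t *q = data;
	struct mydev *dev = q->queuedata;

	spin_lock_bh(&dev->lock);
	if (blk_remove_plug(q))		/* nonzero only if still plugged */
		mydev_start_io(dev);	/* kick the hardware */
	spin_unlock_bh(&dev->lock);
}
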
@@ -341,8 +341,9 @@ static void mm_start_io(struct cardinfo *card)
 	offset = ((char*)desc) - ((char*)page->desc);
 	writel(cpu_to_le32((page->page_dma+offset)&0xffffffff),
 	       card->csr_remap + DMA_DESCRIPTOR_ADDR);
-	/* if sizeof(dma_addr_t) == 32, this will generate a warning, sorry */
-	writel(cpu_to_le32((page->page_dma)>>32),
+	/* Force the value to u64 before shifting; otherwise >> 32 is
+	 * undefined C and on some ports will do nothing! */
+	writel(cpu_to_le32(((u64)page->page_dma)>>32),
 	       card->csr_remap + DMA_DESCRIPTOR_ADDR + 4);
 
 	/* Go, go, go */
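
The comment above deserves spelling out: when dma_addr_t is 32 bits wide, page->page_dma >> 32 shifts a value by its full width, which is undefined behaviour in C; on x86 the CPU masks the shift count, so the "high word" often comes back as the original value instead of zero. A hedged userspace illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t dma = 0x12345678u;	/* stand-in for a 32-bit dma_addr_t */

	/* (dma >> 32) would be undefined; widening to 64 bits first is
	 * well defined and yields 0, which the descriptor needs. */
	uint32_t hi = (uint32_t)(((uint64_t)dma) >> 32);

	printf("high word: 0x%08x\n", (unsigned)hi);	/* 0x00000000 */
	return 0;
}
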
@@ -384,10 +385,12 @@ static inline void reset_page(struct mm_page *page)
 
 static void mm_unplug_device(void *data)
 {
-	struct cardinfo *card = data;
+	request_queue_t *q = data;
+	struct cardinfo *card = q->queuedata;
 
 	spin_lock_bh(&card->lock);
-	activate(card);
+	if (blk_remove_plug(q))
+		activate(card);
 	spin_unlock_bh(&card->lock);
 }
 
@@ -565,8 +568,7 @@ static void process_page(unsigned long data)
 */
 static int mm_make_request(request_queue_t *q, struct bio *bio)
 {
-	struct cardinfo *card = &cards[DEVICE_NR(
-		bio->bi_bdev->bd_dev)];
+	struct cardinfo *card = q->queuedata;
 	PRINTK("mm_make_request %ld %d\n", bh->b_rsector, bh->b_size);
 
 	/* set uptodate now, and clear it if there are any errors */
@@ -576,9 +578,9 @@ static int mm_make_request(request_queue_t *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
+	blk_plug_device(q);
 	spin_unlock_bh(&card->lock);
 
-	queue_task(&card->plug_tq, &tq_disk);
 	return 0;
 }
 
@@ -1065,11 +1067,12 @@ static int __devinit mm_pci_probe(struct pci_dev *dev, const struct pci_device_i
 	card->bio = NULL;
 	card->biotail = &card->bio;
 
+	blk_queue_make_request(&card->queue, mm_make_request);
+	card->queue.queuedata = card;
+	card->queue.unplug_fn = mm_unplug_device;
+
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
 
-	card->plug_tq.sync = 0;
-	card->plug_tq.routine = &mm_unplug_device;
-	card->plug_tq.data = card;
 	card->check_batteries = 0;
 	
 	mem_present = readb(card->csr_remap + MEMCTRLSTATUS_MEMORY);
@@ -1237,6 +1240,17 @@ static struct pci_driver mm_pci_driver = {
 --                               mm_init
 -----------------------------------------------------------------------------------
 */
+
+static request_queue_t * mm_queue_proc(kdev_t dev)
+{
+	int c = DEVICE_NR(kdev_val(dev));
+
+	if (c < MM_MAXCARDS)
+		return &cards[c].queue;
+	else
+		return BLK_DEFAULT_QUEUE(MAJOR_NR);
+}
+	
 int __init mm_init(void)
 {
 	int retval, i;
@@ -1276,11 +1290,9 @@ int __init mm_init(void)
 	mm_gendisk.part      = mm_partitions;
 	mm_gendisk.nr_real   = num_cards;
 
+	blk_dev[MAJOR_NR].queue = mm_queue_proc;
 	add_gendisk(&mm_gendisk);
 
-	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR),
-			       mm_make_request);
-
         blk_size[MAJOR_NR]      = mm_gendisk.sizes;
         for (i = 0; i < num_cards; i++) {
 		register_disk(&mm_gendisk, mk_kdev(MAJOR_NR, i<<MM_SHIFT), MM_SHIFT,
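
Both this driver and md (further down) now register a queue callback in blk_dev[MAJOR_NR], so one major number can fan out to per-device queues instead of everything sharing BLK_DEFAULT_QUEUE(MAJOR_NR); each card then owns its own plugging state. A sketch of the shape of such a hook, with MYDEV_MAX and devs[] as assumed names:

static request_queue_t *mydev_queue_proc(kdev_t dev)
{
	int i = DEVICE_NR(kdev_val(dev));

	if (i < MYDEV_MAX)			/* known minor: private queue */
		return &devs[i].queue;
	return BLK_DEFAULT_QUEUE(MAJOR_NR);	/* fallback for stray minors */
}

/* wired up once at init time, as mm_init() does above: */
blk_dev[MAJOR_NR].queue = mydev_queue_proc;
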
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 48fb74e50d5cb2c41d937fdeb76bf43fa5c20bf1..d8f29104dacf9b8c57a68123aa05894273e0f7eb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -1,6 +1,6 @@
 /*
    linear.c : Multiple Devices driver for Linux
-              Copyright (C) 1994-96 Marc ZYNGIER
+	      Copyright (C) 1994-96 Marc ZYNGIER
 	      <zyngier@ufr-info-p7.ibp.fr> or
 	      <maz@gloups.fdn.fr>
 
@@ -33,39 +33,45 @@ static int linear_run (mddev_t *mddev)
 	linear_conf_t *conf;
 	struct linear_hash *table;
 	mdk_rdev_t *rdev;
-	int size, i, j, nb_zone;
+	int size, i, nb_zone, cnt;
 	unsigned int curr_offset;
+	struct list_head *tmp;
 
 	MOD_INC_USE_COUNT;
 
 	conf = kmalloc (sizeof (*conf), GFP_KERNEL);
 	if (!conf)
 		goto out;
+	memset(conf, 0, sizeof(*conf));
 	mddev->private = conf;
 
-	if (md_check_ordering(mddev)) {
-		printk("linear: disks are not ordered, aborting!\n");
-		goto out;
-	}
 	/*
 	 * Find the smallest device.
 	 */
 
 	conf->smallest = NULL;
-	curr_offset = 0;
-	ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+	cnt = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		int j = rdev->sb->this_disk.raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
+		if (j < 0 || j >= mddev->sb->raid_disks || disk->bdev) {
+			printk("linear: disk numbering problem. Aborting!\n");
+			goto out;
+		}
+
 		disk->dev = rdev->dev;
 		disk->bdev = rdev->bdev;
 		atomic_inc(&rdev->bdev->bd_count);
 		disk->size = rdev->size;
-		disk->offset = curr_offset;
-
-		curr_offset += disk->size;
 
 		if (!conf->smallest || (disk->size < conf->smallest->size))
 			conf->smallest = disk;
+		cnt++;
+	}
+	if (cnt != mddev->sb->raid_disks) {
+		printk("linear: not enough drives present. Aborting!\n");
+		goto out;
 	}
 
 	nb_zone = conf->nr_zones =
@@ -81,10 +87,13 @@ static int linear_run (mddev_t *mddev)
 	 * Here we generate the linear hash table
 	 */
 	table = conf->hash_table;
-	i = 0;
 	size = 0;
-	for (j = 0; j < mddev->nb_dev; j++) {
-		dev_info_t *disk = conf->disks + j;
+	curr_offset = 0;
+	for (i = 0; i < cnt; i++) {
+		dev_info_t *disk = conf->disks + i;
+
+		disk->offset = curr_offset;
+		curr_offset += disk->size;
 
 		if (size < 0) {
 			table[-1].dev1 = disk;
@@ -130,12 +139,13 @@ static int linear_stop (mddev_t *mddev)
 	return 0;
 }
 
-static int linear_make_request (mddev_t *mddev, int rw, struct bio *bio)
+static int linear_make_request (request_queue_t *q, struct bio *bio)
 {
-        linear_conf_t *conf = mddev_to_conf(mddev);
-        struct linear_hash *hash;
-        dev_info_t *tmp_dev;
-        long block;
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	struct linear_hash *hash;
+	dev_info_t *tmp_dev;
+	long block;
 
 	block = bio->bi_sector >> 1;
 	hash = conf->hash_table + (block / conf->smallest->size);
@@ -186,7 +196,7 @@ static int linear_status (char *page, mddev_t *mddev)
 	}
 	sz += sprintf(page+sz, "\n");
 #endif
-	sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
+	sz += sprintf(page+sz, " %dk rounding", mddev->sb->chunk_size/1024);
 	return sz;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d232703228045fd1247d27e8bc32809f50222382..acce321b5938d0cb1c8031590c22658b61c84ad8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -107,7 +107,7 @@ static ctl_table raid_root_table[] = {
  * subsystems want to have a pre-defined structure
  */
 struct hd_struct md_hd_struct[MAX_MD_DEVS];
-static int md_maxreadahead[MAX_MD_DEVS];
+static void md_recover_arrays(void);
 static mdk_thread_t *md_recovery_thread;
 
 int md_size[MAX_MD_DEVS];
@@ -129,93 +129,111 @@ static struct gendisk md_gendisk=
 
 /*
  * Enables to iterate over all existing md arrays
+ * all_mddevs_lock protects this list as well as mddev_map.
  */
 static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
 
 /*
- * The mapping between kdev and mddev is not necessary a simple
- * one! Eg. HSM uses several sub-devices to implement Logical
- * Volumes. All these sub-devices map to the same mddev.
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while still holding
+ * a reference to the current mddev must mddev_put it.
  */
-dev_mapping_t mddev_map[MAX_MD_DEVS];
+#define ITERATE_MDDEV(mddev,tmp)					\
+									\
+	for (spin_lock(&all_mddevs_lock), 				\
+		     (tmp = all_mddevs.next),				\
+		     (mddev = NULL);					\
+	     (void)(tmp != &all_mddevs &&				\
+			mddev_get(list_entry(tmp, mddev_t, all_mddevs))),\
+		     spin_unlock(&all_mddevs_lock),			\
+		     (mddev ? mddev_put(mddev):(void)NULL),		\
+		     (mddev = list_entry(tmp, mddev_t, all_mddevs)),	\
+		     (tmp != &all_mddevs);				\
+	     spin_lock(&all_mddevs_lock),				\
+		     (tmp = tmp->next)					\
+		)
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+	bio_io_error(bio);
+	return 0;
+}
 
-void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
+static inline mddev_t *mddev_get(mddev_t *mddev)
 {
-	unsigned int minor = minor(dev);
-
-	if (major(dev) != MD_MAJOR) {
-		MD_BUG();
-		return;
-	}
-	if (mddev_map[minor].mddev) {
-		MD_BUG();
-		return;
-	}
-	mddev_map[minor].mddev = mddev;
-	mddev_map[minor].data = data;
+	atomic_inc(&mddev->active);
+	return mddev;
 }
 
-void del_mddev_mapping(mddev_t * mddev, kdev_t dev)
+static void mddev_put(mddev_t *mddev)
 {
-	unsigned int minor = minor(dev);
-
-	if (major(dev) != MD_MAJOR) {
-		MD_BUG();
+	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
+	if (!mddev->sb && list_empty(&mddev->disks)) {
+		list_del(&mddev->all_mddevs);
+		mddev_map[mdidx(mddev)] = NULL;
+		kfree(mddev);
+		MOD_DEC_USE_COUNT;
 	}
-	if (mddev_map[minor].mddev != mddev) {
-		MD_BUG();
-		return;
-	}
-	mddev_map[minor].mddev = NULL;
-	mddev_map[minor].data = NULL;
+	spin_unlock(&all_mddevs_lock);
 }
 
-static int md_make_request (request_queue_t *q, struct bio *bio)
+static mddev_t * mddev_find(int unit)
 {
-	mddev_t *mddev = kdev_to_mddev(to_kdev_t(bio->bi_bdev->bd_dev));
+	mddev_t *mddev, *new = NULL;
 
-	if (mddev && mddev->pers)
-		return mddev->pers->make_request(mddev, bio_rw(bio), bio);
-	else {
-		bio_io_error(bio);
-		return 0;
+ retry:
+	spin_lock(&all_mddevs_lock);
+	if (mddev_map[unit]) {
+		mddev = mddev_get(mddev_map[unit]);
+		spin_unlock(&all_mddevs_lock);
+		if (new)
+			kfree(new);
+		return mddev;
 	}
-}
-
-static mddev_t * alloc_mddev(kdev_t dev)
-{
-	mddev_t *mddev;
-
-	if (major(dev) != MD_MAJOR) {
-		MD_BUG();
-		return 0;
+	if (new) {
+		mddev_map[unit] = new;
+		list_add(&new->all_mddevs, &all_mddevs);
+		spin_unlock(&all_mddevs_lock);
+		MOD_INC_USE_COUNT;
+		return new;
 	}
-	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
-	if (!mddev)
+	spin_unlock(&all_mddevs_lock);
+
+	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
 		return NULL;
 
-	memset(mddev, 0, sizeof(*mddev));
+	memset(new, 0, sizeof(*new));
 
-	mddev->__minor = minor(dev);
-	init_MUTEX(&mddev->reconfig_sem);
-	init_MUTEX(&mddev->recovery_sem);
-	init_MUTEX(&mddev->resync_sem);
-	INIT_LIST_HEAD(&mddev->disks);
-	INIT_LIST_HEAD(&mddev->all_mddevs);
-	atomic_set(&mddev->active, 0);
+	new->__minor = unit;
+	init_MUTEX(&new->reconfig_sem);
+	INIT_LIST_HEAD(&new->disks);
+	INIT_LIST_HEAD(&new->all_mddevs);
+	atomic_set(&new->active, 1);
 
-	/*
-	 * The 'base' mddev is the one with data NULL.
-	 * personalities can create additional mddevs
-	 * if necessary.
-	 */
-	add_mddev_mapping(mddev, dev, 0);
-	list_add(&mddev->all_mddevs, &all_mddevs);
+	goto retry;
+}
 
-	MOD_INC_USE_COUNT;
+static inline int mddev_lock(mddev_t * mddev)
+{
+	return down_interruptible(&mddev->reconfig_sem);
+}
 
-	return mddev;
+static inline int mddev_trylock(mddev_t * mddev)
+{
+	return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+	up(&mddev->reconfig_sem);
 }
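
A hedged usage sketch of the new iterator (scan_arrays is an illustrative name): the macro drops all_mddevs_lock while the body runs and holds a reference instead, so normal loop exit cleans up automatically, but any body that breaks out early still owns the reference and must mddev_put() it, exactly as md_do_sync() does below when its conflict scan is interrupted.

static void scan_arrays(void)
{
	mddev_t *mddev;
	struct list_head *tmp;

	ITERATE_MDDEV(mddev, tmp) {
		if (mddev_lock(mddev) == 0) {
			/* inspect or reconfigure the array ... */
			mddev_unlock(mddev);
		}
		/* on normal exit the macro drops the reference itself */
	}
}
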
 
 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -249,13 +267,12 @@ char * partition_name(kdev_t dev)
 	struct gendisk *hd;
 	static char nomem [] = "<nomem>";
 	dev_name_t *dname;
-	struct list_head *tmp = device_names.next;
+	struct list_head *tmp;
 
-	while (tmp != &device_names) {
+	list_for_each(tmp, &device_names) {
 		dname = list_entry(tmp, dev_name_t, list);
 		if (kdev_same(dname->dev, dev))
 			return dname->name;
-		tmp = tmp->next;
 	}
 
 	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
@@ -275,7 +292,6 @@ char * partition_name(kdev_t dev)
 	}
 
 	dname->dev = dev;
-	INIT_LIST_HEAD(&dname->list);
 	list_add(&dname->list, &device_names);
 
 	return dname->name;
@@ -326,69 +342,6 @@ static unsigned int zoned_raid_size(mddev_t *mddev)
 	return 0;
 }
 
-/*
- * We check wether all devices are numbered from 0 to nb_dev-1. The
- * order is guaranteed even after device name changes.
- *
- * Some personalities (raid0, linear) use this. Personalities that
- * provide data have to be able to deal with loss of individual
- * disks, so they do their checking themselves.
- */
-int md_check_ordering(mddev_t *mddev)
-{
-	int i, c;
-	mdk_rdev_t *rdev;
-	struct list_head *tmp;
-
-	/*
-	 * First, all devices must be fully functional
-	 */
-	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty) {
-			printk(KERN_ERR "md: md%d's device %s faulty, aborting.\n",
-			       mdidx(mddev), partition_name(rdev->dev));
-			goto abort;
-		}
-	}
-
-	c = 0;
-	ITERATE_RDEV(mddev,rdev,tmp) {
-		c++;
-	}
-	if (c != mddev->nb_dev) {
-		MD_BUG();
-		goto abort;
-	}
-	if (mddev->nb_dev != mddev->sb->raid_disks) {
-		printk(KERN_ERR "md: md%d, array needs %d disks, has %d, aborting.\n",
-			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
-		goto abort;
-	}
-	/*
-	 * Now the numbering check
-	 */
-	for (i = 0; i < mddev->nb_dev; i++) {
-		c = 0;
-		ITERATE_RDEV(mddev,rdev,tmp) {
-			if (rdev->desc_nr == i)
-				c++;
-		}
-		if (!c) {
-			printk(KERN_ERR "md: md%d, missing disk #%d, aborting.\n",
-			       mdidx(mddev), i);
-			goto abort;
-		}
-		if (c > 1) {
-			printk(KERN_ERR "md: md%d, too many disks #%d, aborting.\n",
-			       mdidx(mddev), i);
-			goto abort;
-		}
-	}
-	return 0;
-abort:
-	return 1;
-}
-
 static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
 {
 	if (disk_active(disk)) {
@@ -618,8 +571,7 @@ static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	list_add(&rdev->same_set, &mddev->disks);
 	rdev->mddev = mddev;
-	mddev->nb_dev++;
-	printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
+	printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
 }
 
 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
@@ -628,11 +580,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	list_del(&rdev->same_set);
-	INIT_LIST_HEAD(&rdev->same_set);
-	rdev->mddev->nb_dev--;
-	printk(KERN_INFO "md: unbind<%s,%d>\n", partition_name(rdev->dev),
-						 rdev->mddev->nb_dev);
+	list_del_init(&rdev->same_set);
+	printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
 	rdev->mddev = NULL;
 }
 
@@ -682,13 +631,11 @@ static void export_rdev(mdk_rdev_t * rdev)
 		MD_BUG();
 	unlock_rdev(rdev);
 	free_disk_sb(rdev);
-	list_del(&rdev->all);
-	INIT_LIST_HEAD(&rdev->all);
-	if (rdev->pending.next != &rdev->pending) {
+	list_del_init(&rdev->all);
+	if (!list_empty(&rdev->pending)) {
 		printk(KERN_INFO "md: (%s was pending)\n",
 			partition_name(rdev->dev));
-		list_del(&rdev->pending);
-		INIT_LIST_HEAD(&rdev->pending);
+		list_del_init(&rdev->pending);
 	}
 #ifndef MODULE
 	md_autodetect_dev(rdev->dev);
@@ -722,7 +669,7 @@ static void export_array(mddev_t *mddev)
 		}
 		kick_rdev_from_array(rdev);
 	}
-	if (mddev->nb_dev)
+	if (!list_empty(&mddev->disks))
 		MD_BUG();
 }
 
@@ -736,21 +683,6 @@ static void free_mddev(mddev_t *mddev)
 	export_array(mddev);
 	md_size[mdidx(mddev)] = 0;
 	md_hd_struct[mdidx(mddev)].nr_sects = 0;
-
-	/*
-	 * Make sure nobody else is using this mddev
-	 * (careful, we rely on the global kernel lock here)
-	 */
-	while (atomic_read(&mddev->resync_sem.count) != 1)
-		schedule();
-	while (atomic_read(&mddev->recovery_sem.count) != 1)
-		schedule();
-
-	del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
-	list_del(&mddev->all_mddevs);
-	INIT_LIST_HEAD(&mddev->all_mddevs);
-	kfree(mddev);
-	MOD_DEC_USE_COUNT;
 }
 
 #undef BAD_CSUM
@@ -892,12 +824,10 @@ static mdk_rdev_t * find_rdev_all(kdev_t dev)
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 
-	tmp = all_raid_disks.next;
-	while (tmp != &all_raid_disks) {
+	list_for_each(tmp, &all_raid_disks) {
 		rdev = list_entry(tmp, mdk_rdev_t, all);
 		if (kdev_same(rdev->dev, dev))
 			return rdev;
-		tmp = tmp->next;
 	}
 	return NULL;
 }
@@ -993,12 +923,13 @@ static int sync_sbs(mddev_t * mddev)
 	return 0;
 }
 
-int md_update_sb(mddev_t * mddev)
+void __md_update_sb(mddev_t * mddev)
 {
 	int err, count = 100;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 
+	mddev->sb_dirty = 0;
 repeat:
 	mddev->sb->utime = CURRENT_TIME;
 	if (!(++mddev->sb->events_lo))
@@ -1020,7 +951,7 @@ int md_update_sb(mddev_t * mddev)
 	 * nonpersistent superblocks
 	 */
 	if (mddev->sb->not_persistent)
-		return 0;
+		return;
 
 	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
 					mdidx(mddev));
@@ -1048,9 +979,18 @@ int md_update_sb(mddev_t * mddev)
 		}
 		printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
 	}
-	return 0;
 }
 
+void md_update_sb(mddev_t *mddev)
+{
+	if (mddev_lock(mddev))
+		return;
+	if (mddev->sb_dirty)
+		__md_update_sb(mddev);
+	mddev_unlock(mddev);
+}
+
+
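
The superblock path is now split along reconfig_sem: __md_update_sb() assumes the caller already holds the lock and clears sb_dirty itself, while md_update_sb() takes the lock and writes only if the superblock is actually dirty. A sketch of the resulting calling convention (mark_and_sync is an assumed helper, not in this patch):

static void mark_and_sync(mddev_t *mddev)
{
	mddev->sb_dirty = 1;
	md_update_sb(mddev);	/* locks, tests sb_dirty, clears it */
}

This is why hot_add_disk() and hot_remove_disk() later in the patch stop setting sb_dirty by hand before the locked variant, and why multipathd() no longer clears it.
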
 /*
  * Import a device. If 'on_disk', then sanity check the superblock
  *
@@ -1122,6 +1062,7 @@ static int md_import_device(kdev_t newdev, int on_disk)
 	}
 	list_add(&rdev->all, &all_raid_disks);
 	INIT_LIST_HEAD(&rdev->pending);
+	INIT_LIST_HEAD(&rdev->same_set);
 
 	if (rdev->faulty && rdev->sb)
 		free_disk_sb(rdev);
@@ -1574,7 +1515,6 @@ static int device_size_calculation(mddev_t * mddev)
 		if (sb->level == -3)
 			readahead = 0;
 	}
-	md_maxreadahead[mdidx(mddev)] = readahead;
 
 	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
 		mdidx(mddev), readahead*(PAGE_SIZE/1024));
@@ -1605,7 +1545,7 @@ static int do_md_run(mddev_t * mddev)
 	mdk_rdev_t *rdev;
 
 
-	if (!mddev->nb_dev) {
+	if (list_empty(&mddev->disks)) {
 		MD_BUG();
 		return -EINVAL;
 	}
@@ -1630,9 +1570,6 @@ static int do_md_run(mddev_t * mddev)
 	chunk_size = mddev->sb->chunk_size;
 	pnum = level_to_pers(mddev->sb->level);
 
-	mddev->param.chunk_size = chunk_size;
-	mddev->param.personality = pnum;
-
 	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
 		if (!chunk_size) {
 			/*
@@ -1712,6 +1649,9 @@ static int do_md_run(mddev_t * mddev)
 	}
 	mddev->pers = pers[pnum];
 
+	blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+	mddev->queue.queuedata = mddev;
+
 	err = mddev->pers->run(mddev);
 	if (err) {
 		printk(KERN_ERR "md: pers->run() failed ...\n");
@@ -1719,9 +1659,15 @@ static int do_md_run(mddev_t * mddev)
 		return -EINVAL;
 	}
 
-	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
-	md_update_sb(mddev);
+	mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN));
+	/* if personality doesn't have "sync_request", then
+	 * a dirty array doesn't mean anything
+	 */
+	if (mddev->pers->sync_request)
+		mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+	__md_update_sb(mddev);
 
+	md_recover_arrays();
 	/*
 	 * md_size has units of 1K blocks, which are
 	 * twice as large as sectors.
@@ -1736,21 +1682,21 @@ static int do_md_run(mddev_t * mddev)
 #undef TOO_BIG_CHUNKSIZE
 #undef BAD_CHUNKSIZE
 
-#define OUT(x) do { err = (x); goto out; } while (0)
-
 static int restart_array(mddev_t *mddev)
 {
-	int err = 0;
+	int err;
 
 	/*
 	 * Complain if it has no devices
 	 */
-	if (!mddev->nb_dev)
-		OUT(-ENXIO);
+	err = -ENXIO;
+	if (list_empty(&mddev->disks))
+		goto out;
 
 	if (mddev->pers) {
+		err = -EBUSY;
 		if (!mddev->ro)
-			OUT(-EBUSY);
+			goto out;
 
 		mddev->ro = 0;
 		set_device_ro(mddev_to_kdev(mddev), 0);
@@ -1761,8 +1707,7 @@ static int restart_array(mddev_t *mddev)
 		 * Kick recovery or resync if necessary
 		 */
 		md_recover_arrays();
-		if (mddev->pers->restart_resync)
-			mddev->pers->restart_resync(mddev);
+		err = 0;
 	} else {
 		printk(KERN_ERR "md: md%d has no personality assigned.\n",
 			mdidx(mddev));
@@ -1780,49 +1725,43 @@ static int restart_array(mddev_t *mddev)
 
 static int do_md_stop(mddev_t * mddev, int ro)
 {
-	int err = 0, resync_interrupted = 0;
+	int err = 0;
 	kdev_t dev = mddev_to_kdev(mddev);
 
 	if (atomic_read(&mddev->active)>1) {
 		printk(STILL_IN_USE, mdidx(mddev));
-		OUT(-EBUSY);
+		err = -EBUSY;
+		goto out;
 	}
 
 	if (mddev->pers) {
-		/*
-		 * It is safe to call stop here, it only frees private
-		 * data. Also, it tells us if a device is unstoppable
-		 * (eg. resyncing is in progress)
-		 */
-		if (mddev->pers->stop_resync)
-			if (mddev->pers->stop_resync(mddev))
-				resync_interrupted = 1;
-
-		if (mddev->recovery_running)
-			md_interrupt_thread(md_recovery_thread);
-
-		/*
-		 * This synchronizes with signal delivery to the
-		 * resync or reconstruction thread. It also nicely
-		 * hangs the process if some reconstruction has not
-		 * finished.
-		 */
-		down(&mddev->recovery_sem);
-		up(&mddev->recovery_sem);
+		if (mddev->sync_thread) {
+			if (mddev->recovery_running > 0)
+				mddev->recovery_running = -EINTR;
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			if (mddev->spare) {
+				mddev->pers->diskop(mddev, &mddev->spare,
+						    DISKOP_SPARE_INACTIVE);
+				mddev->spare = NULL;
+			}
+		}
 
 		invalidate_device(dev, 1);
 
 		if (ro) {
+			err  = -ENXIO;
 			if (mddev->ro)
-				OUT(-ENXIO);
+				goto out;
 			mddev->ro = 1;
 		} else {
 			if (mddev->ro)
 				set_device_ro(dev, 0);
 			if (mddev->pers->stop(mddev)) {
+				err = -EBUSY;
 				if (mddev->ro)
 					set_device_ro(dev, 1);
-				OUT(-EBUSY);
+				goto out;
 			}
 			if (mddev->ro)
 				mddev->ro = 0;
@@ -1832,11 +1771,11 @@ static int do_md_stop(mddev_t * mddev, int ro)
 			 * mark it clean only if there was no resync
 			 * interrupted.
 			 */
-			if (!mddev->recovery_running && !resync_interrupted) {
+			if (mddev->in_sync) {
 				printk(KERN_INFO "md: marking sb clean...\n");
 				mddev->sb->state |= 1 << MD_SB_CLEAN;
 			}
-			md_update_sb(mddev);
+			__md_update_sb(mddev);
 		}
 		if (ro)
 			set_device_ro(dev, 1);
@@ -1848,15 +1787,13 @@ static int do_md_stop(mddev_t * mddev, int ro)
 	if (!ro) {
 		printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
 		free_mddev(mddev);
-
 	} else
 		printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+	err = 0;
 out:
 	return err;
 }
 
-#undef OUT
-
 /*
  * We have to safely support old arrays too.
  */
@@ -1877,7 +1814,7 @@ static void autorun_array(mddev_t *mddev)
 	struct list_head *tmp;
 	int err;
 
-	if (mddev->disks.prev == &mddev->disks) {
+	if (list_empty(&mddev->disks)) {
 		MD_BUG();
 		return;
 	}
@@ -1912,17 +1849,15 @@ static void autorun_array(mddev_t *mddev)
  *
  * If "unit" is allocated, then bump its reference count
  */
-static void autorun_devices(kdev_t countdev)
+static void autorun_devices(void)
 {
 	struct list_head candidates;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev0, *rdev;
 	mddev_t *mddev;
-	kdev_t md_kdev;
-
 
 	printk(KERN_INFO "md: autorun ...\n");
-	while (pending_raid_disks.next != &pending_raid_disks) {
+	while (!list_empty(&pending_raid_disks)) {
 		rdev0 = list_entry(pending_raid_disks.next,
 					 mdk_rdev_t, pending);
 
@@ -1946,29 +1881,34 @@ static void autorun_devices(kdev_t countdev)
 		 * mostly sane superblocks. It's time to allocate the
 		 * mddev.
 		 */
-		md_kdev = mk_kdev(MD_MAJOR, rdev0->sb->md_minor);
-		mddev = kdev_to_mddev(md_kdev);
-		if (mddev) {
-			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
-			       mdidx(mddev), partition_name(rdev0->dev));
-			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
-				export_rdev(rdev);
-			continue;
-		}
-		mddev = alloc_mddev(md_kdev);
+
+		mddev = mddev_find(rdev0->sb->md_minor);
 		if (!mddev) {
 			printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
 			break;
 		}
-		if (kdev_same(md_kdev, countdev))
-			atomic_inc(&mddev->active);
-		printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
-		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
-			bind_rdev_to_array(rdev, mddev);
-			list_del(&rdev->pending);
-			INIT_LIST_HEAD(&rdev->pending);
+		if (mddev_lock(mddev)) 
+			printk(KERN_WARNING "md: md%d locked, cannot run\n",
+			       mdidx(mddev));
+		else if (mddev->sb || !list_empty(&mddev->disks)) {
+			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+			       mdidx(mddev), partition_name(rdev0->dev));
+			mddev_unlock(mddev);
+		} else {
+			printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+				bind_rdev_to_array(rdev, mddev);
+				list_del_init(&rdev->pending);
+			}
+			autorun_array(mddev);
+			mddev_unlock(mddev);
 		}
-		autorun_array(mddev);
+		/* on success, candidates will be empty, on error
+		 * it won't...
+		 */
+		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+			export_rdev(rdev);
+		mddev_put(mddev);
 	}
 	printk(KERN_INFO "md: ... autorun DONE.\n");
 }
@@ -2005,7 +1945,7 @@ static void autorun_devices(kdev_t countdev)
 #define AUTORUNNING KERN_INFO \
 "md: auto-running md%d.\n"
 
-static int autostart_array(kdev_t startdev, kdev_t countdev)
+static int autostart_array(kdev_t startdev)
 {
 	int err = -EINVAL, i;
 	mdp_super_t *sb = NULL;
@@ -2065,7 +2005,7 @@ static int autostart_array(kdev_t startdev, kdev_t countdev)
 	/*
 	 * possibly return codes
 	 */
-	autorun_devices(countdev);
+	autorun_devices();
 	return 0;
 
 abort:
@@ -2191,7 +2131,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			MD_BUG();
 			return -EINVAL;
 		}
-		if (mddev->nb_dev) {
+		if (!list_empty(&mddev->disks)) {
 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
 							mdk_rdev_t, same_set);
 			if (!uuid_equal(rdev0, rdev)) {
@@ -2346,8 +2286,7 @@ static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
 
 	remove_descriptor(disk, mddev->sb);
 	kick_rdev_from_array(rdev);
-	mddev->sb_dirty = 1;
-	md_update_sb(mddev);
+	__md_update_sb(mddev);
 
 	return 0;
 busy:
@@ -2458,9 +2397,7 @@ static int hot_add_disk(mddev_t * mddev, kdev_t dev)
 	mddev->sb->spare_disks++;
 	mddev->sb->working_disks++;
 
-	mddev->sb_dirty = 1;
-
-	md_update_sb(mddev);
+	__md_update_sb(mddev);
 
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
@@ -2520,36 +2457,6 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 }
 #undef SET_SB
 
-static int set_disk_info(mddev_t * mddev, void * arg)
-{
-	printk(KERN_INFO "md: not yet");
-	return -EINVAL;
-}
-
-static int clear_array(mddev_t * mddev)
-{
-	printk(KERN_INFO "md: not yet");
-	return -EINVAL;
-}
-
-static int write_raid_info(mddev_t * mddev)
-{
-	printk(KERN_INFO "md: not yet");
-	return -EINVAL;
-}
-
-static int protect_array(mddev_t * mddev)
-{
-	printk(KERN_INFO "md: not yet");
-	return -EINVAL;
-}
-
-static int unprotect_array(mddev_t * mddev)
-{
-	printk(KERN_INFO "md: not yet");
-	return -EINVAL;
-}
-
 static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
 {
 	mdk_rdev_t *rdev;
@@ -2595,7 +2502,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
 		case PRINT_RAID_DEBUG:
 			err = 0;
 			md_print_devices();
-			goto done_unlock;
+			goto done;
 
 #ifndef MODULE
 		case RAID_AUTORUN:
@@ -2632,40 +2539,30 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * Commands creating/starting a new array:
 	 */
 
-	mddev = kdev_to_mddev(dev);
+	mddev = inode->i_bdev->bd_inode->u.generic_ip;
 
-	switch (cmd)
-	{
-		case SET_ARRAY_INFO:
-		case START_ARRAY:
-			if (mddev) {
-				printk(KERN_WARNING "md: array md%d already exists!\n",
-								mdidx(mddev));
-				err = -EEXIST;
-				goto abort;
-			}
-		default:;
+	if (!mddev) {
+		BUG();
+		goto abort;
 	}
+
+	err = mddev_lock(mddev);
+	if (err) {
+		printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",
+		       err, cmd);
+		goto abort;
+	}
+
 	switch (cmd)
 	{
 		case SET_ARRAY_INFO:
-			mddev = alloc_mddev(dev);
-			if (!mddev) {
-				err = -ENOMEM;
-				goto abort;
-			}
-			atomic_inc(&mddev->active);
 
-			/*
-			 * alloc_mddev() should possibly self-lock.
-			 */
-			err = lock_mddev(mddev);
-			if (err) {
-				printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
-				       err, cmd);
-				goto abort;
+			if (!list_empty(&mddev->disks)) {
+				printk(KERN_WARNING "md: array md%d already has disks!\n",
+					mdidx(mddev));
+				err = -EBUSY;
+				goto abort_unlock;
 			}
-
 			if (mddev->sb) {
 				printk(KERN_WARNING "md: array md%d already has a superblock!\n",
 					mdidx(mddev));
@@ -2690,13 +2587,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			/*
 			 * possibly make it lock the array ...
 			 */
-			err = autostart_array(val_to_kdev(arg), dev);
+			err = autostart_array(val_to_kdev(arg));
 			if (err) {
 				printk(KERN_WARNING "md: autostart %s failed!\n",
 					partition_name(val_to_kdev(arg)));
-				goto abort;
+				goto abort_unlock;
 			}
-			goto done;
+			goto done_unlock;
 
 		default:;
 	}
@@ -2704,16 +2601,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	/*
 	 * Commands querying/configuring an existing array:
 	 */
-
-	if (!mddev) {
-		err = -ENODEV;
-		goto abort;
-	}
-	err = lock_mddev(mddev);
-	if (err) {
-		printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
-		goto abort;
-	}
 	/* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
 	if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
 		err = -ENODEV;
@@ -2738,8 +2625,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			goto done_unlock;
 
 		case STOP_ARRAY:
-			if (!(err = do_md_stop (mddev, 0)))
-				mddev = NULL;
+			err = do_md_stop (mddev, 0);
 			goto done_unlock;
 
 		case STOP_ARRAY_RO:
@@ -2784,10 +2670,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 
 	switch (cmd)
 	{
-		case CLEAR_ARRAY:
-			err = clear_array(mddev);
-			goto done_unlock;
-
 		case ADD_NEW_DISK:
 		{
 			mdu_disk_info_t info;
@@ -2808,35 +2690,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			err = hot_add_disk(mddev, val_to_kdev(arg));
 			goto done_unlock;
 
-		case SET_DISK_INFO:
-			err = set_disk_info(mddev, (void *)arg);
-			goto done_unlock;
-
-		case WRITE_RAID_INFO:
-			err = write_raid_info(mddev);
-			goto done_unlock;
-
-		case UNPROTECT_ARRAY:
-			err = unprotect_array(mddev);
-			goto done_unlock;
-
-		case PROTECT_ARRAY:
-			err = protect_array(mddev);
-			goto done_unlock;
-
 		case SET_DISK_FAULTY:
 			err = set_disk_faulty(mddev, val_to_kdev(arg));
 			goto done_unlock;
 
 		case RUN_ARRAY:
 		{
-/* The data is never used....
-			mdu_param_t param;
-			err = copy_from_user(&param, (mdu_param_t *)arg,
-							 sizeof(param));
-			if (err)
-				goto abort_unlock;
-*/
 			err = do_md_run (mddev);
 			/*
 			 * we have to clean up the mess if
@@ -2845,8 +2704,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			 */
 			if (err) {
 				mddev->sb_dirty = 0;
-				if (!do_md_stop (mddev, 0))
-					mddev = NULL;
+				do_md_stop (mddev, 0);
 			}
 			goto done_unlock;
 		}
@@ -2861,8 +2719,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
 
 done_unlock:
 abort_unlock:
-	if (mddev)
-		unlock_mddev(mddev);
+	mddev_unlock(mddev);
 
 	return err;
 done:
@@ -2875,19 +2732,34 @@ static int md_ioctl(struct inode *inode, struct file *file,
 static int md_open(struct inode *inode, struct file *file)
 {
 	/*
-	 * Always succeed, but increment the usage count
+	 * Succeed if we can find or allocate a mddev structure.
 	 */
-	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
-	if (mddev)
-		atomic_inc(&mddev->active);
-	return (0);
+	mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+	int err = -ENOMEM;
+
+	if (!mddev)
+		goto out;
+
+	if ((err = mddev_lock(mddev)))
+		goto put;
+
+	err = 0;
+	mddev_unlock(mddev);
+	inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+	mddev_put(mddev);
+ out:
+	return err;
 }
 
 static int md_release(struct inode *inode, struct file * file)
 {
-	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
-	if (mddev)
-		atomic_dec(&mddev->active);
+ 	mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+	if (!mddev)
+		BUG();
+	mddev_put(mddev);
+
 	return 0;
 }
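
The open/release pair now pins the mddev through the block inode. A sketch of the reference flow, assuming every opener goes through these two entry points:

/*
 * md_open:    mddev_find()               +1  (lookup reference)
 *             mddev_get() -> generic_ip  +1  (held for the open file)
 *             mddev_put()                -1  (drop the lookup reference)
 * md_release: mddev_put()                -1  (drop the stored reference)
 *
 * Net effect: each open file holds exactly one reference, so an array
 * cannot be freed while a file descriptor still points at it.
 */
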
 
@@ -2918,6 +2790,7 @@ int md_thread(void * arg)
 	 */
 
 	daemonize();
+	reparent_to_init();
 
 	sprintf(current->comm, thread->name);
 	current->exit_signal = SIGCHLD;
@@ -2941,17 +2814,10 @@ int md_thread(void * arg)
 	complete(thread->event);
 	while (thread->run) {
 		void (*run)(void *data);
-		DECLARE_WAITQUEUE(wait, current);
 
-		add_wait_queue(&thread->wqueue, &wait);
-		set_task_state(current, TASK_INTERRUPTIBLE);
-		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
-			dprintk("md: thread %p went to sleep.\n", thread);
-			schedule();
-			dprintk("md: thread %p woke up.\n", thread);
-		}
-		current->state = TASK_RUNNING;
-		remove_wait_queue(&thread->wqueue, &wait);
+		wait_event_interruptible(thread->wqueue,
+					 test_bit(THREAD_WAKEUP, &thread->flags));
+
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
 		run = thread->run;
@@ -3026,7 +2892,7 @@ void md_unregister_thread(mdk_thread_t *thread)
 	kfree(thread);
 }
 
-void md_recover_arrays(void)
+static void md_recover_arrays(void)
 {
 	if (!md_recovery_thread) {
 		MD_BUG();
@@ -3042,7 +2908,7 @@ int md_error(mddev_t *mddev, struct block_device *bdev)
 	kdev_t rdev = to_kdev_t(bdev->bd_dev);
 
 	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
-		major(dev),minor(dev),major(rdev),minor(rdev),
+		MD_MAJOR,mdidx(mddev),major(rdev),minor(rdev),
 		__builtin_return_address(0),__builtin_return_address(1),
 		__builtin_return_address(2),__builtin_return_address(3));
 
@@ -3055,17 +2921,14 @@ int md_error(mddev_t *mddev, struct block_device *bdev)
 		return 0;
 	if (!mddev->pers->error_handler
 			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
-		free_disk_sb(rrdev);
 		rrdev->faulty = 1;
 	} else
 		return 1;
 	/*
 	 * if recovery was running, stop it now.
 	 */
-	if (mddev->pers->stop_resync)
-		mddev->pers->stop_resync(mddev);
-	if (mddev->recovery_running)
-		md_interrupt_thread(md_recovery_thread);
+	if (mddev->recovery_running) 
+		mddev->recovery_running = -EIO;
 	md_recover_arrays();
 
 	return 0;
@@ -3080,7 +2943,7 @@ static int status_unused(char * page)
 	sz += sprintf(page + sz, "unused devices: ");
 
 	ITERATE_RDEV_ALL(rdev,tmp) {
-		if (!rdev->same_set.next && !rdev->same_set.prev) {
+		if (list_empty(&rdev->same_set)) {
 			/*
 			 * The device is not yet used by any array.
 			 */
@@ -3123,18 +2986,9 @@ static int status_resync(char * page, mddev_t * mddev)
 			sz += sprintf(page + sz, ".");
 		sz += sprintf(page + sz, "] ");
 	}
-	if (!mddev->recovery_running)
-		/*
-		 * true resync
-		 */
-		sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
-				res/10, res % 10, resync, max_blocks);
-	else
-		/*
-		 * recovery ...
-		 */
-		sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
-				res/10, res % 10, resync, max_blocks);
+	sz += sprintf(page + sz, " %s =%3lu.%lu%% (%lu/%lu)",
+		      (mddev->spare ? "recovery" : "resync"),
+		      res/10, res % 10, resync, max_blocks);
 
 	/*
 	 * We do not want to overflow, so the order of operands and
@@ -3172,7 +3026,7 @@ static int md_status_read_proc(char *page, char **start, off_t off,
 
 	sz += sprintf(page+sz, "\n");
 
-	ITERATE_MDDEV(mddev,tmp) {
+	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
 		sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
 						mddev->pers ? "" : "in");
 		if (mddev->pers) {
@@ -3192,7 +3046,7 @@ static int md_status_read_proc(char *page, char **start, off_t off,
 			size += rdev->size;
 		}
 
-		if (mddev->nb_dev) {
+		if (!list_empty(&mddev->disks)) {
 			if (mddev->pers)
 				sz += sprintf(page + sz, "\n      %d blocks",
 						 md_size[mdidx(mddev)]);
@@ -3202,19 +3056,20 @@ static int md_status_read_proc(char *page, char **start, off_t off,
 
 		if (!mddev->pers) {
 			sz += sprintf(page+sz, "\n");
+			mddev_unlock(mddev);
 			continue;
 		}
 
 		sz += mddev->pers->status (page+sz, mddev);
 
 		sz += sprintf(page+sz, "\n      ");
-		if (mddev->curr_resync) {
+		if (mddev->curr_resync > 1)
 			sz += status_resync (page+sz, mddev);
-		} else {
-			if (atomic_read(&mddev->resync_sem.count) != 1)
+		else if (mddev->curr_resync == 1)
 				sz += sprintf(page + sz, "	resync=DELAYED");
-		}
+
 		sz += sprintf(page + sz, "\n");
+		mddev_unlock(mddev);
 	}
 	sz += status_unused(page + sz);
 
@@ -3315,60 +3170,70 @@ static int is_mddev_idle(mddev_t *mddev)
 	return idle;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(resync_wait);
-
 void md_done_sync(mddev_t *mddev, int blocks, int ok)
 {
 	/* another "blocks" (512byte) blocks have been synced */
 	atomic_sub(blocks, &mddev->recovery_active);
 	wake_up(&mddev->recovery_wait);
 	if (!ok) {
+		mddev->recovery_running = -EIO;
+		md_recover_arrays();
 		// stop recovery, signal do_sync ....
 	}
 }
 
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
-int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+static void md_do_sync(void *data)
 {
+	mddev_t *mddev = data;
 	mddev_t *mddev2;
 	unsigned int max_sectors, currspeed = 0,
-		j, window, err, serialize;
+		j, window, err;
 	unsigned long mark[SYNC_MARKS];
 	unsigned long mark_cnt[SYNC_MARKS];
 	int last_mark,m;
 	struct list_head *tmp;
 	unsigned long last_check;
 
+	/* just in case the thread restarts... */
+	if (mddev->recovery_running <= 0)
+		return;
 
-	err = down_interruptible(&mddev->resync_sem);
-	if (err)
-		goto out_nolock;
+	/* we overload curr_resync somewhat here.
+	 * 0 == not engaged in resync at all
+	 * 2 == checking that there is no conflict with another sync
+	 * 1 == like 2, but has yielded to allow a conflicting resync to
+	 *		commence
+	 * other == active in resync - this many blocks
+	 */
+	do {
+		mddev->curr_resync = 2;
 
-recheck:
-	serialize = 0;
-	ITERATE_MDDEV(mddev2,tmp) {
-		if (mddev2 == mddev)
-			continue;
-		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
-			printk(KERN_INFO "md: delaying resync of md%d until md%d "
-			       "has finished resync (they share one or more physical units)\n",
-			       mdidx(mddev), mdidx(mddev2));
-			serialize = 1;
-			break;
-		}
-	}
-	if (serialize) {
-		interruptible_sleep_on(&resync_wait);
-		if (signal_pending(current)) {
-			flush_curr_signals();
-			err = -EINTR;
-			goto out;
+		ITERATE_MDDEV(mddev2,tmp) {
+			if (mddev2 == mddev)
+				continue;
+			if (mddev2->curr_resync && 
+			    match_mddev_units(mddev,mddev2)) {
+				printk(KERN_INFO "md: delaying resync of md%d until md%d "
+				       "has finished resync (they share one or more physical units)\n",
+				       mdidx(mddev), mdidx(mddev2));
+				if (mddev < mddev2) /* arbitrarily yield */
+					mddev->curr_resync = 1;
+				if (wait_event_interruptible(resync_wait,
+							     mddev2->curr_resync < 2)) {
+					flush_curr_signals();
+					err = -EINTR;
+					mddev_put(mddev2);
+					goto out;
+				}
+			}
 		}
-		goto recheck;
-	}
+	} while (mddev->curr_resync < 2);
 
-	mddev->curr_resync = 1;
 	max_sectors = mddev->sb->size << 1;
 
 	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
@@ -3406,7 +3271,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
 		}
 		atomic_add(sectors, &mddev->recovery_active);
 		j += sectors;
-		mddev->curr_resync = j;
+	if (j > 1) mddev->curr_resync = j;
 
 		if (last_check + window > j)
 			continue;
@@ -3432,7 +3297,6 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
 			/*
 			 * got a signal, exit.
 			 */
-			mddev->curr_resync = 0;
 			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
 			flush_curr_signals();
 			err = -EINTR;
@@ -3467,106 +3331,116 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
 	 */
 out:
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
-	up(&mddev->resync_sem);
-out_nolock:
+	/* tell personality that we are finished */
+	mddev->pers->sync_request(mddev, max_sectors, 1);
+
 	mddev->curr_resync = 0;
-	wake_up(&resync_wait);
-	return err;
+	if (err)
+		mddev->recovery_running = err;
+	if (mddev->recovery_running > 0)
+		mddev->recovery_running = 0;
+	if (mddev->recovery_running == 0)
+		mddev->in_sync = 1;
+	md_recover_arrays();
 }
 
 
 /*
- * This is a kernel thread which syncs a spare disk with the active array
- *
- * the amount of foolproofing might seem to be a tad excessive, but an
- * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
- * of my root partition with the first 0.5 gigs of my /home partition ... so
- * i'm a bit nervous ;)
+ * This is the kernel thread that watches all md arrays for re-sync action
+ * that might be needed.
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set "->recovery_running" and
+ * create a thread at ->sync_thread.
+ * When the thread finishes it clears recovery_running (or sets an error)
+ * and wakes up this thread, which will reap the thread and finish up.
  */
 void md_do_recovery(void *data)
 {
-	int err;
 	mddev_t *mddev;
 	mdp_super_t *sb;
-	mdp_disk_t *spare;
 	struct list_head *tmp;
 
-	printk(KERN_INFO "md: recovery thread got woken up ...\n");
-restart:
-	ITERATE_MDDEV(mddev,tmp) {
+	dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
 		sb = mddev->sb;
-		if (!sb)
-			continue;
-		if (mddev->recovery_running)
-			continue;
-		if (sb->active_disks == sb->raid_disks)
-			continue;
-		if (!sb->spare_disks) {
-			printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
-			       "-- continuing in degraded mode\n", mdidx(mddev));
-			continue;
-		}
-		/*
-		 * now here we get the spare and resync it.
-		 */
-		spare = get_spare(mddev);
-		if (!spare)
-			continue;
-		printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
-		       mdidx(mddev), partition_name(mk_kdev(spare->major,spare->minor)));
-		if (!mddev->pers->diskop)
-			continue;
-		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
-			continue;
-		down(&mddev->recovery_sem);
-		mddev->recovery_running = 1;
-		err = md_do_sync(mddev, spare);
-		if (err == -EIO) {
-			printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
-			       mdidx(mddev), partition_name(mk_kdev(spare->major,spare->minor)));
-			if (!disk_faulty(spare)) {
-				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
-				mark_disk_faulty(spare);
-				mark_disk_nonsync(spare);
-				mark_disk_inactive(spare);
-				sb->spare_disks--;
-				sb->working_disks--;
-				sb->failed_disks++;
+		if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
+			goto unlock;
+		if (mddev->recovery_running > 0)
+			/* resync/recovery still happening */
+			goto unlock;
+		if (mddev->sync_thread) {
+			/* resync has finished, collect result */
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			if (mddev->recovery_running < 0) {
+				/* some sort of failure.
+				 * If we were doing a reconstruction,
+				 * we need to retrieve the spare
+				 */
+				if (mddev->spare) {
+					mddev->pers->diskop(mddev, &mddev->spare,
+							    DISKOP_SPARE_INACTIVE);
+					mddev->spare = NULL;
+				}
+			} else {
+				/* success...*/
+				if (mddev->spare) {
+					mddev->pers->diskop(mddev, &mddev->spare,
+							    DISKOP_SPARE_ACTIVE);
+					mark_disk_sync(mddev->spare);
+					mark_disk_active(mddev->spare);
+					sb->active_disks++;
+					sb->spare_disks--;
+					mddev->spare = NULL;
+				}
 			}
-		} else
-			if (disk_faulty(spare))
-				mddev->pers->diskop(mddev, &spare,
-						DISKOP_SPARE_INACTIVE);
-		if (err == -EINTR || err == -ENOMEM) {
-			/*
-			 * Recovery got interrupted, or ran out of mem ...
-			 * signal back that we have finished using the array.
-			 */
-			mddev->pers->diskop(mddev, &spare,
-							 DISKOP_SPARE_INACTIVE);
-			up(&mddev->recovery_sem);
+			__md_update_sb(mddev);
 			mddev->recovery_running = 0;
-			continue;
-		} else {
+			wake_up(&resync_wait);
+			goto unlock;
+		}
+		if (mddev->recovery_running) {
+			/* that's odd.. */
 			mddev->recovery_running = 0;
-			up(&mddev->recovery_sem);
+			wake_up(&resync_wait);
 		}
-		if (!disk_faulty(spare)) {
-			/*
-			 * the SPARE_ACTIVE diskop possibly changes the
-			 * pointer too
-			 */
-			mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
-			mark_disk_sync(spare);
-			mark_disk_active(spare);
-			sb->active_disks++;
-			sb->spare_disks--;
+
+		if (sb->active_disks < sb->raid_disks) {
+			mddev->spare = get_spare(mddev);
+			if (!mddev->spare)
+				printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+				       "-- continuing in degraded mode\n", mdidx(mddev));
+			else
+				printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+				       mdidx(mddev), partition_name(mk_kdev(mddev->spare->major,mddev->spare->minor)));
+		}
+		if (!mddev->spare && mddev->in_sync) {
+			/* nothing we can do ... */
+			goto unlock;
+		}
+		if (mddev->pers->sync_request) {
+			mddev->sync_thread = md_register_thread(md_do_sync,
+								mddev,
+								"md_resync");
+			if (!mddev->sync_thread) {
+				printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
+				if (mddev->spare)
+					mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE);
+				mddev->spare = NULL;
+				mddev->recovery_running = 0;
+			} else {
+				if (mddev->spare)
+					mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE);
+				mddev->recovery_running = 1;
+				md_wakeup_thread(mddev->sync_thread);
+			}
 		}
-		mddev->sb_dirty = 1;
-		md_update_sb(mddev);
-		goto restart;
+	unlock:
+		mddev_unlock(mddev);
 	}
-	printk(KERN_INFO "md: recovery thread finished ...\n");
+	dprintk(KERN_INFO "md: recovery thread finished ...\n");
 
 }
 
@@ -3582,7 +3456,8 @@ int md_notify_reboot(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 		ITERATE_MDDEV(mddev,tmp)
-			do_md_stop (mddev, 1);
+			if (mddev_trylock(mddev)==0)
+				do_md_stop (mddev, 1);
 		/*
 		 * certain more exotic SCSI devices are known to be
 		 * volatile wrt too early system reboots. While the
@@ -3606,7 +3481,6 @@ static void md_geninit(void)
 
 	for(i = 0; i < MAX_MD_DEVS; i++) {
 		md_size[i] = 0;
-		md_maxreadahead[i] = 32;
 	}
 	blk_size[MAJOR_NR] = md_size;
 
@@ -3617,6 +3491,18 @@ static void md_geninit(void)
 #endif
 }
 
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+	mddev_t *mddev = mddev_find(minor(dev));
+	request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR);
+	if (!mddev || atomic_read(&mddev->active)<2)
+		BUG();
+	if (mddev->pers)
+		q = &mddev->queue;
+	mddev_put(mddev); /* the caller must hold a reference... */
+	return q;
+}
+
 int __init md_init(void)
 {
 	static char * name = "mdrecoveryd";
@@ -3641,8 +3527,9 @@ int __init md_init(void)
 			S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
 	}
 
-	/* forward all md request to md_make_request */
-	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
+	/* all requests on an uninitialised device get failed... */
+	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+	blk_dev[MAJOR_NR].queue = md_queue_proc;
 
 	add_gendisk(&md_gendisk);
 
@@ -3720,7 +3607,7 @@ static void autostart_arrays(void)
 	}
 	dev_cnt = 0;
 
-	autorun_devices(to_kdev_t(-1));
+	autorun_devices();
 }
 
 static struct {
@@ -3859,17 +3746,27 @@ void __init md_setup_drive(void)
 		if (!md_setup_args.device_set[minor])
 			continue;
 
-		if (mddev_map[minor].mddev) {
+		printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+		mddev = mddev_find(minor);
+		if (!mddev) {
+			printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+			continue;
+		}
+		if (mddev_lock(mddev)) {
 			printk(KERN_WARNING
-			       "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+			       "md: Ignoring md=%d, cannot lock!\n",
 			       minor);
+			mddev_put(mddev);
 			continue;
 		}
-		printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
 
-		mddev = alloc_mddev(mk_kdev(MD_MAJOR,minor));
-		if (!mddev) {
-			printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+		if (mddev->sb || !list_empty(&mddev->disks)) {
+			printk(KERN_WARNING
+			       "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+			       minor);
+			mddev_unlock(mddev);
+			mddev_put(mddev);
 			continue;
 		}
 		if (md_setup_args.pers[minor]) {
@@ -3923,6 +3820,8 @@ void __init md_setup_drive(void)
 			do_md_stop(mddev, 0);
 			printk(KERN_WARNING "md: starting md%d failed\n", minor);
 		}
+		mddev_unlock(mddev);
+		mddev_put(mddev);
 	}
 }
 
@@ -3973,9 +3872,10 @@ int init_module(void)
 
 static void free_device_names(void)
 {
-	while (device_names.next != &device_names) {
-		struct list_head *tmp = device_names.next;
-		list_del(tmp);
+	while (!list_empty(&device_names)) {
+		struct dname *tmp = list_entry(device_names.next,
+					       dev_name_t, list);
+		list_del(&tmp->list);
 		kfree(tmp);
 	}
 }
@@ -4006,10 +3906,8 @@ EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
 EXPORT_SYMBOL(partition_name);
 EXPORT_SYMBOL(md_error);
-EXPORT_SYMBOL(md_do_sync);
 EXPORT_SYMBOL(md_sync_acct);
 EXPORT_SYMBOL(md_done_sync);
-EXPORT_SYMBOL(md_recover_arrays);
 EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_update_sb);
@@ -4017,7 +3915,5 @@ EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(find_rdev_nr);
 EXPORT_SYMBOL(md_interrupt_thread);
-EXPORT_SYMBOL(mddev_map);
-EXPORT_SYMBOL(md_check_ordering);
 EXPORT_SYMBOL(get_spare);
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 6db555317b135bff56226ea9f28189df91b5eff6..32dc200aee669366284a86f619afc34865ef67fa 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -244,27 +244,19 @@ static int multipath_read_balance (multipath_conf_t *conf)
 	return 0;
 }
 
-static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
+static int multipath_make_request (request_queue_t *q, struct bio * bio)
 {
+	mddev_t *mddev = q->queuedata;
 	multipath_conf_t *conf = mddev_to_conf(mddev);
 	struct bio *real_bio;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-/*
- * make_request() can abort the operation when READA is being
- * used and no empty request is available.
- *
- * Currently, just replace the command with READ/WRITE.
- */
-	if (rw == READA)
-		rw = READ;
-
 	mp_bh = multipath_alloc_mpbh (conf);
 
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
-	mp_bh->cmd = rw;
+	mp_bh->cmd = bio_data_dir(bio);
 
 	/*
 	 * read balancing logic:
@@ -273,7 +265,7 @@ static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
 
 	real_bio = bio_clone(bio, GFP_NOIO);
 	real_bio->bi_bdev = multipath->bdev;
-	real_bio->bi_rw = rw;
+	real_bio->bi_rw = bio_data_dir(bio);
 	real_bio->bi_end_io = multipath_end_request;
 	real_bio->bi_private = mp_bh;
 	mp_bh->bio = real_bio;
@@ -708,7 +700,6 @@ static void multipathd (void *data)
 		mddev = mp_bh->mddev;
 		if (mddev->sb_dirty) {
 			printk(KERN_INFO "dirty sb detected, updating.\n");
-			mddev->sb_dirty = 0;
 			md_update_sb(mddev);
 		}
 		bio = mp_bh->bio;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8f149a1efe1b75b56963ff1b0fcfc7b1550fc038..2dd6e9d5f9851fce3bcee2e13764574535bbe295 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,21 +29,26 @@
 
 static int create_strip_zones (mddev_t *mddev)
 {
-	int i, c, j, j1, j2;
+	int i, c, j;
 	unsigned long current_offset, curr_zone_offset;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+	struct list_head *tmp1, *tmp2;
+	struct strip_zone *zone;
+	int cnt;
  
 	/*
 	 * The number of 'same size groups'
 	 */
 	conf->nr_strip_zones = 0;
  
-	ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
+	ITERATE_RDEV(mddev,rdev1,tmp1) {
 		printk("raid0: looking at %s\n", partition_name(rdev1->dev));
 		c = 0;
-		ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
-			printk("raid0:   comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
+		ITERATE_RDEV(mddev,rdev2,tmp2) {
+			printk("raid0:   comparing %s(%ld) with %s(%ld)\n",
+			       partition_name(rdev1->dev), rdev1->size,
+			       partition_name(rdev2->dev), rdev2->size);
 			if (rdev2 == rdev1) {
 				printk("raid0:   END\n");
 				break;
@@ -51,7 +56,7 @@ static int create_strip_zones (mddev_t *mddev)
 			if (rdev2->size == rdev1->size)
 			{
 				/*
-				 * Not unique, dont count it as a new
+				 * Not unique, don't count it as a new
 				 * group
 				 */
 				printk("raid0:   EQUAL\n");
@@ -66,29 +71,62 @@ static int create_strip_zones (mddev_t *mddev)
 			printk("raid0: %d zones\n", conf->nr_strip_zones);
 		}
 	}
-		printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
+	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
 
 	conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones);
 	if (!conf->strip_zone)
 		return 1;
 
+	memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
+				   conf->nr_strip_zones);
+	/* The first zone must contain all devices, so here we check that
+	 * there is a proper alignment of slots to devices and find them all
+	 */
+	zone = &conf->strip_zone[0];
+	cnt = 0;
+	smallest = NULL;
+	ITERATE_RDEV(mddev, rdev1, tmp1) {
+		int j = rdev1->sb->this_disk.raid_disk;
+
+		if (j < 0 || j >= mddev->sb->raid_disks) {
+			printk("raid0: bad disk number %d - aborting!\n", j);
+			goto abort;
+		}
+		if (zone->dev[j]) {
+			printk("raid0: multiple devices for %d - aborting!\n", j);
+			goto abort;
+		}
+		zone->dev[j] = rdev1;
+		if (!smallest || (rdev1->size < smallest->size))
+			smallest = rdev1;
+		cnt++;
+	}
+	if (cnt != mddev->sb->raid_disks) {
+		printk("raid0: too few disks (%d of %d) - aborting!\n", cnt, 
+		       mddev->sb->raid_disks);
+		goto abort;
+	}
+	zone->nb_dev = cnt;
+	zone->size = smallest->size * cnt;
+	zone->zone_offset = 0;
 
-	conf->smallest = NULL;
-	current_offset = 0;
-	curr_zone_offset = 0;
+	conf->smallest = zone;
+	current_offset = smallest->size;
+	curr_zone_offset = zone->size;
 
-	for (i = 0; i < conf->nr_strip_zones; i++)
+	/* now do the other zones */
+	for (i = 1; i < conf->nr_strip_zones; i++)
 	{
-		struct strip_zone *zone = conf->strip_zone + i;
+		zone = conf->strip_zone + i;
 
 		printk("raid0: zone %d\n", i);
 		zone->dev_offset = current_offset;
 		smallest = NULL;
 		c = 0;
 
-		ITERATE_RDEV_ORDERED(mddev,rdev,j) {
-
+		for (j=0; j<cnt; j++) {
+			rdev = conf->strip_zone[0].dev[j];
 			printk("raid0: checking %s ...", partition_name(rdev->dev));
 			if (rdev->size > current_offset)
 			{
@@ -118,6 +156,9 @@ static int create_strip_zones (mddev_t *mddev)
 	}
 	printk("raid0: done.\n");
 	return 0;
+ abort:
+	vfree(conf->strip_zone);
+	return 1;
 }
 
 static int raid0_run (mddev_t *mddev)
@@ -132,11 +173,6 @@ static int raid0_run (mddev_t *mddev)
 		goto out;
 	mddev->private = (void *)conf;
  
-	if (md_check_ordering(mddev)) {
-		printk("raid0: disks are not ordered, aborting!\n");
-		goto out_free_conf;
-	}
-
 	if (create_strip_zones (mddev)) 
 		goto out_free_conf;
 
@@ -225,8 +261,9 @@ static int raid0_stop (mddev_t *mddev)
  * Of course, those facts may not be valid anymore (and surely won't...)
  * Hey guys, there's some work out there ;-)
  */
-static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
+static int raid0_make_request (request_queue_t *q, struct bio *bio)
 {
+	mddev_t *mddev = q->queuedata;
 	unsigned int sect_in_chunk, chunksize_bits,  chunk_size;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	struct raid0_hash *hash;
@@ -234,7 +271,7 @@ static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
 	mdk_rdev_t *tmp_dev;
 	unsigned long chunk, block, rsect;
 
-	chunk_size = mddev->param.chunk_size >> 10;
+	chunk_size = mddev->sb->chunk_size >> 10;
 	chunksize_bits = ffz(~chunk_size);
 	block = bio->bi_sector >> 1;
 	hash = conf->hash_table + block / conf->smallest->size;
@@ -323,7 +360,7 @@ static int raid0_status (char *page, mddev_t *mddev)
 				conf->strip_zone[j].size);
 	}
 #endif
-	sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
+	sz += sprintf(page + sz, " %dk chunks", mddev->sb->chunk_size/1024);
 	return sz;
 }
 
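
The create_strip_zones() rewrite above builds zone 0 across all member disks and then derives each later zone from the next-smallest device. A minimal userspace model of the resulting layout, assuming device sizes are already sorted ascending (hypothetical sizes; this illustrates the algorithm, not the kernel code):

/* Minimal userspace model of the raid0 zone layout (not the kernel
 * code): zone 0 stripes over every device; each later zone stripes
 * what is left of the devices larger than the previous boundary. */
#include <stdio.h>

int main(void)
{
	unsigned long dev_size[] = { 100, 100, 200, 300 };	/* sorted */
	int cnt = sizeof(dev_size) / sizeof(dev_size[0]);
	unsigned long current_offset = 0, curr_zone_offset = 0;
	int i = 0, zone = 0;

	while (i < cnt) {
		int nb_dev = cnt - i;	/* devices at least this large */
		unsigned long size = (dev_size[i] - current_offset) * nb_dev;

		printf("zone %d: dev_offset=%lu zone_offset=%lu ndev=%d size=%lu\n",
		       zone++, current_offset, curr_zone_offset, nb_dev, size);
		current_offset = dev_size[i];
		curr_zone_offset += size;
		while (i < cnt && dev_size[i] == current_offset)
			i++;	/* devices exhausted at this boundary */
	}
	return 0;
}
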
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 96ad858cf033681183b218bb91cafad8a0c6a79a..4c855576f9fecf5b9f42a1661332a37d639460d8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -334,7 +334,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
-	if (conf->resync_mirrors && (this_sector + sectors >= conf->next_resync)) {
+	if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
 		/* make sure that disk is operational */
 		new_disk = 0;
 		while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
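
The read_balance() change replaces the per-array resync_mirrors flag with the generic mddev->in_sync state. A small userspace sketch of the gate it implements, with stand-in types:

/* Sketch of the read-balance gate: while the array is not in_sync,
 * reads at or above the resync frontier must use mirror 0; reads
 * below it are free to balance. */
#include <stdio.h>

struct array_state {
	int in_sync;			/* array known fully consistent */
	unsigned long next_resync;	/* resync frontier, in sectors */
};

static int pick_mirror(const struct array_state *a, unsigned long sector,
		       unsigned long sectors, int balanced_choice)
{
	if (!a->in_sync && sector + sectors >= a->next_resync)
		return 0;		/* must read the primary mirror */
	return balanced_choice;		/* balance by distance/load */
}

int main(void)
{
	struct array_state a = { .in_sync = 0, .next_resync = 1000 };

	printf("read @ 500  -> mirror %d\n", pick_mirror(&a, 500, 8, 1));
	printf("read @ 2000 -> mirror %d\n", pick_mirror(&a, 2000, 8, 1));
	return 0;
}
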
@@ -434,8 +434,9 @@ static void resume_device(conf_t *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static int make_request(mddev_t *mddev, int rw, struct bio * bio)
+static int make_request(request_queue_t *q, struct bio * bio)
 {
+	mddev_t *mddev = q->queuedata;
 	conf_t *conf = mddev_to_conf(mddev);
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
@@ -456,20 +457,16 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 	 * make_request() can abort the operation when READA is being
 	 * used and no empty request is available.
 	 *
-	 * Currently, just replace the command with READ.
 	 */
-	if (rw == READA)
-		rw = READ;
-
 	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
 	r1_bio->master_bio = bio;
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
-	r1_bio->cmd = rw;
+	r1_bio->cmd = bio_data_dir(bio);
 
-	if (rw == READ) {
+	if (r1_bio->cmd == READ) {
 		/*
 		 * read balancing logic:
 		 */
@@ -483,7 +480,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 		read_bio->bi_sector = r1_bio->sector;
 		read_bio->bi_bdev = mirror->bdev;
 		read_bio->bi_end_io = end_request;
-		read_bio->bi_rw = rw;
+		read_bio->bi_rw = r1_bio->cmd;
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -507,7 +504,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector;
 		mbio->bi_bdev = conf->mirrors[i].bdev;
 		mbio->bi_end_io	= end_request;
-		mbio->bi_rw = rw;
+		mbio->bi_rw = r1_bio->cmd;
 		mbio->bi_private = r1_bio;
 
 		sum_bios++;
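
With make_request() losing its rw argument, the direction now comes from the bio itself. A sketch of that derivation, reproducing the 2.5-era READ/WRITE/READA and mask values for illustration:

/* Userspace sketch: direction derived from the bio, not a separate rw
 * argument.  READ/WRITE/READA and the masks mirror the 2.5 values. */
#include <stdio.h>

#define READ	0
#define WRITE	1
#define READA	2		/* read-ahead: may be dropped under load */
#define RW_MASK		1	/* low bit is the data direction */
#define RWA_MASK	2	/* read-ahead hint bit */

struct bio { unsigned long bi_rw; };

#define bio_data_dir(bio)	((bio)->bi_rw & RW_MASK)

int main(void)
{
	struct bio b = { READA };

	/* READA collapses to READ for the data direction ... */
	printf("dir = %s\n", bio_data_dir(&b) == WRITE ? "WRITE" : "READ");
	/* ... while the hint bit stays visible via bi_rw & RWA_MASK */
	printf("readahead hint = %lu\n", b.bi_rw & RWA_MASK);
	return 0;
}
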
@@ -656,6 +653,9 @@ static void close_sync(conf_t *conf)
 	if (conf->barrier) BUG();
 	if (waitqueue_active(&conf->wait_idle)) BUG();
 	if (waitqueue_active(&conf->wait_resume)) BUG();
+
+	mempool_destroy(conf->r1buf_pool);
+	conf->r1buf_pool = NULL;
 }
 
 static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
@@ -772,7 +772,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	 * Deactivate a spare disk:
 	 */
 	case DISKOP_SPARE_INACTIVE:
-		close_sync(conf);
 		sdisk = conf->mirrors + spare_disk;
 		sdisk->operational = 0;
 		sdisk->write_only = 0;
@@ -785,7 +784,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	 * property)
 	 */
 	case DISKOP_SPARE_ACTIVE:
-		close_sync(conf);
 		sdisk = conf->mirrors + spare_disk;
 		fdisk = conf->mirrors + failed_disk;
 
@@ -919,10 +917,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 	}
 abort:
 	spin_unlock_irq(&conf->device_lock);
-	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) {
-		mempool_destroy(conf->r1buf_pool);
-		conf->r1buf_pool = NULL;
-	}
 
 	print_conf(conf);
 	return err;
@@ -1012,7 +1006,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			 * we read from here, no need to write
 			 */
 			continue;
-		if (i < conf->raid_disks && !conf->resync_mirrors)
+		if (i < conf->raid_disks && mddev->in_sync)
 			/*
 			 * don't need to write this we are just rebuilding
 			 */
@@ -1088,7 +1082,6 @@ static void raid1d(void *data)
 		conf = mddev_to_conf(mddev);
 		if (mddev->sb_dirty) {
 			printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
-			mddev->sb_dirty = 0;
 			md_update_sb(mddev);
 		}
 		bio = r1_bio->master_bio;
@@ -1118,31 +1111,6 @@ static void raid1d(void *data)
 	spin_unlock_irqrestore(&retry_list_lock, flags);
 }
 
-/*
- * Private kernel thread to reconstruct mirrors after an unclean
- * shutdown.
- */
-static void raid1syncd(void *data)
-{
-	conf_t *conf = data;
-	mddev_t *mddev = conf->mddev;
-
-	if (!conf->resync_mirrors)
-		return;
-	if (conf->resync_mirrors == 2)
-		return;
-	down(&mddev->recovery_sem);
-	if (!md_do_sync(mddev, NULL)) {
-		/*
-		 * Only if everything went Ok.
-		 */
-		conf->resync_mirrors = 0;
-	}
-
-	close_sync(conf);
-
-	up(&mddev->recovery_sem);
-}
 
 static int init_resync(conf_t *conf)
 {
@@ -1177,9 +1145,16 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	sector_t max_sector, nr_sectors;
 	int disk, partial;
 
-	if (!sector_nr)
+	if (sector_nr == 0)
 		if (init_resync(conf))
 			return -ENOMEM;
+
+	max_sector = mddev->sb->size << 1;
+	if (sector_nr >= max_sector) {
+		close_sync(conf);
+		return 0;
+	}
+
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
@@ -1216,10 +1191,6 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	r1_bio->sector = sector_nr;
 	r1_bio->cmd = SPECIAL;
 
-	max_sector = mddev->sb->size << 1;
-	if (sector_nr >= max_sector)
-		BUG();
-
 	bio = r1_bio->master_bio;
 	nr_sectors = RESYNC_BLOCK_SIZE >> 9;
 	if (max_sector - sector_nr < nr_sectors)
@@ -1302,7 +1273,6 @@ static int run(mddev_t *mddev)
 	mdp_disk_t *descriptor;
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
-	int start_recovery = 0;
 
 	MOD_INC_USE_COUNT;
 
@@ -1454,10 +1424,6 @@ static int run(mddev_t *mddev)
 	conf->last_used = j;
 
 
-	if (conf->working_disks != sb->raid_disks) {
-		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
-		start_recovery = 1;
-	}
 
 	{
 		const char * name = "raid1d";
@@ -1469,20 +1435,6 @@ static int run(mddev_t *mddev)
 		}
 	}
 
-	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
-						(conf->working_disks > 1)) {
-		const char * name = "raid1syncd";
-
-		conf->resync_thread = md_register_thread(raid1syncd, conf, name);
-		if (!conf->resync_thread) {
-			printk(THREAD_ERROR, mdidx(mddev));
-			goto out_free_conf;
-		}
-
-		printk(START_RESYNC, mdidx(mddev));
-		conf->resync_mirrors = 1;
-		md_wakeup_thread(conf->resync_thread);
-	}
 
 	/*
 	 * Regenerate the "device is in sync with the raid set" bit for
@@ -1499,10 +1451,6 @@ static int run(mddev_t *mddev)
 	}
 	sb->active_disks = conf->working_disks;
 
-	if (start_recovery)
-		md_recover_arrays();
-
-
 	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
 	/*
 	 * Ok, everything is just fine now
@@ -1522,47 +1470,12 @@ static int run(mddev_t *mddev)
 	return -EIO;
 }
 
-static int stop_resync(mddev_t *mddev)
-{
-	conf_t *conf = mddev_to_conf(mddev);
-
-	if (conf->resync_thread) {
-		if (conf->resync_mirrors) {
-			conf->resync_mirrors = 2;
-			md_interrupt_thread(conf->resync_thread);
-
-			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
-			return 1;
-		}
-		return 0;
-	}
-	return 0;
-}
-
-static int restart_resync(mddev_t *mddev)
-{
-	conf_t *conf = mddev_to_conf(mddev);
-
-	if (conf->resync_mirrors) {
-		if (!conf->resync_thread) {
-			MD_BUG();
-			return 0;
-		}
-		conf->resync_mirrors = 1;
-		md_wakeup_thread(conf->resync_thread);
-		return 1;
-	}
-	return 0;
-}
-
 static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	int i;
 
 	md_unregister_thread(conf->thread);
-	if (conf->resync_thread)
-		md_unregister_thread(conf->resync_thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	for (i = 0; i < MD_SB_DISKS; i++)
@@ -1583,8 +1496,6 @@ static mdk_personality_t raid1_personality =
 	status:		status,
 	error_handler:	error,
 	diskop:		diskop,
-	stop_resync:	stop_resync,
-	restart_resync:	restart_resync,
 	sync_request:	sync_request
 };
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 62873d89e395c0734759b5540a1dc5c8824ee45f..f19d8d936f44ae2c53bc3b15403f2b9e78dc6fc4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -634,7 +634,6 @@ static void copy_data(int frombio, struct bio *bio,
 		else 
 			page_offset = (signed)(sector - bio->bi_sector) * -512;
 		bio_for_each_segment(bvl, bio, i) {
-			char *ba = __bio_kmap(bio, i);
 			int len = bio_iovec_idx(bio,i)->bv_len;
 			int clen;
 			int b_offset = 0;			
@@ -649,13 +648,16 @@ static void copy_data(int frombio, struct bio *bio,
 				clen = STRIPE_SIZE - page_offset;	
 			else clen = len;
 			
-			if (len > 0) {
+			if (clen > 0) {
+				char *ba = __bio_kmap(bio, i);
 				if (frombio)
 					memcpy(pa+page_offset, ba+b_offset, clen);
 				else
 					memcpy(ba+b_offset, pa+page_offset, clen);
-			}
-			__bio_kunmap(bio, i);
+				__bio_kunmap(bio, i);
+			}
+			if (clen < len) /* hit end of page */
+				break;
 			page_offset +=  len;
 		}
 	}
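
The copy_data() fix tests clen rather than len, maps the page only when there is something to copy, and stops walking segments once a copy is clipped by the stripe page. A standalone model of that clamping logic (plain memcpy stands in for the kmap'd copy):

/* Standalone model of the clamping: clen is the segment length after
 * clipping against the stripe page; memory is touched only when
 * clen > 0, and a short clen means the walk hit the end of the page. */
#include <stdio.h>
#include <string.h>

#define STRIPE_SIZE 4096

static void copy_segment(char *page, long page_offset,
			 const char *seg, int len)
{
	int clen;

	if (page_offset < 0) {		/* segment starts before the page */
		seg -= page_offset;
		len += page_offset;
		page_offset = 0;
	}
	if (len + page_offset > STRIPE_SIZE)
		clen = STRIPE_SIZE - page_offset;
	else
		clen = len;

	if (clen > 0)			/* map/copy only when needed */
		memcpy(page + page_offset, seg, clen);
	if (clen < len)
		printf("hit end of page after %d of %d bytes, stop\n",
		       clen, len);
}

int main(void)
{
	static char page[STRIPE_SIZE];
	static const char seg[512] = "payload";

	copy_segment(page, STRIPE_SIZE - 256, seg, sizeof(seg));
	return 0;
}
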
@@ -810,6 +812,8 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 	spin_unlock_irq(&conf->device_lock);
 	spin_unlock(&sh->lock);
 
+	PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
+
 	if (forwrite) {
 		/* check if page is covered */
 		sector_t sector = sh->dev[dd_idx].sector;
@@ -823,8 +827,6 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-
-	PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
 }
 
 
@@ -1036,7 +1038,7 @@ static void handle_stripe(struct stripe_head *sh)
 				    ) &&
 			    !test_bit(R5_UPTODATE, &dev->flags)) {
 				if (conf->disks[i].operational 
-/*				    && !(conf->resync_parity && i == sh->pd_idx) */
+/*				    && !(!mddev->in_sync && i == sh->pd_idx) */
 					)
 					rmw++;
 				else rmw += 2*disks;  /* cannot read it */
@@ -1226,14 +1228,15 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 }
 static void raid5_unplug_device(void *data)
 {
-	raid5_conf_t *conf = (raid5_conf_t *)data;
+	request_queue_t *q = data;
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 
-	raid5_activate_delayed(conf);
-	
-	conf->plugged = 0;
+	if (blk_remove_plug(q))
+		raid5_activate_delayed(conf);
 	md_wakeup_thread(conf->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1242,31 +1245,21 @@ static void raid5_unplug_device(void *data)
 static inline void raid5_plug_device(raid5_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
-	if (list_empty(&conf->delayed_list))
-		if (!conf->plugged) {
-			conf->plugged = 1;
-			queue_task(&conf->plug_tq, &tq_disk);
-		}
+	blk_plug_device(&conf->mddev->queue);
 	spin_unlock_irq(&conf->device_lock);
 }
 
-static int make_request (mddev_t *mddev, int rw, struct bio * bi)
+static int make_request (request_queue_t *q, struct bio * bi)
 {
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
 	const unsigned int raid_disks = conf->raid_disks;
 	const unsigned int data_disks = raid_disks - 1;
 	unsigned int dd_idx, pd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
-	int read_ahead = 0;
-
 	struct stripe_head *sh;
 
-	if (rw == READA) {
-		rw = READ;
-		read_ahead=1;
-	}
-
 	logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 
@@ -1281,10 +1274,10 @@ static int make_request (mddev_t *mddev, int rw, struct bio * bi)
 		PRINTK("raid5: make_request, sector %ul logical %ul\n", 
 		       new_sector, logical_sector);
 
-		sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead);
+		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
 
-			add_stripe_bio(sh, bi, dd_idx, rw);
+			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
 
 			raid5_plug_device(conf);
 			handle_stripe(sh);
@@ -1311,6 +1304,10 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 
+	if (sector_nr >= mddev->sb->size << 1)
+		/* just being told to finish up ... nothing to do */
+		return 0;
+
 	first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
 		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
 	sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
@@ -1343,17 +1340,15 @@ static void raid5d (void *data)
 
 	handled = 0;
 
-	if (mddev->sb_dirty) {
-		mddev->sb_dirty = 0;
+	if (mddev->sb_dirty)
 		md_update_sb(mddev);
-	}
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct list_head *first;
 
 		if (list_empty(&conf->handle_list) &&
 		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-		    !conf->plugged &&
+		    !blk_queue_plugged(&mddev->queue) &&
 		    !list_empty(&conf->delayed_list))
 			raid5_activate_delayed(conf);
 
@@ -1382,31 +1377,6 @@ static void raid5d (void *data)
 	PRINTK("--- raid5d inactive\n");
 }
 
-/*
- * Private kernel thread for parity reconstruction after an unclean
- * shutdown. Reconstruction on spare drives in case of a failed drive
- * is done by the generic mdsyncd.
- */
-static void raid5syncd (void *data)
-{
-	raid5_conf_t *conf = data;
-	mddev_t *mddev = conf->mddev;
-
-	if (!conf->resync_parity)
-		return;
-	if (conf->resync_parity == 2)
-		return;
-	down(&mddev->recovery_sem);
-	if (md_do_sync(mddev,NULL)) {
-		up(&mddev->recovery_sem);
-		printk("raid5: resync aborted!\n");
-		return;
-	}
-	conf->resync_parity = 0;
-	up(&mddev->recovery_sem);
-	printk("raid5: resync finished.\n");
-}
-
 static int run (mddev_t *mddev)
 {
 	raid5_conf_t *conf;
@@ -1416,7 +1386,6 @@ static int run (mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	struct disk_info *disk;
 	struct list_head *tmp;
-	int start_recovery = 0;
 
 	MOD_INC_USE_COUNT;
 
@@ -1444,10 +1413,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 
-	conf->plugged = 0;
-	conf->plug_tq.sync = 0;
-	conf->plug_tq.routine = &raid5_unplug_device;
-	conf->plug_tq.data = conf;
+	mddev->queue.unplug_fn = raid5_unplug_device;
 
 	PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
 
@@ -1571,9 +1537,10 @@ static int run (mddev_t *mddev)
 		goto abort;
 	}
 
-	if (conf->working_disks != sb->raid_disks) {
-		printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
-		start_recovery = 1;
+	if (conf->failed_disks == 1 &&
+	    !(sb->state & (1<<MD_SB_CLEAN))) {
+		printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
+		goto abort;
 	}
 
 	{
@@ -1587,10 +1554,11 @@ static int run (mddev_t *mddev)
 	}
 
 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-		 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	if (grow_stripes(conf, conf->max_nr_stripes)) {
 		printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
 		shrink_stripes(conf);
+		md_unregister_thread(conf->thread);
 		goto abort;
 	} else
 		printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
@@ -1615,23 +1583,6 @@ static int run (mddev_t *mddev)
 	else
 		printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
 
-	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
-		const char * name = "raid5syncd";
-
-		conf->resync_thread = md_register_thread(raid5syncd, conf,name);
-		if (!conf->resync_thread) {
-			printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
-			goto abort;
-		}
-
-		printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
-		conf->resync_parity = 1;
-		md_wakeup_thread(conf->resync_thread);
-	}
-
-	print_raid5_conf(conf);
-	if (start_recovery)
-		md_recover_arrays();
 	print_raid5_conf(conf);
 
 	/* Ok, everything is just fine now */
@@ -1650,48 +1601,12 @@ static int run (mddev_t *mddev)
 	return -EIO;
 }
 
-static int stop_resync (mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-	mdk_thread_t *thread = conf->resync_thread;
-
-	if (thread) {
-		if (conf->resync_parity) {
-			conf->resync_parity = 2;
-			md_interrupt_thread(thread);
-			printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
-			return 1;
-		}
-		return 0;
-	}
-	return 0;
-}
-
-static int restart_resync (mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-
-	if (conf->resync_parity) {
-		if (!conf->resync_thread) {
-			MD_BUG();
-			return 0;
-		}
-		printk("raid5: waking up raid5resync.\n");
-		conf->resync_parity = 1;
-		md_wakeup_thread(conf->resync_thread);
-		return 1;
-	} else
-		printk("raid5: no restart-resync needed.\n");
-	return 0;
-}
 
 
 static int stop (mddev_t *mddev)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 
-	if (conf->resync_thread)
-		md_unregister_thread(conf->resync_thread);
 	md_unregister_thread(conf->thread);
 	shrink_stripes(conf);
 	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
@@ -2066,8 +1981,6 @@ static mdk_personality_t raid5_personality=
 	status:		status,
 	error_handler:	error,
 	diskop:		diskop,
-	stop_resync:	stop_resync,
-	restart_resync:	restart_resync,
 	sync_request:	sync_request
 };
 
diff --git a/drivers/usb/class/usb-midi.c b/drivers/usb/class/usb-midi.c
index 4c2b52180638ffbbcc19af4555ffbec4758cc3fc..8aae77591839098d18f85f5ebe404a7b991e46fc 100644
--- a/drivers/usb/class/usb-midi.c
+++ b/drivers/usb/class/usb-midi.c
@@ -106,9 +106,7 @@ MODULE_PARM_DESC(ulangid, "The optional preferred USB Language ID for all device
 
 MODULE_AUTHOR("NAGANO Daisuke <breeze.nagano@nifty.ne.jp>");
 MODULE_DESCRIPTION("USB-MIDI driver");
-#if LINUX_VERSION_CODE  >= KERNEL_VERSION(2,4,14)
 MODULE_LICENSE("GPL");
-#endif
 
 /* ------------------------------------------------------------------------- */
 
diff --git a/drivers/usb/host/ohci-dbg.c b/drivers/usb/host/ohci-dbg.c
index c3e79562190785218280a86f7f5cfe09b894e745..b6aba6fb9c4940fb177a3b347330746b11be7db6 100644
--- a/drivers/usb/host/ohci-dbg.c
+++ b/drivers/usb/host/ohci-dbg.c
@@ -74,9 +74,9 @@ static void urb_print (struct urb * urb, char * str, int small)
 static inline struct ed *
 dma_to_ed (struct ohci_hcd *hc, dma_addr_t ed_dma);
 
-#ifdef OHCI_VERBOSE_DEBUG
 /* print non-empty branches of the periodic ed tree */
-void ohci_dump_periodic (struct ohci_hcd *ohci, char *label)
+static void __attribute__ ((unused))
+ohci_dump_periodic (struct ohci_hcd *ohci, char *label)
 {
 	int i, j;
 	u32 *ed_p;
@@ -101,7 +101,6 @@ void ohci_dump_periodic (struct ohci_hcd *ohci, char *label)
 		printk (KERN_DEBUG "%s, ohci %s, empty periodic schedule\n",
 				label, ohci->hcd.self.bus_name);
 }
-#endif
 
 static void ohci_dump_intr_mask (char *label, __u32 mask)
 {
@@ -241,6 +240,97 @@ static void ohci_dump (struct ohci_hcd *controller, int verbose)
 	ohci_dump_roothub (controller, 1);
 }
 
+static void ohci_dump_td (char *label, struct td *td)
+{
+	u32	tmp = le32_to_cpup (&td->hwINFO);
+
+	dbg ("%s td %p; urb %p index %d; hw next td %08x",
+		label, td,
+		td->urb, td->index,
+		le32_to_cpup (&td->hwNextTD));
+	if ((tmp & TD_ISO) == 0) {
+		char	*toggle, *pid;
+		u32	cbp, be;
+
+		switch (tmp & TD_T) {
+		case TD_T_DATA0: toggle = "DATA0"; break;
+		case TD_T_DATA1: toggle = "DATA1"; break;
+		case TD_T_TOGGLE: toggle = "(CARRY)"; break;
+		default: toggle = "(?)"; break;
+		}
+		switch (tmp & TD_DP) {
+		case TD_DP_SETUP: pid = "SETUP"; break;
+		case TD_DP_IN: pid = "IN"; break;
+		case TD_DP_OUT: pid = "OUT"; break;
+		default: pid = "(bad pid)"; break;
+		}
+		dbg ("     info %08x CC=%x %s DI=%d %s %s", tmp,
+			TD_CC_GET(tmp), /* EC, */ toggle,
+			(tmp & TD_DI) >> 21, pid,
+			(tmp & TD_R) ? "R" : "");
+		cbp = le32_to_cpup (&td->hwCBP);
+		be = le32_to_cpup (&td->hwBE);
+		dbg ("     cbp %08x be %08x (len %d)", cbp, be,
+			cbp ? (be + 1 - cbp) : 0);
+	} else {
+		unsigned	i;
+		dbg ("     info %08x CC=%x DI=%d START=%04x", tmp,
+			TD_CC_GET(tmp), /* FC, */
+			(tmp & TD_DI) >> 21,
+			tmp & 0x0000ffff);
+		dbg ("     bp0 %08x be %08x",
+			le32_to_cpup (&td->hwCBP) & ~0x0fff,
+			le32_to_cpup (&td->hwBE));
+		for (i = 0; i < MAXPSW; i++) {
+			dbg ("       psw [%d] = %2x", i,
+				le16_to_cpu (td->hwPSW [i]));
+		}
+	}
+}
+
+/* caller MUST own hcd spinlock if verbose is set! */
+static void __attribute__((unused))
+ohci_dump_ed (struct ohci_hcd *ohci, char *label, struct ed *ed, int verbose)
+{
+	u32	tmp = ed->hwINFO;
+	char	*type = "";
+
+	dbg ("%s: %s, ed %p state 0x%x type %d; next ed %08x",
+		ohci->hcd.self.bus_name, label,
+		ed, ed->state, ed->type,
+		le32_to_cpup (&ed->hwNextED));
+	switch (tmp & (ED_IN|ED_OUT)) {
+	case ED_OUT: type = "-OUT"; break;
+	case ED_IN: type = "-IN"; break;
+	/* else from TDs ... control */
+	}
+	dbg ("  info %08x MAX=%d%s%s%s EP=%d%s DEV=%d", le32_to_cpu (tmp),
+		0x0fff & (le32_to_cpu (tmp) >> 16),
+		(tmp & ED_ISO) ? " ISO" : "",
+		(tmp & ED_SKIP) ? " SKIP" : "",
+		(tmp & ED_LOWSPEED) ? " LOW" : "",
+		0x000f & (le32_to_cpu (tmp) >> 7),
+		type,
+		0x007f & le32_to_cpu (tmp));
+	dbg ("  tds: head %08x%s%s tail %08x%s",
+		tmp = le32_to_cpup (&ed->hwHeadP),
+		(ed->hwHeadP & ED_H) ? " HALT" : "",
+		(ed->hwHeadP & ED_C) ? " CARRY" : "",
+		le32_to_cpup (&ed->hwTailP),
+		verbose ? "" : " (not listing)");
+	if (verbose) {
+		struct list_head	*tmp;
+
+		/* use ed->td_list because HC concurrently modifies
+		 * hwNextTD as it accumulates ed_donelist.
+		 */
+		list_for_each (tmp, &ed->td_list) {
+			struct td		*td;
+			td = list_entry (tmp, struct td, td_list);
+			ohci_dump_td ("  ->", td);
+		}
+	}
+}
 
 #endif
 
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 185d1d3cd6d526459362256312f67afeff3f1a13..bfd4bc4315d65b83dd5db041390c5befe65aecef 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -100,7 +100,7 @@
  *	- lots more testing!!
  */
 
-#define DRIVER_VERSION "2002-Jun-10"
+#define DRIVER_VERSION "2002-Jun-15"
 #define DRIVER_AUTHOR "Roman Weissgaerber <weissg@vienna.at>, David Brownell"
 #define DRIVER_DESC "USB 1.1 'Open' Host Controller (OHCI) Driver"
 
@@ -145,8 +145,8 @@ static int ohci_urb_enqueue (
 	urb_print (urb, "SUB", usb_pipein (pipe));
 #endif
 	
-	/* every endpoint has a ed, locate and fill it */
-	if (! (ed = ep_add_ed (urb->dev, pipe, urb->interval, 1, mem_flags)))
+	/* every endpoint has a ed, locate and maybe (re)initialize it */
+	if (! (ed = ed_get (ohci, urb->dev, pipe, urb->interval)))
 		return -ENOMEM;
 
 	/* for the private part of the URB we need the number of TDs (size) */
@@ -498,6 +498,7 @@ static void ohci_irq (struct usb_hcd *hcd)
 	struct ohci_regs	*regs = ohci->regs;
  	int			ints; 
 
+	/* we can eliminate a (slow) readl() if _only_ WDH caused this irq */
 	if ((ohci->hcca->done_head != 0)
 			&& ! (le32_to_cpup (&ohci->hcca->done_head) & 0x01)) {
 		ints =  OHCI_INTR_WDH;
diff --git a/drivers/usb/host/ohci-mem.c b/drivers/usb/host/ohci-mem.c
index f2b2df91bc008a2c29d172cc0e232069025db366..c2b0b2ac8be954cd691eb333c86f08b472924d4b 100644
--- a/drivers/usb/host/ohci-mem.c
+++ b/drivers/usb/host/ohci-mem.c
@@ -221,6 +221,7 @@ ed_alloc (struct ohci_hcd *hc, int mem_flags)
 	ed = pci_pool_alloc (hc->ed_cache, mem_flags, &dma);
 	if (ed) {
 		memset (ed, 0, sizeof (*ed));
+		INIT_LIST_HEAD (&ed->td_list);
 		ed->dma = dma;
 		/* hash it for later reverse mapping */
 		if (!hash_add_ed (hc, ed, mem_flags)) {
diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c
index 837210e9a0f408a6bdd73675f03056c63141ce26..638c1d3828e2d80035e2cc3bb0199cbf752aa493 100644
--- a/drivers/usb/host/ohci-q.c
+++ b/drivers/usb/host/ohci-q.c
@@ -131,8 +131,9 @@ static void intr_resub (struct ohci_hcd *hc, struct urb *urb)
 
 /* search for the right branch to insert an interrupt ed into the int tree 
  * do some load balancing;
- * returns the branch and 
- * sets the interval to interval = 2^integer (ld (interval))
+ * returns the branch
+ * FIXME: allow for failure when there's no bandwidth left,
+ * and consider iso loads too
  */
 static int ep_int_balance (struct ohci_hcd *ohci, int interval, int load)
 {
@@ -152,19 +153,6 @@ static int ep_int_balance (struct ohci_hcd *ohci, int interval, int load)
 
 /*-------------------------------------------------------------------------*/
 
-/*  2^int ( ld (inter)) */
-
-static int ep_2_n_interval (int inter)
-{	
-	int	i;
-
-	for (i = 0; ((inter >> i) > 1 ) && (i < 5); i++)
-		continue;
-	return 1 << i;
-}
-
-/*-------------------------------------------------------------------------*/
-
 /* the int tree is a binary tree 
  * in order to process it sequentially the indexes of the branches have
  * to be mapped the mapping reverses the bits of a word of num_bits length
@@ -230,8 +218,7 @@ static int ep_link (struct ohci_hcd *ohci, struct ed *edi)
 
 	case PIPE_INTERRUPT:
 		load = ed->intriso.intr_info.int_load;
-		interval = ep_2_n_interval (ed->intriso.intr_info.int_period);
-		ed->interval = interval;
+		interval = ed->interval;
 		int_branch = ep_int_balance (ohci, interval, load);
 		ed->intriso.intr_info.int_branch = int_branch;
 
@@ -301,6 +288,7 @@ static void periodic_unlink (
  * just the link to the ed is unlinked.
  * the link from the ed still points to another operational ed or 0
  * so the HC can eventually finish the processing of the unlinked ed
+ * caller guarantees the ED has no active TDs.
  */
 static int start_ed_unlink (struct ohci_hcd *ohci, struct ed *ed) 
 {
@@ -387,84 +375,99 @@ static int start_ed_unlink (struct ohci_hcd *ohci, struct ed *ed)
 
 /*-------------------------------------------------------------------------*/
 
-/* (re)init an endpoint; this _should_ be done once at the
- * usb_set_configuration command, but the USB stack is a bit stateless
- * so we do it at every transaction.
- * if the state of the ed is ED_NEW then a dummy td is added and the
- * state is changed to ED_UNLINK
- * in all other cases the state is left unchanged
- * the ed info fields are set even though most of them should
- * not change
+/* get and maybe (re)init an endpoint. init _should_ be done only as part
+ * of usb_set_configuration() or usb_set_interface() ... but the USB stack
+ * isn't very stateful, so we re-init whenever the HC isn't looking.
  */
-static struct ed *ep_add_ed (
+static struct ed *ed_get (
+	struct ohci_hcd		*ohci,
 	struct usb_device	*udev,
 	unsigned int		pipe,
-	int			interval,
-	int			load,
-	int			mem_flags
+	int			interval
 ) {
-   	struct ohci_hcd		*ohci = hcd_to_ohci (udev->bus->hcpriv);
+	int			is_out = !usb_pipein (pipe);
+	int			type = usb_pipetype (pipe);
+	int			bus_msecs = 0;
 	struct hcd_dev		*dev = (struct hcd_dev *) udev->hcpriv;
-	struct td		*td;
 	struct ed		*ed; 
 	unsigned		ep;
 	unsigned long		flags;
 
-	spin_lock_irqsave (&ohci->lock, flags);
-
 	ep = usb_pipeendpoint (pipe) << 1;
-	if (!usb_pipecontrol (pipe) && usb_pipeout (pipe))
+	if (type != PIPE_CONTROL && is_out)
 		ep |= 1;
+	if (type == PIPE_INTERRUPT)
+		bus_msecs = usb_calc_bus_time (udev->speed, !is_out, 0,
+			usb_maxpacket (udev, pipe, is_out)) / 1000;
+
+	spin_lock_irqsave (&ohci->lock, flags);
+
 	if (!(ed = dev->ep [ep])) {
 		ed = ed_alloc (ohci, SLAB_ATOMIC);
 		if (!ed) {
 			/* out of memory */
-			spin_unlock_irqrestore (&ohci->lock, flags);
-			return NULL;
+			goto done;
 		}
 		dev->ep [ep] = ed;
 	}
 
 	if (ed->state & ED_URB_DEL) {
 		/* pending unlink request */
-		spin_unlock_irqrestore (&ohci->lock, flags);
-		return NULL;
+		ed = 0;
+		goto done;
 	}
 
 	if (ed->state == ED_NEW) {
+		struct td		*td;
+
 		ed->hwINFO = ED_SKIP;
   		/* dummy td; end of td list for ed */
 		td = td_alloc (ohci, SLAB_ATOMIC);
  		if (!td) {
 			/* out of memory */
-			spin_unlock_irqrestore (&ohci->lock, flags);
-			return NULL;
+			ed = 0;
+			goto done;
 		}
 		ed->dummy = td;
 		ed->hwTailP = cpu_to_le32 (td->td_dma);
 		ed->hwHeadP = ed->hwTailP;	/* ED_C, ED_H zeroed */
 		ed->state = ED_UNLINK;
-		ed->type = usb_pipetype (pipe);
+		ed->type = type;
 	}
 
-// FIXME:  don't do this if it's linked to the HC, or without knowing it's
-// safe to clobber state/mode info tied to (previous) config/altsetting.
-// (but dev0/ep0, used by set_address, must get clobbered)
-
-	ed->hwINFO = cpu_to_le32 (usb_pipedevice (pipe)
-			| usb_pipeendpoint (pipe) << 7
-			| (usb_pipeisoc (pipe)? 0x8000: 0)
-			| (usb_pipecontrol (pipe)
-				? 0: (usb_pipeout (pipe)? 0x800: 0x1000)) 
-			| (udev->speed == USB_SPEED_LOW) << 13
-			| usb_maxpacket (udev, pipe, usb_pipeout (pipe))
-				<< 16);
-
-  	if (ed->type == PIPE_INTERRUPT && ed->state == ED_UNLINK) {
-  		ed->intriso.intr_info.int_period = interval;
-  		ed->intriso.intr_info.int_load = load;
-  	}
+	/* FIXME:  Don't do this without knowing it's safe to clobber this
+	 * state/mode info.  Currently the upper layers don't support such
+	 * guarantees; we're lucky changing config/altsetting is rare.
+	 */
+  	if (ed->state == ED_UNLINK) {
+		u32	info;
+
+		info = usb_pipedevice (pipe);
+		info |= (ep >> 1) << 7;
+		info |= usb_maxpacket (udev, pipe, is_out) << 16;
+		info = cpu_to_le32 (info);
+		if (udev->speed == USB_SPEED_LOW)
+			info |= ED_LOWSPEED;
+		/* control transfers store pids in tds */
+		if (type != PIPE_CONTROL) {
+			info |= is_out ? ED_OUT : ED_IN;
+			if (type == PIPE_ISOCHRONOUS)
+				info |= ED_ISO;
+			if (type == PIPE_INTERRUPT) {
+				ed->intriso.intr_info.int_load = bus_msecs;
+				if (interval > 32)
+					interval = 32;
+			}
+		}
+		ed->hwINFO = info;
 
+		/* value ignored except on periodic EDs, where
+		 * we know it's already a power of 2
+		 */
+		ed->interval = interval;
+	}
+
+done:
 	spin_unlock_irqrestore (&ohci->lock, flags);
 	return ed; 
 }
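
ed_get() now assembles the ED info word in one place. A worked example of the field packing, using illustrative values and ignoring the byte-swapping the kernel does with cpu_to_le32 (bit positions per the OHCI ED layout):

/* Worked example of ED hwINFO packing; values are illustrative. */
#include <stdio.h>

#define ED_OUT		(0x01 << 11)
#define ED_IN		(0x02 << 11)
#define ED_LOWSPEED	(1 << 13)
#define ED_ISO		(1 << 15)

static unsigned int build_info(int devnum, int epnum, int maxpacket,
			       int is_out, int is_control, int is_iso,
			       int lowspeed)
{
	unsigned int info = devnum;	/* FA, bits 0..6 */

	info |= epnum << 7;		/* EN, bits 7..10 */
	info |= maxpacket << 16;	/* MPS, bits 16..26 */
	if (lowspeed)
		info |= ED_LOWSPEED;
	if (!is_control) {		/* control transfers keep pids in TDs */
		info |= is_out ? ED_OUT : ED_IN;
		if (is_iso)
			info |= ED_ISO;
	}
	return info;
}

int main(void)
{
	/* bulk IN endpoint 2 of full-speed device 3, 64-byte packets */
	printf("hwINFO = %08x\n", build_info(3, 2, 64, 0, 0, 0, 0));
	return 0;
}
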
@@ -736,8 +739,8 @@ static void td_done (struct urb *urb, struct td *td)
 		urb->iso_frame_desc [td->index].status = cc_to_error [cc];
 
 		if (cc != 0)
-			dbg ("  urb %p iso TD %d len %d CC %d",
-				urb, td->index, dlen, cc);
+			dbg ("  urb %p iso TD %p (%d) len %d CC %d",
+				urb, td, 1 + td->index, dlen, cc);
 
 	/* BULK, INT, CONTROL ... drivers see aggregate length/status,
 	 * except that "setup" bytes aren't counted and "short" transfers
@@ -776,9 +779,13 @@ static void td_done (struct urb *urb, struct td *td)
 					- td->data_dma;
 		}
 
+#ifdef VERBOSE_DEBUG
 		if (cc != 0)
-			dbg ("  urb %p TD %d CC %d, len=%d",
-				urb, td->index, cc, urb->actual_length);
+			dbg ("  urb %p TD %p (%d) CC %d, len=%d/%d",
+				urb, td, 1 + td->index, cc,
+				urb->actual_length,
+				urb->transfer_buffer_length);
+#endif
   	}
 }
 
@@ -812,8 +819,8 @@ static struct td *dl_reverse_done_list (struct ohci_hcd *ohci)
 				if (urb_priv && ((td_list->index + 1)
 						< urb_priv->length)) {
 #ifdef OHCI_VERBOSE_DEBUG
-					dbg ("urb %p TD %d of %d, patch ED",
-						td_list->urb,
+					dbg ("urb %p TD %p (%d/%d), patch ED",
+						td_list->urb, td_list,
 						1 + td_list->index,
 						urb_priv->length);
 #endif
diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h
index a5bbe43fb75fbe10fb3137f57d711b1673ba51fd..d5fc9517f1329106ef5386675027fdd3088c1ab7 100644
--- a/drivers/usb/host/ohci.h
+++ b/drivers/usb/host/ohci.h
@@ -19,7 +19,7 @@ struct ed {
 #define ED_SKIP		__constant_cpu_to_le32(1 << 14)
 #define ED_LOWSPEED	__constant_cpu_to_le32(1 << 13)
 #define ED_OUT		__constant_cpu_to_le32(0x01 << 11)
-#define ED_IN		__constant_cpu_to_le32(0x10 << 11)
+#define ED_IN		__constant_cpu_to_le32(0x02 << 11)
 	__u32			hwTailP;	/* tail of TD list */
 	__u32			hwHeadP;	/* head of TD list */
 #define ED_C		__constant_cpu_to_le32(0x02)	/* toggle carry */
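
The ED_IN change is a real bit-position fix: the ED direction field occupies bits 11..12 (01 = OUT, 10 = IN), and the old value 0x10 << 11 landed on bit 15, the ISO format bit, instead. A quick check:

/* Demonstration that the old ED_IN value never touched the direction
 * field at all (OHCI ED dword 0: D is bits 11..12, F is bit 15). */
#include <stdio.h>

int main(void)
{
	unsigned int d_mask    = 0x3 << 11;	/* direction field */
	unsigned int ed_in_new = 0x02 << 11;	/* D = 10: IN */
	unsigned int ed_in_old = 0x10 << 11;	/* 0x8000 = bit 15 (F) */

	printf("new IN & D = %08x (IN)\n", ed_in_new & d_mask);
	printf("old IN & D = %08x (no direction set!)\n",
	       ed_in_old & d_mask);
	return 0;
}
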
@@ -30,24 +30,24 @@ struct ed {
 	dma_addr_t		dma;		/* addr of ED */
 	struct ed		*ed_prev;	/* for non-interrupt EDs */
 	struct td		*dummy;
+	struct list_head	td_list;	/* "shadow list" of our TDs */
+
+	u8			state;		/* ED_{NEW,UNLINK,OPER} */
+#define ED_NEW 		0x00		/* unused, no dummy td */
+#define ED_UNLINK 	0x01		/* dummy td, maybe linked to hc */
+#define ED_OPER		0x02		/* dummy td, _is_ linked to hc */
+#define ED_URB_DEL  	0x08		/* for unlinking; masked in */
 
 	u8			type; 		/* PIPE_{BULK,...} */
-	u8			interval;	/* interrupt, isochronous */
+	u16			interval;	/* interrupt, isochronous */
 	union {
 		struct intr_info {		/* interrupt */
-			u8	int_period;
 			u8	int_branch;
 			u8	int_load; 
 		} intr_info;
 		u16		last_iso;	/* isochronous */
 	} intriso;
 
-	u8			state;		/* ED_{NEW,UNLINK,OPER} */
-#define ED_NEW 		0x00		/* unused, no dummy td */
-#define ED_UNLINK 	0x01		/* dummy td, maybe linked to hc */
-#define ED_OPER		0x02		/* dummy td, _is_ linked to hc */
-#define ED_URB_DEL  	0x08		/* for unlinking; masked in */
-
 	/* HC may see EDs on rm_list until next frame (frame_no == tick) */
 	u16			tick;
 	struct ed		*ed_rm_list;
@@ -108,6 +108,8 @@ struct td {
 
 	dma_addr_t	td_dma;		/* addr of this TD */
 	dma_addr_t	data_dma;	/* addr of data it points to */
+
+	struct list_head td_list;	/* "shadow list", TDs on same ED */
 } __attribute__ ((aligned(32)));	/* c/b/i need 16; only iso needs 32 */
 
 #define TD_MASK	((u32)~0x1f)		/* strip hw status in low addr bits */
diff --git a/drivers/usb/net/kaweth.c b/drivers/usb/net/kaweth.c
index 66102b6f85e759533a95de42362cbe99bb576385..096a8b1aded263e80f0046fef33dbed005f8e877 100644
--- a/drivers/usb/net/kaweth.c
+++ b/drivers/usb/net/kaweth.c
@@ -220,10 +220,11 @@ struct kaweth_device
 	struct urb *rx_urb;
 	struct urb *tx_urb;
 	struct urb *irq_urb;
+	
+	struct sk_buff *tx_skb;
 
 	__u8 *firmware_buf;
 	__u8 scratch[KAWETH_SCRATCH_SIZE];
-	__u8 tx_buf[KAWETH_BUF_SIZE];
 	__u8 rx_buf[KAWETH_BUF_SIZE];
 	__u8 intbuffer[INTBUFFERSIZE];
 	__u16 packet_filter_bitmap;
@@ -650,11 +651,13 @@ static int kaweth_ioctl(struct net_device *net, struct ifreq *rq, int cmd)
 static void kaweth_usb_transmit_complete(struct urb *urb)
 {
 	struct kaweth_device *kaweth = urb->context;
+	struct sk_buff *skb = kaweth->tx_skb;
 
 	if (unlikely(urb->status != 0))
 		kaweth_dbg("%s: TX status %d.", kaweth->net->name, urb->status);
 
 	netif_wake_queue(kaweth->net);
+	dev_kfree_skb(skb);
 }
 
 /****************************************************************
@@ -663,7 +666,7 @@ static void kaweth_usb_transmit_complete(struct urb *urb)
 static int kaweth_start_xmit(struct sk_buff *skb, struct net_device *net)
 {
 	struct kaweth_device *kaweth = net->priv;
-	int count = skb->len;
+	__u16 *private_header;
 
 	int res;
 
@@ -679,15 +682,30 @@ static int kaweth_start_xmit(struct sk_buff *skb, struct net_device *net)
 	kaweth_async_set_rx_mode(kaweth);
 	netif_stop_queue(net);
 
-	*((__u16 *)kaweth->tx_buf) = cpu_to_le16(skb->len);
+	/* We now decide whether we can put our special header into the sk_buff */
+	if (skb_cloned(skb) || skb_headroom(skb) < 2) {
+		/* no such luck - we make our own */
+		struct sk_buff *copied_skb;
+		copied_skb = skb_copy_expand(skb, 2, 0, GFP_ATOMIC);
+		dev_kfree_skb_any(skb);
+		skb = copied_skb;
+		if (!copied_skb) {
+			kaweth->stats.tx_errors++;
+			netif_start_queue(net);
+			spin_unlock(&kaweth->device_lock);
+			return 0;
+		}
+	}
 
-	memcpy(kaweth->tx_buf + 2, skb->data, skb->len);
+	private_header = (__u16 *)__skb_push(skb, 2);
+	*private_header = cpu_to_le16(skb->len - 2);	/* payload length */
+	kaweth->tx_skb = skb;
 
 	FILL_BULK_URB(kaweth->tx_urb,
 		      kaweth->dev,
 		      usb_sndbulkpipe(kaweth->dev, 2),
-		      kaweth->tx_buf,
-		      count + 2,
+		      private_header,
+		      skb->len,
 		      kaweth_usb_transmit_complete,
 		      kaweth);
 	kaweth->end = 0;
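
kaweth now prepends its 2-byte length header inside the skb instead of copying into a private buffer, taking a copy only when the skb is cloned or lacks headroom. A userspace model of that decision (the fields are a simplification of struct sk_buff; skb_copy_expand/__skb_push are the real calls):

/* Model of the headroom decision in kaweth_start_xmit(). */
#include <stdio.h>

struct skb_model {
	int cloned;	/* data shared with another skb? */
	int headroom;	/* free bytes before the payload */
	int len;	/* payload length */
};

static void queue_for_tx(struct skb_model *skb)
{
	if (skb->cloned || skb->headroom < 2) {
		/* stand-in for skb_copy_expand(skb, 2, 0, GFP_ATOMIC) */
		printf("private copy taken, 2 bytes of headroom reserved\n");
		skb->cloned = 0;
		skb->headroom = 2;
	}
	/* stand-in for __skb_push(skb, 2): header becomes part of data */
	skb->headroom -= 2;
	skb->len += 2;
	printf("URB buffer: %d bytes (2-byte length header + payload)\n",
	       skb->len);
}

int main(void)
{
	struct skb_model skb = { .cloned = 1, .headroom = 0, .len = 60 };

	queue_for_tx(&skb);
	return 0;
}
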
@@ -699,6 +717,7 @@ static int kaweth_start_xmit(struct sk_buff *skb, struct net_device *net)
 		kaweth->stats.tx_errors++;
 
 		netif_start_queue(net);
+		dev_kfree_skb(skb);
 	}
 	else
 	{
@@ -707,8 +726,6 @@ static int kaweth_start_xmit(struct sk_buff *skb, struct net_device *net)
 		net->trans_start = jiffies;
 	}
 
-	dev_kfree_skb(skb);
-
 	spin_unlock(&kaweth->device_lock);
 
 	return 0;
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 17e861fcae9f2f5938212b6f292cb4f256cc0f76..4e9f376c38f59e979ad418ef95f7d4d3a0a6d500 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -51,12 +51,6 @@
 
 #include <linux/slab.h>
 
-/*
- * kernel thread actions
- */
-
-#define US_ACT_COMMAND		1
-#define US_ACT_EXIT		5
 
 /***********************************************************************
  * Host functions 
@@ -204,7 +198,7 @@ static int device_reset( Scsi_Cmnd *srb )
 	US_DEBUGP("device_reset() called\n" );
 
 	/* if the device was removed, then we're already reset */
-	if (atomic_read(&us->sm_state) == US_STATE_DETACHED)
+	if (!test_bit(DEV_ATTACHED, &us->bitflags))
 		return SUCCESS;
 
 	scsi_unlock(srb->host);
@@ -235,7 +229,7 @@ static int bus_reset( Scsi_Cmnd *srb )
 	US_DEBUGP("bus_reset() called\n");
 
 	/* if the device has been removed, this worked */
-	if (atomic_read(&us->sm_state) == US_STATE_DETACHED) {
+	if (!test_bit(DEV_ATTACHED, &us->bitflags)) {
 		US_DEBUGP("-- device removed already\n");
 		return SUCCESS;
 	}
@@ -337,8 +331,8 @@ static int proc_info (char *buffer, char **start, off_t offset, int length,
 
 	/* show the GUID of the device */
 	SPRINTF("         GUID: " GUID_FORMAT "\n", GUID_ARGS(us->guid));
-	SPRINTF("     Attached: %s\n", (atomic_read(&us->sm_state) ==
-			US_STATE_DETACHED) ? "Yes" : "No");
+	SPRINTF("     Attached: %s\n", (test_bit(DEV_ATTACHED, &us->bitflags)
+			? "Yes" : "No"));
 
 	/*
 	 * Calculate start of next buffer, and return value.
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index dfdc30db5489e39623b242e395571ed758bcc9ba..76d70eb5a9bf3ad574b76de7918c59ae94c19d09 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -99,13 +99,6 @@ MODULE_LICENSE("GPL");
 
 static int my_host_number;
 
-/*
- * kernel thread actions
- */
-
-#define US_ACT_COMMAND		1
-#define US_ACT_EXIT		5
-
 /* The list of structures and the protective lock for them */
 struct us_data *us_list;
 struct semaphore us_list_semaphore;
@@ -426,7 +419,7 @@ static int usb_stor_control_thread(void * __us)
 		down(&(us->dev_semaphore));
 
 		/* our device has gone - pretend not ready */
-		if (atomic_read(&us->device_state) == US_STATE_DETACHED) {
+		if (!test_bit(DEV_ATTACHED, &us->bitflags)) {
 			US_DEBUGP("Request is for removed device\n");
 			/* For REQUEST_SENSE, it's the data.  But
 			 * for anything else, it should look like
@@ -450,7 +443,7 @@ static int usb_stor_control_thread(void * __us)
 				       sizeof(usb_stor_sense_notready));
 				us->srb->result = CHECK_CONDITION << 1;
 			}
-		} else { /* atomic_read(&us->device_state) == STATE_DETACHED */
+		} else { /* test_bit(DEV_ATTACHED, &us->bitflags) */
 
 			/* Handle those devices which need us to fake 
 			 * their inquiry data */
@@ -557,9 +550,8 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 	unsigned int flags;
 	struct us_unusual_dev *unusual_dev;
 	struct us_data *ss = NULL;
-#ifdef CONFIG_USB_STORAGE_SDDR09
 	int result;
-#endif
+	int new_device = 0;
 
 	/* these are temporary copies -- we test on these, then put them
 	 * in the us-data structure 
@@ -570,13 +562,13 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 	u8 subclass = 0;
 	u8 protocol = 0;
 
-	/* the altsettting on the interface we're probing that matched our
+	/* the altsetting on the interface we're probing that matched our
 	 * usb_match_id table
 	 */
 	struct usb_interface *intf = dev->actconfig->interface;
 	struct usb_interface_descriptor *altsetting =
 		intf[ifnum].altsetting + intf[ifnum].act_altsetting;
-	US_DEBUGP("act_altsettting is %d\n", intf[ifnum].act_altsetting);
+	US_DEBUGP("act_altsetting is %d\n", intf[ifnum].act_altsetting);
 
 	/* clear the temporary strings */
 	memset(mf, 0, sizeof(mf));
@@ -663,7 +655,7 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 		return NULL;
 	}
 
-	/* At this point, we're committed to using the device */
+	/* At this point, we've decided to try to use the device */
 	usb_get_dev(dev);
 
 	/* clear the GUID and fetch the strings */
@@ -696,7 +688,8 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 	 */
 	ss = us_list;
 	while ((ss != NULL) && 
-	       ((ss->pusb_dev) || !GUID_EQUAL(guid, ss->guid)))
+	           (test_bit(DEV_ATTACHED, &ss->bitflags) ||
+		    !GUID_EQUAL(guid, ss->guid)))
 		ss = ss->next;
 
 	if (ss != NULL) {
@@ -710,29 +703,23 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 		/* establish the connection to the new device upon reconnect */
 		ss->ifnum = ifnum;
 		ss->pusb_dev = dev;
-		atomic_set(&ss->device_state, US_STATE_ATTACHED);
+		set_bit(DEV_ATTACHED, &ss->bitflags);
 
 		/* copy over the endpoint data */
-		if (ep_in)
-			ss->ep_in = ep_in->bEndpointAddress & 
-				USB_ENDPOINT_NUMBER_MASK;
-		if (ep_out)
-			ss->ep_out = ep_out->bEndpointAddress & 
-				USB_ENDPOINT_NUMBER_MASK;
+		ss->ep_in = ep_in->bEndpointAddress & 
+			USB_ENDPOINT_NUMBER_MASK;
+		ss->ep_out = ep_out->bEndpointAddress & 
+			USB_ENDPOINT_NUMBER_MASK;
 		ss->ep_int = ep_int;
 
 		/* allocate an IRQ callback if one is needed */
-		if ((ss->protocol == US_PR_CBI) && usb_stor_allocate_irq(ss)) {
-			usb_put_dev(dev);
-			return NULL;
-		}
+		if ((ss->protocol == US_PR_CBI) && usb_stor_allocate_irq(ss))
+			goto BadDevice;
 
 		/* allocate the URB we're going to use */
 		ss->current_urb = usb_alloc_urb(0, GFP_KERNEL);
-		if (!ss->current_urb) {
-			usb_put_dev(dev);
-			return NULL;
-		}
+		if (!ss->current_urb)
+			goto BadDevice;
 
                 /* Re-Initialize the device if it needs it */
 		if (unusual_dev && unusual_dev->initFunction)
@@ -752,14 +739,12 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 			return NULL;
 		}
 		memset(ss, 0, sizeof(struct us_data));
+		new_device = 1;
 
 		/* allocate the URB we're going to use */
 		ss->current_urb = usb_alloc_urb(0, GFP_KERNEL);
-		if (!ss->current_urb) {
-			kfree(ss);
-			usb_put_dev(dev);
-			return NULL;
-		}
+		if (!ss->current_urb)
+			goto BadDevice;
 
 		/* Initialize the mutexes only when the struct is new */
 		init_completion(&(ss->notify));
@@ -776,12 +761,10 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 		ss->unusual_dev = unusual_dev;
 
 		/* copy over the endpoint data */
-		if (ep_in)
-			ss->ep_in = ep_in->bEndpointAddress & 
-				USB_ENDPOINT_NUMBER_MASK;
-		if (ep_out)
-			ss->ep_out = ep_out->bEndpointAddress & 
-				USB_ENDPOINT_NUMBER_MASK;
+		ss->ep_in = ep_in->bEndpointAddress & 
+			USB_ENDPOINT_NUMBER_MASK;
+		ss->ep_out = ep_out->bEndpointAddress & 
+			USB_ENDPOINT_NUMBER_MASK;
 		ss->ep_int = ep_int;
 
 		/* establish the connection to the new device */
@@ -904,12 +887,8 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 #endif
 
 		default:
-			ss->transport_name = "Unknown";
-			kfree(ss->current_urb);
-			kfree(ss);
-			usb_put_dev(dev);
-			return NULL;
-			break;
+			/* ss->transport_name = "Unknown"; */
+			goto BadDevice;
 		}
 		US_DEBUGP("Transport: %s\n", ss->transport_name);
 
@@ -959,22 +938,14 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 #endif
 
 		default:
-			ss->protocol_name = "Unknown";
-			kfree(ss->current_urb);
-			kfree(ss);
-			usb_put_dev(dev);
-			return NULL;
-			break;
+			/* ss->protocol_name = "Unknown"; */
+			goto BadDevice;
 		}
 		US_DEBUGP("Protocol: %s\n", ss->protocol_name);
 
 		/* allocate an IRQ callback if one is needed */
-		if ((ss->protocol == US_PR_CBI) && usb_stor_allocate_irq(ss)) {
-			kfree(ss->current_urb);
-			kfree(ss);
-			usb_put_dev(dev);
-			return NULL;
-		}
+		if ((ss->protocol == US_PR_CBI) && usb_stor_allocate_irq(ss))
+			goto BadDevice;
 
 		/*
 		 * Since this is a new device, we need to generate a scsi 
@@ -1001,16 +972,13 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 
 		/* start up our control thread */
 		atomic_set(&ss->sm_state, US_STATE_IDLE);
-		atomic_set(&ss->device_state, US_STATE_ATTACHED);
+		set_bit(DEV_ATTACHED, &ss->bitflags);
 		ss->pid = kernel_thread(usb_stor_control_thread, ss,
 					CLONE_VM);
 		if (ss->pid < 0) {
 			printk(KERN_WARNING USB_STORAGE 
 			       "Unable to start control thread\n");
-			kfree(ss->current_urb);
-			kfree(ss);
-			usb_put_dev(dev);
-			return NULL;
+			goto BadDevice;
 		}
 
 		/* wait for the thread to start */
@@ -1018,7 +986,17 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 
 		/* now register	 - our detect function will be called */
 		ss->htmplt.module = THIS_MODULE;
-		scsi_register_host(&(ss->htmplt));
+		result = scsi_register_host(&(ss->htmplt));
+		if (result) {
+			printk(KERN_WARNING USB_STORAGE
+				"Unable to register the scsi host\n");
+
+			/* tell the control thread to exit */
+			ss->action = US_ACT_EXIT;
+			up(&ss->sema);
+			wait_for_completion(&ss->notify);
+			goto BadDevice;
+		}
 
 		/* lock access to the data structures */
 		down(&us_list_semaphore);
@@ -1038,6 +1016,31 @@ static void * storage_probe(struct usb_device *dev, unsigned int ifnum,
 
 	/* return a pointer for the disconnect function */
 	return ss;
+
+	/* we come here if there are any problems */
+	BadDevice:
+	US_DEBUGP("storage_probe() failed\n");
+	down(&ss->irq_urb_sem);
+	if (ss->irq_urb) {
+		usb_unlink_urb(ss->irq_urb);
+		usb_free_urb(ss->irq_urb);
+		ss->irq_urb = NULL;
+	}
+	up(&ss->irq_urb_sem);
+	if (ss->current_urb) {
+		usb_unlink_urb(ss->current_urb);
+		usb_free_urb(ss->current_urb);
+		ss->current_urb = NULL;
+	}
+
+	clear_bit(DEV_ATTACHED, &ss->bitflags);
+	ss->pusb_dev = NULL;
+	if (new_device)
+		kfree(ss);
+	else
+		up(&ss->dev_semaphore);
+	usb_put_dev(dev);
+	return NULL;
 }
 
 /* Handle a disconnect event from the USB core */
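
All the duplicated failure paths in storage_probe() collapse into one BadDevice label. A generic sketch of that single-exit cleanup style, with hypothetical resources standing in for the URBs and us_data fields:

/* Single-exit cleanup: every failure jumps to one label that undoes
 * whatever was acquired; freeing NULL is a safe no-op. */
#include <stdio.h>
#include <stdlib.h>

struct resource { int dummy; };

static struct resource *acquire(int fail)
{
	return fail ? NULL : malloc(sizeof(struct resource));
}

static int probe(int fail_second)
{
	struct resource *a = NULL, *b = NULL;

	if (!(a = acquire(0)))
		goto bad_device;
	if (!(b = acquire(fail_second)))
		goto bad_device;
	printf("probe ok\n");
	return 0;

bad_device:			/* one place undoes whatever was done */
	free(b);
	free(a);
	printf("probe failed, cleaned up\n");
	return -1;
}

int main(void)
{
	return probe(1) ? 1 : 0;
}
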
@@ -1078,7 +1081,7 @@ static void storage_disconnect(struct usb_device *dev, void *ptr)
 	/* mark the device as gone */
 	usb_put_dev(ss->pusb_dev);
 	ss->pusb_dev = NULL;
-	atomic_set(&ss->sm_state, US_STATE_DETACHED);
+	clear_bit(DEV_ATTACHED, &ss->bitflags);
 
 	/* unlock access to the device data structure */
 	up(&(ss->dev_semaphore));
diff --git a/drivers/usb/storage/usb.h b/drivers/usb/storage/usb.h
index 6c90eb638a4b53ed56f46d64c881b7f0c7437ec7..d0f1f24ded28638b952c958ee59d07969e2d7290 100644
--- a/drivers/usb/storage/usb.h
+++ b/drivers/usb/storage/usb.h
@@ -103,9 +103,10 @@ struct us_unusual_dev {
 #define US_FL_SCM_MULT_TARG   0x00000020 /* supports multiple targets */
 #define US_FL_FIX_INQUIRY     0x00000040 /* INQUIRY response needs fixing */
 
-/* device attached/detached states */
-#define US_STATE_DETACHED	1
-#define US_STATE_ATTACHED	2
+
+/* kernel thread actions */
+#define US_ACT_COMMAND		1
+#define US_ACT_EXIT		5
 
 /* processing state machine states */
 #define US_STATE_IDLE		1
@@ -127,10 +128,9 @@ struct us_data {
 	/* The device we're working with
 	 * It's important to note:
 	 *    (o) you must hold dev_semaphore to change pusb_dev
-	 *    (o) device_state should change whenever pusb_dev does
+	 *    (o) DEV_ATTACHED in bitflags should change whenever pusb_dev does
 	 */
 	struct semaphore	dev_semaphore;	 /* protect pusb_dev */
-	atomic_t		device_state;	 /* attached or detached */
 	struct usb_device	*pusb_dev;	 /* this usb_device */
 
 	unsigned int		flags;		 /* from filter initially */
@@ -174,6 +174,7 @@ struct us_data {
 	struct semaphore	ip_waitq;	 /* for CBI interrupts	 */
 	unsigned long		bitflags;	 /* single-bit flags:	 */
 #define IP_WANTED	1			 /* is an IRQ expected?	 */
+#define DEV_ATTACHED	2			 /* is the dev. attached?*/
 
 	/* interrupt communications data */
 	struct semaphore	irq_urb_sem;	 /* to protect irq_urb	 */
diff --git a/fs/select.c b/fs/select.c
index 6a5909a75677acc1041b1484f3ce3a73c0ea3da1..30c29f1e49f8830973ce0f43ea897fe0e4fd418a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -12,9 +12,6 @@
  *  24 January 2000
  *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
  *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
- * 
- *  Dec 2001
- *     Stack allocation and fast path (Andi Kleen) 
  */
 
 #include <linux/slab.h>
@@ -29,6 +26,21 @@
 #define ROUND_UP(x,y) (((x)+(y)-1)/(y))
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
+struct poll_table_entry {
+	struct file * filp;
+	wait_queue_t wait;
+	wait_queue_head_t * wait_address;
+};
+
+struct poll_table_page {
+	struct poll_table_page * next;
+	struct poll_table_entry * entry;
+	struct poll_table_entry entries[0];
+};
+
+#define POLL_TABLE_FULL(table) \
+	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+
 /*
  * Ok, Peter made a complicated, but straightforward multiple_wait() function.
  * I have rewritten this, taking some shortcuts: This code may not be easy to
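
The restored poll_table_page packs as many entries as fit in one page after the small header; POLL_TABLE_FULL asks whether one more entry would cross the page boundary. A self-contained model:

/* Model of the one-page entry table and its full check. */
#include <stdio.h>

#define PAGE_SIZE 4096

struct entry { void *filp, *wait, *wait_address; };

struct table_page {
	struct table_page *next;
	struct entry *entry;		/* next free slot */
	struct entry entries[];		/* kernel spells this entries[0] */
};

#define TABLE_FULL(t) \
	((unsigned long)((t)->entry + 1) > PAGE_SIZE + (unsigned long)(t))

int main(void)
{
	static union { char raw[PAGE_SIZE]; struct table_page t; } page;
	struct table_page *t = &page.t;
	int n = 0;

	t->entry = t->entries;
	while (!TABLE_FULL(t)) {
		t->entry++;		/* claim one slot */
		n++;
	}
	printf("%d entries fit in one %d-byte page\n", n, PAGE_SIZE);
	return 0;
}
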
@@ -50,39 +62,30 @@ void poll_freewait(poll_table* pt)
 		struct poll_table_page *old;
 
 		entry = p->entry;
-		while (entry > p->entries) {
+		do {
 			entry--;
 			remove_wait_queue(entry->wait_address,&entry->wait);
 			fput(entry->filp);
-		}
+		} while (entry > p->entries);
 		old = p;
 		p = p->next;
-		if (old != &pt->inline_page) 
-			free_page((unsigned long) old);
+		free_page((unsigned long) old);
 	}
 }
 
 void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
 {
 	struct poll_table_page *table = p->table;
-	struct poll_table_page *new_table = NULL;
-	int sz;
-
-	if (!table) { 
-		new_table = &p->inline_page; 
-	} else { 
-		sz = (table == &p->inline_page) ? POLL_INLINE_TABLE_LEN : PAGE_SIZE; 
-		if ((char*)table->entry >= (char*)table + sz) {
-			new_table = (struct poll_table_page *)__get_free_page(GFP_KERNEL);
-			if (!new_table) {
-				p->error = -ENOMEM;
-				__set_current_state(TASK_RUNNING);
-				return;
-			}
-		}
-	} 
 
-	if (new_table) { 
+	if (!table || POLL_TABLE_FULL(table)) {
+		struct poll_table_page *new_table;
+
+		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
+		if (!new_table) {
+			p->error = -ENOMEM;
+			__set_current_state(TASK_RUNNING);
+			return;
+		}
 		new_table->entry = new_table->entries;
 		new_table->next = table;
 		p->table = new_table;
@@ -110,6 +113,48 @@ void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table
 
 #define BITS(fds, n)		(*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n))
 
+static int max_select_fd(unsigned long n, fd_set_bits *fds)
+{
+	unsigned long *open_fds;
+	unsigned long set;
+	int max;
+
+	/* handle last incomplete long-word first */
+	set = ~(~0UL << (n & (__NFDBITS-1)));
+	n /= __NFDBITS;
+	open_fds = current->files->open_fds->fds_bits+n;
+	max = 0;
+	if (set) {
+		set &= BITS(fds, n);
+		if (set) {
+			if (!(set & ~*open_fds))
+				goto get_max;
+			return -EBADF;
+		}
+	}
+	while (n) {
+		open_fds--;
+		n--;
+		set = BITS(fds, n);
+		if (!set)
+			continue;
+		if (set & ~*open_fds)
+			return -EBADF;
+		if (max)
+			continue;
+get_max:
+		do {
+			max++;
+			set >>= 1;
+		} while (set);
+		max += n * __NFDBITS;
+	}
+
+	return max;
+}
+
+#define BIT(i)		(1UL << ((i)&(__NFDBITS-1)))
+#define MEM(i,m)	((m)+(unsigned)(i)/__NFDBITS)
 #define ISSET(i,m)	(((i)&*(m)) != 0)
 #define SET(i,m)	(*(m) |= (i))
 
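
max_select_fd() both validates the requested descriptors against the open-fd mask and computes the scan limit. A simplified userspace model (the kernel's clamp of the partial last word is omitted here):

/* Scan fd words from the top: reject any requested bit that is not in
 * the open mask, and return the highest requested descriptor + 1. */
#include <stdio.h>

#define NFDBITS (8 * (int)sizeof(unsigned long))

static int max_fd(int nwords, const unsigned long *req,
		  const unsigned long *open_mask)
{
	int max = 0, n = nwords;

	while (n--) {
		unsigned long set = req[n];

		if (!set)
			continue;
		if (set & ~open_mask[n])
			return -1;	/* -EBADF in the real code */
		if (!max) {		/* highest word with bits set */
			int bits = 0;

			do { bits++; } while (set >>= 1);
			max = n * NFDBITS + bits;
		}
	}
	return max;
}

int main(void)
{
	unsigned long req[2]       = { 0x9, 0 };	/* fds 0 and 3 */
	unsigned long open_mask[2] = { 0xf, 0 };	/* fds 0..3 open */

	printf("max = %d\n", max_fd(2, req, open_mask));	/* prints 4 */
	return 0;
}
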
@@ -120,71 +165,56 @@ void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table
 int do_select(int n, fd_set_bits *fds, long *timeout)
 {
 	poll_table table, *wait;
-	int retval, off, max, maxoff;
+	int retval, i, off;
 	long __timeout = *timeout;
 
+ 	read_lock(&current->files->file_lock);
+	retval = max_select_fd(n, fds);
+	read_unlock(&current->files->file_lock);
+
+	if (retval < 0)
+		return retval;
+	n = retval;
+
 	poll_initwait(&table);
 	wait = &table;
 	if (!__timeout)
 		wait = NULL;
-	
 	retval = 0;
-	maxoff = n/BITS_PER_LONG; 
-	max = 0; 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		for (off = 0; off <= maxoff; off++) { 
-			unsigned long val = BITS(fds, off); 
+		for (i = 0 ; i < n; i++) {
+			unsigned long bit = BIT(i);
+			unsigned long mask;
+			struct file *file;
 
-			if (!val) 
+			off = i / __NFDBITS;
+			if (!(bit & BITS(fds, off)))
 				continue;
-			while (val) { 
-				int k = ffz(~val); 
-				unsigned long mask, bit;
-				struct file *file;
-
-				if (k > n%BITS_PER_LONG) 
-					break;
-
-				bit = (1UL << k); 
-				val &= ~bit; 
-
-				file = fget((off * BITS_PER_LONG) + k);
-				mask = POLLNVAL;
-				if (file) {
-					mask = DEFAULT_POLLMASK;
-					if (file->f_op && file->f_op->poll)
-						mask = file->f_op->poll(file, wait);
-					fput(file);
-				} else { 
-					/* This error will shadow all other results. 
-					 * This matches previous linux behaviour */
-					retval = -EBADF; 
-					goto out; 
-				} 
-				if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
-					SET(bit, __RES_IN(fds,off));
-					retval++;
-					wait = NULL;
-				}
-				if ((mask& POLLOUT_SET) && ISSET(bit,__OUT(fds,off))) {
-					SET(bit, __RES_OUT(fds,off));
-					retval++;
-					wait = NULL;
-				}
-				if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
-					SET(bit, __RES_EX(fds,off));
-					retval++;
-					wait = NULL;
-				}
-
-				if (!(val &= ~bit))
-					break;
+			file = fget(i);
+			mask = POLLNVAL;
+			if (file) {
+				mask = DEFAULT_POLLMASK;
+				if (file->f_op && file->f_op->poll)
+					mask = file->f_op->poll(file, wait);
+				fput(file);
+			}
+			if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
+				SET(bit, __RES_IN(fds,off));
+				retval++;
+				wait = NULL;
+			}
+			if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
+				SET(bit, __RES_OUT(fds,off));
+				retval++;
+				wait = NULL;
+			}
+			if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
+				SET(bit, __RES_EX(fds,off));
+				retval++;
+				wait = NULL;
 			}
 		}
-
-		
-		maxoff = max; 
 		wait = NULL;
 		if (retval || !__timeout || signal_pending(current))
 			break;
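
The rewritten do_select() loop uses the classic trick of registering on wait queues only while wait is non-NULL: the first ready descriptor (and the end of the first pass) clears it, so later scans are pure polls. A toy model:

/* Toy model of the wait-registration trick in do_select(). */
#include <stdio.h>

typedef int poll_table;	/* stand-in for the kernel's poll_table */

static int poll_fd(int fd, poll_table *wait)
{
	if (wait)
		printf("fd %d: registered on wait queue\n", fd);
	return fd == 2;		/* pretend fd 2 is ready */
}

int main(void)
{
	poll_table table = 0, *wait = &table;
	int retval = 0, fd;

	for (fd = 0; fd < 4; fd++) {
		if (poll_fd(fd, wait)) {
			retval++;
			wait = NULL;	/* ready: stop registering */
		}
	}
	printf("%d ready\n", retval);
	return 0;
}
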
@@ -194,43 +224,25 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 		}
 		__timeout = schedule_timeout(__timeout);
 	}
-
-out:	
 	current->state = TASK_RUNNING;
 
 	poll_freewait(&table);
 
 	/*
-	 * Update the caller timeout.
+	 * Update the caller's timeout.
 	 */
 	*timeout = __timeout;
 	return retval;
 }
 
-/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- */
+static void *select_bits_alloc(int size)
+{
+	return kmalloc(6 * size, GFP_KERNEL);
+}
 
-static int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
+static void select_bits_free(void *bits, int size)
 {
-	unsigned long rounded = FDS_BYTES(nr), mask; 
-	if (ufdset) {
-		int error = verify_area(VERIFY_WRITE, ufdset, rounded);
-		if (!error && __copy_from_user(fdset, ufdset, rounded))
-			error = -EFAULT;
-		if (nr % __NFDBITS == 0) 
-			mask = 0;
-		else { 
-			/* This includes one bit too much according to SU;
-			   but without this some programs hang. */ 
-			mask = ~(~0UL << (nr%__NFDBITS)); 
-		} 
-		fdset[nr/__NFDBITS] &= mask; 
-		return error;
-	}
-	memset(fdset, 0, rounded);
-	return 0;
+	kfree(bits);
 }
 
 /*
@@ -251,7 +263,6 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 	char *bits;
 	long timeout;
 	int ret, size, max_fdset;
-	char stack_bits[FDS_BYTES(FAST_SELECT_MAX) * 6]; 
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -286,16 +297,11 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 	 * since we used fdset we need to allocate memory in units of
 	 * long-words. 
 	 */
+	ret = -ENOMEM;
 	size = FDS_BYTES(n);
-	if (n < FAST_SELECT_MAX) { 
-		bits = stack_bits;
-	} else { 
-		ret = -ENOMEM;
-		bits = kmalloc(6*size, GFP_KERNEL);
-		if (!bits)
-			goto out_nofds;
-	} 
-
+	bits = select_bits_alloc(size);
+	if (!bits)
+		goto out_nofds;
 	fds.in      = (unsigned long *)  bits;
 	fds.out     = (unsigned long *) (bits +   size);
 	fds.ex      = (unsigned long *) (bits + 2*size);
@@ -307,7 +313,9 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 	    (ret = get_fd_set(n, outp, fds.out)) ||
 	    (ret = get_fd_set(n, exp, fds.ex)))
 		goto out;
-	memset(fds.res_in, 0, 3*size); 
+	zero_fd_set(n, fds.res_in);
+	zero_fd_set(n, fds.res_out);
+	zero_fd_set(n, fds.res_ex);
 
 	ret = do_select(n, &fds, &timeout);
 
@@ -318,8 +326,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 			usec = timeout % HZ;
 			usec *= (1000000/HZ);
 		}
-		__put_user(sec, &tvp->tv_sec);
-		__put_user(usec, &tvp->tv_usec);
+		put_user(sec, &tvp->tv_sec);
+		put_user(usec, &tvp->tv_usec);
 	}
 
 	if (ret < 0)
@@ -336,10 +344,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 	set_fd_set(n, exp, fds.res_ex);
 
 out:
-	if (n >= FAST_SELECT_MAX) 
-		kfree(bits);
+	select_bits_free(bits, size);
 out_nofds:
-
 	return ret;
 }
 
@@ -404,42 +410,12 @@ static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft,
 	return count;
 }
 
-static int fast_poll(poll_table *table, poll_table *wait, struct pollfd *ufds, 
-		     unsigned int nfds, long timeout)
-{ 
-	poll_table *pt = wait; 
-	struct pollfd fds[FAST_POLL_MAX];
-	int count, i; 
-
-	if (copy_from_user(fds, ufds, nfds * sizeof(struct pollfd)))
-		return -EFAULT; 
-	for (;;) { 
-		set_current_state(TASK_INTERRUPTIBLE);
-		count = 0; 
-		do_pollfd(nfds, fds, &pt, &count); 
-		pt = NULL;
-		if (count || !timeout || signal_pending(current))
-			break;
-		count = wait->error; 
-		if (count) 
-			break; 		
-		timeout = schedule_timeout(timeout);
-	} 
-	current->state = TASK_RUNNING;
-	for (i = 0; i < nfds; i++) 
-		__put_user(fds[i].revents, &ufds[i].revents);
-	poll_freewait(table);	
-	if (!count && signal_pending(current)) 
-		return -EINTR; 
-	return count; 
-} 
-
 asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
 {
-	int i, j, err, fdcount;
+	int i, j, fdcount, err;
 	struct pollfd **fds;
 	poll_table table, *wait;
-	int nchunks, nleft; 
+	int nchunks, nleft;
 
 	/* Do a sanity check on nfds ... */
 	if (nfds > NR_OPEN)
@@ -453,45 +429,43 @@ asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
 			timeout = MAX_SCHEDULE_TIMEOUT;
 	}
 
-
 	poll_initwait(&table);
 	wait = &table;
 	if (!timeout)
 		wait = NULL;
 
-	if (nfds < FAST_POLL_MAX) 
-		return fast_poll(&table, wait, ufds, nfds, timeout); 
-
 	err = -ENOMEM;
-	fds = (struct pollfd **)kmalloc(
-		(1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
-		GFP_KERNEL);
-	if (fds == NULL)
-		goto out;
-	
+	fds = NULL;
+	if (nfds != 0) {
+		fds = (struct pollfd **)kmalloc(
+			(1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
+			GFP_KERNEL);
+		if (fds == NULL)
+			goto out;
+	}
+
 	nchunks = 0;
 	nleft = nfds;
-	while (nleft > POLLFD_PER_PAGE) { 
+	while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
 		fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
 		if (fds[nchunks] == NULL)
 			goto out_fds;
 		nchunks++;
 		nleft -= POLLFD_PER_PAGE;
 	}
-	if (nleft) { 
+	if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
 		fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
 		if (fds[nchunks] == NULL)
 			goto out_fds;
-	} 
-	
+	}
+
 	err = -EFAULT;
 	for (i=0; i < nchunks; i++)
 		if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
 			goto out_fds1;
-	
 	if (nleft) {
 		if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE, 
-				   nleft * sizeof(struct pollfd)))
+				nleft * sizeof(struct pollfd)))
 			goto out_fds1;
 	}
 
@@ -515,7 +489,8 @@ asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
 out_fds:
 	for (i=0; i < nchunks; i++)
 		free_page((unsigned long)(fds[i]));
-	kfree(fds);
+	if (nfds != 0)
+		kfree(fds);
 out:
 	poll_freewait(&table);
 	return err;
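
The rewritten do_select() above drops the old word-at-a-time ffz() scan in favour of a plain per-descriptor loop: fd i lives in word i / __NFDBITS of each bitmap, and the patch's BIT(i) macro yields its mask within that word. Because max_select_fd() now validates the sets up front (under file_lock) and clamps n, the old in-loop -EBADF bail-out could presumably go away: invalid bits are rejected before the scan begins. A minimal user-space sketch of the fd-to-(word, mask) mapping; NFDBITS and BIT() here are stand-ins for the kernel's __NFDBITS and the patch's BIT() macro, and enumerate_set_fds() is hypothetical:

#include <stdio.h>

#define NFDBITS (8 * sizeof(unsigned long))        /* bits per long word */
#define BIT(i)  (1UL << ((i) & (NFDBITS - 1)))     /* fd's mask within its word */

/* Walk the bitmap one fd at a time, as the new do_select() loop does. */
static void enumerate_set_fds(const unsigned long *fds, int n)
{
	for (int i = 0; i < n; i++) {
		size_t off = i / NFDBITS;           /* which long word holds fd i */
		if (fds[off] & BIT(i))
			printf("fd %d is watched\n", i);
	}
}

int main(void)
{
	unsigned long fds[2] = { 0 };
	fds[0] |= BIT(3);                           /* fd 3: word 0 */
	fds[1] |= BIT(70);                          /* fd 70: word 1, bit 6 */
	enumerate_set_fds(fds, 128);
	return 0;
}
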
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 86b1ee2d3eb319e5c3ec56722247c0c34b39ee82..796aac51388a499a917e855a2cc16d179f0495ee 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -10,32 +10,13 @@
 #include <linux/mm.h>
 #include <asm/uaccess.h>
 
-#define POLL_INLINE_BYTES 256
-#define FAST_SELECT_MAX  128
-#define FAST_POLL_MAX    128
-#define POLL_INLINE_ENTRIES (1+(POLL_INLINE_BYTES / sizeof(struct poll_table_entry)))
-
-struct poll_table_entry {
-	struct file * filp;
-	wait_queue_t wait;
-	wait_queue_head_t * wait_address;
-};
-
-struct poll_table_page {
-	struct poll_table_page * next;
-	struct poll_table_entry * entry;
-	struct poll_table_entry entries[0];
-};
+struct poll_table_page;
 
 typedef struct poll_table_struct {
 	int error;
 	struct poll_table_page * table;
-	struct poll_table_page inline_page; 
-	struct poll_table_entry inline_table[POLL_INLINE_ENTRIES]; 
 } poll_table;
 
-#define POLL_INLINE_TABLE_LEN (sizeof(poll_table) - offsetof(poll_table, inline_page))
-
 extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
 
 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -49,7 +30,6 @@ static inline void poll_initwait(poll_table* pt)
 	pt->error = 0;
 	pt->table = NULL;
 }
-
 extern void poll_freewait(poll_table* pt);
 
 
@@ -69,6 +49,27 @@ typedef struct {
 #define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
 #define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
 
+/*
+ * We do a VERIFY_WRITE here even though we are only reading this time:
+ * we'll write to it eventually.
+ *
+ * Use "unsigned long" accesses to let user-mode fd_sets be long-aligned.
+ */
+static inline
+int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
+{
+	nr = FDS_BYTES(nr);
+	if (ufdset) {
+		int error;
+		error = verify_area(VERIFY_WRITE, ufdset, nr);
+		if (!error && __copy_from_user(fdset, ufdset, nr))
+			error = -EFAULT;
+		return error;
+	}
+	memset(fdset, 0, nr);
+	return 0;
+}
+
 static inline
 void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
 {
@@ -76,6 +77,12 @@ void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
 		__copy_to_user(ufdset, fdset, FDS_BYTES(nr));
 }
 
+static inline
+void zero_fd_set(unsigned long nr, unsigned long *fdset)
+{
+	memset(fdset, 0, FDS_BYTES(nr));
+}
+
 extern int do_select(int n, fd_set_bits *fds, long *timeout);
 
 #endif /* KERNEL */
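
With get_fd_set(), set_fd_set() and zero_fd_set() now living beside the fd_set_bits typedef, the six-way split of the single select_bits_alloc(6 * size) buffer is easy to picture. A user-space sketch of that layout; setup_fd_set_bits() is a hypothetical helper and malloc() stands in for kmalloc():

#include <stdlib.h>
#include <string.h>

typedef struct {
	unsigned long *in, *out, *ex;
	unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/* Carve one allocation into six equal bitmaps, as sys_select() does. */
static int setup_fd_set_bits(fd_set_bits *fds, size_t size, char **bits_out)
{
	char *bits = malloc(6 * size);          /* select_bits_alloc() analogue */
	if (!bits)
		return -1;
	fds->in      = (unsigned long *)  bits;
	fds->out     = (unsigned long *) (bits +     size);
	fds->ex      = (unsigned long *) (bits + 2 * size);
	fds->res_in  = (unsigned long *) (bits + 3 * size);
	fds->res_out = (unsigned long *) (bits + 4 * size);
	fds->res_ex  = (unsigned long *) (bits + 5 * size);
	memset(fds->res_in, 0, 3 * size);       /* clear the three result sets */
	*bits_out = bits;
	return 0;
}

The kernel version clears each result set with a zero_fd_set(n, ...) call instead of one memset over the last three slices; the effect is the same here because size is FDS_BYTES(n) and the result slices are contiguous, but zero_fd_set() documents the intent per set.
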
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index cb6332482af27cfd0a875f5660da2f67147cb4cc..19ebeb0a4988cb50e69387cea50c793197fe458b 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -63,8 +63,6 @@
 extern int md_size[MAX_MD_DEVS];
 extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
 
-extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
-extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
 extern char * partition_name (kdev_t dev);
 extern inline char * bdev_partition_name (struct block_device *bdev)
 {
@@ -77,14 +75,9 @@ extern mdk_thread_t * md_register_thread (void (*run) (void *data),
 extern void md_unregister_thread (mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_interrupt_thread (mdk_thread_t *thread);
-extern int md_update_sb (mddev_t *mddev);
-extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
+extern void md_update_sb (mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(kdev_t dev, unsigned long nr_sectors);
-extern void md_recover_arrays (void);
-extern int md_check_ordering (mddev_t *mddev);
-extern int md_notify_reboot(struct notifier_block *this,
-					unsigned long code, void *x);
 extern int md_error (mddev_t *mddev, struct block_device *bdev);
 extern int md_run_setup(void);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 7b270a50487ee7b9fb3fb1f11870d48a12e6df5f..e238bd222ea2607d56363d3e5363a3eb3477612f 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -64,24 +64,6 @@ typedef struct mdk_rdev_s mdk_rdev_t;
 
 #define MAX_MD_DEVS  (1<<MINORBITS)	/* Max number of md dev */
 
-/*
- * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
- * the personality. (eg. HSM uses this to identify individual LVs)
- */
-typedef struct dev_mapping_s {
-	mddev_t *mddev;
-	void *data;
-} dev_mapping_t;
-
-extern dev_mapping_t mddev_map [MAX_MD_DEVS];
-
-static inline mddev_t * kdev_to_mddev (kdev_t dev)
-{
-	if (major(dev) != MD_MAJOR)
-		BUG();
-        return mddev_map[minor(dev)].mddev;
-}
-
 /*
  * options passed in raidrun:
  */
@@ -196,31 +178,38 @@ struct mddev_s
 	mdk_personality_t		*pers;
 	int				__minor;
 	mdp_super_t			*sb;
-	int				nb_dev;
 	struct list_head 		disks;
 	int				sb_dirty;
-	mdu_param_t			param;
 	int				ro;
+
+	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
 	unsigned long			curr_resync;	/* blocks scheduled */
 	unsigned long			resync_mark;	/* a recent timestamp */
 	unsigned long			resync_mark_cnt;/* blocks written at resync_mark */
-	char				*name;
+	/* recovery_running is 0 for no recovery/resync,
+	 * 1 for active recovery,
+	 * 2 for active resync,
+	 * or -error for an error (e.g. -EINTR).
+	 * It can only be set > 0 under reconfig_sem.
+	 */
 	int				recovery_running;
+	int				in_sync;	/* known not to need resync */
 	struct semaphore		reconfig_sem;
-	struct semaphore		recovery_sem;
-	struct semaphore		resync_sem;
 	atomic_t			active;
+	mdp_disk_t			*spare;
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
 
+	request_queue_t			queue;	/* for plugging ... */
+
 	struct list_head		all_mddevs;
 };
 
 struct mdk_personality_s
 {
 	char *name;
-	int (*make_request)(mddev_t *mddev, int rw, struct bio *bio);
+	int (*make_request)(request_queue_t *q, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	int (*status)(char *page, mddev_t *mddev);
@@ -237,9 +226,6 @@ struct mdk_personality_s
  * SPARE_ACTIVE expects such a change)
  */
 	int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
-
-	int (*stop_resync)(mddev_t *mddev);
-	int (*restart_resync)(mddev_t *mddev);
 	int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster);
 };
 
@@ -279,13 +265,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
 #define ITERATE_RDEV(mddev,rdev,tmp)					\
 	ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
 
-/*
- * Same as above, but assumes that the device has rdev->desc_nr numbered
- * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
- */
-#define ITERATE_RDEV_ORDERED(mddev,rdev,i)				\
-	for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
-
 
 /*
  * Iterates through all 'RAID managed disks'
@@ -299,26 +278,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
 #define ITERATE_RDEV_PENDING(rdev,tmp)					\
 	ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
 
-/*
- * iterates through all used mddevs in the system.
- */
-#define ITERATE_MDDEV(mddev,tmp)					\
-									\
-	for (tmp = all_mddevs.next;					\
-		mddev = list_entry(tmp, mddev_t, all_mddevs),	\
-			tmp = tmp->next, tmp->prev != &all_mddevs	\
-		; )
-
-static inline int lock_mddev (mddev_t * mddev)
-{
-	return down_interruptible(&mddev->reconfig_sem);
-}
-
-static inline void unlock_mddev (mddev_t * mddev)
-{
-	up(&mddev->reconfig_sem);
-}
-
 #define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
 				x = y; y = __tmp; } while (0)
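
The new comment on recovery_running in struct mddev_s pins down a small state machine: 0 when idle, 1 during recovery, 2 during resync, negative on error, with positive transitions allowed only under reconfig_sem. A hypothetical helper showing how a caller might honour that rule; start_recovery() and its -EBUSY policy are illustrations, not code from this patch:

/* Sketch only: set recovery_running > 0 strictly under reconfig_sem,
 * as the struct mddev_s comment requires. */
static int start_recovery(mddev_t *mddev, int resync)
{
	if (down_interruptible(&mddev->reconfig_sem))
		return -EINTR;
	if (mddev->recovery_running > 0) {      /* recovery/resync already active */
		up(&mddev->reconfig_sem);
		return -EBUSY;
	}
	mddev->recovery_running = resync ? 2 : 1;
	up(&mddev->reconfig_sem);
	return 0;
}
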
 
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 3185c754e3fb3b3f9084f80639b1708568df73ea..f9f02ab19bd3a206c97a4623a21cd76d40ba388e 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -33,8 +33,7 @@ struct r1_private_data_s {
 	int			working_disks;
 	int			last_used;
 	sector_t		next_seq_sect;
-	mdk_thread_t		*thread, *resync_thread;
-	int			resync_mirrors;
+	mdk_thread_t		*thread;
 	mirror_info_t		*spare;
 	spinlock_t		device_lock;
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 67f7bf47179882068c4ae70c4a0679e1ffa203b1..e14864259ffdbd094ff37debcc245b4065f1c03c 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -177,7 +177,7 @@ struct stripe_head {
  * is put on a "delayed" queue until there are no stripes currently
  * in a pre-read phase.  Further, if the "delayed" queue is empty when
  * a stripe is put on it then we "plug" the queue and do not process it
- * until an unplg call is made. (the tq_disk list is run).
+ * until an unplug call is made. (blk_run_queues is run).
  *
  * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
  * it to the count of prereading stripes.
@@ -205,12 +205,11 @@ struct disk_info {
 struct raid5_private_data {
 	struct stripe_head	**stripe_hashtbl;
 	mddev_t			*mddev;
-	mdk_thread_t		*thread, *resync_thread;
+	mdk_thread_t		*thread;
 	struct disk_info	disks[MD_SB_DISKS];
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
 	int			raid_disks, working_disks, failed_disks;
-	int			resync_parity;
 	int			max_nr_stripes;
 
 	struct list_head	handle_list; /* stripes needing handling */
@@ -229,9 +228,6 @@ struct raid5_private_data {
 							 * waiting for 25% to be free
 							 */        
 	spinlock_t		device_lock;
-
-	int			plugged;
-	struct tq_struct	plug_tq;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
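
The corrected stripe_head comment describes raid5's pre-read throttle: a stripe that would start a pre-read while others are still in a pre-read phase sits on the "delayed" queue, and the queue is plugged when the first stripe lands there, to be released by blk_run_queues(). A sketch of that plug-on-first-delayed-stripe rule; the delayed_list field, the lru linkage and the blk_plug_device() call are assumptions drawn from the comment, not lines from this patch:

/* Sketch only: plug the md device queue when the first stripe is
 * delayed; blk_run_queues() unplugs it and lets handling resume. */
static void add_delayed_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	int was_empty = list_empty(&conf->delayed_list);

	list_add_tail(&sh->lru, &conf->delayed_list);
	if (was_empty)
		blk_plug_device(&conf->mddev->queue);
}
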