Commit 079dff39 authored by Linus Torvalds

- pre4:

   - Andrea Arcangeli: SMP scheduler memory barrier fixup
   - Richard Henderson: fix alpha semaphores and spinlock bugs.
   - Richard Henderson: clean up the file from hell: "xor.c"
parent 7f6760c7
@@ -63,7 +63,6 @@ unset CONFIG_ALPHA_T2 CONFIG_ALPHA_PYXIS CONFIG_ALPHA_POLARIS
unset CONFIG_ALPHA_TSUNAMI CONFIG_ALPHA_MCPCIA
unset CONFIG_ALPHA_IRONGATE
unset CONFIG_ALPHA_BROKEN_IRQ_MASK
unset CONFIG_ALPHA_LARGE_VMALLOC
# Most of these machines have ISA slots; not exactly sure which don't,
# and this doesn't activate hordes of code, so do it always.
@@ -215,6 +214,8 @@ if [ "$CONFIG_ALPHA_GENERIC" = "y" -o "$CONFIG_ALPHA_DP264" = "y" \
-o "$CONFIG_ALPHA_WILDFIRE" = "y" -o "$CONFIG_ALPHA_TITAN" = "y" ]
then
bool 'Large VMALLOC support' CONFIG_ALPHA_LARGE_VMALLOC
else
define_bool CONFIG_ALPHA_LARGE_VMALLOC n
fi
source drivers/pci/Config.in
......
@@ -160,15 +160,20 @@ EXPORT_SYMBOL_NOVERS(__do_clear_user);
EXPORT_SYMBOL(__strncpy_from_user);
EXPORT_SYMBOL(__strnlen_user);
/*
* The following are specially called from the semaphore assembly stubs.
*/
EXPORT_SYMBOL_NOVERS(__down_failed);
EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
EXPORT_SYMBOL_NOVERS(__up_wakeup);
EXPORT_SYMBOL_NOVERS(__down_read_failed);
EXPORT_SYMBOL_NOVERS(__down_write_failed);
EXPORT_SYMBOL_NOVERS(__rwsem_wake);
/* Semaphore helper functions. */
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__up_wakeup);
EXPORT_SYMBOL(down);
EXPORT_SYMBOL(down_interruptible);
EXPORT_SYMBOL(up);
EXPORT_SYMBOL(__down_read_failed);
EXPORT_SYMBOL(__down_write_failed);
EXPORT_SYMBOL(__rwsem_wake);
EXPORT_SYMBOL(down_read);
EXPORT_SYMBOL(down_write);
EXPORT_SYMBOL(up_read);
EXPORT_SYMBOL(up_write);
/*
* SMP-specific symbols.
......
/*
* Generic semaphore code. Buyer beware. Do your own
* specific changes in <asm/semaphore-helper.h>
* Alpha semaphore implementation.
*
* (C) Copyright 1996 Linus Torvalds
* (C) Copyright 1999, 2000 Richard Henderson
*/
#include <linux/sched.h>
#include <asm/semaphore-helper.h>
/*
* Semaphores are implemented using a two-way counter:
* The "count" variable is decremented for each process
* that tries to sleep, while the "waking" variable is
* incremented when the "up()" code goes to wake up waiting
* processes.
*
* The "count" variable is decremented for each process that tries to sleep,
* while the "waking" variable is incremented when the "up()" code goes to
* wake up waiting processes.
*
* Notably, the inline "up()" and "down()" functions can
* efficiently test if they need to do any extra work (up
* needs to do something only if count was negative before
* the increment operation).
* Notably, the inline "up()" and "down()" functions can efficiently test
* if they need to do any extra work (up needs to do something only if count
* was negative before the increment operation).
*
* waking_non_zero() (from asm/semaphore.h) must execute
* atomically.
* waking_non_zero() (from asm/semaphore.h) must execute atomically.
*
* When __up() is called, the count was negative before
* incrementing it, and we need to wake up somebody.
* When __up() is called, the count was negative before incrementing it,
* and we need to wake up somebody.
*
* This routine adds one to the count of processes that need to
* wake up and exit. ALL waiting processes actually wake up but
* only the one that gets to the "waking" field first will gate
* through and acquire the semaphore. The others will go back
* to sleep.
* This routine adds one to the count of processes that need to wake up and
* exit. ALL waiting processes actually wake up but only the one that gets
* to the "waking" field first will gate through and acquire the semaphore.
* The others will go back to sleep.
*
* Note that these functions are only called when there is
* contention on the lock, and as such all this is the
* "non-critical" part of the whole semaphore business. The
* critical part is the inline stuff in <asm/semaphore.h>
* where we want to avoid any extra jumps and calls.
* Note that these functions are only called when there is contention on the
* lock, and as such all this is the "non-critical" part of the whole
* semaphore business. The critical part is the inline stuff in
* <asm/semaphore.h> where we want to avoid any extra jumps and calls.
*/
void
__up(struct semaphore *sem)
{
wake_one_more(sem);
wake_up(&sem->wait);
}
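
The comment above describes the two-level "count"/"waking" scheme only in prose. As a reading aid, here is a minimal userspace sketch of that idea using C11 atomics; it is not the kernel's implementation, the sleep/wake primitives are stubbed out, and the fast/slow split merely mirrors the description above.

#include <stdatomic.h>

struct toy_sem {
	atomic_int count;	/* > 0: free slots; goes negative as sleepers pile up */
	atomic_int waking;	/* wakeups granted by up() but not yet consumed */
};

static void toy_sleep(struct toy_sem *s)    { (void)s; /* e.g. futex/condvar wait */ }
static void toy_wake_all(struct toy_sem *s) { (void)s; /* e.g. futex/condvar broadcast */ }

static void toy_down(struct toy_sem *s)
{
	if (atomic_fetch_sub(&s->count, 1) > 0)
		return;				/* fast path: no contention */
	for (;;) {
		int w = atomic_load(&s->waking);
		while (w > 0) {			/* consume one pending wakeup */
			if (atomic_compare_exchange_weak(&s->waking, &w, w - 1))
				return;		/* we gate through; the others sleep again */
		}
		toy_sleep(s);
	}
}

static void toy_up(struct toy_sem *s)
{
	if (atomic_fetch_add(&s->count, 1) >= 0)
		return;				/* count was not negative: nobody waiting */
	atomic_fetch_add(&s->waking, 1);	/* the __up() slow path: grant one wakeup */
	toy_wake_all(s);			/* ALL waiters wake; one wins the race above */
}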
/*
* Perform the "down" function. Return zero for semaphore acquired,
* return negative for signalled out of the function.
*
* If called from __down, the return is ignored and the wait loop is
* If called from down, the return is ignored and the wait loop is
* not interruptible. This means that a task waiting on a semaphore
* using "down()" cannot be killed until someone does an "up()" on
* the semaphore.
*
* If called from __down_interruptible, the return value gets checked
* If called from down_interruptible, the return value gets checked
* upon return. If the return value is negative then the task continues
* with the negative value in the return register (it can be tested by
* the caller).
*
* Either form may be used in conjunction with "up()".
*
*/
#define DOWN_VAR \
struct task_struct *tsk = current; \
wait_queue_t wait; \
init_waitqueue_entry(&wait, tsk)
#define DOWN_HEAD(task_state) \
\
\
tsk->state = (task_state); \
add_wait_queue(&sem->wait, &wait); \
\
/* \
* Ok, we're set up. sem->count is known to be less than zero \
* so we must wait. \
* \
* We can let go the lock for purposes of waiting. \
* We re-acquire it after awaking so as to protect \
* all semaphore operations. \
* \
* If "up()" is called before we call waking_non_zero() then \
* we will catch it right away. If it is called later then \
* we will have to go through a wakeup cycle to catch it. \
* \
* Multiple waiters contend for the semaphore lock to see \
* who gets to gate through and who has to wait some more. \
*/ \
for (;;) {
#define DOWN_TAIL(task_state) \
tsk->state = (task_state); \
} \
tsk->state = TASK_RUNNING; \
remove_wait_queue(&sem->wait, &wait)
void
__down(struct semaphore * sem)
__down_failed(struct semaphore *sem)
{
DOWN_VAR;
DOWN_HEAD(TASK_UNINTERRUPTIBLE);
DECLARE_WAITQUEUE(wait, current);
#if DEBUG_SEMAPHORE
printk("%s(%d): down failed(%p)\n",
current->comm, current->pid, sem);
#endif
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
/* At this point we know that sem->count is negative. In order
to avoid racing with __up, we must check for wakeup before
going to sleep the first time. */
while (1) {
long ret, tmp;
/* An atomic conditional decrement of sem->waking. */
__asm__ __volatile__(
"1: ldl_l %1,%2\n"
" blt %1,2f\n"
" subl %1,1,%0\n"
" stl_c %0,%2\n"
" beq %0,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=r"(ret), "=&r"(tmp), "=m"(sem->waking)
: "0"(0));
if (ret)
break;
if (waking_non_zero(sem))
break;
schedule();
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
DOWN_TAIL(TASK_UNINTERRUPTIBLE);
remove_wait_queue(&sem->wait, &wait);
current->state = TASK_RUNNING;
#if DEBUG_SEMAPHORE
printk("%s(%d): down acquired(%p)\n",
current->comm, current->pid, sem);
#endif
}
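
The ldl_l/stl_c block above is an atomic conditional decrement of sem->waking. Below is a hedged C rendering of the same operation, written with GCC __atomic builtins purely as a reading aid; the bias convention for "waking" lives in <asm/semaphore.h>, which is not part of this diff.

/* Atomically: if (*waking >= 0) { (*waking)--; return 1; } return 0;
 * The CAS retry loop stands in for the stl_c failure branch of the asm above. */
static int waking_dec_if_nonneg(int *waking)
{
	int old = __atomic_load_n(waking, __ATOMIC_RELAXED);

	while (old >= 0) {
		if (__atomic_compare_exchange_n(waking, &old, old - 1,
						1, __ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return 1;	/* gated through: semaphore acquired */
	}
	return 0;			/* no wakeup pending: sleep again */
}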
int
__down_interruptible(struct semaphore * sem)
__down_failed_interruptible(struct semaphore *sem)
{
int ret = 0;
DOWN_VAR;
DOWN_HEAD(TASK_INTERRUPTIBLE);
ret = waking_non_zero_interruptible(sem, tsk);
if (ret)
{
if (ret == 1)
/* ret != 0 only if we get interrupted -arca */
ret = 0;
break;
DECLARE_WAITQUEUE(wait, current);
long ret;
#if DEBUG_SEMAPHORE
printk("%s(%d): down failed(%p)\n",
current->comm, current->pid, sem);
#endif
current->state = TASK_INTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
while (1) {
long tmp, tmp2, tmp3;
/* We must undo the sem->count down_interruptible decrement
simultaneously and atomically with the sem->waking
adjustment, otherwise we can race with __up. This is
accomplished by doing a 64-bit ll/sc on two 32-bit words.
"Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
ret = 0;
if (tmp >= 0) { // waking >= 0
tmp += 0xffffffff00000000; // waking -= 1
ret = 1;
}
else if (pending) {
// count += 1, but since -1 + 1 carries into the
// high word, we have to be more careful here.
tmp = (tmp & 0xffffffff00000000)
| ((tmp + 1) & 0x00000000ffffffff);
ret = -EINTR;
}
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %0,0\n"
" cmovne %5,%6,%0\n"
" addq %1,1,%2\n"
" and %1,%7,%3\n"
" andnot %2,%7,%2\n"
" cmovge %1,1,%0\n"
" or %3,%2,%2\n"
" addq %1,%7,%3\n"
" cmovne %5,%2,%1\n"
" cmovge %2,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2),
"=&r"(tmp3), "=m"(*sem)
: "r"(signal_pending(current)), "r"(-EINTR),
"r"(0xffffffff00000000));
/* At this point we have ret
1 got the lock
0 go to sleep
-EINTR interrupted */
if (ret != 0)
break;
schedule();
set_task_state(current, TASK_INTERRUPTIBLE);
}
schedule();
DOWN_TAIL(TASK_INTERRUPTIBLE);
return ret;
remove_wait_queue(&sem->wait, &wait);
current->state = TASK_RUNNING;
wake_up(&sem->wait);
#if DEBUG_SEMAPHORE
printk("%s(%d): down %s(%p)\n",
current->comm, current->pid,
(ret < 0 ? "interrupted" : "acquired"), sem);
#endif
/* Convert "got the lock" to 0==success. */
return (ret < 0 ? ret : 0);
}
void
__up_wakeup(struct semaphore *sem)
{
wake_up(&sem->wait);
}
void
down(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): down(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
__down(sem);
}
int
down_interruptible(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): down(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
return __down_interruptible(sem);
}
int
__down_trylock(struct semaphore * sem)
down_trylock(struct semaphore *sem)
{
return waking_non_zero_trylock(sem);
int ret;
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
ret = __down_trylock(sem);
#if DEBUG_SEMAPHORE
printk("%s(%d): down_trylock %s from %p\n",
current->comm, current->pid,
ret ? "failed" : "acquired",
__builtin_return_address(0));
#endif
return ret;
}
void
up(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): up(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
__up(sem);
}
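
Taken together, down(), down_interruptible(), down_trylock() and up() are the API the rest of the kernel sees. A hedged usage sketch follows; my_dev_sem and my_device_io() are made-up names, the semaphore is assumed to be initialized to 1 elsewhere, and the error constants come from <linux/errno.h>.

extern struct semaphore my_dev_sem;	/* assumed: initialized as a mutex (count 1) */
extern void my_device_io(void);		/* hypothetical helper */

int my_dev_write(void)
{
	if (down_interruptible(&my_dev_sem))
		return -EINTR;		/* a signal interrupted the wait */
	my_device_io();
	up(&my_dev_sem);
	return 0;
}

int my_dev_try(void)
{
	if (down_trylock(&my_dev_sem))
		return -EBUSY;		/* nonzero means the lock was not acquired */
	my_device_io();
	up(&my_dev_sem);
	return 0;
}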
@@ -142,122 +270,106 @@ __down_trylock(struct semaphore * sem)
*/
void
__down_read(struct rw_semaphore *sem, int count)
__down_read_failed(struct rw_semaphore *sem, int count)
{
long tmp;
DOWN_VAR;
DECLARE_WAITQUEUE(wait, current);
retry_down:
if (count < 0) {
/* Wait for the lock to become unbiased. Readers
are non-exclusive. */
/* Waiting on multiple readers and/or writers. */
/* This takes care of granting the lock. */
up_read(sem);
/* Undo the acquisition we started in down_read. */
atomic_inc(&sem->count);
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue(&sem->wait, &wait);
while (sem->count < 0) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= 0)
break;
mb();
while (atomic_read(&sem->count) < 0) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
__asm __volatile (
" mb\n"
"1: ldl_l %0,%1\n"
" subl %0,1,%2\n"
" subl %0,1,%0\n"
" stl_c %2,%1\n"
" bne %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
: "=r"(count), "=m"(sem->count), "=r"(tmp)
: : "memory");
current->state = TASK_RUNNING;
mb();
count = atomic_dec_return(&sem->count);
if (count <= 0)
goto retry_down;
} else {
/* Waiting on exactly one writer. */
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue(&sem->wait, &wait);
mb();
while (1) {
if (test_and_clear_bit(0, &sem->granted))
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if ((sem->granted & 1) == 0)
schedule();
while (!test_and_clear_bit(0, &sem->granted)) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
current->state = TASK_RUNNING;
}
}
void
__down_write(struct rw_semaphore *sem, int count)
__down_write_failed(struct rw_semaphore *sem, int count)
{
long tmp;
DOWN_VAR;
DECLARE_WAITQUEUE(wait, current);
retry_down:
if (count + RW_LOCK_BIAS < 0) {
up_write(sem);
/* Waiting on multiple readers and/or writers. */
/* Undo the acquisition we started in down_write. */
atomic_add(RW_LOCK_BIAS, &sem->count);
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
mb();
while (sem->count < 0) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= RW_LOCK_BIAS)
break;
while (atomic_read(&sem->count) + RW_LOCK_BIAS < 0) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
__asm __volatile (
" mb\n"
"1: ldl_l %0,%1\n"
" ldah %2,%3(%0)\n"
" ldah %0,%3(%0)\n"
" stl_c %2,%1\n"
" bne %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
: "=r"(count), "=m"(sem->count), "=r"(tmp)
: "i"(-(RW_LOCK_BIAS >> 16))
: "memory");
current->state = TASK_RUNNING;
count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
if (count != 0)
goto retry_down;
} else {
/* Put ourselves at the end of the list. */
add_wait_queue_exclusive(&sem->write_bias_wait, &wait);
while (1) {
if (test_and_clear_bit(1, &sem->granted))
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if ((sem->granted & 2) == 0)
schedule();
/* Waiting on exactly one writer. */
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
mb();
while (!test_and_clear_bit(1, &sem->granted)) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->write_bias_wait, &wait);
tsk->state = TASK_RUNNING;
current->state = TASK_RUNNING;
/* If the lock is currently unbiased, awaken the sleepers.
FIXME: This wakes up the readers early in a bit of a
stampede -> bad! */
if (sem->count >= 0)
count = atomic_read(&sem->count);
if (__builtin_expect(count >= 0, 0))
wake_up(&sem->wait);
}
}
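
Both failed paths above undo and retry an adjustment of the biased count. As a reading aid, here is a hedged sketch of that convention; the real RW_LOCK_BIAS value and the inline fast paths live in <asm/semaphore.h>, which is not part of this diff, so TOY_BIAS is only a stand-in.

enum { TOY_BIAS = 0x01000000 };		/* stand-in for RW_LOCK_BIAS */

/* count == TOY_BIAS      -> lock free
 * count == TOY_BIAS - N  -> held by N readers (N much smaller than the bias)
 * count <= 0             -> a writer holds the lock or has already
 *                           subtracted the bias while queueing for it */

static int toy_writer_contended(int count)
{
	/* mirrors the "count + RW_LOCK_BIAS < 0" test in __down_write_failed:
	 * at least one other writer has also subtracted the bias. */
	return count + TOY_BIAS < 0;
}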
void
__do_rwsem_wake(struct rw_semaphore *sem, int readers)
__rwsem_wake(struct rw_semaphore *sem, int readers)
{
if (readers) {
if (test_and_set_bit(0, &sem->granted))
@@ -269,3 +381,67 @@ __do_rwsem_wake(struct rw_semaphore *sem, int readers)
wake_up(&sem->write_bias_wait);
}
}
void
down_read(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__down_read(sem);
#if WAITQUEUE_DEBUG
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_inc(&sem->readers);
#endif
}
void
down_write(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__down_write(sem);
#if WAITQUEUE_DEBUG
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->writers))
BUG();
if (atomic_read(&sem->readers))
BUG();
atomic_inc(&sem->writers);
#endif
}
void
up_read(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_dec(&sem->readers);
#endif
__up_read(sem);
}
void
up_write(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->readers))
BUG();
if (atomic_read(&sem->writers) != 1)
BUG();
atomic_dec(&sem->writers);
#endif
__up_write(sem);
}
@@ -378,6 +378,9 @@ do_settimeofday(struct timeval *tv)
* BUG: This routine does not handle hour overflow properly; it just
* sets the minutes. Usually you won't notice until after reboot!
*/
extern int abs(int);
static int
set_rtc_mmss(unsigned long nowtime)
{
......
@@ -12,7 +12,7 @@ OBJS = __divqu.o __remqu.o __divlu.o __remlu.o memset.o memcpy.o io.o \
strcat.o strcpy.o strncat.o strncpy.o stxcpy.o stxncpy.o \
strchr.o strrchr.o memchr.o \
copy_user.o clear_user.o strncpy_from_user.o strlen_user.o \
csum_ipv6_magic.o strcasecmp.o semaphore.o fpreg.o \
csum_ipv6_magic.o strcasecmp.o fpreg.o \
callback_srm.o srm_puts.o srm_printk.o
lib.a: $(OBJS)
......
/*
* linux/arch/alpha/lib/semaphore.S
*
* Copyright (C) 1999, 2000 Richard Henderson
*/
/*
* The semaphore operations have a special calling sequence that
* allows us to do a simpler in-line version of them. These routines
* need to convert that sequence back into the C sequence when
* there is contention on the semaphore.
*/
.set noat
.set noreorder
.align 4
/* __down_failed takes the semaphore in $24, clobbers $24 and $28. */
.globl __down_failed
.ent __down_failed
__down_failed:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __down
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __down_failed
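
This stub (and the two that follow) exists so the inline fast path can call out on contention without disturbing any registers beyond $24 and $28: everything else is saved, the semaphore pointer is forwarded from $24 to the first C argument register $16, and the return goes back through $28. Purely to illustrate the division of labour, here is a C-level sketch of such a fast path; the real one is inline asm in <asm/semaphore.h>, which is not in this diff, and it enters the stub with the special register convention rather than a normal call.

static inline void sketch_down(struct semaphore *sem)
{
	if (atomic_dec_return(&sem->count) < 0)	/* count went negative: contention */
		__down_failed(sem);		/* shown as a plain call for readability */
}

static inline void sketch_up(struct semaphore *sem)
{
	if (atomic_inc_return(&sem->count) <= 0)	/* count was negative before the inc */
		__up_wakeup(sem);
}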
/* __down_failed_interruptible takes the semaphore in $24,
clobbers $28, returns success in $24. */
.globl __down_failed_interruptible
.ent __down_failed_interruptible
__down_failed_interruptible:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __down_interruptible
mov $0, $24
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __down_failed_interruptible
/* __up_wakeup takes the semaphore in $24, clobbers $24 and $28. */
.globl __up_wakeup
.ent __up_wakeup
__up_wakeup:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __up
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __up_wakeup
/* __down_read_failed takes the semaphore in $24, count in $25;
clobbers $24, $25 and $28. */
.globl __down_read_failed
.ent __down_read_failed
__down_read_failed:
ldgp $29,0($27)
lda $30, -18*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __down_read
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __down_read_failed
/* __down_write_failed takes the semaphore in $24, count in $25;
clobbers $24, $25 and $28. */
.globl __down_write_failed
.ent __down_write_failed
__down_write_failed:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __down_write
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __down_write_failed
/* __rwsem_wake takes the semaphore in $24, readers in $25;
clobbers $24, $25, and $28. */
.globl __rwsem_wake
.ent __rwsem_wake
__rwsem_wake:
ldgp $29,0($27)
lda $30, -18*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __do_rwsem_wake
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __rwsem_wake
@@ -2344,18 +2344,7 @@ static mdk_personality_t raid5_personality=
int raid5_init (void)
{
int err;
err = register_md_personality (RAID5, &raid5_personality);
if (err)
return err;
/*
* pick a XOR routine, runtime.
*/
calibrate_xor_block();
return 0;
return register_md_personality (RAID5, &raid5_personality);
}
#ifdef MODULE
......
/*
* xor.c : Multiple Devices driver for Linux
*
* Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
* Copyright (C) 1996, 1997, 1998, 1999, 2000,
* Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
*
*
* optimized RAID-5 checksumming functions.
* Dispatch optimized RAID-5 checksumming functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -15,2584 +15,66 @@
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/config.h>
#define BH_TRACE 0
#include <linux/module.h>
#include <linux/raid/md.h>
#ifdef __sparc_v9__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/visasm.h>
#endif
/*
* we use the 'XOR function template' to register multiple xor
* functions at runtime. The kernel measures their speed at bootup
* and decides which one to use. (compile-time registration is
* not enough as certain CPU features like MMX can only be detected
* at runtime)
*
* this architecture makes it pretty easy to add new routines
* that are faster on certain CPUs, without killing other CPUs'
* 'native' routine. Although the current routines are believed
* to be the physically fastest ones on all CPUs tested,
* feel free to prove me wrong and add yet another routine =B-)
* --mingo
*/
#define MAX_XOR_BLOCKS 5
#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
typedef void (*xor_block_t) XOR_ARGS;
xor_block_t xor_block = NULL;
#ifndef __sparc_v9__
struct xor_block_template;
struct xor_block_template {
char * name;
xor_block_t xor_block;
int speed;
struct xor_block_template * next;
};
struct xor_block_template * xor_functions = NULL;
#define XORBLOCK_TEMPLATE(x) \
static void xor_block_##x XOR_ARGS; \
static struct xor_block_template t_xor_block_##x = \
{ #x, xor_block_##x, 0, NULL }; \
static void xor_block_##x XOR_ARGS
#ifdef __i386__
#ifdef CONFIG_X86_XMM
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
XORBLOCK_TEMPLATE(pIII_kni)
{
char xmm_save[16*4];
int cr0;
int lines = (bh_ptr[0]->b_size>>8);
__asm__ __volatile__ (
"movl %%cr0,%0 ;\n\t"
"clts ;\n\t"
"movups %%xmm0,(%1) ;\n\t"
"movups %%xmm1,0x10(%1) ;\n\t"
"movups %%xmm2,0x20(%1) ;\n\t"
"movups %%xmm3,0x30(%1) ;\n\t"
: "=r" (cr0)
: "r" (xmm_save)
: "memory" );
#define OFFS(x) "8*("#x"*2)"
#define PF0(x) \
" prefetcht0 "OFFS(x)"(%1) ;\n"
#define LD(x,y) \
" movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) \
" movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) \
" prefetchnta "OFFS(x)"(%2) ;\n"
#define PF2(x) \
" prefetchnta "OFFS(x)"(%3) ;\n"
#define PF3(x) \
" prefetchnta "OFFS(x)"(%4) ;\n"
#define PF4(x) \
" prefetchnta "OFFS(x)"(%5) ;\n"
#define PF5(x) \
" prefetchnta "OFFS(x)"(%6) ;\n"
#define XO1(x,y) \
" xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) \
" xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) \
" xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) \
" xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) \
" xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
switch(count) {
case 2:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
PF1(i) \
PF1(i+2) \
LD(i+2,2) \
LD(i+3,3) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
break;
case 3:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory" );
break;
case 4:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
PF0(i+4) \
PF0(i+6) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
#include <linux/raid/xor.h>
#include <asm/xor.h>
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
/* The xor routines to use. */
static struct xor_block_template *active_template;
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory" );
break;
case 5:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
PF4(i) \
PF4(i+2) \
PF0(i+4) \
PF0(i+6) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
XO4(i+1,1) \
XO4(i+2,2) \
XO4(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" addl $256, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory");
break;
}
__asm__ __volatile__ (
"sfence ;\n\t"
"movups (%1),%%xmm0 ;\n\t"
"movups 0x10(%1),%%xmm1 ;\n\t"
"movups 0x20(%1),%%xmm2 ;\n\t"
"movups 0x30(%1),%%xmm3 ;\n\t"
"movl %0,%%cr0 ;\n\t"
:
: "r" (cr0), "r" (xmm_save)
: "memory" );
}
#undef OFFS
#undef LD
#undef ST
#undef PF0
#undef PF1
#undef PF2
#undef PF3
#undef PF4
#undef PF5
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef XO5
#undef BLOCK
#endif /* CONFIG_X86_XMM */
/*
* high-speed RAID5 checksumming functions utilizing MMX instructions
* Copyright (C) 1998 Ingo Molnar
*/
XORBLOCK_TEMPLATE(pII_mmx)
void
xor_block(unsigned int count, struct buffer_head **bh_ptr)
{
char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>7);
if (!(current->flags & PF_USEDFPU))
__asm__ __volatile__ ( " clts;\n");
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
#define LD(x,y) \
" movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) \
" movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) \
" pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) \
" pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) \
" pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) \
" pxor 8*("#x")(%5), %%mm"#y" ;\n"
switch(count) {
case 2:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
ST(i,0) \
XO1(i+1,1) \
ST(i+1,1) \
XO1(i+2,2) \
ST(i+2,2) \
XO1(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
unsigned long *p0, *p1, *p2, *p3, *p4;
unsigned long bytes = bh_ptr[0]->b_size;
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory");
break;
case 3:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
ST(i,0) \
XO2(i+1,1) \
ST(i+1,1) \
XO2(i+2,2) \
ST(i+2,2) \
XO2(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory");
break;
case 4:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
ST(i,0) \
XO3(i+1,1) \
ST(i+1,1) \
XO3(i+2,2) \
ST(i+2,2) \
XO3(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory");
break;
case 5:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
ST(i,0) \
XO4(i+1,1) \
ST(i+1,1) \
XO4(i+2,2) \
ST(i+2,2) \
XO4(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" addl $128, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory");
break;
p0 = (unsigned long *) bh_ptr[0]->b_data;
p1 = (unsigned long *) bh_ptr[1]->b_data;
if (count == 2) {
active_template->do_2(bytes, p0, p1);
return;
}
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
if (!(current->flags & PF_USEDFPU))
stts();
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
XORBLOCK_TEMPLATE(p5_mmx)
{
char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>6);
if (!(current->flags & PF_USEDFPU))
__asm__ __volatile__ ( " clts;\n");
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
switch(count) {
case 2:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq 40(%1), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
break;
case 3:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory" );
break;
case 4:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor (%4), %%mm0 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" pxor 16(%4), %%mm2 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 24(%4), %%mm3 ;\n"
" movq %%mm3, 24(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq 48(%1), %%mm6 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory" );
break;
case 5:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor (%4), %%mm0 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor (%5), %%mm0 ;\n"
" pxor 8(%5), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 16(%4), %%mm2 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%5), %%mm2 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%4), %%mm3 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%5), %%mm3 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 32(%5), %%mm4 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" pxor 40(%5), %%mm5 ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%5), %%mm6 ;\n"
" pxor 56(%5), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" addl $64, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory" );
break;
p2 = (unsigned long *) bh_ptr[2]->b_data;
if (count == 3) {
active_template->do_3(bytes, p0, p1, p2);
return;
}
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
if (!(current->flags & PF_USEDFPU))
stts();
}
#endif /* __i386__ */
#endif /* !__sparc_v9__ */
#ifdef __sparc_v9__
/*
* High speed xor_block operation for RAID4/5 utilizing the
* UltraSparc Visual Instruction Set.
*
* Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
* Requirements:
* !(((long)dest | (long)sourceN) & (64 - 1)) &&
* !(len & 127) && len >= 256
*
* It is done in pure assembly, as otherwise gcc makes it
* a non-leaf function, which is not what we want.
* Also, we don't measure the speeds as on other architectures,
* as the measuring routine does not take into account cold caches
* and the fact that xor_block_VIS bypasses the caches.
* xor_block_32regs might be 5% faster for count 2 if caches are hot
* and things are just right (for count 3 VIS is about as fast as 32regs with
* hot caches, and for counts 4 and 5 VIS is always faster by a good margin),
* but I think it is better not to pollute the caches.
* Actually, if I'd just fight for speed for hot caches, I could
* write a hybrid VIS/integer routine, which would do always two
* 64B blocks in VIS and two in IEUs, but I really care more about
* caches.
*/
extern void *VISenter(void);
extern void xor_block_VIS XOR_ARGS;
void __xor_block_VIS(void)
{
__asm__ ("
.globl xor_block_VIS
xor_block_VIS:
ldx [%%o1 + 0], %%o4
ldx [%%o1 + 8], %%o3
ldx [%%o4 + %1], %%g5
ldx [%%o4 + %0], %%o4
ldx [%%o3 + %0], %%o3
rd %%fprs, %%o5
andcc %%o5, %2, %%g0
be,pt %%icc, 297f
sethi %%hi(%5), %%g1
jmpl %%g1 + %%lo(%5), %%g7
add %%g7, 8, %%g7
297: wr %%g0, %4, %%fprs
membar #LoadStore|#StoreLoad|#StoreStore
sub %%g5, 64, %%g5
ldda [%%o4] %3, %%f0
ldda [%%o3] %3, %%f16
cmp %%o0, 4
bgeu,pt %%xcc, 10f
cmp %%o0, 3
be,pn %%xcc, 13f
mov -64, %%g1
sub %%g5, 64, %%g5
rd %%asi, %%g1
wr %%g0, %3, %%asi
2: ldda [%%o4 + 64] %%asi, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
stda %%f16, [%%o4] %3
ldda [%%o3 + 64] %%asi, %%f48
ldda [%%o4 + 128] %%asi, %%f0
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
add %%o4, 128, %%o4
fxor %%f36, %%f52, %%f52
add %%o3, 128, %%o3
fxor %%f38, %%f54, %%f54
subcc %%g5, 128, %%g5
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 - 64] %%asi
bne,pt %%xcc, 2b
ldda [%%o3] %3, %%f16
ldda [%%o4 + 64] %%asi, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
stda %%f16, [%%o4] %3
ldda [%%o3 + 64] %%asi, %%f48
membar #Sync
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 + 64] %%asi
membar #Sync|#StoreStore|#StoreLoad
wr %%g0, 0, %%fprs
retl
wr %%g1, %%g0, %%asi
13: ldx [%%o1 + 16], %%o2
ldx [%%o2 + %0], %%o2
3: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o4] %3, %%f0
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
add %%o2, 64, %%o2
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
subcc %%g5, 64, %%g5
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 3b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
membar #Sync
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
10: cmp %%o0, 5
be,pt %%xcc, 15f
mov -64, %%g1
14: ldx [%%o1 + 16], %%o2
ldx [%%o1 + 24], %%o0
ldx [%%o2 + %0], %%o2
ldx [%%o0 + %0], %%o0
4: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
ldda [%%o0] %3, %%f48
fxor %%f16, %%f32, %%f32
fxor %%f18, %%f34, %%f34
fxor %%f20, %%f36, %%f36
fxor %%f22, %%f38, %%f38
add %%o2, 64, %%o2
fxor %%f24, %%f40, %%f40
fxor %%f26, %%f42, %%f42
fxor %%f28, %%f44, %%f44
fxor %%f30, %%f46, %%f46
ldda [%%o4] %3, %%f0
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
add %%o0, 64, %%o0
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
subcc %%g5, 64, %%g5
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 4b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
ldda [%%o0] %3, %%f48
fxor %%f16, %%f32, %%f32
fxor %%f18, %%f34, %%f34
fxor %%f20, %%f36, %%f36
fxor %%f22, %%f38, %%f38
fxor %%f24, %%f40, %%f40
fxor %%f26, %%f42, %%f42
fxor %%f28, %%f44, %%f44
fxor %%f30, %%f46, %%f46
membar #Sync
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
15: ldx [%%o1 + 16], %%o2
ldx [%%o1 + 24], %%o0
ldx [%%o1 + 32], %%o1
ldx [%%o2 + %0], %%o2
ldx [%%o0 + %0], %%o0
ldx [%%o1 + %0], %%o1
5: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o0] %3, %%f16
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
add %%o2, 64, %%o2
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
ldda [%%o1] %3, %%f32
fxor %%f48, %%f16, %%f48
fxor %%f50, %%f18, %%f50
add %%o0, 64, %%o0
fxor %%f52, %%f20, %%f52
fxor %%f54, %%f22, %%f54
add %%o1, 64, %%o1
fxor %%f56, %%f24, %%f56
fxor %%f58, %%f26, %%f58
fxor %%f60, %%f28, %%f60
fxor %%f62, %%f30, %%f62
ldda [%%o4] %3, %%f0
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
subcc %%g5, 64, %%g5
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 5b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o0] %3, %%f16
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
ldda [%%o1] %3, %%f32
fxor %%f48, %%f16, %%f48
fxor %%f50, %%f18, %%f50
fxor %%f52, %%f20, %%f52
fxor %%f54, %%f22, %%f54
fxor %%f56, %%f24, %%f56
fxor %%f58, %%f26, %%f58
fxor %%f60, %%f28, %%f60
fxor %%f62, %%f30, %%f62
membar #Sync
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
" : :
"i" (&((struct buffer_head *)0)->b_data),
"i" (&((struct buffer_head *)0)->b_size),
"i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
"i" (FPRS_FEF), "i" (VISenter));
}
#endif /* __sparc_v9__ */
#if defined(__sparc__) && !defined(__sparc_v9__)
/*
* High speed xor_block operation for RAID4/5 utilizing the
* ldd/std SPARC instructions.
*
* Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
*/
XORBLOCK_TEMPLATE(SPARC)
{
int size = bh_ptr[0]->b_size;
int lines = size / (sizeof (long)) / 8, i;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1 = (long *) bh_ptr[1]->b_data;
long *source2, *source3, *source4;
switch (count) {
case 2:
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
"o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
}
break;
case 4:
source2 = (long *) bh_ptr[2]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
source3 += 8;
}
break;
case 5:
source2 = (long *) bh_ptr[2]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source4 = (long *) bh_ptr[4]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%4 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%4 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%4 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%4 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
}
break;
p3 = (unsigned long *) bh_ptr[3]->b_data;
if (count == 4) {
active_template->do_4(bytes, p0, p1, p2, p3);
return;
}
}
#endif /* __sparc_v[78]__ */
#ifdef __alpha__
/*
* High speed xor_block operation for RAID4/5 pipelined for Alpha EV5.
* There is a second version using EV6 prefetch instructions.
*
* Copyright (C) 2000 Richard Henderson (rth@redhat.com)
*/
XORBLOCK_TEMPLATE(alpha)
{
long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
long *d = (long *) bh_ptr[0]->b_data;
long *s1 = (long *) bh_ptr[1]->b_data;
long *s2, *s3, *s4;
if (count == 2) goto two_blocks;
s2 = (long *) bh_ptr[2]->b_data;
if (count == 3) goto three_blocks;
s3 = (long *) bh_ptr[3]->b_data;
if (count == 4) goto four_blocks;
s4 = (long *) bh_ptr[4]->b_data;
goto five_blocks;
two_blocks:
asm volatile ("
.align 4
2:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,8(%0)
ldq $3,8(%1)
ldq $4,16(%0)
ldq $5,16(%1)
ldq $6,24(%0)
ldq $7,24(%1)
ldq $16,32(%0)
ldq $17,32(%1)
ldq $18,40(%0)
ldq $19,40(%1)
ldq $20,48(%0)
ldq $21,48(%1)
ldq $22,56(%0)
xor $0,$1,$0 # 7 cycles from $1 load
ldq $23,56(%1)
xor $2,$3,$2
stq $0,0(%0)
xor $4,$5,$4
stq $2,8(%0)
xor $6,$7,$6
stq $4,16(%0)
xor $16,$17,$16
stq $6,24(%0)
xor $18,$19,$18
stq $16,32(%0)
xor $20,$21,$20
stq $18,40(%0)
xor $22,$23,$22
stq $20,48(%0)
subq %2,1,%2
stq $22,56(%0)
addq %0,64,%0
addq %1,64,%1
bgt %2,2b"
: "=r"(d), "=r"(s1), "=r"(lines)
: "0"(d), "1"(s1), "2"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
return;
three_blocks:
asm volatile ("
.align 4
3:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,8(%0)
ldq $4,8(%1)
ldq $6,16(%0)
ldq $7,16(%1)
ldq $17,24(%0)
ldq $18,24(%1)
ldq $20,32(%0)
ldq $21,32(%1)
ldq $5,8(%2)
ldq $16,16(%2)
ldq $19,24(%2)
ldq $22,32(%2)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 6 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $17,$18,$18 # 5 cycles from $18 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $20,$21,$21 # 5 cycles from $21 load
stq $2,0(%0)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8(%0)
xor $7,$16,$16 # 7 cycles from $16 load
stq $16,16(%0)
xor $18,$19,$19 # 7 cycles from $19 load
stq $19,24(%0)
xor $21,$22,$22 # 7 cycles from $22 load
stq $22,32(%0)
nop
ldq $0,40(%0)
ldq $1,40(%1)
ldq $3,48(%0)
ldq $4,48(%1)
ldq $6,56(%0)
ldq $7,56(%1)
ldq $2,40(%2)
ldq $5,48(%2)
ldq $16,56(%2)
xor $0,$1,$1 # 4 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
stq $2,40(%0)
xor $7,$16,$16 # 4 cycles from $16 load
stq $5,48(%0)
subq %3,1,%3
stq $16,56(%0)
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %3,3b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22");
return;
four_blocks:
asm volatile ("
.align 4
4:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,8(%0)
ldq $5,8(%1)
ldq $6,8(%2)
ldq $7,8(%3)
ldq $16,16(%0)
ldq $17,16(%1)
ldq $18,16(%2)
ldq $19,16(%3)
ldq $20,24(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,24(%1)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24(%2)
xor $1,$3,$3
ldq $1,24(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0(%0)
xor $6,$7,$7
xor $16,$17,$17 # 7 cycles from $17 load
xor $5,$7,$7
stq $7,8(%0)
xor $18,$19,$19 # 7 cycles from $19 load
ldq $2,32(%0)
xor $17,$19,$19
ldq $3,32(%1)
ldq $4,32(%2)
ldq $5,32(%3)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $6,40(%0)
ldq $7,40(%1)
ldq $16,40(%2)
ldq $17,40(%3)
stq $19,16(%0)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $21,$1,$1
ldq $18,48(%0)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $19,48(%1)
xor $3,$5,$5
ldq $20,48(%2)
ldq $21,48(%3)
ldq $0,56(%0)
ldq $1,56(%1)
ldq $2,56(%2)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56(%3)
xor $16,$17,$17 # 8 cycles from $17 load
xor $7,$17,$17
xor $18,$19,$19 # 5 cycles from $19 load
xor $20,$21,$21 # 5 cycles from $21 load
xor $19,$21,$21
stq $1,24(%0)
xor $0,$1,$1 # 5 cycles from $1 load
stq $5,32(%0)
xor $2,$3,$3 # 4 cycles from $3 load
stq $17,40(%0)
xor $1,$3,$3
stq $21,48(%0)
subq %4,1,%4
stq $3,56(%0)
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %4,4b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
five_blocks:
asm volatile ("
ldq %0,0(%6)
ldq %1,8(%6)
ldq %2,16(%6)
ldq %3,24(%6)
ldq %4,32(%6)
ldq %0,%7(%0)
ldq %1,%7(%1)
ldq %2,%7(%2)
ldq %3,%7(%3)
ldq %4,%7(%4)
.align 4
5:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,0(%4)
ldq $5,8(%0)
ldq $6,8(%1)
ldq $7,8(%2)
ldq $16,8(%3)
ldq $17,8(%4)
ldq $18,16(%0)
ldq $19,16(%1)
ldq $20,16(%2)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,16(%3)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16(%4)
xor $1,$3,$3
ldq $1,24(%0)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0(%0)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$16,$16 # 7 cycles from $16 load
xor $6,$17,$17 # 7 cycles from $17 load
ldq $2,24(%1)
xor $16,$17,$17
ldq $3,24(%2)
xor $18,$19,$19 # 8 cycles from $19 load
stq $17,8(%0)
xor $19,$20,$20 # 8 cycles from $20 load
ldq $4,24(%3)
xor $21,$0,$0 # 7 cycles from $0 load
ldq $5,24(%4)
xor $20,$0,$0
ldq $6,32(%0)
ldq $7,32(%1)
stq $0,16(%0)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $16,32(%2)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $17,32(%3)
xor $2,$4,$4
ldq $18,32(%4)
ldq $19,40(%0)
ldq $20,40(%1)
ldq $21,40(%2)
ldq $0,40(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24(%0)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40(%4)
ldq $2,48(%0)
ldq $3,48(%1)
xor $7,$16,$16 # 7 cycles from $16 load
ldq $4,48(%2)
xor $17,$18,$18 # 6 cycles from $18 load
ldq $5,48(%3)
xor $16,$18,$18
ldq $6,48(%4)
xor $19,$20,$20 # 7 cycles from $20 load
stq $18,32(%0)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $7,56(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $16,56(%1)
ldq $17,56(%2)
ldq $18,56(%3)
ldq $19,56(%4)
xor $21,$1,$1
xor $2,$3,$3 # 9 cycles from $3 load
xor $3,$4,$4 # 9 cycles from $4 load
xor $5,$6,$6 # 8 cycles from $6 load
unop
xor $4,$6,$6
xor $7,$16,$16 # 7 cycles from $16 load
xor $17,$18,$18 # 6 cycles from $18 load
stq $6,48(%0)
xor $16,$18,$18
subq %5,1,%5
xor $18,$19,$19 # 8 cycles from $19 load
stq $19,56(%0)
addq %4,64,%4
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %5,5b"
: "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
/* ARG! We've run out of asm arguments! We've got to reload
all those pointers we just loaded. */
: "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
p4 = (unsigned long *) bh_ptr[4]->b_data;
active_template->do_5(bytes, p0, p1, p2, p3, p4);
}
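/* The prefetch macro below issues a load whose destination is $31, the
   Alpha zero register; the loaded value is discarded, so the instruction
   acts purely as a cache-prefetch hint for the line at ofs(base). */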
#define prefetch(base, ofs) \
asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs))
XORBLOCK_TEMPLATE(alpha_prefetch)
{
long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
long *d = (long *) bh_ptr[0]->b_data;
long *s1 = (long *) bh_ptr[1]->b_data;
long *s2, *s3, *s4;
long p;
p = count == 2;
prefetch(d, 0);
prefetch(s1, 0);
prefetch(d, 64);
prefetch(s1, 64);
prefetch(d, 128);
prefetch(s1, 128);
prefetch(d, 192);
prefetch(s1, 192);
if (p) goto two_blocks;
s2 = (long *) bh_ptr[2]->b_data;
p = count == 3;
prefetch(s2, 0);
prefetch(s2, 64);
prefetch(s2, 128);
prefetch(s2, 192);
if (p) goto three_blocks;
s3 = (long *) bh_ptr[3]->b_data;
p = count == 4;
prefetch(s3, 0);
prefetch(s3, 64);
prefetch(s3, 128);
prefetch(s3, 192);
if (p) goto four_blocks;
s4 = (long *) bh_ptr[4]->b_data;
prefetch(s4, 0);
prefetch(s4, 64);
prefetch(s4, 128);
prefetch(s4, 192);
goto five_blocks;
two_blocks:
asm volatile ("
.align 4
2:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,8(%0)
ldq $3,8(%1)
ldq $4,16(%0)
ldq $5,16(%1)
ldq $6,24(%0)
ldq $7,24(%1)
ldq $16,32(%0)
ldq $17,32(%1)
ldq $18,40(%0)
ldq $19,40(%1)
ldq $20,48(%0)
ldq $21,48(%1)
ldq $22,56(%0)
ldq $23,56(%1)
ldq $31,256(%0)
xor $0,$1,$0 # 8 cycles from $1 load
ldq $31,256(%1)
xor $2,$3,$2
stq $0,0(%0)
xor $4,$5,$4
stq $2,8(%0)
xor $6,$7,$6
stq $4,16(%0)
xor $16,$17,$16
stq $6,24(%0)
xor $18,$19,$18
stq $16,32(%0)
xor $20,$21,$20
stq $18,40(%0)
xor $22,$23,$22
stq $20,48(%0)
subq %2,1,%2
stq $22,56(%0)
addq %0,64,%0
addq %1,64,%1
bgt %2,2b"
: "=r"(d), "=r"(s1), "=r"(lines)
: "0"(d), "1"(s1), "2"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
return;
three_blocks:
asm volatile ("
.align 4
3:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,8(%0)
ldq $4,8(%1)
ldq $6,16(%0)
ldq $7,16(%1)
ldq $17,24(%0)
ldq $18,24(%1)
ldq $20,32(%0)
ldq $21,32(%1)
ldq $5,8(%2)
ldq $16,16(%2)
ldq $19,24(%2)
ldq $22,32(%2)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 7 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $17,$18,$18 # 5 cycles from $18 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $20,$21,$21 # 5 cycles from $21 load
stq $2,0(%0)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8(%0)
xor $7,$16,$16 # 7 cycles from $16 load
stq $16,16(%0)
xor $18,$19,$19 # 7 cycles from $19 load
stq $19,24(%0)
xor $21,$22,$22 # 7 cycles from $22 load
stq $22,32(%0)
nop
ldq $0,40(%0)
ldq $1,40(%1)
ldq $3,48(%0)
ldq $4,48(%1)
ldq $6,56(%0)
ldq $7,56(%1)
ldq $2,40(%2)
ldq $5,48(%2)
ldq $16,56(%2)
ldq $31,256(%0)
ldq $31,256(%1)
ldq $31,256(%2)
xor $0,$1,$1 # 6 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
xor $7,$16,$16 # 4 cycles from $16 load
stq $2,40(%0)
subq %3,1,%3
stq $5,48(%0)
addq %2,64,%2
stq $16,56(%0)
addq %1,64,%1
addq %0,64,%0
bgt %3,3b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22");
return;
four_blocks:
asm volatile ("
.align 4
4:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,8(%0)
ldq $5,8(%1)
ldq $6,8(%2)
ldq $7,8(%3)
ldq $16,16(%0)
ldq $17,16(%1)
ldq $18,16(%2)
ldq $19,16(%3)
ldq $20,24(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,24(%1)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24(%2)
xor $1,$3,$3
ldq $1,24(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0(%0)
xor $6,$7,$7
xor $16,$17,$17 # 7 cycles from $17 load
xor $5,$7,$7
stq $7,8(%0)
xor $18,$19,$19 # 7 cycles from $19 load
ldq $2,32(%0)
xor $17,$19,$19
ldq $3,32(%1)
ldq $4,32(%2)
ldq $5,32(%3)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $6,40(%0)
ldq $7,40(%1)
ldq $16,40(%2)
ldq $17,40(%3)
stq $19,16(%0)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $21,$1,$1
ldq $18,48(%0)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $19,48(%1)
xor $3,$5,$5
ldq $20,48(%2)
ldq $21,48(%3)
ldq $0,56(%0)
ldq $1,56(%1)
ldq $2,56(%2)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56(%3)
xor $16,$17,$17 # 8 cycles from $17 load
ldq $31,256(%0)
xor $7,$17,$17
ldq $31,256(%1)
xor $18,$19,$19 # 6 cycles from $19 load
/* Set of all registered templates. */
static struct xor_block_template *template_list;
ldq $31,256(%2)
xor $20,$21,$21 # 6 cycles from $21 load
ldq $31,256(%3)
xor $19,$21,$21
/* The -6*32 shift factor colors the cache. */
#define BENCH_SIZE (PAGE_SIZE-6*32)
stq $1,24(%0)
xor $0,$1,$1 # 7 cycles from $1 load
stq $5,32(%0)
xor $2,$3,$3 # 6 cycles from $3 load
stq $17,40(%0)
xor $1,$3,$3
stq $21,48(%0)
subq %4,1,%4
stq $3,56(%0)
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %4,4b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
five_blocks:
asm volatile ("
ldq %0,0(%6)
ldq %1,8(%6)
ldq %2,16(%6)
ldq %3,24(%6)
ldq %4,32(%6)
ldq %0,%7(%0)
ldq %1,%7(%1)
ldq %2,%7(%2)
ldq %3,%7(%3)
ldq %4,%7(%4)
.align 4
5:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,0(%4)
ldq $5,8(%0)
ldq $6,8(%1)
ldq $7,8(%2)
ldq $16,8(%3)
ldq $17,8(%4)
ldq $18,16(%0)
ldq $19,16(%1)
ldq $20,16(%2)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,16(%3)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16(%4)
xor $1,$3,$3
ldq $1,24(%0)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0(%0)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$16,$16 # 7 cycles from $16 load
xor $6,$17,$17 # 7 cycles from $17 load
ldq $2,24(%1)
xor $16,$17,$17
ldq $3,24(%2)
xor $18,$19,$19 # 8 cycles from $19 load
stq $17,8(%0)
xor $19,$20,$20 # 8 cycles from $20 load
ldq $4,24(%3)
xor $21,$0,$0 # 7 cycles from $0 load
ldq $5,24(%4)
xor $20,$0,$0
ldq $6,32(%0)
ldq $7,32(%1)
stq $0,16(%0)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $16,32(%2)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $17,32(%3)
xor $2,$4,$4
ldq $18,32(%4)
ldq $19,40(%0)
ldq $20,40(%1)
ldq $21,40(%2)
ldq $0,40(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24(%0)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40(%4)
ldq $2,48(%0)
ldq $3,48(%1)
xor $7,$16,$16 # 7 cycles from $16 load
ldq $4,48(%2)
xor $17,$18,$18 # 6 cycles from $18 load
ldq $5,48(%3)
xor $16,$18,$18
ldq $6,48(%4)
xor $19,$20,$20 # 7 cycles from $20 load
stq $18,32(%0)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $7,56(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $16,56(%1)
ldq $17,56(%2)
ldq $18,56(%3)
ldq $19,56(%4)
ldq $31,256(%0)
xor $21,$1,$1
ldq $31,256(%1)
xor $2,$3,$3 # 9 cycles from $3 load
ldq $31,256(%2)
xor $3,$4,$4 # 9 cycles from $4 load
ldq $31,256(%3)
xor $5,$6,$6 # 8 cycles from $6 load
ldq $31,256(%4)
xor $4,$6,$6
xor $7,$16,$16 # 7 cycles from $16 load
xor $17,$18,$18 # 6 cycles from $18 load
stq $6,48(%0)
xor $16,$18,$18
subq %5,1,%5
xor $18,$19,$19 # 8 cycles from $19 load
stq $19,56(%0)
addq %4,64,%4
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %5,5b"
: "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
/* ARG! We've run out of asm arguments! We've got to reload
all those pointers we just loaded. */
: "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
}
#undef prefetch
#endif /* __alpha__ */
#ifndef __sparc_v9__
/*
* this one works reasonably on any x86 CPU
* (send me an assembly version for inclusion if you can make it faster)
*
* this one is just as fast as written in pure assembly on x86.
* the reason for this separate version is that the
* fast open-coded xor routine "32regs" produces suboptimal code
* on x86, due to lack of registers.
*/
XORBLOCK_TEMPLATE(8regs)
{
int len = bh_ptr[0]->b_size;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1, *source2, *source3, *source4;
long lines = len / (sizeof (long)) / 8, i;
switch(count) {
case 2:
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 7) ^= *(source1 + 7);
source1 += 8;
destp += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
source1 += 8;
source2 += 8;
destp += 8;
}
break;
case 4:
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 0) ^= *(source3 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 1) ^= *(source3 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 2) ^= *(source3 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 3) ^= *(source3 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 4) ^= *(source3 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 5) ^= *(source3 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 6) ^= *(source3 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
*(destp + 7) ^= *(source3 + 7);
source1 += 8;
source2 += 8;
source3 += 8;
destp += 8;
}
break;
case 5:
source4 = (long *) bh_ptr[4]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 0) ^= *(source3 + 0);
*(destp + 0) ^= *(source4 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 1) ^= *(source3 + 1);
*(destp + 1) ^= *(source4 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 2) ^= *(source3 + 2);
*(destp + 2) ^= *(source4 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 3) ^= *(source3 + 3);
*(destp + 3) ^= *(source4 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 4) ^= *(source3 + 4);
*(destp + 4) ^= *(source4 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 5) ^= *(source3 + 5);
*(destp + 5) ^= *(source4 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 6) ^= *(source3 + 6);
*(destp + 6) ^= *(source4 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
*(destp + 7) ^= *(source3 + 7);
*(destp + 7) ^= *(source4 + 7);
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
destp += 8;
}
break;
}
}
/*
* Platform-independent RAID5 checksum calculation; this should be
* very fast on any platform that has a decent number of registers
* (32 or more).
*/
XORBLOCK_TEMPLATE(32regs)
{
int size = bh_ptr[0]->b_size;
int lines = size / (sizeof (long)) / 8, i;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1, *source2, *source3, *source4;
/* LOTS of registers available...
We do explicit loop-unrolling here for code which
favours RISC machines. In fact this is almost direct
RISC assembly on Alpha and SPARC :-) */
switch(count) {
case 2:
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
destp += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
destp += 8;
}
break;
case 4:
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
d0 ^= source3[0];
d1 ^= source3[1];
d2 ^= source3[2];
d3 ^= source3[3];
d4 ^= source3[4];
d5 ^= source3[5];
d6 ^= source3[6];
d7 ^= source3[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
source3 += 8;
destp += 8;
}
break;
case 5:
source4 = (long *) bh_ptr[4]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
d0 ^= source3[0];
d1 ^= source3[1];
d2 ^= source3[2];
d3 ^= source3[3];
d4 ^= source3[4];
d5 ^= source3[5];
d6 ^= source3[6];
d7 ^= source3[7];
d0 ^= source4[0];
d1 ^= source4[1];
d2 ^= source4[2];
d3 ^= source4[3];
d4 ^= source4[4];
d5 ^= source4[5];
d6 ^= source4[6];
d7 ^= source4[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
destp += 8;
}
break;
}
}
/*
* (the -6*32 shift factor colors the cache)
*/
#define SIZE (PAGE_SIZE-6*32)
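A rough sketch of the layout this produces (illustrative arithmetic only, not
from the patch, assuming 4 KB pages; Alpha's 8 KB pages just change the numbers):
	SIZE = PAGE_SIZE - 6*32 = 4096 - 192 = 3904
	b1   = base                        ...  base + 3904
	b2   = base + 2*PAGE_SIZE + SIZE = base + 12096  ...  base + 16000
Both buffers fit the order-2 (four page) allocation, and because their distance
is deliberately not a multiple of the page or cache size, they presumably avoid
mapping onto the same lines of a small direct-mapped L1 during the benchmark.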
static void xor_speed ( struct xor_block_template * func,
struct buffer_head *b1, struct buffer_head *b2)
static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
int speed;
unsigned long now;
int i, count, max;
struct buffer_head *bh_ptr[6];
func->next = xor_functions;
xor_functions = func;
bh_ptr[0] = b1;
bh_ptr[1] = b2;
tmpl->next = template_list;
template_list = tmpl;
/*
* count the number of XORs done during a whole jiffy.
* calculate the speed of checksumming from this.
* (we use a 2-page allocation to have guaranteed
* color L1-cache layout)
* Count the number of XORs done during a whole jiffy, and use
* this to calculate the speed of checksumming. We use a 2-page
* allocation to have guaranteed color L1-cache layout.
*/
max = 0;
for (i = 0; i < 5; i++) {
......@@ -2600,7 +82,7 @@ static void xor_speed ( struct xor_block_template * func,
count = 0;
while (jiffies == now) {
mb();
func->xor_block(2,bh_ptr);
tmpl->do_2(BENCH_SIZE, b1, b2);
mb();
count++;
mb();
......@@ -2609,120 +91,53 @@ static void xor_speed ( struct xor_block_template * func,
max = count;
}
speed = max * (HZ*SIZE/1024);
func->speed = speed;
speed = max * (HZ * BENCH_SIZE / 1024);
tmpl->speed = speed;
printk( " %-10s: %5d.%03d MB/sec\n", func->name,
speed / 1000, speed % 1000);
printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
speed / 1000, speed % 1000);
}
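A quick check of the units in the formula above, with hypothetical numbers
(these are not measurements): max counts complete XOR passes over BENCH_SIZE
bytes within one jiffy, so max * HZ * BENCH_SIZE is bytes per second and the
division by 1024 turns that into KB/s, which the printk then splits into MB
and thousandths of a MB.
	/* Hypothetical: HZ = 100, BENCH_SIZE = 8000 bytes, max = 200 passes. */
	speed = 200 * (100 * 8000 / 1024);	/* = 200 * 781 = 156200 KB/s */
	/* printed as "156.200 MB/sec" by the format string above */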
static inline void pick_fastest_function(void)
static int
calibrate_xor_block(void)
{
void *b1, *b2;
struct xor_block_template *f, *fastest;
fastest = xor_functions;
for (f = fastest; f; f = f->next) {
if (f->speed > fastest->speed)
fastest = f;
}
#ifdef CONFIG_X86_XMM
if (cpu_has_xmm) {
/* We force the use of the KNI xor block because it
can write around L2. We may also be able
to load into L1 only, depending on how
the CPU deals with a load to a line that is
being prefetched.
*/
fastest = &t_xor_block_pIII_kni;
b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
if (! b1) {
printk("raid5: Yikes! No memory available.\n");
return -ENOMEM;
}
#endif
#ifdef __alpha__
if (implver() == IMPLVER_EV6) {
/* Force the use of alpha_prefetch if EV6, as it
is significantly faster in the cold cache case. */
fastest = &t_xor_block_alpha_prefetch;
}
#endif
xor_block = fastest->xor_block;
printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
fastest->speed / 1000, fastest->speed % 1000);
}
static struct buffer_head b1, b2;
void calibrate_xor_block(void)
{
if (xor_block)
return;
memset(&b1,0,sizeof(b1));
b2 = b1;
b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
if (!b1.b_data) {
pick_fastest_function();
return;
}
b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
b1.b_size = SIZE;
b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
printk(KERN_INFO "raid5: measuring checksumming speed\n");
sti();
sti(); /* should be safe */
#define xor_speed(templ) do_xor_speed((templ), b1, b2)
#if defined(__sparc__) && !defined(__sparc_v9__)
printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
xor_speed(&t_xor_block_SPARC,&b1,&b2);
#endif
XOR_TRY_TEMPLATES;
#ifdef CONFIG_X86_XMM
if (cpu_has_xmm) {
printk(KERN_INFO
"raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
}
#endif /* CONFIG_X86_XMM */
#undef xor_speed
#ifdef __i386__
if (md_cpu_has_mmx()) {
printk(KERN_INFO
"raid5: MMX detected, trying high-speed MMX checksum routines\n");
xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
}
#endif /* __i386__ */
free_pages((unsigned long)b1, 2);
#ifdef __alpha__
xor_speed(&t_xor_block_alpha,&b1,&b2);
xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2);
#endif
xor_speed(&t_xor_block_8regs,&b1,&b2);
xor_speed(&t_xor_block_32regs,&b1,&b2);
fastest = template_list;
for (f = fastest; f; f = f->next)
if (f->speed > fastest->speed)
fastest = f;
free_pages((unsigned long)b1.b_data,2);
pick_fastest_function();
}
#ifdef XOR_SELECT_TEMPLATE
fastest = XOR_SELECT_TEMPLATE(fastest);
#endif
#else /* __sparc_v9__ */
active_template = fastest;
printk("raid5: using function: %s (%d.%03d MB/sec)\n",
fastest->name, fastest->speed / 1000, fastest->speed % 1000);
void calibrate_xor_block(void)
{
if (xor_block)
return;
printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
xor_block = xor_block_VIS;
return 0;
}
#endif /* __sparc_v9__ */
MD_EXPORT_SYMBOL(xor_block);
MD_EXPORT_SYMBOL(calibrate_xor_block);
#ifdef MODULE
int init_module(void)
{
calibrate_xor_block();
return 0;
}
#endif
module_init(calibrate_xor_block);
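For reference, on Alpha the XOR_TRY_TEMPLATES invocation above (the macro is
defined in include/asm-alpha/xor.h further down in this patch) combines with
the local xor_speed() define to expand roughly into:
	do_xor_speed(&xor_block_8regs, b1, b2);
	do_xor_speed(&xor_block_32regs, b1, b2);
	do_xor_speed(&xor_block_alpha, b1, b2);
	do_xor_speed(&xor_block_alpha_prefetch, b1, b2);
after which the template with the highest measured speed, or the
XOR_SELECT_TEMPLATE override, becomes active_template.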
......@@ -1129,7 +1129,7 @@ static int nsc_ircc_hard_xmit_fir(struct sk_buff *skb, struct net_device *dev)
if ((speed = irda_get_speed(skb)) != self->io.speed) {
/* Check for empty frame */
if (!skb->len) {
nsc_ircc_change_speed_complete(self, speed);
nsc_ircc_change_speed(self, speed);
return 0;
} else
self->new_speed = speed;
......
......@@ -207,8 +207,10 @@ int __init a2091_detect(Scsi_Host_Template *tpnt)
continue;
instance = scsi_register (tpnt, sizeof (struct WD33C93_hostdata));
if(instance == NULL)
continue;
if (instance == NULL) {
release_mem_region(address, 256);
continue;
}
instance->base = ZTWO_VADDR(address);
instance->irq = IRQ_AMIGA_PORTS;
instance->unique_id = z->slotaddr;
......
......@@ -66,8 +66,8 @@ static __inline__ long atomic_add_return(int i, atomic_t * v)
long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
" addl %0,%3,%2\n"
" addl %0,%3,%0\n"
" mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
......@@ -84,8 +84,8 @@ static __inline__ long atomic_sub_return(int i, atomic_t * v)
long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
" subl %0,%3,%2\n"
" subl %0,%3,%0\n"
" mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
......
......@@ -72,4 +72,13 @@
__asm__("stw %1,%0" : "=m"(mem) : "r"(val))
#endif
/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
a mechanism by which the user can annotate likely branch directions and
expect the blocks to be reordered appropriately. Define __builtin_expect
to nothing for earlier compilers. */
#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
#define __builtin_expect(x, expected_value) (x)
#endif
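/* Usage sketch (illustrative, mirroring the new semaphore inlines later in
   this patch): callers annotate the slow path as unlikely, e.g.

	if (__builtin_expect(count < 0, 0))
		__down_failed(sem);

   and on GCC older than 2.96 the define above simply evaluates the
   condition unchanged. */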
#endif /* __ALPHA_COMPILER_H */
#ifndef _ALPHA_SEMAPHORE_HELPER_H
#define _ALPHA_SEMAPHORE_HELPER_H
/*
* SMP- and interrupt-safe semaphores helper functions.
*
* (C) Copyright 1996 Linus Torvalds
* (C) Copyright 1999 Richard Henderson
*/
/*
* These two _must_ execute atomically wrt each other.
*
* This is trivially done with load_locked/store_cond,
* which we have. Let the rest of the losers suck eggs.
*/
static inline void
wake_one_more(struct semaphore * sem)
{
atomic_inc(&sem->waking);
}
static inline int
waking_non_zero(struct semaphore *sem)
{
long ret, tmp;
/* An atomic conditional decrement. */
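/* "Equivalent" C for the sequence below (an added sketch, in the style of the
   comments elsewhere in this file; ldl_l/stl_c stand in for the load-locked/
   store-conditional pair, and a negative value means no wakeup is pending):

	do {
		tmp = ldl_l(sem->waking);
		ret = 0;
		if (tmp < 0)
			break;		// nothing pending; caller goes to sleep
		ret = tmp - 1;		// consume one wakeup
		ret = stl_c = ret;	// 1 if the store succeeded, 0 to retry
	} while (ret == 0);
	return ret > 0;
*/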
__asm__ __volatile__(
"1: ldl_l %1,%2\n"
" blt %1,2f\n"
" subl %1,1,%0\n"
" stl_c %0,%2\n"
" beq %0,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=r"(ret), "=r"(tmp), "=m"(sem->waking.counter)
: "0"(0));
return ret > 0;
}
/*
* waking_non_zero_interruptible:
* 1 got the lock
* 0 go to sleep
* -EINTR interrupted
*
* We must undo the sem->count down_interruptible decrement
* simultaneously and atomically with the sem->waking adjustment,
* otherwise we can race with wake_one_more.
*
* This is accomplished by doing a 64-bit ll/sc on the 2 32-bit words.
*/
static inline int
waking_non_zero_interruptible(struct semaphore *sem, struct task_struct *tsk)
{
long ret, tmp, tmp2, tmp3;
/* "Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
ret = 0;
if (tmp >= 0) {
tmp += 0xffffffff00000000;
ret = 1;
}
else if (pending) {
// Since -1 + 1 carries into the high word, we have
// to be more careful adding 1 here.
tmp = (tmp & 0xffffffff00000000)
| ((tmp + 1) & 0x00000000ffffffff);
ret = -EINTR;
}
else {
break; // ideally; we don't actually break,
// since this is a predicate we don't
// have, and it's more trouble to build
// than to elide the no-op stq_c.
}
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %0,0\n"
" cmovne %5,%6,%0\n"
" addq %1,1,%2\n"
" and %1,%7,%3\n"
" andnot %2,%7,%2\n"
" cmovge %1,1,%0\n"
" or %3,%2,%2\n"
" addq %1,%7,%3\n"
" cmovne %5,%2,%1\n"
" cmovge %2,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3), "=m"(*sem)
: "r"(signal_pending(tsk)), "r"(-EINTR),
"r"(0xffffffff00000000));
return ret;
}
/*
* waking_non_zero_trylock is unused. we do everything in
* down_trylock and let non-ll/sc hosts bounce around.
*/
static inline int
waking_non_zero_trylock(struct semaphore *sem)
{
return 0;
}
#endif
......@@ -12,10 +12,14 @@
#include <asm/system.h>
#include <asm/atomic.h>
#define DEBUG_SEMAPHORE 0
#define DEBUG_RW_SEMAPHORE 0
struct semaphore {
/* Careful, inline assembly knows about the position of these two. */
atomic_t count;
atomic_t count __attribute__((aligned(8)));
atomic_t waking; /* biased by -1 */
wait_queue_head_t wait;
#if WAITQUEUE_DEBUG
long __magic;
......@@ -42,7 +46,7 @@ struct semaphore {
#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
extern inline void sema_init(struct semaphore *sem, int val)
static inline void sema_init(struct semaphore *sem, int val)
{
/*
* Logically,
......@@ -68,103 +72,33 @@ static inline void init_MUTEX_LOCKED (struct semaphore *sem)
sema_init(sem, 0);
}
extern void __down(struct semaphore * sem);
extern int __down_interruptible(struct semaphore * sem);
extern int __down_trylock(struct semaphore * sem);
extern void __up(struct semaphore * sem);
/* All have custom assembly linkages. */
extern void __down_failed(struct semaphore * sem);
extern void __down_failed_interruptible(struct semaphore * sem);
extern void __down_failed_trylock(struct semaphore * sem);
extern void __up_wakeup(struct semaphore * sem);
extern void down(struct semaphore *);
extern void __down_failed(struct semaphore *);
extern int down_interruptible(struct semaphore *);
extern int __down_failed_interruptible(struct semaphore *);
extern int down_trylock(struct semaphore *);
extern void up(struct semaphore *);
extern void __up_wakeup(struct semaphore *);
/*
* Whee. Hidden out of line code is fun. The contention cases are
* handled out of line in kernel/sched.c; arch/alpha/lib/semaphore.S
* takes care of making sure we can call it without clobbering regs.
* Hidden out of line code is fun, but extremely messy. Rely on newer
* compilers to do a respectable job with this. The contention cases
* are handled out of line in arch/alpha/kernel/semaphore.c.
*/
extern inline void down(struct semaphore * sem)
static inline void __down(struct semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_failed anyway, reuse them in
the atomic operation as well.
__down_failed takes the semaphore address in $24, and
its return address in $28. The pv is loaded as usual.
The gp is clobbered (in the module case) as usual. */
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_failed;
__asm__ __volatile__ (
"/* semaphore down operation */\n"
"1: ldl_l $24,%1\n"
" subl $24,1,$28\n"
" subl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $24,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$28", "memory");
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
__down_failed(sem);
}
extern inline int down_interruptible(struct semaphore * sem)
static inline int __down_interruptible(struct semaphore *sem)
{
/* __down_failed_interruptible takes the semaphore address in $24,
and its return address in $28. The pv is loaded as usual. The return
The gp is clobbered (in the module case) as usual. The return
value is in $24. */
register int ret __asm__("$24");
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_failed_interruptible;
__asm__ __volatile__ (
"/* semaphore down interruptible operation */\n"
"1: ldl_l $24,%2\n"
" subl $24,1,$28\n"
" subl $24,1,$24\n"
" stl_c $28,%2\n"
" beq $28,2f\n"
" blt $24,3f\n"
" mov $31,%0\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%2\n"
" jsr $28,($27),__down_failed_interruptible\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(ret), "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$28", "memory");
return ret;
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
return __down_failed_interruptible(sem);
return 0;
}
/*
......@@ -174,7 +108,7 @@ extern inline int down_interruptible(struct semaphore * sem)
* Do this by using ll/sc on the pair of 32-bit words.
*/
extern inline int down_trylock(struct semaphore * sem)
static inline int __down_trylock(struct semaphore * sem)
{
long ret, tmp, tmp2, sub;
......@@ -182,25 +116,21 @@ extern inline int down_trylock(struct semaphore * sem)
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
sub = 0x0000000100000000;
ret = ((int)tmp <= 0); // count <= 0 ?
if ((int)tmp >= 0) sub = 0; // count >= 0 ?
// note that if count=0 subq overflows to the high
// longword (i.e. waking)
ret &= ((long)tmp < 0); // waking < 0 ?
sub += 1;
if (ret)
break;
tmp -= sub;
tmp = stq_c = tmp;
tmp = ldq_l;
sub = 0x0000000100000000;
ret = ((int)tmp <= 0); // count <= 0 ?
// Note that if count=0, the decrement overflows into
// waking, so cancel the 1 loaded above. Also cancel
// it if the lock was already free.
if ((int)tmp >= 0) sub = 0; // count >= 0 ?
ret &= ((long)tmp < 0); // waking < 0 ?
sub += 1;
if (ret) break;
tmp -= sub;
tmp = stq_c = tmp;
} while (tmp == 0);
*/
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %3,1\n"
......@@ -215,7 +145,7 @@ extern inline int down_trylock(struct semaphore * sem)
" subq %1,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
"2: mb\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
......@@ -226,45 +156,70 @@ extern inline int down_trylock(struct semaphore * sem)
return ret;
}
extern inline void up(struct semaphore * sem)
static inline void __up(struct semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __up_wakeup anyway, reuse them in
the atomic operation as well.
long ret, tmp, tmp2, tmp3;
__up_wakeup takes the semaphore address in $24, and
its return address in $28. The pv is loaded as usual.
The gp is clobbered (in the module case) as usual. */
/* We must manipulate count and waking simultaneously and atomically.
Otherwise we have races between up and __down_failed_interruptible
waking up on a signal.
register void *pv __asm__("$27");
"Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __up_wakeup;
__asm__ __volatile__ (
"/* semaphore up operation */\n"
do {
tmp = ldq_l;
ret = (int)tmp + 1; // count += 1;
tmp2 = tmp & 0xffffffff00000000; // extract waking
if (ret <= 0) // still sleepers?
tmp2 += 0x0000000100000000; // waking += 1;
tmp = ret & 0x00000000ffffffff; // insert count
tmp |= tmp2; // insert waking;
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
" mb\n"
"1: ldl_l $24,%1\n"
" addl $24,1,$28\n"
" addl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" ble $24,3f\n"
"4:\n"
"1: ldq_l %1,%4\n"
" addl %1,1,%0\n"
" zapnot %1,0xf0,%2\n"
" addq %2,%5,%3\n"
" cmovle %0,%3,%2\n"
" zapnot %0,0x0f,%1\n"
" bis %1,%2,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__up_wakeup\n"
" ldgp $29,0($28)\n"
" br 4b\n"
"3: br 1b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$28", "memory");
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
: "m"(*sem), "r"(0x0000000100000000)
: "memory");
if (__builtin_expect(ret <= 0, 0))
__up_wakeup(sem);
}
#if !WAITQUEUE_DEBUG && !DEBUG_SEMAPHORE
extern inline void down(struct semaphore *sem)
{
__down(sem);
}
extern inline int down_interruptible(struct semaphore *sem)
{
return __down_interruptible(sem);
}
extern inline int down_trylock(struct semaphore *sem)
{
return __down_trylock(sem);
}
extern inline void up(struct semaphore *sem)
{
__up(sem);
}
#endif
/* rw mutexes (should that be mutices? =) -- throw rw
* spinlocks and semaphores together, and this is what we
......@@ -297,7 +252,7 @@ extern inline void up(struct semaphore * sem)
#define RW_LOCK_BIAS 0x01000000
struct rw_semaphore {
int count;
atomic_t count;
/* bit 0 means read bias granted;
bit 1 means write bias granted. */
unsigned granted;
......@@ -317,7 +272,7 @@ struct rw_semaphore {
#endif
#define __RWSEM_INITIALIZER(name,count) \
{ (count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
{ ATOMIC_INIT(count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
__WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait) \
__SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
......@@ -331,9 +286,9 @@ struct rw_semaphore {
#define DECLARE_RWSEM_WRITE_LOCKED(name) \
__DECLARE_RWSEM_GENERIC(name, 0)
extern inline void init_rwsem(struct rw_semaphore *sem)
static inline void init_rwsem(struct rw_semaphore *sem)
{
sem->count = RW_LOCK_BIAS;
atomic_set (&sem->count, RW_LOCK_BIAS);
sem->granted = 0;
init_waitqueue_head(&sem->wait);
init_waitqueue_head(&sem->write_bias_wait);
......@@ -344,213 +299,73 @@ extern inline void init_rwsem(struct rw_semaphore *sem)
#endif
}
/* All have custom assembly linkages. */
extern void __down_read_failed(struct rw_semaphore *sem);
extern void __down_write_failed(struct rw_semaphore *sem);
extern void __rwsem_wake(struct rw_semaphore *sem, unsigned long readers);
extern void down_read(struct rw_semaphore *);
extern void down_write(struct rw_semaphore *);
extern void up_read(struct rw_semaphore *);
extern void up_write(struct rw_semaphore *);
extern void __down_read_failed(struct rw_semaphore *, int);
extern void __down_write_failed(struct rw_semaphore *, int);
extern void __rwsem_wake(struct rw_semaphore *, int);
extern inline void down_read(struct rw_semaphore *sem)
static inline void __down_read(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_read_failed anyway, reuse them in
the atomic operation as well.
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
__down_read_failed(sem, count);
}
__down_read_failed takes the semaphore address in $24, the count
we read in $25, and its return address in $28. The pv is loaded
as usual. The gp is clobbered (in the module case) as usual. */
static inline void __down_write(struct rw_semaphore *sem)
{
long count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
if (__builtin_expect(count != 0, 0))
__down_write_failed(sem, count);
}
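To make the bias arithmetic concrete (an illustrative trace, not part of the
patch): a free rwsem holds count == RW_LOCK_BIAS; each reader subtracts 1, so
the count stays positive while only readers hold it; a writer subtracts the
whole bias, so it sees exactly 0 when the lock was free and a negative value
(passed on to __down_write_failed) otherwise.
	count = RW_LOCK_BIAS;		/* 0x01000000: free		*/
	count -= 1;			/* 0x00ffffff: one reader	*/
	count -= RW_LOCK_BIAS;		/* negative: writer must wait	*/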
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
/* When a reader does a release, the only significant case is when there
was a writer waiting and we've bumped the count to 0; then we must
wake the writer up. */
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
static inline void __up_read(struct rw_semaphore *sem)
{
long count;
mb();
count = atomic_inc_return(&sem->count);
if (__builtin_expect(count == 0, 0))
__rwsem_wake(sem, 0);
}
pv = __down_read_failed;
__asm__ __volatile__(
"/* semaphore down_read operation */\n"
"1: ldl_l $24,%1\n"
" subl $24,1,$28\n"
" subl $24,1,$25\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $25,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_read_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$25", "$28", "memory");
/* Releasing the writer is easy -- just release it and wake up
any sleepers. */
#if WAITQUEUE_DEBUG
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_inc(&sem->readers);
#endif
static inline void __up_write(struct rw_semaphore *sem)
{
long count, wake;
mb();
count = atomic_add_return(RW_LOCK_BIAS, &sem->count);
/* Only do the wake if we were, but are no longer, negative. */
wake = ((int)(count - RW_LOCK_BIAS) < 0) && count >= 0;
if (__builtin_expect(wake, 0))
__rwsem_wake(sem, count);
}
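/* Illustrative trace of the wake test above (not from the patch): a writer
   holds the lock, so count == 0; a reader then tries, decrements count to -1
   and sleeps in __down_read_failed.  On up_write the add yields
   count == RW_LOCK_BIAS - 1: the old value (count - RW_LOCK_BIAS) was
   negative and the new value is not, so __rwsem_wake gets called. */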
#if !WAITQUEUE_DEBUG && !DEBUG_RW_SEMAPHORE
extern inline void down_read(struct rw_semaphore *sem)
{
__down_read(sem);
}
extern inline void down_write(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_write_failed anyway, reuse them in
the atomic operation as well.
__down_write_failed takes the semaphore address in $24, the count
we read in $25, and its return address in $28. The pv is loaded
as usual. The gp is clobbered (in the module case) as usual. */
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_write_failed;
__asm__ __volatile__(
"/* semaphore down_write operation */\n"
"1: ldl_l $24,%1\n"
" ldah $28,%3($24)\n"
" ldah $25,%3($24)\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" bne $25,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_write_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv), "i"(-(RW_LOCK_BIAS >> 16))
: "$24", "$25", "$28", "memory");
#if WAITQUEUE_DEBUG
if (atomic_read(&sem->writers))
BUG();
if (atomic_read(&sem->readers))
BUG();
if (sem->granted & 3)
BUG();
atomic_inc(&sem->writers);
#endif
__down_write(sem);
}
/* When a reader does a release, the only significant case is when
there was a writer waiting, and we've bumped the count to 0: we must
wake the writer up. */
extern inline void up_read(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __rwsem_wake anyway, reuse them in
the atomic operation as well.
__rwsem_wake takes the semaphore address in $24, the
number of waiting readers in $25, and its return address
in $28. The pv is loaded as usual. The gp is clobbered
(in the module case) as usual. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_dec(&sem->readers);
#endif
pv = __rwsem_wake;
__asm__ __volatile__(
"/* semaphore up_read operation */\n"
" mb\n"
"1: ldl_l $24,%1\n"
" addl $24,1,$28\n"
" addl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" beq $24,3f\n"
"4:\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" mov 0,$25\n"
" jsr $28,($27),__rwsem_wake\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$25", "$28", "memory");
__up_read(sem);
}
/* releasing the writer is easy -- just release it and
* wake up any sleepers.
*/
extern inline void up_write(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __rwsem_wake anyway, reuse them in
the atomic operation as well.
__rwsem_wake takes the semaphore address in $24, the
number of waiting readers in $25, and its return address
in $28. The pv is loaded as usual. The gp is clobbered
(in the module case) as usual. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->readers))
BUG();
if (atomic_read(&sem->writers) != 1)
BUG();
atomic_dec(&sem->writers);
#endif
pv = __rwsem_wake;
__asm__ __volatile__(
"/* semaphore up_write operation */\n"
" mb\n"
"1: ldl_l $24,%1\n"
" ldah $28,%3($24)\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $24,3f\n"
"4:\n"
".subsection 2\n"
"2: br 1b\n"
"3: ldah $25,%3($24)\n"
/* Only do the wake if we're no longer negative. */
" blt $25,4b\n"
" lda $24,%1\n"
" jsr $28,($27),__rwsem_wake\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv), "i"(RW_LOCK_BIAS >> 16)
: "$24", "$25", "$28", "memory");
__up_write(sem);
}
#endif
#endif
......@@ -80,7 +80,7 @@ static inline void spin_lock(spinlock_t * lock)
" blbs %0,2b\n"
" br 1b\n"
".previous"
: "=r" (tmp), "=m" (lock->lock)
: "=&r" (tmp), "=m" (lock->lock)
: "m"(lock->lock) : "memory");
}
......
/*
* include/asm-alpha/xor.h
*
* Optimized RAID-5 checksumming functions for alpha EV5 and EV6
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
asm("
.text
.align 3
.ent xor_alpha_2
xor_alpha_2:
.prologue 0
srl $16, 6, $16
.align 4
2:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,8($17)
ldq $3,8($18)
ldq $4,16($17)
ldq $5,16($18)
ldq $6,24($17)
ldq $7,24($18)
ldq $19,32($17)
ldq $20,32($18)
ldq $21,40($17)
ldq $22,40($18)
ldq $23,48($17)
ldq $24,48($18)
ldq $25,56($17)
xor $0,$1,$0 # 7 cycles from $1 load
ldq $27,56($18)
xor $2,$3,$2
stq $0,0($17)
xor $4,$5,$4
stq $2,8($17)
xor $6,$7,$6
stq $4,16($17)
xor $19,$20,$19
stq $6,24($17)
xor $21,$22,$21
stq $19,32($17)
xor $23,$24,$23
stq $21,40($17)
xor $25,$27,$25
stq $23,48($17)
subq $16,1,$16
stq $25,56($17)
addq $17,64,$17
addq $18,64,$18
bgt $16,2b
ret
.end xor_alpha_2
.align 3
.ent xor_alpha_3
xor_alpha_3:
.prologue 0
srl $16, 6, $16
.align 4
3:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,8($17)
ldq $4,8($18)
ldq $6,16($17)
ldq $7,16($18)
ldq $21,24($17)
ldq $22,24($18)
ldq $24,32($17)
ldq $25,32($18)
ldq $5,8($19)
ldq $20,16($19)
ldq $23,24($19)
ldq $27,32($19)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 6 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $21,$22,$22 # 5 cycles from $22 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $24,$25,$25 # 5 cycles from $25 load
stq $2,0($17)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8($17)
xor $7,$20,$20 # 7 cycles from $20 load
stq $20,16($17)
xor $22,$23,$23 # 7 cycles from $23 load
stq $23,24($17)
xor $25,$27,$27 # 7 cycles from $27 load
stq $27,32($17)
nop
ldq $0,40($17)
ldq $1,40($18)
ldq $3,48($17)
ldq $4,48($18)
ldq $6,56($17)
ldq $7,56($18)
ldq $2,40($19)
ldq $5,48($19)
ldq $20,56($19)
xor $0,$1,$1 # 4 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
stq $2,40($17)
xor $7,$20,$20 # 4 cycles from $20 load
stq $5,48($17)
subq $16,1,$16
stq $20,56($17)
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,3b
ret
.end xor_alpha_3
.align 3
.ent xor_alpha_4
xor_alpha_4:
.prologue 0
srl $16, 6, $16
.align 4
4:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,8($17)
ldq $5,8($18)
ldq $6,8($19)
ldq $7,8($20)
ldq $21,16($17)
ldq $22,16($18)
ldq $23,16($19)
ldq $24,16($20)
ldq $25,24($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $27,24($18)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24($19)
xor $1,$3,$3
ldq $1,24($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0($17)
xor $6,$7,$7
xor $21,$22,$22 # 7 cycles from $22 load
xor $5,$7,$7
stq $7,8($17)
xor $23,$24,$24 # 7 cycles from $24 load
ldq $2,32($17)
xor $22,$24,$24
ldq $3,32($18)
ldq $4,32($19)
ldq $5,32($20)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $6,40($17)
ldq $7,40($18)
ldq $21,40($19)
ldq $22,40($20)
stq $24,16($17)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $27,$1,$1
stq $1,24($17)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $23,48($17)
ldq $24,48($18)
ldq $25,48($19)
xor $3,$5,$5
ldq $27,48($20)
ldq $0,56($17)
ldq $1,56($18)
ldq $2,56($19)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56($20)
stq $5,32($17)
xor $21,$22,$22 # 8 cycles from $22 load
xor $7,$22,$22
xor $23,$24,$24 # 5 cycles from $24 load
stq $22,40($17)
xor $25,$27,$27 # 5 cycles from $27 load
xor $24,$27,$27
xor $0,$1,$1 # 5 cycles from $1 load
stq $27,48($17)
xor $2,$3,$3 # 4 cycles from $3 load
xor $1,$3,$3
subq $16,1,$16
stq $3,56($17)
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,4b
ret
.end xor_alpha_4
.align 3
.ent xor_alpha_5
xor_alpha_5:
.prologue 0
srl $16, 6, $16
.align 4
5:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,0($21)
ldq $5,8($17)
ldq $6,8($18)
ldq $7,8($19)
ldq $22,8($20)
ldq $23,8($21)
ldq $24,16($17)
ldq $25,16($18)
ldq $27,16($19)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $28,16($20)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16($21)
xor $1,$3,$3
ldq $1,24($17)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0($17)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$22,$22 # 7 cycles from $22 load
xor $6,$23,$23 # 7 cycles from $23 load
ldq $2,24($18)
xor $22,$23,$23
ldq $3,24($19)
xor $24,$25,$25 # 8 cycles from $25 load
stq $23,8($17)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $4,24($20)
xor $28,$0,$0 # 7 cycles from $0 load
ldq $5,24($21)
xor $27,$0,$0
ldq $6,32($17)
ldq $7,32($18)
stq $0,16($17)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $22,32($19)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $23,32($20)
xor $2,$4,$4
ldq $24,32($21)
ldq $25,40($17)
ldq $27,40($18)
ldq $28,40($19)
ldq $0,40($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24($17)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40($21)
ldq $2,48($17)
ldq $3,48($18)
xor $7,$22,$22 # 7 cycles from $22 load
ldq $4,48($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $5,48($20)
xor $22,$24,$24
ldq $6,48($21)
xor $25,$27,$27 # 7 cycles from $27 load
stq $24,32($17)
xor $27,$28,$28 # 8 cycles from $28 load
ldq $7,56($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $22,56($18)
ldq $23,56($19)
ldq $24,56($20)
ldq $25,56($21)
xor $28,$1,$1
xor $2,$3,$3 # 9 cycles from $3 load
xor $3,$4,$4 # 9 cycles from $4 load
xor $5,$6,$6 # 8 cycles from $6 load
stq $1,40($17)
xor $4,$6,$6
xor $7,$22,$22 # 7 cycles from $22 load
xor $23,$24,$24 # 6 cycles from $24 load
stq $6,48($17)
xor $22,$24,$24
subq $16,1,$16
xor $24,$25,$25 # 8 cycles from $25 load
stq $25,56($17)
addq $21,64,$21
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,5b
ret
.end xor_alpha_5
.align 3
.ent xor_alpha_prefetch_2
xor_alpha_prefetch_2:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 192($17)
ldq $31, 192($18)
.align 4
2:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,8($17)
ldq $3,8($18)
ldq $4,16($17)
ldq $5,16($18)
ldq $6,24($17)
ldq $7,24($18)
ldq $19,32($17)
ldq $20,32($18)
ldq $21,40($17)
ldq $22,40($18)
ldq $23,48($17)
ldq $24,48($18)
ldq $25,56($17)
ldq $27,56($18)
ldq $31,256($17)
xor $0,$1,$0 # 8 cycles from $1 load
ldq $31,256($18)
xor $2,$3,$2
stq $0,0($17)
xor $4,$5,$4
stq $2,8($17)
xor $6,$7,$6
stq $4,16($17)
xor $19,$20,$19
stq $6,24($17)
xor $21,$22,$21
stq $19,32($17)
xor $23,$24,$23
stq $21,40($17)
xor $25,$27,$25
stq $23,48($17)
subq $16,1,$16
stq $25,56($17)
addq $17,64,$17
addq $18,64,$18
bgt $16,2b
ret
.end xor_alpha_prefetch_2
.align 3
.ent xor_alpha_prefetch_3
xor_alpha_prefetch_3:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
.align 4
3:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,8($17)
ldq $4,8($18)
ldq $6,16($17)
ldq $7,16($18)
ldq $21,24($17)
ldq $22,24($18)
ldq $24,32($17)
ldq $25,32($18)
ldq $5,8($19)
ldq $20,16($19)
ldq $23,24($19)
ldq $27,32($19)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 7 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $21,$22,$22 # 5 cycles from $22 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $24,$25,$25 # 5 cycles from $25 load
stq $2,0($17)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8($17)
xor $7,$20,$20 # 7 cycles from $20 load
stq $20,16($17)
xor $22,$23,$23 # 7 cycles from $23 load
stq $23,24($17)
xor $25,$27,$27 # 7 cycles from $27 load
stq $27,32($17)
nop
ldq $0,40($17)
ldq $1,40($18)
ldq $3,48($17)
ldq $4,48($18)
ldq $6,56($17)
ldq $7,56($18)
ldq $2,40($19)
ldq $5,48($19)
ldq $20,56($19)
ldq $31,256($17)
ldq $31,256($18)
ldq $31,256($19)
xor $0,$1,$1 # 6 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
xor $7,$20,$20 # 4 cycles from $20 load
stq $2,40($17)
subq $16,1,$16
stq $5,48($17)
addq $19,64,$19
stq $20,56($17)
addq $18,64,$18
addq $17,64,$17
bgt $16,3b
ret
.end xor_alpha_prefetch_3
.align 3
.ent xor_alpha_prefetch_4
xor_alpha_prefetch_4:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 0($20)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 64($20)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 128($20)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
ldq $31, 192($20)
.align 4
4:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,8($17)
ldq $5,8($18)
ldq $6,8($19)
ldq $7,8($20)
ldq $21,16($17)
ldq $22,16($18)
ldq $23,16($19)
ldq $24,16($20)
ldq $25,24($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $27,24($18)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24($19)
xor $1,$3,$3
ldq $1,24($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0($17)
xor $6,$7,$7
xor $21,$22,$22 # 7 cycles from $22 load
xor $5,$7,$7
stq $7,8($17)
xor $23,$24,$24 # 7 cycles from $24 load
ldq $2,32($17)
xor $22,$24,$24
ldq $3,32($18)
ldq $4,32($19)
ldq $5,32($20)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $6,40($17)
ldq $7,40($18)
ldq $21,40($19)
ldq $22,40($20)
stq $24,16($17)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $27,$1,$1
stq $1,24($17)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $23,48($17)
xor $3,$5,$5
ldq $24,48($18)
ldq $25,48($19)
ldq $27,48($20)
ldq $0,56($17)
ldq $1,56($18)
ldq $2,56($19)
ldq $3,56($20)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $31,256($17)
xor $21,$22,$22 # 8 cycles from $22 load
ldq $31,256($18)
xor $7,$22,$22
ldq $31,256($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $31,256($20)
xor $25,$27,$27 # 6 cycles from $27 load
stq $5,32($17)
xor $24,$27,$27
xor $0,$1,$1 # 7 cycles from $1 load
xor $2,$3,$3 # 6 cycles from $3 load
stq $22,40($17)
xor $1,$3,$3
stq $27,48($17)
subq $16,1,$16
stq $3,56($17)
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,4b
ret
.end xor_alpha_prefetch_4
.align 3
.ent xor_alpha_prefetch_5
xor_alpha_prefetch_5:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 0($20)
ldq $31, 0($21)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 64($20)
ldq $31, 64($21)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 128($20)
ldq $31, 128($21)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
ldq $31, 192($20)
ldq $31, 192($21)
.align 4
5:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,0($21)
ldq $5,8($17)
ldq $6,8($18)
ldq $7,8($19)
ldq $22,8($20)
ldq $23,8($21)
ldq $24,16($17)
ldq $25,16($18)
ldq $27,16($19)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $28,16($20)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16($21)
xor $1,$3,$3
ldq $1,24($17)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0($17)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$22,$22 # 7 cycles from $22 load
xor $6,$23,$23 # 7 cycles from $23 load
ldq $2,24($18)
xor $22,$23,$23
ldq $3,24($19)
xor $24,$25,$25 # 8 cycles from $25 load
stq $23,8($17)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $4,24($20)
xor $28,$0,$0 # 7 cycles from $0 load
ldq $5,24($21)
xor $27,$0,$0
ldq $6,32($17)
ldq $7,32($18)
stq $0,16($17)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $22,32($19)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $23,32($20)
xor $2,$4,$4
ldq $24,32($21)
ldq $25,40($17)
ldq $27,40($18)
ldq $28,40($19)
ldq $0,40($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24($17)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40($21)
ldq $2,48($17)
ldq $3,48($18)
xor $7,$22,$22 # 7 cycles from $22 load
ldq $4,48($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $5,48($20)
xor $22,$24,$24
ldq $6,48($21)
xor $25,$27,$27 # 7 cycles from $27 load
stq $24,32($17)
xor $27,$28,$28 # 8 cycles from $28 load
ldq $7,56($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $22,56($18)
ldq $23,56($19)
ldq $24,56($20)
ldq $25,56($21)
ldq $31,256($17)
xor $28,$1,$1
ldq $31,256($18)
xor $2,$3,$3 # 9 cycles from $3 load
ldq $31,256($19)
xor $3,$4,$4 # 9 cycles from $4 load
ldq $31,256($20)
xor $5,$6,$6 # 8 cycles from $6 load
stq $1,40($17)
xor $4,$6,$6
xor $7,$22,$22 # 7 cycles from $22 load
xor $23,$24,$24 # 6 cycles from $24 load
stq $6,48($17)
xor $22,$24,$24
ldq $31,256($21)
xor $24,$25,$25 # 8 cycles from $25 load
stq $25,56($17)
subq $16,1,$16
addq $21,64,$21
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,5b
ret
.end xor_alpha_prefetch_5
");
static struct xor_block_template xor_block_alpha = {
name: "alpha",
do_2: xor_alpha_2,
do_3: xor_alpha_3,
do_4: xor_alpha_4,
do_5: xor_alpha_5,
};
static struct xor_block_template xor_block_alpha_prefetch = {
name: "alpha prefetch",
do_2: xor_alpha_prefetch_2,
do_3: xor_alpha_prefetch_3,
do_4: xor_alpha_prefetch_4,
do_5: xor_alpha_prefetch_5,
};
/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_alpha); \
xor_speed(&xor_block_alpha_prefetch); \
} while (0)
/* Force the use of alpha_prefetch if EV6, as it is significantly
faster in the cold cache case. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
(implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
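At run time the RAID5 code then calls through whichever template won the
calibration (or the EV6 override above) via its do_N members, just as the
dispatch code earlier in this patch does; a minimal sketch of such a call:
	/* p0 ^= p1 over `bytes' bytes */
	active_template->do_2(bytes, p0, p1);
	/* p0 ^= p1 ^ p2 ^ p3 ^ p4 */
	active_template->do_5(bytes, p0, p1, p2, p3, p4);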
#include <asm-generic/xor.h>
/*
* include/asm-generic/xor.h
*
* Generic optimized RAID-5 checksumming functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
static void
xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0];
p1[1] ^= p2[1];
p1[2] ^= p2[2];
p1[3] ^= p2[3];
p1[4] ^= p2[4];
p1[5] ^= p2[5];
p1[6] ^= p2[6];
p1[7] ^= p2[7];
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
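/*
 * A hedged usage sketch: `bytes' is assumed to be a non-zero multiple of
 * 8 * sizeof(long), and the result is accumulated into p1 in place.  The
 * XOR_SKETCH_SELFTEST guard is an assumption so this snippet stays out of
 * any real build.
 */
#ifdef XOR_SKETCH_SELFTEST
#include <assert.h>

int main(void)
{
	unsigned long a[16], b[16];
	unsigned long i;

	for (i = 0; i < 16; i++) {
		a[i] = i;
		b[i] = ~i;
	}
	xor_8regs_2(sizeof(a), a, b);	/* 16 longs -> two inner iterations */
	for (i = 0; i < 16; i++)
		assert(a[i] == ~0UL);	/* i ^ ~i sets every bit */
	return 0;
}
#endif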
static void
xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0];
p1[1] ^= p2[1] ^ p3[1];
p1[2] ^= p2[2] ^ p3[2];
p1[3] ^= p2[3] ^ p3[3];
p1[4] ^= p2[4] ^ p3[4];
p1[5] ^= p2[5] ^ p3[5];
p1[6] ^= p2[6] ^ p3[6];
p1[7] ^= p2[7] ^ p3[7];
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static void
xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
p1[0] = d0; /* Store the result (in bursts) */

p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
static void
xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
d0 ^= p5[0];
d1 ^= p5[1];
d2 ^= p5[2];
d3 ^= p5[3];
d4 ^= p5[4];
d5 ^= p5[5];
d6 ^= p5[6];
d7 ^= p5[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static struct xor_block_template xor_block_8regs = {
name: "8regs",
do_2: xor_8regs_2,
do_3: xor_8regs_3,
do_4: xor_8regs_4,
do_5: xor_8regs_5,
};
static struct xor_block_template xor_block_32regs = {
name: "32regs",
do_2: xor_32regs_2,
do_3: xor_32regs_3,
do_4: xor_32regs_4,
do_5: xor_32regs_5,
};
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
} while (0)
/*
* include/asm-i386/xor.h
*
* Optimized RAID-5 checksumming functions for MMX and SSE.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* High-speed RAID5 checksumming functions utilizing MMX instructions.
* Copyright (C) 1998 Ingo Molnar.
*/
#define FPU_SAVE \
do { \
if (!(current->flags & PF_USEDFPU)) \
__asm__ __volatile__ (" clts;\n"); \
__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
} while (0)
#define FPU_RESTORE \
do { \
__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
if (!(current->flags & PF_USEDFPU)) \
stts(); \
} while (0)
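/*
 * Why the save/restore dance around these routines (an explanatory aside,
 * not part of the patch): the MMX registers alias the x87 FPU state, which
 * in the kernel normally belongs to user space.  FPU_SAVE unconditionally
 * spills that state into the local fpu_save buffer with fsave, and
 * FPU_RESTORE puts it back with frstor; when the current task was not
 * using the FPU (no PF_USEDFPU), the TS bit in cr0 is additionally cleared
 * with clts first so the MMX instructions do not fault, and set again with
 * stts on the way out so a later user FPU access still traps and reloads
 * cleanly.
 */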
#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
ST(i,0) \
XO1(i+1,1) \
ST(i+1,1) \
XO1(i+2,2) \
ST(i+2,2) \
XO1(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
ST(i,0) \
XO2(i+1,1) \
ST(i+1,1) \
XO2(i+2,2) \
ST(i+2,2) \
XO2(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
ST(i,0) \
XO3(i+1,1) \
ST(i+1,1) \
XO3(i+2,2) \
ST(i+2,2) \
XO3(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
ST(i,0) \
XO4(i+1,1) \
ST(i+1,1) \
XO4(i+2,2) \
ST(i+2,2) \
XO4(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" addl $128, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
FPU_RESTORE;
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq 40(%1), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
FPU_RESTORE;
}
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3)
: "memory" );
FPU_RESTORE;
}
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor (%4), %%mm0 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" pxor 16(%4), %%mm2 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 24(%4), %%mm3 ;\n"
" movq %%mm3, 24(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq 48(%1), %%mm6 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory");
FPU_RESTORE;
}
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor (%4), %%mm0 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor (%5), %%mm0 ;\n"
" pxor 8(%5), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 16(%4), %%mm2 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%5), %%mm2 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%4), %%mm3 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%5), %%mm3 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 32(%5), %%mm4 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" pxor 40(%5), %%mm5 ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%5), %%mm6 ;\n"
" pxor 56(%5), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" addl $64, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
FPU_RESTORE;
}
static struct xor_block_template xor_block_pII_mmx = {
name: "pII_mmx",
do_2: xor_pII_mmx_2,
do_3: xor_pII_mmx_3,
do_4: xor_pII_mmx_4,
do_5: xor_pII_mmx_5,
};
static struct xor_block_template xor_block_p5_mmx = {
name: "p5_mmx",
do_2: xor_p5_mmx_2,
do_3: xor_p5_mmx_3,
do_4: xor_p5_mmx_4,
do_5: xor_p5_mmx_5,
};
#undef FPU_SAVE
#undef FPU_RESTORE
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
#define XMMS_SAVE \
__asm__ __volatile__ ( \
"movl %%cr0,%0 ;\n\t" \
"clts ;\n\t" \
"movups %%xmm0,(%1) ;\n\t" \
"movups %%xmm1,0x10(%1) ;\n\t" \
"movups %%xmm2,0x20(%1) ;\n\t" \
"movups %%xmm3,0x30(%1) ;\n\t" \
: "=r" (cr0) \
: "r" (xmm_save) \
: "memory")
#define XMMS_RESTORE \
__asm__ __volatile__ ( \
"sfence ;\n\t" \
"movups (%1),%%xmm0 ;\n\t" \
"movups 0x10(%1),%%xmm1 ;\n\t" \
"movups 0x20(%1),%%xmm2 ;\n\t" \
"movups 0x30(%1),%%xmm3 ;\n\t" \
"movl %0,%%cr0 ;\n\t" \
: \
: "r" (cr0), "r" (xmm_save) \
: "memory")
#define OFFS(x) "16*("#x")"
#define PF0(x) " prefetcht0 "OFFS(x)"(%1) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
PF1(i) \
PF1(i+2) \
LD(i+2,2) \
LD(i+3,3) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
XMMS_RESTORE;
}
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r"(p2), "r"(p3)
: "memory" );
XMMS_RESTORE;
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
PF0(i+4) \
PF0(i+6) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory" );
XMMS_RESTORE;
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
PF4(i) \
PF4(i+2) \
PF0(i+4) \
PF0(i+6) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
XO4(i+1,1) \
XO4(i+2,2) \
XO4(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" addl $256, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
XMMS_RESTORE;
}
static struct xor_block_template xor_block_pIII_sse = {
name: "pIII_sse",
do_2: xor_sse_2,
do_3: xor_sse_3,
do_4: xor_sse_4,
do_5: xor_sse_5,
};
/* Also try the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \
if (md_cpu_has_mmx()) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
} \
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 cache only, depending on how the
   CPU deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
/*
* include/asm-ia64/xor.h
*
* Optimized RAID-5 checksumming functions for IA-64.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
asm ("
.text
// Assume L2 memory latency of 6 cycles.
.proc xor_ia64_2
xor_ia64_2:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 3, 0, 13, 16
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov ar.lc = in0
mov pr.rot = 1 << 16
;;
}
.rotr s1[6+1], s2[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mfb
(p[6+1]) st8.nta [r8] = d[1], 8
nop.f 0
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_2
.proc xor_ia64_3
xor_ia64_3:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 4, 0, 20, 24
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
;;
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], s3[6]
}
{ .bbb
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_3
.proc xor_ia64_4
xor_ia64_4:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 5, 0, 27, 32
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
}
{ .mfb
mov r19 = in4
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r20 = s3[6], s4[6]
;;
}
{ .mib
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r20
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_4
.proc xor_ia64_5
xor_ia64_5:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 6, 0, 34, 40
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
}
{ .mib
mov r19 = in4
mov r20 = in5
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r21 = s3[6], s4[6]
;;
}
{ .mmi
(p[0]) ld8.nta s5[0] = [r20], 8
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r21
;;
}
{ .mfb
(p[6]) xor d[0] = d[0], s5[6]
nop.f 0
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_5
");
static struct xor_block_template xor_block_ia64 = {
name: "ia64",
do_2: xor_ia64_2,
do_3: xor_ia64_3,
do_4: xor_ia64_4,
do_5: xor_ia64_5,
};
#define XOR_TRY_TEMPLATES xor_speed(&xor_block_ia64)
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
/*
* include/asm-sparc/xor.h
*
* Optimized RAID-5 checksumming functions for 32-bit Sparc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* High speed xor_block operation for RAID4/5 utilizing the
* ldd/std SPARC instructions.
*
* Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
*/
static void
sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
static void
sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%4 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%4 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%4 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%4 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static struct xor_block_template xor_block_SPARC = {
name: "SPARC",
do_2: sparc_2,
do_3: sparc_3,
do_4: sparc_4,
do_5: sparc_5,
};
/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_SPARC); \
} while (0)
/*
* include/asm-sparc64/xor.h
*
* High speed xor_block operation for RAID4/5 utilizing the
* UltraSparc Visual Instruction Set.
*
* Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Requirements:
* !(((long)dest | (long)sourceN) & (64 - 1)) &&
* !(len & 127) && len >= 256
*
* It is done in pure assembly, as otherwise gcc makes it a non-leaf
* function, which is not what we want.
*/
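/*
 * A small sketch of the requirement above as a caller-side check (an
 * illustration and an assumption -- the md layer is expected to satisfy
 * these constraints by construction rather than test them at run time).
 * Call it once per source buffer.
 */
static inline int xor_vis_args_ok(unsigned long len,
				  const void *dest, const void *src)
{
	return !(((unsigned long) dest | (unsigned long) src) & (64 - 1))
		&& !(len & 127) && len >= 256;
}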
#include <asm/pstate.h>
#include <asm/asi.h>
extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
#define _S(x) __S(x)
#define __S(x) #x
#define DEF(x) __asm__(#x " = " _S(x))
DEF(FPRS_FEF);
DEF(FPRS_DU);
DEF(ASI_BLK_P);
/* ??? We set and use %asi instead of using ASI_BLK_P directly because gas
currently does not accept symbolic constants for the ASI specifier. */
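/*
 * The two macro levels above matter (an explanatory aside, not part of the
 * patch): with a single level the argument would be stringized before
 * being expanded, so the assembler would see the macro's name instead of
 * its value:
 *
 *	#define ONE(x)	#x	ONE(FPRS_FEF) -> "FPRS_FEF"
 *	#define TWO(x)	ONE(x)	TWO(FPRS_FEF) -> the numeric value, quoted
 *
 * DEF(FPRS_FEF) therefore emits an assembler line of the form
 * "FPRS_FEF = <value>", which is what lets the asm below use the names
 * symbolically.
 */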
__asm__ ("
.text
.globl xor_vis_2
.type xor_vis_2,@function
xor_vis_2:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 128, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
2: ldda [%o1 + 64] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
stda %f16, [%o1] %asi
ldda [%o2 + 64] %asi, %f48
ldda [%o1 + 128] %asi, %f0
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
add %o1, 128, %o1
fxor %f36, %f52, %f52
add %o2, 128, %o2
fxor %f38, %f54, %f54
subcc %o0, 128, %o0
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 2b
ldda [%o2] %asi, %f16
ldda [%o1 + 64] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
stda %f16, [%o1] %asi
ldda [%o2 + 64] %asi, %f48
membar #Sync
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 + 64] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_2, .-xor_vis_2
.globl xor_vis_3
.type xor_vis_3,@function
xor_vis_3:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
3: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
add %o1, 64, %o1
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
add %o2, 64, %o2
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o1] %asi, %f0
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
add %o3, 64, %o3
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
subcc %o0, 64, %o0
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 3b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
membar #Sync
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_3, .-xor_vis_3
.globl xor_vis_4
.type xor_vis_4,@function
xor_vis_4:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
4: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
add %o1, 64, %o1
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
add %o2, 64, %o2
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
ldda [%o4] %asi, %f48
fxor %f16, %f32, %f32
fxor %f18, %f34, %f34
fxor %f20, %f36, %f36
fxor %f22, %f38, %f38
add %o3, 64, %o3
fxor %f24, %f40, %f40
fxor %f26, %f42, %f42
fxor %f28, %f44, %f44
fxor %f30, %f46, %f46
ldda [%o1] %asi, %f0
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
add %o4, 64, %o4
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
subcc %o0, 64, %o0
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 4b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
ldda [%o4] %asi, %f48
fxor %f16, %f32, %f32
fxor %f18, %f34, %f34
fxor %f20, %f36, %f36
fxor %f22, %f38, %f38
fxor %f24, %f40, %f40
fxor %f26, %f42, %f42
fxor %f28, %f44, %f44
fxor %f30, %f46, %f46
membar #Sync
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_4, .-xor_vis_4
.globl xor_vis_5
.type xor_vis_5,@function
xor_vis_5:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
5: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
add %o1, 64, %o1
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
add %o2, 64, %o2
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
add %o3, 64, %o3
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
ldda [%o5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
add %o4, 64, %o4
fxor %f52, %f20, %f52
fxor %f54, %f22, %f54
add %o5, 64, %o5
fxor %f56, %f24, %f56
fxor %f58, %f26, %f58
fxor %f60, %f28, %f60
fxor %f62, %f30, %f62
ldda [%o1] %asi, %f0
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
subcc %o0, 64, %o0
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 5b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
ldda [%o5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
fxor %f52, %f20, %f52
fxor %f54, %f22, %f54
fxor %f56, %f24, %f56
fxor %f58, %f26, %f58
fxor %f60, %f28, %f60
fxor %f62, %f30, %f62
membar #Sync
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_5, .-xor_vis_5
");
static struct xor_block_template xor_block_VIS = {
name: "VIS",
do_2: xor_vis_2,
do_3: xor_vis_3,
do_4: xor_vis_4,
do_5: xor_vis_5,
};
#define XOR_TRY_TEMPLATES xor_speed(&xor_block_VIS)
......@@ -73,7 +73,7 @@ extern struct kernel_param __setup_start, __setup_end;
* Mark functions and data as being only used at initialization
* or exit time.
*/
#define __init __attribute__ ((__section__ (".text.init")))
#define __init /* __attribute__ ((__section__ (".text.init"))) */
#define __exit __attribute__ ((unused, __section__(".text.exit")))
#define __initdata __attribute__ ((__section__ (".data.init")))
#define __exitdata __attribute__ ((unused, __section__ (".data.exit")))
......
......@@ -3,10 +3,21 @@
#include <linux/raid/md.h>
#define MAX_XOR_BLOCKS 4
#define MAX_XOR_BLOCKS 5
extern void calibrate_xor_block(void);
extern void (*xor_block)(unsigned int count,
struct buffer_head **bh_ptr);
extern void xor_block(unsigned int count, struct buffer_head **bh_ptr);
struct xor_block_template {
struct xor_block_template *next;
const char *name;
int speed;
void (*do_2)(unsigned long, unsigned long *, unsigned long *);
void (*do_3)(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
void (*do_4)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
void (*do_5)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
};
#endif
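/*
 * A minimal, user-space sketch of how the do_2..do_5 hooks above are
 * presumably dispatched on `count' by the new xor_block() entry point
 * (buffer 0 doubling as the destination).  The template instance, the hook
 * bodies and the flat `bufs' argument are assumptions for illustration,
 * not part of this header or of the drivers/md code.
 */
#include <stdio.h>

struct xor_tmpl_sketch {
	void (*do_2)(unsigned long, unsigned long *, unsigned long *);
	void (*do_3)(unsigned long, unsigned long *, unsigned long *,
		     unsigned long *);
};

static void sketch_do_2(unsigned long bytes, unsigned long *p1,
			unsigned long *p2)
{
	for (; bytes >= sizeof(long); bytes -= sizeof(long))
		*p1++ ^= *p2++;
}

static void sketch_do_3(unsigned long bytes, unsigned long *p1,
			unsigned long *p2, unsigned long *p3)
{
	for (; bytes >= sizeof(long); bytes -= sizeof(long))
		*p1++ ^= *p2++ ^ *p3++;
}

static struct xor_tmpl_sketch active_sketch = { sketch_do_2, sketch_do_3 };

static void xor_block_sketch(unsigned int count, unsigned long bytes,
			     unsigned long **bufs)
{
	switch (count) {
	case 2:
		active_sketch.do_2(bytes, bufs[0], bufs[1]);
		break;
	case 3:
		active_sketch.do_3(bytes, bufs[0], bufs[1], bufs[2]);
		break;
	default:	/* counts 4 and 5 would use do_4/do_5 the same way */
		break;
	}
}

int main(void)
{
	unsigned long a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
	unsigned long *bufs[2] = { a, b };

	xor_block_sketch(2, sizeof(a), bufs);
	printf("%lu %lu %lu %lu\n", a[0], a[1], a[2], a[3]);	/* 5 1 1 5 */
	return 0;
}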
......@@ -486,10 +486,6 @@ EXPORT_SYMBOL(remove_inode_hash);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
EXPORT_SYMBOL(__down);
EXPORT_SYMBOL(__down_interruptible);
EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);
EXPORT_SYMBOL(brw_page);
#ifdef CONFIG_UID16
......
......@@ -432,16 +432,28 @@ static inline void __schedule_tail(struct task_struct *prev)
#ifdef CONFIG_SMP
int policy;
/*
* prev->policy can be written from here only before `prev'
* can be scheduled (before setting prev->has_cpu to zero).
* Of course it must also be read before allowing prev
* to be rescheduled, but since the write depends on the read
* to complete, wmb() is enough.  (The spin_lock() acquired
* before setting has_cpu is not enough, because the common
* spin_lock() semantics allow code outside the critical
* section to be reordered into the critical section.)
*/
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
wmb();
/*
* fast path falls through. We have to clear has_cpu before
* checking prev->state to avoid a wakeup race - thus we
* also have to protect against the task exiting early.
*/
task_lock(prev);
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
prev->has_cpu = 0;
wmb();
mb();
if (prev->state == TASK_RUNNING)
goto needs_resched;
......
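/*
 * A user-space analogy (C11 atomics plus pthreads; an illustration, not
 * kernel code) of the ordering requirement in the hunk above: each side
 * stores its flag and then loads the other's, and only a full store-load
 * barrier -- the mb(), modelled here with seq_cst fences -- forbids the
 * outcome where both sides read the stale value; a write-only barrier such
 * as wmb() does not.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int has_cpu = 1;	/* plays prev->has_cpu */
static atomic_int state;	/* 0 = sleeping, 1 = TASK_RUNNING */
static int saw_running, saw_free;

static void *schedule_tail_side(void *unused)
{
	(void) unused;
	atomic_store_explicit(&has_cpu, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the mb() */
	saw_running = atomic_load_explicit(&state, memory_order_relaxed);
	return NULL;
}

static void *wakeup_side(void *unused)
{
	(void) unused;
	atomic_store_explicit(&state, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	saw_free = !atomic_load_explicit(&has_cpu, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, schedule_tail_side, NULL);
	pthread_create(&b, NULL, wakeup_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With full fences at least one side must observe the other's
	 * store, so a newly woken task can never be left both runnable
	 * and unclaimed.  With release-only fences both could read 0. */
	printf("saw_running=%d saw_free=%d\n", saw_running, saw_free);
	return 0;
}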