Commit 079dff39 authored by Linus Torvalds

- pre4:

   - Andrea Arcangeli: SMP scheduler memory barrier fixup
   - Richard Henderson: fix alpha semaphores and spinlock bugs.
   - Richard Henderson: clean up the file from hell: "xor.c"
parent 7f6760c7
@@ -63,7 +63,6 @@ unset CONFIG_ALPHA_T2 CONFIG_ALPHA_PYXIS CONFIG_ALPHA_POLARIS
unset CONFIG_ALPHA_TSUNAMI CONFIG_ALPHA_MCPCIA
unset CONFIG_ALPHA_IRONGATE
unset CONFIG_ALPHA_BROKEN_IRQ_MASK
unset CONFIG_ALPHA_LARGE_VMALLOC
# Most of these machines have ISA slots; not exactly sure which don't,
# and this doesn't activate hordes of code, so do it always.
@@ -215,6 +214,8 @@ if [ "$CONFIG_ALPHA_GENERIC" = "y" -o "$CONFIG_ALPHA_DP264" = "y" \
-o "$CONFIG_ALPHA_WILDFIRE" = "y" -o "$CONFIG_ALPHA_TITAN" = "y" ]
then
bool 'Large VMALLOC support' CONFIG_ALPHA_LARGE_VMALLOC
else
define_bool CONFIG_ALPHA_LARGE_VMALLOC n
fi
source drivers/pci/Config.in
......
@@ -160,15 +160,20 @@ EXPORT_SYMBOL_NOVERS(__do_clear_user);
EXPORT_SYMBOL(__strncpy_from_user);
EXPORT_SYMBOL(__strnlen_user);
/*
* The following are specially called from the semaphore assembly stubs.
*/
EXPORT_SYMBOL_NOVERS(__down_failed);
EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
EXPORT_SYMBOL_NOVERS(__up_wakeup);
EXPORT_SYMBOL_NOVERS(__down_read_failed);
EXPORT_SYMBOL_NOVERS(__down_write_failed);
EXPORT_SYMBOL_NOVERS(__rwsem_wake);
/* Semaphore helper functions. */
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__up_wakeup);
EXPORT_SYMBOL(down);
EXPORT_SYMBOL(down_interruptible);
EXPORT_SYMBOL(up);
EXPORT_SYMBOL(__down_read_failed);
EXPORT_SYMBOL(__down_write_failed);
EXPORT_SYMBOL(__rwsem_wake);
EXPORT_SYMBOL(down_read);
EXPORT_SYMBOL(down_write);
EXPORT_SYMBOL(up_read);
EXPORT_SYMBOL(up_write);
/*
* SMP-specific symbols.
......
/*
* Generic semaphore code. Buyer beware. Do your own
* specific changes in <asm/semaphore-helper.h>
* Alpha semaphore implementation.
*
* (C) Copyright 1996 Linus Torvalds
* (C) Copyright 1999, 2000 Richard Henderson
*/
#include <linux/sched.h>
#include <asm/semaphore-helper.h>
/*
* Semaphores are implemented using a two-way counter:
* The "count" variable is decremented for each process
* that tries to sleep, while the "waking" variable is
* incremented when the "up()" code goes to wake up waiting
* processes.
*
* The "count" variable is decremented for each process that tries to sleep,
* while the "waking" variable is incremented when the "up()" code goes to
* wake up waiting processes.
*
* Notably, the inline "up()" and "down()" functions can
* efficiently test if they need to do any extra work (up
* needs to do something only if count was negative before
* the increment operation).
* Notably, the inline "up()" and "down()" functions can efficiently test
* if they need to do any extra work (up needs to do something only if count
* was negative before the increment operation).
*
* waking_non_zero() (from asm/semaphore.h) must execute
* atomically.
* waking_non_zero() (from asm/semaphore.h) must execute atomically.
*
* When __up() is called, the count was negative before
* incrementing it, and we need to wake up somebody.
* When __up() is called, the count was negative before incrementing it,
* and we need to wake up somebody.
*
* This routine adds one to the count of processes that need to
* wake up and exit. ALL waiting processes actually wake up but
* only the one that gets to the "waking" field first will gate
* through and acquire the semaphore. The others will go back
* to sleep.
* This routine adds one to the count of processes that need to wake up and
* exit. ALL waiting processes actually wake up but only the one that gets
* to the "waking" field first will gate through and acquire the semaphore.
* The others will go back to sleep.
*
* Note that these functions are only called when there is
* contention on the lock, and as such all this is the
* "non-critical" part of the whole semaphore business. The
* critical part is the inline stuff in <asm/semaphore.h>
* where we want to avoid any extra jumps and calls.
* Note that these functions are only called when there is contention on the
* lock, and as such all this is the "non-critical" part of the whole
* semaphore business. The critical part is the inline stuff in
* <asm/semaphore.h> where we want to avoid any extra jumps and calls.
*/
void
__up(struct semaphore *sem)
{
wake_one_more(sem);
wake_up(&sem->wait);
}
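
The comment above describes the two-level "count"/"waking" scheme only in prose. As a reading aid, here is a minimal userspace sketch of that idea using C11 atomics; it is not the kernel's implementation, the sleep/wake primitives are stubbed out, and the fast/slow split merely mirrors the description above.

#include <stdatomic.h>

struct toy_sem {
	atomic_int count;	/* > 0: free slots; goes negative as sleepers pile up */
	atomic_int waking;	/* wakeups granted by up() but not yet consumed */
};

static void toy_sleep(struct toy_sem *s)    { (void)s; /* e.g. futex/condvar wait */ }
static void toy_wake_all(struct toy_sem *s) { (void)s; /* e.g. futex/condvar broadcast */ }

static void toy_down(struct toy_sem *s)
{
	if (atomic_fetch_sub(&s->count, 1) > 0)
		return;				/* fast path: no contention */
	for (;;) {
		int w = atomic_load(&s->waking);
		while (w > 0) {			/* consume one pending wakeup */
			if (atomic_compare_exchange_weak(&s->waking, &w, w - 1))
				return;		/* we gate through; the others sleep again */
		}
		toy_sleep(s);
	}
}

static void toy_up(struct toy_sem *s)
{
	if (atomic_fetch_add(&s->count, 1) >= 0)
		return;				/* count was not negative: nobody waiting */
	atomic_fetch_add(&s->waking, 1);	/* the __up() slow path: grant one wakeup */
	toy_wake_all(s);			/* ALL waiters wake; one wins the race above */
}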
/*
* Perform the "down" function. Return zero for semaphore acquired,
* return negative for signalled out of the function.
*
* If called from __down, the return is ignored and the wait loop is
* If called from down, the return is ignored and the wait loop is
* not interruptible. This means that a task waiting on a semaphore
* using "down()" cannot be killed until someone does an "up()" on
* the semaphore.
*
* If called from __down_interruptible, the return value gets checked
* If called from down_interruptible, the return value gets checked
* upon return. If the return value is negative then the task continues
* with the negative value in the return register (it can be tested by
* the caller).
*
* Either form may be used in conjunction with "up()".
*
*/
#define DOWN_VAR \
struct task_struct *tsk = current; \
wait_queue_t wait; \
init_waitqueue_entry(&wait, tsk)
#define DOWN_HEAD(task_state) \
\
\
tsk->state = (task_state); \
add_wait_queue(&sem->wait, &wait); \
\
/* \
* Ok, we're set up. sem->count is known to be less than zero \
* so we must wait. \
* \
* We can let go the lock for purposes of waiting. \
* We re-acquire it after awaking so as to protect \
* all semaphore operations. \
* \
* If "up()" is called before we call waking_non_zero() then \
* we will catch it right away. If it is called later then \
* we will have to go through a wakeup cycle to catch it. \
* \
* Multiple waiters contend for the semaphore lock to see \
* who gets to gate through and who has to wait some more. \
*/ \
for (;;) {
#define DOWN_TAIL(task_state) \
tsk->state = (task_state); \
} \
tsk->state = TASK_RUNNING; \
remove_wait_queue(&sem->wait, &wait)
void
__down(struct semaphore * sem)
__down_failed(struct semaphore *sem)
{
DOWN_VAR;
DOWN_HEAD(TASK_UNINTERRUPTIBLE);
DECLARE_WAITQUEUE(wait, current);
#if DEBUG_SEMAPHORE
printk("%s(%d): down failed(%p)\n",
current->comm, current->pid, sem);
#endif
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
/* At this point we know that sem->count is negative. In order
to avoid racing with __up, we must check for wakeup before
going to sleep the first time. */
while (1) {
long ret, tmp;
/* An atomic conditional decrement of sem->waking. */
__asm__ __volatile__(
"1: ldl_l %1,%2\n"
" blt %1,2f\n"
" subl %1,1,%0\n"
" stl_c %0,%2\n"
" beq %0,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=r"(ret), "=&r"(tmp), "=m"(sem->waking)
: "0"(0));
if (ret)
break;
if (waking_non_zero(sem))
break;
schedule();
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
DOWN_TAIL(TASK_UNINTERRUPTIBLE);
remove_wait_queue(&sem->wait, &wait);
current->state = TASK_RUNNING;
#if DEBUG_SEMAPHORE
printk("%s(%d): down acquired(%p)\n",
current->comm, current->pid, sem);
#endif
}
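
The ldl_l/stl_c block above is an atomic conditional decrement of sem->waking. Below is a hedged C rendering of the same operation, written with GCC __atomic builtins purely as a reading aid; the bias convention for "waking" lives in <asm/semaphore.h>, which is not part of this diff.

/* Atomically: if (*waking >= 0) { (*waking)--; return 1; } return 0;
 * The CAS retry loop stands in for the stl_c failure branch of the asm above. */
static int waking_dec_if_nonneg(int *waking)
{
	int old = __atomic_load_n(waking, __ATOMIC_RELAXED);

	while (old >= 0) {
		if (__atomic_compare_exchange_n(waking, &old, old - 1,
						1, __ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return 1;	/* gated through: semaphore acquired */
	}
	return 0;			/* no wakeup pending: sleep again */
}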
int
__down_interruptible(struct semaphore * sem)
__down_failed_interruptible(struct semaphore *sem)
{
int ret = 0;
DOWN_VAR;
DOWN_HEAD(TASK_INTERRUPTIBLE);
ret = waking_non_zero_interruptible(sem, tsk);
if (ret)
{
if (ret == 1)
/* ret != 0 only if we get interrupted -arca */
ret = 0;
break;
DECLARE_WAITQUEUE(wait, current);
long ret;
#if DEBUG_SEMAPHORE
printk("%s(%d): down failed(%p)\n",
current->comm, current->pid, sem);
#endif
current->state = TASK_INTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
while (1) {
long tmp, tmp2, tmp3;
/* We must undo the sem->count down_interruptible decrement
simultaneously and atomically with the sem->waking
adjustment, otherwise we can race with __up. This is
accomplished by doing a 64-bit ll/sc on two 32-bit words.
"Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
ret = 0;
if (tmp >= 0) { // waking >= 0
tmp += 0xffffffff00000000; // waking -= 1
ret = 1;
}
else if (pending) {
// count += 1, but since -1 + 1 carries into the
// high word, we have to be more careful here.
tmp = (tmp & 0xffffffff00000000)
| ((tmp + 1) & 0x00000000ffffffff);
ret = -EINTR;
}
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %0,0\n"
" cmovne %5,%6,%0\n"
" addq %1,1,%2\n"
" and %1,%7,%3\n"
" andnot %2,%7,%2\n"
" cmovge %1,1,%0\n"
" or %3,%2,%2\n"
" addq %1,%7,%3\n"
" cmovne %5,%2,%1\n"
" cmovge %2,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2),
"=&r"(tmp3), "=m"(*sem)
: "r"(signal_pending(current)), "r"(-EINTR),
"r"(0xffffffff00000000));
/* At this point we have ret
1 got the lock
0 go to sleep
-EINTR interrupted */
if (ret != 0)
break;
schedule();
set_task_state(current, TASK_INTERRUPTIBLE);
}
schedule();
DOWN_TAIL(TASK_INTERRUPTIBLE);
return ret;
remove_wait_queue(&sem->wait, &wait);
current->state = TASK_RUNNING;
wake_up(&sem->wait);
#if DEBUG_SEMAPHORE
printk("%s(%d): down %s(%p)\n",
current->comm, current->pid,
(ret < 0 ? "interrupted" : "acquired"), sem);
#endif
/* Convert "got the lock" to 0==success. */
return (ret < 0 ? ret : 0);
}
void
__up_wakeup(struct semaphore *sem)
{
wake_up(&sem->wait);
}
void
down(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): down(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
__down(sem);
}
int
down_interruptible(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): down(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
return __down_interruptible(sem);
}
int
__down_trylock(struct semaphore * sem)
down_trylock(struct semaphore *sem)
{
return waking_non_zero_trylock(sem);
int ret;
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
ret = __down_trylock(sem);
#if DEBUG_SEMAPHORE
printk("%s(%d): down_trylock %s from %p\n",
current->comm, current->pid,
ret ? "failed" : "acquired",
__builtin_return_address(0));
#endif
return ret;
}
void
up(struct semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
#if DEBUG_SEMAPHORE
printk("%s(%d): up(%p) <count=%d> from %p\n",
current->comm, current->pid, sem,
atomic_read(&sem->count), __builtin_return_address(0));
#endif
__up(sem);
}
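
Taken together, down(), down_interruptible(), down_trylock() and up() are the API the rest of the kernel sees. A hedged usage sketch follows; my_dev_sem and my_device_io() are made-up names, the semaphore is assumed to be initialized to 1 elsewhere, and the error constants come from <linux/errno.h>.

extern struct semaphore my_dev_sem;	/* assumed: initialized as a mutex (count 1) */
extern void my_device_io(void);		/* hypothetical helper */

int my_dev_write(void)
{
	if (down_interruptible(&my_dev_sem))
		return -EINTR;		/* a signal interrupted the wait */
	my_device_io();
	up(&my_dev_sem);
	return 0;
}

int my_dev_try(void)
{
	if (down_trylock(&my_dev_sem))
		return -EBUSY;		/* nonzero means the lock was not acquired */
	my_device_io();
	up(&my_dev_sem);
	return 0;
}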
@@ -142,122 +270,106 @@ __down_trylock(struct semaphore * sem)
*/
void
__down_read(struct rw_semaphore *sem, int count)
__down_read_failed(struct rw_semaphore *sem, int count)
{
long tmp;
DOWN_VAR;
DECLARE_WAITQUEUE(wait, current);
retry_down:
if (count < 0) {
/* Wait for the lock to become unbiased. Readers
are non-exclusive. */
/* Waiting on multiple readers and/or writers. */
/* This takes care of granting the lock. */
up_read(sem);
/* Undo the acquisition we started in down_read. */
atomic_inc(&sem->count);
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue(&sem->wait, &wait);
while (sem->count < 0) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= 0)
break;
mb();
while (atomic_read(&sem->count) < 0) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
__asm __volatile (
" mb\n"
"1: ldl_l %0,%1\n"
" subl %0,1,%2\n"
" subl %0,1,%0\n"
" stl_c %2,%1\n"
" bne %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
: "=r"(count), "=m"(sem->count), "=r"(tmp)
: : "memory");
current->state = TASK_RUNNING;
mb();
count = atomic_dec_return(&sem->count);
if (count <= 0)
goto retry_down;
} else {
/* Waiting on exactly one writer. */
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue(&sem->wait, &wait);
mb();
while (1) {
if (test_and_clear_bit(0, &sem->granted))
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if ((sem->granted & 1) == 0)
schedule();
while (!test_and_clear_bit(0, &sem->granted)) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
current->state = TASK_RUNNING;
}
}
void
__down_write(struct rw_semaphore *sem, int count)
__down_write_failed(struct rw_semaphore *sem, int count)
{
long tmp;
DOWN_VAR;
DECLARE_WAITQUEUE(wait, current);
retry_down:
if (count + RW_LOCK_BIAS < 0) {
up_write(sem);
/* Waiting on multiple readers and/or writers. */
/* Undo the acquisition we started in down_write. */
atomic_add(RW_LOCK_BIAS, &sem->count);
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
mb();
while (sem->count < 0) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= RW_LOCK_BIAS)
break;
while (atomic_read(&sem->count) + RW_LOCK_BIAS < 0) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->wait, &wait);
tsk->state = TASK_RUNNING;
__asm __volatile (
" mb\n"
"1: ldl_l %0,%1\n"
" ldah %2,%3(%0)\n"
" ldah %0,%3(%0)\n"
" stl_c %2,%1\n"
" bne %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
: "=r"(count), "=m"(sem->count), "=r"(tmp)
: "i"(-(RW_LOCK_BIAS >> 16))
: "memory");
current->state = TASK_RUNNING;
count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
if (count != 0)
goto retry_down;
} else {
/* Put ourselves at the end of the list. */
add_wait_queue_exclusive(&sem->write_bias_wait, &wait);
while (1) {
if (test_and_clear_bit(1, &sem->granted))
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if ((sem->granted & 2) == 0)
schedule();
/* Waiting on exactly one writer. */
current->state = TASK_UNINTERRUPTIBLE;
wmb();
add_wait_queue_exclusive(&sem->wait, &wait);
mb();
while (!test_and_clear_bit(1, &sem->granted)) {
schedule();
set_task_state(current, TASK_UNINTERRUPTIBLE);
}
remove_wait_queue(&sem->write_bias_wait, &wait);
tsk->state = TASK_RUNNING;
current->state = TASK_RUNNING;
/* If the lock is currently unbiased, awaken the sleepers.
FIXME: This wakes up the readers early in a bit of a
stampede -> bad! */
if (sem->count >= 0)
count = atomic_read(&sem->count);
if (__builtin_expect(count >= 0, 0))
wake_up(&sem->wait);
}
}
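
Both failed paths above undo and retry an adjustment of the biased count. As a reading aid, here is a hedged sketch of that convention; the real RW_LOCK_BIAS value and the inline fast paths live in <asm/semaphore.h>, which is not part of this diff, so TOY_BIAS is only a stand-in.

enum { TOY_BIAS = 0x01000000 };		/* stand-in for RW_LOCK_BIAS */

/* count == TOY_BIAS      -> lock free
 * count == TOY_BIAS - N  -> held by N readers (N much smaller than the bias)
 * count <= 0             -> a writer holds the lock or has already
 *                           subtracted the bias while queueing for it */

static int toy_writer_contended(int count)
{
	/* mirrors the "count + RW_LOCK_BIAS < 0" test in __down_write_failed:
	 * at least one other writer has also subtracted the bias. */
	return count + TOY_BIAS < 0;
}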
void
__do_rwsem_wake(struct rw_semaphore *sem, int readers)
__rwsem_wake(struct rw_semaphore *sem, int readers)
{
if (readers) {
if (test_and_set_bit(0, &sem->granted))
@@ -269,3 +381,67 @@ __do_rwsem_wake(struct rw_semaphore *sem, int readers)
wake_up(&sem->write_bias_wait);
}
}
void
down_read(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__down_read(sem);
#if WAITQUEUE_DEBUG
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_inc(&sem->readers);
#endif
}
void
down_write(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__down_write(sem);
#if WAITQUEUE_DEBUG
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->writers))
BUG();
if (atomic_read(&sem->readers))
BUG();
atomic_inc(&sem->writers);
#endif
}
void
up_read(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_dec(&sem->readers);
#endif
__up_read(sem);
}
void
up_write(struct rw_semaphore *sem)
{
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->readers))
BUG();
if (atomic_read(&sem->writers) != 1)
BUG();
atomic_dec(&sem->writers);
#endif
__up_write(sem);
}
@@ -378,6 +378,9 @@ do_settimeofday(struct timeval *tv)
* BUG: This routine does not handle hour overflow properly; it just
* sets the minutes. Usually you won't notice until after reboot!
*/
extern int abs(int);
static int
set_rtc_mmss(unsigned long nowtime)
{
......
@@ -12,7 +12,7 @@ OBJS = __divqu.o __remqu.o __divlu.o __remlu.o memset.o memcpy.o io.o \
strcat.o strcpy.o strncat.o strncpy.o stxcpy.o stxncpy.o \
strchr.o strrchr.o memchr.o \
copy_user.o clear_user.o strncpy_from_user.o strlen_user.o \
csum_ipv6_magic.o strcasecmp.o semaphore.o fpreg.o \
csum_ipv6_magic.o strcasecmp.o fpreg.o \
callback_srm.o srm_puts.o srm_printk.o
lib.a: $(OBJS)
......
/*
* linux/arch/alpha/lib/semaphore.S
*
* Copyright (C) 1999, 2000 Richard Henderson
*/
/*
* The semaphore operations have a special calling sequence that
* allows us to do a simpler in-line version of them. These routines
* need to convert that sequence back into the C sequence when
* there is contention on the semaphore.
*/
.set noat
.set noreorder
.align 4
/* __down_failed takes the semaphore in $24, clobbers $24 and $28. */
.globl __down_failed
.ent __down_failed
__down_failed:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __down
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __down_failed
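
This stub (and the two that follow) exists so the inline fast path can call out on contention without disturbing any registers beyond $24 and $28: everything else is saved, the semaphore pointer is forwarded from $24 to the first C argument register $16, and the return goes back through $28. Purely to illustrate the division of labour, here is a C-level sketch of such a fast path; the real one is inline asm in <asm/semaphore.h>, which is not in this diff, and it enters the stub with the special register convention rather than a normal call.

static inline void sketch_down(struct semaphore *sem)
{
	if (atomic_dec_return(&sem->count) < 0)	/* count went negative: contention */
		__down_failed(sem);		/* shown as a plain call for readability */
}

static inline void sketch_up(struct semaphore *sem)
{
	if (atomic_inc_return(&sem->count) <= 0)	/* count was negative before the inc */
		__up_wakeup(sem);
}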
/* __down_failed_interruptible takes the semaphore in $24,
clobbers $28, returns success in $24. */
.globl __down_failed_interruptible
.ent __down_failed_interruptible
__down_failed_interruptible:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __down_interruptible
mov $0, $24
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __down_failed_interruptible
/* __up_wakeup takes the semaphore in $24, clobbers $24 and $28. */
.globl __up_wakeup
.ent __up_wakeup
__up_wakeup:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $25, 17*8($30)
stq $26, 18*8($30)
.frame $30, 20*8, $28
.prologue 1
mov $24, $16
jsr __up
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $25, 17*8($30)
ldq $26, 18*8($30)
lda $30, 20*8($30)
ret $31, ($28), 0
.end __up_wakeup
/* __down_read_failed takes the semaphore in $24, count in $25;
clobbers $24, $25 and $28. */
.globl __down_read_failed
.ent __down_read_failed
__down_read_failed:
ldgp $29,0($27)
lda $30, -18*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __down_read
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __down_read_failed
/* __down_write_failed takes the semaphore in $24, count in $25;
clobbers $24, $25 and $28. */
.globl __down_write_failed
.ent __down_write_failed
__down_write_failed:
ldgp $29,0($27)
lda $30, -20*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __down_write
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __down_write_failed
/* __rwsem_wake takes the semaphore in $24, readers in $25;
clobbers $24, $25, and $28. */
.globl __rwsem_wake
.ent __rwsem_wake
__rwsem_wake:
ldgp $29,0($27)
lda $30, -18*8($30)
stq $28, 0*8($30)
stq $0, 1*8($30)
stq $1, 2*8($30)
stq $2, 3*8($30)
stq $3, 4*8($30)
stq $4, 5*8($30)
stq $5, 6*8($30)
stq $6, 7*8($30)
stq $7, 8*8($30)
stq $16, 9*8($30)
stq $17, 10*8($30)
stq $18, 11*8($30)
stq $19, 12*8($30)
stq $20, 13*8($30)
stq $21, 14*8($30)
stq $22, 15*8($30)
stq $23, 16*8($30)
stq $26, 17*8($30)
.frame $30, 18*8, $28
.prologue 1
mov $24, $16
mov $25, $17
jsr __do_rwsem_wake
ldq $28, 0*8($30)
ldq $0, 1*8($30)
ldq $1, 2*8($30)
ldq $2, 3*8($30)
ldq $3, 4*8($30)
ldq $4, 5*8($30)
ldq $5, 6*8($30)
ldq $6, 7*8($30)
ldq $7, 8*8($30)
ldq $16, 9*8($30)
ldq $17, 10*8($30)
ldq $18, 11*8($30)
ldq $19, 12*8($30)
ldq $20, 13*8($30)
ldq $21, 14*8($30)
ldq $22, 15*8($30)
ldq $23, 16*8($30)
ldq $26, 17*8($30)
lda $30, 18*8($30)
ret $31, ($28), 0
.end __rwsem_wake
@@ -2344,18 +2344,7 @@ static mdk_personality_t raid5_personality=
int raid5_init (void)
{
int err;
err = register_md_personality (RAID5, &raid5_personality);
if (err)
return err;
/*
* pick a XOR routine, runtime.
*/
calibrate_xor_block();
return 0;
return register_md_personality (RAID5, &raid5_personality);
}
#ifdef MODULE
......
/*
* xor.c : Multiple Devices driver for Linux
*
* Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
* Copyright (C) 1996, 1997, 1998, 1999, 2000,
* Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
*
*
* optimized RAID-5 checksumming functions.
* Dispatch optimized RAID-5 checksumming functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -15,2584 +15,66 @@
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/config.h>
#define BH_TRACE 0
#include <linux/module.h>
#include <linux/raid/md.h>
#ifdef __sparc_v9__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/visasm.h>
#endif
/*
* we use the 'XOR function template' to register multiple xor
* functions at runtime. The kernel measures their speed at bootup
* and decides which one to use. (compile-time registration is
* not enough as certain CPU features like MMX can only be detected
* at runtime)
*
* this architecture makes it pretty easy to add new routines
* that are faster on certain CPUs, without killing other CPUs'
* 'native' routine. Although the current routines are believed
* to be the physically fastest ones on all CPUs tested,
* feel free to prove me wrong and add yet another routine =B-)
* --mingo
*/
#define MAX_XOR_BLOCKS 5
#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
typedef void (*xor_block_t) XOR_ARGS;
xor_block_t xor_block = NULL;
#ifndef __sparc_v9__
struct xor_block_template;
struct xor_block_template {
char * name;
xor_block_t xor_block;
int speed;
struct xor_block_template * next;
};
struct xor_block_template * xor_functions = NULL;
#define XORBLOCK_TEMPLATE(x) \
static void xor_block_##x XOR_ARGS; \
static struct xor_block_template t_xor_block_##x = \
{ #x, xor_block_##x, 0, NULL }; \
static void xor_block_##x XOR_ARGS
#ifdef __i386__
#ifdef CONFIG_X86_XMM
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
XORBLOCK_TEMPLATE(pIII_kni)
{
char xmm_save[16*4];
int cr0;
int lines = (bh_ptr[0]->b_size>>8);
__asm__ __volatile__ (
"movl %%cr0,%0 ;\n\t"
"clts ;\n\t"
"movups %%xmm0,(%1) ;\n\t"
"movups %%xmm1,0x10(%1) ;\n\t"
"movups %%xmm2,0x20(%1) ;\n\t"
"movups %%xmm3,0x30(%1) ;\n\t"
: "=r" (cr0)
: "r" (xmm_save)
: "memory" );
#define OFFS(x) "8*("#x"*2)"
#define PF0(x) \
" prefetcht0 "OFFS(x)"(%1) ;\n"
#define LD(x,y) \
" movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) \
" movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) \
" prefetchnta "OFFS(x)"(%2) ;\n"
#define PF2(x) \
" prefetchnta "OFFS(x)"(%3) ;\n"
#define PF3(x) \
" prefetchnta "OFFS(x)"(%4) ;\n"
#define PF4(x) \
" prefetchnta "OFFS(x)"(%5) ;\n"
#define PF5(x) \
" prefetchnta "OFFS(x)"(%6) ;\n"
#define XO1(x,y) \
" xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) \
" xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) \
" xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) \
" xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) \
" xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
switch(count) {
case 2:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
PF1(i) \
PF1(i+2) \
LD(i+2,2) \
LD(i+3,3) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
break;
case 3:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory" );
break;
case 4:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
PF0(i+4) \
PF0(i+6) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
#include <linux/raid/xor.h>
#include <asm/xor.h>
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
/* The xor routines to use. */
static struct xor_block_template *active_template;
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory" );
break;
case 5:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
PF4(i) \
PF4(i+2) \
PF0(i+4) \
PF0(i+6) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
XO4(i+1,1) \
XO4(i+2,2) \
XO4(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" addl $256, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory");
break;
}
__asm__ __volatile__ (
"sfence ;\n\t"
"movups (%1),%%xmm0 ;\n\t"
"movups 0x10(%1),%%xmm1 ;\n\t"
"movups 0x20(%1),%%xmm2 ;\n\t"
"movups 0x30(%1),%%xmm3 ;\n\t"
"movl %0,%%cr0 ;\n\t"
:
: "r" (cr0), "r" (xmm_save)
: "memory" );
}
#undef OFFS
#undef LD
#undef ST
#undef PF0
#undef PF1
#undef PF2
#undef PF3
#undef PF4
#undef PF5
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef XO5
#undef BLOCK
#endif /* CONFIG_X86_XMM */
/*
* high-speed RAID5 checksumming functions utilizing MMX instructions
* Copyright (C) 1998 Ingo Molnar
*/
XORBLOCK_TEMPLATE(pII_mmx)
void
xor_block(unsigned int count, struct buffer_head **bh_ptr)
{
char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>7);
if (!(current->flags & PF_USEDFPU))
__asm__ __volatile__ ( " clts;\n");
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
#define LD(x,y) \
" movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) \
" movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) \
" pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) \
" pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) \
" pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) \
" pxor 8*("#x")(%5), %%mm"#y" ;\n"
switch(count) {
case 2:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
ST(i,0) \
XO1(i+1,1) \
ST(i+1,1) \
XO1(i+2,2) \
ST(i+2,2) \
XO1(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
unsigned long *p0, *p1, *p2, *p3, *p4;
unsigned long bytes = bh_ptr[0]->b_size;
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory");
break;
case 3:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
ST(i,0) \
XO2(i+1,1) \
ST(i+1,1) \
XO2(i+2,2) \
ST(i+2,2) \
XO2(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory");
break;
case 4:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
ST(i,0) \
XO3(i+1,1) \
ST(i+1,1) \
XO3(i+2,2) \
ST(i+2,2) \
XO3(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory");
break;
case 5:
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
ST(i,0) \
XO4(i+1,1) \
ST(i+1,1) \
XO4(i+2,2) \
ST(i+2,2) \
XO4(i+3,3) \
ST(i+3,3)
" .align 32,0x90 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" addl $128, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory");
break;
p0 = (unsigned long *) bh_ptr[0]->b_data;
p1 = (unsigned long *) bh_ptr[1]->b_data;
if (count == 2) {
active_template->do_2(bytes, p0, p1);
return;
}
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
if (!(current->flags & PF_USEDFPU))
stts();
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
XORBLOCK_TEMPLATE(p5_mmx)
{
char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>6);
if (!(current->flags & PF_USEDFPU))
__asm__ __volatile__ ( " clts;\n");
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
switch(count) {
case 2:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq 40(%1), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
break;
case 3:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
: "memory" );
break;
case 4:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor (%4), %%mm0 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" pxor 16(%4), %%mm2 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 24(%4), %%mm3 ;\n"
" movq %%mm3, 24(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq 48(%1), %%mm6 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data)
: "memory" );
break;
case 5:
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor (%4), %%mm0 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor (%5), %%mm0 ;\n"
" pxor 8(%5), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 16(%4), %%mm2 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%5), %%mm2 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%4), %%mm3 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%5), %%mm3 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 32(%5), %%mm4 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" pxor 40(%5), %%mm5 ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%5), %%mm6 ;\n"
" pxor 56(%5), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" addl $64, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
"r" (bh_ptr[3]->b_data),
"r" (bh_ptr[4]->b_data)
: "memory" );
break;
p2 = (unsigned long *) bh_ptr[2]->b_data;
if (count == 3) {
active_template->do_3(bytes, p0, p1, p2);
return;
}
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
if (!(current->flags & PF_USEDFPU))
stts();
}
#endif /* __i386__ */
#endif /* !__sparc_v9__ */
#ifdef __sparc_v9__
/*
* High speed xor_block operation for RAID4/5 utilizing the
* UltraSparc Visual Instruction Set.
*
* Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
* Requirements:
* !(((long)dest | (long)sourceN) & (64 - 1)) &&
* !(len & 127) && len >= 256
*
* It is done in pure assembly, as otherwise gcc makes it
* a non-leaf function, which is not what we want.
* Also, we don't measure the speeds as on other architectures,
* as the measuring routine does not take into account cold caches
* and the fact that xor_block_VIS bypasses the caches.
* xor_block_32regs might be 5% faster for count 2 if caches are hot
* and things are just right (for count 3 VIS is about as fast as 32regs with
* hot caches, and for counts 4 and 5 VIS is always faster by a good margin),
* but I think it is better not to pollute the caches.
* Actually, if I'd just fight for speed for hot caches, I could
* write a hybrid VIS/integer routine, which would do always two
* 64B blocks in VIS and two in IEUs, but I really care more about
* caches.
*/
extern void *VISenter(void);
extern void xor_block_VIS XOR_ARGS;
void __xor_block_VIS(void)
{
__asm__ ("
.globl xor_block_VIS
xor_block_VIS:
ldx [%%o1 + 0], %%o4
ldx [%%o1 + 8], %%o3
ldx [%%o4 + %1], %%g5
ldx [%%o4 + %0], %%o4
ldx [%%o3 + %0], %%o3
rd %%fprs, %%o5
andcc %%o5, %2, %%g0
be,pt %%icc, 297f
sethi %%hi(%5), %%g1
jmpl %%g1 + %%lo(%5), %%g7
add %%g7, 8, %%g7
297: wr %%g0, %4, %%fprs
membar #LoadStore|#StoreLoad|#StoreStore
sub %%g5, 64, %%g5
ldda [%%o4] %3, %%f0
ldda [%%o3] %3, %%f16
cmp %%o0, 4
bgeu,pt %%xcc, 10f
cmp %%o0, 3
be,pn %%xcc, 13f
mov -64, %%g1
sub %%g5, 64, %%g5
rd %%asi, %%g1
wr %%g0, %3, %%asi
2: ldda [%%o4 + 64] %%asi, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
stda %%f16, [%%o4] %3
ldda [%%o3 + 64] %%asi, %%f48
ldda [%%o4 + 128] %%asi, %%f0
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
add %%o4, 128, %%o4
fxor %%f36, %%f52, %%f52
add %%o3, 128, %%o3
fxor %%f38, %%f54, %%f54
subcc %%g5, 128, %%g5
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 - 64] %%asi
bne,pt %%xcc, 2b
ldda [%%o3] %3, %%f16
ldda [%%o4 + 64] %%asi, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
stda %%f16, [%%o4] %3
ldda [%%o3 + 64] %%asi, %%f48
membar #Sync
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 + 64] %%asi
membar #Sync|#StoreStore|#StoreLoad
wr %%g0, 0, %%fprs
retl
wr %%g1, %%g0, %%asi
13: ldx [%%o1 + 16], %%o2
ldx [%%o2 + %0], %%o2
3: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o4] %3, %%f0
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
add %%o2, 64, %%o2
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
subcc %%g5, 64, %%g5
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 3b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
membar #Sync
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
10: cmp %%o0, 5
be,pt %%xcc, 15f
mov -64, %%g1
14: ldx [%%o1 + 16], %%o2
ldx [%%o1 + 24], %%o0
ldx [%%o2 + %0], %%o2
ldx [%%o0 + %0], %%o0
4: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
ldda [%%o0] %3, %%f48
fxor %%f16, %%f32, %%f32
fxor %%f18, %%f34, %%f34
fxor %%f20, %%f36, %%f36
fxor %%f22, %%f38, %%f38
add %%o2, 64, %%o2
fxor %%f24, %%f40, %%f40
fxor %%f26, %%f42, %%f42
fxor %%f28, %%f44, %%f44
fxor %%f30, %%f46, %%f46
ldda [%%o4] %3, %%f0
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
add %%o0, 64, %%o0
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
subcc %%g5, 64, %%g5
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 4b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f16
fxor %%f2, %%f18, %%f18
fxor %%f4, %%f20, %%f20
fxor %%f6, %%f22, %%f22
fxor %%f8, %%f24, %%f24
fxor %%f10, %%f26, %%f26
fxor %%f12, %%f28, %%f28
fxor %%f14, %%f30, %%f30
ldda [%%o0] %3, %%f48
fxor %%f16, %%f32, %%f32
fxor %%f18, %%f34, %%f34
fxor %%f20, %%f36, %%f36
fxor %%f22, %%f38, %%f38
fxor %%f24, %%f40, %%f40
fxor %%f26, %%f42, %%f42
fxor %%f28, %%f44, %%f44
fxor %%f30, %%f46, %%f46
membar #Sync
fxor %%f32, %%f48, %%f48
fxor %%f34, %%f50, %%f50
fxor %%f36, %%f52, %%f52
fxor %%f38, %%f54, %%f54
fxor %%f40, %%f56, %%f56
fxor %%f42, %%f58, %%f58
fxor %%f44, %%f60, %%f60
fxor %%f46, %%f62, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
15: ldx [%%o1 + 16], %%o2
ldx [%%o1 + 24], %%o0
ldx [%%o1 + 32], %%o1
ldx [%%o2 + %0], %%o2
ldx [%%o0 + %0], %%o0
ldx [%%o1 + %0], %%o1
5: ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
add %%o4, 64, %%o4
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
add %%o3, 64, %%o3
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o0] %3, %%f16
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
add %%o2, 64, %%o2
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
ldda [%%o1] %3, %%f32
fxor %%f48, %%f16, %%f48
fxor %%f50, %%f18, %%f50
add %%o0, 64, %%o0
fxor %%f52, %%f20, %%f52
fxor %%f54, %%f22, %%f54
add %%o1, 64, %%o1
fxor %%f56, %%f24, %%f56
fxor %%f58, %%f26, %%f58
fxor %%f60, %%f28, %%f60
fxor %%f62, %%f30, %%f62
ldda [%%o4] %3, %%f0
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
subcc %%g5, 64, %%g5
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4 + %%g1] %3
bne,pt %%xcc, 5b
ldda [%%o3] %3, %%f16
ldda [%%o2] %3, %%f32
fxor %%f0, %%f16, %%f48
fxor %%f2, %%f18, %%f50
fxor %%f4, %%f20, %%f52
fxor %%f6, %%f22, %%f54
fxor %%f8, %%f24, %%f56
fxor %%f10, %%f26, %%f58
fxor %%f12, %%f28, %%f60
fxor %%f14, %%f30, %%f62
ldda [%%o0] %3, %%f16
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
ldda [%%o1] %3, %%f32
fxor %%f48, %%f16, %%f48
fxor %%f50, %%f18, %%f50
fxor %%f52, %%f20, %%f52
fxor %%f54, %%f22, %%f54
fxor %%f56, %%f24, %%f56
fxor %%f58, %%f26, %%f58
fxor %%f60, %%f28, %%f60
fxor %%f62, %%f30, %%f62
membar #Sync
fxor %%f48, %%f32, %%f48
fxor %%f50, %%f34, %%f50
fxor %%f52, %%f36, %%f52
fxor %%f54, %%f38, %%f54
fxor %%f56, %%f40, %%f56
fxor %%f58, %%f42, %%f58
fxor %%f60, %%f44, %%f60
fxor %%f62, %%f46, %%f62
stda %%f48, [%%o4] %3
membar #Sync|#StoreStore|#StoreLoad
retl
wr %%g0, 0, %%fprs
" : :
"i" (&((struct buffer_head *)0)->b_data),
"i" (&((struct buffer_head *)0)->b_size),
"i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
"i" (FPRS_FEF), "i" (VISenter));
}
#endif /* __sparc_v9__ */
#if defined(__sparc__) && !defined(__sparc_v9__)
/*
* High speed xor_block operation for RAID4/5 utilizing the
* ldd/std SPARC instructions.
*
* Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
*/
XORBLOCK_TEMPLATE(SPARC)
{
int size = bh_ptr[0]->b_size;
int lines = size / (sizeof (long)) / 8, i;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1 = (long *) bh_ptr[1]->b_data;
long *source2, *source3, *source4;
switch (count) {
case 2:
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
"o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
}
break;
case 4:
source2 = (long *) bh_ptr[2]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
source3 += 8;
}
break;
case 5:
source2 = (long *) bh_ptr[2]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source4 = (long *) bh_ptr[4]->b_data;
for (i = lines; i > 0; i--) {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%4 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%4 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%4 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%4 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
" : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
destp += 8;
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
}
break;
p3 = (unsigned long *) bh_ptr[3]->b_data;
if (count == 4) {
active_template->do_4(bytes, p0, p1, p2, p3);
return;
}
}
#endif /* __sparc_v[78]__ */
#ifdef __alpha__
/*
* High speed xor_block operation for RAID4/5 pipelined for Alpha EV5.
* There is a second version using EV6 prefetch instructions.
*
* Copyright (C) 2000 Richard Henderson (rth@redhat.com)
*/
XORBLOCK_TEMPLATE(alpha)
{
long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
long *d = (long *) bh_ptr[0]->b_data;
long *s1 = (long *) bh_ptr[1]->b_data;
long *s2, *s3, *s4;
if (count == 2) goto two_blocks;
s2 = (long *) bh_ptr[2]->b_data;
if (count == 3) goto three_blocks;
s3 = (long *) bh_ptr[3]->b_data;
if (count == 4) goto four_blocks;
s4 = (long *) bh_ptr[4]->b_data;
goto five_blocks;
two_blocks:
asm volatile ("
.align 4
2:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,8(%0)
ldq $3,8(%1)
ldq $4,16(%0)
ldq $5,16(%1)
ldq $6,24(%0)
ldq $7,24(%1)
ldq $16,32(%0)
ldq $17,32(%1)
ldq $18,40(%0)
ldq $19,40(%1)
ldq $20,48(%0)
ldq $21,48(%1)
ldq $22,56(%0)
xor $0,$1,$0 # 7 cycles from $1 load
ldq $23,56(%1)
xor $2,$3,$2
stq $0,0(%0)
xor $4,$5,$4
stq $2,8(%0)
xor $6,$7,$6
stq $4,16(%0)
xor $16,$17,$16
stq $6,24(%0)
xor $18,$19,$18
stq $16,32(%0)
xor $20,$21,$20
stq $18,40(%0)
xor $22,$23,$22
stq $20,48(%0)
subq %2,1,%2
stq $22,56(%0)
addq %0,64,%0
addq %1,64,%1
bgt %2,2b"
: "=r"(d), "=r"(s1), "=r"(lines)
: "0"(d), "1"(s1), "2"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
return;
three_blocks:
asm volatile ("
.align 4
3:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,8(%0)
ldq $4,8(%1)
ldq $6,16(%0)
ldq $7,16(%1)
ldq $17,24(%0)
ldq $18,24(%1)
ldq $20,32(%0)
ldq $21,32(%1)
ldq $5,8(%2)
ldq $16,16(%2)
ldq $19,24(%2)
ldq $22,32(%2)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 6 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $17,$18,$18 # 5 cycles from $18 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $20,$21,$21 # 5 cycles from $21 load
stq $2,0(%0)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8(%0)
xor $7,$16,$16 # 7 cycles from $16 load
stq $16,16(%0)
xor $18,$19,$19 # 7 cycles from $19 load
stq $19,24(%0)
xor $21,$22,$22 # 7 cycles from $22 load
stq $22,32(%0)
nop
ldq $0,40(%0)
ldq $1,40(%1)
ldq $3,48(%0)
ldq $4,48(%1)
ldq $6,56(%0)
ldq $7,56(%1)
ldq $2,40(%2)
ldq $5,48(%2)
ldq $16,56(%2)
xor $0,$1,$1 # 4 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
stq $2,40(%0)
xor $7,$16,$16 # 4 cycles from $16 load
stq $5,48(%0)
subq %3,1,%3
stq $16,56(%0)
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %3,3b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22");
return;
four_blocks:
asm volatile ("
.align 4
4:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,8(%0)
ldq $5,8(%1)
ldq $6,8(%2)
ldq $7,8(%3)
ldq $16,16(%0)
ldq $17,16(%1)
ldq $18,16(%2)
ldq $19,16(%3)
ldq $20,24(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,24(%1)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24(%2)
xor $1,$3,$3
ldq $1,24(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0(%0)
xor $6,$7,$7
xor $16,$17,$17 # 7 cycles from $17 load
xor $5,$7,$7
stq $7,8(%0)
xor $18,$19,$19 # 7 cycles from $19 load
ldq $2,32(%0)
xor $17,$19,$19
ldq $3,32(%1)
ldq $4,32(%2)
ldq $5,32(%3)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $6,40(%0)
ldq $7,40(%1)
ldq $16,40(%2)
ldq $17,40(%3)
stq $19,16(%0)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $21,$1,$1
ldq $18,48(%0)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $19,48(%1)
xor $3,$5,$5
ldq $20,48(%2)
ldq $21,48(%3)
ldq $0,56(%0)
ldq $1,56(%1)
ldq $2,56(%2)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56(%3)
xor $16,$17,$17 # 8 cycles from $17 load
xor $7,$17,$17
xor $18,$19,$19 # 5 cycles from $19 load
xor $20,$21,$21 # 5 cycles from $21 load
xor $19,$21,$21
stq $1,24(%0)
xor $0,$1,$1 # 5 cycles from $1 load
stq $5,32(%0)
xor $2,$3,$3 # 4 cycles from $3 load
stq $17,40(%0)
xor $1,$3,$3
stq $21,48(%0)
subq %4,1,%4
stq $3,56(%0)
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %4,4b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
five_blocks:
asm volatile ("
ldq %0,0(%6)
ldq %1,8(%6)
ldq %2,16(%6)
ldq %3,24(%6)
ldq %4,32(%6)
ldq %0,%7(%0)
ldq %1,%7(%1)
ldq %2,%7(%2)
ldq %3,%7(%3)
ldq %4,%7(%4)
.align 4
5:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,0(%4)
ldq $5,8(%0)
ldq $6,8(%1)
ldq $7,8(%2)
ldq $16,8(%3)
ldq $17,8(%4)
ldq $18,16(%0)
ldq $19,16(%1)
ldq $20,16(%2)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,16(%3)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16(%4)
xor $1,$3,$3
ldq $1,24(%0)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0(%0)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$16,$16 # 7 cycles from $16 load
xor $6,$17,$17 # 7 cycles from $17 load
ldq $2,24(%1)
xor $16,$17,$17
ldq $3,24(%2)
xor $18,$19,$19 # 8 cycles from $19 load
stq $17,8(%0)
xor $19,$20,$20 # 8 cycles from $20 load
ldq $4,24(%3)
xor $21,$0,$0 # 7 cycles from $0 load
ldq $5,24(%4)
xor $20,$0,$0
ldq $6,32(%0)
ldq $7,32(%1)
stq $0,16(%0)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $16,32(%2)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $17,32(%3)
xor $2,$4,$4
ldq $18,32(%4)
ldq $19,40(%0)
ldq $20,40(%1)
ldq $21,40(%2)
ldq $0,40(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24(%0)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40(%4)
ldq $2,48(%0)
ldq $3,48(%1)
xor $7,$16,$16 # 7 cycles from $16 load
ldq $4,48(%2)
xor $17,$18,$18 # 6 cycles from $18 load
ldq $5,48(%3)
xor $16,$18,$18
ldq $6,48(%4)
xor $19,$20,$20 # 7 cycles from $20 load
stq $18,32(%0)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $7,56(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $16,56(%1)
ldq $17,56(%2)
ldq $18,56(%3)
ldq $19,56(%4)
xor $21,$1,$1
xor $2,$3,$3 # 9 cycles from $3 load
xor $3,$4,$4 # 9 cycles from $4 load
xor $5,$6,$6 # 8 cycles from $6 load
unop
xor $4,$6,$6
xor $7,$16,$16 # 7 cycles from $16 load
xor $17,$18,$18 # 6 cycles from $18 load
stq $6,48(%0)
xor $16,$18,$18
subq %5,1,%5
xor $18,$19,$19 # 8 cycles from $19 load
stq $19,56(%0)
addq %4,64,%4
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %5,5b"
: "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
/* ARG! We've run out of asm arguments! We've got to reload
all those pointers we just loaded. */
: "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
p4 = (unsigned long *) bh_ptr[4]->b_data;
active_template->do_5(bytes, p0, p1, p2, p3, p4);
}
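/* The prefetch macro below issues a load whose destination is $31, the
   Alpha zero register; the loaded value is discarded, so the instruction
   acts purely as a cache-prefetch hint for the line at ofs(base). */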
#define prefetch(base, ofs) \
asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs))
XORBLOCK_TEMPLATE(alpha_prefetch)
{
long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
long *d = (long *) bh_ptr[0]->b_data;
long *s1 = (long *) bh_ptr[1]->b_data;
long *s2, *s3, *s4;
long p;
p = count == 2;
prefetch(d, 0);
prefetch(s1, 0);
prefetch(d, 64);
prefetch(s1, 64);
prefetch(d, 128);
prefetch(s1, 128);
prefetch(d, 192);
prefetch(s1, 192);
if (p) goto two_blocks;
s2 = (long *) bh_ptr[2]->b_data;
p = count == 3;
prefetch(s2, 0);
prefetch(s2, 64);
prefetch(s2, 128);
prefetch(s2, 192);
if (p) goto three_blocks;
s3 = (long *) bh_ptr[3]->b_data;
p = count == 4;
prefetch(s3, 0);
prefetch(s3, 64);
prefetch(s3, 128);
prefetch(s3, 192);
if (p) goto four_blocks;
s4 = (long *) bh_ptr[4]->b_data;
prefetch(s4, 0);
prefetch(s4, 64);
prefetch(s4, 128);
prefetch(s4, 192);
goto five_blocks;
two_blocks:
asm volatile ("
.align 4
2:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,8(%0)
ldq $3,8(%1)
ldq $4,16(%0)
ldq $5,16(%1)
ldq $6,24(%0)
ldq $7,24(%1)
ldq $16,32(%0)
ldq $17,32(%1)
ldq $18,40(%0)
ldq $19,40(%1)
ldq $20,48(%0)
ldq $21,48(%1)
ldq $22,56(%0)
ldq $23,56(%1)
ldq $31,256(%0)
xor $0,$1,$0 # 8 cycles from $1 load
ldq $31,256(%1)
xor $2,$3,$2
stq $0,0(%0)
xor $4,$5,$4
stq $2,8(%0)
xor $6,$7,$6
stq $4,16(%0)
xor $16,$17,$16
stq $6,24(%0)
xor $18,$19,$18
stq $16,32(%0)
xor $20,$21,$20
stq $18,40(%0)
xor $22,$23,$22
stq $20,48(%0)
subq %2,1,%2
stq $22,56(%0)
addq %0,64,%0
addq %1,64,%1
bgt %2,2b"
: "=r"(d), "=r"(s1), "=r"(lines)
: "0"(d), "1"(s1), "2"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
return;
three_blocks:
asm volatile ("
.align 4
3:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,8(%0)
ldq $4,8(%1)
ldq $6,16(%0)
ldq $7,16(%1)
ldq $17,24(%0)
ldq $18,24(%1)
ldq $20,32(%0)
ldq $21,32(%1)
ldq $5,8(%2)
ldq $16,16(%2)
ldq $19,24(%2)
ldq $22,32(%2)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 7 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $17,$18,$18 # 5 cycles from $18 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $20,$21,$21 # 5 cycles from $21 load
stq $2,0(%0)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8(%0)
xor $7,$16,$16 # 7 cycles from $16 load
stq $16,16(%0)
xor $18,$19,$19 # 7 cycles from $19 load
stq $19,24(%0)
xor $21,$22,$22 # 7 cycles from $22 load
stq $22,32(%0)
nop
ldq $0,40(%0)
ldq $1,40(%1)
ldq $3,48(%0)
ldq $4,48(%1)
ldq $6,56(%0)
ldq $7,56(%1)
ldq $2,40(%2)
ldq $5,48(%2)
ldq $16,56(%2)
ldq $31,256(%0)
ldq $31,256(%1)
ldq $31,256(%2)
xor $0,$1,$1 # 6 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
xor $7,$16,$16 # 4 cycles from $16 load
stq $2,40(%0)
subq %3,1,%3
stq $5,48(%0)
addq %2,64,%2
stq $16,56(%0)
addq %1,64,%1
addq %0,64,%0
bgt %3,3b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21", "$22");
return;
four_blocks:
asm volatile ("
.align 4
4:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,8(%0)
ldq $5,8(%1)
ldq $6,8(%2)
ldq $7,8(%3)
ldq $16,16(%0)
ldq $17,16(%1)
ldq $18,16(%2)
ldq $19,16(%3)
ldq $20,24(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,24(%1)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24(%2)
xor $1,$3,$3
ldq $1,24(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0(%0)
xor $6,$7,$7
xor $16,$17,$17 # 7 cycles from $17 load
xor $5,$7,$7
stq $7,8(%0)
xor $18,$19,$19 # 7 cycles from $19 load
ldq $2,32(%0)
xor $17,$19,$19
ldq $3,32(%1)
ldq $4,32(%2)
ldq $5,32(%3)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $6,40(%0)
ldq $7,40(%1)
ldq $16,40(%2)
ldq $17,40(%3)
stq $19,16(%0)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $21,$1,$1
ldq $18,48(%0)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $19,48(%1)
xor $3,$5,$5
ldq $20,48(%2)
ldq $21,48(%3)
ldq $0,56(%0)
ldq $1,56(%1)
ldq $2,56(%2)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56(%3)
xor $16,$17,$17 # 8 cycles from $17 load
ldq $31,256(%0)
xor $7,$17,$17
ldq $31,256(%1)
xor $18,$19,$19 # 6 cycles from $19 load
/* Set of all registered templates. */
static struct xor_block_template *template_list;
ldq $31,256(%2)
xor $20,$21,$21 # 6 cycles from $21 load
ldq $31,256(%3)
xor $19,$21,$21
/* The -6*32 shift factor colors the cache. */
#define BENCH_SIZE (PAGE_SIZE-6*32)
stq $1,24(%0)
xor $0,$1,$1 # 7 cycles from $1 load
stq $5,32(%0)
xor $2,$3,$3 # 6 cycles from $3 load
stq $17,40(%0)
xor $1,$3,$3
stq $21,48(%0)
subq %4,1,%4
stq $3,56(%0)
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %4,4b"
: "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
: "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
five_blocks:
asm volatile ("
ldq %0,0(%6)
ldq %1,8(%6)
ldq %2,16(%6)
ldq %3,24(%6)
ldq %4,32(%6)
ldq %0,%7(%0)
ldq %1,%7(%1)
ldq %2,%7(%2)
ldq %3,%7(%3)
ldq %4,%7(%4)
.align 4
5:
ldq $0,0(%0)
ldq $1,0(%1)
ldq $2,0(%2)
ldq $3,0(%3)
ldq $4,0(%4)
ldq $5,8(%0)
ldq $6,8(%1)
ldq $7,8(%2)
ldq $16,8(%3)
ldq $17,8(%4)
ldq $18,16(%0)
ldq $19,16(%1)
ldq $20,16(%2)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $21,16(%3)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16(%4)
xor $1,$3,$3
ldq $1,24(%0)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0(%0)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$16,$16 # 7 cycles from $16 load
xor $6,$17,$17 # 7 cycles from $17 load
ldq $2,24(%1)
xor $16,$17,$17
ldq $3,24(%2)
xor $18,$19,$19 # 8 cycles from $19 load
stq $17,8(%0)
xor $19,$20,$20 # 8 cycles from $20 load
ldq $4,24(%3)
xor $21,$0,$0 # 7 cycles from $0 load
ldq $5,24(%4)
xor $20,$0,$0
ldq $6,32(%0)
ldq $7,32(%1)
stq $0,16(%0)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $16,32(%2)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $17,32(%3)
xor $2,$4,$4
ldq $18,32(%4)
ldq $19,40(%0)
ldq $20,40(%1)
ldq $21,40(%2)
ldq $0,40(%3)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24(%0)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40(%4)
ldq $2,48(%0)
ldq $3,48(%1)
xor $7,$16,$16 # 7 cycles from $16 load
ldq $4,48(%2)
xor $17,$18,$18 # 6 cycles from $18 load
ldq $5,48(%3)
xor $16,$18,$18
ldq $6,48(%4)
xor $19,$20,$20 # 7 cycles from $20 load
stq $18,32(%0)
xor $20,$21,$21 # 8 cycles from $21 load
ldq $7,56(%0)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $16,56(%1)
ldq $17,56(%2)
ldq $18,56(%3)
ldq $19,56(%4)
ldq $31,256(%0)
xor $21,$1,$1
ldq $31,256(%1)
xor $2,$3,$3 # 9 cycles from $3 load
ldq $31,256(%2)
xor $3,$4,$4 # 9 cycles from $4 load
ldq $31,256(%3)
xor $5,$6,$6 # 8 cycles from $6 load
ldq $31,256(%4)
xor $4,$6,$6
xor $7,$16,$16 # 7 cycles from $16 load
xor $17,$18,$18 # 6 cycles from $18 load
stq $6,48(%0)
xor $16,$18,$18
subq %5,1,%5
xor $18,$19,$19 # 8 cycles from $19 load
stq $19,56(%0)
addq %4,64,%4
addq %3,64,%3
addq %2,64,%2
addq %1,64,%1
addq %0,64,%0
bgt %5,5b"
: "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
/* ARG! We've run out of asm arguments! We've got to reload
all those pointers we just loaded. */
: "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
: "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
"$16", "$17", "$18", "$19", "$20", "$21");
return;
}
#undef prefetch
#endif /* __alpha__ */
#ifndef __sparc_v9__
/*
* this one works reasonably on any x86 CPU
* (send me an assembly version for inclusion if you can make it faster)
*
* this one is just as fast as written in pure assembly on x86.
* the reason for this separate version is that the
* fast open-coded xor routine "32regs" produces suboptimal code
* on x86, due to lack of registers.
*/
XORBLOCK_TEMPLATE(8regs)
{
int len = bh_ptr[0]->b_size;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1, *source2, *source3, *source4;
long lines = len / (sizeof (long)) / 8, i;
switch(count) {
case 2:
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 7) ^= *(source1 + 7);
source1 += 8;
destp += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
source1 += 8;
source2 += 8;
destp += 8;
}
break;
case 4:
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 0) ^= *(source3 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 1) ^= *(source3 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 2) ^= *(source3 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 3) ^= *(source3 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 4) ^= *(source3 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 5) ^= *(source3 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 6) ^= *(source3 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
*(destp + 7) ^= *(source3 + 7);
source1 += 8;
source2 += 8;
source3 += 8;
destp += 8;
}
break;
case 5:
source4 = (long *) bh_ptr[4]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
*(destp + 0) ^= *(source1 + 0);
*(destp + 0) ^= *(source2 + 0);
*(destp + 0) ^= *(source3 + 0);
*(destp + 0) ^= *(source4 + 0);
*(destp + 1) ^= *(source1 + 1);
*(destp + 1) ^= *(source2 + 1);
*(destp + 1) ^= *(source3 + 1);
*(destp + 1) ^= *(source4 + 1);
*(destp + 2) ^= *(source1 + 2);
*(destp + 2) ^= *(source2 + 2);
*(destp + 2) ^= *(source3 + 2);
*(destp + 2) ^= *(source4 + 2);
*(destp + 3) ^= *(source1 + 3);
*(destp + 3) ^= *(source2 + 3);
*(destp + 3) ^= *(source3 + 3);
*(destp + 3) ^= *(source4 + 3);
*(destp + 4) ^= *(source1 + 4);
*(destp + 4) ^= *(source2 + 4);
*(destp + 4) ^= *(source3 + 4);
*(destp + 4) ^= *(source4 + 4);
*(destp + 5) ^= *(source1 + 5);
*(destp + 5) ^= *(source2 + 5);
*(destp + 5) ^= *(source3 + 5);
*(destp + 5) ^= *(source4 + 5);
*(destp + 6) ^= *(source1 + 6);
*(destp + 6) ^= *(source2 + 6);
*(destp + 6) ^= *(source3 + 6);
*(destp + 6) ^= *(source4 + 6);
*(destp + 7) ^= *(source1 + 7);
*(destp + 7) ^= *(source2 + 7);
*(destp + 7) ^= *(source3 + 7);
*(destp + 7) ^= *(source4 + 7);
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
destp += 8;
}
break;
}
}
/*
* Platform-independent RAID5 checksum calculation; this should be
* very fast on any platform that has a decent number of registers
* (32 or more).
*/
XORBLOCK_TEMPLATE(32regs)
{
int size = bh_ptr[0]->b_size;
int lines = size / (sizeof (long)) / 8, i;
long *destp = (long *) bh_ptr[0]->b_data;
long *source1, *source2, *source3, *source4;
/* LOTS of registers available...
We do explicit loop-unrolling here for code which
favours RISC machines. In fact this is almost direct
RISC assembly on Alpha and SPARC :-) */
switch(count) {
case 2:
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
destp += 8;
}
break;
case 3:
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
destp += 8;
}
break;
case 4:
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
d0 ^= source3[0];
d1 ^= source3[1];
d2 ^= source3[2];
d3 ^= source3[3];
d4 ^= source3[4];
d5 ^= source3[5];
d6 ^= source3[6];
d7 ^= source3[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
source3 += 8;
destp += 8;
}
break;
case 5:
source4 = (long *) bh_ptr[4]->b_data;
source3 = (long *) bh_ptr[3]->b_data;
source2 = (long *) bh_ptr[2]->b_data;
source1 = (long *) bh_ptr[1]->b_data;
for (i = lines; i > 0; i--) {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = destp[0]; /* Pull the stuff into registers */
d1 = destp[1]; /* ... in bursts, if possible. */
d2 = destp[2];
d3 = destp[3];
d4 = destp[4];
d5 = destp[5];
d6 = destp[6];
d7 = destp[7];
d0 ^= source1[0];
d1 ^= source1[1];
d2 ^= source1[2];
d3 ^= source1[3];
d4 ^= source1[4];
d5 ^= source1[5];
d6 ^= source1[6];
d7 ^= source1[7];
d0 ^= source2[0];
d1 ^= source2[1];
d2 ^= source2[2];
d3 ^= source2[3];
d4 ^= source2[4];
d5 ^= source2[5];
d6 ^= source2[6];
d7 ^= source2[7];
d0 ^= source3[0];
d1 ^= source3[1];
d2 ^= source3[2];
d3 ^= source3[3];
d4 ^= source3[4];
d5 ^= source3[5];
d6 ^= source3[6];
d7 ^= source3[7];
d0 ^= source4[0];
d1 ^= source4[1];
d2 ^= source4[2];
d3 ^= source4[3];
d4 ^= source4[4];
d5 ^= source4[5];
d6 ^= source4[6];
d7 ^= source4[7];
destp[0] = d0; /* Store the result (in bursts) */
destp[1] = d1;
destp[2] = d2;
destp[3] = d3;
destp[4] = d4; /* Store the result (in bursts) */
destp[5] = d5;
destp[6] = d6;
destp[7] = d7;
source1 += 8;
source2 += 8;
source3 += 8;
source4 += 8;
destp += 8;
}
break;
}
}
/*
* (the -6*32 shift factor colors the cache)
*/
#define SIZE (PAGE_SIZE-6*32)
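A rough sketch of the layout this produces (illustrative arithmetic only, not
from the patch, assuming 4 KB pages; Alpha's 8 KB pages just change the numbers):
	SIZE = PAGE_SIZE - 6*32 = 4096 - 192 = 3904
	b1   = base                        ...  base + 3904
	b2   = base + 2*PAGE_SIZE + SIZE = base + 12096  ...  base + 16000
Both buffers fit the order-2 (four page) allocation, and because their distance
is deliberately not a multiple of the page or cache size, they presumably avoid
mapping onto the same lines of a small direct-mapped L1 during the benchmark.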
static void xor_speed ( struct xor_block_template * func,
struct buffer_head *b1, struct buffer_head *b2)
static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
int speed;
unsigned long now;
int i, count, max;
struct buffer_head *bh_ptr[6];
func->next = xor_functions;
xor_functions = func;
bh_ptr[0] = b1;
bh_ptr[1] = b2;
tmpl->next = template_list;
template_list = tmpl;
/*
* count the number of XORs done during a whole jiffy.
* calculate the speed of checksumming from this.
* (we use a 2-page allocation to have guaranteed
* color L1-cache layout)
* Count the number of XORs done during a whole jiffy, and use
* this to calculate the speed of checksumming. We use a 2-page
* allocation to have guaranteed color L1-cache layout.
*/
max = 0;
for (i = 0; i < 5; i++) {
......@@ -2600,7 +82,7 @@ static void xor_speed ( struct xor_block_template * func,
count = 0;
while (jiffies == now) {
mb();
func->xor_block(2,bh_ptr);
tmpl->do_2(BENCH_SIZE, b1, b2);
mb();
count++;
mb();
......@@ -2609,120 +91,53 @@ static void xor_speed ( struct xor_block_template * func,
max = count;
}
speed = max * (HZ*SIZE/1024);
func->speed = speed;
speed = max * (HZ * BENCH_SIZE / 1024);
tmpl->speed = speed;
printk( " %-10s: %5d.%03d MB/sec\n", func->name,
speed / 1000, speed % 1000);
printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
speed / 1000, speed % 1000);
}
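A quick check of the units in the formula above, with hypothetical numbers
(these are not measurements): max counts complete XOR passes over BENCH_SIZE
bytes within one jiffy, so max * HZ * BENCH_SIZE is bytes per second and the
division by 1024 turns that into KB/s, which the printk then splits into MB
and thousandths of a MB.
	/* Hypothetical: HZ = 100, BENCH_SIZE = 8000 bytes, max = 200 passes. */
	speed = 200 * (100 * 8000 / 1024);	/* = 200 * 781 = 156200 KB/s */
	/* printed as "156.200 MB/sec" by the format string above */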
static inline void pick_fastest_function(void)
static int
calibrate_xor_block(void)
{
void *b1, *b2;
struct xor_block_template *f, *fastest;
fastest = xor_functions;
for (f = fastest; f; f = f->next) {
if (f->speed > fastest->speed)
fastest = f;
}
#ifdef CONFIG_X86_XMM
if (cpu_has_xmm) {
/* We force the use of the KNI xor block because it
can write around L2. We may also be able
to load into L1 only, depending on how
the CPU deals with a load to a line that is
being prefetched.
*/
fastest = &t_xor_block_pIII_kni;
b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
if (! b1) {
printk("raid5: Yikes! No memory available.\n");
return -ENOMEM;
}
#endif
#ifdef __alpha__
if (implver() == IMPLVER_EV6) {
/* Force the use of alpha_prefetch if EV6, as it
is significantly faster in the cold cache case. */
fastest = &t_xor_block_alpha_prefetch;
}
#endif
xor_block = fastest->xor_block;
printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
fastest->speed / 1000, fastest->speed % 1000);
}
static struct buffer_head b1, b2;
void calibrate_xor_block(void)
{
if (xor_block)
return;
memset(&b1,0,sizeof(b1));
b2 = b1;
b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
if (!b1.b_data) {
pick_fastest_function();
return;
}
b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
b1.b_size = SIZE;
b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
printk(KERN_INFO "raid5: measuring checksumming speed\n");
sti();
sti(); /* should be safe */
#define xor_speed(templ) do_xor_speed((templ), b1, b2)
#if defined(__sparc__) && !defined(__sparc_v9__)
printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
xor_speed(&t_xor_block_SPARC,&b1,&b2);
#endif
XOR_TRY_TEMPLATES;
#ifdef CONFIG_X86_XMM
if (cpu_has_xmm) {
printk(KERN_INFO
"raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
}
#endif /* CONFIG_X86_XMM */
#undef xor_speed
#ifdef __i386__
if (md_cpu_has_mmx()) {
printk(KERN_INFO
"raid5: MMX detected, trying high-speed MMX checksum routines\n");
xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
}
#endif /* __i386__ */
free_pages((unsigned long)b1, 2);
#ifdef __alpha__
xor_speed(&t_xor_block_alpha,&b1,&b2);
xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2);
#endif
xor_speed(&t_xor_block_8regs,&b1,&b2);
xor_speed(&t_xor_block_32regs,&b1,&b2);
fastest = template_list;
for (f = fastest; f; f = f->next)
if (f->speed > fastest->speed)
fastest = f;
free_pages((unsigned long)b1.b_data,2);
pick_fastest_function();
}
#ifdef XOR_SELECT_TEMPLATE
fastest = XOR_SELECT_TEMPLATE(fastest);
#endif
#else /* __sparc_v9__ */
active_template = fastest;
printk("raid5: using function: %s (%d.%03d MB/sec)\n",
fastest->name, fastest->speed / 1000, fastest->speed % 1000);
void calibrate_xor_block(void)
{
if (xor_block)
return;
printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
xor_block = xor_block_VIS;
return 0;
}
#endif /* __sparc_v9__ */
MD_EXPORT_SYMBOL(xor_block);
MD_EXPORT_SYMBOL(calibrate_xor_block);
#ifdef MODULE
int init_module(void)
{
calibrate_xor_block();
return 0;
}
#endif
module_init(calibrate_xor_block);
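For reference, on Alpha the XOR_TRY_TEMPLATES invocation above (the macro is
defined in include/asm-alpha/xor.h further down in this patch) combines with
the local xor_speed() define to expand roughly into:
	do_xor_speed(&xor_block_8regs, b1, b2);
	do_xor_speed(&xor_block_32regs, b1, b2);
	do_xor_speed(&xor_block_alpha, b1, b2);
	do_xor_speed(&xor_block_alpha_prefetch, b1, b2);
after which the template with the highest measured speed, or the
XOR_SELECT_TEMPLATE override, becomes active_template.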
......@@ -1129,7 +1129,7 @@ static int nsc_ircc_hard_xmit_fir(struct sk_buff *skb, struct net_device *dev)
if ((speed = irda_get_speed(skb)) != self->io.speed) {
/* Check for empty frame */
if (!skb->len) {
nsc_ircc_change_speed_complete(self, speed);
nsc_ircc_change_speed(self, speed);
return 0;
} else
self->new_speed = speed;
......
......@@ -207,8 +207,10 @@ int __init a2091_detect(Scsi_Host_Template *tpnt)
continue;
instance = scsi_register (tpnt, sizeof (struct WD33C93_hostdata));
if(instance == NULL)
continue;
if (instance == NULL) {
release_mem_region(address, 256);
continue;
}
instance->base = ZTWO_VADDR(address);
instance->irq = IRQ_AMIGA_PORTS;
instance->unique_id = z->slotaddr;
......
......@@ -66,8 +66,8 @@ static __inline__ long atomic_add_return(int i, atomic_t * v)
long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
" addl %0,%3,%2\n"
" addl %0,%3,%0\n"
" mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
......@@ -84,8 +84,8 @@ static __inline__ long atomic_sub_return(int i, atomic_t * v)
long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
" subl %0,%3,%2\n"
" subl %0,%3,%0\n"
" mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
......
......@@ -72,4 +72,13 @@
__asm__("stw %1,%0" : "=m"(mem) : "r"(val))
#endif
/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
a mechanism by which the user can annotate likely branch directions and
expect the blocks to be reordered appropriately. Define __builtin_expect
to nothing for earlier compilers. */
#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
#define __builtin_expect(x, expected_value) (x)
#endif
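/* Usage sketch (illustrative, mirroring the new semaphore inlines later in
   this patch): callers annotate the slow path as unlikely, e.g.

	if (__builtin_expect(count < 0, 0))
		__down_failed(sem);

   and on GCC older than 2.96 the define above simply evaluates the
   condition unchanged. */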
#endif /* __ALPHA_COMPILER_H */
#ifndef _ALPHA_SEMAPHORE_HELPER_H
#define _ALPHA_SEMAPHORE_HELPER_H
/*
* SMP- and interrupt-safe semaphores helper functions.
*
* (C) Copyright 1996 Linus Torvalds
* (C) Copyright 1999 Richard Henderson
*/
/*
* These two _must_ execute atomically wrt each other.
*
* This is trivially done with load_locked/store_cond,
* which we have. Let the rest of the losers suck eggs.
*/
static inline void
wake_one_more(struct semaphore * sem)
{
atomic_inc(&sem->waking);
}
static inline int
waking_non_zero(struct semaphore *sem)
{
long ret, tmp;
/* An atomic conditional decrement. */
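/* "Equivalent" C for the sequence below (an added sketch, in the style of the
   comments elsewhere in this file; ldl_l/stl_c stand in for the load-locked/
   store-conditional pair, and a negative value means no wakeup is pending):

	do {
		tmp = ldl_l(sem->waking);
		ret = 0;
		if (tmp < 0)
			break;		// nothing pending; caller goes to sleep
		ret = tmp - 1;		// consume one wakeup
		ret = stl_c = ret;	// 1 if the store succeeded, 0 to retry
	} while (ret == 0);
	return ret > 0;
*/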
__asm__ __volatile__(
"1: ldl_l %1,%2\n"
" blt %1,2f\n"
" subl %1,1,%0\n"
" stl_c %0,%2\n"
" beq %0,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=r"(ret), "=r"(tmp), "=m"(sem->waking.counter)
: "0"(0));
return ret > 0;
}
/*
* waking_non_zero_interruptible:
* 1 got the lock
* 0 go to sleep
* -EINTR interrupted
*
* We must undo the sem->count down_interruptible decrement
* simultaneously and atomically with the sem->waking adjustment,
* otherwise we can race with wake_one_more.
*
* This is accomplished by doing a 64-bit ll/sc on the 2 32-bit words.
*/
static inline int
waking_non_zero_interruptible(struct semaphore *sem, struct task_struct *tsk)
{
long ret, tmp, tmp2, tmp3;
/* "Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
ret = 0;
if (tmp >= 0) {
tmp += 0xffffffff00000000;
ret = 1;
}
else if (pending) {
// Since -1 + 1 carries into the high word, we have
// to be more careful adding 1 here.
tmp = (tmp & 0xffffffff00000000)
| ((tmp + 1) & 0x00000000ffffffff);
ret = -EINTR;
}
else {
break; // ideally; we don't actually break,
// since this is a predicate we don't
// have, and it's more trouble to build
// than to elide the no-op stq_c.
}
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %0,0\n"
" cmovne %5,%6,%0\n"
" addq %1,1,%2\n"
" and %1,%7,%3\n"
" andnot %2,%7,%2\n"
" cmovge %1,1,%0\n"
" or %3,%2,%2\n"
" addq %1,%7,%3\n"
" cmovne %5,%2,%1\n"
" cmovge %2,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3), "=m"(*sem)
: "r"(signal_pending(tsk)), "r"(-EINTR),
"r"(0xffffffff00000000));
return ret;
}
/*
* waking_non_zero_trylock is unused. we do everything in
* down_trylock and let non-ll/sc hosts bounce around.
*/
static inline int
waking_non_zero_trylock(struct semaphore *sem)
{
return 0;
}
#endif
......@@ -12,10 +12,14 @@
#include <asm/system.h>
#include <asm/atomic.h>
#define DEBUG_SEMAPHORE 0
#define DEBUG_RW_SEMAPHORE 0
struct semaphore {
/* Careful, inline assembly knows about the position of these two. */
atomic_t count;
atomic_t count __attribute__((aligned(8)));
atomic_t waking; /* biased by -1 */
wait_queue_head_t wait;
#if WAITQUEUE_DEBUG
long __magic;
......@@ -42,7 +46,7 @@ struct semaphore {
#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
extern inline void sema_init(struct semaphore *sem, int val)
static inline void sema_init(struct semaphore *sem, int val)
{
/*
* Logically,
......@@ -68,103 +72,33 @@ static inline void init_MUTEX_LOCKED (struct semaphore *sem)
sema_init(sem, 0);
}
extern void __down(struct semaphore * sem);
extern int __down_interruptible(struct semaphore * sem);
extern int __down_trylock(struct semaphore * sem);
extern void __up(struct semaphore * sem);
/* All have custom assembly linkages. */
extern void __down_failed(struct semaphore * sem);
extern void __down_failed_interruptible(struct semaphore * sem);
extern void __down_failed_trylock(struct semaphore * sem);
extern void __up_wakeup(struct semaphore * sem);
extern void down(struct semaphore *);
extern void __down_failed(struct semaphore *);
extern int down_interruptible(struct semaphore *);
extern int __down_failed_interruptible(struct semaphore *);
extern int down_trylock(struct semaphore *);
extern void up(struct semaphore *);
extern void __up_wakeup(struct semaphore *);
/*
* Whee. Hidden out of line code is fun. The contention cases are
* handled out of line in kernel/sched.c; arch/alpha/lib/semaphore.S
* takes care of making sure we can call it without clobbering regs.
* Hidden out of line code is fun, but extremely messy. Rely on newer
* compilers to do a respectable job with this. The contention cases
* are handled out of line in arch/alpha/kernel/semaphore.c.
*/
extern inline void down(struct semaphore * sem)
static inline void __down(struct semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_failed anyway, reuse them in
the atomic operation as well.
__down_failed takes the semaphore address in $24, and
its return address in $28. The pv is loaded as usual.
The gp is clobbered (in the module case) as usual. */
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_failed;
__asm__ __volatile__ (
"/* semaphore down operation */\n"
"1: ldl_l $24,%1\n"
" subl $24,1,$28\n"
" subl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $24,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$28", "memory");
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
__down_failed(sem);
}
extern inline int down_interruptible(struct semaphore * sem)
static inline int __down_interruptible(struct semaphore *sem)
{
/* __down_failed_interruptible takes the semaphore address in $24,
and its return address in $28. The pv is loaded as usual. The return
The gp is clobbered (in the module case) as usual. The return
value is in $24. */
register int ret __asm__("$24");
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_failed_interruptible;
__asm__ __volatile__ (
"/* semaphore down interruptible operation */\n"
"1: ldl_l $24,%2\n"
" subl $24,1,$28\n"
" subl $24,1,$24\n"
" stl_c $28,%2\n"
" beq $28,2f\n"
" blt $24,3f\n"
" mov $31,%0\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%2\n"
" jsr $28,($27),__down_failed_interruptible\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(ret), "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$28", "memory");
return ret;
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
return __down_failed_interruptible(sem);
return 0;
}
/*
......@@ -174,7 +108,7 @@ extern inline int down_interruptible(struct semaphore * sem)
* Do this by using ll/sc on the pair of 32-bit words.
*/
extern inline int down_trylock(struct semaphore * sem)
static inline int __down_trylock(struct semaphore * sem)
{
long ret, tmp, tmp2, sub;
......@@ -182,25 +116,21 @@ extern inline int down_trylock(struct semaphore * sem)
(taken) branches in order to be a valid ll/sc sequence.
do {
tmp = ldq_l;
sub = 0x0000000100000000;
ret = ((int)tmp <= 0); // count <= 0 ?
if ((int)tmp >= 0) sub = 0; // count >= 0 ?
// note that if count=0 subq overflows to the high
// longword (i.e. waking)
ret &= ((long)tmp < 0); // waking < 0 ?
sub += 1;
if (ret)
break;
tmp -= sub;
tmp = stq_c = tmp;
tmp = ldq_l;
sub = 0x0000000100000000;
ret = ((int)tmp <= 0); // count <= 0 ?
// Note that if count=0, the decrement overflows into
// waking, so cancel the 1 loaded above. Also cancel
// it if the lock was already free.
if ((int)tmp >= 0) sub = 0; // count >= 0 ?
ret &= ((long)tmp < 0); // waking < 0 ?
sub += 1;
if (ret) break;
tmp -= sub;
tmp = stq_c = tmp;
} while (tmp == 0);
*/
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
__asm__ __volatile__(
"1: ldq_l %1,%4\n"
" lda %3,1\n"
......@@ -215,7 +145,7 @@ extern inline int down_trylock(struct semaphore * sem)
" subq %1,%3,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
"2: mb\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
......@@ -226,45 +156,70 @@ extern inline int down_trylock(struct semaphore * sem)
return ret;
}
extern inline void up(struct semaphore * sem)
static inline void __up(struct semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __up_wakeup anyway, reuse them in
the atomic operation as well.
long ret, tmp, tmp2, tmp3;
__up_wakeup takes the semaphore address in $24, and
its return address in $28. The pv is loaded as usual.
The gp is clobbered (in the module case) as usual. */
/* We must manipulate count and waking simultaneously and atomically.
Otherwise we have races between up and __down_failed_interruptible
waking up on a signal.
register void *pv __asm__("$27");
"Equivalent" C. Note that we have to do this all without
(taken) branches in order to be a valid ll/sc sequence.
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __up_wakeup;
__asm__ __volatile__ (
"/* semaphore up operation */\n"
do {
tmp = ldq_l;
ret = (int)tmp + 1; // count += 1;
tmp2 = tmp & 0xffffffff00000000; // extract waking
if (ret <= 0) // still sleepers?
tmp2 += 0x0000000100000000; // waking += 1;
tmp = ret & 0x00000000ffffffff; // insert count
tmp |= tmp2; // insert waking;
tmp = stq_c = tmp;
} while (tmp == 0);
*/
__asm__ __volatile__(
" mb\n"
"1: ldl_l $24,%1\n"
" addl $24,1,$28\n"
" addl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" ble $24,3f\n"
"4:\n"
"1: ldq_l %1,%4\n"
" addl %1,1,%0\n"
" zapnot %1,0xf0,%2\n"
" addq %2,%5,%3\n"
" cmovle %0,%3,%2\n"
" zapnot %0,0x0f,%1\n"
" bis %1,%2,%1\n"
" stq_c %1,%4\n"
" beq %1,3f\n"
"2:\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__up_wakeup\n"
" ldgp $29,0($28)\n"
" br 4b\n"
"3: br 1b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$28", "memory");
: "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
: "m"(*sem), "r"(0x0000000100000000)
: "memory");
if (__builtin_expect(ret <= 0, 0))
__up_wakeup(sem);
}
#if !WAITQUEUE_DEBUG && !DEBUG_SEMAPHORE
extern inline void down(struct semaphore *sem)
{
__down(sem);
}
extern inline int down_interruptible(struct semaphore *sem)
{
return __down_interruptible(sem);
}
extern inline int down_trylock(struct semaphore *sem)
{
return __down_trylock(sem);
}
extern inline void up(struct semaphore *sem)
{
__up(sem);
}
#endif
/* rw mutexes (should that be mutices? =) -- throw rw
* spinlocks and semaphores together, and this is what we
......@@ -297,7 +252,7 @@ extern inline void up(struct semaphore * sem)
#define RW_LOCK_BIAS 0x01000000
struct rw_semaphore {
int count;
atomic_t count;
/* bit 0 means read bias granted;
bit 1 means write bias granted. */
unsigned granted;
......@@ -317,7 +272,7 @@ struct rw_semaphore {
#endif
#define __RWSEM_INITIALIZER(name,count) \
{ (count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
{ ATOMIC_INIT(count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
__WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait) \
__SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
......@@ -331,9 +286,9 @@ struct rw_semaphore {
#define DECLARE_RWSEM_WRITE_LOCKED(name) \
__DECLARE_RWSEM_GENERIC(name, 0)
extern inline void init_rwsem(struct rw_semaphore *sem)
static inline void init_rwsem(struct rw_semaphore *sem)
{
sem->count = RW_LOCK_BIAS;
atomic_set (&sem->count, RW_LOCK_BIAS);
sem->granted = 0;
init_waitqueue_head(&sem->wait);
init_waitqueue_head(&sem->write_bias_wait);
......@@ -344,213 +299,73 @@ extern inline void init_rwsem(struct rw_semaphore *sem)
#endif
}
/* All have custom assembly linkages. */
extern void __down_read_failed(struct rw_semaphore *sem);
extern void __down_write_failed(struct rw_semaphore *sem);
extern void __rwsem_wake(struct rw_semaphore *sem, unsigned long readers);
extern void down_read(struct rw_semaphore *);
extern void down_write(struct rw_semaphore *);
extern void up_read(struct rw_semaphore *);
extern void up_write(struct rw_semaphore *);
extern void __down_read_failed(struct rw_semaphore *, int);
extern void __down_write_failed(struct rw_semaphore *, int);
extern void __rwsem_wake(struct rw_semaphore *, int);
extern inline void down_read(struct rw_semaphore *sem)
static inline void __down_read(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_read_failed anyway, reuse them in
the atomic operation as well.
long count = atomic_dec_return(&sem->count);
if (__builtin_expect(count < 0, 0))
__down_read_failed(sem, count);
}
__down_read_failed takes the semaphore address in $24, the count
we read in $25, and its return address in $28. The pv is loaded
as usual. The gp is clobbered (in the module case) as usual. */
static inline void __down_write(struct rw_semaphore *sem)
{
long count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
if (__builtin_expect(count != 0, 0))
__down_write_failed(sem, count);
}
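To make the bias arithmetic concrete (an illustrative trace, not part of the
patch): a free rwsem holds count == RW_LOCK_BIAS; each reader subtracts 1, so
the count stays positive while only readers hold it; a writer subtracts the
whole bias, so it sees exactly 0 when the lock was free and a negative value
(passed on to __down_write_failed) otherwise.
	count = RW_LOCK_BIAS;		/* 0x01000000: free		*/
	count -= 1;			/* 0x00ffffff: one reader	*/
	count -= RW_LOCK_BIAS;		/* negative: writer must wait	*/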
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
/* When a reader does a release, the only significant case is when there
was a writer waiting and we've bumped the count to 0; then we must
wake the writer up. */
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
static inline void __up_read(struct rw_semaphore *sem)
{
long count;
mb();
count = atomic_inc_return(&sem->count);
if (__builtin_expect(count == 0, 0))
__rwsem_wake(sem, 0);
}
pv = __down_read_failed;
__asm__ __volatile__(
"/* semaphore down_read operation */\n"
"1: ldl_l $24,%1\n"
" subl $24,1,$28\n"
" subl $24,1,$25\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $25,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_read_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$25", "$28", "memory");
/* Releasing the writer is easy -- just release it and wake up
any sleepers. */
#if WAITQUEUE_DEBUG
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_inc(&sem->readers);
#endif
static inline void __up_write(struct rw_semaphore *sem)
{
long count, wake;
mb();
count = atomic_add_return(RW_LOCK_BIAS, &sem->count);
/* Only do the wake if we were, but are no longer, negative. */
wake = ((int)(count - RW_LOCK_BIAS) < 0) && count >= 0;
if (__builtin_expect(wake, 0))
__rwsem_wake(sem, count);
}
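/* Illustrative trace of the wake test above (not from the patch): a writer
   holds the lock, so count == 0; a reader then tries, decrements count to -1
   and sleeps in __down_read_failed.  On up_write the add yields
   count == RW_LOCK_BIAS - 1: the old value (count - RW_LOCK_BIAS) was
   negative and the new value is not, so __rwsem_wake gets called. */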
#if !WAITQUEUE_DEBUG && !DEBUG_RW_SEMAPHORE
extern inline void down_read(struct rw_semaphore *sem)
{
__down_read(sem);
}
extern inline void down_write(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __down_write_failed anyway, reuse them in
the atomic operation as well.
__down_write_failed takes the semaphore address in $24, the count
we read in $25, and its return address in $28. The pv is loaded
as usual. The gp is clobbered (in the module case) as usual. */
/* This little bit of silliness is to get the GP loaded for
a function that ordinarily wouldn't. Otherwise we could
have it done by the macro directly, which can be optimized
by the linker. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
#endif
pv = __down_write_failed;
__asm__ __volatile__(
"/* semaphore down_write operation */\n"
"1: ldl_l $24,%1\n"
" ldah $28,%3($24)\n"
" ldah $25,%3($24)\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" bne $25,3f\n"
"4: mb\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" jsr $28,($27),__down_write_failed\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv), "i"(-(RW_LOCK_BIAS >> 16))
: "$24", "$25", "$28", "memory");
#if WAITQUEUE_DEBUG
if (atomic_read(&sem->writers))
BUG();
if (atomic_read(&sem->readers))
BUG();
if (sem->granted & 3)
BUG();
atomic_inc(&sem->writers);
#endif
__down_write(sem);
}
/* When a reader does a release, the only significant case is when
there was a writer waiting, and we've bumped the count to 0: we must
wake the writer up. */
extern inline void up_read(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __rwsem_wake anyway, reuse them in
the atomic operation as well.
__rwsem_wake takes the semaphore address in $24, the
number of waiting readers in $25, and its return address
in $28. The pv is loaded as usual. The gp is clobbered
(in the module case) as usual. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 2)
BUG();
if (atomic_read(&sem->writers))
BUG();
atomic_dec(&sem->readers);
#endif
pv = __rwsem_wake;
__asm__ __volatile__(
"/* semaphore up_read operation */\n"
" mb\n"
"1: ldl_l $24,%1\n"
" addl $24,1,$28\n"
" addl $24,1,$24\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" beq $24,3f\n"
"4:\n"
".subsection 2\n"
"2: br 1b\n"
"3: lda $24,%1\n"
" mov 0,$25\n"
" jsr $28,($27),__rwsem_wake\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv)
: "$24", "$25", "$28", "memory");
__up_read(sem);
}
/* releasing the writer is easy -- just release it and
* wake up any sleepers.
*/
extern inline void up_write(struct rw_semaphore *sem)
{
/* Given that we have to use particular hard registers to
communicate with __rwsem_wake anyway, reuse them in
the atomic operation as well.
__rwsem_wake takes the semaphore address in $24, the
number of waiting readers in $25, and its return address
in $28. The pv is loaded as usual. The gp is clobbered
(in the module case) as usual. */
register void *pv __asm__("$27");
#if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic);
if (sem->granted & 3)
BUG();
if (atomic_read(&sem->readers))
BUG();
if (atomic_read(&sem->writers) != 1)
BUG();
atomic_dec(&sem->writers);
#endif
pv = __rwsem_wake;
__asm__ __volatile__(
"/* semaphore up_write operation */\n"
" mb\n"
"1: ldl_l $24,%1\n"
" ldah $28,%3($24)\n"
" stl_c $28,%1\n"
" beq $28,2f\n"
" blt $24,3f\n"
"4:\n"
".subsection 2\n"
"2: br 1b\n"
"3: ldah $25,%3($24)\n"
/* Only do the wake if we're no longer negative. */
" blt $25,4b\n"
" lda $24,%1\n"
" jsr $28,($27),__rwsem_wake\n"
" ldgp $29,0($28)\n"
" br 4b\n"
".previous"
: "=r"(pv)
: "m"(sem->count), "r"(pv), "i"(RW_LOCK_BIAS >> 16)
: "$24", "$25", "$28", "memory");
__up_write(sem);
}
#endif
#endif
......@@ -80,7 +80,7 @@ static inline void spin_lock(spinlock_t * lock)
" blbs %0,2b\n"
" br 1b\n"
".previous"
: "=r" (tmp), "=m" (lock->lock)
: "=&r" (tmp), "=m" (lock->lock)
: "m"(lock->lock) : "memory");
}
......
/*
* include/asm-alpha/xor.h
*
* Optimized RAID-5 checksumming functions for alpha EV5 and EV6
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
asm("
.text
.align 3
.ent xor_alpha_2
xor_alpha_2:
.prologue 0
srl $16, 6, $16
.align 4
2:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,8($17)
ldq $3,8($18)
ldq $4,16($17)
ldq $5,16($18)
ldq $6,24($17)
ldq $7,24($18)
ldq $19,32($17)
ldq $20,32($18)
ldq $21,40($17)
ldq $22,40($18)
ldq $23,48($17)
ldq $24,48($18)
ldq $25,56($17)
xor $0,$1,$0 # 7 cycles from $1 load
ldq $27,56($18)
xor $2,$3,$2
stq $0,0($17)
xor $4,$5,$4
stq $2,8($17)
xor $6,$7,$6
stq $4,16($17)
xor $19,$20,$19
stq $6,24($17)
xor $21,$22,$21
stq $19,32($17)
xor $23,$24,$23
stq $21,40($17)
xor $25,$27,$25
stq $23,48($17)
subq $16,1,$16
stq $25,56($17)
addq $17,64,$17
addq $18,64,$18
bgt $16,2b
ret
.end xor_alpha_2
.align 3
.ent xor_alpha_3
xor_alpha_3:
.prologue 0
srl $16, 6, $16
.align 4
3:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,8($17)
ldq $4,8($18)
ldq $6,16($17)
ldq $7,16($18)
ldq $21,24($17)
ldq $22,24($18)
ldq $24,32($17)
ldq $25,32($18)
ldq $5,8($19)
ldq $20,16($19)
ldq $23,24($19)
ldq $27,32($19)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 6 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $21,$22,$22 # 5 cycles from $22 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $24,$25,$25 # 5 cycles from $25 load
stq $2,0($17)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8($17)
xor $7,$20,$20 # 7 cycles from $20 load
stq $20,16($17)
xor $22,$23,$23 # 7 cycles from $23 load
stq $23,24($17)
xor $25,$27,$27 # 7 cycles from $27 load
stq $27,32($17)
nop
ldq $0,40($17)
ldq $1,40($18)
ldq $3,48($17)
ldq $4,48($18)
ldq $6,56($17)
ldq $7,56($18)
ldq $2,40($19)
ldq $5,48($19)
ldq $20,56($19)
xor $0,$1,$1 # 4 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
stq $2,40($17)
xor $7,$20,$20 # 4 cycles from $20 load
stq $5,48($17)
subq $16,1,$16
stq $20,56($17)
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,3b
ret
.end xor_alpha_3
.align 3
.ent xor_alpha_4
xor_alpha_4:
.prologue 0
srl $16, 6, $16
.align 4
4:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,8($17)
ldq $5,8($18)
ldq $6,8($19)
ldq $7,8($20)
ldq $21,16($17)
ldq $22,16($18)
ldq $23,16($19)
ldq $24,16($20)
ldq $25,24($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $27,24($18)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24($19)
xor $1,$3,$3
ldq $1,24($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0($17)
xor $6,$7,$7
xor $21,$22,$22 # 7 cycles from $22 load
xor $5,$7,$7
stq $7,8($17)
xor $23,$24,$24 # 7 cycles from $24 load
ldq $2,32($17)
xor $22,$24,$24
ldq $3,32($18)
ldq $4,32($19)
ldq $5,32($20)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $6,40($17)
ldq $7,40($18)
ldq $21,40($19)
ldq $22,40($20)
stq $24,16($17)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $27,$1,$1
stq $1,24($17)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $23,48($17)
ldq $24,48($18)
ldq $25,48($19)
xor $3,$5,$5
ldq $27,48($20)
ldq $0,56($17)
ldq $1,56($18)
ldq $2,56($19)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $3,56($20)
stq $5,32($17)
xor $21,$22,$22 # 8 cycles from $22 load
xor $7,$22,$22
xor $23,$24,$24 # 5 cycles from $24 load
stq $22,40($17)
xor $25,$27,$27 # 5 cycles from $27 load
xor $24,$27,$27
xor $0,$1,$1 # 5 cycles from $1 load
stq $27,48($17)
xor $2,$3,$3 # 4 cycles from $3 load
xor $1,$3,$3
subq $16,1,$16
stq $3,56($17)
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,4b
ret
.end xor_alpha_4
.align 3
.ent xor_alpha_5
xor_alpha_5:
.prologue 0
srl $16, 6, $16
.align 4
5:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,0($21)
ldq $5,8($17)
ldq $6,8($18)
ldq $7,8($19)
ldq $22,8($20)
ldq $23,8($21)
ldq $24,16($17)
ldq $25,16($18)
ldq $27,16($19)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $28,16($20)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16($21)
xor $1,$3,$3
ldq $1,24($17)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0($17)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$22,$22 # 7 cycles from $22 load
xor $6,$23,$23 # 7 cycles from $23 load
ldq $2,24($18)
xor $22,$23,$23
ldq $3,24($19)
xor $24,$25,$25 # 8 cycles from $25 load
stq $23,8($17)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $4,24($20)
xor $28,$0,$0 # 7 cycles from $0 load
ldq $5,24($21)
xor $27,$0,$0
ldq $6,32($17)
ldq $7,32($18)
stq $0,16($17)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $22,32($19)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $23,32($20)
xor $2,$4,$4
ldq $24,32($21)
ldq $25,40($17)
ldq $27,40($18)
ldq $28,40($19)
ldq $0,40($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24($17)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40($21)
ldq $2,48($17)
ldq $3,48($18)
xor $7,$22,$22 # 7 cycles from $22 load
ldq $4,48($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $5,48($20)
xor $22,$24,$24
ldq $6,48($21)
xor $25,$27,$27 # 7 cycles from $27 load
stq $24,32($17)
xor $27,$28,$28 # 8 cycles from $28 load
ldq $7,56($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $22,56($18)
ldq $23,56($19)
ldq $24,56($20)
ldq $25,56($21)
xor $28,$1,$1
xor $2,$3,$3 # 9 cycles from $3 load
xor $3,$4,$4 # 9 cycles from $4 load
xor $5,$6,$6 # 8 cycles from $6 load
stq $1,40($17)
xor $4,$6,$6
xor $7,$22,$22 # 7 cycles from $22 load
xor $23,$24,$24 # 6 cycles from $24 load
stq $6,48($17)
xor $22,$24,$24
subq $16,1,$16
xor $24,$25,$25 # 8 cycles from $25 load
stq $25,56($17)
addq $21,64,$21
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,5b
ret
.end xor_alpha_5
.align 3
.ent xor_alpha_prefetch_2
xor_alpha_prefetch_2:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 192($17)
ldq $31, 192($18)
.align 4
2:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,8($17)
ldq $3,8($18)
ldq $4,16($17)
ldq $5,16($18)
ldq $6,24($17)
ldq $7,24($18)
ldq $19,32($17)
ldq $20,32($18)
ldq $21,40($17)
ldq $22,40($18)
ldq $23,48($17)
ldq $24,48($18)
ldq $25,56($17)
ldq $27,56($18)
ldq $31,256($17)
xor $0,$1,$0 # 8 cycles from $1 load
ldq $31,256($18)
xor $2,$3,$2
stq $0,0($17)
xor $4,$5,$4
stq $2,8($17)
xor $6,$7,$6
stq $4,16($17)
xor $19,$20,$19
stq $6,24($17)
xor $21,$22,$21
stq $19,32($17)
xor $23,$24,$23
stq $21,40($17)
xor $25,$27,$25
stq $23,48($17)
subq $16,1,$16
stq $25,56($17)
addq $17,64,$17
addq $18,64,$18
bgt $16,2b
ret
.end xor_alpha_prefetch_2
.align 3
.ent xor_alpha_prefetch_3
xor_alpha_prefetch_3:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
.align 4
3:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,8($17)
ldq $4,8($18)
ldq $6,16($17)
ldq $7,16($18)
ldq $21,24($17)
ldq $22,24($18)
ldq $24,32($17)
ldq $25,32($18)
ldq $5,8($19)
ldq $20,16($19)
ldq $23,24($19)
ldq $27,32($19)
nop
xor $0,$1,$1 # 8 cycles from $0 load
xor $3,$4,$4 # 7 cycles from $4 load
xor $6,$7,$7 # 6 cycles from $7 load
xor $21,$22,$22 # 5 cycles from $22 load
xor $1,$2,$2 # 9 cycles from $2 load
xor $24,$25,$25 # 5 cycles from $25 load
stq $2,0($17)
xor $4,$5,$5 # 6 cycles from $5 load
stq $5,8($17)
xor $7,$20,$20 # 7 cycles from $20 load
stq $20,16($17)
xor $22,$23,$23 # 7 cycles from $23 load
stq $23,24($17)
xor $25,$27,$27 # 7 cycles from $27 load
stq $27,32($17)
nop
ldq $0,40($17)
ldq $1,40($18)
ldq $3,48($17)
ldq $4,48($18)
ldq $6,56($17)
ldq $7,56($18)
ldq $2,40($19)
ldq $5,48($19)
ldq $20,56($19)
ldq $31,256($17)
ldq $31,256($18)
ldq $31,256($19)
xor $0,$1,$1 # 6 cycles from $1 load
xor $3,$4,$4 # 5 cycles from $4 load
xor $6,$7,$7 # 5 cycles from $7 load
xor $1,$2,$2 # 4 cycles from $2 load
xor $4,$5,$5 # 5 cycles from $5 load
xor $7,$20,$20 # 4 cycles from $20 load
stq $2,40($17)
subq $16,1,$16
stq $5,48($17)
addq $19,64,$19
stq $20,56($17)
addq $18,64,$18
addq $17,64,$17
bgt $16,3b
ret
.end xor_alpha_prefetch_3
.align 3
.ent xor_alpha_prefetch_4
xor_alpha_prefetch_4:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 0($20)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 64($20)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 128($20)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
ldq $31, 192($20)
.align 4
4:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,8($17)
ldq $5,8($18)
ldq $6,8($19)
ldq $7,8($20)
ldq $21,16($17)
ldq $22,16($18)
ldq $23,16($19)
ldq $24,16($20)
ldq $25,24($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $27,24($18)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,24($19)
xor $1,$3,$3
ldq $1,24($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $3,0($17)
xor $6,$7,$7
xor $21,$22,$22 # 7 cycles from $22 load
xor $5,$7,$7
stq $7,8($17)
xor $23,$24,$24 # 7 cycles from $24 load
ldq $2,32($17)
xor $22,$24,$24
ldq $3,32($18)
ldq $4,32($19)
ldq $5,32($20)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $6,40($17)
ldq $7,40($18)
ldq $21,40($19)
ldq $22,40($20)
stq $24,16($17)
xor $0,$1,$1 # 9 cycles from $1 load
xor $2,$3,$3 # 5 cycles from $3 load
xor $27,$1,$1
stq $1,24($17)
xor $4,$5,$5 # 5 cycles from $5 load
ldq $23,48($17)
xor $3,$5,$5
ldq $24,48($18)
ldq $25,48($19)
ldq $27,48($20)
ldq $0,56($17)
ldq $1,56($18)
ldq $2,56($19)
ldq $3,56($20)
xor $6,$7,$7 # 8 cycles from $6 load
ldq $31,256($17)
xor $21,$22,$22 # 8 cycles from $22 load
ldq $31,256($18)
xor $7,$22,$22
ldq $31,256($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $31,256($20)
xor $25,$27,$27 # 6 cycles from $27 load
stq $5,32($17)
xor $24,$27,$27
xor $0,$1,$1 # 7 cycles from $1 load
xor $2,$3,$3 # 6 cycles from $3 load
stq $22,40($17)
xor $1,$3,$3
stq $27,48($17)
subq $16,1,$16
stq $3,56($17)
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,4b
ret
.end xor_alpha_prefetch_4
.align 3
.ent xor_alpha_prefetch_5
xor_alpha_prefetch_5:
.prologue 0
srl $16, 6, $16
ldq $31, 0($17)
ldq $31, 0($18)
ldq $31, 0($19)
ldq $31, 0($20)
ldq $31, 0($21)
ldq $31, 64($17)
ldq $31, 64($18)
ldq $31, 64($19)
ldq $31, 64($20)
ldq $31, 64($21)
ldq $31, 128($17)
ldq $31, 128($18)
ldq $31, 128($19)
ldq $31, 128($20)
ldq $31, 128($21)
ldq $31, 192($17)
ldq $31, 192($18)
ldq $31, 192($19)
ldq $31, 192($20)
ldq $31, 192($21)
.align 4
5:
ldq $0,0($17)
ldq $1,0($18)
ldq $2,0($19)
ldq $3,0($20)
ldq $4,0($21)
ldq $5,8($17)
ldq $6,8($18)
ldq $7,8($19)
ldq $22,8($20)
ldq $23,8($21)
ldq $24,16($17)
ldq $25,16($18)
ldq $27,16($19)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $28,16($20)
xor $2,$3,$3 # 6 cycles from $3 load
ldq $0,16($21)
xor $1,$3,$3
ldq $1,24($17)
xor $3,$4,$4 # 7 cycles from $4 load
stq $4,0($17)
xor $5,$6,$6 # 7 cycles from $6 load
xor $7,$22,$22 # 7 cycles from $22 load
xor $6,$23,$23 # 7 cycles from $23 load
ldq $2,24($18)
xor $22,$23,$23
ldq $3,24($19)
xor $24,$25,$25 # 8 cycles from $25 load
stq $23,8($17)
xor $25,$27,$27 # 8 cycles from $27 load
ldq $4,24($20)
xor $28,$0,$0 # 7 cycles from $0 load
ldq $5,24($21)
xor $27,$0,$0
ldq $6,32($17)
ldq $7,32($18)
stq $0,16($17)
xor $1,$2,$2 # 6 cycles from $2 load
ldq $22,32($19)
xor $3,$4,$4 # 4 cycles from $4 load
ldq $23,32($20)
xor $2,$4,$4
ldq $24,32($21)
ldq $25,40($17)
ldq $27,40($18)
ldq $28,40($19)
ldq $0,40($20)
xor $4,$5,$5 # 7 cycles from $5 load
stq $5,24($17)
xor $6,$7,$7 # 7 cycles from $7 load
ldq $1,40($21)
ldq $2,48($17)
ldq $3,48($18)
xor $7,$22,$22 # 7 cycles from $22 load
ldq $4,48($19)
xor $23,$24,$24 # 6 cycles from $24 load
ldq $5,48($20)
xor $22,$24,$24
ldq $6,48($21)
xor $25,$27,$27 # 7 cycles from $27 load
stq $24,32($17)
xor $27,$28,$28 # 8 cycles from $28 load
ldq $7,56($17)
xor $0,$1,$1 # 6 cycles from $1 load
ldq $22,56($18)
ldq $23,56($19)
ldq $24,56($20)
ldq $25,56($21)
ldq $31,256($17)
xor $28,$1,$1
ldq $31,256($18)
xor $2,$3,$3 # 9 cycles from $3 load
ldq $31,256($19)
xor $3,$4,$4 # 9 cycles from $4 load
ldq $31,256($20)
xor $5,$6,$6 # 8 cycles from $6 load
stq $1,40($17)
xor $4,$6,$6
xor $7,$22,$22 # 7 cycles from $22 load
xor $23,$24,$24 # 6 cycles from $24 load
stq $6,48($17)
xor $22,$24,$24
ldq $31,256($21)
xor $24,$25,$25 # 8 cycles from $25 load
stq $25,56($17)
subq $16,1,$16
addq $21,64,$21
addq $20,64,$20
addq $19,64,$19
addq $18,64,$18
addq $17,64,$17
bgt $16,5b
ret
.end xor_alpha_prefetch_5
");
static struct xor_block_template xor_block_alpha = {
name: "alpha",
do_2: xor_alpha_2,
do_3: xor_alpha_3,
do_4: xor_alpha_4,
do_5: xor_alpha_5,
};
static struct xor_block_template xor_block_alpha_prefetch = {
name: "alpha prefetch",
do_2: xor_alpha_prefetch_2,
do_3: xor_alpha_prefetch_3,
do_4: xor_alpha_prefetch_4,
do_5: xor_alpha_prefetch_5,
};
/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_alpha); \
xor_speed(&xor_block_alpha_prefetch); \
} while (0)
/* Force the use of alpha_prefetch if EV6, as it is significantly
faster in the cold cache case. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
(implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
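At run time the RAID5 code then calls through whichever template won the
calibration (or the EV6 override above) via its do_N members, just as the
dispatch code earlier in this patch does; a minimal sketch of such a call:
	/* p0 ^= p1 over `bytes' bytes */
	active_template->do_2(bytes, p0, p1);
	/* p0 ^= p1 ^ p2 ^ p3 ^ p4 */
	active_template->do_5(bytes, p0, p1, p2, p3, p4);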
#include <asm-generic/xor.h>
/*
* include/asm-generic/xor.h
*
* Generic optimized RAID-5 checksumming functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
static void
xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0];
p1[1] ^= p2[1];
p1[2] ^= p2[2];
p1[3] ^= p2[3];
p1[4] ^= p2[4];
p1[5] ^= p2[5];
p1[6] ^= p2[6];
p1[7] ^= p2[7];
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
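/*
 * A hedged usage sketch: `bytes' is assumed to be a non-zero multiple of
 * 8 * sizeof(long), and the result is accumulated into p1 in place.  The
 * XOR_SKETCH_SELFTEST guard is an assumption so this snippet stays out of
 * any real build.
 */
#ifdef XOR_SKETCH_SELFTEST
#include <assert.h>

int main(void)
{
	unsigned long a[16], b[16];
	unsigned long i;

	for (i = 0; i < 16; i++) {
		a[i] = i;
		b[i] = ~i;
	}
	xor_8regs_2(sizeof(a), a, b);	/* 16 longs -> two inner iterations */
	for (i = 0; i < 16; i++)
		assert(a[i] == ~0UL);	/* i ^ ~i sets every bit */
	return 0;
}
#endif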
static void
xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0];
p1[1] ^= p2[1] ^ p3[1];
p1[2] ^= p2[2] ^ p3[2];
p1[3] ^= p2[3] ^ p3[3];
p1[4] ^= p2[4] ^ p3[4];
p1[5] ^= p2[5] ^ p3[5];
p1[6] ^= p2[6] ^ p3[6];
p1[7] ^= p2[7] ^ p3[7];
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static void
xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
p1[0] = d0; /* Store the result (in bursts) */

p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
static void
xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
d0 ^= p5[0];
d1 ^= p5[1];
d2 ^= p5[2];
d3 ^= p5[3];
d4 ^= p5[4];
d5 ^= p5[5];
d6 ^= p5[6];
d7 ^= p5[7];
p1[0] = d0; /* Store the result (in bursts) */
p1[1] = d1;
p1[2] = d2;
p1[3] = d3;
p1[4] = d4;
p1[5] = d5;
p1[6] = d6;
p1[7] = d7;
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static struct xor_block_template xor_block_8regs = {
name: "8regs",
do_2: xor_8regs_2,
do_3: xor_8regs_3,
do_4: xor_8regs_4,
do_5: xor_8regs_5,
};
static struct xor_block_template xor_block_32regs = {
name: "32regs",
do_2: xor_32regs_2,
do_3: xor_32regs_3,
do_4: xor_32regs_4,
do_5: xor_32regs_5,
};
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
} while (0)
/*
* include/asm-i386/xor.h
*
* Optimized RAID-5 checksumming functions for MMX and SSE.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* High-speed RAID5 checksumming functions utilizing MMX instructions.
* Copyright (C) 1998 Ingo Molnar.
*/
#define FPU_SAVE \
do { \
if (!(current->flags & PF_USEDFPU)) \
__asm__ __volatile__ (" clts;\n"); \
__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
} while (0)
#define FPU_RESTORE \
do { \
__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
if (!(current->flags & PF_USEDFPU)) \
stts(); \
} while (0)
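/*
 * Why the save/restore dance around these routines (an explanatory aside,
 * not part of the patch): the MMX registers alias the x87 FPU state, which
 * in the kernel normally belongs to user space.  FPU_SAVE unconditionally
 * spills that state into the local fpu_save buffer with fsave, and
 * FPU_RESTORE puts it back with frstor; when the current task was not
 * using the FPU (no PF_USEDFPU), the TS bit in cr0 is additionally cleared
 * with clts first so the MMX instructions do not fault, and set again with
 * stts on the way out so a later user FPU access still traps and reloads
 * cleanly.
 */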
#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
ST(i,0) \
XO1(i+1,1) \
ST(i+1,1) \
XO1(i+2,2) \
ST(i+2,2) \
XO1(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
ST(i,0) \
XO2(i+1,1) \
ST(i+1,1) \
XO2(i+2,2) \
ST(i+2,2) \
XO2(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
ST(i,0) \
XO3(i+1,1) \
ST(i+1,1) \
XO3(i+2,2) \
ST(i+2,2) \
XO3(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory");
FPU_RESTORE;
}
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 7;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
ST(i,0) \
XO4(i+1,1) \
ST(i+1,1) \
XO4(i+2,2) \
ST(i+2,2) \
XO4(i+3,3) \
ST(i+3,3)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $128, %1 ;\n"
" addl $128, %2 ;\n"
" addl $128, %3 ;\n"
" addl $128, %4 ;\n"
" addl $128, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
FPU_RESTORE;
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq 40(%1), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
FPU_RESTORE;
}
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3)
: "memory" );
FPU_RESTORE;
}
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor (%4), %%mm0 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" movq 32(%1), %%mm4 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" pxor 16(%4), %%mm2 ;\n"
" movq %%mm1, 8(%1) ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 40(%2), %%mm5 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 24(%4), %%mm3 ;\n"
" movq %%mm3, 24(%1) ;\n"
" movq 56(%1), %%mm7 ;\n"
" movq 48(%1), %%mm6 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 56(%2), %%mm7 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory");
FPU_RESTORE;
}
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 6;
char fpu_save[108];
FPU_SAVE;
__asm__ __volatile__ (
" .align 32,0x90 ;\n"
" 1: ;\n"
" movq (%1), %%mm0 ;\n"
" movq 8(%1), %%mm1 ;\n"
" pxor (%2), %%mm0 ;\n"
" pxor 8(%2), %%mm1 ;\n"
" movq 16(%1), %%mm2 ;\n"
" pxor (%3), %%mm0 ;\n"
" pxor 8(%3), %%mm1 ;\n"
" pxor 16(%2), %%mm2 ;\n"
" pxor (%4), %%mm0 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" pxor (%5), %%mm0 ;\n"
" pxor 8(%5), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
" pxor 16(%4), %%mm2 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%5), %%mm2 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq 32(%1), %%mm4 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%4), %%mm3 ;\n"
" pxor 32(%2), %%mm4 ;\n"
" movq 40(%1), %%mm5 ;\n"
" pxor 24(%5), %%mm3 ;\n"
" pxor 32(%3), %%mm4 ;\n"
" pxor 40(%2), %%mm5 ;\n"
" movq %%mm3, 24(%1) ;\n"
" pxor 32(%4), %%mm4 ;\n"
" pxor 40(%3), %%mm5 ;\n"
" movq 48(%1), %%mm6 ;\n"
" movq 56(%1), %%mm7 ;\n"
" pxor 32(%5), %%mm4 ;\n"
" pxor 40(%4), %%mm5 ;\n"
" pxor 48(%2), %%mm6 ;\n"
" pxor 56(%2), %%mm7 ;\n"
" movq %%mm4, 32(%1) ;\n"
" pxor 48(%3), %%mm6 ;\n"
" pxor 56(%3), %%mm7 ;\n"
" pxor 40(%5), %%mm5 ;\n"
" pxor 48(%4), %%mm6 ;\n"
" pxor 56(%4), %%mm7 ;\n"
" movq %%mm5, 40(%1) ;\n"
" pxor 48(%5), %%mm6 ;\n"
" pxor 56(%5), %%mm7 ;\n"
" movq %%mm6, 48(%1) ;\n"
" movq %%mm7, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
" addl $64, %3 ;\n"
" addl $64, %4 ;\n"
" addl $64, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "g" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
FPU_RESTORE;
}
static struct xor_block_template xor_block_pII_mmx = {
name: "pII_mmx",
do_2: xor_pII_mmx_2,
do_3: xor_pII_mmx_3,
do_4: xor_pII_mmx_4,
do_5: xor_pII_mmx_5,
};
static struct xor_block_template xor_block_p5_mmx = {
name: "p5_mmx",
do_2: xor_p5_mmx_2,
do_3: xor_p5_mmx_3,
do_4: xor_p5_mmx_4,
do_5: xor_p5_mmx_5,
};
#undef FPU_SAVE
#undef FPU_RESTORE
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
#define XMMS_SAVE \
__asm__ __volatile__ ( \
"movl %%cr0,%0 ;\n\t" \
"clts ;\n\t" \
"movups %%xmm0,(%1) ;\n\t" \
"movups %%xmm1,0x10(%1) ;\n\t" \
"movups %%xmm2,0x20(%1) ;\n\t" \
"movups %%xmm3,0x30(%1) ;\n\t" \
: "=r" (cr0) \
: "r" (xmm_save) \
: "memory")
#define XMMS_RESTORE \
__asm__ __volatile__ ( \
"sfence ;\n\t" \
"movups (%1),%%xmm0 ;\n\t" \
"movups 0x10(%1),%%xmm1 ;\n\t" \
"movups 0x20(%1),%%xmm2 ;\n\t" \
"movups 0x30(%1),%%xmm3 ;\n\t" \
"movl %0,%%cr0 ;\n\t" \
: \
: "r" (cr0), "r" (xmm_save) \
: "memory")
#define OFFS(x) "16*("#x")"
#define PF0(x) " prefetcht0 "OFFS(x)"(%1) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
PF1(i) \
PF1(i+2) \
LD(i+2,2) \
LD(i+3,3) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2)
: "memory");
XMMS_RESTORE;
}
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
PF0(i+4) \
PF0(i+6) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r"(p2), "r"(p3)
: "memory" );
XMMS_RESTORE;
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
PF0(i+4) \
PF0(i+6) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "memory" );
XMMS_RESTORE;
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 8;
char xmm_save[16*4];
int cr0;
XMMS_SAVE;
__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i+2) \
LD(i,0) \
LD(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF2(i+2) \
XO1(i,0) \
XO1(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF3(i+2) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
PF4(i) \
PF4(i+2) \
PF0(i+4) \
PF0(i+6) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
XO4(i+1,1) \
XO4(i+2,2) \
XO4(i+3,3) \
ST(i,0) \
ST(i+1,1) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" addl $256, %1 ;\n"
" addl $256, %2 ;\n"
" addl $256, %3 ;\n"
" addl $256, %4 ;\n"
" addl $256, %5 ;\n"
" decl %0 ;\n"
" jnz 1b ;\n"
:
: "r" (lines),
"r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "memory");
XMMS_RESTORE;
}
static struct xor_block_template xor_block_pIII_sse = {
name: "pIII_sse",
do_2: xor_sse_2,
do_3: xor_sse_3,
do_4: xor_sse_4,
do_5: xor_sse_5,
};
/* Also try the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \
if (md_cpu_has_mmx()) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
} \
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 cache only, depending on how the
   CPU deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
/*
* include/asm-ia64/xor.h
*
* Optimized RAID-5 checksumming functions for IA-64.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
asm ("
.text
// Assume L2 memory latency of 6 cycles.
.proc xor_ia64_2
xor_ia64_2:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 3, 0, 13, 16
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov ar.lc = in0
mov pr.rot = 1 << 16
;;
}
.rotr s1[6+1], s2[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mfb
(p[6+1]) st8.nta [r8] = d[1], 8
nop.f 0
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_2
.proc xor_ia64_3
xor_ia64_3:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 4, 0, 20, 24
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
;;
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], s3[6]
}
{ .bbb
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_3
.proc xor_ia64_4
xor_ia64_4:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 5, 0, 27, 32
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
}
{ .mfb
mov r19 = in4
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r20 = s3[6], s4[6]
;;
}
{ .mib
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r20
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_4
.proc xor_ia64_5
xor_ia64_5:
.prologue
.fframe 0
{ .mii
.save ar.pfs, r31
alloc r31 = ar.pfs, 6, 0, 34, 40
.save ar.lc, r30
mov r30 = ar.lc
.save pr, r29
mov r29 = pr
;;
}
.body
{ .mii
mov r8 = in1
mov ar.ec = 6 + 2
shr in0 = in0, 3
;;
}
{ .mmi
adds in0 = -1, in0
mov r16 = in1
mov r17 = in2
;;
}
{ .mii
mov r18 = in3
mov ar.lc = in0
mov pr.rot = 1 << 16
}
{ .mib
mov r19 = in4
mov r20 = in5
;;
}
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
.rotp p[6+2]
0: { .mmi
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
}
{ .mmi
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r21 = s3[6], s4[6]
;;
}
{ .mmi
(p[0]) ld8.nta s5[0] = [r20], 8
(p[6+1]) st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r21
;;
}
{ .mfb
(p[6]) xor d[0] = d[0], s5[6]
nop.f 0
br.ctop.dptk.few 0b
;;
}
{ .mii
mov ar.lc = r30
mov pr = r29, -1
}
{ .bbb
br.ret.sptk.few rp
}
.endp xor_ia64_5
");
static struct xor_block_template xor_block_ia64 = {
name: "ia64",
do_2: xor_ia64_2,
do_3: xor_ia64_3,
do_4: xor_ia64_4,
do_5: xor_ia64_5,
};
#define XOR_TRY_TEMPLATES xor_speed(&xor_block_ia64)
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
#include <asm-generic/xor.h>
/*
* include/asm-sparc/xor.h
*
* Optimized RAID-5 checksumming functions for 32-bit Sparc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* High speed xor_block operation for RAID4/5 utilizing the
* ldd/std SPARC instructions.
*
* Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
*/
static void
sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
static void
sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
static void
sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3), "r" (p4)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
static void
sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
int lines = bytes / (sizeof (long)) / 8;
do {
__asm__ __volatile__("
ldd [%0 + 0x00], %%g2
ldd [%0 + 0x08], %%g4
ldd [%0 + 0x10], %%o0
ldd [%0 + 0x18], %%o2
ldd [%1 + 0x00], %%o4
ldd [%1 + 0x08], %%l0
ldd [%1 + 0x10], %%l2
ldd [%1 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%2 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%2 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%2 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%2 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%3 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%3 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%3 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%3 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
ldd [%4 + 0x00], %%o4
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
ldd [%4 + 0x08], %%l0
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
ldd [%4 + 0x10], %%l2
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
ldd [%4 + 0x18], %%l4
xor %%g2, %%o4, %%g2
xor %%g3, %%o5, %%g3
xor %%g4, %%l0, %%g4
xor %%g5, %%l1, %%g5
xor %%o0, %%l2, %%o0
xor %%o1, %%l3, %%o1
xor %%o2, %%l4, %%o2
xor %%o3, %%l5, %%o3
std %%g2, [%0 + 0x00]
std %%g4, [%0 + 0x08]
std %%o0, [%0 + 0x10]
std %%o2, [%0 + 0x18]
"
:
: "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
: "g2", "g3", "g4", "g5",
"o0", "o1", "o2", "o3", "o4", "o5",
"l0", "l1", "l2", "l3", "l4", "l5");
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
static struct xor_block_template xor_block_SPARC = {
name: "SPARC",
do_2: sparc_2,
do_3: sparc_3,
do_4: sparc_4,
do_5: sparc_5,
};
/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_SPARC); \
} while (0)
/*
* include/asm-sparc64/xor.h
*
* High speed xor_block operation for RAID4/5 utilizing the
* UltraSparc Visual Instruction Set.
*
* Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Requirements:
* !(((long)dest | (long)sourceN) & (64 - 1)) &&
* !(len & 127) && len >= 256
*
* It is done in pure assembly, as otherwise gcc makes it a non-leaf
* function, which is not what we want.
*/
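/*
 * A small sketch of the requirement above as a caller-side check (an
 * illustration and an assumption -- the md layer is expected to satisfy
 * these constraints by construction rather than test them at run time).
 * Call it once per source buffer.
 */
static inline int xor_vis_args_ok(unsigned long len,
				  const void *dest, const void *src)
{
	return !(((unsigned long) dest | (unsigned long) src) & (64 - 1))
		&& !(len & 127) && len >= 256;
}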
#include <asm/pstate.h>
#include <asm/asi.h>
extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
#define _S(x) __S(x)
#define __S(x) #x
#define DEF(x) __asm__(#x " = " _S(x))
DEF(FPRS_FEF);
DEF(FPRS_DU);
DEF(ASI_BLK_P);
/* ??? We set and use %asi instead of using ASI_BLK_P directly because gas
currently does not accept symbolic constants for the ASI specifier. */
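/*
 * The two macro levels above matter (an explanatory aside, not part of the
 * patch): with a single level the argument would be stringized before
 * being expanded, so the assembler would see the macro's name instead of
 * its value:
 *
 *	#define ONE(x)	#x	ONE(FPRS_FEF) -> "FPRS_FEF"
 *	#define TWO(x)	ONE(x)	TWO(FPRS_FEF) -> the numeric value, quoted
 *
 * DEF(FPRS_FEF) therefore emits an assembler line of the form
 * "FPRS_FEF = <value>", which is what lets the asm below use the names
 * symbolically.
 */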
__asm__ ("
.text
.globl xor_vis_2
.type xor_vis_2,@function
xor_vis_2:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 128, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
2: ldda [%o1 + 64] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
stda %f16, [%o1] %asi
ldda [%o2 + 64] %asi, %f48
ldda [%o1 + 128] %asi, %f0
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
add %o1, 128, %o1
fxor %f36, %f52, %f52
add %o2, 128, %o2
fxor %f38, %f54, %f54
subcc %o0, 128, %o0
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 2b
ldda [%o2] %asi, %f16
ldda [%o1 + 64] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
stda %f16, [%o1] %asi
ldda [%o2 + 64] %asi, %f48
membar #Sync
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 + 64] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_2, .-xor_vis_2
.globl xor_vis_3
.type xor_vis_3,@function
xor_vis_3:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
3: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
add %o1, 64, %o1
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
add %o2, 64, %o2
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o1] %asi, %f0
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
add %o3, 64, %o3
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
subcc %o0, 64, %o0
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 3b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
membar #Sync
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_3, .-xor_vis_3
.globl xor_vis_4
.type xor_vis_4,@function
xor_vis_4:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
4: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
add %o1, 64, %o1
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
add %o2, 64, %o2
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
ldda [%o4] %asi, %f48
fxor %f16, %f32, %f32
fxor %f18, %f34, %f34
fxor %f20, %f36, %f36
fxor %f22, %f38, %f38
add %o3, 64, %o3
fxor %f24, %f40, %f40
fxor %f26, %f42, %f42
fxor %f28, %f44, %f44
fxor %f30, %f46, %f46
ldda [%o1] %asi, %f0
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
add %o4, 64, %o4
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
subcc %o0, 64, %o0
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 4b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f16
fxor %f2, %f18, %f18
fxor %f4, %f20, %f20
fxor %f6, %f22, %f22
fxor %f8, %f24, %f24
fxor %f10, %f26, %f26
fxor %f12, %f28, %f28
fxor %f14, %f30, %f30
ldda [%o4] %asi, %f48
fxor %f16, %f32, %f32
fxor %f18, %f34, %f34
fxor %f20, %f36, %f36
fxor %f22, %f38, %f38
fxor %f24, %f40, %f40
fxor %f26, %f42, %f42
fxor %f28, %f44, %f44
fxor %f30, %f46, %f46
membar #Sync
fxor %f32, %f48, %f48
fxor %f34, %f50, %f50
fxor %f36, %f52, %f52
fxor %f38, %f54, %f54
fxor %f40, %f56, %f56
fxor %f42, %f58, %f58
fxor %f44, %f60, %f60
fxor %f46, %f62, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_4, .-xor_vis_4
.globl xor_vis_5
.type xor_vis_5,@function
xor_vis_5:
rd %fprs, %g1
andcc %g1, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
sethi %hi(VISenter), %g1
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
sub %o0, 64, %o0
ldda [%o1] %asi, %f0
ldda [%o2] %asi, %f16
5: ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
add %o1, 64, %o1
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
add %o2, 64, %o2
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
add %o3, 64, %o3
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
ldda [%o5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
add %o4, 64, %o4
fxor %f52, %f20, %f52
fxor %f54, %f22, %f54
add %o5, 64, %o5
fxor %f56, %f24, %f56
fxor %f58, %f26, %f58
fxor %f60, %f28, %f60
fxor %f62, %f30, %f62
ldda [%o1] %asi, %f0
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
subcc %o0, 64, %o0
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1 - 64] %asi
bne,pt %xcc, 5b
ldda [%o2] %asi, %f16
ldda [%o3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
ldda [%o4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
ldda [%o5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
fxor %f52, %f20, %f52
fxor %f54, %f22, %f54
fxor %f56, %f24, %f56
fxor %f58, %f26, %f58
fxor %f60, %f28, %f60
fxor %f62, %f30, %f62
membar #Sync
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
stda %f48, [%o1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
retl
wr %g0, 0, %fprs
.size xor_vis_5, .-xor_vis_5
");
static struct xor_block_template xor_block_VIS = {
name: "VIS",
do_2: xor_vis_2,
do_3: xor_vis_3,
do_4: xor_vis_4,
do_5: xor_vis_5,
};
#define XOR_TRY_TEMPLATES xor_speed(&xor_block_VIS)
......@@ -73,7 +73,7 @@ extern struct kernel_param __setup_start, __setup_end;
* Mark functions and data as being only used at initialization
* or exit time.
*/
#define __init __attribute__ ((__section__ (".text.init")))
#define __init /* __attribute__ ((__section__ (".text.init"))) */
#define __exit __attribute__ ((unused, __section__(".text.exit")))
#define __initdata __attribute__ ((__section__ (".data.init")))
#define __exitdata __attribute__ ((unused, __section__ (".data.exit")))
......
......@@ -3,10 +3,21 @@
#include <linux/raid/md.h>
#define MAX_XOR_BLOCKS 4
#define MAX_XOR_BLOCKS 5
extern void calibrate_xor_block(void);
extern void (*xor_block)(unsigned int count,
struct buffer_head **bh_ptr);
extern void xor_block(unsigned int count, struct buffer_head **bh_ptr);
struct xor_block_template {
struct xor_block_template *next;
const char *name;
int speed;
void (*do_2)(unsigned long, unsigned long *, unsigned long *);
void (*do_3)(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
void (*do_4)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
void (*do_5)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
};
#endif
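/*
 * A minimal, user-space sketch of how the do_2..do_5 hooks above are
 * presumably dispatched on `count' by the new xor_block() entry point
 * (buffer 0 doubling as the destination).  The template instance, the hook
 * bodies and the flat `bufs' argument are assumptions for illustration,
 * not part of this header or of the drivers/md code.
 */
#include <stdio.h>

struct xor_tmpl_sketch {
	void (*do_2)(unsigned long, unsigned long *, unsigned long *);
	void (*do_3)(unsigned long, unsigned long *, unsigned long *,
		     unsigned long *);
};

static void sketch_do_2(unsigned long bytes, unsigned long *p1,
			unsigned long *p2)
{
	for (; bytes >= sizeof(long); bytes -= sizeof(long))
		*p1++ ^= *p2++;
}

static void sketch_do_3(unsigned long bytes, unsigned long *p1,
			unsigned long *p2, unsigned long *p3)
{
	for (; bytes >= sizeof(long); bytes -= sizeof(long))
		*p1++ ^= *p2++ ^ *p3++;
}

static struct xor_tmpl_sketch active_sketch = { sketch_do_2, sketch_do_3 };

static void xor_block_sketch(unsigned int count, unsigned long bytes,
			     unsigned long **bufs)
{
	switch (count) {
	case 2:
		active_sketch.do_2(bytes, bufs[0], bufs[1]);
		break;
	case 3:
		active_sketch.do_3(bytes, bufs[0], bufs[1], bufs[2]);
		break;
	default:	/* counts 4 and 5 would use do_4/do_5 the same way */
		break;
	}
}

int main(void)
{
	unsigned long a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
	unsigned long *bufs[2] = { a, b };

	xor_block_sketch(2, sizeof(a), bufs);
	printf("%lu %lu %lu %lu\n", a[0], a[1], a[2], a[3]);	/* 5 1 1 5 */
	return 0;
}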
......@@ -486,10 +486,6 @@ EXPORT_SYMBOL(remove_inode_hash);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
EXPORT_SYMBOL(__down);
EXPORT_SYMBOL(__down_interruptible);
EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);
EXPORT_SYMBOL(brw_page);
#ifdef CONFIG_UID16
......
......@@ -432,16 +432,28 @@ static inline void __schedule_tail(struct task_struct *prev)
#ifdef CONFIG_SMP
int policy;
/*
* prev->policy can be written from here only before `prev'
* can be scheduled (before setting prev->has_cpu to zero).
* Of course it must also be read before allowing prev
* to be rescheduled, but since the write depends on the read
* to complete, wmb() is enough.  (The spin_lock() acquired
* before setting has_cpu is not enough, because the common
* spin_lock() semantics allow code outside the critical
* section to be reordered into the critical section.)
*/
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
wmb();
/*
* fast path falls through. We have to clear has_cpu before
* checking prev->state to avoid a wakeup race - thus we
* also have to protect against the task exiting early.
*/
task_lock(prev);
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
prev->has_cpu = 0;
wmb();
mb();
if (prev->state == TASK_RUNNING)
goto needs_resched;
......
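/*
 * A user-space analogy (C11 atomics plus pthreads; an illustration, not
 * kernel code) of the ordering requirement in the hunk above: each side
 * stores its flag and then loads the other's, and only a full store-load
 * barrier -- the mb(), modelled here with seq_cst fences -- forbids the
 * outcome where both sides read the stale value; a write-only barrier such
 * as wmb() does not.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int has_cpu = 1;	/* plays prev->has_cpu */
static atomic_int state;	/* 0 = sleeping, 1 = TASK_RUNNING */
static int saw_running, saw_free;

static void *schedule_tail_side(void *unused)
{
	(void) unused;
	atomic_store_explicit(&has_cpu, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the mb() */
	saw_running = atomic_load_explicit(&state, memory_order_relaxed);
	return NULL;
}

static void *wakeup_side(void *unused)
{
	(void) unused;
	atomic_store_explicit(&state, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	saw_free = !atomic_load_explicit(&has_cpu, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, schedule_tail_side, NULL);
	pthread_create(&b, NULL, wakeup_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With full fences at least one side must observe the other's
	 * store, so a newly woken task can never be left both runnable
	 * and unclaimed.  With release-only fences both could read 0. */
	printf("saw_running=%d saw_free=%d\n", saw_running, saw_free);
	return 0;
}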