Commit 297565aa authored by Ard Biesheuvel's avatar Ard Biesheuvel Committed by Herbert Xu

lib/xor: make xor prototypes more friendly to compiler vectorization

Modern compilers are perfectly capable of extracting parallelism from
the XOR routines, provided that the prototypes reflect the nature of the
input accurately, in particular, the fact that the input vectors are
expected not to overlap. This is not documented explicitly, but is
implied by the interchangeability of the various C routines, some of
which use temporary variables while others don't: this means that these
routines only behave identically for non-overlapping inputs.

So let's decorate these input vectors with the __restrict modifier,
which informs the compiler that there is no overlap. While at it, make
the input-only vectors pointer-to-const as well.
Tested-by: default avatarNathan Chancellor <nathan@kernel.org>
Signed-off-by: default avatarArd Biesheuvel <ardb@kernel.org>
Reviewed-by: default avatarNick Desaulniers <ndesaulniers@google.com>
Link: https://github.com/ClangBuiltLinux/linux/issues/563Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent e8bf24bd
......@@ -5,24 +5,43 @@
* Optimized RAID-5 checksumming functions for alpha EV5 and EV6
*/
extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
extern void
xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
extern void
xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
extern void
xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
extern void
xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *);
extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void
xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
extern void
xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
extern void
xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
extern void
xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
asm(" \n\
.text \n\
......
......@@ -44,7 +44,8 @@
: "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))
static void
xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned int lines = bytes / sizeof(unsigned long) / 4;
register unsigned int a1 __asm__("r4");
......@@ -64,8 +65,9 @@ xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned int lines = bytes / sizeof(unsigned long) / 4;
register unsigned int a1 __asm__("r4");
......@@ -86,8 +88,10 @@ xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned int lines = bytes / sizeof(unsigned long) / 2;
register unsigned int a1 __asm__("r8");
......@@ -105,8 +109,11 @@ xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_arm4regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned int lines = bytes / sizeof(unsigned long) / 2;
register unsigned int a1 __asm__("r8");
......@@ -146,7 +153,8 @@ static struct xor_block_template xor_block_arm4regs = {
extern struct xor_block_template const xor_block_neon_inner;
static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
if (in_interrupt()) {
xor_arm4regs_2(bytes, p1, p2);
......@@ -158,8 +166,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
if (in_interrupt()) {
xor_arm4regs_3(bytes, p1, p2, p3);
......@@ -171,8 +180,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
if (in_interrupt()) {
xor_arm4regs_4(bytes, p1, p2, p3, p4);
......@@ -184,8 +195,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
if (in_interrupt()) {
xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
......
......@@ -16,7 +16,8 @@
extern struct xor_block_template const xor_block_inner_neon;
static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
kernel_neon_begin();
xor_block_inner_neon.do_2(bytes, p1, p2);
......@@ -24,8 +25,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
kernel_neon_begin();
xor_block_inner_neon.do_3(bytes, p1, p2, p3);
......@@ -33,8 +35,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
kernel_neon_begin();
xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
......@@ -42,8 +46,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
kernel_neon_begin();
xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
......
......@@ -10,8 +10,8 @@
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
unsigned long *p2)
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -37,8 +37,9 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -72,8 +73,10 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3, unsigned long *p4)
void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -115,9 +118,11 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3,
unsigned long *p4, unsigned long *p5)
void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -186,8 +191,10 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
return res;
}
static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
static void xor_arm64_eor3_3(unsigned long bytes,
unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -219,9 +226,11 @@ static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3,
unsigned long *p4)
static void xor_arm64_eor3_4(unsigned long bytes,
unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......@@ -261,9 +270,12 @@ static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3,
unsigned long *p4, unsigned long *p5)
static void xor_arm64_eor3_5(unsigned long bytes,
unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
......
......@@ -4,13 +4,20 @@
*/
extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
extern void xor_ia64_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
extern void xor_ia64_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
extern void xor_ia64_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
extern void xor_ia64_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
static struct xor_block_template xor_block_ia64 = {
.name = "ia64",
......
......@@ -3,17 +3,20 @@
#define _ASM_POWERPC_XOR_ALTIVEC_H
#ifdef CONFIG_ALTIVEC
void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in);
void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in);
void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in);
void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in, unsigned long *v5_in);
void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
#endif
#endif /* _ASM_POWERPC_XOR_ALTIVEC_H */
......@@ -49,8 +49,9 @@ typedef vector signed char unative_t;
V1##_3 = vec_xor(V1##_3, V2##_3); \
} while (0)
void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in)
void __xor_altivec_2(unsigned long bytes,
unsigned long * __restrict v1_in,
const unsigned long * __restrict v2_in)
{
DEFINE(v1);
DEFINE(v2);
......@@ -67,8 +68,10 @@ void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
} while (--lines > 0);
}
void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in)
void __xor_altivec_3(unsigned long bytes,
unsigned long * __restrict v1_in,
const unsigned long * __restrict v2_in,
const unsigned long * __restrict v3_in)
{
DEFINE(v1);
DEFINE(v2);
......@@ -89,9 +92,11 @@ void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
} while (--lines > 0);
}
void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in)
void __xor_altivec_4(unsigned long bytes,
unsigned long * __restrict v1_in,
const unsigned long * __restrict v2_in,
const unsigned long * __restrict v3_in,
const unsigned long * __restrict v4_in)
{
DEFINE(v1);
DEFINE(v2);
......@@ -116,9 +121,12 @@ void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
} while (--lines > 0);
}
void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in, unsigned long *v5_in)
void __xor_altivec_5(unsigned long bytes,
unsigned long * __restrict v1_in,
const unsigned long * __restrict v2_in,
const unsigned long * __restrict v3_in,
const unsigned long * __restrict v4_in,
const unsigned long * __restrict v5_in)
{
DEFINE(v1);
DEFINE(v2);
......
......@@ -6,16 +6,17 @@
* outside of the enable/disable altivec block.
*/
void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in);
void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in);
void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in);
void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in, unsigned long *v5_in);
void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
......@@ -12,47 +12,51 @@
#include <asm/xor_altivec.h>
#include "xor_vmx.h"
void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in)
void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
preempt_disable();
enable_kernel_altivec();
__xor_altivec_2(bytes, v1_in, v2_in);
__xor_altivec_2(bytes, p1, p2);
disable_kernel_altivec();
preempt_enable();
}
EXPORT_SYMBOL(xor_altivec_2);
void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in)
void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
preempt_disable();
enable_kernel_altivec();
__xor_altivec_3(bytes, v1_in, v2_in, v3_in);
__xor_altivec_3(bytes, p1, p2, p3);
disable_kernel_altivec();
preempt_enable();
}
EXPORT_SYMBOL(xor_altivec_3);
void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in)
void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
preempt_disable();
enable_kernel_altivec();
__xor_altivec_4(bytes, v1_in, v2_in, v3_in, v4_in);
__xor_altivec_4(bytes, p1, p2, p3, p4);
disable_kernel_altivec();
preempt_enable();
}
EXPORT_SYMBOL(xor_altivec_4);
void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
unsigned long *v2_in, unsigned long *v3_in,
unsigned long *v4_in, unsigned long *v5_in)
void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
preempt_disable();
enable_kernel_altivec();
__xor_altivec_5(bytes, v1_in, v2_in, v3_in, v4_in, v5_in);
__xor_altivec_5(bytes, p1, p2, p3, p4, p5);
disable_kernel_altivec();
preempt_enable();
}
......
......@@ -11,7 +11,8 @@
#include <linux/raid/xor.h>
#include <asm/xor.h>
static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
asm volatile(
" larl 1,2f\n"
......@@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
: "0", "1", "cc", "memory");
}
static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
asm volatile(
" larl 1,2f\n"
......@@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
asm volatile(
" larl 1,2f\n"
......@@ -88,8 +92,11 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
asm volatile(
" larl 1,2f\n"
......
......@@ -13,7 +13,8 @@
*/
static void
sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
sparc_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
int lines = bytes / (sizeof (long)) / 8;
......@@ -50,8 +51,9 @@ sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
sparc_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
int lines = bytes / (sizeof (long)) / 8;
......@@ -101,8 +103,10 @@ sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
sparc_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
int lines = bytes / (sizeof (long)) / 8;
......@@ -165,8 +169,11 @@ sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
sparc_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
int lines = bytes / (sizeof (long)) / 8;
......
......@@ -12,13 +12,20 @@
#include <asm/spitfire.h>
void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
void xor_vis_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
void xor_vis_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
/* XXX Ugh, write cheetah versions... -DaveM */
......@@ -30,13 +37,20 @@ static struct xor_block_template xor_block_VIS = {
.do_5 = xor_vis_5,
};
void xor_niagara_2(unsigned long, unsigned long *, unsigned long *);
void xor_niagara_3(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
void xor_niagara_4(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
void xor_niagara_5(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2);
void xor_niagara_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3);
void xor_niagara_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4);
void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5);
static struct xor_block_template xor_block_niagara = {
.name = "Niagara",
......
......@@ -57,7 +57,8 @@
op(i + 3, 3)
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
......@@ -108,7 +109,8 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
......@@ -142,8 +144,9 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
......@@ -201,8 +204,9 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
......@@ -238,8 +242,10 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
......@@ -304,8 +310,10 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
......@@ -343,8 +351,11 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
......@@ -416,8 +427,11 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
......
......@@ -21,7 +21,8 @@
#include <asm/fpu/api.h>
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 7;
......@@ -64,8 +65,9 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 7;
......@@ -113,8 +115,10 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 7;
......@@ -168,8 +172,11 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 7;
......@@ -248,7 +255,8 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
#undef BLOCK
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 6;
......@@ -295,8 +303,9 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 6;
......@@ -352,8 +361,10 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 6;
......@@ -418,8 +429,11 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 6;
......
......@@ -26,7 +26,8 @@
BLOCK4(8) \
BLOCK4(12)
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
const unsigned long * __restrict p1)
{
unsigned long lines = bytes >> 9;
......@@ -52,8 +53,9 @@ do { \
kernel_fpu_end();
}
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2)
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
const unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 9;
......@@ -82,8 +84,10 @@ do { \
kernel_fpu_end();
}
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
const unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 9;
......@@ -115,8 +119,11 @@ do { \
kernel_fpu_end();
}
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2, unsigned long *p3, unsigned long *p4)
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
const unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 9;
......
......@@ -8,7 +8,8 @@
#include <linux/prefetch.h>
static void
xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -27,8 +28,9 @@ xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -48,8 +50,10 @@ xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -70,8 +74,11 @@ xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -93,7 +100,8 @@ xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -129,8 +137,9 @@ xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -175,8 +184,10 @@ xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -230,8 +241,11 @@ xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
long lines = bytes / (sizeof (long)) / 8;
......@@ -294,7 +308,8 @@ xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
prefetchw(p1);
......@@ -320,8 +335,9 @@ xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
prefetchw(p1);
......@@ -350,8 +366,10 @@ xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......@@ -384,8 +402,11 @@ xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......@@ -421,7 +442,8 @@ xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......@@ -466,8 +488,9 @@ xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......@@ -523,8 +546,10 @@ xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......@@ -591,8 +616,11 @@ xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
long lines = bytes / (sizeof (long)) / 8 - 1;
......
......@@ -11,13 +11,20 @@ struct xor_block_template {
struct xor_block_template *next;
const char *name;
int speed;
void (*do_2)(unsigned long, unsigned long *, unsigned long *);
void (*do_3)(unsigned long, unsigned long *, unsigned long *,
unsigned long *);
void (*do_4)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
void (*do_5)(unsigned long, unsigned long *, unsigned long *,
unsigned long *, unsigned long *, unsigned long *);
void (*do_2)(unsigned long, unsigned long * __restrict,
const unsigned long * __restrict);
void (*do_3)(unsigned long, unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict);
void (*do_4)(unsigned long, unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict);
void (*do_5)(unsigned long, unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict,
const unsigned long * __restrict);
};
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment