RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster: raid6: vx128x8 gen() 19705 MB/s raid6: vx128x8 xor() 11886 MB/s raid6: using algorithm vx128x8 gen() 19705 MB/s raid6: .... xor() 11886 MB/s, rmw enabled vs the software algorithms: raid6: int64x1 gen() 3018 MB/s raid6: int64x1 xor() 1429 MB/s raid6: int64x2 gen() 4661 MB/s raid6: int64x2 xor() 3143 MB/s raid6: int64x4 gen() 5392 MB/s raid6: int64x4 xor() 3509 MB/s raid6: int64x8 gen() 4441 MB/s raid6: int64x8 xor() 3207 MB/s raid6: using algorithm int64x4 gen() 5392 MB/s raid6: .... xor() 3509 MB/s, rmw enabled Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

RAID/s390: add SIMD implementation for raid6 gen/xor
Using vector registers is slightly faster: raid6: vx128x8 gen() 19705 MB/s raid6: vx128x8 xor() 11886 MB/s raid6: using algorithm vx128x8 gen() 19705 MB/s raid6: .... xor() 11886 MB/s, rmw enabled vs the software algorithms: raid6: int64x1 gen() 3018 MB/s raid6: int64x1 xor() 1429 MB/s raid6: int64x2 gen() 4661 MB/s raid6: int64x2 xor() 3143 MB/s raid6: int64x4 gen() 5392 MB/s raid6: int64x4 xor() 3509 MB/s raid6: int64x8 gen() 4441 MB/s raid6: int64x8 xor() 3207 MB/s raid6: using algorithm int64x4 gen() 5392 MB/s raid6: .... xor() 3509 MB/s, rmw enabled Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
474fd6e8 · Martin Schwidefsky · 8f149ea6 · 474fd6e8 · 474fd6e8 · 474fd6e8
Commit 474fd6e8 authored Aug 23, 2016 by Martin Schwidefsky
6 changed files
--- a/arch/s390/include/asm/vx-insn.h
+++ b/arch/s390/include/asm/vx-insn.h
@@ -278,6 +278,15 @@
 	VLVG	\v, \gr, \index, 3
 .endm

+/* VECTOR LOAD REGISTER */
+.macro	VLR	v1, v2
+	VX_NUM	v1, \v1
+	VX_NUM	v2, \v2
+	.word	0xE700 | ((v1&15) << 4) | (v2&15)
+	.word	0
+	MRXBOPC	0, 0x56, v1, v2
+.endm
+
 /* VECTOR LOAD */
 .macro	VL	v, disp, index="%r0", base
 	VX_NUM	v1, \v
@@ -404,6 +413,16 @@

 /* Vector integer instructions */

+/* VECTOR AND */
+.macro	VN	vr1, vr2, vr3
+	VX_NUM	v1, \vr1
+	VX_NUM	v2, \vr2
+	VX_NUM	v3, \vr3
+	.word	0xE700 | ((v1&15) << 4) | (v2&15)
+	.word	((v3&15) << 12)
+	MRXBOPC	0, 0x68, v1, v2, v3
+.endm
+
 /* VECTOR EXCLUSIVE OR */
 .macro	VX	vr1, vr2, vr3
 	VX_NUM	v1, \vr1
@@ -469,6 +488,73 @@
 	MRXBOPC	0, 0x7D, v1, v2, v3
 .endm

+/* VECTOR REPLICATE IMMEDIATE */
+.macro	VREPI	vr1, imm2, m3
+	VX_NUM	v1, \vr1
+	.word	0xE700 | ((v1&15) << 4)
+	.word	\imm2
+	MRXBOPC	\m3, 0x45, v1
+.endm
+.macro	VREPIB	vr1, imm2
+	VREPI	\vr1, \imm2, 0
+.endm
+.macro	VREPIH	vr1, imm2
+	VREPI	\vr1, \imm2, 1
+.endm
+.macro	VREPIF	vr1, imm2
+	VREPI	\vr1, \imm2, 2
+.endm
+.macro	VREPIG	vr1, imm2
+	VREP	\vr1, \imm2, 3
+.endm
+
+/* VECTOR ADD */
+.macro	VA	vr1, vr2, vr3, m4
+	VX_NUM	v1, \vr1
+	VX_NUM	v2, \vr2
+	VX_NUM	v3, \vr3
+	.word	0xE700 | ((v1&15) << 4) | (v2&15)
+	.word	((v3&15) << 12)
+	MRXBOPC	\m4, 0xF3, v1, v2, v3
+.endm
+.macro	VAB	vr1, vr2, vr3
+	VA	\vr1, \vr2, \vr3, 0
+.endm
+.macro	VAH	vr1, vr2, vr3
+	VA	\vr1, \vr2, \vr3, 1
+.endm
+.macro	VAF	vr1, vr2, vr3
+	VA	\vr1, \vr2, \vr3, 2
+.endm
+.macro	VAG	vr1, vr2, vr3
+	VA	\vr1, \vr2, \vr3, 3
+.endm
+.macro	VAQ	vr1, vr2, vr3
+	VA	\vr1, \vr2, \vr3, 4
+.endm
+
+/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
+.macro	VESRAV	vr1, vr2, vr3, m4
+	VX_NUM	v1, \vr1
+	VX_NUM	v2, \vr2
+	VX_NUM	v3, \vr3
+	.word	0xE700 | ((v1&15) << 4) | (v2&15)
+	.word	((v3&15) << 12)
+	MRXBOPC \m4, 0x7A, v1, v2, v3
+.endm
+
+.macro	VESRAVB	vr1, vr2, vr3
+	VESRAV	\vr1, \vr2, \vr3, 0
+.endm
+.macro	VESRAVH	vr1, vr2, vr3
+	VESRAV	\vr1, \vr2, \vr3, 1
+.endm
+.macro	VESRAVF	vr1, vr2, vr3
+	VESRAV	\vr1, \vr2, \vr3, 2
+.endm
+.macro	VESRAVG	vr1, vr2, vr3
+	VESRAV	\vr1, \vr2, \vr3, 3
+.endm

 #endif	/* __ASSEMBLY__ */
 #endif	/* __ASM_S390_VX_INSN_H */
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
 extern const struct raid6_calls raid6_tilegx8;
+extern const struct raid6_calls raid6_s390vx8;

 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);

--- a/lib/raid6/.gitignore
+++ b/lib/raid6/.gitignore
@@ -3,3 +3,4 @@ altivec*.c
 int*.c
 tables.c
 neon?.c
+s390vx?.c
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
+raid6_pq-$(CONFIG_S390) += s390vx8.o

 hostprogs-y	+= mktables

@@ -116,6 +117,11 @@ $(obj)/tilegx8.c:   UNROLL := 8
 $(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)

+targets += s390vx8.c
+$(obj)/s390vx8.c:   UNROLL := 8
+$(obj)/s390vx8.c:   $(src)/s390vx.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 quiet_cmd_mktable = TABLE   $@
      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )


--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -68,6 +68,9 @@ const struct raid6_calls * const raid6_algos[] = {
 #endif
 #if defined(CONFIG_TILEGX)
 	&raid6_tilegx8,
+#endif
+#if defined(CONFIG_S390)
+	&raid6_s390vx8,
 #endif
 	&raid6_intx1,
 	&raid6_intx2,

--- a/lib/raid6/s390vx.uc
+++ b/lib/raid6/s390vx.uc
+/*
+ * raid6_vx$#.c
+ *
+ * $#-way unrolled RAID6 gen/xor functions for s390
+ * based on the vector facility
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *
+ * This file is postprocessed using unroll.awk.
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+asm(".include \"asm/vx-insn.h\"\n");
+
+#define NSIZE 16
+
+static inline void LOAD_CONST(void)
+{
+	asm volatile("VREPIB %v24,7");
+	asm volatile("VREPIB %v25,0x1d");
+}
+
+/*
+ * The SHLBYTE() operation shifts each of the 16 bytes in
+ * vector register y left by 1 bit and stores the result in
+ * vector register x.
+ */
+static inline void SHLBYTE(int x, int y)
+{
+	asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
+}
+
+/*
+ * For each of the 16 bytes in the vector register y the MASK()
+ * operation returns 0xFF if the high bit of the byte is 1,
+ * or 0x00 if the high bit is 0. The result is stored in vector
+ * register x.
+ */
+static inline void MASK(int x, int y)
+{
+	asm volatile ("VESRAVB	%0,%1,24" : : "i" (x), "i" (y));
+}
+
+static inline void AND(int x, int y, int z)
+{
+	asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+
+static inline void XOR(int x, int y, int z)
+{
+	asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+
+static inline void LOAD_DATA(int x, int n, u8 *ptr)
+{
+	typedef struct { u8 _[16*n]; } addrtype;
+	register addrtype *__ptr asm("1") = (addrtype *) ptr;
+
+	asm volatile ("VLM %2,%3,0,%r1"
+		      : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+
+static inline void STORE_DATA(int x, int n, u8 *ptr)
+{
+	typedef struct { u8 _[16*n]; } addrtype;
+	register addrtype *__ptr asm("1") = (addrtype *) ptr;
+
+	asm volatile ("VSTM %2,%3,0,1"
+		      : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+
+static inline void COPY_VEC(int x, int y)
+{
+	asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
+}
+
+static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	struct kernel_fpu vxstate;
+	u8 **dptr, *p, *q;
+	int d, z, z0;
+
+	kernel_fpu_begin(&vxstate, KERNEL_VXR);
+	LOAD_CONST();
+
+	dptr = (u8 **) ptrs;
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0 + 1];	/* XOR parity */
+	q = dptr[z0 + 2];	/* RS syndrome */
+
+	for (d = 0; d < bytes; d += $#*NSIZE) {
+		LOAD_DATA(0,$#,&dptr[z0][d]);
+		COPY_VEC(8+$$,0+$$);
+		for (z = z0 - 1; z >= 0; z--) {
+			MASK(16+$$,8+$$);
+			AND(16+$$,16+$$,25);
+			SHLBYTE(8+$$,8+$$);
+			XOR(8+$$,8+$$,16+$$);
+			LOAD_DATA(16,$#,&dptr[z][d]);
+			XOR(0+$$,0+$$,16+$$);
+			XOR(8+$$,8+$$,16+$$);
+		}
+		STORE_DATA(0,$#,&p[d]);
+		STORE_DATA(8,$#,&q[d]);
+	}
+	kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
+					size_t bytes, void **ptrs)
+{
+	struct kernel_fpu vxstate;
+	u8 **dptr, *p, *q;
+	int d, z, z0;
+
+	dptr = (u8 **) ptrs;
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks - 2];	/* XOR parity */
+	q = dptr[disks - 1];	/* RS syndrome */
+
+	kernel_fpu_begin(&vxstate, KERNEL_VXR);
+	LOAD_CONST();
+
+	for (d = 0; d < bytes; d += $#*NSIZE) {
+		/* P/Q data pages */
+		LOAD_DATA(0,$#,&dptr[z0][d]);
+		COPY_VEC(8+$$,0+$$);
+		for (z = z0 - 1; z >= start; z--) {
+			MASK(16+$$,8+$$);
+			AND(16+$$,16+$$,25);
+			SHLBYTE(8+$$,8+$$);
+			XOR(8+$$,8+$$,16+$$);
+			LOAD_DATA(16,$#,&dptr[z][d]);
+			XOR(0+$$,0+$$,16+$$);
+			XOR(8+$$,8+$$,16+$$);
+		}
+		/* P/Q left side optimization */
+		for (z = start - 1; z >= 0; z--) {
+			MASK(16+$$,8+$$);
+			AND(16+$$,16+$$,25);
+			SHLBYTE(8+$$,8+$$);
+			XOR(8+$$,8+$$,16+$$);
+		}
+		LOAD_DATA(16,$#,&p[d]);
+		XOR(16+$$,16+$$,0+$$);
+		STORE_DATA(16,$#,&p[d]);
+		LOAD_DATA(16,$#,&q[d]);
+		XOR(16+$$,16+$$,8+$$);
+		STORE_DATA(16,$#,&q[d]);
+	}
+	kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static int raid6_s390vx$#_valid(void)
+{
+	return MACHINE_HAS_VX;
+}
+
+const struct raid6_calls raid6_s390vx$# = {
+	raid6_s390vx$#_gen_syndrome,
+	raid6_s390vx$#_xor_syndrome,
+	raid6_s390vx$#_valid,
+	"vx128x$#",
+	1
+};