Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Merge crypto-2.6 to pick up the revert of the arm64 NEON yield changes (aes-ce-ccm, ghash-ce).

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Merge crypto-2.6 to pick up the revert of the arm64 NEON yield changes (aes-ce-ccm, ghash-ce).
3465893d · Herbert Xu · d6e43798 · f10dc56c · 3465893d · 3465893d
Commit 3465893d authored Aug 07, 2018 by Herbert Xu
3 changed files
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -19,33 +19,24 @@
 	 *			     u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
+	ldr	w8, [x3]			/* leftover from prev round? */
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	ldr	w25, [x22]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
+	cbz	w8, 1f
-	sub	w25, w25, #16
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
+	subs	w2, w2, #1
-	add	w25, w25, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
-	prfm	pldl1strm, [x20]
+	prfm	pldl1strm, [x1]
-	cmp	w24, #12			/* which key size? */
+	cmp	w5, #12				/* which key size? */
-	add	x6, x23, #16
+	add	x6, x4, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
+	mov	w8, w2
-7:	ldrb	w7, [x20], #1
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
+	strb	w6, [x0], #1
-	subs	w21, w21, #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
+8:	mov	w7, w8
-	add	w25, w25, #16
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
+	st1	{v0.16b}, [x0]
-10:	str	w25, [x22]
+10:	str	w8, [x3]
-	frame_pop
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
-	mov	x19, x0
+CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
+	prfm	pldl1strm, [x1]
-	add	x26, x26, #1
+	add	x8, x8, #1
-	rev	x9, x26
+	rev	x9, x8
-	cmp	w23, #12			/* which key size? */
+	cmp	w4, #12				/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
+	ld1	{v3.4s}, [x3]			/* load first round key */
-	add	x10, x22, #16
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
+	subs	w2, w2, #16
-	bmi	7f				/* partial block? */
+	bmi	6f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
+	st1	{v1.16b}, [x0], #16		/* write output block */
-	beq	5f
+	bne	0b
+CPU_LE(	rev	x8, x8			)
-	if_will_cond_yield_neon
+	st1	{v0.16b}, [x5]			/* store mac */
-	st1	{v0.16b}, [x24]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
-	do_cond_yield_neon
+5:	ret
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
-	b	0b
-5:
-CPU_LE(	rev	x26, x26			)
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-6:	frame_pop
-	ret
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
+	st1	{v0.16b}, [x5]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
+	add	w2, w2, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE(	rev	x26, x26			)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
+	strb	w9, [x0], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
+	strb	w7, [x5], #1			/* store mac byte */
-	subs	w21, w21, #1
+	subs	w2, w2, #1
-	beq	6b
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 	/*

--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 	.macro		pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
-	mov		x19, x0
+	load_round_keys	w7, x6
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-	mov		x25, x6
-	mov		x26, x7
-	.if		\enc == 1
-	ldr		x27, [sp, #96]			// first stacked arg
-	.endif
-	ldr		x28, [x24, #8]			// load lower counter
-CPU_LE(	rev		x28, x28	)
-0:	mov		x0, x25
-	load_round_keys	w26, x0
-	ld1		{SHASH.2d}, [x23]
-	ld1		{XL.2d}, [x20]
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 	.if		\enc == 1
-	ld1		{KS.16b}, [x27]
+	ldr		x10, [sp]
+	ld1		{KS.16b}, [x10]
 	.endif
-1:	ld1		{CTR.8b}, [x24]			// load upper counter
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
-	ld1		{INP.16b}, [x22], #16
+	ld1		{INP.16b}, [x3], #16
-	rev		x9, x28
+	rev		x9, x8
-	add		x28, x28, #1
+	add		x8, x8, #1
-	sub		w19, w19, #1
+	sub		w0, w0, #1
 	ins		CTR.d[1], x9			// set lower counter
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 	rev64		T1.16b, INP.16b
-	cmp		w26, #12
+	cmp		w7, #12
-	b.ge		4f				// AES-192/256?
+	b.ge		2f				// AES-192/256?
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(	rev		x28, x28	)
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
-	cbz		w19, 3f
+	cbnz		w0, 0b
-	if_will_cond_yield_neon
+CPU_LE(	rev		x8, x8		)
-	st1		{XL.2d}, [x20]
+	st1		{XL.2d}, [x1]
-	.if		\enc == 1
+	str		x8, [x5, #8]			// store lower counter
-	st1		{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
-	b		1b
-3:	st1		{XL.2d}, [x20]
 	.if		\enc == 1
-	st1		{KS.16b}, [x27]
+	st1		{KS.16b}, [x10]
 	.endif
-CPU_LE(	rev		x28, x28	)
-	str		x28, [x24, #8]			// store lower counter
-	frame_pop
 	ret
-4:	b.eq		5f				// AES-192?
+2:	b.eq		3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		2b
+	b		1b
 	.endm
 	/*

--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -266,6 +266,8 @@ static inline void padlock_xcrypt_ecb(const u8 *input, u8 *output, void *key,
 		return;
 	}
+	count -= initial;
 	if (initial)
 		asm volatile (".byte 0xf3,0x0f,0xa7,0xc8"	/* rep xcryptecb */
 			      : "+S"(input), "+D"(output)
@@ -273,7 +275,7 @@ static inline void padlock_xcrypt_ecb(const u8 *input, u8 *output, void *key,
 	asm volatile (".byte 0xf3,0x0f,0xa7,0xc8"	/* rep xcryptecb */
 		      : "+S"(input), "+D"(output)
-		      : "d"(control_word), "b"(key), "c"(count - initial));
+		      : "d"(control_word), "b"(key), "c"(count));
 }
 static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
@@ -284,6 +286,8 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
 	if (count < cbc_fetch_blocks)
 		return cbc_crypt(input, output, key, iv, control_word, count);
+	count -= initial;
 	if (initial)
 		asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"	/* rep xcryptcbc */
 			      : "+S" (input), "+D" (output), "+a" (iv)
@@ -291,7 +295,7 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
 	asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"	/* rep xcryptcbc */
 		      : "+S" (input), "+D" (output), "+a" (iv)
-		      : "d" (control_word), "b" (key), "c" (count-initial));
+		      : "d" (control_word), "b" (key), "c" (count));
 	return iv;
 }