Commit 00227e3a authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm/ghash-ce - implement support for 4-way aggregation

Speed up the GHASH algorithm based on 64-bit polynomial multiplication
by adding support for 4-way aggregation. This improves throughput by
~85% on Cortex-A53, from 1.7 cycles per byte to 0.9 cycles per byte.

When combined with AES into GCM, throughput improves by ~25%, from
3.8 cycles per byte to 3.0 cycles per byte.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent ab8085c1
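For context on why aggregation helps: processing four blocks B1..B4 against a running hash X one block at a time computes ((((X + B1)*H + B2)*H + B3)*H + B4)*H, a serial chain of multiply-and-reduce steps (here '+' is XOR and '*' is multiplication in GF(2^128)). The same result can be written as

    (X + B1)*H^4  +  B2*H^3  +  B3*H^2  +  B4*H

so with H^2..H^4 precomputed the four multiplications become independent and only one reduction is needed per four blocks. This identity is standard GHASH algebra rather than something introduced by the patch; the patch applies it by storing H..H^4 in struct ghash_key and issuing one __pmull_reduce_p64 per four-block loop iteration.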
arch/arm/crypto/Kconfig
@@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
 	select CRYPTO_CRYPTD
+	select CRYPTO_GF128MUL
 	help
 	  Use an implementation of GHASH (used by the GCM AEAD chaining mode)
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
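(The new CRYPTO_GF128MUL select is needed because the key-setup path in ghash-ce-glue.c below now derives H^2, H^3 and H^4 with the generic gf128mul_lle() helper.)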
arch/arm/crypto/ghash-ce-core.S
@@ -63,6 +63,33 @@
 	k48		.req	d31
 	SHASH2_p64	.req	d31
 
+	HH		.req	q10
+	HH3		.req	q11
+	HH4		.req	q12
+	HH34		.req	q13
+
+	HH_L		.req	d20
+	HH_H		.req	d21
+	HH3_L		.req	d22
+	HH3_H		.req	d23
+	HH4_L		.req	d24
+	HH4_H		.req	d25
+	HH34_L		.req	d26
+	HH34_H		.req	d27
+	SHASH2_H	.req	d29
+
+	XL2		.req	q5
+	XM2		.req	q6
+	XH2		.req	q7
+	T3		.req	q8
+
+	XL2_L		.req	d10
+	XL2_H		.req	d11
+	XM2_L		.req	d12
+	XM2_H		.req	d13
+	T3_L		.req	d16
+	T3_H		.req	d17
+
 	.text
 	.fpu		crypto-neon-fp-armv8
@@ -175,12 +202,77 @@
 		beq		0f
 		vld1.64		{T1}, [ip]
 		teq		r0, #0
-		b		1f
+		b		3f
+
+0:		.ifc		\pn, p64
+		tst		r0, #3			// skip until #blocks is a
+		bne		2f			// round multiple of 4
+
+		vld1.8		{XL2-XM2}, [r2]!
+1:		vld1.8		{T3-T2}, [r2]!
+		vrev64.8	XL2, XL2
+		vrev64.8	XM2, XM2
+
+		subs		r0, r0, #4
+
+		vext.8		T1, XL2, XL2, #8
+		veor		XL2_H, XL2_H, XL_L
+		veor		XL, XL, T1
+
+		vrev64.8	T3, T3
+		vrev64.8	T1, T2
+
+		vmull.p64	XH, HH4_H, XL_H			// a1 * b1
+		veor		XL2_H, XL2_H, XL_H
+		vmull.p64	XL, HH4_L, XL_L			// a0 * b0
+		vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
+
+		vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
+		veor		XM2_L, XM2_L, XM2_H
+		vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
+		vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
+
+		veor		XH, XH, XH2
+		veor		XL, XL, XL2
+		veor		XM, XM, XM2
+
+		vmull.p64	XH2, HH_H, T3_L			// a1 * b1
+		veor		T3_L, T3_L, T3_H
+		vmull.p64	XL2, HH_L, T3_H			// a0 * b0
+		vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
+
+		veor		XH, XH, XH2
+		veor		XL, XL, XL2
+		veor		XM, XM, XM2
+
+		vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
+		veor		T1_L, T1_L, T1_H
+		vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
+		vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
+
+		veor		XH, XH, XH2
+		veor		XL, XL, XL2
+		veor		XM, XM, XM2
 
-0:		vld1.64		{T1}, [r2]!
+		beq		4f
+
+		vld1.8		{XL2-XM2}, [r2]!
+
+		veor		T1, XL, XH
+		veor		XM, XM, T1
+
+		__pmull_reduce_p64
+
+		veor		T1, T1, XH
+		veor		XL, XL, T1
+
+		b		1b
+		.endif
+
+2:		vld1.64		{T1}, [r2]!
 		subs		r0, r0, #1
 
-1:		/* multiply XL by SHASH in GF(2^128) */
+3:		/* multiply XL by SHASH in GF(2^128) */
 #ifndef CONFIG_CPU_BIG_ENDIAN
 		vrev64.8	T1, T1
 #endif
@@ -193,7 +285,7 @@
 		__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
 		__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
 
-		veor		T1, XL, XH
+4:		veor		T1, XL, XH
 		veor		XM, XM, T1
 
 		__pmull_reduce_\pn
@@ -212,8 +304,14 @@
  *			   struct ghash_key const *k, const char *head)
  */
 ENTRY(pmull_ghash_update_p64)
-		vld1.64		{SHASH}, [r3]
+		vld1.64		{SHASH}, [r3]!
+		vld1.64		{HH}, [r3]!
+		vld1.64		{HH3-HH4}, [r3]
+
 		veor		SHASH2_p64, SHASH_L, SHASH_H
+		veor		SHASH2_H, HH_L, HH_H
+		veor		HH34_L, HH3_L, HH3_H
+		veor		HH34_H, HH4_L, HH4_H
 
 		vmov.i8		MASK, #0xe1
 		vshl.u64	MASK, MASK, #57
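The repeated a1 * b1 / a0 * b0 / (a1 + a0)(b1 + b0) pattern in the assembly above is the usual Karatsuba split of one 128x128 carry-less multiply into three 64x64 vmull.p64 products. A self-contained C sketch of that split follows; it is illustrative only (not from the patch), and clmul64() is a slow software stand-in for vmull.p64.

/*
 * Karatsuba split of a 128x128 carry-less multiply into three
 * 64x64 carry-less multiplies, mirroring the XH / XL / XM triplets
 * computed per block in ghash-ce-core.S.
 */
#include <stdint.h>
#include <stdio.h>

struct u128 { uint64_t hi, lo; };

/* software carry-less 64x64 -> 128 multiply (models vmull.p64) */
static struct u128 clmul64(uint64_t a, uint64_t b)
{
	struct u128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

int main(void)
{
	/* arbitrary 128-bit operands, split into 64-bit halves a1:a0, b1:b0 */
	uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
	uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x8796a5b4c3d2e1f0ULL;

	struct u128 hi  = clmul64(a1, b1);           /* "a1 * b1"            -> XH */
	struct u128 lo  = clmul64(a0, b0);           /* "a0 * b0"            -> XL */
	struct u128 mid = clmul64(a1 ^ a0, b1 ^ b0); /* "(a1 + a0)(b1 + b0)" -> XM */

	/* recover the schoolbook middle term a1*b0 ^ a0*b1 */
	mid.hi ^= hi.hi ^ lo.hi;
	mid.lo ^= hi.lo ^ lo.lo;

	/* fold the middle term into the 256-bit product hi:lo */
	struct u128 res_hi = hi, res_lo = lo;
	res_lo.hi ^= mid.lo;
	res_hi.lo ^= mid.hi;

	printf("%016llx%016llx%016llx%016llx\n",
	       (unsigned long long)res_hi.hi, (unsigned long long)res_hi.lo,
	       (unsigned long long)res_lo.hi, (unsigned long long)res_lo.lo);
	return 0;
}

In the assembly these three partial products are accumulated across all four blocks in XL, XM and XH, and the middle term is folded in only once, just before the single __pmull_reduce_p64.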
arch/arm/crypto/ghash-ce-glue.c
 /*
  * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
  *
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
 #define GHASH_DIGEST_SIZE	16
 
 struct ghash_key {
-	u64	a;
-	u64	b;
+	u64	h[2];
+	u64	h2[2];
+	u64	h3[2];
+	u64	h4[2];
 };
 
 struct ghash_desc_ctx {
@@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 	return 0;
 }
 
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+	u64 carry = be64_to_cpu(k->a) >> 63;
+
+	h[0] = (be64_to_cpu(k->b) << 1) | carry;
+	h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+	if (carry)
+		h[1] ^= 0xc200000000000000UL;
+}
+
 static int ghash_setkey(struct crypto_shash *tfm,
 			const u8 *inkey, unsigned int keylen)
 {
 	struct ghash_key *key = crypto_shash_ctx(tfm);
-	u64 a, b;
+	be128 h, k;
 
 	if (keylen != GHASH_BLOCK_SIZE) {
 		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 
-	/* perform multiplication by 'x' in GF(2^128) */
-	b = get_unaligned_be64(inkey);
-	a = get_unaligned_be64(inkey + 8);
+	memcpy(&k, inkey, GHASH_BLOCK_SIZE);
+	ghash_reflect(key->h, &k);
 
-	key->a = (a << 1) | (b >> 63);
-	key->b = (b << 1) | (a >> 63);
+	h = k;
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h2, &h);
 
-	if (b >> 63)
-		key->b ^= 0xc200000000000000UL;
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h3, &h);
+
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h4, &h);
 
 	return 0;
 }
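To sanity-check the key-power scheme, the sketch below shows in plain user-space C (not kernel code) that four chained single-block GHASH updates equal one aggregated update over H, H^2, H^3 and H^4. gf128_mul() follows the bit-by-bit reference multiplication from the GCM specification rather than the kernel's gf128mul_lle(), and the helper names and test values are made up for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bit-by-bit reference multiply in GF(2^128), GCM/GHASH bit order */
static void gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
	uint8_t v[16], r[16] = { 0 };
	int i, j;

	memcpy(v, y, 16);
	for (i = 0; i < 128; i++) {
		if ((x[i / 8] >> (7 - (i % 8))) & 1)	/* bit i of x, MSB first */
			for (j = 0; j < 16; j++)
				r[j] ^= v[j];
		/* v = v * x: shift right one bit, reduce if a bit fell off */
		int lsb = v[15] & 1;
		for (j = 15; j > 0; j--)
			v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xe1;	/* x^128 = x^7 + x^2 + x + 1 */
	}
	memcpy(z, r, 16);
}

static void xor16(uint8_t d[16], const uint8_t s[16])
{
	for (int i = 0; i < 16; i++)
		d[i] ^= s[i];
}

int main(void)
{
	uint8_t h[16], h2[16], h3[16], h4[16], b[4][16];
	uint8_t seq[16] = { 0 }, agg[16] = { 0 }, t[16];
	int i, j;

	/* arbitrary hash key H and four message blocks B0..B3 */
	for (i = 0; i < 16; i++) {
		h[i] = (uint8_t)(17 * i + 3);
		for (j = 0; j < 4; j++)
			b[j][i] = (uint8_t)(i + 31 * j + 5);
	}

	/* powers of H, as ghash_setkey() precomputes with gf128mul_lle() */
	gf128_mul(h2, h, h);
	gf128_mul(h3, h2, h);
	gf128_mul(h4, h3, h);

	/* one multiply-and-reduce per block (running hash starts at zero) */
	for (j = 0; j < 4; j++) {
		xor16(seq, b[j]);
		gf128_mul(seq, seq, h);
	}

	/* 4-way aggregation: (X ^ B0)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H */
	gf128_mul(agg, b[0], h4);	/* X is zero here, so X ^ B0 == B0 */
	gf128_mul(t, b[1], h3); xor16(agg, t);
	gf128_mul(t, b[2], h2); xor16(agg, t);
	gf128_mul(t, b[3], h);  xor16(agg, t);

	printf("%s\n", memcmp(seq, agg, 16) ? "MISMATCH" : "match");
	return 0;
}

Built with any C99 compiler this prints "match", which is the identity the 4-way loop in ghash-ce-core.S relies on.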