Commit 22240df7 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/ghash-ce - implement 4-way aggregation

Enhance the GHASH implementation that uses 64-bit polynomial
multiplication by adding support for 4-way aggregation. This
more than doubles the performance, from 2.4 cycles per byte
down to 1.1 cycles per byte on Cortex-A53.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 8e492eff
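
Background on the aggregation (context, not part of the commit text): each GHASH update computes X_i = (X_{i-1} xor B_i) * H in GF(2^128). Folding four consecutive updates yields four independent products that the NEON code can interleave, at the cost of precomputing powers of the hash key. A sketch of the identity behind the 4-way variant, with B_i the input blocks and multiplication in GF(2^128):

\[
X_{i+4} = (X_i \oplus B_{i+1}) \cdot H^4 \;\oplus\; B_{i+2} \cdot H^3 \;\oplus\; B_{i+3} \cdot H^2 \;\oplus\; B_{i+4} \cdot H
\]

This is why __ghash_setkey below precomputes H^2, H^3 and H^4 (key->h2/h3/h4), and why __pmull_pre_p64 loads them into HH, HH3 and HH4 and folds their halves into SHASH2/HH34 for the Karatsuba middle products.
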
@@ -46,6 +46,19 @@
 	ss3		.req	v26
 	ss4		.req	v27
 
+	XL2		.req	v8
+	XM2		.req	v9
+	XH2		.req	v10
+	XL3		.req	v11
+	XM3		.req	v12
+	XH3		.req	v13
+	TT3		.req	v14
+	TT4		.req	v15
+	HH		.req	v16
+	HH3		.req	v17
+	HH4		.req	v18
+	HH34		.req	v19
+
 	.text
 	.arch		armv8-a+crypto
@@ -134,11 +147,25 @@
 	.endm
 
 	.macro		__pmull_pre_p64
+	add		x8, x3, #16
+	ld1		{HH.2d-HH4.2d}, [x8]
+
+	trn1		SHASH2.2d, SHASH.2d, HH.2d
+	trn2		T1.2d, SHASH.2d, HH.2d
+	eor		SHASH2.16b, SHASH2.16b, T1.16b
+
+	trn1		HH34.2d, HH3.2d, HH4.2d
+	trn2		T1.2d, HH3.2d, HH4.2d
+	eor		HH34.16b, HH34.16b, T1.16b
+
 	movi		MASK.16b, #0xe1
 	shl		MASK.2d, MASK.2d, #57
 	.endm
 
 	.macro		__pmull_pre_p8
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
+
 	// k00_16 := 0x0000000000000000_000000000000ffff
 	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
 	movi		k32_48.2d, #0xffffffff
@@ -215,8 +242,6 @@
 	.macro		__pmull_ghash, pn
 	ld1		{SHASH.2d}, [x3]
 	ld1		{XL.2d}, [x1]
-	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
-	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	__pmull_pre_\pn
@@ -224,12 +249,79 @@
 	cbz		x4, 0f
 	ld1		{T1.2d}, [x4]
 	mov		x4, xzr
-	b		1f
+	b		3f
+
+0:	.ifc		\pn, p64
+	tbnz		w0, #0, 2f		// skip until #blocks is a
+	tbnz		w0, #1, 2f		// round multiple of 4
+
+1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
+
+	sub		w0, w0, #4
+
+	rev64		T1.16b, XM3.16b
+	rev64		T2.16b, XH3.16b
+	rev64		TT4.16b, TT4.16b
+	rev64		TT3.16b, TT3.16b
+
+	ext		IN1.16b, TT4.16b, TT4.16b, #8
+	ext		XL3.16b, TT3.16b, TT3.16b, #8
+
+	eor		TT4.16b, TT4.16b, IN1.16b
+	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
+	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
+	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
+
+	eor		TT3.16b, TT3.16b, XL3.16b
+	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
+	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
+	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
+
+	ext		IN1.16b, T2.16b, T2.16b, #8
+	eor		XL2.16b, XL2.16b, XL3.16b
+	eor		XH2.16b, XH2.16b, XH3.16b
+	eor		XM2.16b, XM2.16b, XM3.16b
+
+	eor		T2.16b, T2.16b, IN1.16b
+	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
+	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
+	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
 
-0:	ld1		{T1.2d}, [x2], #16
+	eor		XL2.16b, XL2.16b, XL3.16b
+	eor		XH2.16b, XH2.16b, XH3.16b
+	eor		XM2.16b, XM2.16b, XM3.16b
+
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	ext		TT3.16b, XL.16b, XL.16b, #8
+	eor		XL.16b, XL.16b, IN1.16b
+	eor		T1.16b, T1.16b, TT3.16b
+
+	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
+	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
+
+	eor		XL.16b, XL.16b, XL2.16b
+	eor		XH.16b, XH.16b, XH2.16b
+	eor		XM.16b, XM.16b, XM2.16b
+
+	eor		T2.16b, XL.16b, XH.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p64
+
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
+
+	cbz		w0, 5f
+	b		1b
+	.endif
+
+2:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
 
-1:	/* multiply XL by SHASH in GF(2^128) */
+3:	/* multiply XL by SHASH in GF(2^128) */
 CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	ext		T2.16b, XL.16b, XL.16b, #8
@@ -242,7 +334,7 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	__pmull_\pn	XL, XL, SHASH			// a0 * b0
 	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
 
-	eor		T2.16b, XL.16b, XH.16b
+4:	eor		T2.16b, XL.16b, XH.16b
 	ext		T1.16b, XL.16b, XH.16b, #8
 	eor		XM.16b, XM.16b, T2.16b
@@ -253,7 +345,7 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	cbnz		w0, 0b
 
-	st1		{XL.2d}, [x1]
+5:	st1		{XL.2d}, [x1]
 	ret
 	.endm
@@ -269,14 +361,10 @@ ENTRY(pmull_ghash_update_p8)
 	__pmull_ghash	p8
 ENDPROC(pmull_ghash_update_p8)
 
-	KS0		.req	v8
-	KS1		.req	v9
-	INP0		.req	v10
-	INP1		.req	v11
-	HH		.req	v12
-	XL2		.req	v13
-	XM2		.req	v14
-	XH2		.req	v15
+	KS0		.req	v12
+	KS1		.req	v13
+	INP0		.req	v14
+	INP1		.req	v15
 
 	.macro		load_round_keys, rounds, rk
 	cmp		\rounds, #12
@@ -310,8 +398,8 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	ld1		{HH.2d}, [x4], #16
-	ld1		{SHASH.2d}, [x4]
+	ld1		{SHASH.2d}, [x4], #16
+	ld1		{HH.2d}, [x4]
 	ld1		{XL.2d}, [x1]
 	ldr		x8, [x5, #8]			// load lower counter
@@ -33,9 +33,12 @@ MODULE_ALIAS_CRYPTO("ghash");
 #define GCM_IV_SIZE		12
 
 struct ghash_key {
-	u64 a;
-	u64 b;
-	be128 k;
+	u64			h[2];
+	u64			h2[2];
+	u64			h3[2];
+	u64			h4[2];
+
+	be128			k;
 };
 
 struct ghash_desc_ctx {
@@ -46,7 +49,6 @@ struct ghash_desc_ctx {
 
 struct gcm_aes_ctx {
 	struct crypto_aes_ctx	aes_key;
-	u64			h2[2];
 	struct ghash_key	ghash_key;
 };
@@ -63,11 +65,12 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
 				      const char *head);
 
 asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
-				  const u8 src[], u64 const *k, u8 ctr[],
-				  u32 const rk[], int rounds, u8 ks[]);
+				  const u8 src[], struct ghash_key const *k,
+				  u8 ctr[], u32 const rk[], int rounds,
+				  u8 ks[]);
 
 asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
-				  const u8 src[], u64 const *k,
+				  const u8 src[], struct ghash_key const *k,
 				  u8 ctr[], u32 const rk[], int rounds);
 
 asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
@@ -174,23 +177,36 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 	return 0;
 }
 
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+	u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0;
+
+	h[0] = (be64_to_cpu(k->b) << 1) | carry;
+	h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+	if (carry)
+		h[1] ^= 0xc200000000000000UL;
+}
+
 static int __ghash_setkey(struct ghash_key *key,
 			  const u8 *inkey, unsigned int keylen)
 {
-	u64 a, b;
+	be128 h;
 
 	/* needed for the fallback */
 	memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
 
-	/* perform multiplication by 'x' in GF(2^128) */
-	b = get_unaligned_be64(inkey);
-	a = get_unaligned_be64(inkey + 8);
+	ghash_reflect(key->h, &key->k);
+
+	h = key->k;
+	gf128mul_lle(&h, &key->k);
+	ghash_reflect(key->h2, &h);
 
-	key->a = (a << 1) | (b >> 63);
-	key->b = (b << 1) | (a >> 63);
+	gf128mul_lle(&h, &key->k);
+	ghash_reflect(key->h3, &h);
 
-	if (b >> 63)
-		key->b ^= 0xc200000000000000UL;
+	gf128mul_lle(&h, &key->k);
+	ghash_reflect(key->h4, &h);
 
 	return 0;
 }
@@ -240,8 +256,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
 			unsigned int keylen)
 {
 	struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
-	be128 h1, h2;
-	u8 *key = (u8 *)&h1;
+	u8 key[GHASH_BLOCK_SIZE];
 	int ret;
 
 	ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
@@ -253,19 +268,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
 	__aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
 			    num_rounds(&ctx->aes_key));
 
-	__ghash_setkey(&ctx->ghash_key, key, sizeof(be128));
-
-	/* calculate H^2 (used for 2-way aggregation) */
-	h2 = h1;
-	gf128mul_lle(&h2, &h1);
-
-	ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63);
-	ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63);
-
-	if (be64_to_cpu(h2.a) >> 63)
-		ctx->h2[1] ^= 0xc200000000000000UL;
-
-	return 0;
+	return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128));
 }
 
 static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -401,8 +404,8 @@ static int gcm_encrypt(struct aead_request *req)
 
 		kernel_neon_begin();
 		pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
-				  walk.src.virt.addr, ctx->h2, iv,
-				  rk, nrounds, ks);
+				  walk.src.virt.addr, &ctx->ghash_key,
+				  iv, rk, nrounds, ks);
 		kernel_neon_end();
 
 		err = skcipher_walk_done(&walk,
@@ -512,8 +515,8 @@ static int gcm_decrypt(struct aead_request *req)
 
 			kernel_neon_begin();
 			pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
-					  walk.src.virt.addr, ctx->h2, iv,
-					  rk, nrounds);
+					  walk.src.virt.addr, &ctx->ghash_key,
+					  iv, rk, nrounds);
 
 			/* check if this is the final iteration of the loop */
 			if (rem < (2 * AES_BLOCK_SIZE)) {
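
For readers who want to check the arithmetic without wading through the NEON register allocation, below is a minimal, unoptimized C sketch of one 4-way aggregated GHASH update, using the plain GHASH conventions of NIST SP 800-38D. The names gf128_mul and ghash_update_4x are illustrative only and are not kernel APIs; the kernel code instead uses PMULL with a Karatsuba split and the shifted key form produced by ghash_reflect() above.

#include <stdint.h>
#include <string.h>

/* GF(2^128) multiplication in the GHASH bit order (NIST SP 800-38D,
 * Algorithm 1): x <- x * y.  Bit 0 of a block is the most significant
 * bit of its first byte. */
static void gf128_mul(uint8_t x[16], const uint8_t y[16])
{
	uint8_t z[16] = { 0 };
	uint8_t v[16];
	int i, j;

	memcpy(v, y, 16);

	for (i = 0; i < 128; i++) {
		/* if bit i of x is set, accumulate the current multiple of y */
		if (x[i / 8] & (0x80 >> (i % 8)))
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];

		/* v <- v * x: right shift of the bit string, reducing by
		 * x^128 + x^7 + x^2 + x + 1, i.e. R = 0xe1 || 0^120 */
		int lsb = v[15] & 1;

		for (j = 15; j > 0; j--)
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xe1;
	}
	memcpy(x, z, 16);
}

/* One aggregated update over four consecutive input blocks b[0..3]:
 *   X <- (X ^ b0)*H^4 ^ b1*H^3 ^ b2*H^2 ^ b3*H
 * h[0..3] hold H, H^2, H^3, H^4. */
static void ghash_update_4x(uint8_t x[16], const uint8_t h[4][16],
			    const uint8_t b[4][16])
{
	uint8_t t[4][16];
	int i, j;

	for (j = 0; j < 16; j++)
		t[0][j] = x[j] ^ b[0][j];
	for (i = 1; i < 4; i++)
		memcpy(t[i], b[i], 16);

	gf128_mul(t[0], h[3]);	/* (X ^ b0) * H^4 */
	gf128_mul(t[1], h[2]);	/*  b1      * H^3 */
	gf128_mul(t[2], h[1]);	/*  b2      * H^2 */
	gf128_mul(t[3], h[0]);	/*  b3      * H   */

	for (j = 0; j < 16; j++)
		x[j] = t[0][j] ^ t[1][j] ^ t[2][j] ^ t[3][j];
}

The aggregation benefit is visible in the four independent gf128_mul() calls: with H, H^2, H^3 and H^4 precomputed at setkey time, none of the four products depends on another's result, which is what lets the NEON implementation keep several PMULL operations in flight per loop iteration.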