Commit b913a640 authored by Ard Biesheuvel, committed by Catalin Marinas

arm64/crypto: improve performance of GHASH algorithm

This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial-multiplication-based reduction instead of one that uses
shifts and rotates.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent 6aa8b209
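
For context, the operation being optimised is the GHASH multiply: each input block is XORed into the running digest, and the result is multiplied by the hash key in GF(2^128), reduced modulo x^128 + x^7 + x^2 + x + 1. The C sketch below is a bit-serial reference model of that multiply (the textbook GCM-spec algorithm, not part of this patch and far slower than the PMULL path); the 0xe1 byte it folds in on every shift is the same reduction constant the new assembly loads with movi MASK.16b, #0xe1 before shifting it into position.

#include <stdint.h>
#include <string.h>

/*
 * Reference model of the GF(2^128) multiply used by GHASH:
 * z = x * y modulo x^128 + x^7 + x^2 + x + 1, in the MSB-first bit
 * order of the GCM spec.  Bit-serial and slow; shown only to
 * illustrate the operation that pmull_ghash_update() accelerates.
 */
static void gf128_mul_ref(const uint8_t x[16], const uint8_t y[16],
                          uint8_t z[16])
{
        uint8_t v[16], acc[16] = { 0 };
        int i, j;

        memcpy(v, y, 16);

        for (i = 0; i < 128; i++) {
                /* accumulate V whenever bit i of x (MSB first) is set */
                if (x[i / 8] & (0x80 >> (i % 8)))
                        for (j = 0; j < 16; j++)
                                acc[j] ^= v[j];

                /* V >>= 1, folding the dropped bit back in via 0xe1 */
                int carry = v[15] & 1;

                for (j = 15; j > 0; j--)
                        v[j] = (v[j] >> 1) | (v[j - 1] << 7);
                v[0] >>= 1;
                if (carry)
                        v[0] ^= 0xe1;
        }
        memcpy(z, acc, 16);
}
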
arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
  *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *           Vinodh Gopal
- *           Erdinc Ozturk
- *           Deniz Karakoyunlu
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
  * by the Free Software Foundation.
@@ -19,13 +11,15 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	DATA	.req	v0
-	SHASH	.req	v1
-	IN1	.req	v2
+	SHASH	.req	v0
+	SHASH2	.req	v1
 	T1	.req	v2
 	T2	.req	v3
-	T3	.req	v4
-	VZR	.req	v5
+	MASK	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+	IN1	.req	v7
 
 	.text
 	.arch		armv8-a+crypto
@@ -35,61 +29,51 @@
 	 *			   struct ghash_key const *k, const char *head)
 	 */
 ENTRY(pmull_ghash_update)
-	ld1		{DATA.16b}, [x1]
 	ld1		{SHASH.16b}, [x3]
-	eor		VZR.16b, VZR.16b, VZR.16b
+	ld1		{XL.16b}, [x1]
+	movi		MASK.16b, #0xe1
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	shl		MASK.2d, MASK.2d, #57
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	/* do the head block first, if supplied */
 	cbz		x4, 0f
-	ld1		{IN1.2d}, [x4]
+	ld1		{T1.2d}, [x4]
 	b		1f
 
-0:	ld1		{IN1.2d}, [x2], #16
+0:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
-1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE(	rev64		IN1.16b, IN1.16b	)
-	eor		DATA.16b, DATA.16b, IN1.16b
 
-	/* multiply DATA by SHASH in GF(2^128) */
-	ext		T2.16b, DATA.16b, DATA.16b, #8
-	ext		T3.16b, SHASH.16b, SHASH.16b, #8
-	eor		T2.16b, T2.16b, DATA.16b
-	eor		T3.16b, T3.16b, SHASH.16b
+1:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64		T1.16b, T1.16b	)
 
-	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
-	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
-	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
-	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
-	eor		T2.16b, T2.16b, DATA.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
 
-	ext		T3.16b, VZR.16b, T2.16b, #8
-	ext		T2.16b, T2.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T3.16b
-	eor		T1.16b, T1.16b, T2.16b	// <T1:DATA> is result of
-						// carry-less multiplication
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
 
-	/* first phase of the reduction */
-	shl		T3.2d, DATA.2d, #1
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #5
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #57
-	ext		T2.16b, VZR.16b, T3.16b, #8
-	ext		T3.16b, T3.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T2.16b
-	eor		T1.16b, T1.16b, T3.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
 
-	/* second phase of the reduction */
-	ushr		T2.2d, DATA.2d, #5
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T1.16b, T1.16b, T2.16b
-	eor		DATA.16b, DATA.16b, T1.16b
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	eor		XL.16b, XM.16b, T2.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
 
 	cbnz		w0, 0b
 
-	st1		{DATA.16b}, [x1]
+	st1		{XL.16b}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
arch/arm64/crypto/ghash-ce-glue.c
@@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 		blocks = len / GHASH_BLOCK_SIZE;
 		len %= GHASH_BLOCK_SIZE;
 
-		kernel_neon_begin_partial(6);
+		kernel_neon_begin_partial(8);
 		pmull_ghash_update(blocks, ctx->digest, src, key,
 				   partial ? ctx->buf : NULL);
 		kernel_neon_end();
@@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
 
-		kernel_neon_begin_partial(6);
+		kernel_neon_begin_partial(8);
 		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
 		kernel_neon_end();
 	}
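
Both the old and the new assembly compute the 128x128-bit carry-less product with three PMULLs in Karatsuba fashion (a1 * b1, a0 * b0 and (a1 + a0)(b1 + b0), per the comments above). What the patch changes is the reduction of the resulting 256-bit value modulo the GHASH polynomial, now done with two further PMULLs against the precomputed MASK constant (0xe1 shifted left by 57 in each 64-bit lane, i.e. 0xc200000000000000) instead of the previous shl/ushr/eor chains, and the (b1 + b0) operand of the middle multiply is hoisted out of the loop into SHASH2. The sketch below is only a rough portable C model of that shared three-multiply structure, not kernel code; clmul64() and clmul128() are hypothetical helpers standing in for the PMULL instruction.

#include <stdint.h>

/* Hypothetical portable stand-in for PMULL: carry-less 64 x 64 -> 128. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
        uint64_t h = 0, l = 0;
        int i;

        for (i = 0; i < 64; i++) {
                if (b & (1ULL << i)) {
                        l ^= a << i;
                        if (i)
                                h ^= a >> (64 - i);
                }
        }
        *hi = h;
        *lo = l;
}

/*
 * 128 x 128 -> 256-bit carry-less multiply built from three 64-bit
 * multiplies (Karatsuba), mirroring the pmull2/pmull/pmull triple in
 * pmull_ghash_update(): a1 * b1, a0 * b0 and (a1 + a0)(b1 + b0), with
 * the middle term a1*b0 ^ a0*b1 recovered by XORs.  r[0] holds the
 * least significant 64 bits of the product.
 */
static void clmul128(uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0,
                     uint64_t r[4])
{
        uint64_t hh, hl, lh, ll, mh, ml;

        clmul64(a1, b1, &hh, &hl);              /* a1 * b1            */
        clmul64(a0, b0, &lh, &ll);              /* a0 * b0            */
        clmul64(a1 ^ a0, b1 ^ b0, &mh, &ml);    /* (a1 + a0)(b1 + b0) */

        mh ^= hh ^ lh;                          /* middle term        */
        ml ^= hl ^ ll;                          /* = a1*b0 ^ a0*b1    */

        r[3] = hh;
        r[2] = hl ^ mh;
        r[1] = lh ^ ml;
        r[0] = ll;
}

Because the rewritten routine clobbers NEON registers v0-v7 rather than v0-v5, the glue code bumps the kernel_neon_begin_partial() argument from 6 to 8 so that all registers it touches are preserved around the call.
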