Commit ed6ed118 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/aes-modes - get rid of literal load of addend vector

Replace the literal load of the addend vector with a sequence that
performs each add individually. This sequence is only 2 instructions
longer than the original, and 2% faster on Cortex-A53.

This is an improvement by itself, but it also works around a Clang issue:
Clang's integrated assembler does not implement the GNU ARM asm syntax
completely, and does not support the =literal notation for FP registers
(more info at https://bugs.llvm.org/show_bug.cgi?id=38642).

Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 00227e3a
@@ -232,17 +232,19 @@ AES_ENTRY(aes_ctr_encrypt)
 	bmi		.Lctr1x
 	cmn		w6, #4			/* 32 bit overflow? */
 	bcs		.Lctr1x
-	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
-	dup		v7.4s, w6
+	add		w7, w6, #1
 	mov		v0.16b, v4.16b
-	add		v7.4s, v7.4s, v8.4s
+	add		w8, w6, #2
 	mov		v1.16b, v4.16b
-	rev32		v8.16b, v7.16b
+	add		w9, w6, #3
 	mov		v2.16b, v4.16b
+	rev		w7, w7
 	mov		v3.16b, v4.16b
-	mov		v1.s[3], v8.s[0]
-	mov		v2.s[3], v8.s[1]
-	mov		v3.s[3], v8.s[2]
+	rev		w8, w8
+	mov		v1.s[3], w7
+	rev		w9, w9
+	mov		v2.s[3], w8
+	mov		v3.s[3], w9
 	ld1		{v5.16b-v7.16b}, [x20], #48	/* get 3 input blocks */
 	bl		aes_encrypt_block4x
 	eor		v0.16b, v5.16b, v0.16b
...
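For illustration, here is a minimal C sketch of what the new sequence computes:
for each of the three extra CTR blocks, the 32-bit counter is incremented by
1, 2 or 3, byte-reversed to big-endian (the rev step), and the result is what
gets inserted into lane 3 of the corresponding counter block. The helper name
ctr_lane3 and the sample counter value are assumptions made for the example,
not part of the patch.

	#include <stdint.h>
	#include <stdio.h>

	/* Byte swap corresponding to the AArch64 rev instruction used in the
	 * patch: the counter lives little-endian in a GPR and is converted to
	 * big-endian before being inserted into the counter block. */
	static uint32_t rev32(uint32_t x)
	{
		return __builtin_bswap32(x);
	}

	/* Hypothetical helper mirroring the new sequence for blocks 1..3:
	 * add w7, w6, #1 / rev w7, w7 / mov v1.s[3], w7, and likewise
	 * with +2 and +3 for v2 and v3. */
	static void ctr_lane3(uint32_t w6, uint32_t out[3])
	{
		for (int i = 0; i < 3; i++)
			out[i] = rev32(w6 + 1 + (uint32_t)i);
	}

	int main(void)
	{
		uint32_t lane3[3];

		ctr_lane3(5, lane3);	/* example counter value, chosen arbitrarily */
		for (int i = 0; i < 3; i++)
			printf("block %d counter word: 0x%08x\n", i + 1, lane3[i]);
		return 0;
	}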