Commit 4a97abd4, authored by Ard Biesheuvel, committed by Will Deacon

arm64/crypto: issue aese/aesmc instructions in pairs

This changes the AES core transform implementations to issue aese/aesmc
(and aesd/aesimc) in pairs. This enables a micro-architectural optimization
in recent Cortex-A5x cores that improves performance by 50-90%.

Measured performance in cycles per byte (Cortex-A57):

                CBC enc         CBC dec         CTR
  before        3.64            1.34            1.32
  after         1.95            0.85            0.93

Note that this results in a ~5% performance decrease for older cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
parent b63dbef9
...@@ -101,19 +101,19 @@ ENTRY(ce_aes_ccm_final) ...@@ -101,19 +101,19 @@ ENTRY(ce_aes_ccm_final)
0: mov v4.16b, v3.16b 0: mov v4.16b, v3.16b
1: ld1 {v5.2d}, [x2], #16 /* load next round key */ 1: ld1 {v5.2d}, [x2], #16 /* load next round key */
aese v0.16b, v4.16b aese v0.16b, v4.16b
aese v1.16b, v4.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
2: ld1 {v3.2d}, [x2], #16 /* load next round key */ 2: ld1 {v3.2d}, [x2], #16 /* load next round key */
aese v0.16b, v5.16b aese v0.16b, v5.16b
aese v1.16b, v5.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
3: ld1 {v4.2d}, [x2], #16 /* load next round key */ 3: ld1 {v4.2d}, [x2], #16 /* load next round key */
subs w3, w3, #3 subs w3, w3, #3
aese v0.16b, v3.16b aese v0.16b, v3.16b
aese v1.16b, v3.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
bpl 1b bpl 1b
aese v0.16b, v4.16b aese v0.16b, v4.16b
...@@ -146,19 +146,19 @@ ENDPROC(ce_aes_ccm_final) ...@@ -146,19 +146,19 @@ ENDPROC(ce_aes_ccm_final)
ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
2: /* inner loop: 3 rounds, 2x interleaved */ 2: /* inner loop: 3 rounds, 2x interleaved */
aese v0.16b, v4.16b aese v0.16b, v4.16b
aese v1.16b, v4.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
3: ld1 {v3.2d}, [x10], #16 /* load next round key */ 3: ld1 {v3.2d}, [x10], #16 /* load next round key */
aese v0.16b, v5.16b aese v0.16b, v5.16b
aese v1.16b, v5.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
4: ld1 {v4.2d}, [x10], #16 /* load next round key */ 4: ld1 {v4.2d}, [x10], #16 /* load next round key */
subs w7, w7, #3 subs w7, w7, #3
aese v0.16b, v3.16b aese v0.16b, v3.16b
aese v1.16b, v3.16b
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b aesmc v1.16b, v1.16b
ld1 {v5.2d}, [x10], #16 /* load next round key */ ld1 {v5.2d}, [x10], #16 /* load next round key */
bpl 2b bpl 2b
......
...@@ -45,18 +45,14 @@ ...@@ -45,18 +45,14 @@
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
.endif
.endif
aes\mc \i0\().16b, \i0\().16b aes\mc \i0\().16b, \i0\().16b
.ifnb \i1 .ifnb \i1
aes\de \i1\().16b, \k\().16b
aes\mc \i1\().16b, \i1\().16b aes\mc \i1\().16b, \i1\().16b
.ifnb \i3 .ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\mc \i2\().16b, \i2\().16b aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b aes\mc \i3\().16b, \i3\().16b
.endif .endif
.endif .endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment