Commit cee7a36e authored by Martin Willi's avatar Martin Willi Committed by Herbert Xu

crypto: x86/chacha20 - Add a 8-block AVX-512VL variant

This variant is similar to the AVX2 version, but benefits from the AVX-512
rotate instructions and the additional registers, so it can operate without
any data on the stack. It uses ymm registers only to avoid the massive core
throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed
improvement compared to the AVX2 variant for random encryption lengths.

The AVX2 version uses "rep movsb" for partial block XORing via the stack.
With AVX-512, the new "vmovdqu8" can do this much more efficiently. The
associated "kmov" instructions to work with dynamic masks is not part of
the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given
that the major AVX-512VL architectures provide AVX-512BW and this extension
does not affect core clocking, this seems to be no problem at least for
now.
Signed-off-by: default avatarMartin Willi <martin@strongswan.org>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent 059c2a4d
...@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y ...@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no) $(comma)4)$(comma)%ymm2,yes,no)
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
...@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes) ...@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes)
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif endif
ifeq ($(avx512_supported),yes)
chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
......
This diff is collapsed.
...@@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, ...@@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len); unsigned int len);
static bool chacha20_use_avx2; static bool chacha20_use_avx2;
#ifdef CONFIG_AS_AVX512
asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx512vl;
#endif
#endif #endif
static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
...@@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, ...@@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes) unsigned int bytes)
{ {
#ifdef CONFIG_AS_AVX2 #ifdef CONFIG_AS_AVX2
#ifdef CONFIG_AS_AVX512
if (chacha20_use_avx512vl) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx512vl(state, dst, src, bytes);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha20_8block_xor_avx512vl(state, dst, src, bytes);
state[12] += chacha20_advance(bytes, 8);
return;
}
}
#endif
if (chacha20_use_avx2) { if (chacha20_use_avx2) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) { while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src, bytes); chacha20_8block_xor_avx2(state, dst, src, bytes);
...@@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void) ...@@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void)
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#ifdef CONFIG_AS_AVX512
chacha20_use_avx512vl = chacha20_use_avx2 &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
#endif
#endif #endif
return crypto_register_skcipher(&alg); return crypto_register_skcipher(&alg);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment