Commit 3d1e93cd authored by Martin Willi, committed by Herbert Xu

crypto: chacha20 - Add an eight block AVX2 variant for x86_64

Extends the x86_64 ChaCha20 implementation by a function processing eight
ChaCha20 blocks in parallel using AVX2.

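For large messages, throughput increases by ~55-70% compared to four block
SSSE3. The first run below is the SSSE3-only baseline; the second run is with
the eight block AVX2 variant enabled: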

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds (675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds (2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds (8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds (11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds (11868250112 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 41999675 operations in 10 seconds (671994800 bytes)
test 1 (256 bit key, 64 byte blocks): 45805908 operations in 10 seconds (2931578112 bytes)
test 2 (256 bit key, 256 byte blocks): 32814947 operations in 10 seconds (8400626432 bytes)
test 3 (256 bit key, 1024 byte blocks): 19777167 operations in 10 seconds (20251819008 bytes)
test 4 (256 bit key, 8192 byte blocks): 2279321 operations in 10 seconds (18672197632 bytes)

Benchmark results from a Core i5-4670T.
Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 274f938e
......@@ -77,6 +77,7 @@ endif
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
endif
......
This diff is collapsed.
......@@ -21,12 +21,27 @@
/*
 * SIMD block functions, declared asmlinkage. Each takes the cipher state,
 * a destination buffer and a source buffer.
 */
/* NOTE(review): presumably handles a single block; its call site is not visible in this hunk. */
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
/* Called by chacha20_dosimd() once per CHACHA20_BLOCK_SIZE * 4 bytes of input. */
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
/*
 * Called by chacha20_dosimd() once per CHACHA20_BLOCK_SIZE * 8 bytes of
 * input; the caller advances state[12] by 8 after each call.
 */
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
/*
 * Gates the eight block path in chacha20_dosimd(). Assigned in
 * chacha20_simd_mod_init() from cpu_has_avx, cpu_has_avx2 and
 * cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL).
 */
static bool chacha20_use_avx2;
#endif
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA20_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src);
bytes -= CHACHA20_BLOCK_SIZE * 8;
src += CHACHA20_BLOCK_SIZE * 8;
dst += CHACHA20_BLOCK_SIZE * 8;
state[12] += 8;
}
}
#endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
chacha20_4block_xor_ssse3(state, dst, src);
bytes -= CHACHA20_BLOCK_SIZE * 4;
......@@ -113,6 +128,10 @@ static int __init chacha20_simd_mod_init(void)
if (!cpu_has_ssse3)
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
#endif
return crypto_register_alg(&alg);
}
......
......@@ -1214,7 +1214,7 @@ config CRYPTO_CHACHA20
<http://cr.yp.to/chacha/chacha-20080128.pdf>
config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3)"
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
depends on X86 && 64BIT
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment