Commit b2c31107 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:
 - Do not idle omap device between crypto operations in one session.
 - Added sha224/sha384 shims for SSSE3.
 - More optimisations for camellia-aesni-avx2.
 - Removed defunct blowfish/twofish AVX2 implementations.
 - Added unaligned buffer self-tests.
 - Added PCLMULQDQ optimisation for CRCT10DIF.
 - Added support for Freescale's DCP co-processor.
 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (44 commits)
  crypto: testmgr - test hash implementations with unaligned buffers
  crypto: testmgr - test AEADs with unaligned buffers
  crypto: testmgr - test skciphers with unaligned buffers
  crypto: testmgr - check that entries in alg_test_descs are in correct order
  Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher"
  Revert "crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher"
  crypto: camellia-aesni-avx2 - tune assembly code for more performance
  hwrng: bcm2835 - fix MODULE_LICENSE tag
  hwrng: nomadik - use clk_prepare_enable()
  crypto: picoxcell - replace strict_strtoul() with kstrtoul()
  crypto: dcp - Staticize local symbols
  crypto: dcp - Use NULL instead of 0
  crypto: dcp - Use devm_* APIs
  crypto: dcp - Remove redundant platform_set_drvdata()
  hwrng: use platform_{get,set}_drvdata()
  crypto: omap-aes - Don't idle/start AES device between Encrypt operations
  crypto: crct10dif - Use PTR_RET
  crypto: ux500 - Cocci spatch "resource_size.spatch"
  crypto: sha256_ssse3 - add sha224 support
  crypto: sha512_ssse3 - add sha384 support
  ...
parents 45175476 02c0241b
@@ -736,7 +736,7 @@ dma_apbx: dma-apbx@80024000 {
dcp@80028000 {
reg = <0x80028000 0x2000>;
interrupts = <52 53 54>;
status = "disabled";
compatible = "fsl-dcp";
};
pxp@8002a000 {
...
@@ -3,8 +3,6 @@
#
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -42,10 +41,8 @@ endif
# These modules require assembler to support AVX2.
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes)
endif
ifeq ($(avx2_supported),yes)
blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
@@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
/*
* x86_64/AVX2 assembler optimized version of Blowfish
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/
#include <linux/linkage.h>
.file "blowfish-avx2-asm_64.S"
.data
.align 32
.Lprefetch_mask:
.long 0*64
.long 1*64
.long 2*64
.long 3*64
.long 4*64
.long 5*64
.long 6*64
.long 7*64
.Lbswap32_mask:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:
.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
.text
/* structure of crypto context */
#define p 0
#define s0 ((16 + 2) * 4)
#define s1 ((16 + 2 + (1 * 256)) * 4)
#define s2 ((16 + 2 + (2 * 256)) * 4)
#define s3 ((16 + 2 + (3 * 256)) * 4)
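/*
 * Note: the offsets above mirror the struct bf_ctx layout used by the
 * generic Blowfish code (an 18-entry P-array followed by four 256-entry
 * S-boxes, all 32-bit words); p indexes the subkeys, s0..s3 the S-boxes.
 */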
/* register macros */
#define CTX %rdi
#define RIO %rdx
#define RS0 %rax
#define RS1 %r8
#define RS2 %r9
#define RS3 %r10
#define RLOOP %r11
#define RLOOPd %r11d
#define RXr0 %ymm8
#define RXr1 %ymm9
#define RXr2 %ymm10
#define RXr3 %ymm11
#define RXl0 %ymm12
#define RXl1 %ymm13
#define RXl2 %ymm14
#define RXl3 %ymm15
/* temp regs */
#define RT0 %ymm0
#define RT0x %xmm0
#define RT1 %ymm1
#define RT1x %xmm1
#define RIDX0 %ymm2
#define RIDX1 %ymm3
#define RIDX1x %xmm3
#define RIDX2 %ymm4
#define RIDX3 %ymm5
/* vpgatherdd mask and '-1' */
#define RNOT %ymm6
/* byte mask, (-1 >> 24) */
#define RBYTE %ymm7
/***********************************************************************
* 32-way AVX2 blowfish
***********************************************************************/
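/*
 * F() below computes the standard Blowfish round function
 * F(x) = ((S0[a] + S1[b]) ^ S2[c]) + S3[d] for the eight 32-bit halves
 * held in xl, using vpgatherdd for the S-box lookups, and XORs the
 * result into xr.  vpgatherdd clears its mask register, so the all-ones
 * masks are re-created with vpcmpeqd between the gathers.
 */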
#define F(xl, xr) \
vpsrld $24, xl, RIDX0; \
vpsrld $16, xl, RIDX1; \
vpsrld $8, xl, RIDX2; \
vpand RBYTE, RIDX1, RIDX1; \
vpand RBYTE, RIDX2, RIDX2; \
vpand RBYTE, xl, RIDX3; \
\
vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpcmpeqd RIDX0, RIDX0, RIDX0; \
\
vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
vpcmpeqd RIDX1, RIDX1, RIDX1; \
vpaddd RT0, RT1, RT0; \
\
vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
vpxor RT0, RT1, RT0; \
\
vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpaddd RT0, RT1, RT0; \
\
vpxor RT0, xr, xr;
#define add_roundkey(xl, nmem) \
vpbroadcastd nmem, RT0; \
vpxor RT0, xl ## 0, xl ## 0; \
vpxor RT0, xl ## 1, xl ## 1; \
vpxor RT0, xl ## 2, xl ## 2; \
vpxor RT0, xl ## 3, xl ## 3;
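/*
 * add_roundkey() broadcasts one 32-bit P-array subkey and XORs it into
 * all four registers of the given half, i.e. into all 32 blocks at once.
 */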
#define round_enc() \
add_roundkey(RXr, p(CTX,RLOOP,4)); \
F(RXl0, RXr0); \
F(RXl1, RXr1); \
F(RXl2, RXr2); \
F(RXl3, RXr3); \
\
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
F(RXr0, RXl0); \
F(RXr1, RXl1); \
F(RXr2, RXl2); \
F(RXr3, RXl3);
#define round_dec() \
add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
F(RXl0, RXr0); \
F(RXl1, RXr1); \
F(RXl2, RXr2); \
F(RXl3, RXr3); \
\
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
F(RXr0, RXl0); \
F(RXr1, RXl1); \
F(RXr2, RXl2); \
F(RXr3, RXl3);
#define init_round_constants() \
vpcmpeqd RNOT, RNOT, RNOT; \
leaq s0(CTX), RS0; \
leaq s1(CTX), RS1; \
leaq s2(CTX), RS2; \
leaq s3(CTX), RS3; \
vpsrld $24, RNOT, RBYTE;
#define transpose_2x2(x0, x1, t0) \
vpunpckldq x0, x1, t0; \
vpunpckhdq x0, x1, x1; \
\
vpunpcklqdq t0, x1, x0; \
vpunpckhqdq t0, x1, x1;
#define read_block(xl, xr) \
vbroadcasti128 .Lbswap32_mask, RT1; \
\
vpshufb RT1, xl ## 0, xl ## 0; \
vpshufb RT1, xr ## 0, xr ## 0; \
vpshufb RT1, xl ## 1, xl ## 1; \
vpshufb RT1, xr ## 1, xr ## 1; \
vpshufb RT1, xl ## 2, xl ## 2; \
vpshufb RT1, xr ## 2, xr ## 2; \
vpshufb RT1, xl ## 3, xl ## 3; \
vpshufb RT1, xr ## 3, xr ## 3; \
\
transpose_2x2(xl ## 0, xr ## 0, RT0); \
transpose_2x2(xl ## 1, xr ## 1, RT0); \
transpose_2x2(xl ## 2, xr ## 2, RT0); \
transpose_2x2(xl ## 3, xr ## 3, RT0);
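/*
 * read_block()/write_block() byte-swap the 32-bit halves (blocks are
 * stored big-endian) and regroup them so that the left and right halves
 * land in separate RXl and RXr registers for the rounds above.
 */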
#define write_block(xl, xr) \
vbroadcasti128 .Lbswap32_mask, RT1; \
\
transpose_2x2(xl ## 0, xr ## 0, RT0); \
transpose_2x2(xl ## 1, xr ## 1, RT0); \
transpose_2x2(xl ## 2, xr ## 2, RT0); \
transpose_2x2(xl ## 3, xr ## 3, RT0); \
\
vpshufb RT1, xl ## 0, xl ## 0; \
vpshufb RT1, xr ## 0, xr ## 0; \
vpshufb RT1, xl ## 1, xl ## 1; \
vpshufb RT1, xr ## 1, xr ## 1; \
vpshufb RT1, xl ## 2, xl ## 2; \
vpshufb RT1, xr ## 2, xr ## 2; \
vpshufb RT1, xl ## 3, xl ## 3; \
vpshufb RT1, xr ## 3, xr ## 3;
.align 8
__blowfish_enc_blk32:
/* input:
* %rdi: ctx, CTX
* RXl0..4, RXr0..4: plaintext
* output:
* RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
*/
init_round_constants();
read_block(RXl, RXr);
movl $1, RLOOPd;
add_roundkey(RXl, p+4*(0)(CTX));
.align 4
.L__enc_loop:
round_enc();
leal 2(RLOOPd), RLOOPd;
cmpl $17, RLOOPd;
jne .L__enc_loop;
add_roundkey(RXr, p+4*(17)(CTX));
write_block(RXl, RXr);
ret;
ENDPROC(__blowfish_enc_blk32)
.align 8
__blowfish_dec_blk32:
/* input:
* %rdi: ctx, CTX
* RXl0..4, RXr0..4: ciphertext
* output:
* RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
*/
init_round_constants();
read_block(RXl, RXr);
movl $14, RLOOPd;
add_roundkey(RXl, p+4*(17)(CTX));
.align 4
.L__dec_loop:
round_dec();
addl $-2, RLOOPd;
jns .L__dec_loop;
add_roundkey(RXr, p+4*(0)(CTX));
write_block(RXl, RXr);
ret;
ENDPROC(__blowfish_dec_blk32)
ENTRY(blowfish_ecb_enc_32way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
vmovdqu 0*32(%rdx), RXl0;
vmovdqu 1*32(%rdx), RXr0;
vmovdqu 2*32(%rdx), RXl1;
vmovdqu 3*32(%rdx), RXr1;
vmovdqu 4*32(%rdx), RXl2;
vmovdqu 5*32(%rdx), RXr2;
vmovdqu 6*32(%rdx), RXl3;
vmovdqu 7*32(%rdx), RXr3;
call __blowfish_enc_blk32;
vmovdqu RXr0, 0*32(%rsi);
vmovdqu RXl0, 1*32(%rsi);
vmovdqu RXr1, 2*32(%rsi);
vmovdqu RXl1, 3*32(%rsi);
vmovdqu RXr2, 4*32(%rsi);
vmovdqu RXl2, 5*32(%rsi);
vmovdqu RXr3, 6*32(%rsi);
vmovdqu RXl3, 7*32(%rsi);
vzeroupper;
ret;
ENDPROC(blowfish_ecb_enc_32way)
ENTRY(blowfish_ecb_dec_32way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
vmovdqu 0*32(%rdx), RXl0;
vmovdqu 1*32(%rdx), RXr0;
vmovdqu 2*32(%rdx), RXl1;
vmovdqu 3*32(%rdx), RXr1;
vmovdqu 4*32(%rdx), RXl2;
vmovdqu 5*32(%rdx), RXr2;
vmovdqu 6*32(%rdx), RXl3;
vmovdqu 7*32(%rdx), RXr3;
call __blowfish_dec_blk32;
vmovdqu RXr0, 0*32(%rsi);
vmovdqu RXl0, 1*32(%rsi);
vmovdqu RXr1, 2*32(%rsi);
vmovdqu RXl1, 3*32(%rsi);
vmovdqu RXr2, 4*32(%rsi);
vmovdqu RXl2, 5*32(%rsi);
vmovdqu RXr3, 6*32(%rsi);
vmovdqu RXl3, 7*32(%rsi);
vzeroupper;
ret;
ENDPROC(blowfish_ecb_dec_32way)
ENTRY(blowfish_cbc_dec_32way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
vmovdqu 0*32(%rdx), RXl0;
vmovdqu 1*32(%rdx), RXr0;
vmovdqu 2*32(%rdx), RXl1;
vmovdqu 3*32(%rdx), RXr1;
vmovdqu 4*32(%rdx), RXl2;
vmovdqu 5*32(%rdx), RXr2;
vmovdqu 6*32(%rdx), RXl3;
vmovdqu 7*32(%rdx), RXr3;
call __blowfish_dec_blk32;
/* xor with src */
vmovq (%rdx), RT0x;
vpshufd $0x4f, RT0x, RT0x;
vinserti128 $1, 8(%rdx), RT0, RT0;
vpxor RT0, RXr0, RXr0;
vpxor 0*32+24(%rdx), RXl0, RXl0;
vpxor 1*32+24(%rdx), RXr1, RXr1;
vpxor 2*32+24(%rdx), RXl1, RXl1;
vpxor 3*32+24(%rdx), RXr2, RXr2;
vpxor 4*32+24(%rdx), RXl2, RXl2;
vpxor 5*32+24(%rdx), RXr3, RXr3;
vpxor 6*32+24(%rdx), RXl3, RXl3;
vmovdqu RXr0, (0*32)(%rsi);
vmovdqu RXl0, (1*32)(%rsi);
vmovdqu RXr1, (2*32)(%rsi);
vmovdqu RXl1, (3*32)(%rsi);
vmovdqu RXr2, (4*32)(%rsi);
vmovdqu RXl2, (5*32)(%rsi);
vmovdqu RXr3, (6*32)(%rsi);
vmovdqu RXl3, (7*32)(%rsi);
vzeroupper;
ret;
ENDPROC(blowfish_cbc_dec_32way)
ENTRY(blowfish_ctr_32way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
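/*
 * The code below builds 32 consecutive counter values from the single
 * 64-bit big-endian IV: the IV is byte-swapped to little endian,
 * stepped by subtracting the negative offsets prepared in RT0/RIDX2,
 * and swapped back to big endian before encryption.
 */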
vzeroupper;
vpcmpeqd RT0, RT0, RT0;
vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
vpcmpeqd RT1x, RT1x, RT1x;
vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
vpxor RIDX0, RIDX0, RIDX0;
vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
vpcmpeqd RT1, RT1, RT1;
vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
vbroadcasti128 .Lbswap_iv_mask, RIDX0;
vbroadcasti128 .Lbswap128_mask, RIDX1;
/* load IV and byteswap */
vmovq (%rcx), RT1x;
vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
/* construct IVs */
vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXl1;
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXr1;
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXl2;
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXr2;
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXl3;
vpsubq RIDX2, RT1, RT1;
vpshufb RIDX1, RT1, RXr3;
/* store last IV */
vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
vmovq RT1x, (%rcx);
call __blowfish_enc_blk32;
/* dst = src ^ iv */
vpxor 0*32(%rdx), RXr0, RXr0;
vpxor 1*32(%rdx), RXl0, RXl0;
vpxor 2*32(%rdx), RXr1, RXr1;
vpxor 3*32(%rdx), RXl1, RXl1;
vpxor 4*32(%rdx), RXr2, RXr2;
vpxor 5*32(%rdx), RXl2, RXl2;
vpxor 6*32(%rdx), RXr3, RXr3;
vpxor 7*32(%rdx), RXl3, RXl3;
vmovdqu RXr0, (0*32)(%rsi);
vmovdqu RXl0, (1*32)(%rsi);
vmovdqu RXr1, (2*32)(%rsi);
vmovdqu RXl1, (3*32)(%rsi);
vmovdqu RXr2, (4*32)(%rsi);
vmovdqu RXl2, (5*32)(%rsi);
vmovdqu RXr3, (6*32)(%rsi);
vmovdqu RXl3, (7*32)(%rsi);
vzeroupper;
ret;
ENDPROC(blowfish_ctr_32way)
/*
* Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
* CTR part based on code (crypto/ctr.c) by:
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/blowfish.h>
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/blowfish.h>
#include <asm/crypto/ablk_helper.h>
#include <crypto/scatterwalk.h>
#define BF_AVX2_PARALLEL_BLOCKS 32
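/* 32 Blowfish blocks of 8 bytes each: the AVX2 path (and kernel FPU
 * usage) only kicks in for chunks of at least 256 bytes, see
 * bf_fpu_begin() below.
 */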
/* 32-way AVX2 parallel cipher functions */
asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
__be64 *iv);
static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
if (fpu_enabled)
return true;
/* FPU is only used when chunk to be processed is large enough, so
* do not enable FPU until it is necessary.
*/
if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
return false;
kernel_fpu_begin();
return true;
}
static inline void bf_fpu_end(bool fpu_enabled)
{
if (fpu_enabled)
kernel_fpu_end();
}
static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
bool enc)
{
bool fpu_enabled = false;
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = BF_BLOCK_SIZE;
unsigned int nbytes;
int err;
err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk->nbytes)) {
u8 *wsrc = walk->src.virt.addr;
u8 *wdst = walk->dst.virt.addr;
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
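/* Largest batch first: 32 blocks via AVX2, then the 4-way assembler
 * path, then one block at a time for the tail. */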
/* Process multi-block AVX2 batch */
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
do {
if (enc)
blowfish_ecb_enc_32way(ctx, wdst, wsrc);
else
blowfish_ecb_dec_32way(ctx, wdst, wsrc);
wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Process multi-block batch */
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
do {
if (enc)
blowfish_enc_blk_4way(ctx, wdst, wsrc);
else
blowfish_dec_blk_4way(ctx, wdst, wsrc);
wsrc += bsize * BF_PARALLEL_BLOCKS;
wdst += bsize * BF_PARALLEL_BLOCKS;
nbytes -= bsize * BF_PARALLEL_BLOCKS;
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
do {
if (enc)
blowfish_enc_blk(ctx, wdst, wsrc);
else
blowfish_dec_blk(ctx, wdst, wsrc);
wsrc += bsize;
wdst += bsize;
nbytes -= bsize;
} while (nbytes >= bsize);
done:
err = blkcipher_walk_done(desc, walk, nbytes);
}
bf_fpu_end(fpu_enabled);
return err;
}
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
blkcipher_walk_init(&walk, dst, src, nbytes);
return ecb_crypt(desc, &walk, true);
}
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
blkcipher_walk_init(&walk, dst, src, nbytes);
return ecb_crypt(desc, &walk, false);
}
static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
unsigned int bsize = BF_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 *iv = (u64 *)walk->iv;
do {
*dst = *src ^ *iv;
blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
iv = dst;
src += 1;
dst += 1;
nbytes -= bsize;
} while (nbytes >= bsize);
*(u64 *)walk->iv = *iv;
return nbytes;
}
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
while ((nbytes = walk.nbytes)) {
nbytes = __cbc_encrypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
return err;
}
static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = BF_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 last_iv;
int i;
/* Start of the last block. */
src += nbytes / bsize - 1;
dst += nbytes / bsize - 1;
last_iv = *src;
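/* Blocks are decrypted from the last one towards the first so that
 * in-place operation does not overwrite ciphertext that is still needed
 * as the next chaining value; the caller's IV is applied to block 0 at
 * "done" below. */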
/* Process multi-block AVX2 batch */
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
do {
nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
src -= BF_AVX2_PARALLEL_BLOCKS - 1;
dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
nbytes -= bsize;
if (nbytes < bsize)
goto done;
*dst ^= *(src - 1);
src -= 1;
dst -= 1;
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Process multi-block batch */
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
u64 ivs[BF_PARALLEL_BLOCKS - 1];
do {
nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
src -= BF_PARALLEL_BLOCKS - 1;
dst -= BF_PARALLEL_BLOCKS - 1;
for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
ivs[i] = src[i];
blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
dst[i + 1] ^= ivs[i];
nbytes -= bsize;
if (nbytes < bsize)
goto done;
*dst ^= *(src - 1);
src -= 1;
dst -= 1;
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
for (;;) {
blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
nbytes -= bsize;
if (nbytes < bsize)
break;
*dst ^= *(src - 1);
src -= 1;
dst -= 1;
}
done:
*dst ^= *(u64 *)walk->iv;
*(u64 *)walk->iv = last_iv;
return nbytes;
}
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes)) {
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
nbytes = __cbc_decrypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
bf_fpu_end(fpu_enabled);
return err;
}
static void ctr_crypt_final(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
u8 *ctrblk = walk->iv;
u8 keystream[BF_BLOCK_SIZE];
u8 *src = walk->src.virt.addr;
u8 *dst = walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
blowfish_enc_blk(ctx, keystream, ctrblk);
crypto_xor(keystream, src, nbytes);
memcpy(dst, keystream, nbytes);
crypto_inc(ctrblk, BF_BLOCK_SIZE);
}
static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
unsigned int bsize = BF_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
int i;
/* Process multi-block AVX2 batch */
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
do {
blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
(__be64 *)walk->iv);
src += BF_AVX2_PARALLEL_BLOCKS;
dst += BF_AVX2_PARALLEL_BLOCKS;
nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Process four block batch */
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
__be64 ctrblocks[BF_PARALLEL_BLOCKS];
u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
do {
/* create ctrblks for parallel encrypt */
for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
ctrblocks[i] = cpu_to_be64(ctrblk++);
}
blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
(u8 *)ctrblocks);
src += BF_PARALLEL_BLOCKS;
dst += BF_PARALLEL_BLOCKS;
nbytes -= bsize * BF_PARALLEL_BLOCKS;
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
do {
u64 ctrblk;
if (dst != src)
*dst = *src;
ctrblk = *(u64 *)walk->iv;
be64_add_cpu((__be64 *)walk->iv, 1);
blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
src += 1;
dst += 1;
} while ((nbytes -= bsize) >= bsize);
done:
return nbytes;
}
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
nbytes = __ctr_crypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
bf_fpu_end(fpu_enabled);
if (walk.nbytes) {
ctr_crypt_final(desc, &walk);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg bf_algs[6] = { {
.cra_name = "__ecb-blowfish-avx2",
.cra_driver_name = "__driver-ecb-blowfish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = BF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct bf_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.setkey = blowfish_setkey,
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
},
},
}, {
.cra_name = "__cbc-blowfish-avx2",
.cra_driver_name = "__driver-cbc-blowfish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = BF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct bf_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.setkey = blowfish_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
},
},
}, {
.cra_name = "__ctr-blowfish-avx2",
.cra_driver_name = "__driver-ctr-blowfish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct bf_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.ivsize = BF_BLOCK_SIZE,
.setkey = blowfish_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
},
},
}, {
.cra_name = "ecb(blowfish)",
.cra_driver_name = "ecb-blowfish-avx2",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = BF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "cbc(blowfish)",
.cra_driver_name = "cbc-blowfish-avx2",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = BF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.ivsize = BF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "ctr(blowfish)",
.cra_driver_name = "ctr-blowfish-avx2",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = BF_MIN_KEY_SIZE,
.max_keysize = BF_MAX_KEY_SIZE,
.ivsize = BF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_encrypt,
.geniv = "chainiv",
},
},
} };
static int __init init(void)
{
u64 xcr0;
if (!cpu_has_avx2 || !cpu_has_osxsave) {
pr_info("AVX2 instructions are not detected.\n");
return -ENODEV;
}
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
pr_info("AVX detected but unusable.\n");
return -ENODEV;
}
return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
}
static void __exit fini(void)
{
crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
}
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
MODULE_ALIAS("blowfish");
MODULE_ALIAS("blowfish-asm");
/*
* Glue Code for assembler optimized version of Blowfish
*
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,24 +32,40 @@
#include <linux/module.h>
#include <linux/types.h>
#include <crypto/algapi.h>
#include <asm/crypto/blowfish.h>
/* regular block cipher functions */
asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
bool xor);
EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(blowfish_dec_blk);
/* 4-way parallel cipher functions */
asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
{
__blowfish_enc_blk(ctx, dst, src, false);
}
static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk(ctx, dst, src, true);
}
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, false);
}
static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, true);
}
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
...
@@ -51,16 +51,6 @@
#define ymm14_x xmm14
#define ymm15_x xmm15
/*
* AES-NI instructions do not support ymmX registers, so we need splitting and
* merging.
*/
#define vaesenclast256(zero, yreg, tmp) \
vextracti128 $1, yreg, tmp##_x; \
vaesenclast zero##_x, yreg##_x, yreg##_x; \
vaesenclast zero##_x, tmp##_x, tmp##_x; \
vinserti128 $1, tmp##_x, yreg, yreg;
/**********************************************************************
32-way camellia
**********************************************************************/
@@ -79,46 +69,70 @@
* S-function with AES subbytes \
*/ \
vbroadcasti128 .Linv_shift_row, t4; \
vpbroadcastb .L0f0f0f0f, t7; \
vpbroadcastd .L0f0f0f0f, t7; \
vbroadcasti128 .Lpre_tf_lo_s1, t0; \
vbroadcasti128 .Lpre_tf_lo_s1, t5; \
vbroadcasti128 .Lpre_tf_hi_s1, t1; \
vbroadcasti128 .Lpre_tf_hi_s1, t6; \
vbroadcasti128 .Lpre_tf_lo_s4, t2; \
vbroadcasti128 .Lpre_tf_hi_s4, t3; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
vpshufb t4, x7, x7; \
vpshufb t4, x1, x1; \
vpshufb t4, x4, x4; \
vpshufb t4, x2, x2; \
vpshufb t4, x5, x5; \
vpshufb t4, x3, x3; \
vpshufb t4, x6, x6; \
vpshufb t4, x2, x2; \
vpshufb t4, x5, x5; \
vpshufb t4, x1, x1; \
vpshufb t4, x4, x4; \
\
/* prefilter sboxes 1, 2 and 3 */ \
vbroadcasti128 .Lpre_tf_lo_s4, t2; \
vbroadcasti128 .Lpre_tf_hi_s4, t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x1, t0, t1, t7, t6); \
filter_8bit(x4, t0, t1, t7, t6); \
filter_8bit(x2, t0, t1, t7, t6); \
filter_8bit(x5, t0, t1, t7, t6); \
\
/* prefilter sbox 4 */ \
filter_8bit(x0, t5, t6, t7, t4); \
filter_8bit(x7, t5, t6, t7, t4); \
vextracti128 $1, x0, t0##_x; \
vextracti128 $1, x7, t1##_x; \
filter_8bit(x3, t2, t3, t7, t4); \
filter_8bit(x6, t2, t3, t7, t4); \
vextracti128 $1, x3, t3##_x; \
vextracti128 $1, x6, t2##_x; \
filter_8bit(x2, t5, t6, t7, t4); \
filter_8bit(x5, t5, t6, t7, t4); \
filter_8bit(x1, t5, t6, t7, t4); \
filter_8bit(x4, t5, t6, t7, t4); \
\
vpxor t4##_x, t4##_x, t4##_x; \
filter_8bit(x3, t2, t3, t7, t6); \
filter_8bit(x6, t2, t3, t7, t6); \
\
/* AES subbytes + AES shift rows */ \
vextracti128 $1, x2, t6##_x; \
vextracti128 $1, x5, t5##_x; \
vaesenclast t4##_x, x0##_x, x0##_x; \
vaesenclast t4##_x, t0##_x, t0##_x; \
vinserti128 $1, t0##_x, x0, x0; \
vaesenclast t4##_x, x7##_x, x7##_x; \
vaesenclast t4##_x, t1##_x, t1##_x; \
vinserti128 $1, t1##_x, x7, x7; \
vaesenclast t4##_x, x3##_x, x3##_x; \
vaesenclast t4##_x, t3##_x, t3##_x; \
vinserti128 $1, t3##_x, x3, x3; \
vaesenclast t4##_x, x6##_x, x6##_x; \
vaesenclast t4##_x, t2##_x, t2##_x; \
vinserti128 $1, t2##_x, x6, x6; \
vextracti128 $1, x1, t3##_x; \
vextracti128 $1, x4, t2##_x; \
vbroadcasti128 .Lpost_tf_lo_s1, t0; \
vbroadcasti128 .Lpost_tf_hi_s1, t1; \
vaesenclast256(t4, x0, t5); \
vaesenclast t4##_x, x2##_x, x2##_x; \
vaesenclast256(t4, x7, t5); \
vaesenclast t4##_x, t6##_x, t6##_x; \
vaesenclast256(t4, x1, t5); \
vinserti128 $1, t6##_x, x2, x2; \
vaesenclast256(t4, x4, t5); \
vaesenclast t4##_x, x5##_x, x5##_x; \
vaesenclast256(t4, x2, t5); \
vaesenclast t4##_x, t5##_x, t5##_x; \
vaesenclast256(t4, x5, t5); \
vinserti128 $1, t5##_x, x5, x5; \
vaesenclast256(t4, x3, t5); \
vaesenclast t4##_x, x1##_x, x1##_x; \
vaesenclast256(t4, x6, t5); \
vaesenclast t4##_x, t3##_x, t3##_x; \
vinserti128 $1, t3##_x, x1, x1; \
vaesenclast t4##_x, x4##_x, x4##_x; \
vaesenclast t4##_x, t2##_x, t2##_x; \
vinserti128 $1, t2##_x, x4, x4; \
\
/* postfilter sboxes 1 and 4 */ \
vbroadcasti128 .Lpost_tf_lo_s3, t2; \
@@ -139,22 +153,12 @@
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
vpxor t7, t7, t7; \
\
vpsrldq $1, t0, t1; \
vpsrldq $2, t0, t2; \
vpshufb t7, t1, t1; \
vpsrldq $3, t0, t3; \
vpsrldq $4, t0, t4; \
vpsrldq $5, t0, t5; \
vpsrldq $6, t0, t6; \
vpsrldq $7, t0, t7; \
vpbroadcastb t0##_x, t0; \
vpbroadcastb t1##_x, t1; \
vpbroadcastb t2##_x, t2; \
vpbroadcastb t3##_x, t3; \
vpbroadcastb t4##_x, t4; \
vpbroadcastb t6##_x, t6; \
vpbroadcastb t5##_x, t5; \
vpbroadcastb t7##_x, t7; \
\
/* P-function */ \
vpxor x5, x0, x0; \
@@ -162,11 +166,21 @@
vpxor x7, x2, x2; \
vpxor x4, x3, x3; \
\
vpshufb t7, t2, t2; \
vpsrldq $4, t0, t4; \
vpshufb t7, t3, t3; \
vpsrldq $5, t0, t5; \
vpshufb t7, t4, t4; \
\
vpxor x2, x4, x4; \
vpxor x3, x5, x5; \
vpxor x0, x6, x6; \
vpxor x1, x7, x7; \
\
vpsrldq $6, t0, t6; \
vpshufb t7, t5, t5; \
vpshufb t7, t6, t6; \
\
vpxor x7, x0, x0; \
vpxor x4, x1, x1; \
vpxor x5, x2, x2; \
@@ -179,12 +193,16 @@
\
/* Add key material and result to CD (x becomes new CD) */ \
\
vpxor t7, x0, x0; \
vpxor 4 * 32(mem_cd), x0, x0; \
\
vpxor t6, x1, x1; \
vpxor 5 * 32(mem_cd), x1, x1; \
\
vpsrldq $7, t0, t6; \
vpshufb t7, t0, t0; \
vpshufb t7, t6, t7; \
\
vpxor t7, x0, x0; \
vpxor 4 * 32(mem_cd), x0, x0; \
\
vpxor t5, x2, x2; \
vpxor 6 * 32(mem_cd), x2, x2; \
\
@@ -204,7 +222,7 @@
vpxor 3 * 32(mem_cd), x7, x7;
/*
* Size optimization... with inlined roundsm16 binary would be over 5 times
* Size optimization... with inlined roundsm32 binary would be over 5 times
* larger and would only marginally faster.
*/
.align 8
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
*/ \
vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
vpxor tt0, tt0, tt0; \
vpbroadcastb t0##_x, t3; \
vpshufb tt0, t0, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpshufb tt0, t0, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpshufb tt0, t0, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpshufb tt0, t0, t0; \
\
vpand l0, t0, t0; \
vpand l1, t1, t1; \
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
\
vpxor l4, t0, l4; \
vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
vmovdqu l4, 4 * 32(l); \
vpxor l5, t1, l5; \
vmovdqu l5, 5 * 32(l); \
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
* rl ^= t2; \
*/ \
\
vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
vpshufb tt0, t0, t3; \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpshufb tt0, t0, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpshufb tt0, t0, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpshufb tt0, t0, t0; \
\
vpor 4 * 32(r), t0, t0; \
vpor 5 * 32(r), t1, t1; \
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vpxor 2 * 32(r), t2, t2; \
vpxor 3 * 32(r), t3, t3; \
vmovdqu t0, 0 * 32(r); \
vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
vmovdqu t1, 1 * 32(r); \
vmovdqu t2, 2 * 32(r); \
vmovdqu t3, 3 * 32(r); \
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
* t2 &= rl; \
* rr ^= rol32(t2, 1); \
*/ \
vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
vpshufb tt0, t0, t3; \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpshufb tt0, t0, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpshufb tt0, t0, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpshufb tt0, t0, t0; \
\
vpand 0 * 32(r), t0, t0; \
vpand 1 * 32(r), t1, t1; \
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vpxor 6 * 32(r), t2, t2; \
vpxor 7 * 32(r), t3, t3; \
vmovdqu t0, 4 * 32(r); \
vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
vmovdqu t1, 5 * 32(r); \
vmovdqu t2, 6 * 32(r); \
vmovdqu t3, 7 * 32(r); \
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
* ll ^= t0; \
*/ \
\
vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
vpshufb tt0, t0, t3; \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpshufb tt0, t0, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpshufb tt0, t0, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpshufb tt0, t0, t0; \
\
vpor l4, t0, t0; \
vpor l5, t1, t1; \
...
########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
# Erdinc Ozturk <erdinc.ozturk@intel.com>
# Vinodh Gopal <vinodh.gopal@intel.com>
# James Guilford <james.guilford@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
# Function API:
# UINT16 crc_t10dif_pcl(
# UINT16 init_crc, //initial CRC value, 16 bits
# const unsigned char *buf, //buffer pointer to calculate CRC on
# UINT64 len //buffer length in bytes (64-bit data)
# );
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
#
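# The glue code in crct10dif-pclmul_glue.c (further below) calls this as
#   crc = crc_t10dif_pcl(crc, buf, len)
# bracketed by kernel_fpu_begin()/kernel_fpu_end(), since the routine
# clobbers XMM registers.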
#include <linux/linkage.h>
.text
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg1_low32 %edi
ENTRY(crc_t10dif_pcl)
.align 16
# adjust the 16-bit initial_crc value, scale it to 32 bits
shl $16, arg1_low32
# Allocate Stack Space
mov %rsp, %rcx
sub $16*2, %rsp
# align stack to 16 byte boundary
and $~(0x10 - 1), %rsp
# check if smaller than 256
cmp $256, arg3
# for sizes less than 128, we can't fold 64B at a time...
jl _less_than_128
# load the initial crc value
movd arg1_low32, %xmm10 # initial crc
# crc value does not need to be byte-reflected, but it needs
# to be moved to the high part of the register.
# because data will be byte-reflected and will align with
# initial crc at correct place.
pslldq $12, %xmm10
movdqa SHUF_MASK(%rip), %xmm11
# receive the initial 64B data, xor the initial crc value
movdqu 16*0(arg2), %xmm0
movdqu 16*1(arg2), %xmm1
movdqu 16*2(arg2), %xmm2
movdqu 16*3(arg2), %xmm3
movdqu 16*4(arg2), %xmm4
movdqu 16*5(arg2), %xmm5
movdqu 16*6(arg2), %xmm6
movdqu 16*7(arg2), %xmm7
pshufb %xmm11, %xmm0
# XOR the initial_crc value
pxor %xmm10, %xmm0
pshufb %xmm11, %xmm1
pshufb %xmm11, %xmm2
pshufb %xmm11, %xmm3
pshufb %xmm11, %xmm4
pshufb %xmm11, %xmm5
pshufb %xmm11, %xmm6
pshufb %xmm11, %xmm7
movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
#imm value of pclmulqdq instruction
#will determine which constant to use
#################################################################
# we subtract 256 instead of 128 to save one instruction from the loop
sub $256, arg3
# at this section of the code, there is 64*x+y (0<=y<64) bytes of
# buffer. The _fold_64_B_loop will fold 64B at a time
# until we have 64+y Bytes of buffer
# fold 64B at a time. This section of the code folds 4 xmm
# registers in parallel
_fold_64_B_loop:
# update the buffer pointer
add $128, arg2 # buf += 64#
movdqu 16*0(arg2), %xmm9
movdqu 16*1(arg2), %xmm12
pshufb %xmm11, %xmm9
pshufb %xmm11, %xmm12
movdqa %xmm0, %xmm8
movdqa %xmm1, %xmm13
pclmulqdq $0x0 , %xmm10, %xmm0
pclmulqdq $0x11, %xmm10, %xmm8
pclmulqdq $0x0 , %xmm10, %xmm1
pclmulqdq $0x11, %xmm10, %xmm13
pxor %xmm9 , %xmm0
xorps %xmm8 , %xmm0
pxor %xmm12, %xmm1
xorps %xmm13, %xmm1
movdqu 16*2(arg2), %xmm9
movdqu 16*3(arg2), %xmm12
pshufb %xmm11, %xmm9
pshufb %xmm11, %xmm12
movdqa %xmm2, %xmm8
movdqa %xmm3, %xmm13
pclmulqdq $0x0, %xmm10, %xmm2
pclmulqdq $0x11, %xmm10, %xmm8
pclmulqdq $0x0, %xmm10, %xmm3
pclmulqdq $0x11, %xmm10, %xmm13
pxor %xmm9 , %xmm2
xorps %xmm8 , %xmm2
pxor %xmm12, %xmm3
xorps %xmm13, %xmm3
movdqu 16*4(arg2), %xmm9
movdqu 16*5(arg2), %xmm12
pshufb %xmm11, %xmm9
pshufb %xmm11, %xmm12
movdqa %xmm4, %xmm8
movdqa %xmm5, %xmm13
pclmulqdq $0x0, %xmm10, %xmm4
pclmulqdq $0x11, %xmm10, %xmm8
pclmulqdq $0x0, %xmm10, %xmm5
pclmulqdq $0x11, %xmm10, %xmm13
pxor %xmm9 , %xmm4
xorps %xmm8 , %xmm4
pxor %xmm12, %xmm5
xorps %xmm13, %xmm5
movdqu 16*6(arg2), %xmm9
movdqu 16*7(arg2), %xmm12
pshufb %xmm11, %xmm9
pshufb %xmm11, %xmm12
movdqa %xmm6 , %xmm8
movdqa %xmm7 , %xmm13
pclmulqdq $0x0 , %xmm10, %xmm6
pclmulqdq $0x11, %xmm10, %xmm8
pclmulqdq $0x0 , %xmm10, %xmm7
pclmulqdq $0x11, %xmm10, %xmm13
pxor %xmm9 , %xmm6
xorps %xmm8 , %xmm6
pxor %xmm12, %xmm7
xorps %xmm13, %xmm7
sub $128, arg3
# check if there is another 64B in the buffer to be able to fold
jge _fold_64_B_loop
##################################################################
add $128, arg2
# at this point, the buffer pointer is pointing at the last y Bytes
# of the buffer the 64B of folded data is in 4 of the xmm
# registers: xmm0, xmm1, xmm2, xmm3
# fold the 8 xmm registers to 1 xmm register with different constants
movdqa rk9(%rip), %xmm10
movdqa %xmm0, %xmm8
pclmulqdq $0x11, %xmm10, %xmm0
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
xorps %xmm0, %xmm7
movdqa rk11(%rip), %xmm10
movdqa %xmm1, %xmm8
pclmulqdq $0x11, %xmm10, %xmm1
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
xorps %xmm1, %xmm7
movdqa rk13(%rip), %xmm10
movdqa %xmm2, %xmm8
pclmulqdq $0x11, %xmm10, %xmm2
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
pxor %xmm2, %xmm7
movdqa rk15(%rip), %xmm10
movdqa %xmm3, %xmm8
pclmulqdq $0x11, %xmm10, %xmm3
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
xorps %xmm3, %xmm7
movdqa rk17(%rip), %xmm10
movdqa %xmm4, %xmm8
pclmulqdq $0x11, %xmm10, %xmm4
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
pxor %xmm4, %xmm7
movdqa rk19(%rip), %xmm10
movdqa %xmm5, %xmm8
pclmulqdq $0x11, %xmm10, %xmm5
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
xorps %xmm5, %xmm7
movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
#imm value of pclmulqdq instruction
#will determine which constant to use
movdqa %xmm6, %xmm8
pclmulqdq $0x11, %xmm10, %xmm6
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
pxor %xmm6, %xmm7
# instead of 64, we add 48 to the loop counter to save 1 instruction
# from the loop instead of a cmp instruction, we use the negative
# flag with the jl instruction
add $128-16, arg3
jl _final_reduction_for_128
# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
# and the rest is in memory. We can fold 16 bytes at a time if y>=16
# continue folding 16B at a time
_16B_reduction_loop:
movdqa %xmm7, %xmm8
pclmulqdq $0x11, %xmm10, %xmm7
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
movdqu (arg2), %xmm0
pshufb %xmm11, %xmm0
pxor %xmm0 , %xmm7
add $16, arg2
sub $16, arg3
# instead of a cmp instruction, we utilize the flags with the
# jge instruction equivalent of: cmp arg3, 16-16
# check if there is any more 16B in the buffer to be able to fold
jge _16B_reduction_loop
#now we have 16+z bytes left to reduce, where 0<= z < 16.
#first, we reduce the data in the xmm7 register
_final_reduction_for_128:
# check if any more data to fold. If not, compute the CRC of
# the final 128 bits
add $16, arg3
je _128_done
# here we are getting data that is less than 16 bytes.
# since we know that there was data before the pointer, we can
# offset the input pointer before the actual point, to receive
# exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_xmms:
movdqa %xmm7, %xmm2
movdqu -16(arg2, arg3), %xmm1
pshufb %xmm11, %xmm1
# get rid of the extra data that was loaded before
# load the shift constant
lea pshufb_shf_table+16(%rip), %rax
sub arg3, %rax
movdqu (%rax), %xmm0
# shift xmm2 to the left by arg3 bytes
pshufb %xmm0, %xmm2
# shift xmm7 to the right by 16-arg3 bytes
pxor mask1(%rip), %xmm0
pshufb %xmm0, %xmm7
pblendvb %xmm2, %xmm1 #xmm0 is implicit
# fold 16 Bytes
movdqa %xmm1, %xmm2
movdqa %xmm7, %xmm8
pclmulqdq $0x11, %xmm10, %xmm7
pclmulqdq $0x0 , %xmm10, %xmm8
pxor %xmm8, %xmm7
pxor %xmm2, %xmm7
_128_done:
# compute crc of a 128-bit value
movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
movdqa %xmm7, %xmm0
#64b fold
pclmulqdq $0x1, %xmm10, %xmm7
pslldq $8 , %xmm0
pxor %xmm0, %xmm7
#32b fold
movdqa %xmm7, %xmm0
pand mask2(%rip), %xmm0
psrldq $12, %xmm7
pclmulqdq $0x10, %xmm10, %xmm7
pxor %xmm0, %xmm7
#barrett reduction
_barrett:
movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
movdqa %xmm7, %xmm0
pclmulqdq $0x01, %xmm10, %xmm7
pslldq $4, %xmm7
pclmulqdq $0x11, %xmm10, %xmm7
pslldq $4, %xmm7
pxor %xmm0, %xmm7
pextrd $1, %xmm7, %eax
_cleanup:
# scale the result back to 16 bits
shr $16, %eax
mov %rcx, %rsp
ret
########################################################################
.align 16
_less_than_128:
# check if there is enough buffer to be able to fold 16B at a time
cmp $32, arg3
jl _less_than_32
movdqa SHUF_MASK(%rip), %xmm11
# now if there is, load the constants
movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
movd arg1_low32, %xmm0 # get the initial crc value
pslldq $12, %xmm0 # align it to its correct place
movdqu (arg2), %xmm7 # load the plaintext
pshufb %xmm11, %xmm7 # byte-reflect the plaintext
pxor %xmm0, %xmm7
# update the buffer pointer
add $16, arg2
# update the counter. subtract 32 instead of 16 to save one
# instruction from the loop
sub $32, arg3
jmp _16B_reduction_loop
.align 16
_less_than_32:
# mov initial crc to the return value. this is necessary for
# zero-length buffers.
mov arg1_low32, %eax
test arg3, arg3
je _cleanup
movdqa SHUF_MASK(%rip), %xmm11
movd arg1_low32, %xmm0 # get the initial crc value
pslldq $12, %xmm0 # align it to its correct place
cmp $16, arg3
je _exact_16_left
jl _less_than_16_left
movdqu (arg2), %xmm7 # load the plaintext
pshufb %xmm11, %xmm7 # byte-reflect the plaintext
pxor %xmm0 , %xmm7 # xor the initial crc value
add $16, arg2
sub $16, arg3
movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
jmp _get_last_two_xmms
.align 16
_less_than_16_left:
# use stack space to load data less than 16 bytes, zero-out
# the 16B in memory first.
pxor %xmm1, %xmm1
mov %rsp, %r11
movdqa %xmm1, (%r11)
cmp $4, arg3
jl _only_less_than_4
# backup the counter value
mov arg3, %r9
cmp $8, arg3
jl _less_than_8_left
# load 8 Bytes
mov (arg2), %rax
mov %rax, (%r11)
add $8, %r11
sub $8, arg3
add $8, arg2
_less_than_8_left:
cmp $4, arg3
jl _less_than_4_left
# load 4 Bytes
mov (arg2), %eax
mov %eax, (%r11)
add $4, %r11
sub $4, arg3
add $4, arg2
_less_than_4_left:
cmp $2, arg3
jl _less_than_2_left
# load 2 Bytes
mov (arg2), %ax
mov %ax, (%r11)
add $2, %r11
sub $2, arg3
add $2, arg2
_less_than_2_left:
cmp $1, arg3
jl _zero_left
# load 1 Byte
mov (arg2), %al
mov %al, (%r11)
_zero_left:
movdqa (%rsp), %xmm7
pshufb %xmm11, %xmm7
pxor %xmm0 , %xmm7 # xor the initial crc value
# shl r9, 4
lea pshufb_shf_table+16(%rip), %rax
sub %r9, %rax
movdqu (%rax), %xmm0
pxor mask1(%rip), %xmm0
pshufb %xmm0, %xmm7
jmp _128_done
.align 16
_exact_16_left:
movdqu (arg2), %xmm7
pshufb %xmm11, %xmm7
pxor %xmm0 , %xmm7 # xor the initial crc value
jmp _128_done
_only_less_than_4:
cmp $3, arg3
jl _only_less_than_3
# load 3 Bytes
mov (arg2), %al
mov %al, (%r11)
mov 1(arg2), %al
mov %al, 1(%r11)
mov 2(arg2), %al
mov %al, 2(%r11)
movdqa (%rsp), %xmm7
pshufb %xmm11, %xmm7
pxor %xmm0 , %xmm7 # xor the initial crc value
psrldq $5, %xmm7
jmp _barrett
_only_less_than_3:
cmp $2, arg3
jl _only_less_than_2
# load 2 Bytes
mov (arg2), %al
mov %al, (%r11)
mov 1(arg2), %al
mov %al, 1(%r11)
movdqa (%rsp), %xmm7
pshufb %xmm11, %xmm7
pxor %xmm0 , %xmm7 # xor the initial crc value
psrldq $6, %xmm7
jmp _barrett
_only_less_than_2:
# load 1 Byte
mov (arg2), %al
mov %al, (%r11)
movdqa (%rsp), %xmm7
pshufb %xmm11, %xmm7
pxor %xmm0 , %xmm7 # xor the initial crc value
psrldq $7, %xmm7
jmp _barrett
ENDPROC(crc_t10dif_pcl)
.data
# precomputed constants
# these constants are precomputed from the poly:
# 0x8bb70000 (0x8bb7 scaled to 32 bits)
.align 16
# Q = 0x18BB70000
# rk1 = 2^(32*3) mod Q << 32
# rk2 = 2^(32*5) mod Q << 32
# rk3 = 2^(32*15) mod Q << 32
# rk4 = 2^(32*17) mod Q << 32
# rk5 = 2^(32*3) mod Q << 32
# rk6 = 2^(32*2) mod Q << 32
# rk7 = floor(2^64/Q)
# rk8 = Q
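# rk9..rk20 are the analogous fold constants used above when collapsing
# the eight 16-byte accumulators into one (rk9/rk10 fold xmm0, rk11/rk12
# fold xmm1, ..., rk19/rk20 fold xmm5; rk1/rk2 handle xmm6).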
rk1:
.quad 0x2d56000000000000
rk2:
.quad 0x06df000000000000
rk3:
.quad 0x9d9d000000000000
rk4:
.quad 0x7cf5000000000000
rk5:
.quad 0x2d56000000000000
rk6:
.quad 0x1368000000000000
rk7:
.quad 0x00000001f65a57f8
rk8:
.quad 0x000000018bb70000
rk9:
.quad 0xceae000000000000
rk10:
.quad 0xbfd6000000000000
rk11:
.quad 0x1e16000000000000
rk12:
.quad 0x713c000000000000
rk13:
.quad 0xf7f9000000000000
rk14:
.quad 0x80a6000000000000
rk15:
.quad 0x044c000000000000
rk16:
.quad 0xe658000000000000
rk17:
.quad 0xad18000000000000
rk18:
.quad 0xa497000000000000
rk19:
.quad 0x6ee3000000000000
rk20:
.quad 0xe7b5000000000000
mask1:
.octa 0x80808080808080808080808080808080
mask2:
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
SHUF_MASK:
.octa 0x000102030405060708090A0B0C0D0E0F
pshufb_shf_table:
# use these values for shift constants for the pshufb instruction
# different alignments result in values as shown:
# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
.octa 0x8f8e8d8c8b8a89888786858483828100
.octa 0x000e0d0c0b0a09080706050403020100
/*
* Cryptographic API.
*
* T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
*
* Copyright (C) 2013 Intel Corporation
* Author: Tim Chen <tim.c.chen@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/types.h>
#include <linux/module.h>
#include <linux/crc-t10dif.h>
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <asm/i387.h>
#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
size_t len);
struct chksum_desc_ctx {
__u16 crc;
};
/*
* Steps through buffer one byte at a time, calculates reflected
* crc using table.
*/
static int chksum_init(struct shash_desc *desc)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
ctx->crc = 0;
return 0;
}
static int chksum_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
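/*
 * The PCLMULQDQ routine uses SSE registers, so it can only run when the FPU
 * is usable in this context; otherwise fall back to the table-driven generic
 * implementation.
 */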
if (irq_fpu_usable()) {
kernel_fpu_begin();
ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
kernel_fpu_end();
} else
ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
return 0;
}
static int chksum_final(struct shash_desc *desc, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
*(__u16 *)out = ctx->crc;
return 0;
}
static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (irq_fpu_usable()) {
kernel_fpu_begin();
*(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
kernel_fpu_end();
} else
*(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
return 0;
}
static int chksum_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
return __chksum_finup(&ctx->crc, data, len, out);
}
static int chksum_digest(struct shash_desc *desc, const u8 *data,
unsigned int length, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
return __chksum_finup(&ctx->crc, data, length, out);
}
static struct shash_alg alg = {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = chksum_init,
.update = chksum_update,
.final = chksum_final,
.finup = chksum_finup,
.digest = chksum_digest,
.descsize = sizeof(struct chksum_desc_ctx),
.base = {
.cra_name = "crct10dif",
.cra_driver_name = "crct10dif-pclmul",
.cra_priority = 200,
.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static const struct x86_cpu_id crct10dif_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
{}
};
MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
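/*
 * Register the PCLMULQDQ-accelerated transform only on CPUs that advertise
 * the PCLMULQDQ feature; its cra_priority of 200 makes the crypto layer
 * prefer it over the generic driver (priority 100) when both are loaded.
 */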
static int __init crct10dif_intel_mod_init(void)
{
if (!x86_match_cpu(crct10dif_cpu_id))
return -ENODEV;
return crypto_register_shash(&alg);
}
static void __exit crct10dif_intel_mod_fini(void)
{
crypto_unregister_shash(&alg);
}
module_init(crct10dif_intel_mod_init);
module_exit(crct10dif_intel_mod_fini);
MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
MODULE_LICENSE("GPL");
MODULE_ALIAS("crct10dif");
MODULE_ALIAS("crct10dif-pclmul");
...@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) ...@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
return 0; return 0;
} }
static struct shash_alg alg = { static int sha224_ssse3_init(struct shash_desc *desc)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
sctx->state[0] = SHA224_H0;
sctx->state[1] = SHA224_H1;
sctx->state[2] = SHA224_H2;
sctx->state[3] = SHA224_H3;
sctx->state[4] = SHA224_H4;
sctx->state[5] = SHA224_H5;
sctx->state[6] = SHA224_H6;
sctx->state[7] = SHA224_H7;
sctx->count = 0;
return 0;
}
static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
{
u8 D[SHA256_DIGEST_SIZE];
sha256_ssse3_final(desc, D);
memcpy(hash, D, SHA224_DIGEST_SIZE);
memset(D, 0, SHA256_DIGEST_SIZE);
return 0;
}
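/*
 * SHA-224 shares the SHA-256 compression function: only the initial state
 * differs and the digest is truncated to 28 bytes, so the SHA-256 update,
 * export and import handlers are reused for it below.
 */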
static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE, .digestsize = SHA256_DIGEST_SIZE,
.init = sha256_ssse3_init, .init = sha256_ssse3_init,
.update = sha256_ssse3_update, .update = sha256_ssse3_update,
...@@ -204,7 +233,24 @@ static struct shash_alg alg = { ...@@ -204,7 +233,24 @@ static struct shash_alg alg = {
.cra_blocksize = SHA256_BLOCK_SIZE, .cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
} }
}; }, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_ssse3_init,
.update = sha256_ssse3_update,
.final = sha224_ssse3_final,
.export = sha256_ssse3_export,
.import = sha256_ssse3_import,
.descsize = sizeof(struct sha256_state),
.statesize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
#ifdef CONFIG_AS_AVX #ifdef CONFIG_AS_AVX
static bool __init avx_usable(void) static bool __init avx_usable(void)
...@@ -227,7 +273,7 @@ static bool __init avx_usable(void) ...@@ -227,7 +273,7 @@ static bool __init avx_usable(void)
static int __init sha256_ssse3_mod_init(void) static int __init sha256_ssse3_mod_init(void)
{ {
/* test for SSE3 first */ /* test for SSSE3 first */
if (cpu_has_ssse3) if (cpu_has_ssse3)
sha256_transform_asm = sha256_transform_ssse3; sha256_transform_asm = sha256_transform_ssse3;
...@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) ...@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void)
else else
#endif #endif
pr_info("Using SSSE3 optimized SHA-256 implementation\n"); pr_info("Using SSSE3 optimized SHA-256 implementation\n");
return crypto_register_shash(&alg); return crypto_register_shashes(algs, ARRAY_SIZE(algs));
} }
pr_info("Neither AVX nor SSSE3 is available/usable.\n"); pr_info("Neither AVX nor SSSE3 is available/usable.\n");
...@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) ...@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void)
static void __exit sha256_ssse3_mod_fini(void) static void __exit sha256_ssse3_mod_fini(void)
{ {
crypto_unregister_shash(&alg); crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
} }
module_init(sha256_ssse3_mod_init); module_init(sha256_ssse3_mod_init);
...@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); ...@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS("sha256"); MODULE_ALIAS("sha256");
MODULE_ALIAS("sha224");
...@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) ...@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
return 0; return 0;
} }
static struct shash_alg alg = { static int sha384_ssse3_init(struct shash_desc *desc)
{
struct sha512_state *sctx = shash_desc_ctx(desc);
sctx->state[0] = SHA384_H0;
sctx->state[1] = SHA384_H1;
sctx->state[2] = SHA384_H2;
sctx->state[3] = SHA384_H3;
sctx->state[4] = SHA384_H4;
sctx->state[5] = SHA384_H5;
sctx->state[6] = SHA384_H6;
sctx->state[7] = SHA384_H7;
sctx->count[0] = sctx->count[1] = 0;
return 0;
}
static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
{
u8 D[SHA512_DIGEST_SIZE];
sha512_ssse3_final(desc, D);
memcpy(hash, D, SHA384_DIGEST_SIZE);
memset(D, 0, SHA512_DIGEST_SIZE);
return 0;
}
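/*
 * Likewise, SHA-384 is SHA-512 with a different initial state and a 48-byte
 * digest, so the SHA-512 update, export and import handlers are reused below.
 */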
static struct shash_alg algs[] = { {
.digestsize = SHA512_DIGEST_SIZE, .digestsize = SHA512_DIGEST_SIZE,
.init = sha512_ssse3_init, .init = sha512_ssse3_init,
.update = sha512_ssse3_update, .update = sha512_ssse3_update,
...@@ -211,7 +241,24 @@ static struct shash_alg alg = { ...@@ -211,7 +241,24 @@ static struct shash_alg alg = {
.cra_blocksize = SHA512_BLOCK_SIZE, .cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
} }
}; }, {
.digestsize = SHA384_DIGEST_SIZE,
.init = sha384_ssse3_init,
.update = sha512_ssse3_update,
.final = sha384_ssse3_final,
.export = sha512_ssse3_export,
.import = sha512_ssse3_import,
.descsize = sizeof(struct sha512_state),
.statesize = sizeof(struct sha512_state),
.base = {
.cra_name = "sha384",
.cra_driver_name = "sha384-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
#ifdef CONFIG_AS_AVX #ifdef CONFIG_AS_AVX
static bool __init avx_usable(void) static bool __init avx_usable(void)
...@@ -234,7 +281,7 @@ static bool __init avx_usable(void) ...@@ -234,7 +281,7 @@ static bool __init avx_usable(void)
static int __init sha512_ssse3_mod_init(void) static int __init sha512_ssse3_mod_init(void)
{ {
/* test for SSE3 first */ /* test for SSSE3 first */
if (cpu_has_ssse3) if (cpu_has_ssse3)
sha512_transform_asm = sha512_transform_ssse3; sha512_transform_asm = sha512_transform_ssse3;
...@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) ...@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void)
else else
#endif #endif
pr_info("Using SSSE3 optimized SHA-512 implementation\n"); pr_info("Using SSSE3 optimized SHA-512 implementation\n");
return crypto_register_shash(&alg); return crypto_register_shashes(algs, ARRAY_SIZE(algs));
} }
pr_info("Neither AVX nor SSSE3 is available/usable.\n"); pr_info("Neither AVX nor SSSE3 is available/usable.\n");
...@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) ...@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void)
static void __exit sha512_ssse3_mod_fini(void) static void __exit sha512_ssse3_mod_fini(void)
{ {
crypto_unregister_shash(&alg); crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
} }
module_init(sha512_ssse3_mod_init); module_init(sha512_ssse3_mod_init);
...@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); ...@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS("sha512"); MODULE_ALIAS("sha512");
MODULE_ALIAS("sha384");
/*
* x86_64/AVX2 assembler optimized version of Twofish
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/
#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"
.file "twofish-avx2-asm_64.S"
.data
.align 16
.Lvpshufb_mask0:
.long 0x80808000
.long 0x80808004
.long 0x80808008
.long 0x8080800c
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
.text
/* structure of crypto context */
#define s0 0
#define s1 1024
#define s2 2048
#define s3 3072
#define w 4096
#define k 4128
/* register macros */
#define CTX %rdi
#define RS0 CTX
#define RS1 %r8
#define RS2 %r9
#define RS3 %r10
#define RK %r11
#define RW %rax
#define RROUND %r12
#define RROUNDd %r12d
#define RA0 %ymm8
#define RB0 %ymm9
#define RC0 %ymm10
#define RD0 %ymm11
#define RA1 %ymm12
#define RB1 %ymm13
#define RC1 %ymm14
#define RD1 %ymm15
/* temp regs */
#define RX0 %ymm0
#define RY0 %ymm1
#define RX1 %ymm2
#define RY1 %ymm3
#define RT0 %ymm4
#define RIDX %ymm5
#define RX0x %xmm0
#define RY0x %xmm1
#define RX1x %xmm2
#define RY1x %xmm3
#define RT0x %xmm4
/* vpgatherdd mask and '-1' */
#define RNOT %ymm6
/* byte mask, (-1 >> 24) */
#define RBYTE %ymm7
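/* vpgatherdd consumes its mask operand, so RNOT is reloaded with all-ones
 * (vpcmpeqd) after every gather in g16() below; RBYTE masks each dword down
 * to its low byte when forming the s-box indices. */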
/**********************************************************************
16-way AVX2 twofish
**********************************************************************/
#define init_round_constants() \
vpcmpeqd RNOT, RNOT, RNOT; \
vpsrld $24, RNOT, RBYTE; \
leaq k(CTX), RK; \
leaq w(CTX), RW; \
leaq s1(CTX), RS1; \
leaq s2(CTX), RS2; \
leaq s3(CTX), RS3; \
#define g16(ab, rs0, rs1, rs2, rs3, xy) \
vpand RBYTE, ab ## 0, RIDX; \
vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
vpcmpeqd RNOT, RNOT, RNOT; \
\
vpand RBYTE, ab ## 1, RIDX; \
vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
vpcmpeqd RNOT, RNOT, RNOT; \
\
vpsrld $8, ab ## 0, RIDX; \
vpand RBYTE, RIDX, RIDX; \
vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 0, xy ## 0; \
\
vpsrld $8, ab ## 1, RIDX; \
vpand RBYTE, RIDX, RIDX; \
vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 1, xy ## 1; \
\
vpsrld $16, ab ## 0, RIDX; \
vpand RBYTE, RIDX, RIDX; \
vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 0, xy ## 0; \
\
vpsrld $16, ab ## 1, RIDX; \
vpand RBYTE, RIDX, RIDX; \
vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 1, xy ## 1; \
\
vpsrld $24, ab ## 0, RIDX; \
vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 0, xy ## 0; \
\
vpsrld $24, ab ## 1, RIDX; \
vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpxor RT0, xy ## 1, xy ## 1;
#define g1_16(a, x) \
g16(a, RS0, RS1, RS2, RS3, x);
#define g2_16(b, y) \
g16(b, RS1, RS2, RS3, RS0, y);
#define encrypt_round_end16(a, b, c, d, nk) \
vpaddd RY0, RX0, RX0; \
vpaddd RX0, RY0, RY0; \
vpbroadcastd nk(RK,RROUND,8), RT0; \
vpaddd RT0, RX0, RX0; \
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
vpaddd RT0, RY0, RY0; \
\
vpxor RY0, d ## 0, d ## 0; \
\
vpxor RX0, c ## 0, c ## 0; \
vpsrld $1, c ## 0, RT0; \
vpslld $31, c ## 0, c ## 0; \
vpor RT0, c ## 0, c ## 0; \
\
vpaddd RY1, RX1, RX1; \
vpaddd RX1, RY1, RY1; \
vpbroadcastd nk(RK,RROUND,8), RT0; \
vpaddd RT0, RX1, RX1; \
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
vpaddd RT0, RY1, RY1; \
\
vpxor RY1, d ## 1, d ## 1; \
\
vpxor RX1, c ## 1, c ## 1; \
vpsrld $1, c ## 1, RT0; \
vpslld $31, c ## 1, c ## 1; \
vpor RT0, c ## 1, c ## 1; \
#define encrypt_round16(a, b, c, d, nk) \
g2_16(b, RY); \
\
vpslld $1, b ## 0, RT0; \
vpsrld $31, b ## 0, b ## 0; \
vpor RT0, b ## 0, b ## 0; \
\
vpslld $1, b ## 1, RT0; \
vpsrld $31, b ## 1, b ## 1; \
vpor RT0, b ## 1, b ## 1; \
\
g1_16(a, RX); \
\
encrypt_round_end16(a, b, c, d, nk);
#define encrypt_round_first16(a, b, c, d, nk) \
vpslld $1, d ## 0, RT0; \
vpsrld $31, d ## 0, d ## 0; \
vpor RT0, d ## 0, d ## 0; \
\
vpslld $1, d ## 1, RT0; \
vpsrld $31, d ## 1, d ## 1; \
vpor RT0, d ## 1, d ## 1; \
\
encrypt_round16(a, b, c, d, nk);
#define encrypt_round_last16(a, b, c, d, nk) \
g2_16(b, RY); \
\
g1_16(a, RX); \
\
encrypt_round_end16(a, b, c, d, nk);
#define decrypt_round_end16(a, b, c, d, nk) \
vpaddd RY0, RX0, RX0; \
vpaddd RX0, RY0, RY0; \
vpbroadcastd nk(RK,RROUND,8), RT0; \
vpaddd RT0, RX0, RX0; \
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
vpaddd RT0, RY0, RY0; \
\
vpxor RX0, c ## 0, c ## 0; \
\
vpxor RY0, d ## 0, d ## 0; \
vpsrld $1, d ## 0, RT0; \
vpslld $31, d ## 0, d ## 0; \
vpor RT0, d ## 0, d ## 0; \
\
vpaddd RY1, RX1, RX1; \
vpaddd RX1, RY1, RY1; \
vpbroadcastd nk(RK,RROUND,8), RT0; \
vpaddd RT0, RX1, RX1; \
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
vpaddd RT0, RY1, RY1; \
\
vpxor RX1, c ## 1, c ## 1; \
\
vpxor RY1, d ## 1, d ## 1; \
vpsrld $1, d ## 1, RT0; \
vpslld $31, d ## 1, d ## 1; \
vpor RT0, d ## 1, d ## 1;
#define decrypt_round16(a, b, c, d, nk) \
g1_16(a, RX); \
\
vpslld $1, a ## 0, RT0; \
vpsrld $31, a ## 0, a ## 0; \
vpor RT0, a ## 0, a ## 0; \
\
vpslld $1, a ## 1, RT0; \
vpsrld $31, a ## 1, a ## 1; \
vpor RT0, a ## 1, a ## 1; \
\
g2_16(b, RY); \
\
decrypt_round_end16(a, b, c, d, nk);
#define decrypt_round_first16(a, b, c, d, nk) \
vpslld $1, c ## 0, RT0; \
vpsrld $31, c ## 0, c ## 0; \
vpor RT0, c ## 0, c ## 0; \
\
vpslld $1, c ## 1, RT0; \
vpsrld $31, c ## 1, c ## 1; \
vpor RT0, c ## 1, c ## 1; \
\
decrypt_round16(a, b, c, d, nk)
#define decrypt_round_last16(a, b, c, d, nk) \
g1_16(a, RX); \
\
g2_16(b, RY); \
\
decrypt_round_end16(a, b, c, d, nk);
#define encrypt_cycle16() \
encrypt_round16(RA, RB, RC, RD, 0); \
encrypt_round16(RC, RD, RA, RB, 8);
#define encrypt_cycle_first16() \
encrypt_round_first16(RA, RB, RC, RD, 0); \
encrypt_round16(RC, RD, RA, RB, 8);
#define encrypt_cycle_last16() \
encrypt_round16(RA, RB, RC, RD, 0); \
encrypt_round_last16(RC, RD, RA, RB, 8);
#define decrypt_cycle16(n) \
decrypt_round16(RC, RD, RA, RB, 8); \
decrypt_round16(RA, RB, RC, RD, 0);
#define decrypt_cycle_first16(n) \
decrypt_round_first16(RC, RD, RA, RB, 8); \
decrypt_round16(RA, RB, RC, RD, 0);
#define decrypt_cycle_last16(n) \
decrypt_round16(RC, RD, RA, RB, 8); \
decrypt_round_last16(RA, RB, RC, RD, 0);
#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
vpunpckhdq x1, x0, t2; \
vpunpckldq x1, x0, x0; \
\
vpunpckldq x3, x2, t1; \
vpunpckhdq x3, x2, x2; \
\
vpunpckhqdq t1, x0, x1; \
vpunpcklqdq t1, x0, x0; \
\
vpunpckhqdq x2, t2, x3; \
vpunpcklqdq x2, t2, x2;
#define read_blocks8(offs,a,b,c,d) \
transpose_4x4(a, b, c, d, RX0, RY0);
#define write_blocks8(offs,a,b,c,d) \
transpose_4x4(a, b, c, d, RX0, RY0);
#define inpack_enc8(a,b,c,d) \
vpbroadcastd 4*0(RW), RT0; \
vpxor RT0, a, a; \
\
vpbroadcastd 4*1(RW), RT0; \
vpxor RT0, b, b; \
\
vpbroadcastd 4*2(RW), RT0; \
vpxor RT0, c, c; \
\
vpbroadcastd 4*3(RW), RT0; \
vpxor RT0, d, d;
#define outunpack_enc8(a,b,c,d) \
vpbroadcastd 4*4(RW), RX0; \
vpbroadcastd 4*5(RW), RY0; \
vpxor RX0, c, RX0; \
vpxor RY0, d, RY0; \
\
vpbroadcastd 4*6(RW), RT0; \
vpxor RT0, a, c; \
vpbroadcastd 4*7(RW), RT0; \
vpxor RT0, b, d; \
\
vmovdqa RX0, a; \
vmovdqa RY0, b;
#define inpack_dec8(a,b,c,d) \
vpbroadcastd 4*4(RW), RX0; \
vpbroadcastd 4*5(RW), RY0; \
vpxor RX0, a, RX0; \
vpxor RY0, b, RY0; \
\
vpbroadcastd 4*6(RW), RT0; \
vpxor RT0, c, a; \
vpbroadcastd 4*7(RW), RT0; \
vpxor RT0, d, b; \
\
vmovdqa RX0, c; \
vmovdqa RY0, d;
#define outunpack_dec8(a,b,c,d) \
vpbroadcastd 4*0(RW), RT0; \
vpxor RT0, a, a; \
\
vpbroadcastd 4*1(RW), RT0; \
vpxor RT0, b, b; \
\
vpbroadcastd 4*2(RW), RT0; \
vpxor RT0, c, c; \
\
vpbroadcastd 4*3(RW), RT0; \
vpxor RT0, d, d;
#define read_blocks16(a,b,c,d) \
read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
#define write_blocks16(a,b,c,d) \
write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
#define xor_blocks16(a,b,c,d) \
xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
#define inpack_enc16(a,b,c,d) \
inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
#define outunpack_enc16(a,b,c,d) \
outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
#define inpack_dec16(a,b,c,d) \
inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
#define outunpack_dec16(a,b,c,d) \
outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
.align 8
__twofish_enc_blk16:
/* input:
* %rdi: ctx, CTX
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
* output:
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
*/
init_round_constants();
read_blocks16(RA, RB, RC, RD);
inpack_enc16(RA, RB, RC, RD);
xorl RROUNDd, RROUNDd;
encrypt_cycle_first16();
movl $2, RROUNDd;
.align 4
.L__enc_loop:
encrypt_cycle16();
addl $2, RROUNDd;
cmpl $14, RROUNDd;
jne .L__enc_loop;
encrypt_cycle_last16();
outunpack_enc16(RA, RB, RC, RD);
write_blocks16(RA, RB, RC, RD);
ret;
ENDPROC(__twofish_enc_blk16)
.align 8
__twofish_dec_blk16:
/* input:
* %rdi: ctx, CTX
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
* output:
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
*/
init_round_constants();
read_blocks16(RA, RB, RC, RD);
inpack_dec16(RA, RB, RC, RD);
movl $14, RROUNDd;
decrypt_cycle_first16();
movl $12, RROUNDd;
.align 4
.L__dec_loop:
decrypt_cycle16();
addl $-2, RROUNDd;
jnz .L__dec_loop;
decrypt_cycle_last16();
outunpack_dec16(RA, RB, RC, RD);
write_blocks16(RA, RB, RC, RD);
ret;
ENDPROC(__twofish_dec_blk16)
ENTRY(twofish_ecb_enc_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
pushq %r12;
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
call __twofish_enc_blk16;
store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
popq %r12;
vzeroupper;
ret;
ENDPROC(twofish_ecb_enc_16way)
ENTRY(twofish_ecb_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
pushq %r12;
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
call __twofish_dec_blk16;
store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
popq %r12;
vzeroupper;
ret;
ENDPROC(twofish_ecb_dec_16way)
ENTRY(twofish_cbc_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
vzeroupper;
pushq %r12;
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
call __twofish_dec_blk16;
store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
RX0);
popq %r12;
vzeroupper;
ret;
ENDPROC(twofish_cbc_dec_16way)
ENTRY(twofish_ctr_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
* %rcx: iv (little endian, 128bit)
*/
vzeroupper;
pushq %r12;
load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
RBYTE);
call __twofish_enc_blk16;
store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
popq %r12;
vzeroupper;
ret;
ENDPROC(twofish_ctr_16way)
.align 8
twofish_xts_crypt_16way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
* %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
*/
vzeroupper;
pushq %r12;
load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
.Lxts_gf128mul_and_shl1_mask_0,
.Lxts_gf128mul_and_shl1_mask_1);
call *%r8;
store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
popq %r12;
vzeroupper;
ret;
ENDPROC(twofish_xts_crypt_16way)
ENTRY(twofish_xts_enc_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
leaq __twofish_enc_blk16, %r8;
jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_enc_16way)
ENTRY(twofish_xts_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
leaq __twofish_dec_blk16, %r8;
jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_dec_16way)
/*
* Glue Code for x86_64/AVX2 assembler optimized version of Twofish
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/ctr.h>
#include <crypto/twofish.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/twofish.h>
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#include <crypto/scatterwalk.h>
#define TF_AVX2_PARALLEL_BLOCKS 16
/* 16-way AVX2 parallel cipher functions */
asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}
static const struct common_glue_ctx twofish_enc = {
.num_funcs = 4,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
}, {
.num_blocks = 8,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
} }
};
static const struct common_glue_ctx twofish_ctr = {
.num_funcs = 4,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
}, {
.num_blocks = 8,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
} }
};
static const struct common_glue_ctx twofish_enc_xts = {
.num_funcs = 3,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
}, {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
} }
};
static const struct common_glue_ctx twofish_dec = {
.num_funcs = 4,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
}, {
.num_blocks = 8,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
} }
};
static const struct common_glue_ctx twofish_dec_cbc = {
.num_funcs = 4,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
}, {
.num_blocks = 8,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
} }
};
static const struct common_glue_ctx twofish_dec_xts = {
.num_funcs = 3,
.fpu_blocks_limit = 8,
.funcs = { {
.num_blocks = 16,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
}, {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
} }
};
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
}
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
}
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
dst, src, nbytes);
}
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
nbytes);
}
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
}
static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
/* the AVX functions are reused, so start using the FPU at 8 parallel blocks */
return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
}
static inline void twofish_fpu_end(bool fpu_enabled)
{
glue_fpu_end(fpu_enabled);
}
struct crypt_priv {
struct twofish_ctx *ctx;
bool fpu_enabled;
};
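/*
 * encrypt_callback()/decrypt_callback() below process the linear LRW/XTS
 * buffer in progressively smaller batches: 16-way AVX2, 8-way AVX, 3-way and
 * finally one block at a time, so no trailing blocks are left unhandled.
 */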
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = TF_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
}
while (nbytes >= 8 * bsize) {
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * 8;
nbytes -= bsize * 8;
}
while (nbytes >= 3 * bsize) {
twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * 3;
nbytes -= bsize * 3;
}
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
twofish_enc_blk(ctx->ctx, srcdst, srcdst);
}
static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = TF_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
}
while (nbytes >= 8 * bsize) {
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * 8;
nbytes -= bsize * 8;
}
while (nbytes >= 3 * bsize) {
twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * 3;
nbytes -= bsize * 3;
}
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
twofish_dec_blk(ctx->ctx, srcdst, srcdst);
}
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TF_AVX2_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->twofish_ctx,
.fpu_enabled = false,
};
struct lrw_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.table_ctx = &ctx->lrw_table,
.crypt_ctx = &crypt_ctx,
.crypt_fn = encrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = lrw_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TF_AVX2_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->twofish_ctx,
.fpu_enabled = false,
};
struct lrw_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.table_ctx = &ctx->lrw_table,
.crypt_ctx = &crypt_ctx,
.crypt_fn = decrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = lrw_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
XTS_TWEAK_CAST(twofish_enc_blk),
&ctx->tweak_ctx, &ctx->crypt_ctx);
}
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
XTS_TWEAK_CAST(twofish_enc_blk),
&ctx->tweak_ctx, &ctx->crypt_ctx);
}
static struct crypto_alg tf_algs[10] = { {
.cra_name = "__ecb-twofish-avx2",
.cra_driver_name = "__driver-ecb-twofish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = twofish_setkey,
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
},
},
}, {
.cra_name = "__cbc-twofish-avx2",
.cra_driver_name = "__driver-cbc-twofish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = twofish_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
},
},
}, {
.cra_name = "__ctr-twofish-avx2",
.cra_driver_name = "__driver-ctr-twofish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = twofish_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
},
},
}, {
.cra_name = "__lrw-twofish-avx2",
.cra_driver_name = "__driver-lrw-twofish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_lrw_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_exit = lrw_twofish_exit_tfm,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE +
TF_BLOCK_SIZE,
.max_keysize = TF_MAX_KEY_SIZE +
TF_BLOCK_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = lrw_twofish_setkey,
.encrypt = lrw_encrypt,
.decrypt = lrw_decrypt,
},
},
}, {
.cra_name = "__xts-twofish-avx2",
.cra_driver_name = "__driver-xts-twofish-avx2",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_xts_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE * 2,
.max_keysize = TF_MAX_KEY_SIZE * 2,
.ivsize = TF_BLOCK_SIZE,
.setkey = xts_twofish_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
},
},
}, {
.cra_name = "ecb(twofish)",
.cra_driver_name = "ecb-twofish-avx2",
.cra_priority = 500,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "cbc(twofish)",
.cra_driver_name = "cbc-twofish-avx2",
.cra_priority = 500,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "ctr(twofish)",
.cra_driver_name = "ctr-twofish-avx2",
.cra_priority = 500,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_encrypt,
.geniv = "chainiv",
},
},
}, {
.cra_name = "lrw(twofish)",
.cra_driver_name = "lrw-twofish-avx2",
.cra_priority = 500,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE +
TF_BLOCK_SIZE,
.max_keysize = TF_MAX_KEY_SIZE +
TF_BLOCK_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "xts(twofish)",
.cra_driver_name = "xts-twofish-avx2",
.cra_priority = 500,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE * 2,
.max_keysize = TF_MAX_KEY_SIZE * 2,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
} };
static int __init init(void)
{
u64 xcr0;
if (!cpu_has_avx2 || !cpu_has_osxsave) {
pr_info("AVX2 instructions are not detected.\n");
return -ENODEV;
}
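/* make sure the OS saves/restores both SSE and YMM register state before
 * using the AVX2 code paths */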
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
pr_info("AVX2 detected but unusable.\n");
return -ENODEV;
}
return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
}
static void __exit fini(void)
{
crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
}
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
MODULE_ALIAS("twofish");
MODULE_ALIAS("twofish-asm");
...@@ -50,26 +50,18 @@ ...@@ -50,26 +50,18 @@
/* 8-way parallel cipher functions */ /* 8-way parallel cipher functions */
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv); const u8 *src, le128 *iv);
EXPORT_SYMBOL_GPL(twofish_ctr_8way);
asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv); const u8 *src, le128 *iv);
EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv); const u8 *src, le128 *iv);
EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src) const u8 *src)
...@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, ...@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
__twofish_enc_blk_3way(ctx, dst, src, false); __twofish_enc_blk_3way(ctx, dst, src, false);
} }
void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{ {
glue_xts_crypt_128bit_one(ctx, dst, src, iv, glue_xts_crypt_128bit_one(ctx, dst, src, iv,
GLUE_FUNC_CAST(twofish_enc_blk)); GLUE_FUNC_CAST(twofish_enc_blk));
} }
EXPORT_SYMBOL_GPL(twofish_xts_enc);
void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{ {
glue_xts_crypt_128bit_one(ctx, dst, src, iv, glue_xts_crypt_128bit_one(ctx, dst, src, iv,
GLUE_FUNC_CAST(twofish_dec_blk)); GLUE_FUNC_CAST(twofish_dec_blk));
} }
EXPORT_SYMBOL_GPL(twofish_xts_dec);
static const struct common_glue_ctx twofish_enc = { static const struct common_glue_ctx twofish_enc = {
......
#ifndef ASM_X86_BLOWFISH_H
#define ASM_X86_BLOWFISH_H
#include <linux/crypto.h>
#include <crypto/blowfish.h>
#define BF_PARALLEL_BLOCKS 4
/* regular block cipher functions */
asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
bool xor);
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
/* 4-way parallel cipher functions */
asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
{
__blowfish_enc_blk(ctx, dst, src, false);
}
static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk(ctx, dst, src, true);
}
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, false);
}
static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, true);
}
#endif
...@@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, ...@@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
/* 8-way parallel cipher functions */
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
/* helpers from twofish_x86_64-3way module */ /* helpers from twofish_x86_64-3way module */
extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
...@@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); ...@@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen); unsigned int keylen);
/* helpers from twofish-avx module */
extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
#endif /* ASM_X86_TWOFISH_H */ #endif /* ASM_X86_TWOFISH_H */
...@@ -376,6 +376,25 @@ config CRYPTO_CRC32_PCLMUL ...@@ -376,6 +376,25 @@ config CRYPTO_CRC32_PCLMUL
which will enable any routine to use the CRC-32-IEEE 802.3 checksum which will enable any routine to use the CRC-32-IEEE 802.3 checksum
and gain better performance as compared with the table implementation. and gain better performance as compared with the table implementation.
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH
help
CRC T10 Data Integrity Field computation is being cast as
a crypto transform. This allows faster crc t10 dif
transforms to be used if they are available.
config CRYPTO_CRCT10DIF_PCLMUL
tristate "CRCT10DIF PCLMULQDQ hardware acceleration"
depends on X86 && 64BIT && CRC_T10DIF
select CRYPTO_HASH
help
For x86_64 processors with SSE4.2 and PCLMULQDQ support,
CRC T10 DIF computation can be hardware accelerated with the
PCLMULQDQ instruction. This option will create the
'crct10dif-pclmul' module, which computes the crct10dif
checksum faster than the generic table implementation.
config CRYPTO_GHASH config CRYPTO_GHASH
tristate "GHASH digest algorithm" tristate "GHASH digest algorithm"
select CRYPTO_GF128MUL select CRYPTO_GF128MUL
...@@ -820,25 +839,6 @@ config CRYPTO_BLOWFISH_X86_64 ...@@ -820,25 +839,6 @@ config CRYPTO_BLOWFISH_X86_64
See also: See also:
<http://www.schneier.com/blowfish.html> <http://www.schneier.com/blowfish.html>
config CRYPTO_BLOWFISH_AVX2_X86_64
tristate "Blowfish cipher algorithm (x86_64/AVX2)"
depends on X86 && 64BIT
depends on BROKEN
select CRYPTO_ALGAPI
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_BLOWFISH_COMMON
select CRYPTO_BLOWFISH_X86_64
help
Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
This is a variable key length cipher which can use keys from 32
bits to 448 bits in length. It's fast, simple and specifically
designed for use on "large microprocessors".
See also:
<http://www.schneier.com/blowfish.html>
config CRYPTO_CAMELLIA config CRYPTO_CAMELLIA
tristate "Camellia cipher algorithms" tristate "Camellia cipher algorithms"
depends on CRYPTO depends on CRYPTO
...@@ -1297,31 +1297,6 @@ config CRYPTO_TWOFISH_AVX_X86_64 ...@@ -1297,31 +1297,6 @@ config CRYPTO_TWOFISH_AVX_X86_64
See also: See also:
<http://www.schneier.com/twofish.html> <http://www.schneier.com/twofish.html>
config CRYPTO_TWOFISH_AVX2_X86_64
tristate "Twofish cipher algorithm (x86_64/AVX2)"
depends on X86 && 64BIT
depends on BROKEN
select CRYPTO_ALGAPI
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_GLUE_HELPER_X86
select CRYPTO_TWOFISH_COMMON
select CRYPTO_TWOFISH_X86_64
select CRYPTO_TWOFISH_X86_64_3WAY
select CRYPTO_TWOFISH_AVX_X86_64
select CRYPTO_LRW
select CRYPTO_XTS
help
Twofish cipher algorithm (x86_64/AVX2).
Twofish was submitted as an AES (Advanced Encryption Standard)
candidate cipher by researchers at CounterPane Systems. It is a
16 round block cipher supporting key sizes of 128, 192, and 256
bits.
See also:
<http://www.schneier.com/twofish.html>
comment "Compression" comment "Compression"
config CRYPTO_DEFLATE config CRYPTO_DEFLATE
......
...@@ -83,6 +83,7 @@ obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o ...@@ -83,6 +83,7 @@ obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o
obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
obj-$(CONFIG_CRYPTO_CRC32) += crc32.o obj-$(CONFIG_CRYPTO_CRC32) += crc32.o
obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o
obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
obj-$(CONFIG_CRYPTO_LZO) += lzo.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o
obj-$(CONFIG_CRYPTO_842) += 842.o obj-$(CONFIG_CRYPTO_842) += 842.o
......
/*
* Cryptographic API.
*
* T10 Data Integrity Field CRC16 Crypto Transform
*
* Copyright (c) 2007 Oracle Corporation. All rights reserved.
* Written by Martin K. Petersen <martin.petersen@oracle.com>
* Copyright (C) 2013 Intel Corporation
* Author: Tim Chen <tim.c.chen@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/types.h>
#include <linux/module.h>
#include <linux/crc-t10dif.h>
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>
struct chksum_desc_ctx {
__u16 crc;
};
/* Table generated using the following polynomial:
* x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
* gt: 0x8bb7
*/
static const __u16 t10_dif_crc_table[256] = {
0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
};
__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len)
{
unsigned int i;
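/* MSB-first, table-driven update: fold each byte into the top byte of the
 * CRC and shift in eight zero bits per iteration */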
for (i = 0 ; i < len ; i++)
crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
return crc;
}
EXPORT_SYMBOL(crc_t10dif_generic);
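/*
 * For reference: with a zero seed, the CRC of the ASCII string "123456789"
 * under this polynomial is expected to be 0xd0db, the usual CRC-16/T10-DIF
 * check value.
 */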
/*
* Steps through buffer one byte at a time, calculates reflected
* crc using table.
*/
static int chksum_init(struct shash_desc *desc)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
ctx->crc = 0;
return 0;
}
static int chksum_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
return 0;
}
static int chksum_final(struct shash_desc *desc, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
*(__u16 *)out = ctx->crc;
return 0;
}
static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
*(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
return 0;
}
static int chksum_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
return __chksum_finup(&ctx->crc, data, len, out);
}
static int chksum_digest(struct shash_desc *desc, const u8 *data,
unsigned int length, u8 *out)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
return __chksum_finup(&ctx->crc, data, length, out);
}
static struct shash_alg alg = {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = chksum_init,
.update = chksum_update,
.final = chksum_final,
.finup = chksum_finup,
.digest = chksum_digest,
.descsize = sizeof(struct chksum_desc_ctx),
.base = {
.cra_name = "crct10dif",
.cra_driver_name = "crct10dif-generic",
.cra_priority = 100,
.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int __init crct10dif_mod_init(void)
{
int ret;
ret = crypto_register_shash(&alg);
return ret;
}
static void __exit crct10dif_mod_fini(void)
{
crypto_unregister_shash(&alg);
}
module_init(crct10dif_mod_init);
module_exit(crct10dif_mod_fini);
MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
MODULE_DESCRIPTION("T10 DIF CRC calculation.");
MODULE_LICENSE("GPL");
...@@ -251,6 +251,7 @@ static struct shash_alg sha512_algs[2] = { { ...@@ -251,6 +251,7 @@ static struct shash_alg sha512_algs[2] = { {
.descsize = sizeof(struct sha512_state), .descsize = sizeof(struct sha512_state),
.base = { .base = {
.cra_name = "sha512", .cra_name = "sha512",
.cra_driver_name = "sha512-generic",
.cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE, .cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
...@@ -263,6 +264,7 @@ static struct shash_alg sha512_algs[2] = { { ...@@ -263,6 +264,7 @@ static struct shash_alg sha512_algs[2] = { {
.descsize = sizeof(struct sha512_state), .descsize = sizeof(struct sha512_state),
.base = { .base = {
.cra_name = "sha384", .cra_name = "sha384",
.cra_driver_name = "sha384-generic",
.cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE, .cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
......
...@@ -1174,6 +1174,10 @@ static int do_test(int m) ...@@ -1174,6 +1174,10 @@ static int do_test(int m)
ret += tcrypt_test("ghash"); ret += tcrypt_test("ghash");
break; break;
case 47:
ret += tcrypt_test("crct10dif");
break;
case 100: case 100:
ret += tcrypt_test("hmac(md5)"); ret += tcrypt_test("hmac(md5)");
break; break;
...@@ -1498,6 +1502,10 @@ static int do_test(int m) ...@@ -1498,6 +1502,10 @@ static int do_test(int m)
test_hash_speed("crc32c", sec, generic_hash_speed_template); test_hash_speed("crc32c", sec, generic_hash_speed_template);
if (mode > 300 && mode < 400) break; if (mode > 300 && mode < 400) break;
case 320:
test_hash_speed("crct10dif", sec, generic_hash_speed_template);
if (mode > 300 && mode < 400) break;
case 399: case 399:
break; break;
......
...@@ -184,8 +184,9 @@ static int do_one_async_hash_op(struct ahash_request *req, ...@@ -184,8 +184,9 @@ static int do_one_async_hash_op(struct ahash_request *req,
return ret; return ret;
} }
static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
unsigned int tcount, bool use_digest) unsigned int tcount, bool use_digest,
const int align_offset)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
unsigned int i, j, k, temp; unsigned int i, j, k, temp;
...@@ -216,10 +217,15 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ...@@ -216,10 +217,15 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
if (template[i].np) if (template[i].np)
continue; continue;
ret = -EINVAL;
if (WARN_ON(align_offset + template[i].psize > PAGE_SIZE))
goto out;
j++; j++;
memset(result, 0, 64); memset(result, 0, 64);
hash_buff = xbuf[0]; hash_buff = xbuf[0];
hash_buff += align_offset;
memcpy(hash_buff, template[i].plaintext, template[i].psize); memcpy(hash_buff, template[i].plaintext, template[i].psize);
sg_init_one(&sg[0], hash_buff, template[i].psize); sg_init_one(&sg[0], hash_buff, template[i].psize);
...@@ -281,6 +287,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ...@@ -281,6 +287,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
j = 0; j = 0;
for (i = 0; i < tcount; i++) { for (i = 0; i < tcount; i++) {
/* alignment tests are only done with continuous buffers */
if (align_offset != 0)
break;
if (template[i].np) { if (template[i].np) {
j++; j++;
memset(result, 0, 64); memset(result, 0, 64);
...@@ -358,9 +368,36 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ...@@ -358,9 +368,36 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
return ret; return ret;
} }
static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
unsigned int tcount, bool use_digest)
{
unsigned int alignmask;
int ret;
ret = __test_hash(tfm, template, tcount, use_digest, 0);
if (ret)
return ret;
/* test unaligned buffers, check with one byte offset */
ret = __test_hash(tfm, template, tcount, use_digest, 1);
if (ret)
return ret;
alignmask = crypto_tfm_alg_alignmask(&tfm->base);
if (alignmask) {
/* Check if alignment mask for tfm is correctly set. */
ret = __test_hash(tfm, template, tcount, use_digest,
alignmask + 1);
if (ret)
return ret;
}
return 0;
}
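The wrapper above now runs every vector three times: naturally aligned, at a one-byte offset, and at exactly alignmask + 1, so both a driver that cannot handle unaligned input and one that needs stricter alignment than it advertises get caught; the AEAD and skcipher wrappers below apply the same pattern. A minimal sketch of deriving the one-byte-misaligned pointer that the second pass effectively constructs (the helper name is invented):

#include <crypto/hash.h>
#include <linux/kernel.h>

/* Return a pointer into buf that is first aligned to (alignmask + 1) and
 * then bumped by one byte, i.e. deliberately violating the advertised
 * alignment by the smallest possible amount. */
static inline u8 *misalign_for_tfm(struct crypto_ahash *tfm, u8 *buf)
{
	unsigned int mask = crypto_ahash_alignmask(tfm);

	return PTR_ALIGN(buf, mask + 1) + 1;
}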
static int __test_aead(struct crypto_aead *tfm, int enc, static int __test_aead(struct crypto_aead *tfm, int enc,
struct aead_testvec *template, unsigned int tcount, struct aead_testvec *template, unsigned int tcount,
const bool diff_dst) const bool diff_dst, const int align_offset)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm));
unsigned int i, j, k, n, temp; unsigned int i, j, k, n, temp;
...@@ -423,15 +460,16 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ...@@ -423,15 +460,16 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
if (!template[i].np) { if (!template[i].np) {
j++; j++;
/* some tepmplates have no input data but they will /* some templates have no input data but they will
* touch input * touch input
*/ */
input = xbuf[0]; input = xbuf[0];
input += align_offset;
assoc = axbuf[0]; assoc = axbuf[0];
ret = -EINVAL; ret = -EINVAL;
if (WARN_ON(template[i].ilen > PAGE_SIZE || if (WARN_ON(align_offset + template[i].ilen >
template[i].alen > PAGE_SIZE)) PAGE_SIZE || template[i].alen > PAGE_SIZE))
goto out; goto out;
memcpy(input, template[i].input, template[i].ilen); memcpy(input, template[i].input, template[i].ilen);
...@@ -470,6 +508,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ...@@ -470,6 +508,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
if (diff_dst) { if (diff_dst) {
output = xoutbuf[0]; output = xoutbuf[0];
output += align_offset;
sg_init_one(&sgout[0], output, sg_init_one(&sgout[0], output,
template[i].ilen + template[i].ilen +
(enc ? authsize : 0)); (enc ? authsize : 0));
...@@ -530,6 +569,10 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ...@@ -530,6 +569,10 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
} }
for (i = 0, j = 0; i < tcount; i++) { for (i = 0, j = 0; i < tcount; i++) {
/* alignment tests are only done with continuous buffers */
if (align_offset != 0)
break;
if (template[i].np) { if (template[i].np) {
j++; j++;
...@@ -732,15 +775,34 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ...@@ -732,15 +775,34 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
static int test_aead(struct crypto_aead *tfm, int enc, static int test_aead(struct crypto_aead *tfm, int enc,
struct aead_testvec *template, unsigned int tcount) struct aead_testvec *template, unsigned int tcount)
{ {
unsigned int alignmask;
int ret; int ret;
/* test 'dst == src' case */ /* test 'dst == src' case */
ret = __test_aead(tfm, enc, template, tcount, false); ret = __test_aead(tfm, enc, template, tcount, false, 0);
if (ret) if (ret)
return ret; return ret;
/* test 'dst != src' case */ /* test 'dst != src' case */
return __test_aead(tfm, enc, template, tcount, true); ret = __test_aead(tfm, enc, template, tcount, true, 0);
if (ret)
return ret;
/* test unaligned buffers, check with one byte offset */
ret = __test_aead(tfm, enc, template, tcount, true, 1);
if (ret)
return ret;
alignmask = crypto_tfm_alg_alignmask(&tfm->base);
if (alignmask) {
/* Check if alignment mask for tfm is correctly set. */
ret = __test_aead(tfm, enc, template, tcount, true,
alignmask + 1);
if (ret)
return ret;
}
return 0;
} }
static int test_cipher(struct crypto_cipher *tfm, int enc, static int test_cipher(struct crypto_cipher *tfm, int enc,
...@@ -820,7 +882,7 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, ...@@ -820,7 +882,7 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
struct cipher_testvec *template, unsigned int tcount, struct cipher_testvec *template, unsigned int tcount,
const bool diff_dst) const bool diff_dst, const int align_offset)
{ {
const char *algo = const char *algo =
crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm)); crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm));
...@@ -876,10 +938,12 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ...@@ -876,10 +938,12 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
j++; j++;
ret = -EINVAL; ret = -EINVAL;
if (WARN_ON(template[i].ilen > PAGE_SIZE)) if (WARN_ON(align_offset + template[i].ilen >
PAGE_SIZE))
goto out; goto out;
data = xbuf[0]; data = xbuf[0];
data += align_offset;
memcpy(data, template[i].input, template[i].ilen); memcpy(data, template[i].input, template[i].ilen);
crypto_ablkcipher_clear_flags(tfm, ~0); crypto_ablkcipher_clear_flags(tfm, ~0);
...@@ -900,6 +964,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ...@@ -900,6 +964,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
sg_init_one(&sg[0], data, template[i].ilen); sg_init_one(&sg[0], data, template[i].ilen);
if (diff_dst) { if (diff_dst) {
data = xoutbuf[0]; data = xoutbuf[0];
data += align_offset;
sg_init_one(&sgout[0], data, template[i].ilen); sg_init_one(&sgout[0], data, template[i].ilen);
} }
...@@ -941,6 +1006,9 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ...@@ -941,6 +1006,9 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
j = 0; j = 0;
for (i = 0; i < tcount; i++) { for (i = 0; i < tcount; i++) {
/* alignment tests are only done with continuous buffers */
if (align_offset != 0)
break;
if (template[i].iv) if (template[i].iv)
memcpy(iv, template[i].iv, MAX_IVLEN); memcpy(iv, template[i].iv, MAX_IVLEN);
...@@ -1075,15 +1143,34 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ...@@ -1075,15 +1143,34 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
struct cipher_testvec *template, unsigned int tcount) struct cipher_testvec *template, unsigned int tcount)
{ {
unsigned int alignmask;
int ret; int ret;
/* test 'dst == src' case */ /* test 'dst == src' case */
ret = __test_skcipher(tfm, enc, template, tcount, false); ret = __test_skcipher(tfm, enc, template, tcount, false, 0);
if (ret) if (ret)
return ret; return ret;
/* test 'dst != src' case */ /* test 'dst != src' case */
return __test_skcipher(tfm, enc, template, tcount, true); ret = __test_skcipher(tfm, enc, template, tcount, true, 0);
if (ret)
return ret;
/* test unaligned buffers, check with one byte offset */
ret = __test_skcipher(tfm, enc, template, tcount, true, 1);
if (ret)
return ret;
alignmask = crypto_tfm_alg_alignmask(&tfm->base);
if (alignmask) {
/* Check if alignment mask for tfm is correctly set. */
ret = __test_skcipher(tfm, enc, template, tcount, true,
alignmask + 1);
if (ret)
return ret;
}
return 0;
} }
static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate,
...@@ -1653,16 +1740,10 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -1653,16 +1740,10 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "__cbc-twofish-avx", .alg = "__cbc-twofish-avx",
.test = alg_test_null, .test = alg_test_null,
}, {
.alg = "__cbc-twofish-avx2",
.test = alg_test_null,
}, { }, {
.alg = "__driver-cbc-aes-aesni", .alg = "__driver-cbc-aes-aesni",
.test = alg_test_null, .test = alg_test_null,
.fips_allowed = 1, .fips_allowed = 1,
}, {
.alg = "__driver-cbc-blowfish-avx2",
.test = alg_test_null,
}, { }, {
.alg = "__driver-cbc-camellia-aesni", .alg = "__driver-cbc-camellia-aesni",
.test = alg_test_null, .test = alg_test_null,
...@@ -1687,16 +1768,10 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -1687,16 +1768,10 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "__driver-cbc-twofish-avx", .alg = "__driver-cbc-twofish-avx",
.test = alg_test_null, .test = alg_test_null,
}, {
.alg = "__driver-cbc-twofish-avx2",
.test = alg_test_null,
}, { }, {
.alg = "__driver-ecb-aes-aesni", .alg = "__driver-ecb-aes-aesni",
.test = alg_test_null, .test = alg_test_null,
.fips_allowed = 1, .fips_allowed = 1,
}, {
.alg = "__driver-ecb-blowfish-avx2",
.test = alg_test_null,
}, { }, {
.alg = "__driver-ecb-camellia-aesni", .alg = "__driver-ecb-camellia-aesni",
.test = alg_test_null, .test = alg_test_null,
...@@ -1721,9 +1796,6 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -1721,9 +1796,6 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "__driver-ecb-twofish-avx", .alg = "__driver-ecb-twofish-avx",
.test = alg_test_null, .test = alg_test_null,
}, {
.alg = "__driver-ecb-twofish-avx2",
.test = alg_test_null,
}, { }, {
.alg = "__ghash-pclmulqdqni", .alg = "__ghash-pclmulqdqni",
.test = alg_test_null, .test = alg_test_null,
...@@ -1974,12 +2046,19 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -1974,12 +2046,19 @@ static const struct alg_test_desc alg_test_descs[] = {
} }
} }
}, { }, {
.alg = "cryptd(__driver-cbc-aes-aesni)", .alg = "crct10dif",
.test = alg_test_null, .test = alg_test_hash,
.fips_allowed = 1, .fips_allowed = 1,
.suite = {
.hash = {
.vecs = crct10dif_tv_template,
.count = CRCT10DIF_TEST_VECTORS
}
}
}, { }, {
.alg = "cryptd(__driver-cbc-blowfish-avx2)", .alg = "cryptd(__driver-cbc-aes-aesni)",
.test = alg_test_null, .test = alg_test_null,
.fips_allowed = 1,
}, { }, {
.alg = "cryptd(__driver-cbc-camellia-aesni)", .alg = "cryptd(__driver-cbc-camellia-aesni)",
.test = alg_test_null, .test = alg_test_null,
...@@ -1993,9 +2072,6 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -1993,9 +2072,6 @@ static const struct alg_test_desc alg_test_descs[] = {
.alg = "cryptd(__driver-ecb-aes-aesni)", .alg = "cryptd(__driver-ecb-aes-aesni)",
.test = alg_test_null, .test = alg_test_null,
.fips_allowed = 1, .fips_allowed = 1,
}, {
.alg = "cryptd(__driver-ecb-blowfish-avx2)",
.test = alg_test_null,
}, { }, {
.alg = "cryptd(__driver-ecb-camellia-aesni)", .alg = "cryptd(__driver-ecb-camellia-aesni)",
.test = alg_test_null, .test = alg_test_null,
...@@ -2020,9 +2096,6 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -2020,9 +2096,6 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "cryptd(__driver-ecb-twofish-avx)", .alg = "cryptd(__driver-ecb-twofish-avx)",
.test = alg_test_null, .test = alg_test_null,
}, {
.alg = "cryptd(__driver-ecb-twofish-avx2)",
.test = alg_test_null,
}, { }, {
.alg = "cryptd(__driver-gcm-aes-aesni)", .alg = "cryptd(__driver-gcm-aes-aesni)",
.test = alg_test_null, .test = alg_test_null,
...@@ -3068,6 +3141,35 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -3068,6 +3141,35 @@ static const struct alg_test_desc alg_test_descs[] = {
} }
}; };
static bool alg_test_descs_checked;
static void alg_test_descs_check_order(void)
{
int i;
/* only check once */
if (alg_test_descs_checked)
return;
alg_test_descs_checked = true;
for (i = 1; i < ARRAY_SIZE(alg_test_descs); i++) {
int diff = strcmp(alg_test_descs[i - 1].alg,
alg_test_descs[i].alg);
if (WARN_ON(diff > 0)) {
pr_warn("testmgr: alg_test_descs entries in wrong order: '%s' before '%s'\n",
alg_test_descs[i - 1].alg,
alg_test_descs[i].alg);
}
if (WARN_ON(diff == 0)) {
pr_warn("testmgr: duplicate alg_test_descs entry: '%s'\n",
alg_test_descs[i].alg);
}
}
}
static int alg_find_test(const char *alg) static int alg_find_test(const char *alg)
{ {
int start = 0; int start = 0;
...@@ -3099,6 +3201,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) ...@@ -3099,6 +3201,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
int j; int j;
int rc; int rc;
alg_test_descs_check_order();
if ((type & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) { if ((type & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) {
char nalg[CRYPTO_MAX_ALG_NAME]; char nalg[CRYPTO_MAX_ALG_NAME];
......
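The order check matters because alg_find_test() locates entries with a binary search over alg_test_descs[], so a mis-sorted or duplicated name can make a lookup land on the wrong descriptor or miss it entirely. A stripped-down model of that lookup (names invented):

#include <linux/string.h>

static int find_desc(const char * const names[], int count, const char *alg)
{
	int lo = 0, hi = count;

	while (lo < hi) {
		int mid = (lo + hi) / 2;
		int diff = strcmp(names[mid], alg);

		if (!diff)
			return mid;	/* found */
		if (diff < 0)
			lo = mid + 1;	/* search the upper half */
		else
			hi = mid;	/* search the lower half */
	}
	return -1;			/* not registered */
}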
...@@ -450,6 +450,39 @@ static struct hash_testvec rmd320_tv_template[] = { ...@@ -450,6 +450,39 @@ static struct hash_testvec rmd320_tv_template[] = {
} }
}; };
#define CRCT10DIF_TEST_VECTORS 3
static struct hash_testvec crct10dif_tv_template[] = {
{
.plaintext = "abc",
.psize = 3,
#ifdef __LITTLE_ENDIAN
.digest = "\x3b\x44",
#else
.digest = "\x44\x3b",
#endif
}, {
.plaintext = "1234567890123456789012345678901234567890"
"123456789012345678901234567890123456789",
.psize = 79,
#ifdef __LITTLE_ENDIAN
.digest = "\x70\x4b",
#else
.digest = "\x4b\x70",
#endif
}, {
.plaintext =
"abcddddddddddddddddddddddddddddddddddddddddddddddddddddd",
.psize = 56,
#ifdef __LITTLE_ENDIAN
.digest = "\xe3\x9c",
#else
.digest = "\x9c\xe3",
#endif
.np = 2,
.tap = { 28, 28 }
}
};
/* /*
* SHA1 test vectors from from FIPS PUB 180-1 * SHA1 test vectors from from FIPS PUB 180-1
* Long vector from CAVS 5.0 * Long vector from CAVS 5.0
......
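The two-byte digests in the new crct10dif vectors are the 16-bit CRC stored in CPU byte order, which is why the expected bytes flip between the endianness branches; for "abc" the CRC value implied by the vector is 0x443b. A hedged sketch relating the vector to the existing crc_t10dif() library helper (the function below is illustrative only):

#include <linux/crc-t10dif.h>
#include <linux/string.h>

static void crct10dif_vector_check(void)
{
	static const unsigned char msg[] = "abc";
	__u16 crc = crc_t10dif(msg, 3);		/* 0x443b per the vector above */
	unsigned char digest[2];

	memcpy(digest, &crc, sizeof(crc));	/* little-endian: 0x3b 0x44 */
	(void)digest;
}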
...@@ -108,8 +108,6 @@ static int atmel_trng_remove(struct platform_device *pdev) ...@@ -108,8 +108,6 @@ static int atmel_trng_remove(struct platform_device *pdev)
clk_disable(trng->clk); clk_disable(trng->clk);
clk_put(trng->clk); clk_put(trng->clk);
platform_set_drvdata(pdev, NULL);
return 0; return 0;
} }
......
...@@ -137,7 +137,6 @@ static int bcm63xx_rng_probe(struct platform_device *pdev) ...@@ -137,7 +137,6 @@ static int bcm63xx_rng_probe(struct platform_device *pdev)
out_clk_disable: out_clk_disable:
clk_disable(clk); clk_disable(clk);
out_free_rng: out_free_rng:
platform_set_drvdata(pdev, NULL);
kfree(rng); kfree(rng);
out_free_priv: out_free_priv:
kfree(priv); kfree(priv);
...@@ -154,7 +153,6 @@ static int bcm63xx_rng_remove(struct platform_device *pdev) ...@@ -154,7 +153,6 @@ static int bcm63xx_rng_remove(struct platform_device *pdev)
clk_disable(priv->clk); clk_disable(priv->clk);
kfree(priv); kfree(priv);
kfree(rng); kfree(rng);
platform_set_drvdata(pdev, NULL);
return 0; return 0;
} }
......
...@@ -700,7 +700,7 @@ static int n2rng_probe(struct platform_device *op) ...@@ -700,7 +700,7 @@ static int n2rng_probe(struct platform_device *op)
if (err) if (err)
goto out_free_units; goto out_free_units;
dev_set_drvdata(&op->dev, np); platform_set_drvdata(op, np);
schedule_delayed_work(&np->work, 0); schedule_delayed_work(&np->work, 0);
...@@ -721,7 +721,7 @@ static int n2rng_probe(struct platform_device *op) ...@@ -721,7 +721,7 @@ static int n2rng_probe(struct platform_device *op)
static int n2rng_remove(struct platform_device *op) static int n2rng_remove(struct platform_device *op)
{ {
struct n2rng *np = dev_get_drvdata(&op->dev); struct n2rng *np = platform_get_drvdata(op);
np->flags |= N2RNG_FLAG_SHUTDOWN; np->flags |= N2RNG_FLAG_SHUTDOWN;
...@@ -736,8 +736,6 @@ static int n2rng_remove(struct platform_device *op) ...@@ -736,8 +736,6 @@ static int n2rng_remove(struct platform_device *op)
kfree(np); kfree(np);
dev_set_drvdata(&op->dev, NULL);
return 0; return 0;
} }
......
...@@ -51,7 +51,7 @@ static int nmk_rng_probe(struct amba_device *dev, const struct amba_id *id) ...@@ -51,7 +51,7 @@ static int nmk_rng_probe(struct amba_device *dev, const struct amba_id *id)
return ret; return ret;
} }
clk_enable(rng_clk); clk_prepare_enable(rng_clk);
ret = amba_request_regions(dev, dev->dev.init_name); ret = amba_request_regions(dev, dev->dev.init_name);
if (ret) if (ret)
......
...@@ -96,7 +96,7 @@ static int octeon_rng_probe(struct platform_device *pdev) ...@@ -96,7 +96,7 @@ static int octeon_rng_probe(struct platform_device *pdev)
rng->ops = ops; rng->ops = ops;
dev_set_drvdata(&pdev->dev, &rng->ops); platform_set_drvdata(pdev, &rng->ops);
ret = hwrng_register(&rng->ops); ret = hwrng_register(&rng->ops);
if (ret) if (ret)
return -ENOENT; return -ENOENT;
...@@ -108,7 +108,7 @@ static int octeon_rng_probe(struct platform_device *pdev) ...@@ -108,7 +108,7 @@ static int octeon_rng_probe(struct platform_device *pdev)
static int __exit octeon_rng_remove(struct platform_device *pdev) static int __exit octeon_rng_remove(struct platform_device *pdev)
{ {
struct hwrng *rng = dev_get_drvdata(&pdev->dev); struct hwrng *rng = platform_get_drvdata(pdev);
hwrng_unregister(rng); hwrng_unregister(rng);
......
...@@ -116,7 +116,7 @@ static int omap_rng_probe(struct platform_device *pdev) ...@@ -116,7 +116,7 @@ static int omap_rng_probe(struct platform_device *pdev)
}; };
omap_rng_ops.priv = (unsigned long)priv; omap_rng_ops.priv = (unsigned long)priv;
dev_set_drvdata(&pdev->dev, priv); platform_set_drvdata(pdev, priv);
priv->mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); priv->mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
priv->base = devm_ioremap_resource(&pdev->dev, priv->mem_res); priv->base = devm_ioremap_resource(&pdev->dev, priv->mem_res);
...@@ -124,7 +124,7 @@ static int omap_rng_probe(struct platform_device *pdev) ...@@ -124,7 +124,7 @@ static int omap_rng_probe(struct platform_device *pdev)
ret = PTR_ERR(priv->base); ret = PTR_ERR(priv->base);
goto err_ioremap; goto err_ioremap;
} }
dev_set_drvdata(&pdev->dev, priv); platform_set_drvdata(pdev, priv);
pm_runtime_enable(&pdev->dev); pm_runtime_enable(&pdev->dev);
pm_runtime_get_sync(&pdev->dev); pm_runtime_get_sync(&pdev->dev);
...@@ -151,7 +151,7 @@ static int omap_rng_probe(struct platform_device *pdev) ...@@ -151,7 +151,7 @@ static int omap_rng_probe(struct platform_device *pdev)
static int __exit omap_rng_remove(struct platform_device *pdev) static int __exit omap_rng_remove(struct platform_device *pdev)
{ {
struct omap_rng_private_data *priv = dev_get_drvdata(&pdev->dev); struct omap_rng_private_data *priv = platform_get_drvdata(pdev);
hwrng_unregister(&omap_rng_ops); hwrng_unregister(&omap_rng_ops);
......
...@@ -192,7 +192,6 @@ static int timeriomem_rng_probe(struct platform_device *pdev) ...@@ -192,7 +192,6 @@ static int timeriomem_rng_probe(struct platform_device *pdev)
out_timer: out_timer:
del_timer_sync(&priv->timer); del_timer_sync(&priv->timer);
out_free: out_free:
platform_set_drvdata(pdev, NULL);
kfree(priv); kfree(priv);
return err; return err;
} }
...@@ -209,7 +208,6 @@ static int timeriomem_rng_remove(struct platform_device *pdev) ...@@ -209,7 +208,6 @@ static int timeriomem_rng_remove(struct platform_device *pdev)
del_timer_sync(&priv->timer); del_timer_sync(&priv->timer);
iounmap(priv->io_base); iounmap(priv->io_base);
release_mem_region(res->start, resource_size(res)); release_mem_region(res->start, resource_size(res));
platform_set_drvdata(pdev, NULL);
kfree(priv); kfree(priv);
return 0; return 0;
......
...@@ -154,7 +154,6 @@ static int __exit tx4939_rng_remove(struct platform_device *dev) ...@@ -154,7 +154,6 @@ static int __exit tx4939_rng_remove(struct platform_device *dev)
struct tx4939_rng *rngdev = platform_get_drvdata(dev); struct tx4939_rng *rngdev = platform_get_drvdata(dev);
hwrng_unregister(&rngdev->rng); hwrng_unregister(&rngdev->rng);
platform_set_drvdata(dev, NULL);
return 0; return 0;
} }
......
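The hwrng conversions above all follow two rules: use platform_{get,set}_drvdata() rather than open-coding dev_{get,set}_drvdata(&pdev->dev, ...), and drop the explicit NULL assignments, since the driver core now clears drvdata when a device is unbound. A generic sketch of the resulting probe/remove shape (struct and function names invented):

#include <linux/platform_device.h>
#include <linux/slab.h>

struct demo_priv {
	int dummy;
};

static int demo_probe(struct platform_device *pdev)
{
	struct demo_priv *priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);

	if (!priv)
		return -ENOMEM;

	/* equivalent to dev_set_drvdata(&pdev->dev, priv) */
	platform_set_drvdata(pdev, priv);
	return 0;
}

static int demo_remove(struct platform_device *pdev)
{
	struct demo_priv *priv = platform_get_drvdata(pdev);

	(void)priv;
	/* no platform_set_drvdata(pdev, NULL) needed on the way out */
	return 0;
}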
...@@ -278,7 +278,7 @@ config CRYPTO_DEV_PICOXCELL ...@@ -278,7 +278,7 @@ config CRYPTO_DEV_PICOXCELL
config CRYPTO_DEV_SAHARA config CRYPTO_DEV_SAHARA
tristate "Support for SAHARA crypto accelerator" tristate "Support for SAHARA crypto accelerator"
depends on ARCH_MXC && EXPERIMENTAL && OF depends on ARCH_MXC && OF
select CRYPTO_BLKCIPHER select CRYPTO_BLKCIPHER
select CRYPTO_AES select CRYPTO_AES
select CRYPTO_ECB select CRYPTO_ECB
...@@ -286,6 +286,16 @@ config CRYPTO_DEV_SAHARA ...@@ -286,6 +286,16 @@ config CRYPTO_DEV_SAHARA
This option enables support for the SAHARA HW crypto accelerator This option enables support for the SAHARA HW crypto accelerator
found in some Freescale i.MX chips. found in some Freescale i.MX chips.
config CRYPTO_DEV_DCP
tristate "Support for the DCP engine"
depends on ARCH_MXS && OF
select CRYPTO_BLKCIPHER
select CRYPTO_AES
select CRYPTO_CBC
help
This option enables support for the hardware crypto-acceleration
capabilities of the DCP co-processor.
config CRYPTO_DEV_S5P config CRYPTO_DEV_S5P
tristate "Support for Samsung S5PV210 crypto accelerator" tristate "Support for Samsung S5PV210 crypto accelerator"
depends on ARCH_S5PV210 depends on ARCH_S5PV210
......
...@@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o ...@@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
obj-$(CONFIG_CRYPTO_DEV_DCP) += dcp.o
obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o
obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/ obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
......
...@@ -202,6 +202,7 @@ static int caam_probe(struct platform_device *pdev) ...@@ -202,6 +202,7 @@ static int caam_probe(struct platform_device *pdev)
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
struct caam_perfmon *perfmon; struct caam_perfmon *perfmon;
#endif #endif
u64 cha_vid;
ctrlpriv = kzalloc(sizeof(struct caam_drv_private), GFP_KERNEL); ctrlpriv = kzalloc(sizeof(struct caam_drv_private), GFP_KERNEL);
if (!ctrlpriv) if (!ctrlpriv)
...@@ -293,11 +294,14 @@ static int caam_probe(struct platform_device *pdev) ...@@ -293,11 +294,14 @@ static int caam_probe(struct platform_device *pdev)
return -ENOMEM; return -ENOMEM;
} }
cha_vid = rd_reg64(&topregs->ctrl.perfmon.cha_id);
/* /*
* RNG4 based SECs (v5+) need special initialization prior * If SEC has RNG version >= 4 and RNG state handle has not been
* to executing any descriptors * already instantiated, do RNG instantiation
*/ */
if (of_device_is_compatible(nprop, "fsl,sec-v5.0")) { if ((cha_vid & CHA_ID_RNG_MASK) >> CHA_ID_RNG_SHIFT >= 4 &&
!(rd_reg32(&topregs->ctrl.r4tst[0].rdsta) & RDSTA_IF0)) {
kick_trng(pdev); kick_trng(pdev);
ret = instantiate_rng(ctrlpriv->jrdev[0]); ret = instantiate_rng(ctrlpriv->jrdev[0]);
if (ret) { if (ret) {
......
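The probe change above swaps a compatible-string test for a capability check: the RNG CHA version is read from the 64-bit CHA version ID register, and instantiation is skipped once RNG state handle 0 reports itself initialised. A condensed sketch built from the CHA_ID_* and RDSTA_IF0 definitions added to regs.h later in this update (the helper name is invented):

static bool rng4_needs_instantiation(u64 cha_vid, u32 rdsta)
{
	unsigned int rng_vid = (cha_vid & CHA_ID_RNG_MASK) >> CHA_ID_RNG_SHIFT;

	/* RNG4 hardware (version >= 4) whose interface 0 is not yet instantiated */
	return rng_vid >= 4 && !(rdsta & RDSTA_IF0);
}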
...@@ -231,7 +231,12 @@ struct sec4_sg_entry { ...@@ -231,7 +231,12 @@ struct sec4_sg_entry {
#define LDST_SRCDST_WORD_PKHA_B_SZ (0x11 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_B_SZ (0x11 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_PKHA_N_SZ (0x12 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_N_SZ (0x12 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_PKHA_E_SZ (0x13 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_E_SZ (0x13 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_CLASS_CTX (0x20 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_DESCBUF (0x40 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF (0x40 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_DESCBUF_JOB (0x41 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_DESCBUF_SHARED (0x42 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_DESCBUF_JOB_WE (0x45 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT)
#define LDST_SRCDST_WORD_INFO_FIFO (0x7a << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_INFO_FIFO (0x7a << LDST_SRCDST_SHIFT)
/* Offset in source/destination */ /* Offset in source/destination */
...@@ -366,6 +371,7 @@ struct sec4_sg_entry { ...@@ -366,6 +371,7 @@ struct sec4_sg_entry {
#define FIFOLD_TYPE_LAST2FLUSH1 (0x05 << FIFOLD_TYPE_SHIFT) #define FIFOLD_TYPE_LAST2FLUSH1 (0x05 << FIFOLD_TYPE_SHIFT)
#define FIFOLD_TYPE_LASTBOTH (0x06 << FIFOLD_TYPE_SHIFT) #define FIFOLD_TYPE_LASTBOTH (0x06 << FIFOLD_TYPE_SHIFT)
#define FIFOLD_TYPE_LASTBOTHFL (0x07 << FIFOLD_TYPE_SHIFT) #define FIFOLD_TYPE_LASTBOTHFL (0x07 << FIFOLD_TYPE_SHIFT)
#define FIFOLD_TYPE_NOINFOFIFO (0x0F << FIFOLD_TYPE_SHIFT)
#define FIFOLDST_LEN_MASK 0xffff #define FIFOLDST_LEN_MASK 0xffff
#define FIFOLDST_EXT_LEN_MASK 0xffffffff #define FIFOLDST_EXT_LEN_MASK 0xffffffff
...@@ -1294,10 +1300,10 @@ struct sec4_sg_entry { ...@@ -1294,10 +1300,10 @@ struct sec4_sg_entry {
#define SQOUT_SGF 0x01000000 #define SQOUT_SGF 0x01000000
/* Appends to a previous pointer */ /* Appends to a previous pointer */
#define SQOUT_PRE 0x00800000 #define SQOUT_PRE SQIN_PRE
/* Restore sequence with pointer/length */ /* Restore sequence with pointer/length */
#define SQOUT_RTO 0x00200000 #define SQOUT_RTO SQIN_RTO
/* Use extended length following pointer */ /* Use extended length following pointer */
#define SQOUT_EXT 0x00400000 #define SQOUT_EXT 0x00400000
...@@ -1359,6 +1365,7 @@ struct sec4_sg_entry { ...@@ -1359,6 +1365,7 @@ struct sec4_sg_entry {
#define MOVE_DEST_MATH3 (0x07 << MOVE_DEST_SHIFT) #define MOVE_DEST_MATH3 (0x07 << MOVE_DEST_SHIFT)
#define MOVE_DEST_CLASS1INFIFO (0x08 << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS1INFIFO (0x08 << MOVE_DEST_SHIFT)
#define MOVE_DEST_CLASS2INFIFO (0x09 << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS2INFIFO (0x09 << MOVE_DEST_SHIFT)
#define MOVE_DEST_INFIFO_NOINFO (0x0a << MOVE_DEST_SHIFT)
#define MOVE_DEST_PK_A (0x0c << MOVE_DEST_SHIFT) #define MOVE_DEST_PK_A (0x0c << MOVE_DEST_SHIFT)
#define MOVE_DEST_CLASS1KEY (0x0d << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS1KEY (0x0d << MOVE_DEST_SHIFT)
#define MOVE_DEST_CLASS2KEY (0x0e << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS2KEY (0x0e << MOVE_DEST_SHIFT)
...@@ -1411,6 +1418,7 @@ struct sec4_sg_entry { ...@@ -1411,6 +1418,7 @@ struct sec4_sg_entry {
#define MATH_SRC0_REG2 (0x02 << MATH_SRC0_SHIFT) #define MATH_SRC0_REG2 (0x02 << MATH_SRC0_SHIFT)
#define MATH_SRC0_REG3 (0x03 << MATH_SRC0_SHIFT) #define MATH_SRC0_REG3 (0x03 << MATH_SRC0_SHIFT)
#define MATH_SRC0_IMM (0x04 << MATH_SRC0_SHIFT) #define MATH_SRC0_IMM (0x04 << MATH_SRC0_SHIFT)
#define MATH_SRC0_DPOVRD (0x07 << MATH_SRC0_SHIFT)
#define MATH_SRC0_SEQINLEN (0x08 << MATH_SRC0_SHIFT) #define MATH_SRC0_SEQINLEN (0x08 << MATH_SRC0_SHIFT)
#define MATH_SRC0_SEQOUTLEN (0x09 << MATH_SRC0_SHIFT) #define MATH_SRC0_SEQOUTLEN (0x09 << MATH_SRC0_SHIFT)
#define MATH_SRC0_VARSEQINLEN (0x0a << MATH_SRC0_SHIFT) #define MATH_SRC0_VARSEQINLEN (0x0a << MATH_SRC0_SHIFT)
...@@ -1425,6 +1433,7 @@ struct sec4_sg_entry { ...@@ -1425,6 +1433,7 @@ struct sec4_sg_entry {
#define MATH_SRC1_REG2 (0x02 << MATH_SRC1_SHIFT) #define MATH_SRC1_REG2 (0x02 << MATH_SRC1_SHIFT)
#define MATH_SRC1_REG3 (0x03 << MATH_SRC1_SHIFT) #define MATH_SRC1_REG3 (0x03 << MATH_SRC1_SHIFT)
#define MATH_SRC1_IMM (0x04 << MATH_SRC1_SHIFT) #define MATH_SRC1_IMM (0x04 << MATH_SRC1_SHIFT)
#define MATH_SRC1_DPOVRD (0x07 << MATH_SRC0_SHIFT)
#define MATH_SRC1_INFIFO (0x0a << MATH_SRC1_SHIFT) #define MATH_SRC1_INFIFO (0x0a << MATH_SRC1_SHIFT)
#define MATH_SRC1_OUTFIFO (0x0b << MATH_SRC1_SHIFT) #define MATH_SRC1_OUTFIFO (0x0b << MATH_SRC1_SHIFT)
#define MATH_SRC1_ONE (0x0c << MATH_SRC1_SHIFT) #define MATH_SRC1_ONE (0x0c << MATH_SRC1_SHIFT)
...@@ -1600,4 +1609,13 @@ struct sec4_sg_entry { ...@@ -1600,4 +1609,13 @@ struct sec4_sg_entry {
#define NFIFOENTRY_PLEN_SHIFT 0 #define NFIFOENTRY_PLEN_SHIFT 0
#define NFIFOENTRY_PLEN_MASK (0xFF << NFIFOENTRY_PLEN_SHIFT) #define NFIFOENTRY_PLEN_MASK (0xFF << NFIFOENTRY_PLEN_SHIFT)
/* Append Load Immediate Command */
#define FD_CMD_APPEND_LOAD_IMMEDIATE 0x80000000
/* Set SEQ LIODN equal to the Non-SEQ LIODN for the job */
#define FD_CMD_SET_SEQ_LIODN_EQUAL_NONSEQ_LIODN 0x40000000
/* Frame Descriptor Command for Replacement Job Descriptor */
#define FD_CMD_REPLACE_JOB_DESC 0x20000000
#endif /* DESC_H */ #endif /* DESC_H */
...@@ -110,6 +110,26 @@ static inline void append_cmd(u32 *desc, u32 command) ...@@ -110,6 +110,26 @@ static inline void append_cmd(u32 *desc, u32 command)
(*desc)++; (*desc)++;
} }
#define append_u32 append_cmd
static inline void append_u64(u32 *desc, u64 data)
{
u32 *offset = desc_end(desc);
*offset = upper_32_bits(data);
*(++offset) = lower_32_bits(data);
(*desc) += 2;
}
/* Write command without affecting header, and return pointer to next word */
static inline u32 *write_cmd(u32 *desc, u32 command)
{
*desc = command;
return desc + 1;
}
static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len, static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len,
u32 command) u32 command)
{ {
...@@ -122,7 +142,8 @@ static inline void append_cmd_ptr_extlen(u32 *desc, dma_addr_t ptr, ...@@ -122,7 +142,8 @@ static inline void append_cmd_ptr_extlen(u32 *desc, dma_addr_t ptr,
unsigned int len, u32 command) unsigned int len, u32 command)
{ {
append_cmd(desc, command); append_cmd(desc, command);
append_ptr(desc, ptr); if (!(command & (SQIN_RTO | SQIN_PRE)))
append_ptr(desc, ptr);
append_cmd(desc, len); append_cmd(desc, len);
} }
...@@ -176,17 +197,36 @@ static inline void append_##cmd(u32 *desc, dma_addr_t ptr, unsigned int len, \ ...@@ -176,17 +197,36 @@ static inline void append_##cmd(u32 *desc, dma_addr_t ptr, unsigned int len, \
} }
APPEND_CMD_PTR(key, KEY) APPEND_CMD_PTR(key, KEY)
APPEND_CMD_PTR(load, LOAD) APPEND_CMD_PTR(load, LOAD)
APPEND_CMD_PTR(store, STORE)
APPEND_CMD_PTR(fifo_load, FIFO_LOAD) APPEND_CMD_PTR(fifo_load, FIFO_LOAD)
APPEND_CMD_PTR(fifo_store, FIFO_STORE) APPEND_CMD_PTR(fifo_store, FIFO_STORE)
static inline void append_store(u32 *desc, dma_addr_t ptr, unsigned int len,
u32 options)
{
u32 cmd_src;
cmd_src = options & LDST_SRCDST_MASK;
append_cmd(desc, CMD_STORE | options | len);
/* The following options do not require pointer */
if (!(cmd_src == LDST_SRCDST_WORD_DESCBUF_SHARED ||
cmd_src == LDST_SRCDST_WORD_DESCBUF_JOB ||
cmd_src == LDST_SRCDST_WORD_DESCBUF_JOB_WE ||
cmd_src == LDST_SRCDST_WORD_DESCBUF_SHARED_WE))
append_ptr(desc, ptr);
}
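Usage sketch for the open-coded append_store() above (descriptor context and lengths are illustrative): an ordinary store still emits the pointer word, while the DESCBUF variants carry only the command and length.

static void store_examples(u32 *desc, dma_addr_t ctx_dma)
{
	/* ordinary store: command word followed by a pointer word */
	append_store(desc, ctx_dma, 16, LDST_SRCDST_WORD_CLASS_CTX);

	/* store of the job descriptor buffer itself: no pointer word emitted */
	append_store(desc, 0, 4, LDST_SRCDST_WORD_DESCBUF_JOB);
}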
#define APPEND_SEQ_PTR_INTLEN(cmd, op) \ #define APPEND_SEQ_PTR_INTLEN(cmd, op) \
static inline void append_seq_##cmd##_ptr_intlen(u32 *desc, dma_addr_t ptr, \ static inline void append_seq_##cmd##_ptr_intlen(u32 *desc, dma_addr_t ptr, \
unsigned int len, \ unsigned int len, \
u32 options) \ u32 options) \
{ \ { \
PRINT_POS; \ PRINT_POS; \
append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \ if (options & (SQIN_RTO | SQIN_PRE)) \
append_cmd(desc, CMD_SEQ_##op##_PTR | len | options); \
else \
append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \
} }
APPEND_SEQ_PTR_INTLEN(in, IN) APPEND_SEQ_PTR_INTLEN(in, IN)
APPEND_SEQ_PTR_INTLEN(out, OUT) APPEND_SEQ_PTR_INTLEN(out, OUT)
...@@ -259,7 +299,7 @@ APPEND_CMD_RAW_IMM(load, LOAD, u32); ...@@ -259,7 +299,7 @@ APPEND_CMD_RAW_IMM(load, LOAD, u32);
*/ */
#define APPEND_MATH(op, desc, dest, src_0, src_1, len) \ #define APPEND_MATH(op, desc, dest, src_0, src_1, len) \
append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \
MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32) (len & MATH_LEN_MASK)); MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32)len);
#define append_math_add(desc, dest, src0, src1, len) \ #define append_math_add(desc, dest, src0, src1, len) \
APPEND_MATH(ADD, desc, dest, src0, src1, len) APPEND_MATH(ADD, desc, dest, src0, src1, len)
...@@ -279,6 +319,8 @@ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \ ...@@ -279,6 +319,8 @@ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \
APPEND_MATH(LSHIFT, desc, dest, src0, src1, len) APPEND_MATH(LSHIFT, desc, dest, src0, src1, len)
#define append_math_rshift(desc, dest, src0, src1, len) \ #define append_math_rshift(desc, dest, src0, src1, len) \
APPEND_MATH(RSHIFT, desc, dest, src0, src1, len) APPEND_MATH(RSHIFT, desc, dest, src0, src1, len)
#define append_math_ldshift(desc, dest, src0, src1, len) \
APPEND_MATH(SHLD, desc, dest, src0, src1, len)
/* Exactly one source is IMM. Data is passed in as u32 value */ /* Exactly one source is IMM. Data is passed in as u32 value */
#define APPEND_MATH_IMM_u32(op, desc, dest, src_0, src_1, data) \ #define APPEND_MATH_IMM_u32(op, desc, dest, src_0, src_1, data) \
...@@ -305,3 +347,34 @@ do { \ ...@@ -305,3 +347,34 @@ do { \
APPEND_MATH_IMM_u32(LSHIFT, desc, dest, src0, src1, data) APPEND_MATH_IMM_u32(LSHIFT, desc, dest, src0, src1, data)
#define append_math_rshift_imm_u32(desc, dest, src0, src1, data) \ #define append_math_rshift_imm_u32(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u32(RSHIFT, desc, dest, src0, src1, data) APPEND_MATH_IMM_u32(RSHIFT, desc, dest, src0, src1, data)
/* Exactly one source is IMM. Data is passed in as u64 value */
#define APPEND_MATH_IMM_u64(op, desc, dest, src_0, src_1, data) \
do { \
u32 upper = (data >> 16) >> 16; \
APPEND_MATH(op, desc, dest, src_0, src_1, CAAM_CMD_SZ * 2 | \
(upper ? 0 : MATH_IFB)); \
if (upper) \
append_u64(desc, data); \
else \
append_u32(desc, data); \
} while (0)
#define append_math_add_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(ADD, desc, dest, src0, src1, data)
#define append_math_sub_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(SUB, desc, dest, src0, src1, data)
#define append_math_add_c_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(ADDC, desc, dest, src0, src1, data)
#define append_math_sub_b_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(SUBB, desc, dest, src0, src1, data)
#define append_math_and_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(AND, desc, dest, src0, src1, data)
#define append_math_or_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(OR, desc, dest, src0, src1, data)
#define append_math_xor_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(XOR, desc, dest, src0, src1, data)
#define append_math_lshift_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(LSHIFT, desc, dest, src0, src1, data)
#define append_math_rshift_imm_u64(desc, dest, src0, src1, data) \
APPEND_MATH_IMM_u64(RSHIFT, desc, dest, src0, src1, data)
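Worked example for the 64-bit immediate helpers above (register choices and values are illustrative): an immediate whose upper 32 bits are zero is encoded as a single extra word with MATH_IFB set, while anything larger takes two words. The double shift "(data >> 16) >> 16" extracts the upper half without provoking shift-width warnings when a caller passes a 32-bit expression.

static void math_imm_examples(u32 *desc)
{
	/* fits in 32 bits: one immediate word, the macro sets MATH_IFB */
	append_math_add_imm_u64(desc, REG0, REG0, IMM, 0x1000ULL);

	/* upper 32 bits non-zero: two immediate words, no MATH_IFB */
	append_math_add_imm_u64(desc, REG0, REG0, IMM, 0x100000000ULL);
}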
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
#define PDBOPTS_ESP_IPHDRSRC 0x08 /* IP header comes from PDB (encap) */ #define PDBOPTS_ESP_IPHDRSRC 0x08 /* IP header comes from PDB (encap) */
#define PDBOPTS_ESP_INCIPHDR 0x04 /* Prepend IP header to output frame */ #define PDBOPTS_ESP_INCIPHDR 0x04 /* Prepend IP header to output frame */
#define PDBOPTS_ESP_IPVSN 0x02 /* process IPv6 header */ #define PDBOPTS_ESP_IPVSN 0x02 /* process IPv6 header */
#define PDBOPTS_ESP_AOFL 0x04 /* adjust out frame len (decap, SEC>=5.3)*/
#define PDBOPTS_ESP_TUNNEL 0x01 /* tunnel mode next-header byte */ #define PDBOPTS_ESP_TUNNEL 0x01 /* tunnel mode next-header byte */
#define PDBOPTS_ESP_IPV6 0x02 /* ip header version is V6 */ #define PDBOPTS_ESP_IPV6 0x02 /* ip header version is V6 */
#define PDBOPTS_ESP_DIFFSERV 0x40 /* copy TOS/TC from inner iphdr */ #define PDBOPTS_ESP_DIFFSERV 0x40 /* copy TOS/TC from inner iphdr */
......
...@@ -117,6 +117,43 @@ struct jr_outentry { ...@@ -117,6 +117,43 @@ struct jr_outentry {
#define CHA_NUM_DECONUM_SHIFT 56 #define CHA_NUM_DECONUM_SHIFT 56
#define CHA_NUM_DECONUM_MASK (0xfull << CHA_NUM_DECONUM_SHIFT) #define CHA_NUM_DECONUM_MASK (0xfull << CHA_NUM_DECONUM_SHIFT)
/* CHA Version IDs */
#define CHA_ID_AES_SHIFT 0
#define CHA_ID_AES_MASK (0xfull << CHA_ID_AES_SHIFT)
#define CHA_ID_DES_SHIFT 4
#define CHA_ID_DES_MASK (0xfull << CHA_ID_DES_SHIFT)
#define CHA_ID_ARC4_SHIFT 8
#define CHA_ID_ARC4_MASK (0xfull << CHA_ID_ARC4_SHIFT)
#define CHA_ID_MD_SHIFT 12
#define CHA_ID_MD_MASK (0xfull << CHA_ID_MD_SHIFT)
#define CHA_ID_RNG_SHIFT 16
#define CHA_ID_RNG_MASK (0xfull << CHA_ID_RNG_SHIFT)
#define CHA_ID_SNW8_SHIFT 20
#define CHA_ID_SNW8_MASK (0xfull << CHA_ID_SNW8_SHIFT)
#define CHA_ID_KAS_SHIFT 24
#define CHA_ID_KAS_MASK (0xfull << CHA_ID_KAS_SHIFT)
#define CHA_ID_PK_SHIFT 28
#define CHA_ID_PK_MASK (0xfull << CHA_ID_PK_SHIFT)
#define CHA_ID_CRC_SHIFT 32
#define CHA_ID_CRC_MASK (0xfull << CHA_ID_CRC_SHIFT)
#define CHA_ID_SNW9_SHIFT 36
#define CHA_ID_SNW9_MASK (0xfull << CHA_ID_SNW9_SHIFT)
#define CHA_ID_DECO_SHIFT 56
#define CHA_ID_DECO_MASK (0xfull << CHA_ID_DECO_SHIFT)
#define CHA_ID_JR_SHIFT 60
#define CHA_ID_JR_MASK (0xfull << CHA_ID_JR_SHIFT)
struct sec_vid { struct sec_vid {
u16 ip_id; u16 ip_id;
u8 maj_rev; u8 maj_rev;
...@@ -228,7 +265,10 @@ struct rng4tst { ...@@ -228,7 +265,10 @@ struct rng4tst {
u32 rtfrqmax; /* PRGM=1: freq. count max. limit register */ u32 rtfrqmax; /* PRGM=1: freq. count max. limit register */
u32 rtfrqcnt; /* PRGM=0: freq. count register */ u32 rtfrqcnt; /* PRGM=0: freq. count register */
}; };
u32 rsvd1[56]; u32 rsvd1[40];
#define RDSTA_IF0 0x00000001
u32 rdsta;
u32 rsvd2[15];
}; };
/* /*
......
/*
* Cryptographic API.
*
* Support for DCP cryptographic accelerator.
*
* Copyright (c) 2013
* Author: Tobias Rauter <tobias.rauter@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Based on tegra-aes.c, dcp.c (from freescale SDK) and sahara.c
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/dma-mapping.h>
#include <linux/io.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/crypto.h>
#include <linux/miscdevice.h>
#include <crypto/scatterwalk.h>
#include <crypto/aes.h>
/* IOCTL for DCP OTP Key AES - taken from Freescale's SDK */
#define DBS_IOCTL_BASE 'd'
#define DBS_ENC _IOW(DBS_IOCTL_BASE, 0x00, uint8_t[16])
#define DBS_DEC _IOW(DBS_IOCTL_BASE, 0x01, uint8_t[16])
/* DCP channel used for AES */
#define USED_CHANNEL 1
/* Ring Buffers' maximum size */
#define DCP_MAX_PKG 20
/* Control Register */
#define DCP_REG_CTRL 0x000
#define DCP_CTRL_SFRST (1<<31)
#define DCP_CTRL_CLKGATE (1<<30)
#define DCP_CTRL_CRYPTO_PRESENT (1<<29)
#define DCP_CTRL_SHA_PRESENT (1<<28)
#define DCP_CTRL_GATHER_RES_WRITE (1<<23)
#define DCP_CTRL_ENABLE_CONTEXT_CACHE (1<<22)
#define DCP_CTRL_ENABLE_CONTEXT_SWITCH (1<<21)
#define DCP_CTRL_CH_IRQ_E_0 0x01
#define DCP_CTRL_CH_IRQ_E_1 0x02
#define DCP_CTRL_CH_IRQ_E_2 0x04
#define DCP_CTRL_CH_IRQ_E_3 0x08
/* Status register */
#define DCP_REG_STAT 0x010
#define DCP_STAT_OTP_KEY_READY (1<<28)
#define DCP_STAT_CUR_CHANNEL(stat) ((stat>>24)&0x0F)
#define DCP_STAT_READY_CHANNEL(stat) ((stat>>16)&0x0F)
#define DCP_STAT_IRQ(stat) (stat&0x0F)
#define DCP_STAT_CHAN_0 (0x01)
#define DCP_STAT_CHAN_1 (0x02)
#define DCP_STAT_CHAN_2 (0x04)
#define DCP_STAT_CHAN_3 (0x08)
/* Channel Control Register */
#define DCP_REG_CHAN_CTRL 0x020
#define DCP_CHAN_CTRL_CH0_IRQ_MERGED (1<<16)
#define DCP_CHAN_CTRL_HIGH_PRIO_0 (0x0100)
#define DCP_CHAN_CTRL_HIGH_PRIO_1 (0x0200)
#define DCP_CHAN_CTRL_HIGH_PRIO_2 (0x0400)
#define DCP_CHAN_CTRL_HIGH_PRIO_3 (0x0800)
#define DCP_CHAN_CTRL_ENABLE_0 (0x01)
#define DCP_CHAN_CTRL_ENABLE_1 (0x02)
#define DCP_CHAN_CTRL_ENABLE_2 (0x04)
#define DCP_CHAN_CTRL_ENABLE_3 (0x08)
/*
* Channel Registers:
* The DCP has 4 channels. Each of these channels
* has 4 registers (command pointer, semaphore, status and options).
* The address of register REG of channel CHAN is obtained by
* dcp_chan_reg(REG, CHAN)
*/
#define DCP_REG_CHAN_PTR 0x00000100
#define DCP_REG_CHAN_SEMA 0x00000110
#define DCP_REG_CHAN_STAT 0x00000120
#define DCP_REG_CHAN_OPT 0x00000130
#define DCP_CHAN_STAT_NEXT_CHAIN_IS_0 0x010000
#define DCP_CHAN_STAT_NO_CHAIN 0x020000
#define DCP_CHAN_STAT_CONTEXT_ERROR 0x030000
#define DCP_CHAN_STAT_PAYLOAD_ERROR 0x040000
#define DCP_CHAN_STAT_INVALID_MODE 0x050000
#define DCP_CHAN_STAT_PAGEFAULT 0x40
#define DCP_CHAN_STAT_DST 0x20
#define DCP_CHAN_STAT_SRC 0x10
#define DCP_CHAN_STAT_PACKET 0x08
#define DCP_CHAN_STAT_SETUP 0x04
#define DCP_CHAN_STAT_MISMATCH 0x02
/* hw packet control*/
#define DCP_PKT_PAYLOAD_KEY (1<<11)
#define DCP_PKT_OTP_KEY (1<<10)
#define DCP_PKT_CIPHER_INIT (1<<9)
#define DCP_PKG_CIPHER_ENCRYPT (1<<8)
#define DCP_PKT_CIPHER_ENABLE (1<<5)
#define DCP_PKT_DECR_SEM (1<<1)
#define DCP_PKT_CHAIN (1<<2)
#define DCP_PKT_IRQ 1
#define DCP_PKT_MODE_CBC (1<<4)
#define DCP_PKT_KEYSELECT_OTP (0xFF<<8)
/* cipher flags */
#define DCP_ENC 0x0001
#define DCP_DEC 0x0002
#define DCP_ECB 0x0004
#define DCP_CBC 0x0008
#define DCP_CBC_INIT 0x0010
#define DCP_NEW_KEY 0x0040
#define DCP_OTP_KEY 0x0080
#define DCP_AES 0x1000
/* DCP Flags */
#define DCP_FLAG_BUSY 0x01
#define DCP_FLAG_PRODUCING 0x02
/* clock defines */
#define CLOCK_ON 1
#define CLOCK_OFF 0
struct dcp_dev_req_ctx {
int mode;
};
struct dcp_op {
unsigned int flags;
u8 key[AES_KEYSIZE_128];
int keylen;
struct ablkcipher_request *req;
struct crypto_ablkcipher *fallback;
uint32_t stat;
uint32_t pkt1;
uint32_t pkt2;
struct ablkcipher_walk walk;
};
struct dcp_dev {
struct device *dev;
void __iomem *dcp_regs_base;
int dcp_vmi_irq;
int dcp_irq;
spinlock_t queue_lock;
struct crypto_queue queue;
uint32_t pkt_produced;
uint32_t pkt_consumed;
struct dcp_hw_packet *hw_pkg[DCP_MAX_PKG];
dma_addr_t hw_phys_pkg;
/* [KEY][IV] Both with 16 Bytes */
u8 *payload_base;
dma_addr_t payload_base_dma;
struct tasklet_struct done_task;
struct tasklet_struct queue_task;
struct timer_list watchdog;
unsigned long flags;
struct dcp_op *ctx;
struct miscdevice dcp_bootstream_misc;
};
struct dcp_hw_packet {
uint32_t next;
uint32_t pkt1;
uint32_t pkt2;
uint32_t src;
uint32_t dst;
uint32_t size;
uint32_t payload;
uint32_t stat;
};
static struct dcp_dev *global_dev;
static inline u32 dcp_chan_reg(u32 reg, int chan)
{
return reg + (chan) * 0x40;
}
static inline void dcp_write(struct dcp_dev *dev, u32 data, u32 reg)
{
writel(data, dev->dcp_regs_base + reg);
}
static inline void dcp_set(struct dcp_dev *dev, u32 data, u32 reg)
{
writel(data, dev->dcp_regs_base + (reg | 0x04));
}
static inline void dcp_clear(struct dcp_dev *dev, u32 data, u32 reg)
{
writel(data, dev->dcp_regs_base + (reg | 0x08));
}
static inline void dcp_toggle(struct dcp_dev *dev, u32 data, u32 reg)
{
writel(data, dev->dcp_regs_base + (reg | 0x0C));
}
static inline unsigned int dcp_read(struct dcp_dev *dev, u32 reg)
{
return readl(dev->dcp_regs_base + reg);
}
static void dcp_dma_unmap(struct dcp_dev *dev, struct dcp_hw_packet *pkt)
{
dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE);
dma_unmap_page(dev->dev, pkt->dst, pkt->size, DMA_FROM_DEVICE);
dev_dbg(dev->dev, "unmap packet %x", (unsigned int) pkt);
}
static int dcp_dma_map(struct dcp_dev *dev,
struct ablkcipher_walk *walk, struct dcp_hw_packet *pkt)
{
dev_dbg(dev->dev, "map packet %x", (unsigned int) pkt);
/* align to length = 16 */
pkt->size = walk->nbytes - (walk->nbytes % 16);
pkt->src = dma_map_page(dev->dev, walk->src.page, walk->src.offset,
pkt->size, DMA_TO_DEVICE);
if (pkt->src == 0) {
dev_err(dev->dev, "Unable to map src");
return -ENOMEM;
}
pkt->dst = dma_map_page(dev->dev, walk->dst.page, walk->dst.offset,
pkt->size, DMA_FROM_DEVICE);
if (pkt->dst == 0) {
dev_err(dev->dev, "Unable to map dst");
dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE);
return -ENOMEM;
}
return 0;
}
static void dcp_op_one(struct dcp_dev *dev, struct dcp_hw_packet *pkt,
uint8_t last)
{
struct dcp_op *ctx = dev->ctx;
pkt->pkt1 = ctx->pkt1;
pkt->pkt2 = ctx->pkt2;
pkt->payload = (u32) dev->payload_base_dma;
pkt->stat = 0;
if (ctx->flags & DCP_CBC_INIT) {
pkt->pkt1 |= DCP_PKT_CIPHER_INIT;
ctx->flags &= ~DCP_CBC_INIT;
}
mod_timer(&dev->watchdog, jiffies + msecs_to_jiffies(500));
pkt->pkt1 |= DCP_PKT_IRQ;
if (!last)
pkt->pkt1 |= DCP_PKT_CHAIN;
dev->pkt_produced++;
dcp_write(dev, 1,
dcp_chan_reg(DCP_REG_CHAN_SEMA, USED_CHANNEL));
}
static void dcp_op_proceed(struct dcp_dev *dev)
{
struct dcp_op *ctx = dev->ctx;
struct dcp_hw_packet *pkt;
while (ctx->walk.nbytes) {
int err = 0;
pkt = dev->hw_pkg[dev->pkt_produced % DCP_MAX_PKG];
err = dcp_dma_map(dev, &ctx->walk, pkt);
if (err) {
dev->ctx->stat |= err;
/* start timer to wait for already set up calls */
mod_timer(&dev->watchdog,
jiffies + msecs_to_jiffies(500));
break;
}
err = ctx->walk.nbytes - pkt->size;
ablkcipher_walk_done(dev->ctx->req, &dev->ctx->walk, err);
dcp_op_one(dev, pkt, ctx->walk.nbytes == 0);
/* we have to wait if no space is left in buffer */
if (dev->pkt_produced - dev->pkt_consumed == DCP_MAX_PKG)
break;
}
clear_bit(DCP_FLAG_PRODUCING, &dev->flags);
}
static void dcp_op_start(struct dcp_dev *dev, uint8_t use_walk)
{
struct dcp_op *ctx = dev->ctx;
if (ctx->flags & DCP_NEW_KEY) {
memcpy(dev->payload_base, ctx->key, ctx->keylen);
ctx->flags &= ~DCP_NEW_KEY;
}
ctx->pkt1 = 0;
ctx->pkt1 |= DCP_PKT_CIPHER_ENABLE;
ctx->pkt1 |= DCP_PKT_DECR_SEM;
if (ctx->flags & DCP_OTP_KEY)
ctx->pkt1 |= DCP_PKT_OTP_KEY;
else
ctx->pkt1 |= DCP_PKT_PAYLOAD_KEY;
if (ctx->flags & DCP_ENC)
ctx->pkt1 |= DCP_PKG_CIPHER_ENCRYPT;
ctx->pkt2 = 0;
if (ctx->flags & DCP_CBC)
ctx->pkt2 |= DCP_PKT_MODE_CBC;
dev->pkt_produced = 0;
dev->pkt_consumed = 0;
ctx->stat = 0;
dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
dcp_write(dev, (u32) dev->hw_phys_pkg,
dcp_chan_reg(DCP_REG_CHAN_PTR, USED_CHANNEL));
set_bit(DCP_FLAG_PRODUCING, &dev->flags);
if (use_walk) {
ablkcipher_walk_init(&ctx->walk, ctx->req->dst,
ctx->req->src, ctx->req->nbytes);
ablkcipher_walk_phys(ctx->req, &ctx->walk);
dcp_op_proceed(dev);
} else {
dcp_op_one(dev, dev->hw_pkg[0], 1);
clear_bit(DCP_FLAG_PRODUCING, &dev->flags);
}
}
static void dcp_done_task(unsigned long data)
{
struct dcp_dev *dev = (struct dcp_dev *)data;
struct dcp_hw_packet *last_packet;
int fin;
fin = 0;
for (last_packet = dev->hw_pkg[(dev->pkt_consumed) % DCP_MAX_PKG];
last_packet->stat == 1;
last_packet =
dev->hw_pkg[++(dev->pkt_consumed) % DCP_MAX_PKG]) {
dcp_dma_unmap(dev, last_packet);
last_packet->stat = 0;
fin++;
}
/* the last call of this function already consumed this IRQ's packet */
if (fin == 0)
return;
dev_dbg(dev->dev,
"Packet(s) done with status %x; finished: %d, produced:%d, complete consumed: %d",
dev->ctx->stat, fin, dev->pkt_produced, dev->pkt_consumed);
last_packet = dev->hw_pkg[(dev->pkt_consumed - 1) % DCP_MAX_PKG];
if (!dev->ctx->stat && last_packet->pkt1 & DCP_PKT_CHAIN) {
if (!test_and_set_bit(DCP_FLAG_PRODUCING, &dev->flags))
dcp_op_proceed(dev);
return;
}
while (unlikely(dev->pkt_consumed < dev->pkt_produced)) {
dcp_dma_unmap(dev,
dev->hw_pkg[dev->pkt_consumed++ % DCP_MAX_PKG]);
}
if (dev->ctx->flags & DCP_OTP_KEY) {
/* we used the miscdevice, no walk to finish */
clear_bit(DCP_FLAG_BUSY, &dev->flags);
return;
}
ablkcipher_walk_complete(&dev->ctx->walk);
dev->ctx->req->base.complete(&dev->ctx->req->base,
dev->ctx->stat);
dev->ctx->req = NULL;
/* in case there are other requests in the queue */
tasklet_schedule(&dev->queue_task);
}
static void dcp_watchdog(unsigned long data)
{
struct dcp_dev *dev = (struct dcp_dev *)data;
dev->ctx->stat |= dcp_read(dev,
dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
dev_err(dev->dev, "Timeout, Channel status: %x", dev->ctx->stat);
if (!dev->ctx->stat)
dev->ctx->stat = -ETIMEDOUT;
dcp_done_task(data);
}
static irqreturn_t dcp_common_irq(int irq, void *context)
{
u32 msk;
struct dcp_dev *dev = (struct dcp_dev *) context;
del_timer(&dev->watchdog);
msk = DCP_STAT_IRQ(dcp_read(dev, DCP_REG_STAT));
dcp_clear(dev, msk, DCP_REG_STAT);
if (msk == 0)
return IRQ_NONE;
dev->ctx->stat |= dcp_read(dev,
dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
if (msk & DCP_STAT_CHAN_1)
tasklet_schedule(&dev->done_task);
return IRQ_HANDLED;
}
static irqreturn_t dcp_vmi_irq(int irq, void *context)
{
return dcp_common_irq(irq, context);
}
static irqreturn_t dcp_irq(int irq, void *context)
{
return dcp_common_irq(irq, context);
}
static void dcp_crypt(struct dcp_dev *dev, struct dcp_op *ctx)
{
dev->ctx = ctx;
if ((ctx->flags & DCP_CBC) && ctx->req->info) {
ctx->flags |= DCP_CBC_INIT;
memcpy(dev->payload_base + AES_KEYSIZE_128,
ctx->req->info, AES_KEYSIZE_128);
}
dcp_op_start(dev, 1);
}
static void dcp_queue_task(unsigned long data)
{
struct dcp_dev *dev = (struct dcp_dev *) data;
struct crypto_async_request *async_req, *backlog;
struct crypto_ablkcipher *tfm;
struct dcp_op *ctx;
struct dcp_dev_req_ctx *rctx;
struct ablkcipher_request *req;
unsigned long flags;
spin_lock_irqsave(&dev->queue_lock, flags);
backlog = crypto_get_backlog(&dev->queue);
async_req = crypto_dequeue_request(&dev->queue);
spin_unlock_irqrestore(&dev->queue_lock, flags);
if (!async_req)
goto ret_nothing_done;
if (backlog)
backlog->complete(backlog, -EINPROGRESS);
req = ablkcipher_request_cast(async_req);
tfm = crypto_ablkcipher_reqtfm(req);
rctx = ablkcipher_request_ctx(req);
ctx = crypto_ablkcipher_ctx(tfm);
if (!req->src || !req->dst)
goto ret_nothing_done;
ctx->flags |= rctx->mode;
ctx->req = req;
dcp_crypt(dev, ctx);
return;
ret_nothing_done:
clear_bit(DCP_FLAG_BUSY, &dev->flags);
}
static int dcp_cra_init(struct crypto_tfm *tfm)
{
const char *name = tfm->__crt_alg->cra_name;
struct dcp_op *ctx = crypto_tfm_ctx(tfm);
tfm->crt_ablkcipher.reqsize = sizeof(struct dcp_dev_req_ctx);
ctx->fallback = crypto_alloc_ablkcipher(name, 0,
CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(ctx->fallback)) {
dev_err(global_dev->dev, "Error allocating fallback algo %s\n",
name);
return PTR_ERR(ctx->fallback);
}
return 0;
}
static void dcp_cra_exit(struct crypto_tfm *tfm)
{
struct dcp_op *ctx = crypto_tfm_ctx(tfm);
if (ctx->fallback)
crypto_free_ablkcipher(ctx->fallback);
ctx->fallback = NULL;
}
/* async interface */
static int dcp_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int len)
{
struct dcp_op *ctx = crypto_ablkcipher_ctx(tfm);
unsigned int ret = 0;
ctx->keylen = len;
ctx->flags = 0;
if (len == AES_KEYSIZE_128) {
if (memcmp(ctx->key, key, AES_KEYSIZE_128)) {
memcpy(ctx->key, key, len);
ctx->flags |= DCP_NEW_KEY;
}
return 0;
}
ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
ctx->fallback->base.crt_flags |=
(tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK);
ret = crypto_ablkcipher_setkey(ctx->fallback, key, len);
if (ret) {
struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm);
tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK;
tfm_aux->crt_flags |=
(ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK);
}
return ret;
}
static int dcp_aes_cbc_crypt(struct ablkcipher_request *req, int mode)
{
struct dcp_dev_req_ctx *rctx = ablkcipher_request_ctx(req);
struct dcp_dev *dev = global_dev;
unsigned long flags;
int err = 0;
if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE))
return -EINVAL;
rctx->mode = mode;
spin_lock_irqsave(&dev->queue_lock, flags);
err = ablkcipher_enqueue_request(&dev->queue, req);
spin_unlock_irqrestore(&dev->queue_lock, flags);
flags = test_and_set_bit(DCP_FLAG_BUSY, &dev->flags);
if (!(flags & DCP_FLAG_BUSY))
tasklet_schedule(&dev->queue_task);
return err;
}
static int dcp_aes_cbc_encrypt(struct ablkcipher_request *req)
{
struct crypto_tfm *tfm =
crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
struct dcp_op *ctx = crypto_ablkcipher_ctx(
crypto_ablkcipher_reqtfm(req));
if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
int err = 0;
ablkcipher_request_set_tfm(req, ctx->fallback);
err = crypto_ablkcipher_encrypt(req);
ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
return err;
}
return dcp_aes_cbc_crypt(req, DCP_AES | DCP_ENC | DCP_CBC);
}
static int dcp_aes_cbc_decrypt(struct ablkcipher_request *req)
{
struct crypto_tfm *tfm =
crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
struct dcp_op *ctx = crypto_ablkcipher_ctx(
crypto_ablkcipher_reqtfm(req));
if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
int err = 0;
ablkcipher_request_set_tfm(req, ctx->fallback);
err = crypto_ablkcipher_decrypt(req);
ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
return err;
}
return dcp_aes_cbc_crypt(req, DCP_AES | DCP_DEC | DCP_CBC);
}
static struct crypto_alg algs[] = {
{
.cra_name = "cbc(aes)",
.cra_driver_name = "dcp-cbc-aes",
.cra_alignmask = 3,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC |
CRYPTO_ALG_NEED_FALLBACK,
.cra_blocksize = AES_KEYSIZE_128,
.cra_type = &crypto_ablkcipher_type,
.cra_priority = 300,
.cra_u.ablkcipher = {
.min_keysize = AES_KEYSIZE_128,
.max_keysize = AES_KEYSIZE_128,
.setkey = dcp_aes_setkey,
.encrypt = dcp_aes_cbc_encrypt,
.decrypt = dcp_aes_cbc_decrypt,
.ivsize = AES_KEYSIZE_128,
}
},
};
/* DCP bootstream verification interface: uses OTP key for crypto */
static int dcp_bootstream_open(struct inode *inode, struct file *file)
{
file->private_data = container_of((file->private_data),
struct dcp_dev, dcp_bootstream_misc);
return 0;
}
static long dcp_bootstream_ioctl(struct file *file,
unsigned int cmd, unsigned long arg)
{
struct dcp_dev *dev = (struct dcp_dev *) file->private_data;
void __user *argp = (void __user *)arg;
int ret;
if (dev == NULL)
return -EBADF;
if (cmd != DBS_ENC && cmd != DBS_DEC)
return -EINVAL;
if (copy_from_user(dev->payload_base, argp, 16))
return -EFAULT;
if (test_and_set_bit(DCP_FLAG_BUSY, &dev->flags))
return -EAGAIN;
dev->ctx = kzalloc(sizeof(struct dcp_op), GFP_KERNEL);
if (!dev->ctx) {
dev_err(dev->dev,
"cannot allocate context for OTP crypto");
clear_bit(DCP_FLAG_BUSY, &dev->flags);
return -ENOMEM;
}
dev->ctx->flags = DCP_AES | DCP_ECB | DCP_OTP_KEY | DCP_CBC_INIT;
dev->ctx->flags |= (cmd == DBS_ENC) ? DCP_ENC : DCP_DEC;
dev->hw_pkg[0]->src = dev->payload_base_dma;
dev->hw_pkg[0]->dst = dev->payload_base_dma;
dev->hw_pkg[0]->size = 16;
dcp_op_start(dev, 0);
while (test_bit(DCP_FLAG_BUSY, &dev->flags))
cpu_relax();
ret = dev->ctx->stat;
if (!ret && copy_to_user(argp, dev->payload_base, 16))
ret = -EFAULT;
kfree(dev->ctx);
return ret;
}
static const struct file_operations dcp_bootstream_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = dcp_bootstream_ioctl,
.open = dcp_bootstream_open,
};
static int dcp_probe(struct platform_device *pdev)
{
struct dcp_dev *dev = NULL;
struct resource *r;
int i, ret, j;
dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);
if (!dev)
return -ENOMEM;
global_dev = dev;
dev->dev = &pdev->dev;
platform_set_drvdata(pdev, dev);
r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!r) {
dev_err(&pdev->dev, "failed to get IORESOURCE_MEM\n");
return -ENXIO;
}
dev->dcp_regs_base = devm_ioremap(&pdev->dev, r->start,
resource_size(r));
dcp_set(dev, DCP_CTRL_SFRST, DCP_REG_CTRL);
udelay(10);
dcp_clear(dev, DCP_CTRL_SFRST | DCP_CTRL_CLKGATE, DCP_REG_CTRL);
dcp_write(dev, DCP_CTRL_GATHER_RES_WRITE |
DCP_CTRL_ENABLE_CONTEXT_CACHE | DCP_CTRL_CH_IRQ_E_1,
DCP_REG_CTRL);
dcp_write(dev, DCP_CHAN_CTRL_ENABLE_1, DCP_REG_CHAN_CTRL);
for (i = 0; i < 4; i++)
dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, i));
dcp_clear(dev, -1, DCP_REG_STAT);
r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
if (!r) {
dev_err(&pdev->dev, "can't get IRQ resource (0)\n");
return -EIO;
}
dev->dcp_vmi_irq = r->start;
ret = request_irq(dev->dcp_vmi_irq, dcp_vmi_irq, 0, "dcp", dev);
if (ret != 0) {
dev_err(&pdev->dev, "can't request_irq (0)\n");
return -EIO;
}
r = platform_get_resource(pdev, IORESOURCE_IRQ, 1);
if (!r) {
dev_err(&pdev->dev, "can't get IRQ resource (1)\n");
ret = -EIO;
goto err_free_irq0;
}
dev->dcp_irq = r->start;
ret = request_irq(dev->dcp_irq, dcp_irq, 0, "dcp", dev);
if (ret != 0) {
dev_err(&pdev->dev, "can't request_irq (1)\n");
ret = -EIO;
goto err_free_irq0;
}
dev->hw_pkg[0] = dma_alloc_coherent(&pdev->dev,
DCP_MAX_PKG * sizeof(struct dcp_hw_packet),
&dev->hw_phys_pkg,
GFP_KERNEL);
if (!dev->hw_pkg[0]) {
dev_err(&pdev->dev, "Could not allocate hw descriptors\n");
ret = -ENOMEM;
goto err_free_irq1;
}
for (i = 1; i < DCP_MAX_PKG; i++) {
dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg
+ i * sizeof(struct dcp_hw_packet);
dev->hw_pkg[i] = dev->hw_pkg[i - 1] + 1;
}
dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg;
dev->payload_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128,
&dev->payload_base_dma, GFP_KERNEL);
if (!dev->payload_base) {
dev_err(&pdev->dev, "Could not allocate memory for key\n");
ret = -ENOMEM;
goto err_free_hw_packet;
}
tasklet_init(&dev->queue_task, dcp_queue_task,
(unsigned long) dev);
tasklet_init(&dev->done_task, dcp_done_task,
(unsigned long) dev);
spin_lock_init(&dev->queue_lock);
crypto_init_queue(&dev->queue, 10);
init_timer(&dev->watchdog);
dev->watchdog.function = &dcp_watchdog;
dev->watchdog.data = (unsigned long)dev;
dev->dcp_bootstream_misc.minor = MISC_DYNAMIC_MINOR;
dev->dcp_bootstream_misc.name = "dcpboot";
dev->dcp_bootstream_misc.fops = &dcp_bootstream_fops;
ret = misc_register(&dev->dcp_bootstream_misc);
if (ret != 0) {
dev_err(dev->dev, "Unable to register misc device\n");
goto err_free_key_iv;
}
for (i = 0; i < ARRAY_SIZE(algs); i++) {
algs[i].cra_priority = 300;
algs[i].cra_ctxsize = sizeof(struct dcp_op);
algs[i].cra_module = THIS_MODULE;
algs[i].cra_init = dcp_cra_init;
algs[i].cra_exit = dcp_cra_exit;
if (crypto_register_alg(&algs[i])) {
dev_err(&pdev->dev, "register algorithm failed\n");
ret = -ENOMEM;
goto err_unregister;
}
}
dev_notice(&pdev->dev, "DCP crypto enabled.\n");
return 0;
err_unregister:
for (j = 0; j < i; j++)
crypto_unregister_alg(&algs[j]);
err_free_key_iv:
dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base,
dev->payload_base_dma);
err_free_hw_packet:
dma_free_coherent(&pdev->dev, DCP_MAX_PKG *
sizeof(struct dcp_hw_packet), dev->hw_pkg[0],
dev->hw_phys_pkg);
err_free_irq1:
free_irq(dev->dcp_irq, dev);
err_free_irq0:
free_irq(dev->dcp_vmi_irq, dev);
return ret;
}
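/* Undo probe: free the DMA buffers and IRQs, stop the tasklets, and
 * unregister the algorithms and the misc device */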
static int dcp_remove(struct platform_device *pdev)
{
struct dcp_dev *dev;
int j;
dev = platform_get_drvdata(pdev);
dma_free_coherent(&pdev->dev,
DCP_MAX_PKG * sizeof(struct dcp_hw_packet),
dev->hw_pkg[0], dev->hw_phys_pkg);
dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base,
dev->payload_base_dma);
free_irq(dev->dcp_irq, dev);
free_irq(dev->dcp_vmi_irq, dev);
tasklet_kill(&dev->done_task);
tasklet_kill(&dev->queue_task);
for (j = 0; j < ARRAY_SIZE(algs); j++)
crypto_unregister_alg(&algs[j]);
misc_deregister(&dev->dcp_bootstream_misc);
return 0;
}
static struct of_device_id fs_dcp_of_match[] = {
{ .compatible = "fsl-dcp"},
{},
};
static struct platform_driver fs_dcp_driver = {
.probe = dcp_probe,
.remove = dcp_remove,
.driver = {
.name = "fsl-dcp",
.owner = THIS_MODULE,
.of_match_table = fs_dcp_of_match
}
};
module_platform_driver(fs_dcp_driver);
MODULE_AUTHOR("Tobias Rauter <tobias.rauter@gmail.com>");
MODULE_DESCRIPTION("Freescale DCP Crypto Driver");
MODULE_LICENSE("GPL");
@@ -2676,7 +2676,7 @@ static int hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hifn_reset_dma(dev, 1);
 	hifn_stop_device(dev);
 err_out_free_irq:
-	free_irq(dev->irq, dev->name);
+	free_irq(dev->irq, dev);
 	tasklet_kill(&dev->tasklet);
 err_out_free_desc:
 	pci_free_consistent(pdev, sizeof(struct hifn_dma),
@@ -2711,7 +2711,7 @@ static void hifn_remove(struct pci_dev *pdev)
 	hifn_reset_dma(dev, 1);
 	hifn_stop_device(dev);
-	free_irq(dev->irq, dev->name);
+	free_irq(dev->irq, dev);
 	tasklet_kill(&dev->tasklet);
 	hifn_flush(dev);
...
@@ -1146,7 +1146,6 @@ static int mv_probe(struct platform_device *pdev)
 err:
 	kfree(cp);
 	cpg = NULL;
-	platform_set_drvdata(pdev, NULL);
 	return ret;
 }
...
@@ -203,13 +203,6 @@ static void omap_aes_write_n(struct omap_aes_dev *dd, u32 offset,
 static int omap_aes_hw_init(struct omap_aes_dev *dd)
 {
-	/*
-	 * clocks are enabled when request starts and disabled when finished.
-	 * It may be long delays between requests.
-	 * Device might go to off mode to save power.
-	 */
-	pm_runtime_get_sync(dd->dev);
 	if (!(dd->flags & FLAGS_INIT)) {
 		dd->flags |= FLAGS_INIT;
 		dd->err = 0;
@@ -636,7 +629,6 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
 	pr_debug("err: %d\n", err);
-	pm_runtime_put(dd->dev);
 	dd->flags &= ~FLAGS_BUSY;
 	req->base.complete(&req->base, err);
@@ -837,8 +829,16 @@ static int omap_aes_ctr_decrypt(struct ablkcipher_request *req)
 static int omap_aes_cra_init(struct crypto_tfm *tfm)
 {
-	pr_debug("enter\n");
+	struct omap_aes_dev *dd = NULL;
+
+	/* Find AES device, currently picks the first device */
+	spin_lock_bh(&list_lock);
+	list_for_each_entry(dd, &dev_list, list) {
+		break;
+	}
+	spin_unlock_bh(&list_lock);
+	pm_runtime_get_sync(dd->dev);
 	tfm->crt_ablkcipher.reqsize = sizeof(struct omap_aes_reqctx);
 	return 0;
@@ -846,7 +846,16 @@ static int omap_aes_cra_init(struct crypto_tfm *tfm)
 static void omap_aes_cra_exit(struct crypto_tfm *tfm)
 {
-	pr_debug("enter\n");
+	struct omap_aes_dev *dd = NULL;
+
+	/* Find AES device, currently picks the first device */
+	spin_lock_bh(&list_lock);
+	list_for_each_entry(dd, &dev_list, list) {
+		break;
+	}
+	spin_unlock_bh(&list_lock);
+
+	pm_runtime_put_sync(dd->dev);
 }
 /* ********************** ALGS ************************************ */
@@ -1125,10 +1134,9 @@ static int omap_aes_probe(struct platform_device *pdev)
 	if (err)
 		goto err_res;
-	dd->io_base = devm_request_and_ioremap(dev, &res);
-	if (!dd->io_base) {
-		dev_err(dev, "can't ioremap\n");
-		err = -ENOMEM;
+	dd->io_base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(dd->io_base)) {
+		err = PTR_ERR(dd->io_base);
 		goto err_res;
 	}
 	dd->phys_base = res.start;
...
@@ -1686,10 +1686,9 @@ static int omap_sham_probe(struct platform_device *pdev)
 	if (err)
 		goto res_err;
-	dd->io_base = devm_request_and_ioremap(dev, &res);
-	if (!dd->io_base) {
-		dev_err(dev, "can't ioremap\n");
-		err = -ENOMEM;
+	dd->io_base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(dd->io_base)) {
+		err = PTR_ERR(dd->io_base);
 		goto res_err;
 	}
 	dd->phys_base = res.start;
...
@@ -1298,7 +1298,7 @@ static ssize_t spacc_stat_irq_thresh_store(struct device *dev,
 	struct spacc_engine *engine = spacc_dev_to_engine(dev);
 	unsigned long thresh;
-	if (strict_strtoul(buf, 0, &thresh))
+	if (kstrtoul(buf, 0, &thresh))
 		return -EINVAL;
 	thresh = clamp(thresh, 1UL, engine->fifo_sz - 1);
...
@@ -647,7 +647,6 @@ static int s5p_aes_probe(struct platform_device *pdev)
 	clk_disable(pdata->clk);
 	s5p_dev = NULL;
-	platform_set_drvdata(pdev, NULL);
 	return err;
 }
@@ -668,7 +667,6 @@ static int s5p_aes_remove(struct platform_device *pdev)
 	clk_disable(pdata->clk);
 	s5p_dev = NULL;
-	platform_set_drvdata(pdev, NULL);
 	return 0;
 }
...
@@ -1629,7 +1629,7 @@ static int ux500_cryp_remove(struct platform_device *pdev)
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res)
-		release_mem_region(res->start, res->end - res->start + 1);
+		release_mem_region(res->start, resource_size(res));
 	kfree(device_data);
...
@@ -3,6 +3,10 @@
 #include <linux/types.h>
+#define CRC_T10DIF_DIGEST_SIZE 2
+#define CRC_T10DIF_BLOCK_SIZE 1
+
+__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len);
 __u16 crc_t10dif(unsigned char const *, size_t);
 #endif
@@ -66,6 +66,8 @@ config CRC16
 config CRC_T10DIF
 	tristate "CRC calculation for the T10 Data Integrity Field"
+	select CRYPTO
+	select CRYPTO_CRCT10DIF
 	help
 	  This option is only needed if a module that's not in the
 	  kernel tree needs to calculate CRC checks for use with the
...
@@ -11,57 +11,44 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/crc-t10dif.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <crypto/hash.h>
-/* Table generated using the following polynomium:
- * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
- * gt: 0x8bb7
- */
-static const __u16 t10_dif_crc_table[256] = {
-	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
-	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
-	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
-	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
-	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
-	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
-	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
-	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
-	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
-	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
-	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
-	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
-	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
-	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
-	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
-	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
-	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
-	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
-	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
-	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
-	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
-	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
-	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
-	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
-	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
-	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
-	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
-	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
-};
+static struct crypto_shash *crct10dif_tfm;
 __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 {
-	__u16 crc = 0;
-	unsigned int i;
-	for (i = 0 ; i < len ; i++)
-		crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
-	return crc;
+	struct {
+		struct shash_desc shash;
+		char ctx[2];
+	} desc;
+	int err;
+
+	desc.shash.tfm = crct10dif_tfm;
+	desc.shash.flags = 0;
+	*(__u16 *)desc.ctx = 0;
+
+	err = crypto_shash_update(&desc.shash, buffer, len);
+	BUG_ON(err);
+
+	return *(__u16 *)desc.ctx;
 }
 EXPORT_SYMBOL(crc_t10dif);
+static int __init crc_t10dif_mod_init(void)
+{
+	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
+	return PTR_RET(crct10dif_tfm);
+}
+
+static void __exit crc_t10dif_mod_fini(void)
+{
+	crypto_free_shash(crct10dif_tfm);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_fini);
 MODULE_DESCRIPTION("T10 DIF CRC calculation");
 MODULE_LICENSE("GPL");