Commit 1ed55eac authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - Added aesni/avx/x86_64 implementations for camellia.

 - Optimised AVX code for cast5/serpent/twofish/cast6.

 - Fixed vmac bug with unaligned input.

 - Allowed compression algorithms in FIPS mode.

 - Optimised crc32c implementation for Intel.

 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
  crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
  crypto: testmgr - remove superfluous initializers for xts(aes)
  crypto: testmgr - allow compression algs in fips mode
  crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
  crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
  crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
  crypto: cast5/cast6 - move lookup tables to shared module
  padata: use __this_cpu_read per-cpu helper
  crypto: s5p-sss - Fix compilation error
  crypto: picoxcell - Add terminating entry for platform_device_id table
  crypto: omap-aes - select BLKCIPHER2
  crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
  crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
  crypto: tcrypt - add async speed test for camellia cipher
  crypto: tegra-aes - fix error-valued pointer dereference
  crypto: tegra - fix missing unlock on error case
  crypto: cast5/avx - avoid using temporary stack buffers
  crypto: serpent/avx - avoid using temporary stack buffers
  crypto: twofish/avx - avoid using temporary stack buffers
  crypto: cast6/avx - avoid using temporary stack buffers
  ...
parents 08242bc2 a2c0911c
...@@ -54,7 +54,8 @@ PROPERTIES ...@@ -54,7 +54,8 @@ PROPERTIES
- compatible - compatible
Usage: required Usage: required
Value type: <string> Value type: <string>
Definition: Must include "fsl,sec-v4.0" Definition: Must include "fsl,sec-v4.0". Also includes SEC
ERA versions (optional) with which the device is compatible.
- #address-cells - #address-cells
Usage: required Usage: required
...@@ -106,7 +107,7 @@ PROPERTIES ...@@ -106,7 +107,7 @@ PROPERTIES
EXAMPLE EXAMPLE
crypto@300000 { crypto@300000 {
compatible = "fsl,sec-v4.0"; compatible = "fsl,sec-v4.0", "fsl,sec-era-v2.0";
#address-cells = <1>; #address-cells = <1>;
#size-cells = <1>; #size-cells = <1>;
reg = <0x300000 0x10000>; reg = <0x300000 0x10000>;
......
...@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o ...@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
...@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o ...@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
...@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o ...@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
...@@ -32,53 +32,24 @@ ...@@ -32,53 +32,24 @@
#include <crypto/algapi.h> #include <crypto/algapi.h>
#include <crypto/lrw.h> #include <crypto/lrw.h>
#include <crypto/xts.h> #include <crypto/xts.h>
#include <asm/crypto/camellia.h>
#include <asm/crypto/glue_helper.h> #include <asm/crypto/glue_helper.h>
#define CAMELLIA_MIN_KEY_SIZE 16
#define CAMELLIA_MAX_KEY_SIZE 32
#define CAMELLIA_BLOCK_SIZE 16
#define CAMELLIA_TABLE_BYTE_LEN 272
struct camellia_ctx {
u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
u32 key_length;
};
/* regular block cipher functions */ /* regular block cipher functions */
asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor); const u8 *src, bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk);
asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
EXPORT_SYMBOL_GPL(camellia_dec_blk);
/* 2-way parallel cipher functions */ /* 2-way parallel cipher functions */
asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor); const u8 *src, bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, true);
}
static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, true);
}
static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{ {
...@@ -1275,8 +1246,7 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey) ...@@ -1275,8 +1246,7 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
camellia_setup256(kk, subkey); camellia_setup256(kk, subkey);
} }
static int __camellia_setkey(struct camellia_ctx *cctx, int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
const unsigned char *key,
unsigned int key_len, u32 *flags) unsigned int key_len, u32 *flags)
{ {
if (key_len != 16 && key_len != 24 && key_len != 32) { if (key_len != 16 && key_len != 24 && key_len != 32) {
...@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx, ...@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(__camellia_setkey);
static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len) unsigned int key_len)
...@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, ...@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
&tfm->crt_flags); &tfm->crt_flags);
} }
static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
{ {
u128 iv = *src; u128 iv = *src;
...@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) ...@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
u128_xor(&dst[1], &dst[1], &iv); u128_xor(&dst[1], &dst[1], &iv);
} }
EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{ {
be128 ctrblk; be128 ctrblk;
if (dst != src) if (dst != src)
*dst = *src; *dst = *src;
u128_to_be128(&ctrblk, iv); le128_to_be128(&ctrblk, iv);
u128_inc(iv); le128_inc(iv);
camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
} }
EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128 *iv)
{ {
be128 ctrblks[2]; be128 ctrblks[2];
...@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, ...@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
dst[1] = src[1]; dst[1] = src[1];
} }
u128_to_be128(&ctrblks[0], iv); le128_to_be128(&ctrblks[0], iv);
u128_inc(iv); le128_inc(iv);
u128_to_be128(&ctrblks[1], iv); le128_to_be128(&ctrblks[1], iv);
u128_inc(iv); le128_inc(iv);
camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
} }
EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
static const struct common_glue_ctx camellia_enc = { static const struct common_glue_ctx camellia_enc = {
.num_funcs = 2, .num_funcs = 2,
...@@ -1464,12 +1437,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ...@@ -1464,12 +1437,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
camellia_dec_blk(ctx, srcdst, srcdst); camellia_dec_blk(ctx, srcdst, srcdst);
} }
struct camellia_lrw_ctx { int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
struct lrw_table_ctx lrw_table;
struct camellia_ctx camellia_ctx;
};
static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen) unsigned int keylen)
{ {
struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
...@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, ...@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
return lrw_init_table(&ctx->lrw_table, return lrw_init_table(&ctx->lrw_table,
key + keylen - CAMELLIA_BLOCK_SIZE); key + keylen - CAMELLIA_BLOCK_SIZE);
} }
EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes) struct scatterlist *src, unsigned int nbytes)
...@@ -1519,19 +1488,15 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, ...@@ -1519,19 +1488,15 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
return lrw_crypt(desc, dst, src, nbytes, &req); return lrw_crypt(desc, dst, src, nbytes, &req);
} }
static void lrw_exit_tfm(struct crypto_tfm *tfm) void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
{ {
struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
lrw_free_table(&ctx->lrw_table); lrw_free_table(&ctx->lrw_table);
} }
EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
struct camellia_xts_ctx { int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
struct camellia_ctx tweak_ctx;
struct camellia_ctx crypt_ctx;
};
static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen) unsigned int keylen)
{ {
struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
...@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, ...@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
flags); flags);
} }
EXPORT_SYMBOL_GPL(xts_camellia_setkey);
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes) struct scatterlist *src, unsigned int nbytes)
...@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { { ...@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
.cra_alignmask = 0, .cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type, .cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
.cra_exit = lrw_exit_tfm, .cra_exit = lrw_camellia_exit_tfm,
.cra_u = { .cra_u = {
.blkcipher = { .blkcipher = {
.min_keysize = CAMELLIA_MIN_KEY_SIZE + .min_keysize = CAMELLIA_MIN_KEY_SIZE +
......
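The camellia_glue.c changes above move the key-size constants, the camellia_ctx/lrw/xts context structures and the assembler prototypes out of the glue file, and export the remaining helpers, so that the new AES-NI/AVX glue module from this series can reuse them. The shared header itself is not shown in full here; the following is a rough reconstruction of what <asm/crypto/camellia.h> plausibly declares, pieced together only from the lines removed in this hunk (the guard name, includes and exact layout are assumptions):

/*
 * Rough reconstruction of the shared header <asm/crypto/camellia.h>,
 * assembled from the definitions removed from camellia_glue.c above.
 * Guard name and exact contents are assumptions; only pieces visible
 * in this hunk are listed.
 */
#ifndef ASM_X86_CRYPTO_CAMELLIA_H
#define ASM_X86_CRYPTO_CAMELLIA_H

#include <linux/kernel.h>
#include <linux/crypto.h>
#include <crypto/lrw.h>

#define CAMELLIA_MIN_KEY_SIZE	16
#define CAMELLIA_MAX_KEY_SIZE	32
#define CAMELLIA_BLOCK_SIZE	16
#define CAMELLIA_TABLE_BYTE_LEN	272

struct camellia_ctx {
	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
	u32 key_length;
};

struct camellia_lrw_ctx {
	struct lrw_table_ctx lrw_table;
	struct camellia_ctx camellia_ctx;
};

struct camellia_xts_ctx {
	struct camellia_ctx tweak_ctx;
	struct camellia_ctx crypt_ctx;
};

/* regular and 2-way parallel assembler functions, now exported */
asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
				   const u8 *src, bool xor);
asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
				 const u8 *src);
asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
					const u8 *src, bool xor);
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
				      const u8 *src);

/* the inline xor/non-xor wrappers removed from the glue file move here */
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
				    const u8 *src)
{
	__camellia_enc_blk(ctx, dst, src, false);
}

static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
					const u8 *src)
{
	__camellia_enc_blk(ctx, dst, src, true);
}

int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
		      unsigned int key_len, u32 *flags);

#endif /* ASM_X86_CRYPTO_CAMELLIA_H */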
...@@ -37,29 +37,14 @@ ...@@ -37,29 +37,14 @@
#define CAST5_PARALLEL_BLOCKS 16 #define CAST5_PARALLEL_BLOCKS 16
asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
const u8 *src) asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
{ const u8 *src);
__cast5_enc_blk_16way(ctx, dst, src, false); asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
} __be64 *iv);
static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast5_enc_blk_16way(ctx, dst, src, true);
}
static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
cast5_dec_blk_16way(ctx, dst, src);
}
static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{ {
...@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, ...@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE; const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes; unsigned int nbytes;
void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
int err; int err;
fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
err = blkcipher_walk_virt(desc, walk); err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
...@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, ...@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
/* Process multi-block batch */ /* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do { do {
if (enc) fn(ctx, wdst, wsrc);
cast5_enc_blk_xway(ctx, wdst, wsrc);
else
cast5_dec_blk_xway(ctx, wdst, wsrc);
wsrc += bsize * CAST5_PARALLEL_BLOCKS; wsrc += bsize * CAST5_PARALLEL_BLOCKS;
wdst += bsize * CAST5_PARALLEL_BLOCKS; wdst += bsize * CAST5_PARALLEL_BLOCKS;
...@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, ...@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
goto done; goto done;
} }
fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
/* Handle leftovers */ /* Handle leftovers */
do { do {
if (enc) fn(ctx, wdst, wsrc);
__cast5_encrypt(ctx, wdst, wsrc);
else
__cast5_decrypt(ctx, wdst, wsrc);
wsrc += bsize; wsrc += bsize;
wdst += bsize; wdst += bsize;
...@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, ...@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes; unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr; u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
u64 last_iv; u64 last_iv;
int i;
/* Start of the last block. */ /* Start of the last block. */
src += nbytes / bsize - 1; src += nbytes / bsize - 1;
...@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, ...@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
src -= CAST5_PARALLEL_BLOCKS - 1; src -= CAST5_PARALLEL_BLOCKS - 1;
dst -= CAST5_PARALLEL_BLOCKS - 1; dst -= CAST5_PARALLEL_BLOCKS - 1;
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
ivs[i] = src[i];
cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
*(dst + (i + 1)) ^= *(ivs + i);
nbytes -= bsize; nbytes -= bsize;
if (nbytes < bsize) if (nbytes < bsize)
...@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, ...@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes; unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr; u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
int i;
/* Process multi-block batch */ /* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do { do {
/* create ctrblks for parallel encrypt */ cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { (__be64 *)walk->iv);
if (dst != src)
dst[i] = src[i];
ctrblocks[i] = cpu_to_be64(ctrblk++);
}
cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
(u8 *)ctrblocks);
src += CAST5_PARALLEL_BLOCKS; src += CAST5_PARALLEL_BLOCKS;
dst += CAST5_PARALLEL_BLOCKS; dst += CAST5_PARALLEL_BLOCKS;
...@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, ...@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
/* Handle leftovers */ /* Handle leftovers */
do { do {
u64 ctrblk;
if (dst != src) if (dst != src)
*dst = *src; *dst = *src;
ctrblocks[0] = cpu_to_be64(ctrblk++); ctrblk = *(u64 *)walk->iv;
be64_add_cpu((__be64 *)walk->iv, 1);
__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
*dst ^= ctrblocks[0]; *dst ^= ctrblk;
src += 1; src += 1;
dst += 1; dst += 1;
...@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, ...@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
} while (nbytes >= bsize); } while (nbytes >= bsize);
done: done:
*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
return nbytes; return nbytes;
} }
......
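One detail worth calling out in the leftover path above: the rewritten __ctr_crypt() no longer carries a host-order copy of the counter across the whole walk. For each remaining block it reads the big-endian IV straight from walk->iv, advances it in place with be64_add_cpu(), and encrypts the pre-increment value. A minimal stand-alone model of that helper (the real one is a kernel byteorder helper; this version assumes a little-endian host and is for illustration only):

#include <stdint.h>

/*
 * Model of be64_add_cpu(): add a host-order value to a 64-bit counter
 * stored in memory in big-endian byte order, the way walk->iv is used
 * in the hunk above.  __builtin_bswap64 stands in for the kernel's
 * be64_to_cpu()/cpu_to_be64() pair on a little-endian host.
 */
static void be64_add_cpu_model(uint64_t *be_counter, uint64_t val)
{
	*be_counter = __builtin_bswap64(__builtin_bswap64(*be_counter) + val);
}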
...@@ -23,22 +23,24 @@ ...@@ -23,22 +23,24 @@
* *
*/ */
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S" .file "cast6-avx-x86_64-asm_64.S"
.extern cast6_s1 .extern cast_s1
.extern cast6_s2 .extern cast_s2
.extern cast6_s3 .extern cast_s3
.extern cast6_s4 .extern cast_s4
/* structure of crypto context */ /* structure of crypto context */
#define km 0 #define km 0
#define kr (12*4*4) #define kr (12*4*4)
/* s-boxes */ /* s-boxes */
#define s1 cast6_s1 #define s1 cast_s1
#define s2 cast6_s2 #define s2 cast_s2
#define s3 cast6_s3 #define s3 cast_s3
#define s4 cast6_s4 #define s4 cast_s4
/********************************************************************** /**********************************************************************
8-way AVX cast6 8-way AVX cast6
...@@ -205,11 +207,7 @@ ...@@ -205,11 +207,7 @@
vpunpcklqdq x3, t2, x2; \ vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3; vpunpckhqdq x3, t2, x3;
#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ #define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
vpshufb rmask, x0, x0; \ vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \ vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \ vpshufb rmask, x2, x2; \
...@@ -217,39 +215,21 @@ ...@@ -217,39 +215,21 @@
\ \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ #define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\ \
vpshufb rmask, x0, x0; \ vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \ vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \ vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \ vpshufb rmask, x3, x3;
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);
#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
.data .data
.align 16 .align 16
.Lbswap_mask: .Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR: .Lrkr_enc_Q_Q_QBAR_QBAR:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR: .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
...@@ -269,31 +249,26 @@ ...@@ -269,31 +249,26 @@
.text .text
.align 16 .align 8
.global __cast6_enc_blk_8way .type __cast6_enc_blk8,@function;
.type __cast6_enc_blk_8way,@function;
__cast6_enc_blk_8way: __cast6_enc_blk8:
/* input: /* input:
* %rdi: ctx, CTX * %rdi: ctx, CTX
* %rsi: dst * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* %rdx: src * output:
* %rcx: bool, if true: xor output * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/ */
pushq %rbp; pushq %rbp;
pushq %rbx; pushq %rbx;
pushq %rcx;
vmovdqa .Lbswap_mask, RKM; vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST; vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32; vmovd .L32_mask, R32;
leaq (4*4*4)(%rdx), %rax; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
movq %rsi, %r11;
preload_rkr(0, dummy, none); preload_rkr(0, dummy, none);
Q(0); Q(0);
...@@ -311,36 +286,25 @@ __cast6_enc_blk_8way: ...@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
QBAR(10); QBAR(10);
QBAR(11); QBAR(11);
popq %rcx;
popq %rbx; popq %rbx;
popq %rbp; popq %rbp;
vmovdqa .Lbswap_mask, RKM; vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax;
testb %cl, %cl;
jnz __enc_xor8;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret; ret;
__enc_xor8: .align 8
outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); .type __cast6_dec_blk8,@function;
outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret; __cast6_dec_blk8:
.align 16
.global cast6_dec_blk_8way
.type cast6_dec_blk_8way,@function;
cast6_dec_blk_8way:
/* input: /* input:
* %rdi: ctx, CTX * %rdi: ctx, CTX
* %rsi: dst * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* %rdx: src * output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/ */
pushq %rbp; pushq %rbp;
...@@ -350,11 +314,8 @@ cast6_dec_blk_8way: ...@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
vmovd .Lfirst_mask, R1ST; vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32; vmovd .L32_mask, R32;
leaq (4*4*4)(%rdx), %rax; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
movq %rsi, %r11;
preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
Q(11); Q(11);
...@@ -376,8 +337,103 @@ cast6_dec_blk_8way: ...@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
popq %rbp; popq %rbp;
vmovdqa .Lbswap_mask, RKM; vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret;
.align 8
.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;
cast6_ecb_enc_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_enc_blk8;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;
cast6_ecb_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_dec_blk8;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;
cast6_cbc_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_dec_blk8;
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
popq %r12;
ret;
.align 8
.global cast6_ctr_8way
.type cast6_ctr_8way,@function;
cast6_ctr_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RX, RKR, RKM);
call __cast6_enc_blk8;
store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
popq %r12;
ret; ret;
...@@ -40,79 +40,34 @@ ...@@ -40,79 +40,34 @@
#define CAST6_PARALLEL_BLOCKS 8 #define CAST6_PARALLEL_BLOCKS 8
asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src, bool xor); const u8 *src);
asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src); const u8 *src);
static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src) const u8 *src);
{ asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
__cast6_enc_blk_8way(ctx, dst, src, false); le128 *iv);
}
static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast6_enc_blk_8way(ctx, dst, src, true);
}
static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
cast6_dec_blk_8way(ctx, dst, src);
}
static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
unsigned int j;
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];
cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{ {
be128 ctrblk; be128 ctrblk;
u128_to_be128(&ctrblk, iv); le128_to_be128(&ctrblk, iv);
u128_inc(iv); le128_inc(iv);
__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk);
} }
static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[CAST6_PARALLEL_BLOCKS];
unsigned int i;
for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}
cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}
static const struct common_glue_ctx cast6_enc = { static const struct common_glue_ctx cast6_enc = {
.num_funcs = 2, .num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
.funcs = { { .funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS, .num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
...@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = { ...@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
.funcs = { { .funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS, .num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
...@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = { ...@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
.funcs = { { .funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS, .num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
...@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = { ...@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
.funcs = { { .funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS, .num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
...@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ...@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return; return;
} }
...@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ...@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return; return;
} }
......
...@@ -32,6 +32,8 @@ ...@@ -32,6 +32,8 @@
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/cpu_device_id.h> #include <asm/cpu_device_id.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4 #define CHKSUM_DIGEST_SIZE 4
...@@ -44,6 +46,31 @@ ...@@ -44,6 +46,31 @@
#define REX_PRE #define REX_PRE
#endif #endif
#ifdef CONFIG_X86_64
/*
* use carryless multiply version of crc32c when buffer
* size is >= 512 (when eager fpu is enabled) or
* >= 1024 (when eager fpu is disabled) to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
#if defined(X86_FEATURE_EAGER_FPU)
#define set_pcl_breakeven_point() \
do { \
if (!use_eager_fpu()) \
crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
} while (0)
#else
#define set_pcl_breakeven_point() \
(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
#endif
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{ {
while (length--) { while (length--) {
...@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm) ...@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
return 0; return 0;
} }
#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
#endif /* CONFIG_X86_64 */
static struct shash_alg alg = { static struct shash_alg alg = {
.setkey = crc32c_intel_setkey, .setkey = crc32c_intel_setkey,
.init = crc32c_intel_init, .init = crc32c_intel_init,
...@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void) ...@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
{ {
if (!x86_match_cpu(crc32c_cpu_id)) if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV; return -ENODEV;
#ifdef CONFIG_X86_64
if (cpu_has_pclmulqdq) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
set_pcl_breakeven_point();
}
#endif
return crypto_register_shash(&alg); return crypto_register_shash(&alg);
} }
......
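The comment block and the update/finup routines above encode a single decision: the PCLMULQDQ routine (crc_pcl) is only worth calling when the buffer is large enough to amortize the kernel FPU save/restore, and only when the FPU is usable in the current context; otherwise the plain CRC32-instruction path is kept. Restated as a stand-alone predicate (the wrapper function is illustrative; the names and thresholds come from the hunk):

#include <stdbool.h>

/*
 * The test performed by crc32c_pcl_intel_update()/__crc32c_pcl_intel_finup():
 * breakeven is 512 bytes with eager FPU, 1024 without, and fpu_usable
 * corresponds to irq_fpu_usable().
 */
static bool use_pclmulqdq_path(unsigned int len, unsigned int breakeven,
			       bool fpu_usable)
{
	return len >= breakeven && fpu_usable;
}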
/*
* Shared glue code for 128bit block ciphers, AVX assembler macros
*
* Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu (0*16)(src), x0; \
vmovdqu (1*16)(src), x1; \
vmovdqu (2*16)(src), x2; \
vmovdqu (3*16)(src), x3; \
vmovdqu (4*16)(src), x4; \
vmovdqu (5*16)(src), x5; \
vmovdqu (6*16)(src), x6; \
vmovdqu (7*16)(src), x7;
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu x0, (0*16)(dst); \
vmovdqu x1, (1*16)(dst); \
vmovdqu x2, (2*16)(dst); \
vmovdqu x3, (3*16)(dst); \
vmovdqu x4, (4*16)(dst); \
vmovdqu x5, (5*16)(dst); \
vmovdqu x6, (6*16)(dst); \
vmovdqu x7, (7*16)(dst);
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x1, x1; \
vpxor (1*16)(src), x2, x2; \
vpxor (2*16)(src), x3, x3; \
vpxor (3*16)(src), x4, x4; \
vpxor (4*16)(src), x5, x5; \
vpxor (5*16)(src), x6, x6; \
vpxor (6*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
vpcmpeqd t0, t0, t0; \
vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
vmovdqa bswap, t1; \
\
/* load IV and byteswap */ \
vmovdqu (iv), x7; \
vpshufb t1, x7, x0; \
\
/* construct IVs */ \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x1; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x2; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x3; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x4; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x5; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x6; \
inc_le128(x7, t0, t2); \
vmovdqa x7, t2; \
vpshufb t1, x7, x7; \
inc_le128(t2, t0, t1); \
vmovdqu t2, (iv);
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x0, x0; \
vpxor (1*16)(src), x1, x1; \
vpxor (2*16)(src), x2, x2; \
vpxor (3*16)(src), x3, x3; \
vpxor (4*16)(src), x4, x4; \
vpxor (5*16)(src), x5, x5; \
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
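The only non-obvious macro in this new file is inc_le128: it performs a full 128-bit little-endian increment using only 64-bit SIMD lanes. vpcmpeqq turns the low qword into an all-ones mask exactly when it is about to wrap, vpsubq of the (-1, 0) constant adds one to the low lane, vpslldq moves the mask into the high lane, and the final vpsubq subtracts -1 there, i.e. propagates the carry. In scalar terms (a hypothetical model, not kernel code):

#include <stdint.h>

/*
 * Scalar model of the inc_le128 macro: increment a 128-bit counter held
 * little-endian as two 64-bit halves, with carry from low to high.
 */
struct u128_halves {
	uint64_t lo;	/* least-significant 64 bits */
	uint64_t hi;	/* most-significant 64 bits */
};

static void inc_le128_model(struct u128_halves *x)
{
	x->hi += (x->lo == UINT64_MAX);	/* carry out of the low half */
	x->lo += 1;
}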
...@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, ...@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
u8 *src = (u8 *)walk->src.virt.addr; u8 *src = (u8 *)walk->src.virt.addr;
u8 *dst = (u8 *)walk->dst.virt.addr; u8 *dst = (u8 *)walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes; unsigned int nbytes = walk->nbytes;
u128 ctrblk; le128 ctrblk;
u128 tmp; u128 tmp;
be128_to_u128(&ctrblk, (be128 *)walk->iv); be128_to_le128(&ctrblk, (be128 *)walk->iv);
memcpy(&tmp, src, nbytes); memcpy(&tmp, src, nbytes);
fn_ctr(ctx, &tmp, &tmp, &ctrblk); fn_ctr(ctx, &tmp, &tmp, &ctrblk);
memcpy(dst, &tmp, nbytes); memcpy(dst, &tmp, nbytes);
u128_to_be128((be128 *)walk->iv, &ctrblk); le128_to_be128((be128 *)walk->iv, &ctrblk);
} }
EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
...@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, ...@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
unsigned int nbytes = walk->nbytes; unsigned int nbytes = walk->nbytes;
u128 *src = (u128 *)walk->src.virt.addr; u128 *src = (u128 *)walk->src.virt.addr;
u128 *dst = (u128 *)walk->dst.virt.addr; u128 *dst = (u128 *)walk->dst.virt.addr;
u128 ctrblk; le128 ctrblk;
unsigned int num_blocks, func_bytes; unsigned int num_blocks, func_bytes;
unsigned int i; unsigned int i;
be128_to_u128(&ctrblk, (be128 *)walk->iv); be128_to_le128(&ctrblk, (be128 *)walk->iv);
/* Process multi-block batch */ /* Process multi-block batch */
for (i = 0; i < gctx->num_funcs; i++) { for (i = 0; i < gctx->num_funcs; i++) {
...@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, ...@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
} }
done: done:
u128_to_be128((be128 *)walk->iv, &ctrblk); le128_to_be128((be128 *)walk->iv, &ctrblk);
return nbytes; return nbytes;
} }
......
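The net effect of the le128 conversion in glue_helper.c is that the live CTR counter is kept in little-endian form between blocks, so it can be incremented cheaply and handed directly to the new 8-way assembler routines, and is only byte-swapped back into the big-endian counter-block format when it is written to walk->iv or fed to a single-block cipher call. A self-contained model of that round trip, using local types rather than the kernel's le128/be128 definitions:

#include <stdint.h>

/* Numeric 128-bit counter in host order; stands in for the glue code's
 * working le128 value. */
struct ctr128 {
	uint64_t hi, lo;
};

/* walk->iv (big-endian bytes) -> working counter: the be128_to_le128 step */
static struct ctr128 ctr_from_be_bytes(const uint8_t iv[16])
{
	struct ctr128 c = { 0, 0 };
	int i;

	for (i = 0; i < 8; i++) {
		c.hi = (c.hi << 8) | iv[i];
		c.lo = (c.lo << 8) | iv[8 + i];
	}
	return c;
}

/* working counter -> counter block fed to the cipher: the le128_to_be128 step */
static void ctr_to_be_bytes(const struct ctr128 *c, uint8_t out[16])
{
	int i;

	for (i = 0; i < 8; i++) {
		out[i]     = (uint8_t)(c->hi >> (56 - 8 * i));
		out[8 + i] = (uint8_t)(c->lo >> (56 - 8 * i));
	}
}

/* le128_inc(): plain 128-bit increment of the working counter */
static void ctr_inc(struct ctr128 *c)
{
	if (++c->lo == 0)
		c->hi++;
}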
...@@ -24,7 +24,16 @@ ...@@ -24,7 +24,16 @@
* *
*/ */
#include "glue_helper-asm-avx.S"
.file "serpent-avx-x86_64-asm_64.S" .file "serpent-avx-x86_64-asm_64.S"
.data
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.text .text
#define CTX %rdi #define CTX %rdi
...@@ -550,51 +559,27 @@ ...@@ -550,51 +559,27 @@
vpunpcklqdq x3, t2, x2; \ vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3; vpunpckhqdq x3, t2, x3;
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
\
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
.align 8 .align 8
.global __serpent_enc_blk_8way_avx .type __serpent_enc_blk8_avx,@function;
.type __serpent_enc_blk_8way_avx,@function;
__serpent_enc_blk_8way_avx: __serpent_enc_blk8_avx:
/* input: /* input:
* %rdi: ctx, CTX * %rdi: ctx, CTX
* %rsi: dst * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* %rdx: src * output:
* %rcx: bool, if true: xor output * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/ */
vpcmpeqd RNOT, RNOT, RNOT; vpcmpeqd RNOT, RNOT, RNOT;
leaq (4*4*4)(%rdx), %rax; read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
K2(RA, RB, RC, RD, RE, 0); K2(RA, RB, RC, RD, RE, 0);
S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
...@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: ...@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
leaq (4*4*4)(%rsi), %rax; write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
testb %cl, %cl;
jnz __enc_xor8;
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret; ret;
.align 8 .align 8
.global serpent_dec_blk_8way_avx .type __serpent_dec_blk8_avx,@function;
.type serpent_dec_blk_8way_avx,@function;
serpent_dec_blk_8way_avx: __serpent_dec_blk8_avx:
/* input: /* input:
* %rdi: ctx, CTX * %rdi: ctx, CTX
* %rsi: dst * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* %rdx: src * output:
* RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
*/ */
vpcmpeqd RNOT, RNOT, RNOT; vpcmpeqd RNOT, RNOT, RNOT;
leaq (4*4*4)(%rdx), %rax; read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
K2(RA, RB, RC, RD, RE, 32); K2(RA, RB, RC, RD, RE, 32);
SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
...@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: ...@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
leaq (4*4*4)(%rsi), %rax; write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
ret;
.align 8
.global serpent_ecb_enc_8way_avx
.type serpent_ecb_enc_8way_avx,@function;
serpent_ecb_enc_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_enc_blk8_avx;
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global serpent_ecb_dec_8way_avx
.type serpent_ecb_dec_8way_avx,@function;
serpent_ecb_dec_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_dec_blk8_avx;
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
.align 8
.global serpent_cbc_dec_8way_avx
.type serpent_cbc_dec_8way_avx,@function;
serpent_cbc_dec_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_dec_blk8_avx;
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
.align 8
.global serpent_ctr_8way_avx
.type serpent_ctr_8way_avx,@function;
serpent_ctr_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RK0, RK1, RK2);
call __serpent_enc_blk8_avx;
store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret; ret;
...@@ -42,55 +42,24 @@ ...@@ -42,55 +42,24 @@
#include <asm/crypto/ablk_helper.h> #include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h> #include <asm/crypto/glue_helper.h>
static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
unsigned int j;
for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];
serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
{ {
be128 ctrblk; be128 ctrblk;
u128_to_be128(&ctrblk, iv); le128_to_be128(&ctrblk, iv);
u128_inc(iv); le128_inc(iv);
__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk);
} }
static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}
serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}
static const struct common_glue_ctx serpent_enc = { static const struct common_glue_ctx serpent_enc = {
.num_funcs = 2, .num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
.funcs = { { .funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS, .num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
...@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = { ...@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
.funcs = { { .funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS, .num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
...@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = { ...@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
.funcs = { { .funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS, .num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
...@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = { ...@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
.funcs = { { .funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS, .num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
}, { }, {
.num_blocks = 1, .num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
...@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ...@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
return; return;
} }
...@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ...@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
return; return;
} }
......
...@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) ...@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
} }
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{ {
be128 ctrblk; be128 ctrblk;
u128_to_be128(&ctrblk, iv); le128_to_be128(&ctrblk, iv);
u128_inc(iv); le128_inc(iv);
__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk);
} }
static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv) le128 *iv)
{ {
be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i; unsigned int i;
...@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, ...@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src) if (dst != src)
dst[i] = src[i]; dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv); le128_to_be128(&ctrblks[i], iv);
u128_inc(iv); le128_inc(iv);
} }
serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
......
...@@ -23,7 +23,16 @@ ...@@ -23,7 +23,16 @@
* *
*/ */
#include "glue_helper-asm-avx.S"
.file "twofish-avx-x86_64-asm_64.S" .file "twofish-avx-x86_64-asm_64.S"
.data
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.text .text
/* structure of crypto context */ /* structure of crypto context */
...@@ -217,69 +226,45 @@ ...@@ -217,69 +226,45 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;

-#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	vpxor (0*4*4)(in),	wkey, x0; \
-	vpxor (1*4*4)(in),	wkey, x1; \
-	vpxor (2*4*4)(in),	wkey, x2; \
-	vpxor (3*4*4)(in),	wkey, x3; \
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+	vpxor		x0, wkey, x0; \
+	vpxor		x1, wkey, x1; \
+	vpxor		x2, wkey, x2; \
+	vpxor		x3, wkey, x3; \
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor x0, wkey, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor x1, wkey, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor x2, wkey, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor x3, wkey, x3; \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor x0, wkey, x0; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor x1, wkey, x1; \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor x2, wkey, x2; \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor x3, wkey, x3; \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpxor		x0, wkey, x0; \
+	vpxor		x1, wkey, x1; \
+	vpxor		x2, wkey, x2; \
+	vpxor		x3, wkey, x3;

 .align 8
-.global __twofish_enc_blk_8way
-.type __twofish_enc_blk_8way,@function;
+.type	__twofish_enc_blk8,@function;

-__twofish_enc_blk_8way:
+__twofish_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
 	 */
+	vmovdqu w(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;

-	vmovdqu w(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
 	preload_rgi(RA1);
 	rotate_1l(RD1);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 	rotate_1l(RD2);

-	movq %rsi, %r11;
-
 	encrypt_cycle(0);
 	encrypt_cycle(1);
 	encrypt_cycle(2);

@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
 	popq %rbx;
 	popq %rbp;

-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
-
-	ret;
-
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

 	ret;

 .align 8
-.global twofish_dec_blk_8way
-.type twofish_dec_blk_8way,@function;
+.type	__twofish_dec_blk8,@function;

-twofish_dec_blk_8way:
+__twofish_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
+	vmovdqu (w+4*4)(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;

-	vmovdqu (w+4*4)(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
 	preload_rgi(RC1);
 	rotate_1l(RA1);
-	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 	rotate_1l(RA2);

-	movq %rsi, %r11;
-
 	decrypt_cycle(7);
 	decrypt_cycle(6);
 	decrypt_cycle(5);

@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
 	popq %rbx;
 	popq %rbp;

-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_enc_8way
+.type   twofish_ecb_enc_8way,@function;
+
+twofish_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __twofish_enc_blk8;
+
+	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_dec_8way
+.type   twofish_ecb_dec_8way,@function;
+
+twofish_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global twofish_cbc_dec_8way
+.type   twofish_cbc_dec_8way,@function;
+
+twofish_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global twofish_ctr_8way
+.type   twofish_ctr_8way,@function;
+
+twofish_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX0, RX1, RY0);
+
+	call __twofish_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	popq %r12;

 	ret;
@@ -45,66 +45,23 @@

 #define TWOFISH_PARALLEL_BLOCKS 8

-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
 /* 8-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
-				       const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-
-static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
-					    const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, true);
-}
-
-static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	twofish_dec_blk_8way(ctx, dst, src);
-}
-
-static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
-static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				     u128 *iv)
-{
-	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		u128_to_be128(&ctrblks[i], iv);
-		u128_inc(iv);
-	}
-
-	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_3way(ctx, dst, src, false);
+}

 static const struct common_glue_ctx twofish_enc = {
 	.num_funcs = 3,

@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }

@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }

@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }

@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }

@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}

@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
......
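The following sketch is illustrative only and not part of the patch: it shows how a caller could drive the new 8-way ECB entry point declared above, assuming a key-initialised struct twofish_ctx, buffers holding exactly eight 16-byte blocks, and that a plain kernel_fpu_begin()/kernel_fpu_end() pair is an acceptable stand-in for the glue code's FPU handling.

static void twofish_avx_demo_encrypt_8_blocks(struct twofish_ctx *ctx,
					      u8 *dst, const u8 *src)
{
	kernel_fpu_begin();			/* AVX register state is clobbered */
	twofish_ecb_enc_8way(ctx, dst, src);	/* eight 16-byte blocks in one call */
	kernel_fpu_end();
}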
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
 }
 EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);

-void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;

 	if (dst != src)
 		*dst = *src;

-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);

 	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, dst, (u128 *)&ctrblk);

@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);

 void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
-			      u128 *iv)
+			      le128 *iv)
 {
 	be128 ctrblks[3];

@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
 		dst[2] = src[2];
 	}

-	u128_to_be128(&ctrblks[0], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[1], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[2], iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblks[0], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[1], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[2], iv);
+	le128_inc(iv);

 	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
......
#ifndef ASM_X86_CAMELLIA_H
#define ASM_X86_CAMELLIA_H
#include <linux/kernel.h>
#include <linux/crypto.h>
#define CAMELLIA_MIN_KEY_SIZE 16
#define CAMELLIA_MAX_KEY_SIZE 32
#define CAMELLIA_BLOCK_SIZE 16
#define CAMELLIA_TABLE_BYTE_LEN 272
#define CAMELLIA_PARALLEL_BLOCKS 2
struct camellia_ctx {
u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
u32 key_length;
};
struct camellia_lrw_ctx {
struct lrw_table_ctx lrw_table;
struct camellia_ctx camellia_ctx;
};
struct camellia_xts_ctx {
struct camellia_ctx tweak_ctx;
struct camellia_ctx crypt_ctx;
};
extern int __camellia_setkey(struct camellia_ctx *cctx,
const unsigned char *key,
unsigned int key_len, u32 *flags);
extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm);
extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
/* 2-way parallel cipher functions */
asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, true);
}
static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, true);
}
/* glue helpers */
extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
#endif /* ASM_X86_CAMELLIA_H */
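A hypothetical usage sketch for the shared header above (not part of the patch; the function name is made up): key setup through __camellia_setkey() followed by a single-block encryption with the camellia_enc_blk() wrapper.

static int camellia_demo_encrypt_one_block(struct camellia_ctx *ctx,
					   const u8 *key, unsigned int keylen,
					   u8 *dst, const u8 *src)
{
	u32 flags = 0;
	int err;

	/* keylen must lie between CAMELLIA_MIN_KEY_SIZE and CAMELLIA_MAX_KEY_SIZE */
	err = __camellia_setkey(ctx, key, keylen, &flags);
	if (err)
		return err;

	camellia_enc_blk(ctx, dst, src);	/* one CAMELLIA_BLOCK_SIZE block */
	return 0;
}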
@@ -13,7 +13,7 @@
 typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
 typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
 typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
-				       u128 *iv);
+				       le128 *iv);

 #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
 #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))

@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
 		kernel_fpu_end();
 }

-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static inline void le128_to_be128(be128 *dst, const le128 *src)
 {
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
+	dst->a = cpu_to_be64(le64_to_cpu(src->a));
+	dst->b = cpu_to_be64(le64_to_cpu(src->b));
 }

-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static inline void be128_to_le128(le128 *dst, const be128 *src)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	dst->a = cpu_to_le64(be64_to_cpu(src->a));
+	dst->b = cpu_to_le64(be64_to_cpu(src->b));
 }

-static inline void u128_inc(u128 *i)
+static inline void le128_inc(le128 *i)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	u64 a = le64_to_cpu(i->a);
+	u64 b = le64_to_cpu(i->b);
+
+	b++;
+	if (!b)
+		a++;
+
+	i->a = cpu_to_le64(a);
+	i->b = cpu_to_le64(b);
 }

 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
......
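A minimal sketch of how the reworked helpers above are meant to be used when stepping a 128-bit little-endian CTR counter (illustrative only, not part of the patch; the function name is made up, the types and helpers are the ones defined in the header):

static void le128_ctr_step_demo(le128 *iv, be128 *ctrblk)
{
	le128_to_be128(ctrblk, iv);	/* counter block handed to the cipher */
	le128_inc(iv);			/* bump the low 64 bits, carry into the high 64 */
}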
@@ -6,27 +6,14 @@

 #define SERPENT_PARALLEL_BLOCKS 8

-asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
-					   const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
-					 const u8 *src);
-
-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, false);
-}
-
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
-					    const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, true);
-}
-
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	serpent_dec_blk_8way_avx(ctx, dst, src);
-}
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);

 #endif
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
-				u128 *iv);
+				le128 *iv);
 extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
-				     u128 *iv);
+				     le128 *iv);

 extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
......
@@ -324,9 +324,19 @@ config CRYPTO_CRC32C
 	  by iSCSI for header and data digests and by others.
 	  See Castagnoli93.  Module will be crc32c.

+config CRYPTO_CRC32C_X86_64
+	bool
+	depends on X86 && 64BIT
+	select CRYPTO_HASH
+	help
+	  In Intel processor with SSE4.2 supported, the processor will
+	  support CRC32C calculation using hardware accelerated CRC32
+	  instruction optimized with PCLMULQDQ instruction when available.
+
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
 	depends on X86
+	select CRYPTO_CRC32C_X86_64 if 64BIT
 	select CRYPTO_HASH
 	help
 	  In Intel processor with SSE4.2 supported, the processor will

@@ -793,6 +803,28 @@ config CRYPTO_CAMELLIA_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>

+config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+	tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)"
+	depends on X86 && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAMELLIA_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64/AES-NI/AVX).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAMELLIA_SPARC64
 	tristate "Camellia cipher algorithm (SPARC64)"
 	depends on SPARC64

@@ -809,9 +841,16 @@ config CRYPTO_CAMELLIA_SPARC64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>

+config CRYPTO_CAST_COMMON
+	tristate
+	help
+	  Common parts of the CAST cipher algorithms shared by the
+	  generic c and the assembler implementations.
+
 config CRYPTO_CAST5
 	tristate "CAST5 (CAST-128) cipher algorithm"
 	select CRYPTO_ALGAPI
+	select CRYPTO_CAST_COMMON
 	help
 	  The CAST5 encryption algorithm (synonymous with CAST-128) is
 	  described in RFC2144.

@@ -822,6 +861,7 @@ config CRYPTO_CAST5_AVX_X86_64
 	select CRYPTO_ALGAPI
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_CAST_COMMON
 	select CRYPTO_CAST5
 	help
 	  The CAST5 encryption algorithm (synonymous with CAST-128) is

@@ -833,6 +873,7 @@ config CRYPTO_CAST5_AVX_X86_64
 config CRYPTO_CAST6
 	tristate "CAST6 (CAST-256) cipher algorithm"
 	select CRYPTO_ALGAPI
+	select CRYPTO_CAST_COMMON
 	help
 	  The CAST6 encryption algorithm (synonymous with CAST-256) is
 	  described in RFC2612.

@@ -844,6 +885,7 @@ config CRYPTO_CAST6_AVX_X86_64
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAST_COMMON
 	select CRYPTO_CAST6
 	select CRYPTO_LRW
 	select CRYPTO_XTS
......
@@ -68,6 +68,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
 obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
 obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
 obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
+obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
 obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
 obj-$(CONFIG_CRYPTO_CAST6) += cast6_generic.o
 obj-$(CONFIG_CRYPTO_ARC4) += arc4.o
......
@@ -971,11 +971,13 @@ static int do_test(int m)
 	case 3:
 		ret += tcrypt_test("ecb(des)");
 		ret += tcrypt_test("cbc(des)");
+		ret += tcrypt_test("ctr(des)");
 		break;

 	case 4:
 		ret += tcrypt_test("ecb(des3_ede)");
 		ret += tcrypt_test("cbc(des3_ede)");
+		ret += tcrypt_test("ctr(des3_ede)");
 		break;

 	case 5:

@@ -1479,6 +1481,10 @@ static int do_test(int m)
 		test_hash_speed("ghash-generic", sec, hash_speed_template_16);
 		if (mode > 300 && mode < 400) break;

+	case 319:
+		test_hash_speed("crc32c", sec, generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+
 	case 399:
 		break;

@@ -1722,6 +1728,29 @@ static int do_test(int m)
 				speed_template_32_64);
 		break;

+	case 508:
+		test_acipher_speed("ecb(camellia)", ENCRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("ecb(camellia)", DECRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("cbc(camellia)", ENCRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("cbc(camellia)", DECRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("ctr(camellia)", ENCRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("ctr(camellia)", DECRYPT, sec, NULL, 0,
+				   speed_template_16_32);
+		test_acipher_speed("lrw(camellia)", ENCRYPT, sec, NULL, 0,
+				   speed_template_32_48);
+		test_acipher_speed("lrw(camellia)", DECRYPT, sec, NULL, 0,
+				   speed_template_32_48);
+		test_acipher_speed("xts(camellia)", ENCRYPT, sec, NULL, 0,
+				   speed_template_32_64);
+		test_acipher_speed("xts(camellia)", DECRYPT, sec, NULL, 0,
+				   speed_template_32_64);
+		break;
+
 	case 1000:
 		test_available();
 		break;
......
@@ -254,6 +254,7 @@ config CRYPTO_DEV_OMAP_AES
 	tristate "Support for OMAP AES hw engine"
 	depends on ARCH_OMAP2 || ARCH_OMAP3
 	select CRYPTO_AES
+	select CRYPTO_BLKCIPHER2
 	help
 	  OMAP processors have AES module accelerator. Select this if you
 	  want to use the OMAP module for AES algorithms.
......
@@ -1863,6 +1863,7 @@ static int __devexit spacc_remove(struct platform_device *pdev)
 static const struct platform_device_id spacc_id_table[] = {
 	{ "picochip,spacc-ipsec", },
 	{ "picochip,spacc-l2", },
+	{ }
 };

 static struct platform_driver spacc_driver = {
......
@@ -30,7 +30,7 @@
 #include <crypto/ctr.h>

 #include <plat/cpu.h>
-#include <plat/dma.h>
+#include <mach/dma.h>

 #define _SBF(s, v)	((v) << (s))
 #define _BIT(b)		_SBF(b, 1)
......
@@ -936,8 +936,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count,
 		sg_count--;
 		link_tbl_ptr--;
 	}
-	link_tbl_ptr->len = cpu_to_be16(be16_to_cpu(link_tbl_ptr->len)
-					+ cryptlen);
+	be16_add_cpu(&link_tbl_ptr->len, cryptlen);

 	/* tag end of link table */
 	link_tbl_ptr->j_extent = DESC_PTR_LNKTBL_RETURN;
......
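be16_add_cpu() used above is the generic byteorder helper; the sketch below only spells out its equivalence with the open-coded form it replaces (illustrative, not part of the patch; the function name is made up).

static inline void be16_add_cpu_demo(__be16 *var, u16 val)
{
	/* what be16_add_cpu(var, val) does */
	*var = cpu_to_be16(be16_to_cpu(*var) + val);
}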
#ifndef _CRYPTO_CAST_COMMON_H
#define _CRYPTO_CAST_COMMON_H
extern const u32 cast_s1[256];
extern const u32 cast_s2[256];
extern const u32 cast_s3[256];
extern const u32 cast_s4[256];
#endif
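Illustrative only, not part of the patch: a consumer of the new shared table module includes the header above and indexes the exported S-boxes directly. The combining function below is invented purely to show the access pattern; it is not the real CAST round function.

#include <crypto/cast_common.h>

static u32 cast_sbox_demo(u32 x)
{
	return cast_s1[x >> 24] ^ cast_s2[(x >> 16) & 0xff] ^
	       cast_s3[(x >> 8) & 0xff] ^ cast_s4[x & 0xff];
}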