Commit ce41fefd authored by Tianjia Zhang's avatar Tianjia Zhang Committed by Herbert Xu

crypto: arm64/sm4 - refactor and simplify CE implementation

This patch does not add new features, but only refactors and simplifies the
implementation of the Crypto Extension acceleration of the SM4 algorithm:

Extract the macro optimized by SM4 Crypto Extension for reuse in the
subsequent optimization of CCM/GCM modes.

Encryption in CBC and CFB modes processes four blocks at a time instead of
one, allowing the ld1 instruction to load 64 bytes of data at a time, which
will reduces unnecessary memory accesses.

CBC/CFB/CTR makes full use of free registers to reduce redundant memory
accesses, and rearranges some instructions to improve out-of-order execution
capabilities.
Signed-off-by: default avatarTianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent 3c383637
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 helper macros for Crypto Extensions
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#define SM4_PREPARE(ptr) \
ld1 {v24.16b-v27.16b}, [ptr], #64; \
ld1 {v28.16b-v31.16b}, [ptr];
#define SM4_CRYPT_BLK_BE(b0) \
sm4e b0.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
rev32 b0.16b, b0.16b;
#define SM4_CRYPT_BLK(b0) \
rev32 b0.16b, b0.16b; \
SM4_CRYPT_BLK_BE(b0);
#define SM4_CRYPT_BLK2_BE(b0, b1) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
#define SM4_CRYPT_BLK2(b0, b1) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
SM4_CRYPT_BLK2_BE(b0, b1);
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b;
#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b4.4s, v24.4s; \
sm4e b5.4s, v24.4s; \
sm4e b6.4s, v24.4s; \
sm4e b7.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b4.4s, v25.4s; \
sm4e b5.4s, v25.4s; \
sm4e b6.4s, v25.4s; \
sm4e b7.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b4.4s, v26.4s; \
sm4e b5.4s, v26.4s; \
sm4e b6.4s, v26.4s; \
sm4e b7.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b4.4s, v27.4s; \
sm4e b5.4s, v27.4s; \
sm4e b6.4s, v27.4s; \
sm4e b7.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b4.4s, v28.4s; \
sm4e b5.4s, v28.4s; \
sm4e b6.4s, v28.4s; \
sm4e b7.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b4.4s, v29.4s; \
sm4e b5.4s, v29.4s; \
sm4e b6.4s, v29.4s; \
sm4e b7.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b4.4s, v30.4s; \
sm4e b5.4s, v30.4s; \
sm4e b6.4s, v30.4s; \
sm4e b7.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
sm4e b4.4s, v31.4s; \
sm4e b5.4s, v31.4s; \
sm4e b6.4s, v31.4s; \
sm4e b7.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
rev64 b4.4s, b4.4s; \
rev64 b5.4s, b5.4s; \
rev64 b6.4s, b6.4s; \
rev64 b7.4s, b7.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
ext b4.16b, b4.16b, b4.16b, #8; \
ext b5.16b, b5.16b, b5.16b, #8; \
ext b6.16b, b6.16b, b6.16b, #8; \
ext b7.16b, b7.16b, b7.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b;
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b; \
SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);
This diff is collapsed.
...@@ -26,9 +26,9 @@ asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src); ...@@ -26,9 +26,9 @@ asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
unsigned int nblks); unsigned int nblks);
asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks); u8 *iv, unsigned int nblocks);
asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks); u8 *iv, unsigned int nblocks);
asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks); u8 *iv, unsigned int nblks);
asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
...@@ -94,66 +94,56 @@ static int sm4_ecb_decrypt(struct skcipher_request *req) ...@@ -94,66 +94,56 @@ static int sm4_ecb_decrypt(struct skcipher_request *req)
return sm4_ecb_do_crypt(req, ctx->rkey_dec); return sm4_ecb_do_crypt(req, ctx->rkey_dec);
} }
static int sm4_cbc_encrypt(struct skcipher_request *req) static int sm4_cbc_crypt(struct skcipher_request *req,
struct sm4_ctx *ctx, bool encrypt)
{ {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk; struct skcipher_walk walk;
unsigned int nbytes; unsigned int nbytes;
int err; int err;
err = skcipher_walk_virt(&walk, req, false); err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
while ((nbytes = walk.nbytes) > 0) { while ((nbytes = walk.nbytes) > 0) {
const u8 *src = walk.src.virt.addr; const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr; u8 *dst = walk.dst.virt.addr;
unsigned int nblks; unsigned int nblocks;
nblocks = nbytes / SM4_BLOCK_SIZE;
if (nblocks) {
kernel_neon_begin(); kernel_neon_begin();
nblks = BYTES2BLKS(nbytes); if (encrypt)
if (nblks) { sm4_ce_cbc_enc(ctx->rkey_enc, dst, src,
sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); walk.iv, nblocks);
nbytes -= nblks * SM4_BLOCK_SIZE; else
} sm4_ce_cbc_dec(ctx->rkey_dec, dst, src,
walk.iv, nblocks);
kernel_neon_end(); kernel_neon_end();
}
err = skcipher_walk_done(&walk, nbytes); err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
} }
return err; return err;
} }
static int sm4_cbc_decrypt(struct skcipher_request *req) static int sm4_cbc_encrypt(struct skcipher_request *req)
{ {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, false); return sm4_cbc_crypt(req, ctx, true);
}
while ((nbytes = walk.nbytes) > 0) {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
unsigned int nblks;
kernel_neon_begin();
nblks = BYTES2BLKS(nbytes);
if (nblks) {
sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
nbytes -= nblks * SM4_BLOCK_SIZE;
}
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes); static int sm4_cbc_decrypt(struct skcipher_request *req)
} {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
return err; return sm4_cbc_crypt(req, ctx, false);
} }
static int sm4_cfb_encrypt(struct skcipher_request *req) static int sm4_cfb_encrypt(struct skcipher_request *req)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment