Commit 1476db2d authored by Dave Watson, committed by Herbert Xu

crypto: aesni - Move HashKey computation from stack to gcm_context

HashKey computation only needs to happen once per scatter/gather operation,
so save it between calls in the gcm_context struct instead of on the stack.
Since the asm no longer stores anything on the stack, we can use
%rsp directly, and clean up the frame save/restore macros a bit.

Hashkeys actually only need to be calculated once per key, and the
computation could be moved to when set_key is called; however, the current
glue code falls back to generic AES code if the FPU is disabled.
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent e2e34b08
...@@ -94,23 +94,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -94,23 +94,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define STACK_OFFSET 8*3 #define STACK_OFFSET 8*3
#define HashKey 16*0 // store HashKey <<1 mod poly here
#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
// bits of HashKey <<1 mod poly here
//(for Karatsuba purposes)
#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
// bits of HashKey^2 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
// bits of HashKey^3 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
// bits of HashKey^4 <<1 mod poly here
// (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8
#define AadHash 16*0 #define AadHash 16*0
#define AadLen 16*1 #define AadLen 16*1
...@@ -119,6 +102,22 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -119,6 +102,22 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define OrigIV 16*3 #define OrigIV 16*3
#define CurCount 16*4 #define CurCount 16*4
#define PBlockLen 16*5 #define PBlockLen 16*5
#define HashKey 16*6 // store HashKey <<1 mod poly here
#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
// bits of HashKey <<1 mod poly here
//(for Karatsuba purposes)
#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
// bits of HashKey^2 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
// bits of HashKey^3 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
// bits of HashKey^4 <<1 mod poly here
// (for Karatsuba purposes)
#define arg1 rdi #define arg1 rdi
#define arg2 rsi #define arg2 rsi
...@@ -126,11 +125,11 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -126,11 +125,11 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define arg4 rcx #define arg4 rcx
#define arg5 r8 #define arg5 r8
#define arg6 r9 #define arg6 r9
#define arg7 STACK_OFFSET+8(%r14) #define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%r14) #define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%r14) #define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%r14) #define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%r14) #define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1) #define keysize 2*15*16(%arg1)
#endif #endif
...@@ -184,28 +183,79 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -184,28 +183,79 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
push %r12 push %r12
push %r13 push %r13
push %r14 push %r14
mov %rsp, %r14
# #
# states of %xmm registers %xmm6:%xmm15 not saved # states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered # all %xmm registers are clobbered
# #
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp
.endm .endm
.macro FUNC_RESTORE .macro FUNC_RESTORE
mov %r14, %rsp
pop %r14 pop %r14
pop %r13 pop %r13
pop %r12 pop %r12
.endm .endm
# Precompute hashkeys.
# Input: Hash subkey, read through the pointer passed as arg7.
# Output: HashKeys stored in gcm_context_data (addressed through %arg2).
# Only needs to be called once per key.
# Stores HashKey, HashKey_2, HashKey_3, HashKey_4 and the matching
# HashKey_k, HashKey_2_k, HashKey_3_k, HashKey_4_k Karatsuba values,
# all at offsets from %arg2.
# The stores use movdqa, so %arg2 plus each HashKey offset has to be
# 16-byte aligned.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
# Load the raw hash subkey with an unaligned load and shuffle its bytes
# with SHUF_MASK (defined outside this macro, presumably a byte-order
# reversal - TODO confirm).
mov arg7, %r12
movdqu (%r12), \TMP3
movdqa SHUF_MASK(%rip), \TMP2
PSHUFB_XMM \TMP2, \TMP3
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
# The 128-bit shift is built from 64-bit lane shifts: psllq shifts each
# qword left by one and psrlq $63 isolates the bit shifted out of each
# qword.
movdqa \TMP3, \TMP2
psllq $1, \TMP3
psrlq $63, \TMP2
# Route the carry bits: TMP2 moves the low-qword carry up into the high
# qword, TMP1 moves the high-qword carry (bit 127 of the input) down into
# the low qword.
movdqa \TMP2, \TMP1
pslldq $8, \TMP2
psrldq $8, \TMP1
por \TMP2, \TMP3
# reduce HashKey<<1
# TMP1 now holds only the bit shifted out of bit 127. pshufd $0x24 copies
# its dword 0 into dwords 0 and 3 of TMP2 (dwords 1 and 2 come from the
# zeroed dwords 1 and 2 of TMP1), pcmpeqd turns the comparison against
# TWOONE into a per-dword all-ones or all-zeros mask, and POLY is XORed
# into the shifted key only where that mask is set.
# TWOONE and POLY are constants defined outside this macro.
pshufd $0x24, \TMP1, \TMP2
pcmpeqd TWOONE(%rip), \TMP2
pand POLY(%rip), \TMP2
pxor \TMP2, \TMP3
# TMP3 = HashKey<<1 (mod poly). The GHASH_MUL calls below all reuse TMP3
# as their second operand.
movdqa \TMP3, HashKey(%arg2)
movdqa \TMP3, \TMP5
# pshufd $78 swaps the two 64-bit halves, so the pxor leaves high^low in
# both halves. That is the Karatsuba value stored as HashKey_k.
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
movdqa \TMP1, HashKey_k(%arg2)
# GHASH_MUL is defined outside this macro. Per the result comments here it
# leaves TMP5 * TMP3 in TMP5. TMP1, TMP2, TMP4, TMP6 and TMP7 are passed
# as its remaining operands (presumably scratch - TODO confirm against
# GHASH_MUL).
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
# Same half-swap and XOR as above, stored as HashKey_2_k.
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_2_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa \TMP5, HashKey_3(%arg2)
# Karatsuba value of HashKey^3, stored as HashKey_3_k.
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_3_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
movdqa \TMP5, HashKey_4(%arg2)
# Karatsuba value of HashKey^4, stored as HashKey_4_k.
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%arg2)
.endm
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT .macro GCM_INIT
mov arg9, %r11 mov arg9, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
xor %r11, %r11 xor %r11, %r11
...@@ -220,28 +270,8 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -220,28 +270,8 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
PSHUFB_XMM %xmm2, %xmm0 PSHUFB_XMM %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
mov arg7, %r12 PRECOMPUTE %xmm1 %xmm2 %xmm3 %xmm4 %xmm5 %xmm6 %xmm7
movdqu (%r12), %xmm13 movdqa HashKey(%arg2), %xmm13
movdqa SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
movdqa %xmm13, %xmm2
psllq $1, %xmm13
psrlq $63, %xmm2
movdqa %xmm2, %xmm1
pslldq $8, %xmm2
psrldq $8, %xmm1
por %xmm2, %xmm13
# reduce HashKey<<1
pshufd $0x24, %xmm1, %xmm2
pcmpeqd TWOONE(%rip), %xmm2
pand POLY(%rip), %xmm2
pxor %xmm2, %xmm13
movdqa %xmm13, HashKey(%rsp)
CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \ CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
%xmm5 %xmm6 %xmm5 %xmm6
...@@ -253,7 +283,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff ...@@ -253,7 +283,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
# Clobbers rax, r10-r13, and xmm0-xmm15 # Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation .macro GCM_ENC_DEC operation
movdqu AadHash(%arg2), %xmm8 movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%rsp), %xmm13 movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2) add %arg5, InLen(%arg2)
mov %arg5, %r13 # save the number of bytes mov %arg5, %r13 # save the number of bytes
and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
...@@ -377,7 +407,7 @@ _multiple_of_16_bytes_\@: ...@@ -377,7 +407,7 @@ _multiple_of_16_bytes_\@:
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE .macro GCM_COMPLETE
movdqu AadHash(%arg2), %xmm8 movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%rsp), %xmm13 movdqu HashKey(%arg2), %xmm13
mov PBlockLen(%arg2), %r12 mov PBlockLen(%arg2), %r12
...@@ -584,7 +614,7 @@ _get_AAD_done\@: ...@@ -584,7 +614,7 @@ _get_AAD_done\@:
* the ciphertext * the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered * are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified * arg1, %arg2, %arg3 are used as a pointer only, not modified
*/ */
...@@ -695,17 +725,6 @@ aes_loop_initial_\@: ...@@ -695,17 +725,6 @@ aes_loop_initial_\@:
pxor \TMP1, \XMM2 pxor \TMP1, \XMM2
pxor \TMP1, \XMM3 pxor \TMP1, \XMM3
pxor \TMP1, \XMM4 pxor \TMP1, \XMM4
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
movdqa \TMP1, HashKey_k(%rsp)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds .irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1 movaps 0x10*\index(%arg1), \TMP1
AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM1
...@@ -713,12 +732,6 @@ aes_loop_initial_\@: ...@@ -713,12 +732,6 @@ aes_loop_initial_\@:
AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM3
AESENC \TMP1, \XMM4 AESENC \TMP1, \XMM4
.endr .endr
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa \TMP5, HashKey_3(%rsp)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds .irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1 movaps 0x10*\index(%arg1), \TMP1
AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM1
...@@ -726,12 +739,6 @@ aes_loop_initial_\@: ...@@ -726,12 +739,6 @@ aes_loop_initial_\@:
AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM3
AESENC \TMP1, \XMM4 AESENC \TMP1, \XMM4
.endr .endr
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa \TMP5, HashKey_4(%rsp)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
lea 0xa0(%arg1),%r10 lea 0xa0(%arg1),%r10
mov keysize,%eax mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8 shr $2,%eax # 128->4, 192->6, 256->8
...@@ -816,7 +823,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -816,7 +823,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM5, \TMP6 pshufd $78, \XMM5, \TMP6
pxor \XMM5, \TMP6 pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT paddd ONE(%rip), \XMM0 # INCR CNT
movdqa HashKey_4(%rsp), \TMP5 movdqa HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1 movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT paddd ONE(%rip), \XMM0 # INCR CNT
...@@ -835,7 +842,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -835,7 +842,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor (%arg1), \XMM2 pxor (%arg1), \XMM2
pxor (%arg1), \XMM3 pxor (%arg1), \XMM3
pxor (%arg1), \XMM4 pxor (%arg1), \XMM4
movdqa HashKey_4_k(%rsp), \TMP5 movdqa HashKey_4_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1 movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1 AESENC \TMP1, \XMM1 # Round 1
...@@ -850,7 +857,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -850,7 +857,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM6, \TMP1 movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2 pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2 pxor \XMM6, \TMP2
movdqa HashKey_3(%rsp), \TMP5 movdqa HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3 movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3 AESENC \TMP3, \XMM1 # Round 3
...@@ -863,7 +870,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -863,7 +870,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4 AESENC \TMP3, \XMM4
movdqa HashKey_3_k(%rsp), \TMP5 movdqa HashKey_3_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3 movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5 AESENC \TMP3, \XMM1 # Round 5
...@@ -877,7 +884,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -877,7 +884,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM7, \TMP1 movdqa \XMM7, \TMP1
pshufd $78, \XMM7, \TMP2 pshufd $78, \XMM7, \TMP2
pxor \XMM7, \TMP2 pxor \XMM7, \TMP2
movdqa HashKey_2(%rsp ), \TMP5 movdqa HashKey_2(%arg2), \TMP5
# Multiply TMP5 * HashKey using karatsuba # Multiply TMP5 * HashKey using karatsuba
...@@ -893,7 +900,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -893,7 +900,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4 AESENC \TMP3, \XMM4
movdqa HashKey_2_k(%rsp), \TMP5 movdqa HashKey_2_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3 movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8 AESENC \TMP3, \XMM1 # Round 8
...@@ -911,7 +918,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -911,7 +918,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM8, \TMP1 movdqa \XMM8, \TMP1
pshufd $78, \XMM8, \TMP2 pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2 pxor \XMM8, \TMP2
movdqa HashKey(%rsp), \TMP5 movdqa HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3 movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9 AESENC \TMP3, \XMM1 # Round 9
...@@ -940,7 +947,7 @@ aes_loop_par_enc_done: ...@@ -940,7 +947,7 @@ aes_loop_par_enc_done:
AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4 AESENCLAST \TMP3, \XMM4
movdqa HashKey_k(%rsp), \TMP5 movdqa HashKey_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3 movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
...@@ -1024,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1024,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM5, \TMP6 pshufd $78, \XMM5, \TMP6
pxor \XMM5, \TMP6 pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT paddd ONE(%rip), \XMM0 # INCR CNT
movdqa HashKey_4(%rsp), \TMP5 movdqa HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1 movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT paddd ONE(%rip), \XMM0 # INCR CNT
...@@ -1043,7 +1050,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1043,7 +1050,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor (%arg1), \XMM2 pxor (%arg1), \XMM2
pxor (%arg1), \XMM3 pxor (%arg1), \XMM3
pxor (%arg1), \XMM4 pxor (%arg1), \XMM4
movdqa HashKey_4_k(%rsp), \TMP5 movdqa HashKey_4_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1 movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1 AESENC \TMP1, \XMM1 # Round 1
...@@ -1058,7 +1065,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1058,7 +1065,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM6, \TMP1 movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2 pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2 pxor \XMM6, \TMP2
movdqa HashKey_3(%rsp), \TMP5 movdqa HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3 movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3 AESENC \TMP3, \XMM1 # Round 3
...@@ -1071,7 +1078,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1071,7 +1078,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4 AESENC \TMP3, \XMM4
movdqa HashKey_3_k(%rsp), \TMP5 movdqa HashKey_3_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3 movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5 AESENC \TMP3, \XMM1 # Round 5
...@@ -1085,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1085,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM7, \TMP1 movdqa \XMM7, \TMP1
pshufd $78, \XMM7, \TMP2 pshufd $78, \XMM7, \TMP2
pxor \XMM7, \TMP2 pxor \XMM7, \TMP2
movdqa HashKey_2(%rsp ), \TMP5 movdqa HashKey_2(%arg2), \TMP5
# Multiply TMP5 * HashKey using karatsuba # Multiply TMP5 * HashKey using karatsuba
...@@ -1101,7 +1108,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1101,7 +1108,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4 AESENC \TMP3, \XMM4
movdqa HashKey_2_k(%rsp), \TMP5 movdqa HashKey_2_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3 movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8 AESENC \TMP3, \XMM1 # Round 8
...@@ -1119,7 +1126,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation ...@@ -1119,7 +1126,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM8, \TMP1 movdqa \XMM8, \TMP1
pshufd $78, \XMM8, \TMP2 pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2 pxor \XMM8, \TMP2
movdqa HashKey(%rsp), \TMP5 movdqa HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3 movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9 AESENC \TMP3, \XMM1 # Round 9
...@@ -1148,7 +1155,7 @@ aes_loop_par_dec_done: ...@@ -1148,7 +1155,7 @@ aes_loop_par_dec_done:
AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4 AESENCLAST \TMP3, \XMM4
movdqa HashKey_k(%rsp), \TMP5 movdqa HashKey_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3 movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
...@@ -1224,10 +1231,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst ...@@ -1224,10 +1231,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM1, \TMP6 movdqa \XMM1, \TMP6
pshufd $78, \XMM1, \TMP2 pshufd $78, \XMM1, \TMP2
pxor \XMM1, \TMP2 pxor \XMM1, \TMP2
movdqa HashKey_4(%rsp), \TMP5 movdqa HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
movdqa HashKey_4_k(%rsp), \TMP4 movdqa HashKey_4_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqa \XMM1, \XMMDst movdqa \XMM1, \XMMDst
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
...@@ -1237,10 +1244,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst ...@@ -1237,10 +1244,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM2, \TMP1 movdqa \XMM2, \TMP1
pshufd $78, \XMM2, \TMP2 pshufd $78, \XMM2, \TMP2
pxor \XMM2, \TMP2 pxor \XMM2, \TMP2
movdqa HashKey_3(%rsp), \TMP5 movdqa HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
movdqa HashKey_3_k(%rsp), \TMP4 movdqa HashKey_3_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6 pxor \TMP1, \TMP6
pxor \XMM2, \XMMDst pxor \XMM2, \XMMDst
...@@ -1252,10 +1259,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst ...@@ -1252,10 +1259,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM3, \TMP1 movdqa \XMM3, \TMP1
pshufd $78, \XMM3, \TMP2 pshufd $78, \XMM3, \TMP2
pxor \XMM3, \TMP2 pxor \XMM3, \TMP2
movdqa HashKey_2(%rsp), \TMP5 movdqa HashKey_2(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
movdqa HashKey_2_k(%rsp), \TMP4 movdqa HashKey_2_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6 pxor \TMP1, \TMP6
pxor \XMM3, \XMMDst pxor \XMM3, \XMMDst
...@@ -1265,10 +1272,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst ...@@ -1265,10 +1272,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM4, \TMP1 movdqa \XMM4, \TMP1
pshufd $78, \XMM4, \TMP2 pshufd $78, \XMM4, \TMP2
pxor \XMM4, \TMP2 pxor \XMM4, \TMP2
movdqa HashKey(%rsp), \TMP5 movdqa HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
movdqa HashKey_k(%rsp), \TMP4 movdqa HashKey_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6 pxor \TMP1, \TMP6
pxor \XMM4, \XMMDst pxor \XMM4, \XMMDst
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment