Commit b327dfe0 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm

Pull ARM updates from Russell King:

 - Improve Kconfig help text for Cortex A8 and Cortex A9 errata

 - Kconfig spelling and grammar fixes

 - Allow kernel-mode VFP/Neon in softirq context (see the usage sketch below)

 - Use Neon in softirq context

 - Implement AES-CTR/GHASH version of GCM

* tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm:
  ARM: 9289/1: Allow pre-ARMv5 builds with ld.lld 16.0.0 and newer
  ARM: 9288/1: Kconfigs: fix spelling & grammar
  ARM: 9286/1: crypto: Implement fused AES-CTR/GHASH version of GCM
  ARM: 9285/1: remove meaningless arch/arm/mach-rda/Makefile
  ARM: 9283/1: permit non-nested kernel mode NEON in softirq context
  ARM: 9282/1: vfp: Manipulate task VFP state with softirqs disabled
  ARM: 9281/1: improve Cortex A8/A9 errata help text
parents eb6d5bbe 5eb6e280
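The VFP/NEON softirq changes in this merge are easiest to see from a caller's point of view. The following is a rough, illustrative C sketch only, not code from this series: a driver checks may_use_simd() and brackets its NEON code with kernel_neon_begin()/kernel_neon_end(), which may now also be done from softirq context; transform_neon() and transform_scalar() are hypothetical placeholder helpers.

#include <linux/types.h>
#include <asm/neon.h>
#include <asm/simd.h>

static void do_transform(u8 *dst, const u8 *src, int len)
{
	if (may_use_simd()) {
		/* permitted in task and softirq context; kernel_neon_begin()
		 * now disables softirq processing as well as preemption */
		kernel_neon_begin();
		transform_neon(dst, src, len);		/* hypothetical NEON path */
		kernel_neon_end();
	} else {
		transform_scalar(dst, src, len);	/* hypothetical scalar fallback */
	}
}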
@@ -344,14 +344,16 @@ comment "CPU Core family selection"
 config ARCH_MULTI_V4
 	bool "ARMv4 based platforms (FA526, StrongARM)"
 	depends on !ARCH_MULTI_V6_V7
-	depends on !LD_IS_LLD # https://github.com/llvm/llvm-project/issues/50764
+	depends on !LD_IS_LLD || LLD_VERSION >= 160000
 	select ARCH_MULTI_V4_V5
 	select CPU_FA526 if !(CPU_SA110 || CPU_SA1100)

 config ARCH_MULTI_V4T
 	bool "ARMv4T based platforms (ARM720T, ARM920T, ...)"
 	depends on !ARCH_MULTI_V6_V7
-	depends on !LD_IS_LLD # https://github.com/llvm/llvm-project/issues/50764
+	depends on !LD_IS_LLD || LLD_VERSION >= 160000
 	select ARCH_MULTI_V4_V5
 	select CPU_ARM920T if !(CPU_ARM7TDMI || CPU_ARM720T || \
 		CPU_ARM740T || CPU_ARM9TDMI || CPU_ARM922T || \
@@ -656,7 +658,9 @@ config ARM_ERRATA_458693
 	  hazard might then cause a processor deadlock. The workaround enables
 	  the L1 caching of the NEON accesses and disables the PLD instruction
 	  in the ACTLR register. Note that setting specific bits in the ACTLR
-	  register may not be available in non-secure mode.
+	  register may not be available in non-secure mode and thus is not
+	  available on a multiplatform kernel. This should be applied by the
+	  bootloader instead.

 config ARM_ERRATA_460075
 	bool "ARM errata: Data written to the L2 cache can be overwritten with stale data"
@@ -669,7 +673,9 @@ config ARM_ERRATA_460075
 	  and overwritten with stale memory contents from external memory. The
 	  workaround disables the write-allocate mode for the L2 cache via the
 	  ACTLR register. Note that setting specific bits in the ACTLR register
-	  may not be available in non-secure mode.
+	  may not be available in non-secure mode and thus is not available on
+	  a multiplatform kernel. This should be applied by the bootloader
+	  instead.

 config ARM_ERRATA_742230
 	bool "ARM errata: DMB operation may be faulty"
@@ -682,7 +688,10 @@ config ARM_ERRATA_742230
 	  ordering of the two writes. This workaround sets a specific bit in
 	  the diagnostic register of the Cortex-A9 which causes the DMB
 	  instruction to behave as a DSB, ensuring the correct behaviour of
-	  the two writes.
+	  the two writes. Note that setting specific bits in the diagnostics
+	  register may not be available in non-secure mode and thus is not
+	  available on a multiplatform kernel. This should be applied by the
+	  bootloader instead.

 config ARM_ERRATA_742231
 	bool "ARM errata: Incorrect hazard handling in the SCU may lead to data corruption"
@@ -697,7 +706,10 @@ config ARM_ERRATA_742231
 	  replaced from one of the CPUs at the same time as another CPU is
 	  accessing it. This workaround sets specific bits in the diagnostic
 	  register of the Cortex-A9 which reduces the linefill issuing
-	  capabilities of the processor.
+	  capabilities of the processor. Note that setting specific bits in the
+	  diagnostics register may not be available in non-secure mode and thus
+	  is not available on a multiplatform kernel. This should be applied by
+	  the bootloader instead.

 config ARM_ERRATA_643719
 	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
@@ -734,7 +746,9 @@ config ARM_ERRATA_743622
 	  register of the Cortex-A9 which disables the Store Buffer
 	  optimisation, preventing the defect from occurring. This has no
 	  visible impact on the overall performance or power consumption of the
-	  processor.
+	  processor. Note that setting specific bits in the diagnostics register
+	  may not be available in non-secure mode and thus is not available on a
+	  multiplatform kernel. This should be applied by the bootloader instead.

 config ARM_ERRATA_751472
 	bool "ARM errata: Interrupted ICIALLUIS may prevent completion of broadcasted operation"
@@ -746,6 +760,10 @@ config ARM_ERRATA_751472
 	  completion of a following broadcasted operation if the second
 	  operation is received by a CPU before the ICIALLUIS has completed,
 	  potentially leading to corrupted entries in the cache or TLB.
+	  Note that setting specific bits in the diagnostics register may
+	  not be available in non-secure mode and thus is not available on
+	  a multiplatform kernel. This should be applied by the bootloader
+	  instead.

 config ARM_ERRATA_754322
 	bool "ARM errata: possible faulty MMU translations following an ASID switch"
...
@@ -1206,8 +1206,8 @@ choice
 		depends on MACH_STM32MP157
 		select DEBUG_STM32_UART
 		help
-		  Say Y here if you want kernel low-level debugging support
-		  on STM32MP1 based platforms, wich default UART is wired on
+		  Say Y here if you want kernel low-level debugging support on
+		  STM32MP1-based platforms, where the default UART is wired to
 		  UART4, but another UART instance can be selected by modifying
 		  CONFIG_DEBUG_UART_PHYS and CONFIG_DEBUG_UART_VIRT.
...
@@ -209,7 +209,6 @@ machine-$(CONFIG_ARCH_OMAP2PLUS)	+= omap2
 machine-$(CONFIG_ARCH_ORION5X)		+= orion5x
 machine-$(CONFIG_ARCH_PXA)		+= pxa
 machine-$(CONFIG_ARCH_QCOM)		+= qcom
-machine-$(CONFIG_ARCH_RDA)		+= rda
 machine-$(CONFIG_ARCH_REALTEK)		+= realtek
 machine-$(CONFIG_ARCH_ROCKCHIP)		+= rockchip
 machine-$(CONFIG_ARCH_RPC)		+= rpc
...
@@ -16,8 +16,10 @@ config CRYPTO_CURVE25519_NEON
 config CRYPTO_GHASH_ARM_CE
 	tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
+	select CRYPTO_AEAD
 	select CRYPTO_HASH
 	select CRYPTO_CRYPTD
+	select CRYPTO_LIB_AES
 	select CRYPTO_LIB_GF128MUL
 	help
 	  GCM GHASH function (NIST SP800-38D)
...
@@ -2,7 +2,8 @@
 /*
  * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
  *
- * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb@google.com>
  */

 #include <linux/linkage.h>
@@ -44,7 +45,7 @@
 	t2q		.req	q7
 	t3q		.req	q8
 	t4q		.req	q9
-	T2		.req	q9
+	XH2		.req	q9

 	s1l		.req	d20
 	s1h		.req	d21
@@ -80,7 +81,7 @@
 	XL2		.req	q5
 	XM2		.req	q6
-	XH2		.req	q7
+	T2		.req	q7
 	T3		.req	q8

 	XL2_L		.req	d10
@@ -192,9 +193,10 @@
 	vshr.u64	XL, XL, #1
 	.endm

-	.macro		ghash_update, pn
+	.macro		ghash_update, pn, enc, aggregate=1, head=1
 	vld1.64		{XL}, [r1]

+	.if		\head
 	/* do the head block first, if supplied */
 	ldr		ip, [sp]
 	teq		ip, #0
@@ -202,13 +204,32 @@
 	vld1.64		{T1}, [ip]
 	teq		r0, #0
 	b		3f
+	.endif

 0:	.ifc		\pn, p64
+	.if		\aggregate
 	tst		r0, #3			// skip until #blocks is a
 	bne		2f			// round multiple of 4

 	vld1.8		{XL2-XM2}, [r2]!
-1:	vld1.8		{T3-T2}, [r2]!
+1:	vld1.8		{T2-T3}, [r2]!
+
+	.ifnb		\enc
+	\enc\()_4x	XL2, XM2, T2, T3
+
+	add		ip, r3, #16
+	vld1.64		{HH}, [ip, :128]!
+	vld1.64		{HH3-HH4}, [ip, :128]
+
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	veor		SHASH2_H, HH_L, HH_H
+	veor		HH34_L, HH3_L, HH3_H
+	veor		HH34_H, HH4_L, HH4_H
+
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+	.endif
+
 	vrev64.8	XL2, XL2
 	vrev64.8	XM2, XM2
@@ -218,8 +239,8 @@
 	veor		XL2_H, XL2_H, XL_L
 	veor		XL, XL, T1

-	vrev64.8	T3, T3
-	vrev64.8	T1, T2
+	vrev64.8	T1, T3
+	vrev64.8	T3, T2

 	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
 	veor		XL2_H, XL2_H, XL_H
@@ -267,14 +288,22 @@
 	b		1b
 	.endif
+	.endif

-2:	vld1.64		{T1}, [r2]!
+2:	vld1.8		{T1}, [r2]!
+
+	.ifnb		\enc
+	\enc\()_1x	T1
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+	.endif
+
 	subs		r0, r0, #1

 3:	/* multiply XL by SHASH in GF(2^128) */
+#ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
+#endif

 	vext.8		IN1, T1, T1, #8
 	veor		T1_L, T1_L, XL_H
 	veor		XL, XL, IN1
@@ -293,9 +322,6 @@
 	veor		XL, XL, T1

 	bne		0b
-
-	vst1.64		{XL}, [r1]
-	bx		lr
 	.endm

 	/*
@@ -316,6 +342,9 @@ ENTRY(pmull_ghash_update_p64)
 	vshl.u64	MASK, MASK, #57

 	ghash_update	p64
+
+	vst1.64		{XL}, [r1]
+	bx		lr
 ENDPROC(pmull_ghash_update_p64)

 ENTRY(pmull_ghash_update_p8)
@@ -336,4 +365,331 @@ ENTRY(pmull_ghash_update_p8)
 	vmov.i64	k48, #0xffffffffffff

 	ghash_update	p8
+
+	vst1.64		{XL}, [r1]
+	bx		lr
 ENDPROC(pmull_ghash_update_p8)
e0 .req q9
e1 .req q10
e2 .req q11
e3 .req q12
e0l .req d18
e0h .req d19
e2l .req d22
e2h .req d23
e3l .req d24
e3h .req d25
ctr .req q13
ctr0 .req d26
ctr1 .req d27
ek0 .req q14
ek1 .req q15
.macro round, rk:req, regs:vararg
.irp r, \regs
aese.8 \r, \rk
aesmc.8 \r, \r
.endr
.endm
.macro aes_encrypt, rkp, rounds, regs:vararg
vld1.8 {ek0-ek1}, [\rkp, :128]!
cmp \rounds, #12
blt .L\@ // AES-128
round ek0, \regs
vld1.8 {ek0}, [\rkp, :128]!
round ek1, \regs
vld1.8 {ek1}, [\rkp, :128]!
beq .L\@ // AES-192
round ek0, \regs
vld1.8 {ek0}, [\rkp, :128]!
round ek1, \regs
vld1.8 {ek1}, [\rkp, :128]!
.L\@: .rept 4
round ek0, \regs
vld1.8 {ek0}, [\rkp, :128]!
round ek1, \regs
vld1.8 {ek1}, [\rkp, :128]!
.endr
round ek0, \regs
vld1.8 {ek0}, [\rkp, :128]
.irp r, \regs
aese.8 \r, ek1
.endr
.irp r, \regs
veor \r, \r, ek0
.endr
.endm
pmull_aes_encrypt:
add ip, r5, #4
vld1.8 {ctr0}, [r5] // load 12 byte IV
vld1.8 {ctr1}, [ip]
rev r8, r7
vext.8 ctr1, ctr1, ctr1, #4
add r7, r7, #1
vmov.32 ctr1[1], r8
vmov e0, ctr
add ip, r3, #64
aes_encrypt ip, r6, e0
bx lr
ENDPROC(pmull_aes_encrypt)
pmull_aes_encrypt_4x:
add ip, r5, #4
vld1.8 {ctr0}, [r5]
vld1.8 {ctr1}, [ip]
rev r8, r7
vext.8 ctr1, ctr1, ctr1, #4
add r7, r7, #1
vmov.32 ctr1[1], r8
rev ip, r7
vmov e0, ctr
add r7, r7, #1
vmov.32 ctr1[1], ip
rev r8, r7
vmov e1, ctr
add r7, r7, #1
vmov.32 ctr1[1], r8
rev ip, r7
vmov e2, ctr
add r7, r7, #1
vmov.32 ctr1[1], ip
vmov e3, ctr
add ip, r3, #64
aes_encrypt ip, r6, e0, e1, e2, e3
bx lr
ENDPROC(pmull_aes_encrypt_4x)
pmull_aes_encrypt_final:
add ip, r5, #4
vld1.8 {ctr0}, [r5]
vld1.8 {ctr1}, [ip]
rev r8, r7
vext.8 ctr1, ctr1, ctr1, #4
mov r7, #1 << 24 // BE #1 for the tag
vmov.32 ctr1[1], r8
vmov e0, ctr
vmov.32 ctr1[1], r7
vmov e1, ctr
add ip, r3, #64
aes_encrypt ip, r6, e0, e1
bx lr
ENDPROC(pmull_aes_encrypt_final)
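For reference, the counter blocks built by the pmull_aes_encrypt* helpers above follow the standard GCM layout for a 96-bit IV: the 12-byte IV followed by a 32-bit big-endian block counter, with counter value 1 reserved for the tag (hence the "BE #1 for the tag" constant) and the payload blocks starting at 2. A small illustrative C sketch of that layout, not part of the patch:

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

/* Build a 16-byte CTR block: 96-bit IV followed by a 32-bit big-endian counter. */
static void gcm_ctr_block(u8 block[16], const u8 iv[12], u32 counter)
{
	__be32 ctr = cpu_to_be32(counter);

	memcpy(block, iv, 12);
	memcpy(block + 12, &ctr, sizeof(ctr));
}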
.macro enc_1x, in0
bl pmull_aes_encrypt
veor \in0, \in0, e0
vst1.8 {\in0}, [r4]!
.endm
.macro dec_1x, in0
bl pmull_aes_encrypt
veor e0, e0, \in0
vst1.8 {e0}, [r4]!
.endm
.macro enc_4x, in0, in1, in2, in3
bl pmull_aes_encrypt_4x
veor \in0, \in0, e0
veor \in1, \in1, e1
veor \in2, \in2, e2
veor \in3, \in3, e3
vst1.8 {\in0-\in1}, [r4]!
vst1.8 {\in2-\in3}, [r4]!
.endm
.macro dec_4x, in0, in1, in2, in3
bl pmull_aes_encrypt_4x
veor e0, e0, \in0
veor e1, e1, \in1
veor e2, e2, \in2
veor e3, e3, \in3
vst1.8 {e0-e1}, [r4]!
vst1.8 {e2-e3}, [r4]!
.endm
/*
* void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
* struct gcm_key const *k, char *dst,
* char *iv, int rounds, u32 counter)
*/
ENTRY(pmull_gcm_encrypt)
push {r4-r8, lr}
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
ghash_update p64, enc, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
* struct gcm_key const *k, char *dst,
* char *iv, int rounds, u32 counter)
*/
ENTRY(pmull_gcm_decrypt)
push {r4-r8, lr}
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
ghash_update p64, dec, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
/*
* void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
* struct gcm_key const *k, char *head,
* char *iv, int rounds, u32 counter)
*/
ENTRY(pmull_gcm_enc_final)
push {r4-r8, lr}
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
bl pmull_aes_encrypt_final
cmp r0, #0
beq .Lenc_final
mov_l ip, .Lpermute
sub r4, r4, #16
add r8, ip, r0
add ip, ip, #32
add r4, r4, r0
sub ip, ip, r0
vld1.8 {e3}, [r8] // permute vector for key stream
vld1.8 {e2}, [ip] // permute vector for ghash input
vtbl.8 e3l, {e0}, e3l
vtbl.8 e3h, {e0}, e3h
vld1.8 {e0}, [r4] // encrypt tail block
veor e0, e0, e3
vst1.8 {e0}, [r4]
vtbl.8 T1_L, {e0}, e2l
vtbl.8 T1_H, {e0}, e2h
vld1.64 {XL}, [r1]
.Lenc_final:
vld1.64 {SHASH}, [r3, :128]
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
ghash_update p64, aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
sub r2, r2, #16 // rewind src pointer
vst1.8 {XL}, [r2] // store tag
pop {r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)
/*
* int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
* struct gcm_key const *k, char *head,
* char *iv, int rounds, u32 counter,
* const char *otag, int authsize)
*/
ENTRY(pmull_gcm_dec_final)
push {r4-r8, lr}
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
bl pmull_aes_encrypt_final
cmp r0, #0
beq .Ldec_final
mov_l ip, .Lpermute
sub r4, r4, #16
add r8, ip, r0
add ip, ip, #32
add r4, r4, r0
sub ip, ip, r0
vld1.8 {e3}, [r8] // permute vector for key stream
vld1.8 {e2}, [ip] // permute vector for ghash input
vtbl.8 e3l, {e0}, e3l
vtbl.8 e3h, {e0}, e3h
vld1.8 {e0}, [r4]
vtbl.8 T1_L, {e0}, e2l
vtbl.8 T1_H, {e0}, e2h
veor e0, e0, e3
vst1.8 {e0}, [r4]
vld1.64 {XL}, [r1]
.Ldec_final:
vld1.64 {SHASH}, [r3]
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
ghash_update p64, aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
mov_l ip, .Lpermute
ldrd r2, r3, [sp, #40] // otag and authsize
vld1.8 {T1}, [r2]
add ip, ip, r3
vceq.i8 T1, T1, XL // compare tags
vmvn T1, T1 // 0 for eq, -1 for ne
vld1.8 {e0}, [ip]
vtbl.8 XL_L, {T1}, e0l // keep authsize bytes only
vtbl.8 XL_H, {T1}, e0h
vpmin.s8 XL_L, XL_L, XL_H // take the minimum s8 across the vector
vpmin.s8 XL_L, XL_L, XL_L
vmov.32 r0, XL_L[0] // fail if != 0x0
pop {r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)
.section ".rodata", "a", %progbits
.align 5
.Lpermute:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
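The tag check in pmull_gcm_dec_final returns 0 only when the first authsize bytes of the computed tag match the expected tag, and a nonzero value otherwise. A rough C equivalent of that check, illustrative only and not the actual glue code (which sits in the collapsed diff):

#include <linux/types.h>
#include <crypto/algapi.h>	/* crypto_memneq() */

/* 0 if the first authsize bytes of the tags match, nonzero otherwise. */
static int gcm_check_tag(const u8 *computed, const u8 *otag, unsigned int authsize)
{
	return crypto_memneq(computed, otag, authsize);
}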
This diff is collapsed.
@@ -236,21 +236,26 @@ THUMB(	fpreg	.req	r7	)
 	sub	\tmp, \tmp, #1		@ decrement it
 	str	\tmp, [\ti, #TI_PREEMPT]
 	.endm

-	.macro	dec_preempt_count_ti, ti, tmp
-	get_thread_info \ti
-	dec_preempt_count \ti, \tmp
-	.endm
-
 #else
 	.macro	inc_preempt_count, ti, tmp
 	.endm

 	.macro	dec_preempt_count, ti, tmp
 	.endm
+#endif

-	.macro	dec_preempt_count_ti, ti, tmp
+	.macro	local_bh_disable, ti, tmp
+	ldr	\tmp, [\ti, #TI_PREEMPT]
+	add	\tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET
+	str	\tmp, [\ti, #TI_PREEMPT]
+	.endm
+
+	.macro	local_bh_enable_ti, ti, tmp
+	get_thread_info \ti
+	ldr	\tmp, [\ti, #TI_PREEMPT]
+	sub	\tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET
+	str	\tmp, [\ti, #TI_PREEMPT]
 	.endm
-#endif

 #define USERL(l, x...)				\
 9999:	x;					\
...
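The local_bh_disable / local_bh_enable_ti assembler macros above only manipulate the softirq part of the per-task preempt count, which is why SOFTIRQ_DISABLE_OFFSET is exported through asm-offsets below. A simplified C sketch of the same bookkeeping; note the real local_bh_enable() additionally runs any pending softirqs, whereas the asm macro, like this sketch, only drops the count:

#include <linux/preempt.h>
#include <linux/compiler.h>

static inline void softirq_count_raise(void)
{
	preempt_count_add(SOFTIRQ_DISABLE_OFFSET);	/* what the asm adds to TI_PREEMPT */
	barrier();
}

static inline void softirq_count_drop(void)
{
	barrier();
	preempt_count_sub(SOFTIRQ_DISABLE_OFFSET);	/* and subtracts again */
}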
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/hardirq.h>
static __must_check inline bool may_use_simd(void)
{
return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq();
}
@@ -56,6 +56,7 @@ int main(void)
   DEFINE(VFP_CPU,		offsetof(union vfp_state, hard.cpu));
 #endif
 #endif
+  DEFINE(SOFTIRQ_DISABLE_OFFSET,SOFTIRQ_DISABLE_OFFSET);
 #ifdef CONFIG_ARM_THUMBEE
   DEFINE(TI_THUMBEE_STATE,	offsetof(struct thread_info, thumbee_state));
 #endif
...
# SPDX-License-Identifier: GPL-2.0-only
obj- += dummy.o
@@ -743,7 +743,7 @@ config SWP_EMULATE
 	  If unsure, say Y.

 choice
-	prompt "CPU Endianess"
+	prompt "CPU Endianness"
 	default CPU_LITTLE_ENDIAN

 config CPU_LITTLE_ENDIAN
...
@@ -22,7 +22,7 @@
 @  IRQs enabled.
 @
 ENTRY(do_vfp)
-	inc_preempt_count r10, r4
+	local_bh_disable r10, r4
 	ldr	r4, .LCvfp
 	ldr	r11, [r10, #TI_CPU]	@ CPU number
 	add	r10, r10, #TI_VFPSTATE	@ r10 = workspace
@@ -30,7 +30,7 @@ ENTRY(do_vfp)
 ENDPROC(do_vfp)

 ENTRY(vfp_null_entry)
-	dec_preempt_count_ti r10, r4
+	local_bh_enable_ti r10, r4
 	ret	lr
 ENDPROC(vfp_null_entry)
...
@@ -175,7 +175,7 @@ vfp_hw_state_valid:
 					@ else it's one 32-bit instruction, so
 					@ always subtract 4 from the following
 					@ instruction address.
-	dec_preempt_count_ti r10, r4
+	local_bh_enable_ti r10, r4
 	ret	r9			@ we think we have handled things
@@ -200,7 +200,7 @@ skip:
 					@ not recognised by VFP

 	DBGSTR	"not VFP"
-	dec_preempt_count_ti r10, r4
+	local_bh_enable_ti r10, r4
 	ret	lr

 process_exception:
...
@@ -416,7 +416,7 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
 	if (exceptions)
 		vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs);
 exit:
-	preempt_enable();
+	local_bh_enable();
 }

 static void vfp_enable(void *unused)
@@ -517,6 +517,8 @@ void vfp_sync_hwstate(struct thread_info *thread)
 {
 	unsigned int cpu = get_cpu();

+	local_bh_disable();
+
 	if (vfp_state_in_hw(cpu, thread)) {
 		u32 fpexc = fmrx(FPEXC);
@@ -528,6 +530,7 @@ void vfp_sync_hwstate(struct thread_info *thread)
 		fmxr(FPEXC, fpexc);
 	}

+	local_bh_enable();
 	put_cpu();
 }
@@ -717,13 +720,15 @@ void kernel_neon_begin(void)
 	unsigned int cpu;
 	u32 fpexc;

+	local_bh_disable();
+
 	/*
-	 * Kernel mode NEON is only allowed outside of interrupt context
-	 * with preemption disabled. This will make sure that the kernel
-	 * mode NEON register contents never need to be preserved.
+	 * Kernel mode NEON is only allowed outside of hardirq context with
+	 * preemption and softirq processing disabled. This will make sure that
+	 * the kernel mode NEON register contents never need to be preserved.
 	 */
-	BUG_ON(in_interrupt());
-	cpu = get_cpu();
+	BUG_ON(in_hardirq());
+	cpu = __smp_processor_id();

 	fpexc = fmrx(FPEXC) | FPEXC_EN;
 	fmxr(FPEXC, fpexc);
@@ -746,7 +751,7 @@ void kernel_neon_end(void)
 {
 	/* Disable the NEON/VFP unit. */
 	fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
-	put_cpu();
+	local_bh_enable();
 }
 EXPORT_SYMBOL(kernel_neon_end);
...