Commit 29a843ae authored by Russell King

Merge tag 'arm32-efi-cache-ops-for-rmk' of git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux into devel-stable

ARMv7 compliant cache maintenance for the decompressor

On v7 and later cores, cache maintenance operations by set/way are only
intended to be used in the context of on/offlining a core, while it has
been taken out of the coherency domain. Any use intended to ensure that
the contents of the cache have made it to main memory is unreliable,
since cacheline migration and non-architected system caches may cause
these contents to linger elsewhere, without being visible in main memory
once the MMU and caches are disabled.

So switch to cache maintenance by virtual address for v7 and later cores.
This makes the 32-bit kernel bootable on systems with L3 system caches
that are not covered by set/way operations, such as Socionext SynQuacer.
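
For readers unfamiliar with what "cache maintenance by virtual address" amounts to,
here is a minimal sketch (an illustration added for this write-up, not code taken
from the series): it cleans and invalidates a [start, end) range to the point of
coherency, assuming r0 = start and r1 = end (exclusive), and hard-coding a 64-byte
line size for brevity, whereas the actual patch derives the line size from CTR:

          bic   r0, r0, #63              @ align start down to a line boundary
  1:      mcr   p15, 0, r0, c7, c14, 1   @ DCCIMVAC: clean+invalidate line by VA
          add   r0, r0, #64              @ advance one cache line
          cmp   r0, r1
          blo   1b
          mov   r0, #0
          mcr   p15, 0, r0, c7, c10, 4   @ DSB (CP15 encoding) to complete the ops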

Tony says:

  I gave these a try on top of the earlier "arm: fix Kbuild issue caused
  by per-task stack protector GCC plugin" and booting still works for
  me on armv7 including appended dtb:

Tested-by: Tony Lindgren <tony@atomide.com>

Linus says:

  No problem, I have tested it on the following:

  - ARMv7 Cortex A9 x 2 Qualcomm APQ8060 DragonBoard
  - ARM PB11MPCore (4 x 1176)
  - ARMv7 Ux500 Cortex A9 x 2

  The PB11MPCore is again the crucial board; if it works on that
  board it works on anything, most of the time :D

Tested-by: Linus Walleij <linus.walleij@linaro.org>

Note that the first 2 patches are shared with the efi/core branch in
TIP, which is the reason why this is sent as a pull request rather
than via the patch system.
parents bb6d3fb3 401b368c
@@ -151,6 +151,25 @@
 .L_\@:
         .endm
 
+/*
+ * The kernel build system appends the size of the
+ * decompressed kernel at the end of the compressed data
+ * in little-endian form.
+ */
+        .macro  get_inflated_image_size, res:req, tmp1:req, tmp2:req
+        adr     \res, .Linflated_image_size_offset
+        ldr     \tmp1, [\res]
+        add     \tmp1, \tmp1, \res      @ address of inflated image size
+
+        ldrb    \res, [\tmp1]           @ get_unaligned_le32
+        ldrb    \tmp2, [\tmp1, #1]
+        orr     \res, \res, \tmp2, lsl #8
+        ldrb    \tmp2, [\tmp1, #2]
+        ldrb    \tmp1, [\tmp1, #3]
+        orr     \res, \res, \tmp2, lsl #16
+        orr     \res, \res, \tmp1, lsl #24
+        .endm
+
         .section ".start", "ax"
 /*
  * sort out different calling conventions
@@ -268,15 +287,15 @@ not_angel:
          */
         mov     r0, pc
         cmp     r0, r4
-        ldrcc   r0, LC0+32
+        ldrcc   r0, LC0+28
         addcc   r0, r0, pc
         cmpcc   r4, r0
         orrcc   r4, r4, #1              @ remember we skipped cache_on
         blcs    cache_on
 
 restart: adr    r0, LC0
-        ldmia   r0, {r1, r2, r3, r6, r10, r11, r12}
-        ldr     sp, [r0, #28]
+        ldmia   r0, {r1, r2, r3, r6, r11, r12}
+        ldr     sp, [r0, #24]
 
         /*
          * We might be running at a different address.  We need
@@ -284,20 +303,8 @@ restart: adr    r0, LC0
          */
         sub     r0, r0, r1              @ calculate the delta offset
         add     r6, r6, r0              @ _edata
-        add     r10, r10, r0            @ inflated kernel size location
 
-        /*
-         * The kernel build system appends the size of the
-         * decompressed kernel at the end of the compressed data
-         * in little-endian form.
-         */
-        ldrb    r9, [r10, #0]
-        ldrb    lr, [r10, #1]
-        orr     r9, r9, lr, lsl #8
-        ldrb    lr, [r10, #2]
-        ldrb    r10, [r10, #3]
-        orr     r9, r9, lr, lsl #16
-        orr     r9, r9, r10, lsl #24
+        get_inflated_image_size r9, r10, lr
 
 #ifndef CONFIG_ZBOOT_ROM
         /* malloc space is above the relocated stack (64k max) */
@@ -521,11 +528,8 @@ dtb_check_done:
         /* Preserve offset to relocated code. */
         sub     r6, r9, r6
 
-#ifndef CONFIG_ZBOOT_ROM
-        /* cache_clean_flush may use the stack, so relocate it */
-        add     sp, sp, r6
-#endif
-
+        mov     r0, r9                  @ start of relocated zImage
+        add     r1, sp, r6              @ end of relocated zImage
         bl      cache_clean_flush
 
         badr    r0, restart
@@ -622,6 +626,11 @@ not_relocated: mov     r0, #0
         add     r2, sp, #0x10000        @ 64k max
         mov     r3, r7
         bl      decompress_kernel
+
+        get_inflated_image_size r1, r2, r3
+
+        mov     r0, r4                  @ start of inflated image
+        add     r1, r1, r0              @ end of inflated image
         bl      cache_clean_flush
         bl      cache_off
@@ -652,13 +661,15 @@ LC0:    .word   LC0                     @ r1
         .word   __bss_start             @ r2
         .word   _end                    @ r3
         .word   _edata                  @ r6
-        .word   input_data_end - 4      @ r10 (inflated size location)
         .word   _got_start              @ r11
         .word   _got_end                @ ip
         .word   .L_user_stack_end       @ sp
         .word   _end - restart + 16384 + 1024*1024
         .size   LC0, . - LC0
 
+.Linflated_image_size_offset:
+        .long   (input_data_end - 4) - .
+
 #ifdef CONFIG_ARCH_RPC
         .globl  params
 params: ldr     r0, =0x10000100         @ params_phys for RPC
@@ -667,6 +678,24 @@ params: ldr     r0, =0x10000100         @ params_phys for RPC
         .align
 #endif
 
+/*
+ * dcache_line_size - get the minimum D-cache line size from the CTR register
+ * on ARMv7.
+ */
+        .macro  dcache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+        movw    \tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+        movt    \tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+        ldr     \tmp, [\tmp]
+#else
+        mrc     p15, 0, \tmp, c0, c0, 1 @ read ctr
+#endif
+        lsr     \tmp, \tmp, #16
+        and     \tmp, \tmp, #0xf        @ cache line size encoding
+        mov     \reg, #4                @ bytes per word
+        mov     \reg, \reg, lsl \tmp    @ actual cache line size
+        .endm
+
 /*
  * Turn on the cache.  We need to setup some page tables so that we
  * can have both the I and D caches on.
@@ -1159,8 +1188,6 @@ __armv7_mmu_cache_off:
         bic     r0, r0, #0x000c
 #endif
         mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
-        mov     r12, lr
-        bl      __armv7_mmu_cache_flush
         mov     r0, #0
 #ifdef CONFIG_MMU
         mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
@@ -1168,11 +1195,14 @@ __armv7_mmu_cache_off:
         mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
         mcr     p15, 0, r0, c7, c10, 4  @ DSB
         mcr     p15, 0, r0, c7, c5, 4   @ ISB
-        mov     pc, r12
+        mov     pc, lr
 
 /*
  * Clean and flush the cache to maintain consistency.
  *
+ * On entry,
+ *  r0 = start address
+ *  r1 = end address (exclusive)
  * On exit,
  *  r1, r2, r3, r9, r10, r11, r12 corrupted
  * This routine must preserve:
@@ -1181,6 +1211,7 @@ __armv7_mmu_cache_off:
         .align  5
 cache_clean_flush:
         mov     r3, #16
+        mov     r11, r1
         b       call_cache_fn
 
 __armv4_mpu_cache_flush:
@@ -1231,51 +1262,16 @@ __armv7_mmu_cache_flush:
         mcr     p15, 0, r10, c7, c14, 0 @ clean+invalidate D
         b       iflush
 hierarchical:
-        mcr     p15, 0, r10, c7, c10, 5 @ DMB
-        stmfd   sp!, {r0-r7, r9-r11}
-        mrc     p15, 1, r0, c0, c0, 1   @ read clidr
-        ands    r3, r0, #0x7000000      @ extract loc from clidr
-        mov     r3, r3, lsr #23         @ left align loc bit field
-        beq     finished                @ if loc is 0, then no need to clean
-        mov     r10, #0                 @ start clean at cache level 0
-loop1:
-        add     r2, r10, r10, lsr #1    @ work out 3x current cache level
-        mov     r1, r0, lsr r2          @ extract cache type bits from clidr
-        and     r1, r1, #7              @ mask of the bits for current cache only
-        cmp     r1, #2                  @ see what cache we have at this level
-        blt     skip                    @ skip if no cache, or just i-cache
-        mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
-        mcr     p15, 0, r10, c7, c5, 4  @ isb to sych the new cssr&csidr
-        mrc     p15, 1, r1, c0, c0, 0   @ read the new csidr
-        and     r2, r1, #7              @ extract the length of the cache lines
-        add     r2, r2, #4              @ add 4 (line length offset)
-        ldr     r4, =0x3ff
-        ands    r4, r4, r1, lsr #3      @ find maximum number on the way size
-        clz     r5, r4                  @ find bit position of way size increment
-        ldr     r7, =0x7fff
-        ands    r7, r7, r1, lsr #13     @ extract max number of the index size
-loop2:
-        mov     r9, r4                  @ create working copy of max way size
-loop3:
- ARM(   orr     r11, r10, r9, lsl r5    ) @ factor way and cache number into r11
- ARM(   orr     r11, r11, r7, lsl r2    ) @ factor index number into r11
 THUMB(  lsl     r6, r9, r5              )
-THUMB(  orr     r11, r10, r6            ) @ factor way and cache number into r11
-THUMB(  lsl     r6, r7, r2              )
-THUMB(  orr     r11, r11, r6            ) @ factor index number into r11
-        mcr     p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
-        subs    r9, r9, #1              @ decrement the way
-        bge     loop3
-        subs    r7, r7, #1              @ decrement the index
-        bge     loop2
-skip:
-        add     r10, r10, #2            @ increment cache number
-        cmp     r3, r10
-        bgt     loop1
-finished:
-        ldmfd   sp!, {r0-r7, r9-r11}
-        mov     r10, #0                 @ switch back to cache level 0
-        mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
+        dcache_line_size r1, r2         @ r1 := dcache min line size
+        sub     r2, r1, #1              @ r2 := line size mask
+        bic     r0, r0, r2              @ round down start to line size
+        sub     r11, r11, #1            @ end address is exclusive
+        bic     r11, r11, r2            @ round down end to line size
+0:      cmp     r0, r11                 @ finished?
+        bgt     iflush
+        mcr     p15, 0, r0, c7, c14, 1  @ Dcache clean/invalidate by VA
+        add     r0, r0, r1
+        b       0b
 iflush:
         mcr     p15, 0, r10, c7, c10, 4 @ DSB
         mcr     p15, 0, r10, c7, c5, 0  @ invalidate I+BTB
@@ -1460,7 +1456,24 @@ ENTRY(efi_stub_entry)
         @ Preserve return value of efi_entry() in r4
         mov     r4, r0
 
+        add     r1, r4, #SZ_2M          @ DT end
+        bl      cache_clean_flush
+
+        ldr     r0, [sp]                @ relocated zImage
+        ldr     r1, =_edata             @ size of zImage
+        add     r1, r1, r0              @ end of zImage
         bl      cache_clean_flush
+
+        @ The PE/COFF loader might not have cleaned the code we are
+        @ running beyond the PoU, and so calling cache_off below from
+        @ inside the PE/COFF loader allocated region is unsafe. Let's
+        @ assume our own zImage relocation code did a better job, and
+        @ jump into its version of this routine before proceeding.
+        ldr     r0, [sp]                @ relocated zImage
+        ldr     r1, .Ljmp
+        sub     r1, r0, r1
+        mov     pc, r1                  @ no mode switch
+0:
         bl      cache_off
 
         @ Set parameters for booting zImage according to boot protocol
@@ -1469,18 +1482,15 @@ ENTRY(efi_stub_entry)
         mov     r0, #0
         mov     r1, #0xFFFFFFFF
         mov     r2, r4
+        b       __efi_start
 
-        @ Branch to (possibly) relocated zImage that is in [sp]
-        ldr     lr, [sp]
-        ldr     ip, =start_offset
-        add     lr, lr, ip
-        mov     pc, lr                  @ no mode switch
-
 efi_load_fail:
         @ Return EFI_LOAD_ERROR to EFI firmware on error.
         ldr     r0, =0x80000001
         ldmfd   sp!, {ip, pc}
 ENDPROC(efi_stub_entry)
+        .align  2
+.Ljmp:  .long   start - 0b
 #endif
 
         .align