arm64: entry: use ldp/stp instead of push/pop when saving/restoring regs

The push/pop instructions can be suboptimal when saving/restoring large amounts of data to/from the stack, for example on entry/exit from the kernel. This is because: (1) they act on descending addresses (i.e. the newly decremented sp), which may defeat some hardware prefetchers; and (2) they introduce an implicit dependency between successive instructions, as the sp has to be updated in order to resolve the address of the next access. This patch removes the push/pop instructions from our kernel entry/exit macros in favour of ldp/stp plus offset. Signed-off-by: Will Deacon <will.deacon@arm.com>

arm64: entry: use ldp/stp instead of push/pop when saving/restoring regs
The push/pop instructions can be suboptimal when saving/restoring large amounts of data to/from the stack, for example on entry/exit from the kernel. This is because: (1) they act on descending addresses (i.e. the newly decremented sp), which may defeat some hardware prefetchers; and (2) they introduce an implicit dependency between successive instructions, as the sp has to be updated in order to resolve the address of the next access. This patch removes the push/pop instructions from our kernel entry/exit macros in favour of ldp/stp plus offset. Signed-off-by: Will Deacon <will.deacon@arm.com>
63648dd2 · Will Deacon · d54e81f9 · 63648dd2
Commit 63648dd2 authored Sep 29, 2014 by Will Deacon
Hide whitespace changes
Inline Side-by-side

Showing with 37 additions and 38 deletions

arch/arm64/kernel/entry.S arch/arm64/kernel/entry.S +37 -38

No files found.
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -64,25 +64,26 @@
 #define BAD_ERROR	3
 	.macro	kernel_entry, el, regsize = 64
-	sub	sp, sp, #S_FRAME_SIZE - S_LR	// room for LR, SP, SPSR, ELR
+	sub	sp, sp, #S_FRAME_SIZE
 	.if	\regsize == 32
 	mov	w0, w0				// zero upper 32 bits of x0
 	.endif
-	push	x28, x29
+	stp	x0, x1, [sp, #16 * 0]
-	push	x26, x27
+	stp	x2, x3, [sp, #16 * 1]
-	push	x24, x25
+	stp	x4, x5, [sp, #16 * 2]
-	push	x22, x23
+	stp	x6, x7, [sp, #16 * 3]
-	push	x20, x21
+	stp	x8, x9, [sp, #16 * 4]
-	push	x18, x19
+	stp	x10, x11, [sp, #16 * 5]
-	push	x16, x17
+	stp	x12, x13, [sp, #16 * 6]
-	push	x14, x15
+	stp	x14, x15, [sp, #16 * 7]
-	push	x12, x13
+	stp	x16, x17, [sp, #16 * 8]
-	push	x10, x11
+	stp	x18, x19, [sp, #16 * 9]
-	push	x8, x9
+	stp	x20, x21, [sp, #16 * 10]
-	push	x6, x7
+	stp	x22, x23, [sp, #16 * 11]
-	push	x4, x5
+	stp	x24, x25, [sp, #16 * 12]
-	push	x2, x3
+	stp	x26, x27, [sp, #16 * 13]
-	push	x0, x1
+	stp	x28, x29, [sp, #16 * 14]
 	.if	\el == 0
 	mrs	x21, sp_el0
 	get_thread_info tsk			// Ensure MDSCR_EL1.SS is clear,
@@ -118,33 +119,31 @@
 	.if	\el == 0
 	ct_user_enter
 	ldr	x23, [sp, #S_SP]		// load return stack pointer
+	msr	sp_el0, x23
 	.endif
+	msr	elr_el1, x21			// set up the return data
+	msr	spsr_el1, x22
 	.if	\ret
 	ldr	x1, [sp, #S_X1]			// preserve x0 (syscall return)
-	add	sp, sp, S_X2
 	.else
-	pop	x0, x1
+	ldp	x0, x1, [sp, #16 * 0]
-	.endif
-	pop	x2, x3				// load the rest of the registers
-	pop	x4, x5
-	pop	x6, x7
-	pop	x8, x9
-	msr	elr_el1, x21			// set up the return data
-	msr	spsr_el1, x22
-	.if	\el == 0
-	msr	sp_el0, x23
 	.endif
-	pop	x10, x11
+	ldp	x2, x3, [sp, #16 * 1]
-	pop	x12, x13
+	ldp	x4, x5, [sp, #16 * 2]
-	pop	x14, x15
+	ldp	x6, x7, [sp, #16 * 3]
-	pop	x16, x17
+	ldp	x8, x9, [sp, #16 * 4]
-	pop	x18, x19
+	ldp	x10, x11, [sp, #16 * 5]
-	pop	x20, x21
+	ldp	x12, x13, [sp, #16 * 6]
-	pop	x22, x23
+	ldp	x14, x15, [sp, #16 * 7]
-	pop	x24, x25
+	ldp	x16, x17, [sp, #16 * 8]
-	pop	x26, x27
+	ldp	x18, x19, [sp, #16 * 9]
-	pop	x28, x29
+	ldp	x20, x21, [sp, #16 * 10]
-	ldr	lr, [sp], #S_FRAME_SIZE - S_LR	// load LR and restore SP
+	ldp	x22, x23, [sp, #16 * 11]
+	ldp	x24, x25, [sp, #16 * 12]
+	ldp	x26, x27, [sp, #16 * 13]
+	ldp	x28, x29, [sp, #16 * 14]
+	ldr	lr, [sp, #S_LR]
+	add	sp, sp, #S_FRAME_SIZE		// restore sp
 	eret					// return to kernel
 	.endm