Commit bc8c6490 authored by David Mosberger, committed by Tony Luck

[IA64] Improve ia64_leave_syscall() for McKinley-type cores.

Optimize ia64_leave_syscall() a bit better for McKinley-type cores.
The patch looks big, but that's mostly due to renaming r16/r17 to r2/r3.
Good for a 13 cycle improvement.
Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
parent b6f4b744
......@@ -633,10 +633,12 @@ END(ia64_ret_from_syscall)
* r13: restored (user-level thread pointer)
* r14: cleared
* r15: restored (syscall #)
* r16-r19: cleared
* r16-r17: cleared
* r18: user-level b6
* r19: cleared
* r20: user-level ar.fpsr
* r21: user-level b0
* r22: user-level b6
* r22: cleared
* r23: user-level ar.bspstore
* r24: user-level ar.rnat
* r25: user-level ar.unat
......@@ -661,7 +663,7 @@ END(ia64_ret_from_syscall)
* ar.csd: cleared
* ar.ssd: cleared
*/
GLOBAL_ENTRY(ia64_leave_syscall)
ENTRY(ia64_leave_syscall)
PT_REGS_UNWIND_INFO(0)
/*
* work.need_resched etc. mustn't get changed by this CPU before it returns to
......@@ -690,79 +692,80 @@ GLOBAL_ENTRY(ia64_leave_syscall)
(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
#endif
.work_processed_syscall:
adds r16=PT(LOADRS)+16,r12
adds r17=PT(AR_BSPSTORE)+16,r12
adds r2=PT(LOADRS)+16,r12
adds r3=PT(AR_BSPSTORE)+16,r12
adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
;;
(p6) ld4 r31=[r18] // load current_thread_info()->flags
ld8 r19=[r16],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
nop.i 0
ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
mov b7=r0 // clear b7
;;
ld8 r23=[r17],PT(R9)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
ld8 r22=[r16],PT(R8)-PT(B6) // load b6
ld8 r23=[r3],PT(R9)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
ld8 r18=[r2],PT(R8)-PT(B6) // load b6
(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
;;
mov.m ar.ccv=r0 // clear ar.ccv
mov r16=ar.bsp // M2 get existing backing store pointer
(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
(p6) br.cond.spnt .work_pending
;;
// start restoring the state saved on the kernel stack (struct pt_regs):
ld8.fill r8=[r16],16
ld8.fill r9=[r17],16
ld8.fill r8=[r2],16
ld8.fill r9=[r3],16
mov f6=f0 // clear f6
;;
ld8.fill r10=[r16],16
ld8.fill r11=[r17],16
invala // M0|1 invalidate ALAT
rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection
mov f9=f0 // clear f9
ld8.fill r10=[r2],16
ld8.fill r11=[r3],16
mov f7=f0 // clear f7
;;
ld8 r29=[r16],16 // load cr.ipsr
ld8 r28=[r17],16 // load cr.iip
ld8 r29=[r2],16 // load cr.ipsr
ld8 r28=[r3],16 // load cr.iip
mov f8=f0 // clear f8
;;
ld8 r30=[r16],16 // load cr.ifs
ld8 r25=[r17],16 // load ar.unat
ld8 r30=[r2],16 // M0|1 load cr.ifs
mov.m ar.ssd=r0 // M2 clear ar.ssd
cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
;;
rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection
invala // invalidate ALAT
mov f9=f0 // clear f9
mov.m ar.ssd=r0 // clear ar.ssd
mov.m ar.csd=r0 // clear ar.csd
ld8 r25=[r3],16 // M0|1 load ar.unat
mov.m ar.csd=r0 // M2 clear ar.csd
mov r22=r0 // clear r22
;;
ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
nop.m 0
mov f10=f0 // clear f10
;;
ld8 r26=[r16],16 // load ar.pfs
ld8 r27=[r17],PT(PR)-PT(AR_RSC) // load ar.rsc
ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc
mov f11=f0 // clear f11
;;
ld8 r24=[r16],PT(B0)-PT(AR_RNAT) // load ar.rnat (may be garbage)
ld8 r31=[r17],PT(R1)-PT(PR) // load predicates
ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage)
ld8 r31=[r3],PT(R1)-PT(PR) // load predicates
(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
;;
ld8 r21=[r16],PT(R12)-PT(B0) // load b0
ld8.fill r1=[r17],16 // load r1
(pUStk) mov r3=1
ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr
ld8.fill r1=[r3],16 // load r1
(pUStk) mov r17=1
;;
ld8.fill r12=[r16],16
ld8.fill r13=[r17],16
mov r2=r0 // clear r2
srlz.i // M0 ensure interruption collection is off
ld8.fill r13=[r3],16
nop.i 0
;;
ld8 r20=[r16] // load ar.fpsr
ld8.fill r15=[r17] // load r15
mov b7=r0 // clear b7
ld8.fill r12=[r2] // restore r12 (sp)
ld8.fill r15=[r3] // restore r15
addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
;;
(pUStk) st1 [r14]=r3
addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
(pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8
(pUStk) st1 [r14]=r17
mov b6=r18 // I0 restore b6
;;
mov r16=ar.bsp // get existing backing store pointer
srlz.i // ensure interruption collection is off
shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
mov r14=r0 // clear r14
;;
ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8
mov b6=r22 // restore b6
shr.u r18=r19,16 // get byte size of existing "dirty" partition
(pKStk) br.cond.dpnt.many skip_rbs_switch
mov.m ar.ccv=r0 // clear ar.ccv
(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
br.cond.sptk.many rbs_switch
END(ia64_leave_syscall)
......@@ -1054,7 +1057,7 @@ skip_rbs_switch:
;;
(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode
nop 0
nop 0
(pLvSys)mov r2=r0
mov ar.rsc=r27 // M2
mov pr=r31,-1 // I0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment