Commit 889ac863 authored by David Mosberger

More McKinley tuning and minor do_csum() cleanup.

parent d4bbe676
@@ -214,61 +214,79 @@ GLOBAL_ENTRY(save_switch_stack)
 	.save @priunat,r17
 	mov r17=ar.unat		// preserve caller's
 	.body
-	adds r3=80,sp
+#ifdef CONFIG_ITANIUM
+	adds r2=16+128,sp
+	adds r3=16+64,sp
+	adds r14=SW(R4)+16,sp
 	;;
+	st8.spill [r14]=r4,16		// spill r4
 	lfetch.fault.excl.nt1 [r3],128
-	mov ar.rsc=0		// put RSE in mode: enforced lazy, little endian, pl 0
-	adds r2=16+128,sp
 	;;
 	lfetch.fault.excl.nt1 [r2],128
 	lfetch.fault.excl.nt1 [r3],128
-	adds r14=SW(R4)+16,sp
 	;;
 	lfetch.fault.excl [r2]
 	lfetch.fault.excl [r3]
 	adds r15=SW(R5)+16,sp
+#else
+	add r2=16+3*128,sp
+	add r3=16,sp
+	add r14=SW(R4)+16,sp
+	;;
+	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
+	lfetch.fault.excl.nt1 [r3],128		// prefetch offset 0x010
 	;;
-	mov r18=ar.fpsr		// preserve fpsr
-	mov r19=ar.rnat
+	lfetch.fault.excl.nt1 [r3],128		// prefetch offset 0x090
+	lfetch.fault.excl.nt1 [r2],128		// prefetch offset 0x190
+	;;
+	lfetch.fault.excl.nt1 [r3]		// prefetch offset 0x110
+	lfetch.fault.excl.nt1 [r2]		// prefetch offset 0x210
+	adds r15=SW(R5)+16,sp
+#endif
+	;;
+	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
+	mov.m ar.rsc=0		// put RSE in mode: enforced lazy, little endian, pl 0
 	add r2=SW(F2)+16,sp	// r2 = &sw->f2
-.mem.offset 0,0; st8.spill [r14]=r4,16		// spill r4
-.mem.offset 8,0; st8.spill [r15]=r5,16		// spill r5
+	;;
+	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
+	mov.m r18=ar.fpsr	// preserve fpsr
 	add r3=SW(F3)+16,sp	// r3 = &sw->f3
 	;;
 	stf.spill [r2]=f2,32
-	stf.spill [r3]=f3,32
+	mov.m r19=ar.rnat
 	mov r21=b0
-.mem.offset 0,0; st8.spill [r14]=r6,16		// spill r6
-.mem.offset 8,0; st8.spill [r15]=r7,16		// spill r7
+
+	stf.spill [r3]=f3,32
+	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
 	mov r22=b1
 	;;
 	// since we're done with the spills, read and save ar.unat:
-	mov r29=ar.unat		// M-unit
-	mov r20=ar.bspstore	// M-unit
+	mov.m r29=ar.unat
+	mov.m r20=ar.bspstore
 	mov r23=b2
 	stf.spill [r2]=f4,32
 	stf.spill [r3]=f5,32
 	mov r24=b3
 	;;
-	st8 [r14]=r21,16	// save b0
-	st8 [r15]=r22,16	// save b1
+	st8 [r14]=r21,SW(B1)-SW(B0)		// save b0
+	st8 [r15]=r23,SW(B3)-SW(B2)		// save b2
 	mov r25=b4
 	stf.spill [r2]=f10,32
 	stf.spill [r3]=f11,32
 	mov r26=b5
 	;;
-	st8 [r14]=r23,16	// save b2
-	st8 [r15]=r24,16	// save b3
+	st8 [r14]=r22,SW(B4)-SW(B1)		// save b1
+	st8 [r15]=r24,SW(AR_PFS)-SW(B3)		// save b3
 	mov r21=ar.lc		// I-unit
 	stf.spill [r2]=f12,32
 	stf.spill [r3]=f13,32
 	;;
-	st8 [r14]=r25,16	// save b4
-	st8 [r15]=r26,16	// save b5
+	st8 [r14]=r25,SW(B5)-SW(B4)		// save b4
+	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
 	stf.spill [r2]=f14,32
 	stf.spill [r3]=f15,32
 	;;
-	st8 [r14]=r16		// save ar.pfs
+	st8 [r14]=r26		// save b5
 	st8 [r15]=r21		// save ar.lc
 	stf.spill [r2]=f16,32
 	stf.spill [r3]=f17,32
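The McKinley (#else) path above front-loads lfetch.fault.excl.nt1 prefetches at spaced offsets so that the cache lines of the switch_stack save area are owned for writing before the spills land in them. A rough C equivalent of the idea, using GCC's __builtin_prefetch; the function name and parameters below are illustrative only, not kernel code:

#include <stddef.h>

/* Touch every cache line of a save area for write before storing into it,
 * so the later stores do not stall on line fills.  Sketch only; the kernel
 * does this with explicit lfetch.fault.excl.nt1 instructions. */
static void prefetch_save_area(void *area, size_t size, size_t line_size)
{
	for (size_t off = 0; off < size; off += line_size)
		__builtin_prefetch((char *)area + off, 1 /* write */, 1 /* modest locality */);
}

The .excl (exclusive ownership) and .nt1 (non-temporal level 1) hints used by the assembly have no portable C equivalent.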
@@ -284,26 +302,26 @@ GLOBAL_ENTRY(save_switch_stack)
 	;;
 	stf.spill [r2]=f24,32
 	stf.spill [r3]=f25,32
-	add r14=SW(CALLER_UNAT)+16,sp
 	;;
 	stf.spill [r2]=f26,32
 	stf.spill [r3]=f27,32
-	add r15=SW(AR_FPSR)+16,sp
 	;;
 	stf.spill [r2]=f28,32
 	stf.spill [r3]=f29,32
-	st8 [r14]=r17		// save caller_unat
-	st8 [r15]=r18		// save fpsr
-	mov r21=pr
 	;;
-	stf.spill [r2]=f30,(SW(AR_UNAT)-SW(F30))
-	stf.spill [r3]=f31,(SW(AR_RNAT)-SW(F31))
+	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
+	stf.spill [r3]=f31,SW(PR)-SW(F31)
+	add r14=SW(CALLER_UNAT)+16,sp
 	;;
-	st8 [r2]=r29,16		// save ar.unat
-	st8 [r3]=r19,16		// save ar.rnat
+	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)		// save ar.unat
+	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT)	// save caller_unat
+	mov r21=pr
 	;;
-	st8 [r2]=r20		// save ar.bspstore
+	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT)	// save ar.rnat
 	st8 [r3]=r21		// save predicate registers
+	;;
+	st8 [r2]=r20		// save ar.bspstore
+	st8 [r14]=r18		// save fpsr
 	mov ar.rsc=3		// put RSE back into eager mode, pl 0
 	br.cond.sptk.many b7
 END(save_switch_stack)
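In the rewritten stores above, the fixed ",16" post-increments are replaced by SW(X)-SW(Y) deltas, so each store's post-increment moves its base pointer directly to the next switch_stack field that the same pointer will write; this is what lets the two pointers visit the fields out of order while still landing every value in its proper slot. The deltas are just differences of structure offsets; a small C illustration (the struct and field names below are invented for the example, not the real switch_stack layout):

#include <stddef.h>

/* Illustrative stand-in for the real struct switch_stack layout. */
struct sw_sketch {
	unsigned long r4, r5, r6, r7;
	unsigned long b0, b1, b2, b3, b4, b5;
};

/* SW(x) in the assembly is a field offset; an increment such as
 * SW(B0)-SW(R6) advances a pointer from the r6 slot to the b0 slot. */
#define SW_SKETCH(f)	offsetof(struct sw_sketch, f)

static const unsigned long r6_to_b0_delta = SW_SKETCH(b0) - SW_SKETCH(r6);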
@@ -647,23 +665,38 @@ dont_preserve_current_frame:
 	/*
 	 * To prevent leaking bits between the kernel and user-space,
 	 * we must clear the stacked registers in the "invalid" partition here.
-	 * Not pretty, but at least it's fast (3.34 registers/cycle).
-	 * Architecturally, this loop could go at 4.67 registers/cycle, but that would
-	 * oversubscribe Itanium.
+	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
+	 * 5 registers/cycle on McKinley).
 	 */
 #	define pRecurse	p6
 #	define pReturn	p7
+#ifdef CONFIG_ITANIUM
 #	define Nregs	10
+#else
+#	define Nregs	14
+#endif
 	alloc loc0=ar.pfs,2,Nregs-2,2,0
 	shr.u loc1=r18,9	// RNaTslots <= dirtySize / (64*8) + 1
 	sub r17=r17,r18		// r17 = (physStackedSize + 8) - dirtySize
 	;;
+#if 1
+	.align 32		// see comment below about gas bug...
+#endif
 	mov ar.rsc=r19		// load ar.rsc to be used for "loadrs"
 	shladd in0=loc1,3,r17
 	mov in1=0
+#if 0
+	// gas-2.12.90 is unable to generate a stop bit after .align, which is bad,
+	// because alloc must be at the beginning of an insn-group.
+	.align 32
+#else
+	nop 0
+	nop 0
+	nop 0
+#endif
 	;;
-//	.align 32	// gas-2.11.90 is unable to generate a stop bit after .align
 rse_clear_invalid:
+#ifdef CONFIG_ITANIUM
 	// cycle 0
 { .mii
 	alloc loc0=ar.pfs,2,Nregs-2,2,0
@@ -692,9 +725,31 @@ rse_clear_invalid:
 	mov loc7=0
(pReturn)	br.ret.sptk.many b6
 }
+#else /* !CONFIG_ITANIUM */
+	alloc loc0=ar.pfs,2,Nregs-2,2,0
+	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
+	add out0=-Nregs*8,in0
+	add out1=1,in1			// increment recursion count
+	mov loc1=0
+	mov loc2=0
+	;;
+	mov loc3=0
+	mov loc4=0
+	mov loc9=0
+	mov loc5=0
+	mov loc6=0
+(pRecurse)	br.call.sptk.many b6=rse_clear_invalid
+	;;
+	mov loc7=0
+	mov loc8=0
+	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
+	mov loc10=0
+	mov loc11=0
+(pReturn)	br.ret.sptk.many b6
+#endif /* !CONFIG_ITANIUM */
 #	undef pRecurse
 #	undef pReturn
+	;;
 	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
 	;;
 	loadrs
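In the McKinley (!CONFIG_ITANIUM) variant added above, rse_clear_invalid zeroes Nregs stacked registers per frame and recurses, via (pRecurse) br.call, while more than Nregs*8 bytes of the invalid partition remain, then unwinds one level per (pReturn) br.ret. A loose C sketch of that control flow (the function name and the idea of passing a byte count are illustrative; the real routine operates on the register stack itself, not memory):

#define NREGS 14	/* 10 on Itanium, 14 on McKinley in the patch above */

/* Each call stands for one alloc'd frame whose Nregs locals get zeroed;
 * the assembly's (pReturn) br.ret unwinding corresponds to the implicit
 * returns back up this recursion. */
static long rse_clear_invalid_sketch(long bytes_left)
{
	long cleared = NREGS * 8;	/* the "mov locN=0" work of this frame */

	if (bytes_left > NREGS * 8)	/* cmp.lt pRecurse,p0=Nregs*8,in0 */
		cleared += rse_clear_invalid_sketch(bytes_left - NREGS * 8);
	return cleared;
}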
...
@@ -28,18 +28,19 @@
  * on interrupts.
  */
 #define MINSTATE_START_SAVE_MIN_VIRT \
-	dep r1=-1,r1,61,3;		/* r1 = current (virtual) */ \
 (pUser)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
+	dep r1=-1,r1,61,3;		/* r1 = current (virtual) */ \
 	;; \
+(pUser)	mov.m rARRNAT=ar.rnat; \
 (pUser)	addl rKRBS=IA64_RBS_OFFSET,r1;	/* compute base of RBS */ \
-(pUser)	mov rARRNAT=ar.rnat; \
 (pKern)	mov r1=sp;			/* get sp */ \
 	;; \
-(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */ \
+(pUser)	lfetch.fault.excl.nt1 [rKRBS]; \
 (pUser)	mov rARBSPSTORE=ar.bspstore;	/* save ar.bspstore */ \
+(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */ \
 	;; \
-(pKern)	addl r1=-IA64_PT_REGS_SIZE,r1;	/* if in kernel mode, use sp (r12) */ \
 (pUser)	mov ar.bspstore=rKRBS;		/* switch to kernel RBS */ \
+(pKern)	addl r1=-IA64_PT_REGS_SIZE,r1;	/* if in kernel mode, use sp (r12) */ \
 	;; \
 (pUser)	mov r18=ar.bsp; \
 (pUser)	mov ar.rsc=0x3;			/* set eager mode, pl 0, little-endian, loadrs=0 */ \
@@ -125,51 +126,57 @@
 	;; \
 	SAVE_IFS; \
 	MINSTATE_START_SAVE_MIN \
+	add r17=L1_CACHE_BYTES,r1	/* really: biggest cache-line size */ \
 	;; \
-	mov r16=r1;			/* initialize first base pointer */ \
-	adds r17=8,r1;			/* initialize second base pointer */ \
+	st8 [r1]=rCRIPSR;		/* save cr.ipsr */ \
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
+	add r16=16,r1;			/* initialize first base pointer */ \
 	;; \
-	st8 [r16]=rCRIPSR,16;		/* save cr.ipsr */ \
-	st8 [r17]=rCRIIP,16;		/* save cr.iip */ \
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
+	;; \
+	lfetch.fault.excl.nt1 [r17]; \
+	adds r17=8,r1;			/* initialize second base pointer */ \
 (pKern)	mov r18=r0;			/* make sure r18 isn't NaT */ \
 	;; \
+	st8 [r17]=rCRIIP,16;		/* save cr.iip */ \
 	st8 [r16]=rCRIFS,16;		/* save cr.ifs */ \
-	st8 [r17]=rARUNAT,16;		/* save ar.unat */ \
 (pUser)	sub r18=r18,rKRBS;		/* r18=RSE.ndirty*8 */ \
 	;; \
+	st8 [r17]=rARUNAT,16;		/* save ar.unat */ \
 	st8 [r16]=rARPFS,16;		/* save ar.pfs */ \
+	shl r18=r18,16;			/* compute ar.rsc to be used for "loadrs" */ \
+	;; \
 	st8 [r17]=rARRSC,16;		/* save ar.rsc */ \
-	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT \
-	;;				/* avoid RAW on r16 & r17 */ \
-(pKern)	adds r16=16,r16;		/* skip over ar_rnat field */ \
-(pKern)	adds r17=16,r17;		/* skip over ar_bspstore field */ \
 (pUser)	st8 [r16]=rARRNAT,16;		/* save ar.rnat */ \
+(pKern)	adds r16=16,r16;		/* skip over ar_rnat field */ \
+	;;				/* avoid RAW on r16 & r17 */ \
 (pUser)	st8 [r17]=rARBSPSTORE,16;	/* save ar.bspstore */ \
-	;; \
 	st8 [r16]=rARPR,16;		/* save predicates */ \
-	st8 [r17]=rB6,16;		/* save b6 */ \
-	shl r18=r18,16;			/* compute ar.rsc to be used for "loadrs" */ \
+(pKern)	adds r17=16,r17;		/* skip over ar_bspstore field */ \
 	;; \
+	st8 [r17]=rB6,16;		/* save b6 */ \
 	st8 [r16]=r18,16;		/* save ar.rsc value for "loadrs" */ \
-	st8.spill [r17]=rR1,16;		/* save original r1 */ \
+	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT \
 	;; \
+.mem.offset 8,0; st8.spill [r17]=rR1,16;	/* save original r1 */ \
 .mem.offset 0,0; st8.spill [r16]=r2,16; \
+	;; \
 .mem.offset 8,0; st8.spill [r17]=r3,16; \
+.mem.offset 0,0; st8.spill [r16]=r12,16; \
 	adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
 	;; \
-.mem.offset 0,0; st8.spill [r16]=r12,16; \
 .mem.offset 8,0; st8.spill [r17]=r13,16; \
+.mem.offset 0,0; st8.spill [r16]=r14,16; \
 	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */ \
 	;; \
-.mem.offset 0,0; st8.spill [r16]=r14,16; \
 .mem.offset 8,0; st8.spill [r17]=r15,16; \
+.mem.offset 0,0; st8.spill [r16]=r8,16; \
 	dep r14=-1,r0,61,3; \
 	;; \
-.mem.offset 0,0; st8.spill [r16]=r8,16; \
 .mem.offset 8,0; st8.spill [r17]=r9,16; \
+.mem.offset 0,0; st8.spill [r16]=r10,16; \
 	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */ \
 	;; \
-.mem.offset 0,0; st8.spill [r16]=r10,16; \
 .mem.offset 8,0; st8.spill [r17]=r11,16; \
 	mov r13=IA64_KR(CURRENT);	/* establish `current' */ \
 	;; \
@@ -190,10 +197,12 @@
  */
 #define SAVE_REST \
 .mem.offset 0,0; st8.spill [r2]=r16,16; \
-.mem.offset 8,0; st8.spill [r3]=r17,16; \
 	;; \
+.mem.offset 8,0; st8.spill [r3]=r17,16; \
 .mem.offset 0,0; st8.spill [r2]=r18,16; \
+	;; \
 .mem.offset 8,0; st8.spill [r3]=r19,16; \
+.mem.offset 0,0; st8.spill [r2]=r20,16; \
 	;; \
 	mov r16=ar.ccv;		/* M-unit */ \
 	movl r18=FPSR_DEFAULT	/* L-unit */ \
@@ -201,30 +210,29 @@
 	mov r17=ar.fpsr;	/* M-unit */ \
 	mov ar.fpsr=r18;	/* M-unit */ \
 	;; \
-.mem.offset 0,0; st8.spill [r2]=r20,16; \
 .mem.offset 8,0; st8.spill [r3]=r21,16; \
+.mem.offset 0,0; st8.spill [r2]=r22,16; \
 	mov r18=b0; \
 	;; \
-.mem.offset 0,0; st8.spill [r2]=r22,16; \
 .mem.offset 8,0; st8.spill [r3]=r23,16; \
+.mem.offset 0,0; st8.spill [r2]=r24,16; \
 	mov r19=b7; \
 	;; \
-.mem.offset 0,0; st8.spill [r2]=r24,16; \
 .mem.offset 8,0; st8.spill [r3]=r25,16; \
-	;; \
 .mem.offset 0,0; st8.spill [r2]=r26,16; \
-.mem.offset 8,0; st8.spill [r3]=r27,16; \
 	;; \
+.mem.offset 8,0; st8.spill [r3]=r27,16; \
 .mem.offset 0,0; st8.spill [r2]=r28,16; \
-.mem.offset 8,0; st8.spill [r3]=r29,16; \
 	;; \
+.mem.offset 8,0; st8.spill [r3]=r29,16; \
 .mem.offset 0,0; st8.spill [r2]=r30,16; \
-.mem.offset 8,0; st8.spill [r3]=r31,16; \
 	;; \
+.mem.offset 8,0; st8.spill [r3]=r31,16; \
 	st8 [r2]=r16,16;	/* ar.ccv */ \
-	st8 [r3]=r17,16;	/* ar.fpsr */ \
 	;; \
+	st8 [r3]=r17,16;	/* ar.fpsr */ \
 	st8 [r2]=r18,16;	/* b0 */ \
+	;; \
 	st8 [r3]=r19,16+8;	/* b7 */ \
 	;; \
 	stf.spill [r2]=f6,32; \
...
@@ -8,9 +8,11 @@
  * in0: address of buffer to checksum (char *)
  * in1: length of the buffer (int)
  *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 02/04/08	David Mosberger <davidm@hpl.hp.com>
+ *		More cleanup and tuning.
  * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
  *		Clean up and optimize and the software pipeline, loading two
  *		back-to-back 8-byte words per loop. Clean up the initialization
@@ -71,8 +73,6 @@
 // calculating the Internet checksum.
 //
 // NOT YET DONE:
-//	- use the lfetch instruction to augment the chances of the data being in
-//	  the cache when we need it.
 //	- Maybe another algorithm which would take care of the folding at the
 //	  end in a different manner
 //	- Work with people more knowledgeable than me on the network stack
@@ -102,10 +102,6 @@
 #define buf in0
 #define len in1
 
-#ifndef CONFIG_IA64_LOAD_LATENCY
-#define CONFIG_IA64_LOAD_LATENCY 2
-#endif
-
 #define LOAD_LATENCY 2	// XXX fix me
 
 #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
@@ -122,45 +118,46 @@ GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
 	alloc saved_pfs=ar.pfs,2,16,1,16
-	.rotr word1[4], word2[4],result1[4],result2[4]
-	.rotp p[PIPE_DEPTH]
+	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
+	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
 	mov ret0=r0		// in case we have zero length
 	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
-	;;			// avoid WAW on CFM
-	mov tmp3=0x7		// a temporary mask/value
+	;;
 	add tmp1=buf,len	// last byte's address
-(p6)	br.ret.spnt.many rp	// return if true (hope we can avoid that)
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates (rotation)
+(p6)	br.ret.spnt.many rp	// return if zero or negative length
 
-	and firstoff=7,buf	// how many bytes off for first1 element
-	tbit.nz p15,p0=buf,0	// is buf an odd address ?
 	mov hmask=-1		// intialize head mask
+	tbit.nz p15,p0=buf,0	// is buf an odd address?
+	and first1=-8,buf	// 8-byte align down address of first1 element
+	and firstoff=7,buf	// how many bytes off for first1 element
 	mov tmask=-1		// initialize tail mask
-	adds tmp2=-1,tmp1	// last-1
 	;;
+	adds tmp2=-1,tmp1	// last-1
 	and lastoff=7,tmp1	// how many bytes off for last element
-	andcm last=tmp2,tmp3	// address of word containing last byte
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
+	;;
+	sub tmp1=8,lastoff	// complement to lastoff
+	and last=-8,tmp2	// address of word containing last byte
 	;;
 	sub tmp3=last,first1	// tmp3=distance from first1 to last
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save lc
 	cmp.eq p8,p9=last,first1	// everything fits in one word ?
-	sub tmp1=8,lastoff	// complement to lastoff
-	ld8 firstval=[first1],8	// load,ahead of time, "first1" word
+	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
+	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
 	shl tmp2=firstoff,3	// number of bits
 	;;
-	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
-(p9)	ld8 lastval=[last]	// load,ahead of time, "last" word, if needed
+(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
+	shl tmp1=tmp1,3		// number of bits
 (p9)	adds tmp3=-8,tmp3	// effectively loaded
 	;;
 (p8)	mov lastval=r0		// we don't need lastval if first1==last
-	shl tmp1=tmp1,3		// number of bits
 	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
-	;;
 	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc	// save lc
 	;;
 	.body
 #define count tmp3
@@ -171,8 +168,8 @@ GLOBAL_ENTRY(do_csum)
 	;;
 	// If count is odd, finish this 8-byte word so that we can
 	// load two back-to-back 8-byte words per loop thereafter.
-	tbit.nz p10,p11=count,0		// if (count is odd)
 	and word1[0]=firstval,hmask	// and mask it as appropriate
+	tbit.nz p10,p11=count,0		// if (count is odd)
 	;;
 (p8)	mov result1[0]=word1[0]
 (p9)	add result1[0]=word1[0],word2[0]
@@ -181,9 +178,8 @@ GLOBAL_ENTRY(do_csum)
 	;;
 (p6)	adds result1[0]=1,result1[0]
 (p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
-	;;
 (p11)	br.cond.dptk .do_csum16		// if (count is even)
-	;;
+
 	// Here count is odd.
 	ld8 word1[1]=[first1],8		// load an 8-byte word
 	cmp.eq p9,p10=1,count		// if (count == 1)
@@ -194,11 +190,9 @@ GLOBAL_ENTRY(do_csum)
 	cmp.ltu p6,p0=result1[0],word1[1]
 	;;
 (p6)	adds result1[0]=1,result1[0]
-	;;
 (p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
 	// Fall through to caluculate the checksum, feeding result1[0] as
 	// the initial value in result1[0].
-	;;
 	//
 	// Calculate the checksum loading two 8-byte words per loop.
 	//
@@ -207,45 +201,36 @@ GLOBAL_ENTRY(do_csum)
 	shr.u count=count,1	// we do 16 bytes per loop
 	;;
 	cmp.eq p9,p10=r0,count	// if (count == 0)
+	adds count=-1,count
 	brp.loop.imp 1f,2f
 	;;
-	adds count=-1,count
 	mov ar.ec=PIPE_DEPTH
-	;;
 	mov ar.lc=count		// set lc
-	;;
 	// result1[0] must be initialized in advance.
 	mov result2[0]=r0
-	;;
 	mov pr.rot=1<<16
-	;;
 	mov carry1=r0
 	mov carry2=r0
-	;;
 	add first2=8,first1
-	;;
 (p9)	br.cond.sptk .do_csum_exit
-	;;
-	nop.m 0
-	nop.i 0
 	;;
 	.align 32
 1:
-(ELD_1)	cmp.ltu p31,p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(p32)	adds carry1=1,carry1
-(ELD_1)	cmp.ltu p47,p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(p48)	adds carry2=1,carry2
+(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
+(pC1[1])adds carry1=1,carry1
+(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
+(pC2[1])adds carry2=1,carry2
 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p16)	ld8 word1[0]=[first1],16
-(p16)	ld8 word2[0]=[first2],16
+[2:]
+(p[0])	ld8 word1[0]=[first1],16
+(p[0])	ld8 word2[0]=[first2],16
 	br.ctop.sptk 1b
 	;;
-	// Since len is a 32-bit value, carry cannot be larger than
-	// a 64-bit value.
-(p32)	adds carry1=1,carry1	// since we miss the last one
-(p48)	adds carry2=1,carry2
+	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
+(pC1[1])adds carry1=1,carry1	// since we miss the last one
+(pC2[1])adds carry2=1,carry2
 	;;
 	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
 	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
@@ -263,18 +248,15 @@ GLOBAL_ENTRY(do_csum)
 (p6)	adds result1[0]=1,result1[0]
 	;;
 .do_csum_exit:
-	movl tmp3=0xffffffff
-	;;
-	// XXX Fixme
 	//
 	// now fold 64 into 16 bits taking care of carry
 	// that's not very good because it has lots of sequentiality
 	//
-	and tmp1=result1[0],tmp3
+	mov tmp3=0xffff
+	zxt4 tmp1=result1[0]
 	shr.u tmp2=result1[0],32
 	;;
 	add result1[0]=tmp1,tmp2
-	shr.u tmp3=tmp3,16
 	;;
 	and tmp1=result1[0],tmp3
 	shr.u tmp2=result1[0],16
...
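For reference, the arithmetic that the pipelined loop and the .do_csum_exit tail above implement: 8-byte words are summed with end-around carry (the cmp.ltu / adds carry pairs), and the 64-bit total is then folded down to 16 bits. A hedged C sketch of just that arithmetic, ignoring the unaligned head/tail masking and the two-lane software pipelining the assembly performs; the function names are invented for the example:

#include <stdint.h>
#include <stddef.h>

/* Fold a 64-bit one's-complement sum down to 16 bits, propagating carries,
 * as the "fold 64 into 16 bits" sequence at .do_csum_exit does. */
static uint16_t fold64_sketch(uint64_t sum)
{
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* 64 -> ~33 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* -> ~18 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* -> ~17 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* absorb last carry */
	return (uint16_t)sum;
}

/* Sum aligned 8-byte words with end-around carry, the job of the
 * cmp.ltu pC1[0] / adds carry1 pairs in the pipelined loop. */
static uint16_t do_csum_sketch(const uint64_t *words, size_t nwords)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < nwords; i++) {
		uint64_t prev = sum;
		sum += words[i];
		if (sum < prev)		/* carry out of bit 63 */
			sum++;		/* end-around carry */
	}
	return fold64_sketch(sum);
}

The repeated 16-bit folds mirror the "lots of sequentiality" the comment complains about: each fold depends on the result of the previous one.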