Commit 889ac863
authored Apr 10, 2002 by David Mosberger

More McKinley tuning and minor do_csum() cleanup.

parent d4bbe676
Showing 3 changed files with 178 additions and 133 deletions:

arch/ia64/kernel/entry.S     +94  -39
arch/ia64/kernel/minstate.h  +42  -34
arch/ia64/lib/do_csum.S      +42  -60
arch/ia64/kernel/entry.S

@@ -214,61 +214,79 @@ GLOBAL_ENTRY(save_switch_stack)
 	.save @priunat,r17
 	mov r17=ar.unat		// preserve caller's
 	.body
-	adds r3=80,sp
+#ifdef CONFIG_ITANIUM
+	adds r2=16+128,sp
+	adds r3=16+64,sp
+	adds r14=SW(R4)+16,sp
 	;;
+	st8.spill [r14]=r4,16			// spill r4
 	lfetch.fault.excl.nt1 [r3],128
-	mov ar.rsc=0			// put RSE in mode: enforced lazy, little endian, pl 0
-	adds r2=16+128,sp
 	;;
 	lfetch.fault.excl.nt1 [r2],128
 	lfetch.fault.excl.nt1 [r3],128
-	adds r14=SW(R4)+16,sp
 	;;
 	lfetch.fault.excl [r2]
 	lfetch.fault.excl [r3]
 	adds r15=SW(R5)+16,sp
+#else
+	add r2=16+3*128,sp
+	add r3=16,sp
+	add r14=SW(R4)+16,sp
+	;;
+	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
+	lfetch.fault.excl.nt1 [r3],128	//	prefetch offset 0x010
+	;;
+	lfetch.fault.excl.nt1 [r3],128	//	prefetch offset 0x090
+	lfetch.fault.excl.nt1 [r2],128	//	prefetch offset 0x190
+	;;
+	lfetch.fault.excl.nt1 [r3]	//	prefetch offset 0x110
+	lfetch.fault.excl.nt1 [r2]	//	prefetch offset 0x210
+	adds r15=SW(R5)+16,sp
+#endif
 	;;
-	mov r18=ar.fpsr		// preserve fpsr
-	mov r19=ar.rnat
+	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
+	mov.m ar.rsc=0			// put RSE in mode: enforced lazy, little endian, pl 0
 	add r2=SW(F2)+16,sp		// r2 = &sw->f2
-	.mem.offset 0,0; st8.spill [r14]=r4,16 // spill r4
-	.mem.offset 8,0; st8.spill [r15]=r5,16 // spill r5
+	;;
+	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
+	mov.m r18=ar.fpsr		// preserve fpsr
 	add r3=SW(F3)+16,sp		// r3 = &sw->f3
 	;;
 	stf.spill [r2]=f2,32
-	stf.spill [r3]=f3,32
+	mov.m r19=ar.rnat
 	mov r21=b0
-	.mem.offset 0,0; st8.spill [r14]=r6,16 // spill r6
-	.mem.offset 8,0; st8.spill [r15]=r7,16 // spill r7
+
+	stf.spill [r3]=f3,32
+	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
 	mov r22=b1
 	;;
 	// since we're done with the spills, read and save ar.unat:
-	mov r29=ar.unat		// M-unit
-	mov r20=ar.bspstore	// M-unit
+	mov.m r29=ar.unat
+	mov.m r20=ar.bspstore
 	mov r23=b2
 	stf.spill [r2]=f4,32
 	stf.spill [r3]=f5,32
 	mov r24=b3
 	;;
-	st8 [r14]=r21,16	// save b0
-	st8 [r15]=r22,16	// save b1
+	st8 [r14]=r21,SW(B1)-SW(B0)		// save b0
+	st8 [r15]=r23,SW(B3)-SW(B2)		// save b2
 	mov r25=b4
 	stf.spill [r2]=f10,32
 	stf.spill [r3]=f11,32
 	mov r26=b5
 	;;
-	st8 [r14]=r23,16	// save b2
-	st8 [r15]=r24,16	// save b3
+	st8 [r14]=r22,SW(B4)-SW(B1)		// save b1
+	st8 [r15]=r24,SW(AR_PFS)-SW(B3)		// save b3
 	mov r21=ar.lc		// I-unit
 	stf.spill [r2]=f12,32
 	stf.spill [r3]=f13,32
 	;;
-	st8 [r14]=r25,16	// save b4
-	st8 [r15]=r26,16	// save b5
+	st8 [r14]=r25,SW(B5)-SW(B4)		// save b4
+	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
 	stf.spill [r2]=f14,32
 	stf.spill [r3]=f15,32
 	;;
-	st8 [r14]=r16		// save ar.pfs
+	st8 [r14]=r26		// save b5
 	st8 [r15]=r21		// save ar.lc
 	stf.spill [r2]=f16,32
 	stf.spill [r3]=f17,32

@@ -284,26 +302,26 @@ GLOBAL_ENTRY(save_switch_stack)
 	;;
 	stf.spill [r2]=f24,32
 	stf.spill [r3]=f25,32
-	add r14=SW(CALLER_UNAT)+16,sp
 	;;
 	stf.spill [r2]=f26,32
 	stf.spill [r3]=f27,32
-	add r15=SW(AR_FPSR)+16,sp
 	;;
 	stf.spill [r2]=f28,32
 	stf.spill [r3]=f29,32
-	st8 [r14]=r17		// save caller_unat
-	st8 [r15]=r18		// save fpsr
-	mov r21=pr
 	;;
-	stf.spill [r2]=f30,(SW(AR_UNAT)-SW(F30))
-	stf.spill [r3]=f31,(SW(AR_RNAT)-SW(F31))
+	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
+	stf.spill [r3]=f31,SW(PR)-SW(F31)
+	add r14=SW(CALLER_UNAT)+16,sp
 	;;
-	st8 [r2]=r29,16		// save ar.unat
-	st8 [r3]=r19,16		// save ar.rnat
+	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)	// save ar.unat
+	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
+	mov r21=pr
 	;;
-	st8 [r2]=r20		// save ar.bspstore
+	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
 	st8 [r3]=r21		// save predicate registers
+	;;
+	st8 [r2]=r20		// save ar.bspstore
+	st8 [r14]=r18		// save fpsr
 	mov ar.rsc=3		// put RSE back into eager mode, pl 0
 	br.cond.sptk.many b7
 END(save_switch_stack)

@@ -647,23 +665,38 @@ dont_preserve_current_frame:
 	/*
 	 * To prevent leaking bits between the kernel and user-space,
	 * we must clear the stacked registers in the "invalid" partition here.
-	 * Not pretty, but at least it's fast (3.34 registers/cycle).
-	 * Architecturally, this loop could go at 4.67 registers/cycle, but that would
-	 * oversubscribe Itanium.
+	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
+	 * 5 registers/cycle on McKinley).
 	 */
 #	define pRecurse	p6
 #	define pReturn	p7
+#ifdef CONFIG_ITANIUM
 #	define Nregs	10
+#else
+#	define Nregs	14
+#endif
 	alloc loc0=ar.pfs,2,Nregs-2,2,0
 	shr.u loc1=r18,9		// RNaTslots <= dirtySize / (64*8) + 1
 	sub r17=r17,r18			// r17 = (physStackedSize + 8) - dirtySize
 	;;
+#if 1
+	.align 32		// see comment below about gas bug...
+#endif
 	mov ar.rsc=r19			// load ar.rsc to be used for "loadrs"
 	shladd in0=loc1,3,r17
 	mov in1=0
+#if 0
+	// gas-2.12.90 is unable to generate a stop bit after .align, which is bad,
+	// because alloc must be at the beginning of an insn-group.
+	.align 32
+#else
+	nop 0
+	nop 0
+	nop 0
+#endif
 	;;
-//	.align 32	// gas-2.11.90 is unable to generate a stop bit after .align
 rse_clear_invalid:
+#ifdef CONFIG_ITANIUM
 	// cycle 0
  { .mii
 	alloc loc0=ar.pfs,2,Nregs-2,2,0

@@ -692,9 +725,31 @@ rse_clear_invalid:
 	mov loc7=0
 (pReturn) br.ret.sptk.many b6
 }
+#else /* !CONFIG_ITANIUM */
+	alloc loc0=ar.pfs,2,Nregs-2,2,0
+	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
+	add out0=-Nregs*8,in0
+	add out1=1,in1			// increment recursion count
+	mov loc1=0
+	mov loc2=0
+	;;
+	mov loc3=0
+	mov loc4=0
+	mov loc9=0
+	mov loc5=0
+	mov loc6=0
+(pRecurse) br.call.sptk.many b6=rse_clear_invalid
+	;;
+	mov loc7=0
+	mov loc8=0
+	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
+	mov loc10=0
+	mov loc11=0
+(pReturn) br.ret.sptk.many b6
+#endif /* !CONFIG_ITANIUM */
 #	undef pRecurse
 #	undef pReturn
+	;;
 	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
 	;;
 	loadrs
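The McKinley branch of rse_clear_invalid above is easier to follow outside assembly. A rough C analogue, for illustration only (clear_invalid, nbytes, and depth are hypothetical names; the real code clears ia64 stacked registers by allocating fresh register frames, not by writing memory):

	/* Clear the "invalid" RSE partition Nregs registers at a time,
	 * recursing while more than Nregs*8 bytes remain (cmp.lt pRecurse)
	 * and unwinding once per recursion level (cmp.ne pReturn). */
	enum { NREGS = 14 };			/* 10 on Itanium, 14 on McKinley */

	static void clear_invalid(long nbytes, long depth)
	{
		volatile long locs[NREGS - 2];	/* stands in for "mov locN=0" */
		int i;

		for (i = 0; i < NREGS - 2; i++)
			locs[i] = 0;
		if (nbytes > NREGS * 8)		/* (re)curse */
			clear_invalid(nbytes - NREGS * 8, depth + 1);
		/* depth != 0 corresponds to the predicated br.ret */
	}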
arch/ia64/kernel/minstate.h

@@ -28,18 +28,19 @@
  * on interrupts.
  */
 #define MINSTATE_START_SAVE_MIN_VIRT								\
-	dep r1=-1,r1,61,3;			/* r1 = current (virtual) */			\
 (pUser)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
+	dep r1=-1,r1,61,3;			/* r1 = current (virtual) */			\
 	;;											\
-(pUser)	mov.m rARRNAT=ar.rnat;									\
 (pUser)	addl rKRBS=IA64_RBS_OFFSET,r1;		/* compute base of RBS */			\
+(pUser)	mov rARRNAT=ar.rnat;									\
 (pKern)	mov r1=sp;				/* get sp */					\
 	;;											\
-(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
+(pUser)	lfetch.fault.excl.nt1 [rKRBS];								\
 (pUser)	mov rARBSPSTORE=ar.bspstore;		/* save ar.bspstore */				\
+(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
 	;;											\
-(pKern)	addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
 (pUser)	mov ar.bspstore=rKRBS;			/* switch to kernel RBS */			\
+(pKern)	addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
 	;;											\
 (pUser)	mov r18=ar.bsp;										\
 (pUser)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\

@@ -125,51 +126,57 @@
 	;;										\
 	SAVE_IFS;									\
 	MINSTATE_START_SAVE_MIN								\
+	add r17=L1_CACHE_BYTES,r1		/* really: biggest cache-line size */	\
 	;;										\
-	mov r16=r1;				/* initialize first base pointer */	\
-	adds r17=8,r1;				/* initialize second base pointer */	\
+	st8 [r1]=rCRIPSR;			/* save cr.ipsr */			\
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;					\
+	add r16=16,r1;				/* initialize first base pointer */	\
 	;;										\
-	st8 [r16]=rCRIPSR,16;			/* save cr.ipsr */			\
-	st8 [r17]=rCRIIP,16;			/* save cr.iip */			\
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;					\
+	;;										\
+	lfetch.fault.excl.nt1 [r17];							\
+	adds r17=8,r1;				/* initialize second base pointer */	\
 (pKern)	mov r18=r0;				/* make sure r18 isn't NaT */		\
 	;;										\
+	st8 [r17]=rCRIIP,16;			/* save cr.iip */			\
 	st8 [r16]=rCRIFS,16;			/* save cr.ifs */			\
-	st8 [r17]=rARUNAT,16;			/* save ar.unat */			\
 (pUser)	sub r18=r18,rKRBS;			/* r18=RSE.ndirty*8 */			\
 	;;										\
+	st8 [r17]=rARUNAT,16;			/* save ar.unat */			\
 	st8 [r16]=rARPFS,16;			/* save ar.pfs */			\
-	st8 [r17]=rARRSC,16;			/* save ar.rsc */			\
-	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT						\
-	;;					/* avoid RAW on r16 & r17 */		\
-(pKern)	adds r16=16,r16;			/* skip over ar_rnat field */		\
-(pKern)	adds r17=16,r17;			/* skip over ar_bspstore field */	\
+	shl r18=r18,16;			/* compute ar.rsc to be used for "loadrs" */	\
+	;;										\
+	st8 [r17]=rARRSC,16;			/* save ar.rsc */			\
 (pUser)	st8 [r16]=rARRNAT,16;			/* save ar.rnat */			\
+(pKern)	adds r16=16,r16;			/* skip over ar_rnat field */		\
+	;;					/* avoid RAW on r16 & r17 */		\
 (pUser)	st8 [r17]=rARBSPSTORE,16;		/* save ar.bspstore */			\
-	;;										\
 	st8 [r16]=rARPR,16;			/* save predicates */			\
-	st8 [r17]=rB6,16;			/* save b6 */				\
-	shl r18=r18,16;			/* compute ar.rsc to be used for "loadrs" */	\
+(pKern)	adds r17=16,r17;			/* skip over ar_bspstore field */	\
 	;;										\
+	st8 [r17]=rB6,16;			/* save b6 */				\
 	st8 [r16]=r18,16;			/* save ar.rsc value for "loadrs" */	\
-	st8.spill [r17]=rR1,16;			/* save original r1 */			\
+	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT						\
 	;;										\
+.mem.offset 8,0; st8.spill [r17]=rR1,16;	/* save original r1 */			\
 .mem.offset 0,0; st8.spill [r16]=r2,16;							\
+	;;										\
 .mem.offset 8,0; st8.spill [r17]=r3,16;							\
+.mem.offset 0,0; st8.spill [r16]=r12,16;						\
 	adds r2=IA64_PT_REGS_R16_OFFSET,r1;						\
 	;;										\
-.mem.offset 0,0; st8.spill [r16]=r12,16;						\
 .mem.offset 8,0; st8.spill [r17]=r13,16;						\
+.mem.offset 0,0; st8.spill [r16]=r14,16;						\
 	cmp.eq pNonSys,pSys=r0,r0		/* initialize pSys=0, pNonSys=1 */	\
 	;;										\
-.mem.offset 0,0; st8.spill [r16]=r14,16;						\
 .mem.offset 8,0; st8.spill [r17]=r15,16;						\
+.mem.offset 0,0; st8.spill [r16]=r8,16;							\
 	dep r14=-1,r0,61,3;								\
 	;;										\
-.mem.offset 0,0; st8.spill [r16]=r8,16;							\
 .mem.offset 8,0; st8.spill [r17]=r9,16;							\
+.mem.offset 0,0; st8.spill [r16]=r10,16;						\
 	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */ \
 	;;										\
-.mem.offset 0,0; st8.spill [r16]=r10,16;						\
 .mem.offset 8,0; st8.spill [r17]=r11,16;						\
 	mov r13=IA64_KR(CURRENT);		/* establish `current' */		\
 	;;										\

@@ -190,10 +197,12 @@
  */
 #define SAVE_REST								\
 .mem.offset 0,0; st8.spill [r2]=r16,16;						\
-.mem.offset 8,0; st8.spill [r3]=r17,16;						\
 	;;									\
+.mem.offset 8,0; st8.spill [r3]=r17,16;						\
 .mem.offset 0,0; st8.spill [r2]=r18,16;						\
+	;;									\
 .mem.offset 8,0; st8.spill [r3]=r19,16;						\
+.mem.offset 0,0; st8.spill [r2]=r20,16;						\
 	;;									\
 	mov r16=ar.ccv;			/* M-unit */				\
 	movl r18=FPSR_DEFAULT		/* L-unit */				\

@@ -201,30 +210,29 @@
 	mov r17=ar.fpsr;		/* M-unit */				\
 	mov ar.fpsr=r18;		/* M-unit */				\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r20,16;						\
 .mem.offset 8,0; st8.spill [r3]=r21,16;						\
+.mem.offset 0,0; st8.spill [r2]=r22,16;						\
 	mov r18=b0;								\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r22,16;						\
 .mem.offset 8,0; st8.spill [r3]=r23,16;						\
+.mem.offset 0,0; st8.spill [r2]=r24,16;						\
 	mov r19=b7;								\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r24,16;						\
 .mem.offset 8,0; st8.spill [r3]=r25,16;						\
+.mem.offset 0,0; st8.spill [r2]=r26,16;						\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r26,16;						\
 .mem.offset 8,0; st8.spill [r3]=r27,16;						\
+.mem.offset 0,0; st8.spill [r2]=r28,16;						\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r28,16;						\
 .mem.offset 8,0; st8.spill [r3]=r29,16;						\
+.mem.offset 0,0; st8.spill [r2]=r30,16;						\
 	;;									\
-.mem.offset 0,0; st8.spill [r2]=r30,16;						\
 .mem.offset 8,0; st8.spill [r3]=r31,16;						\
-	;;									\
 	st8 [r2]=r16,16;		/* ar.ccv */				\
+	;;									\
 	st8 [r3]=r17,16;		/* ar.fpsr */				\
-	;;									\
 	st8 [r2]=r18,16;		/* b0 */				\
+	;;									\
 	st8 [r3]=r19,16+8;		/* b7 */				\
 	;;									\
 	stf.spill [r2]=f6,32;							\
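One step worth calling out in the DO_SAVE_MIN hunk above: r18 is set to ar.bsp minus the kernel RBS base, giving RSE.ndirty*8, and "shl r18=r18,16" moves that byte count into the loadrs field of the ar.rsc value saved for the later "loadrs". A minimal sketch of the arithmetic, with hypothetical names (rsc_for_loadrs, bsp, krbs):

	#include <stdint.h>

	/* Mirrors "(pUser) sub r18=r18,rKRBS" followed by "shl r18=r18,16". */
	static uint64_t rsc_for_loadrs(uint64_t bsp, uint64_t krbs)
	{
		uint64_t ndirty = bsp - krbs;	/* RSE.ndirty * 8, in bytes */
		return ndirty << 16;		/* ar.rsc.loadrs sits at bits 16 and up */
	}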
arch/ia64/lib/do_csum.S

@@ -8,9 +8,11 @@
  * in0: address of buffer to checksum (char *)
  * in1: length of the buffer (int)
  *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 02/04/08	David Mosberger <davidm@hpl.hp.com>
+ *		More cleanup and tuning.
  * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
  *		Clean up and optimize and the software pipeline, loading two
  *		back-to-back 8-byte words per loop. Clean up the initialization

@@ -71,8 +73,6 @@
 //	calculating the Internet checksum.
 //
 // NOT YET DONE:
-//	- use the lfetch instruction to augment the chances of the data being in
-//	  the cache when we need it.
 //	- Maybe another algorithm which would take care of the folding at the
 //	  end in a different manner
 //	- Work with people more knowledgeable than me on the network stack

@@ -102,10 +102,6 @@
 #define buf		in0
 #define len		in1
-#ifndef CONFIG_IA64_LOAD_LATENCY
-#define CONFIG_IA64_LOAD_LATENCY	2
-#endif
 #define LOAD_LATENCY	2	// XXX fix me
 #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)

@@ -122,45 +118,46 @@ GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
 	alloc saved_pfs=ar.pfs,2,16,1,16
-	.rotr word1[4],word2[4],result1[4],result2[4]
-	.rotp p[PIPE_DEPTH]
+	.rotr word1[4],word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
+	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
 	mov ret0=r0		// in case we have zero length
 	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
-	;;			// avoid WAW on CFM
-	mov tmp3=0x7		// a temporary mask/value
+	;;
 	add tmp1=buf,len	// last byte's address
-(p6)	br.ret.spnt.many rp	// return if true (hope we can avoid that)
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates (rotation)
+(p6)	br.ret.spnt.many rp	// return if zero or negative length
 	and firstoff=7,buf	// how many bytes off for first1 element
 	tbit.nz p15,p0=buf,0	// is buf an odd address?
 	mov hmask=-1		// intialize head mask
 	;;
-	andcm first1=buf,tmp3	// 8byte aligned down address of first1 element
+	and first1=-8,buf	// 8-byte align down address of first1 element
 	mov tmask=-1		// initialize tail mask
 	adds tmp2=-1,tmp1	// last-1
 	;;
 	and lastoff=7,tmp1	// how many bytes off for last element
-	andcm last=tmp2,tmp3	// address of word containing last byte
-	;;
-	.save pr, saved_pr
 	sub tmp1=8,lastoff	// complement to lastoff
-	mov saved_pr=pr		// preserve predicates (rotation)
+	and last=-8,tmp2	// address of word containing last byte
 	;;
 	sub tmp3=last,first1	// tmp3=distance from first1 to last
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc	// save lc
 	cmp.eq p8,p9=last,first1	// everything fits in one word?
 	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
+	and tmp1=7,tmp1		// make sure that if tmp1==8 -> tmp1=0
 	shl tmp2=firstoff,3	// number of bits
 	;;
-	and tmp1=7,tmp1		// make sure that if tmp1==8 -> tmp1=0
 (p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
 	shl tmp1=tmp1,3		// number of bits
 (p9)	adds tmp3=-8,tmp3	// effectively loaded
 	;;
 (p8)	mov lastval=r0		// we don't need lastval if first1==last
 	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
 	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save lc
 	;;
 	.body
 #define count tmp3

@@ -171,8 +168,8 @@ GLOBAL_ENTRY(do_csum)
 	;;
 	// If count is odd, finish this 8-byte word so that we can
 	// load two back-to-back 8-byte words per loop thereafter.
-	tbit.nz p10,p11=count,0		// if (count is odd)
 	and word1[0]=firstval,hmask	// and mask it as appropriate
+	tbit.nz p10,p11=count,0		// if (count is odd)
 	;;
 (p8)	mov result1[0]=word1[0]
 (p9)	add result1[0]=word1[0],word2[0]

@@ -181,9 +178,8 @@ GLOBAL_ENTRY(do_csum)
 	;;
 (p6)	adds result1[0]=1,result1[0]
 (p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
-	;;
 (p11)	br.cond.dptk .do_csum16		// if (count is even)
 	;;
 	// Here count is odd.
 	ld8 word1[1]=[first1],8		// load an 8-byte word
 	cmp.eq p9,p10=1,count		// if (count == 1)

@@ -194,11 +190,9 @@ GLOBAL_ENTRY(do_csum)
 	cmp.ltu p6,p0=result1[0],word1[1]
 	;;
 (p6)	adds result1[0]=1,result1[0]
-	;;
 (p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
 	// Fall through to caluculate the checksum, feeding result1[0] as
 	// the initial value in result1[0].
-	;;
 	//
 	// Calculate the checksum loading two 8-byte words per loop.
 	//

@@ -207,45 +201,36 @@ GLOBAL_ENTRY(do_csum)
 	shr.u count=count,1	// we do 16 bytes per loop
 	;;
 	cmp.eq p9,p10=r0,count	// if (count == 0)
-	adds count=-1,count
 	brp.loop.imp 1f,2f
 	;;
+	adds count=-1,count
 	mov ar.ec=PIPE_DEPTH
-	;;
 	mov ar.lc=count		// set lc
-	;;
 	// result1[0] must be initialized in advance.
 	mov result2[0]=r0
-	;;
 	mov pr.rot=1<<16
-	;;
 	mov carry1=r0
 	mov carry2=r0
-	;;
 	add first2=8,first1
-	;;
 (p9)	br.cond.sptk .do_csum_exit
-	;;
+	nop.m 0
+	nop.i 0
 	;;
 	.align 32
 1:
-(ELD_1)	cmp.ltu p31,p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(p32)	adds carry1=1,carry1
-(ELD_1)	cmp.ltu p47,p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(p48)	adds carry2=1,carry2
+(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
+(pC1[1])adds carry1=1,carry1
+(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
+(pC2[1])adds carry2=1,carry2
 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
 2:
-(p16)	ld8 word1[0]=[first1],16
-(p16)	ld8 word2[0]=[first2],16
+(p[0])	ld8 word1[0]=[first1],16
+(p[0])	ld8 word2[0]=[first2],16
 	br.ctop.sptk 1b
 	;;
 	// Since len is a 32-bit value, carry cannot be larger than
 	// a 64-bit value.
-(p32)	adds carry1=1,carry1	// since we miss the last one
-(p48)	adds carry2=1,carry2
+(pC1[1])adds carry1=1,carry1	// since we miss the last one
+(pC2[1])adds carry2=1,carry2
 	;;
 	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
 	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2

@@ -263,18 +248,15 @@ GLOBAL_ENTRY(do_csum)
 (p6)	adds result1[0]=1,result1[0]
 	;;
 .do_csum_exit:
-	movl tmp3=0xffffffff
-	;;
+	// XXX Fixme
 	//
 	// now fold 64 into 16 bits taking care of carry
 	// that's not very good because it has lots of sequentiality
 	//
-	and tmp1=result1[0],tmp3
+	mov tmp3=0xffff
+	zxt4 tmp1=result1[0]
 	shr.u tmp2=result1[0],32
 	;;
 	add result1[0]=tmp1,tmp2
-	shr.u tmp3=tmp3,16
 	;;
 	and tmp1=result1[0],tmp3
 	shr.u tmp2=result1[0],16
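The .do_csum_exit path's fold ("now fold 64 into 16 bits taking care of carry") repeatedly adds the upper half of the running sum back into the lower half, the standard Internet-checksum reduction. A minimal C sketch of the same computation (fold64 is an illustrative name, not a kernel symbol):

	#include <stdint.h>

	static uint16_t fold64(uint64_t sum)
	{
		sum = (sum & 0xffffffffull) + (sum >> 32);	/* 64 -> 33 bits (zxt4/shr.u/add) */
		sum = (sum & 0xffff) + (sum >> 16);		/* fold to 16 bits, re-adding */
		sum = (sum & 0xffff) + (sum >> 16);		/* each carry-out until none */
		sum = (sum & 0xffff) + (sum >> 16);		/* remains */
		return (uint16_t)sum;
	}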