Commit 878da831 authored by David Mosberger

ia64: Add optimized ip_fast_csum() by Ken Chen and merge his cleanups
to do_csum.S.
parent d2c4281c
arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ obj-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
 	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
 	checksum.o clear_page.o csum_partial_copy.o copy_page.o \
 	copy_user.o clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
-	flush.o io.o do_csum.o \
+	flush.o io.o ip_fast_csum.o do_csum.o \
 	memcpy.o memset.o strlen.o swiotlb.o
 obj-$(CONFIG_ITANIUM) += copy_page.o
arch/ia64/lib/checksum.c
@@ -15,7 +15,7 @@
 #include <asm/byteorder.h>
 
 static inline unsigned short
-from64to16(unsigned long x)
+from64to16 (unsigned long x)
 {
 	/* add up 32-bit words for 33 bits */
 	x = (x & 0xffffffff) + (x >> 32);
@@ -32,22 +32,17 @@ from64to16(unsigned long x)
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented.
  */
-unsigned short int csum_tcpudp_magic(unsigned long saddr,
-				     unsigned long daddr,
-				     unsigned short len,
-				     unsigned short proto,
-				     unsigned int sum)
+unsigned short int
+csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
+		   unsigned short proto, unsigned int sum)
 {
-	return ~from64to16(saddr + daddr + sum +
-			   ((unsigned long) ntohs(len) << 16) +
+	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
 			   ((unsigned long) proto << 8));
 }
 
-unsigned int csum_tcpudp_nofold(unsigned long saddr,
-				unsigned long daddr,
-				unsigned short len,
-				unsigned short proto,
-				unsigned int sum)
+unsigned int
+csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
+		    unsigned short proto, unsigned int sum)
 {
 	unsigned long result;
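The pseudo-header sum above is assembled as a plain 64-bit addition and then collapsed by from64to16(), whose body is mostly elided by the hunk. For orientation, the fold works by repeatedly adding the upper half back into the lower half so no end-around carry is lost; here is a minimal C sketch of that technique (the helper name fold64to16 is illustrative, not the exact source):

/* Illustrative sketch of a 64-bit -> 16-bit ones'-complement fold,
 * the technique from64to16() applies to the pseudo-header sum above. */
static unsigned short fold64to16(unsigned long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* 64 bits -> 33 bits */
	x = (x & 0xffffffff) + (x >> 32);	/* fold the carry back in */
	x = (x & 0xffff) + (x >> 16);		/* 32 bits -> 17 bits */
	x = (x & 0xffff) + (x >> 16);		/* 17 bits -> 16 bits + carry */
	x = (x & 0xffff) + (x >> 16);		/* fold the final carry */
	return x;
}

csum_tcpudp_magic() then simply complements the folded value, which is why its whole body fits in the single return statement shown in the new version.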
@@ -65,15 +60,6 @@ unsigned int csum_tcpudp_nofold(unsigned long saddr,
 
 extern unsigned long do_csum (const unsigned char *, long);
 
-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
-{
-	return ~do_csum(iph, ihl*4);
-}
-
 /*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
@@ -86,7 +72,8 @@ unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
  *
  * it's best to have buff aligned on a 32-bit boundary
  */
-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+unsigned int
+csum_partial (const unsigned char * buff, int len, unsigned int sum)
 {
 	unsigned long result = do_csum(buff, len);
@@ -102,7 +89,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
  * this routine is used for miscellaneous IP-like checksums, mainly
  * in icmp.c
  */
-unsigned short ip_compute_csum(unsigned char * buff, int len)
+unsigned short
+ip_compute_csum (unsigned char * buff, int len)
 {
 	return ~do_csum(buff,len);
 }
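The body of csum_partial() is elided above, but its job follows directly from the comment: add the caller-supplied 32-bit sum into the 64-bit do_csum() result without losing the carry. A hedged sketch of that combine step (the _sketch name marks it as an illustration, not the exact source lines):

extern unsigned long do_csum (const unsigned char *, long);

/* Sketch: fold the incoming 32-bit "sum" into do_csum()'s 64-bit result,
 * preserving any carry out of bit 31. */
unsigned int csum_partial_sketch(const unsigned char *buff, int len, unsigned int sum)
{
	unsigned long result = do_csum(buff, len);

	result += sum;						/* may carry into bit 32 */
	result = (result & 0xffffffff) + (result >> 32);	/* fold the carry back in */
	return result;
}

ip_compute_csum() needs no such step, which is why its whole body is the single ~do_csum(buff,len) shown above.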
arch/ia64/lib/do_csum.S
@@ -11,6 +11,9 @@
  * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
  *	Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
+ *		Data locality study on the checksum buffer.
+ *		More optimization cleanup - remove excessive stop bits.
  * 02/04/08	David Mosberger <davidm@hpl.hp.com>
  *		More cleanup and tuning.
  * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
@@ -80,6 +83,12 @@
 //	type of packet or alignment we get. Like the ip_fast_csum() routine
 //	where we know we have at least 20bytes worth of data to checksum.
 // - Do a better job of handling small packets.
+// - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
+//   nfs read/write, the L1 cache hit rate is at 60% and the L2 cache hit rate is at
+//   99.8% on the data that buffer points to (partly because the checksum is often
+//   preceded by a copy_from_user()). This finding indicates that lfetch will not be
+//   beneficial since the data is already in the cache.
+//
 
 #define saved_pfs	r11
 #define hmask		r16
@@ -117,7 +126,7 @@
 GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,1,16
+	alloc saved_pfs=ar.pfs,2,16,0,16
 	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
 	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
 	mov ret0=r0		// in case we have zero length
@@ -197,22 +206,21 @@ GLOBAL_ENTRY(do_csum)
 	// Calculate the checksum loading two 8-byte words per loop.
 	//
 .do_csum16:
-	mov saved_lc=ar.lc
 	shr.u count=count,1	// we do 16 bytes per loop
 	;;
-	cmp.eq p9,p10=r0,count	// if (count == 0)
 	adds count=-1,count
 	brp.loop.imp 1f,2f
 	;;
+	cmp.eq p9,p10=r0,count	// if (count == 0)
 	mov ar.ec=PIPE_DEPTH
-	mov ar.lc=count	// set lc
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
-	mov pr.rot=1<<16
 	mov carry1=r0
 	mov carry2=r0
 	add first2=8,first1
+	;;
+	mov ar.lc=count	// set lc
+	mov pr.rot=1<<16
+	// result1[0] must be initialized in advance.
+	mov result2[0]=r0
 (p9)	br.cond.sptk .do_csum_exit
 	;;
 	.align 32
@@ -223,7 +231,7 @@ GLOBAL_ENTRY(do_csum)
 (pC2[1])adds carry2=1,carry2
 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-[2:]
+2:
 (p[0])	ld8 word1[0]=[first1],16
 (p[0])	ld8 word2[0]=[first2],16
 	br.ctop.sptk 1b
@@ -246,7 +254,6 @@ GLOBAL_ENTRY(do_csum)
 	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
 	;;
 (p6)	adds result1[0]=1,result1[0]
-	;;
 .do_csum_exit:
 	//
 	// now fold 64 into 16 bits taking care of carry
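At the C level, the software-pipelined kernel above amounts to two independent accumulators walking the buffer sixteen bytes per iteration, each counting its own end-around carries via the pC1/pC2 predicates, with the two streams merged under one more carry check just before the 64-to-16-bit fold at .do_csum_exit. The following is a rough C model of that structure for orientation only; it is not a literal translation of the rotating-register code, and csum16_model is a made-up name:

/* Rough model of the .do_csum16 loop: two 8-byte streams (first1/first2)
 * summed in parallel, carries counted separately and folded in afterwards.
 * "count" is the number of 16-byte iterations, as in the assembly. */
static unsigned long csum16_model(const unsigned long *buf, unsigned long count)
{
	unsigned long result1 = 0, result2 = 0;
	unsigned long carry1 = 0, carry2 = 0;
	unsigned long i;

	for (i = 0; i < count; i++) {
		unsigned long w1 = buf[2 * i];		/* stream 1 (first1) */
		unsigned long w2 = buf[2 * i + 1];	/* stream 2 (first2) */

		result1 += w1;
		if (result1 < w1)	/* unsigned wrap => end-around carry */
			carry1++;
		result2 += w2;
		if (result2 < w2)
			carry2++;
	}

	/* fold the counted carries back into each stream */
	result1 += carry1;
	if (result1 < carry1)
		result1++;
	result2 += carry2;
	if (result2 < carry2)
		result2++;

	/* merge the two streams, again preserving the carry */
	result1 += result2;
	if (result1 < result2)
		result1++;
	return result1;		/* handed to the 64->16 bit fold */
}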
arch/ia64/lib/ip_fast_csum.S (new file):
/*
* Optimized version of the ip_fast_csum() function
* Used for calculating IP header checksum
*
* Return: 16bit checksum, complemented
*
* Inputs:
* in0: address of buffer to checksum (char *)
* in1: length of the buffer (int)
*
* Copyright (C) 2002 Intel Corp.
* Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
*/
#include <asm/asmmacro.h>
/*
* Since we know that this function is most likely called with buf aligned on
* a 4-byte boundary and 20 bytes in length, it can execute rather quickly
* versus calling the generic version of do_csum, which has lots of overhead
* in handling various alignments and sizes.  However, due to the lack of
* constraints placed on the function's input arguments, cases with alignment
* not on a 4-byte boundary or size not equal to 20 bytes will be handled by
* the generic do_csum function.
*/
#define in0 r32
#define in1 r33
#define ret0 r8
GLOBAL_ENTRY(ip_fast_csum)
.body
cmp.ne p6,p7=5,in1 // size other than 20 byte?
and r14=3,in0 // is it aligned on 4-byte?
add r15=4,in0 // second source pointer
;;
cmp.ne.or.andcm p6,p7=r14,r0
;;
(p7) ld4 r20=[in0],8
(p7) ld4 r21=[r15],8
(p6) br.spnt .generic
;;
ld4 r22=[in0],8
ld4 r23=[r15],8
;;
ld4 r24=[in0]
add r20=r20,r21
add r22=r22,r23
;;
add r20=r20,r22
;;
add r20=r20,r24
;;
shr.u ret0=r20,16 // now need to add the carry
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16 // add carry again
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16
zxt2 r20=r20
;;
add r20=ret0,r20
;;
andcm ret0=-1,r20
.restore sp // reset frame state
br.ret.sptk.many b0
;;
.generic:
.prologue
.save ar.pfs, r35
alloc r35=ar.pfs,2,2,2,0
.save rp, r34
mov r34=b0
.body
dep.z out1=in1,2,30
mov out0=in0
;;
br.call.sptk.many b0=do_csum
;;
andcm ret0=-1,ret0
mov ar.pfs=r35
mov b0=r34
br.ret.sptk.many b0
END(ip_fast_csum)
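The header comment describes the strategy: handle the overwhelmingly common case, a 4-byte-aligned buffer with ihl == 5 (a 20-byte header with no options), entirely inline, and branch to the generic do_csum for everything else. In C, the logic the assembly implements looks roughly like the sketch below; the _sketch name is illustrative, and the real routine keeps the running sum in 64-bit registers exactly as shown in the assembly above:

extern unsigned long do_csum (const unsigned char *, long);

/* Rough C equivalent of ip_fast_csum above.  ihl is the IP header length
 * in 32-bit words, so ihl == 5 means the usual 20-byte header. */
unsigned short ip_fast_csum_sketch(unsigned char *iph, unsigned int ihl)
{
	unsigned long sum;
	unsigned int *p = (unsigned int *) iph;

	if (ihl != 5 || ((unsigned long) iph & 3) != 0)
		return ~do_csum(iph, ihl * 4);	/* fall back to the generic path */

	/* Five 32-bit words summed into a 64-bit accumulator: no carry is lost. */
	sum = (unsigned long) p[0] + p[1] + p[2] + p[3] + p[4];

	/* Fold 64 -> 16 bits, adding the carries back in, then complement. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return ~sum;
}

The three shr.u/zxt2/add pairs in the assembly correspond to the three fold steps, and the final andcm ret0=-1,r20 is the complement.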