Commit 878da831 authored by David Mosberger

ia64: Add optimized ip_fast_csum() by Ken Chen and merge his cleanups
to do_csum.S.
parent d2c4281c
arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ obj-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
 	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
 	checksum.o clear_page.o csum_partial_copy.o copy_page.o \
 	copy_user.o clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
-	flush.o io.o do_csum.o \
+	flush.o io.o ip_fast_csum.o do_csum.o \
 	memcpy.o memset.o strlen.o swiotlb.o
 obj-$(CONFIG_ITANIUM) += copy_page.o
arch/ia64/lib/checksum.c
@@ -15,7 +15,7 @@
 #include <asm/byteorder.h>
 
 static inline unsigned short
-from64to16(unsigned long x)
+from64to16 (unsigned long x)
 {
 	/* add up 32-bit words for 33 bits */
 	x = (x & 0xffffffff) + (x >> 32);
@@ -32,22 +32,17 @@ from64to16(unsigned long x)
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented.
  */
-unsigned short int csum_tcpudp_magic(unsigned long saddr,
-				     unsigned long daddr,
-				     unsigned short len,
-				     unsigned short proto,
-				     unsigned int sum)
+unsigned short int
+csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
+		   unsigned short proto, unsigned int sum)
 {
-	return ~from64to16(saddr + daddr + sum +
-			   ((unsigned long) ntohs(len) << 16) +
+	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
 			   ((unsigned long) proto << 8));
 }
 
-unsigned int csum_tcpudp_nofold(unsigned long saddr,
-				unsigned long daddr,
-				unsigned short len,
-				unsigned short proto,
-				unsigned int sum)
+unsigned int
+csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
+		    unsigned short proto, unsigned int sum)
 {
 	unsigned long result;
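The pseudo-header sum above is assembled as a plain 64-bit addition and then collapsed by from64to16(), whose body is mostly elided by the hunk. For orientation, the fold works by repeatedly adding the upper half back into the lower half so no end-around carry is lost; here is a minimal C sketch of that technique (the helper name fold64to16 is illustrative, not the exact source):

/* Illustrative sketch of a 64-bit -> 16-bit ones'-complement fold,
 * the technique from64to16() applies to the pseudo-header sum above. */
static unsigned short fold64to16(unsigned long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* 64 bits -> 33 bits */
	x = (x & 0xffffffff) + (x >> 32);	/* fold the carry back in */
	x = (x & 0xffff) + (x >> 16);		/* 32 bits -> 17 bits */
	x = (x & 0xffff) + (x >> 16);		/* 17 bits -> 16 bits + carry */
	x = (x & 0xffff) + (x >> 16);		/* fold the final carry */
	return x;
}

csum_tcpudp_magic() then simply complements the folded value, which is why its whole body fits in the single return statement shown in the new version.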
@@ -65,15 +60,6 @@ unsigned int csum_tcpudp_nofold(unsigned long saddr,
 
 extern unsigned long do_csum (const unsigned char *, long);
 
-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
-{
-	return ~do_csum(iph, ihl*4);
-}
-
 /*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
@@ -86,7 +72,8 @@ unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
  *
  * it's best to have buff aligned on a 32-bit boundary
  */
-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+unsigned int
+csum_partial (const unsigned char * buff, int len, unsigned int sum)
 {
 	unsigned long result = do_csum(buff, len);
@@ -102,7 +89,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
  * this routine is used for miscellaneous IP-like checksums, mainly
  * in icmp.c
  */
-unsigned short ip_compute_csum(unsigned char * buff, int len)
+unsigned short
+ip_compute_csum (unsigned char * buff, int len)
 {
 	return ~do_csum(buff,len);
 }
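The body of csum_partial() is elided above, but its job follows directly from the comment: add the caller-supplied 32-bit sum into the 64-bit do_csum() result without losing the carry. A hedged sketch of that combine step (the _sketch name marks it as an illustration, not the exact source lines):

extern unsigned long do_csum (const unsigned char *, long);

/* Sketch: fold the incoming 32-bit "sum" into do_csum()'s 64-bit result,
 * preserving any carry out of bit 31. */
unsigned int csum_partial_sketch(const unsigned char *buff, int len, unsigned int sum)
{
	unsigned long result = do_csum(buff, len);

	result += sum;						/* may carry into bit 32 */
	result = (result & 0xffffffff) + (result >> 32);	/* fold the carry back in */
	return result;
}

ip_compute_csum() needs no such step, which is why its whole body is the single ~do_csum(buff,len) shown above.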
arch/ia64/lib/do_csum.S
@@ -11,6 +11,9 @@
  * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
  *	Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
+ *		Data locality study on the checksum buffer.
+ *		More optimization cleanup - remove excessive stop bits.
  * 02/04/08	David Mosberger <davidm@hpl.hp.com>
  *		More cleanup and tuning.
  * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
@@ -80,6 +83,12 @@
 //	type of packet or alignment we get. Like the ip_fast_csum() routine
 //	where we know we have at least 20bytes worth of data to checksum.
 // - Do a better job of handling small packets.
+// - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
+//   nfs read/write, the L1 cache hit rate is at 60% and the L2 cache hit rate is at
+//   99.8% on the data that buffer points to (partly because the checksum is often
+//   preceded by a copy_from_user()). This finding indicates that lfetch will not be
+//   beneficial since the data is already in the cache.
+//
 
 #define saved_pfs	r11
 #define hmask		r16
@@ -117,7 +126,7 @@
 GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,1,16
+	alloc saved_pfs=ar.pfs,2,16,0,16
 	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
 	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
 	mov ret0=r0		// in case we have zero length
@@ -197,22 +206,21 @@ GLOBAL_ENTRY(do_csum)
 	// Calculate the checksum loading two 8-byte words per loop.
 	//
 .do_csum16:
-	mov saved_lc=ar.lc
 	shr.u count=count,1	// we do 16 bytes per loop
 	;;
-	cmp.eq p9,p10=r0,count	// if (count == 0)
 	adds count=-1,count
 	brp.loop.imp 1f,2f
 	;;
+	cmp.eq p9,p10=r0,count	// if (count == 0)
 	mov ar.ec=PIPE_DEPTH
-	mov ar.lc=count	// set lc
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
-	mov pr.rot=1<<16
 	mov carry1=r0
 	mov carry2=r0
 	add first2=8,first1
+	;;
+	mov ar.lc=count	// set lc
+	mov pr.rot=1<<16
+	// result1[0] must be initialized in advance.
+	mov result2[0]=r0
 (p9)	br.cond.sptk .do_csum_exit
 	;;
 	.align 32
@@ -223,7 +231,7 @@ GLOBAL_ENTRY(do_csum)
 (pC2[1])adds carry2=1,carry2
 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-[2:]
+2:
 (p[0])	ld8 word1[0]=[first1],16
 (p[0])	ld8 word2[0]=[first2],16
 	br.ctop.sptk 1b
@@ -246,7 +254,6 @@ GLOBAL_ENTRY(do_csum)
 	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
 	;;
 (p6)	adds result1[0]=1,result1[0]
-	;;
 .do_csum_exit:
 	//
 	// now fold 64 into 16 bits taking care of carry
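At the C level, the software-pipelined kernel above amounts to two independent accumulators walking the buffer sixteen bytes per iteration, each counting its own end-around carries via the pC1/pC2 predicates, with the two streams merged under one more carry check just before the 64-to-16-bit fold at .do_csum_exit. The following is a rough C model of that structure for orientation only; it is not a literal translation of the rotating-register code, and csum16_model is a made-up name:

/* Rough model of the .do_csum16 loop: two 8-byte streams (first1/first2)
 * summed in parallel, carries counted separately and folded in afterwards.
 * "count" is the number of 16-byte iterations, as in the assembly. */
static unsigned long csum16_model(const unsigned long *buf, unsigned long count)
{
	unsigned long result1 = 0, result2 = 0;
	unsigned long carry1 = 0, carry2 = 0;
	unsigned long i;

	for (i = 0; i < count; i++) {
		unsigned long w1 = buf[2 * i];		/* stream 1 (first1) */
		unsigned long w2 = buf[2 * i + 1];	/* stream 2 (first2) */

		result1 += w1;
		if (result1 < w1)	/* unsigned wrap => end-around carry */
			carry1++;
		result2 += w2;
		if (result2 < w2)
			carry2++;
	}

	/* fold the counted carries back into each stream */
	result1 += carry1;
	if (result1 < carry1)
		result1++;
	result2 += carry2;
	if (result2 < carry2)
		result2++;

	/* merge the two streams, again preserving the carry */
	result1 += result2;
	if (result1 < result2)
		result1++;
	return result1;		/* handed to the 64->16 bit fold */
}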
arch/ia64/lib/ip_fast_csum.S (new file):
/*
* Optimized version of the ip_fast_csum() function
* Used for calculating IP header checksum
*
* Return: 16bit checksum, complemented
*
* Inputs:
* in0: address of buffer to checksum (char *)
* in1: length of the buffer (int)
*
* Copyright (C) 2002 Intel Corp.
* Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
*/
#include <asm/asmmacro.h>
/*
* Since we know that this function is most likely called with buf aligned on
* a 4-byte boundary and 20 bytes in length, it can execute rather quickly
* versus calling the generic version of do_csum, which has lots of overhead
* in handling various alignments and sizes.  However, due to the lack of
* constraints placed on the function's input arguments, cases with alignment
* not on a 4-byte boundary or size not equal to 20 bytes will be handled by
* the generic do_csum function.
*/
#define in0 r32
#define in1 r33
#define ret0 r8
GLOBAL_ENTRY(ip_fast_csum)
.body
cmp.ne p6,p7=5,in1 // size other than 20 byte?
and r14=3,in0 // is it aligned on 4-byte?
add r15=4,in0 // second source pointer
;;
cmp.ne.or.andcm p6,p7=r14,r0
;;
(p7) ld4 r20=[in0],8
(p7) ld4 r21=[r15],8
(p6) br.spnt .generic
;;
ld4 r22=[in0],8
ld4 r23=[r15],8
;;
ld4 r24=[in0]
add r20=r20,r21
add r22=r22,r23
;;
add r20=r20,r22
;;
add r20=r20,r24
;;
shr.u ret0=r20,16 // now need to add the carry
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16 // add carry again
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16
zxt2 r20=r20
;;
add r20=ret0,r20
;;
andcm ret0=-1,r20
.restore sp // reset frame state
br.ret.sptk.many b0
;;
.generic:
.prologue
.save ar.pfs, r35
alloc r35=ar.pfs,2,2,2,0
.save rp, r34
mov r34=b0
.body
dep.z out1=in1,2,30
mov out0=in0
;;
br.call.sptk.many b0=do_csum
;;
andcm ret0=-1,ret0
mov ar.pfs=r35
mov b0=r34
br.ret.sptk.many b0
END(ip_fast_csum)
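The header comment describes the strategy: handle the overwhelmingly common case, a 4-byte-aligned buffer with ihl == 5 (a 20-byte header with no options), entirely inline, and branch to the generic do_csum for everything else. In C, the logic the assembly implements looks roughly like the sketch below; the _sketch name is illustrative, and the real routine keeps the running sum in 64-bit registers exactly as shown in the assembly above:

extern unsigned long do_csum (const unsigned char *, long);

/* Rough C equivalent of ip_fast_csum above.  ihl is the IP header length
 * in 32-bit words, so ihl == 5 means the usual 20-byte header. */
unsigned short ip_fast_csum_sketch(unsigned char *iph, unsigned int ihl)
{
	unsigned long sum;
	unsigned int *p = (unsigned int *) iph;

	if (ihl != 5 || ((unsigned long) iph & 3) != 0)
		return ~do_csum(iph, ihl * 4);	/* fall back to the generic path */

	/* Five 32-bit words summed into a 64-bit accumulator: no carry is lost. */
	sum = (unsigned long) p[0] + p[1] + p[2] + p[3] + p[4];

	/* Fold 64 -> 16 bits, adding the carries back in, then complement. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return ~sum;
}

The three shr.u/zxt2/add pairs in the assembly correspond to the three fold steps, and the final andcm ret0=-1,r20 is the complement.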