Commit 0292c497 authored by Bob Pearson's avatar Bob Pearson Committed by Linus Torvalds

crc32: optimize loop counter for x86

Add two changes that improve the performance of x86 systems

1. replace main loop with incrementing counter this change improves
   the performance of the selftest by about 5-6% on Nehalem CPUs.  The
   apparent reason is that the compiler can use the loop index to perform
   an indexed memory access.  This is reported to make the performance of
   PowerPC CPUs to get worse.

2. replace the rem_len loop with incrementing counter this change
   improves the performance of the selftest, which has more than the usual
   number of occurances, by about 1-2% on x86 CPUs.  In actual work loads
   the length is most often a multiple of 4 bytes and this code does not
   get executed as often if at all.  Again this change is reported to make
   the performance of PowerPC get worse.

[djwong@us.ibm.com: Minor changelog tweaks]
Signed-off-by: default avatarBob Pearson <rpearson@systemfabricworks.com>
Signed-off-by: default avatarDarrick J. Wong <djwong@us.ibm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 324eb0f1
...@@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) ...@@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
# endif # endif
const u32 *b; const u32 *b;
size_t rem_len; size_t rem_len;
# ifdef CONFIG_X86
size_t i;
# endif
const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
u32 q; u32 q;
...@@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) ...@@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
# endif # endif
b = (const u32 *)buf; b = (const u32 *)buf;
# ifdef CONFIG_X86
--b;
for (i = 0; i < len; i++) {
# else
for (--b; len; --len) { for (--b; len; --len) {
# endif
q = crc ^ *++b; /* use pre increment for speed */ q = crc ^ *++b; /* use pre increment for speed */
# if CRC_LE_BITS == 32 # if CRC_LE_BITS == 32
crc = DO_CRC4; crc = DO_CRC4;
...@@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) ...@@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
/* And the last few bytes */ /* And the last few bytes */
if (len) { if (len) {
u8 *p = (u8 *)(b + 1) - 1; u8 *p = (u8 *)(b + 1) - 1;
# ifdef CONFIG_X86
for (i = 0; i < len; i++)
DO_CRC(*++p); /* use pre increment for speed */
# else
do { do {
DO_CRC(*++p); /* use pre increment for speed */ DO_CRC(*++p); /* use pre increment for speed */
} while (--len); } while (--len);
# endif
} }
return crc; return crc;
#undef DO_CRC #undef DO_CRC
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment