Andrew Morton authored (commit ed109bc5)
From: Manfred Spraul <manfred@colorfullife.com>

The i386 memmove implementation is not optimized: it uses movsb, which is far slower than movsd. The optimization is trivial: if dest is less than source, then call memcpy() (a C sketch of the idea follows the profiles below).

markw tried it on a 4xXeon with dbt2; it saved around 300 million cpu ticks in cache_flusharray():

oprofile, GLOBAL_POWER_EVENTS, count 100k

Before:

c0144ed1 <cache_flusharray>: /* cache_flusharray total: 21823 0.0165 */
     6 4.5e-06 :c0144f8e:  cmp    %esi,%ebx
    11 8.3e-06 :c0144f90:  jae    c0144f9e <cache_flusharray+0xcd>
     3 2.3e-06 :c0144f92:  mov    %ebx,%edi
  7305 0.0055  :c0144f94:  repz movsb %ds:(%esi),%es:(%edi)
   201 1.5e-04 :c0144f96:  add    $0x10,%esp

After:

c0144f1d <cache_flusharray>: /* cache_flusharray total: 17959 0.0136 */
  1270 9.6e-04 :c0144f1d:  push   %ebp
[snip]
     6 4.6e-06 :c0144fdc:  cmp    %esi,%ebx
    13 9.9e-06 :c0144fde:  jae    c0145000 <cache_flusharray+0xe3>
     2 1.5e-06 :c0144fe0:  mov    %edx,%eax
     1 7.6e-07 :c0144fe2:  mov    %ebx,%edi
    11 8.4e-06 :c0144fe4:  shr    $0x2,%eax
     1 7.6e-07 :c0144fe7:  mov    %eax,%ecx
  4129 0.0031  :c0144fe9:  repz movsl %ds:(%esi),%es:(%edi)
   261 2.0e-04 :c0144feb:  test   $0x2,%dl
    27 2.1e-05 :c0144fee:  je     c0144ff2 <cache_flusharray+0xd5>
              :c0144ff0:  movsw  %ds:(%esi),%es:(%edi)
    95 7.2e-05 :c0144ff2:  test   $0x1,%dl
    96 7.3e-05 :c0144ff5:  je     c0144ff8 <cache_flusharray+0xdb>
              :c0144ff7:  movsb  %ds:(%esi),%es:(%edi)
   121 9.2e-05 :c0144ff8:  add    $0x1c,%esp
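
For illustration, a minimal C sketch of the idea described above. This is not the actual code changed by the patch; my_memmove() and the byte-wise backward loop are illustrative only, and the sketch assumes the platform memcpy() copies forwards (as the kernel's i386 memcpy does, using movsl):

/*
 * Sketch only: when dest is below src, copying forwards is safe even
 * if the regions overlap, so the fast word-sized memcpy() can be used.
 * Otherwise copy backwards, one byte at a time, to stay correct.
 */
#include <stddef.h>
#include <string.h>

void *my_memmove(void *dest, const void *src, size_t n)
{
	char *d = dest;
	const char *s = src;

	if (d < s)
		return memcpy(dest, src, n);	/* forward, word-sized copy */

	/* dest >= src: copy from the end to handle overlap */
	while (n--)
		d[n] = s[n];
	return dest;
}

The win in the profile comes entirely from the first branch: the hot cache_flusharray() path moves non-overlapping (or dest-below-src) ranges, so it now runs through repz movsl instead of repz movsb.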