Commit 0ad9500e authored by Eric Dumazet's avatar Eric Dumazet Committed by Pekka Enberg

slub: prefetch next freelist pointer in slab_alloc()

Recycling a page is a problem, since freelist link chain is hot on
cpu(s) which freed objects, and possibly very cold on cpu currently
owning slab.

Adding a prefetch of cache line containing the pointer to next object in
slab_alloc() helps a lot in many workloads, in particular on assymetric
ones (allocations done on one cpu, frees on another cpus). Added cost is
three machine instructions only.

Examples on my dual socket quad core ht machine (Intel CPU E5540
@2.53GHz) (16 logical cpus, 2 memory nodes), 64bit kernel.

Before patch :

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     327577,471718 task-clock                #   15,821 CPUs utilized            ( +-  0,64% )
        28 866 491 context-switches          #    0,088 M/sec                    ( +-  1,80% )
         1 506 929 CPU-migrations            #    0,005 M/sec                    ( +-  3,24% )
           127 151 page-faults               #    0,000 M/sec                    ( +-  0,16% )
   829 399 813 448 cycles                    #    2,532 GHz                      ( +-  0,64% )
   580 664 691 740 stalled-cycles-frontend   #   70,01% frontend cycles idle     ( +-  0,71% )
   197 431 700 448 stalled-cycles-backend    #   23,80% backend  cycles idle     ( +-  1,03% )
   503 548 648 975 instructions              #    0,61  insns per cycle
                                             #    1,15  stalled cycles per insn  ( +-  0,46% )
    95 780 068 471 branches                  #  292,389 M/sec                    ( +-  0,48% )
     1 426 407 916 branch-misses             #    1,49% of all branches          ( +-  1,35% )

      20,705679994 seconds time elapsed                                          ( +-  0,64% )

After patch :

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     286236,542804 task-clock                #   15,786 CPUs utilized            ( +-  1,32% )
        19 703 372 context-switches          #    0,069 M/sec                    ( +-  4,99% )
         1 658 249 CPU-migrations            #    0,006 M/sec                    ( +-  6,62% )
           126 776 page-faults               #    0,000 M/sec                    ( +-  0,12% )
   724 636 593 213 cycles                    #    2,532 GHz                      ( +-  1,32% )
   499 320 714 837 stalled-cycles-frontend   #   68,91% frontend cycles idle     ( +-  1,47% )
   156 555 126 809 stalled-cycles-backend    #   21,60% backend  cycles idle     ( +-  2,22% )
   463 897 792 661 instructions              #    0,64  insns per cycle
                                             #    1,08  stalled cycles per insn  ( +-  0,94% )
    87 717 352 563 branches                  #  306,451 M/sec                    ( +-  0,99% )
       941 738 280 branch-misses             #    1,07% of all branches          ( +-  3,35% )

      18,132070670 seconds time elapsed                                          ( +-  1,30% )
Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
Acked-by: default avatarChristoph Lameter <cl@linux.com>
CC: Matt Mackall <mpm@selenic.com>
CC: David Rientjes <rientjes@google.com>
CC: "Alex,Shi" <alex.shi@intel.com>
CC: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: default avatarPekka Enberg <penberg@kernel.org>
parent 42c8c99c
......@@ -269,6 +269,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return *(void **)(object + s->offset);
}
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
prefetch(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
......@@ -2309,6 +2314,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
void *next_object = get_freepointer_safe(s, object);
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
......@@ -2324,11 +2331,12 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
get_freepointer_safe(s, object), next_tid(tid)))) {
next_object, next_tid(tid)))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment