Commit 0687a454 authored by Kevin Modzelewski

Add per-thread caches to the allocator

Now, threads will claim an entire block at a time and
put it in a thread-local cache.  In the common case they
can allocate out of this block, and only need to take
the heap lock if they run out.
parent 51cf8080
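
[Editor's note] The scheme is the usual thread-local free-list cache. Below is a minimal standalone sketch of the pattern for reference, not Pyston's code: SimpleBlock, claim_block, and the bump cursor are invented for illustration (the real allocator finds free slots with a per-block bitmap; see allocFromBlock in the diff), but the locking structure matches the commit: the global heap lock is only taken on the refill path.

    #include <cstddef>
    #include <cstdio>
    #include <mutex>
    #include <vector>

    struct SimpleBlock {
        static const size_t kObjSize = 16;  // one fixed size class for the sketch
        static const size_t kAtoms = 256;
        size_t next_free = 0;               // bump cursor; the real code scans a bitmap
        char data[kObjSize * kAtoms];
    };

    static std::mutex heap_lock;
    static std::vector<SimpleBlock*> all_blocks;  // so a collector could find every block

    // Slow path: serialize on the global lock to claim a fresh block.
    static SimpleBlock* claim_block() {
        std::lock_guard<std::mutex> guard(heap_lock);
        SimpleBlock* b = new SimpleBlock();
        all_blocks.push_back(b);
        return b;
    }

    // Fast path: allocate out of the thread's privately-owned block, no lock.
    static void* alloc16() {
        static thread_local SimpleBlock* cache = nullptr;
        if (!cache || cache->next_free == SimpleBlock::kAtoms)
            cache = claim_block();  // only here do we touch the lock
        return &cache->data[SimpleBlock::kObjSize * cache->next_free++];
    }

    int main() {
        void* p = alloc16();
        void* q = alloc16();
        printf("%p %p\n", p, q);  // two 16-byte slots from the same cached block
    }

A thread that exits with a partially used block has to hand it back to the global heap; that is what the ThreadBlockCache destructor in the diff below does.
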
@@ -109,10 +109,8 @@ static void markPhase() {
     while (void* p = stack.pop()) {
         assert(((intptr_t)p) % 8 == 0);
         GCObjectHeader* header = headerFromObject(p);
-        // printf("%p\n", p);

         if (isMarked(header)) {
-            // printf("Already marked, skipping\n");
             continue;
         }
@@ -173,7 +171,7 @@ void runCollection() {
     markPhase();
     sweepPhase();
     if (VERBOSITY("gc") >= 2)
-        printf("Collection #%d done\n", ++ncollections);
+        printf("Collection #%d done\n\n", ncollections);

     long us = _t.end();
     static StatCounter sc_us("gc_collections_us");
......
@@ -29,11 +29,6 @@ namespace gc {
 inline void* gc_alloc(size_t bytes) __attribute__((visibility("default")));
 inline void* gc_alloc(size_t bytes) {
-    // if ((++numAllocs) >= ALLOCS_PER_COLLECTION) {
-    // numAllocs = 0;
-    // runCollection();
-    //}
-
 #ifndef NVALGRIND
     // Adding a redzone will confuse the allocator, so disable it for now.
@@ -65,6 +60,8 @@ inline void* gc_alloc(size_t bytes) {
     // if (VERBOSITY()) printf("Allocated %ld bytes at [%p, %p)\n", bytes, r, (char*)r + bytes);
 #endif
+
+    // printf("Allocated %p\n", r);
     return r;
 }
......
@@ -20,6 +20,7 @@
 #include <sys/mman.h>

 #include "core/common.h"
 #include "core/util.h"
+#include "gc/gc_alloc.h"

 #ifndef NVALGRIND
@@ -147,98 +148,124 @@ static Block* alloc_block(uint64_t size, Block** prev) {
     return rtn;
 }

+static void insertIntoLL(Block** next_pointer, Block* next) {
+    assert(next_pointer);
+    assert(next);
+    assert(!next->next);
+    assert(!next->prev);
+
+    next->next = *next_pointer;
+    if (next->next)
+        next->next->prev = &next->next;
+    *next_pointer = next;
+    next->prev = next_pointer;
+}
+
+static void removeFromLL(Block* b) {
+    if (b->next)
+        b->next->prev = b->prev;
+    *b->prev = b->next;
+
+    b->next = NULL;
+    b->prev = NULL;
+}
+
 Heap::ThreadBlockCache::~ThreadBlockCache() {
     LOCK_REGION(heap->lock);

     for (int i = 0; i < NUM_BUCKETS; i++) {
-        if (cache_heads[i] == NULL)
+        Block* b = cache_heads[i];
+        if (b == NULL)
             continue;
-        assert(0);
+
+        removeFromLL(b);
+
+        // This should have been the only block in the list.
+        // Well, we could cache multiple blocks if we want, and maybe we should,
+        // but for now this routine only supports caching a single one, and would
+        // need to get updated:
+        assert(cache_heads[i] == NULL);
+
+        insertIntoLL(&heap->heads[i], b);
     }
 }

-void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
-    _collectIfNeeded(rounded_size);
-
-    LOCK_REGION(lock);
-
-    Block* cur = *prev;
-    assert(!cur || prev == cur->prev);
-    int scanned = 0;
-
-    // printf("alloc(%ld)\n", rounded_size);
-
-    // Block **full_prev = full_head;
-    while (true) {
-        // printf("cur = %p, prev = %p\n", cur, prev);
-        if (cur == NULL) {
-            Block* next = alloc_block(rounded_size, &cur->next);
-            // printf("allocated new block %p\n", next);
-            *prev = next;
-            next->prev = prev;
-            prev = &cur->next;
-            next->next = *full_head;
-            *full_head = NULL;
-            prev = full_head;
-            cur = next;
-        }
-
-        int i = 0;
-        uint64_t mask = 0;
-        for (; i < BITFIELD_ELTS; i++) {
-            mask = cur->isfree[i];
-            if (mask != 0L) {
-                break;
-            }
-        }
-
-        if (i == BITFIELD_ELTS) {
-            scanned++;
-            // printf("moving on\n");
-
-            Block* t = *prev = cur->next;
-            cur->next = NULL;
-            if (t)
-                t->prev = prev;
-            cur->prev = full_head;
-            cur->next = *full_head;
-            *full_head = cur;
-            cur = t;
-            continue;
-        }
-
-        // printf("scanned %d\n", scanned);
-        int first = __builtin_ctzll(mask);
-        assert(first < 64);
-        // printf("mask: %lx, first: %d\n", mask, first);
-        cur->isfree[i] ^= (1L << first);
-
-        int idx = first + i * 64;
-        // printf("Using index %d\n", idx);
-
-        void* rtn = &cur->atoms[idx];
-
-#ifndef NDEBUG
-        Block* b = Block::forPointer(rtn);
-        assert(b == cur);
-        int offset = (char*)rtn - (char*)b;
-        assert(offset % rounded_size == 0);
-#endif
-
-#ifndef NVALGRIND
-        // VALGRIND_MEMPOOL_ALLOC(cur, rtn, rounded_size);
-#endif
-        return rtn;
-    }
-}
+static void* allocFromBlock(Block* b) {
+    int i = 0;
+    uint64_t mask = 0;
+    for (; i < BITFIELD_ELTS; i++) {
+        mask = b->isfree[i];
+        if (mask != 0L) {
+            break;
+        }
+    }
+
+    if (i == BITFIELD_ELTS) {
+        return NULL;
+    }
+
+    int first = __builtin_ctzll(mask);
+    assert(first < 64);
+    assert(b->isfree[i] & (1L << first));
+    b->isfree[i] ^= (1L << first);
+    // printf("Marking %d:%d: %p=%lx\n", i, first, &b->isfree[i], b->isfree[i]);
+
+    int idx = first + i * 64;
+    void* rtn = &b->atoms[idx];
+    return rtn;
+}
+
+static Block* claimBlock(size_t rounded_size, Block** free_head) {
+    Block* free_block = *free_head;
+    if (free_block) {
+        removeFromLL(free_block);
+        return free_block;
+    }
+
+    return alloc_block(rounded_size, NULL);
+}
+
+void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
+    _collectIfNeeded(rounded_size);
+
+    Block** free_head = &heads[bucket_idx];
+    Block** full_head = &full_heads[bucket_idx];
+
+    ThreadBlockCache* cache = thread_caches.get();
+    Block** cache_head = &cache->cache_heads[bucket_idx];
+
+    static StatCounter sc_total("gc_total");
+    sc_total.log();
+
+    while (true) {
+        Block* cache_block = *cache_head;
+        if (cache_block) {
+            void* rtn = allocFromBlock(cache_block);
+            if (rtn)
+                return rtn;
+
+            removeFromLL(cache_block);
+        }
+
+        static StatCounter sc_fallback("gc_nocache");
+        sc_fallback.log();
+
+        LOCK_REGION(lock);
+
+        if (cache_block) {
+            insertIntoLL(full_head, cache_block);
+        }
+
+        assert(*cache_head == NULL);
+
+        // should probably be called allocBlock:
+        Block* myblock = claimBlock(rounded_size, &heads[bucket_idx]);
+        assert(myblock);
+        assert(!myblock->next);
+        assert(!myblock->prev);
+
+        insertIntoLL(cache_head, myblock);
+    }
+}
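
[Editor's note] On the list helpers this hunk adds: prev is a Block** rather than a Block*, pointing at whichever pointer currently points at the block (the list head, or the previous block's next field). That is what lets allocSmall shuttle a block between the global lists and a thread cache with the same two helpers, with no knowledge of which list the block is on. A self-contained sketch of the trick, with an invented Node type:

    #include <cassert>
    #include <cstdio>

    struct Node {
        Node* next = nullptr;
        Node** prev = nullptr;  // address of the pointer that points at us
    };

    static void insertIntoLL(Node** head, Node* n) {
        n->next = *head;
        if (n->next)
            n->next->prev = &n->next;
        *head = n;
        n->prev = head;
    }

    static void removeFromLL(Node* n) {
        if (n->next)
            n->next->prev = n->prev;
        *n->prev = n->next;  // works whether *prev is a head or a next field
        n->next = nullptr;
        n->prev = nullptr;
    }

    int main() {
        Node a, b;
        Node* head = nullptr;
        insertIntoLL(&head, &a);
        insertIntoLL(&head, &b);  // list: head -> b -> a
        removeFromLL(&b);         // unlink b without passing the head
        assert(head == &a && a.prev == &head && !a.next);
        printf("ok\n");
    }

This is the same representation as Linux's hlist: removal is O(1) and the head needs no sentinel node.
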
@@ -392,8 +419,20 @@ void Heap::freeUnmarked() {
     for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
         bytes_freed += freeChain(heads[bidx]);
         bytes_freed += freeChain(full_heads[bidx]);
+
+        while (Block* b = full_heads[bidx]) {
+            // these should be added at the end...
+            removeFromLL(b);
+            insertIntoLL(&heads[bidx], b);
+        }
     }

+    thread_caches.forEachValue([&bytes_freed](ThreadBlockCache* cache) {
+        for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
+            bytes_freed += freeChain(cache->cache_heads[bidx]);
+        }
+    });
+
     LargeObj* cur = large_head;
     while (cur) {
         void* p = cur->data;
......
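
[Editor's note] For the sweep above to be sound, the collector must be able to reach blocks parked in other threads' caches, since those blocks sit on no global list; that is what the thread_caches.forEachValue walk is for. A sketch, with invented names, of the kind of per-thread registry this implies (Pyston's actual thread_caches may be implemented differently, e.g. over pthread keys):

    #include <functional>
    #include <mutex>
    #include <thread>
    #include <unordered_map>

    template <class T> class PerThreadRegistry {
    public:
        // Lazily create this thread's value; the lock only guards the map.
        T* get() {
            std::lock_guard<std::mutex> g(lock_);
            T*& slot = values_[std::this_thread::get_id()];
            if (!slot)
                slot = new T();
            return slot;
        }

        // Visit every thread's value; assumes mutators are stopped, as during
        // a stop-the-world collection.
        void forEachValue(std::function<void(T*)> f) {
            std::lock_guard<std::mutex> g(lock_);
            for (auto& kv : values_)
                f(kv.second);
        }

    private:
        std::mutex lock_;
        std::unordered_map<std::thread::id, T*> values_;
    };

This sketch never reclaims entries for dead threads; the ThreadBlockCache destructor in the diff above (which returns the cached block to heap->heads) is exactly the cleanup that has to run when a thread goes away.
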
@@ -80,7 +80,7 @@ private:
     Block* full_heads[NUM_BUCKETS];
     LargeObj* large_head = NULL;

-    void* allocSmall(size_t rounded_size, Block** head, Block** full_head);
+    void* allocSmall(size_t rounded_size, int bucket_idx);
     void* allocLarge(size_t bytes);

     // DS_DEFINE_MUTEX(lock);
@@ -106,16 +106,16 @@ public:
         void* rtn;
         // assert(bytes >= 16);
         if (bytes <= 16)
-            rtn = allocSmall(16, &heads[0], &full_heads[0]);
+            rtn = allocSmall(16, 0);
         else if (bytes <= 32)
-            rtn = allocSmall(32, &heads[1], &full_heads[1]);
+            rtn = allocSmall(32, 1);
         else if (bytes > sizes[NUM_BUCKETS - 1])
             rtn = allocLarge(bytes);
         else {
             rtn = NULL;
             for (int i = 2; i < NUM_BUCKETS; i++) {
                 if (sizes[i] >= bytes) {
-                    rtn = allocSmall(sizes[i], &heads[i], &full_heads[i]);
+                    rtn = allocSmall(sizes[i], i);
                     break;
                 }
             }
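
[Editor's note] With a bucket index now passed instead of two list heads, the mapping from request size to bucket is worth spelling out. The selection logic above as a standalone function; the sizes[] table here is an illustrative assumption, not necessarily Pyston's actual table:

    #include <cstddef>
    #include <cstdio>

    static const size_t sizes[] = { 16, 32, 48, 64, 96, 128, 192, 256, 384, 512 };
    static const int NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);

    // Returns the bucket index, or -1 if the request goes to allocLarge().
    static int bucketFor(size_t bytes) {
        if (bytes <= 16)
            return 0;
        if (bytes <= 32)
            return 1;
        if (bytes > sizes[NUM_BUCKETS - 1])
            return -1;
        for (int i = 2; i < NUM_BUCKETS; i++) {
            if (sizes[i] >= bytes)
                return i;
        }
        return -1;  // unreachable: covered by the range check above
    }

    int main() {
        printf("%d %d %d\n", bucketFor(8), bucketFor(100), bucketFor(4096));
        // prints: 0 5 -1  (with the assumed table above)
    }
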
@@ -129,7 +129,9 @@ public:
     void free(void* ptr);

     // not thread safe:
     void* getAllocationFromInteriorPointer(void* ptr);
+
+    // not thread safe:
     void freeUnmarked();
 };
......