Commit 0687a454 authored by Kevin Modzelewski

Add per-thread caches to the allocator

Now, threads will claim an entire block at a time and
put it in a thread-local cache.  In the common case they
can allocate out of this block, and only need to take
the heap lock if they run out.
parent 51cf8080
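
[Editor's note] The scheme is the usual thread-local free-list cache. Below is a minimal standalone sketch of the pattern for reference, not Pyston's code: SimpleBlock, claim_block, and the bump cursor are invented for illustration (the real allocator finds free slots with a per-block bitmap; see allocFromBlock in the diff), but the locking structure matches the commit: the global heap lock is only taken on the refill path.

    #include <cstddef>
    #include <cstdio>
    #include <mutex>
    #include <vector>

    struct SimpleBlock {
        static const size_t kObjSize = 16;  // one fixed size class for the sketch
        static const size_t kAtoms = 256;
        size_t next_free = 0;               // bump cursor; the real code scans a bitmap
        char data[kObjSize * kAtoms];
    };

    static std::mutex heap_lock;
    static std::vector<SimpleBlock*> all_blocks;  // so a collector could find every block

    // Slow path: serialize on the global lock to claim a fresh block.
    static SimpleBlock* claim_block() {
        std::lock_guard<std::mutex> guard(heap_lock);
        SimpleBlock* b = new SimpleBlock();
        all_blocks.push_back(b);
        return b;
    }

    // Fast path: allocate out of the thread's privately-owned block, no lock.
    static void* alloc16() {
        static thread_local SimpleBlock* cache = nullptr;
        if (!cache || cache->next_free == SimpleBlock::kAtoms)
            cache = claim_block();  // only here do we touch the lock
        return &cache->data[SimpleBlock::kObjSize * cache->next_free++];
    }

    int main() {
        void* p = alloc16();
        void* q = alloc16();
        printf("%p %p\n", p, q);  // two 16-byte slots from the same cached block
    }

A thread that exits with a partially used block has to hand it back to the global heap; that is what the ThreadBlockCache destructor in the diff below does.
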
@@ -109,10 +109,8 @@ static void markPhase() {
     while (void* p = stack.pop()) {
         assert(((intptr_t)p) % 8 == 0);
         GCObjectHeader* header = headerFromObject(p);
-        // printf("%p\n", p);

         if (isMarked(header)) {
-            // printf("Already marked, skipping\n");
             continue;
         }
@@ -173,7 +171,7 @@ void runCollection() {
     markPhase();
     sweepPhase();
     if (VERBOSITY("gc") >= 2)
-        printf("Collection #%d done\n", ++ncollections);
+        printf("Collection #%d done\n\n", ncollections);

     long us = _t.end();
     static StatCounter sc_us("gc_collections_us");
......
@@ -29,11 +29,6 @@ namespace gc {
 inline void* gc_alloc(size_t bytes) __attribute__((visibility("default")));
 inline void* gc_alloc(size_t bytes) {
-    // if ((++numAllocs) >= ALLOCS_PER_COLLECTION) {
-    // numAllocs = 0;
-    // runCollection();
-    //}
-
 #ifndef NVALGRIND
     // Adding a redzone will confuse the allocator, so disable it for now.
@@ -65,6 +60,8 @@ inline void* gc_alloc(size_t bytes) {
     // if (VERBOSITY()) printf("Allocated %ld bytes at [%p, %p)\n", bytes, r, (char*)r + bytes);
 #endif
+
+    // printf("Allocated %p\n", r);
     return r;
 }
......
@@ -20,6 +20,7 @@
 #include <sys/mman.h>

 #include "core/common.h"
 #include "core/util.h"
+#include "gc/gc_alloc.h"

 #ifndef NVALGRIND
@@ -147,98 +148,124 @@ static Block* alloc_block(uint64_t size, Block** prev) {
     return rtn;
 }

+static void insertIntoLL(Block** next_pointer, Block* next) {
+    assert(next_pointer);
+    assert(next);
+    assert(!next->next);
+    assert(!next->prev);
+
+    next->next = *next_pointer;
+    if (next->next)
+        next->next->prev = &next->next;
+    *next_pointer = next;
+    next->prev = next_pointer;
+}
+
+static void removeFromLL(Block* b) {
+    if (b->next)
+        b->next->prev = b->prev;
+    *b->prev = b->next;
+
+    b->next = NULL;
+    b->prev = NULL;
+}
+
 Heap::ThreadBlockCache::~ThreadBlockCache() {
     LOCK_REGION(heap->lock);

     for (int i = 0; i < NUM_BUCKETS; i++) {
-        if (cache_heads[i] == NULL)
+        Block* b = cache_heads[i];
+        if (b == NULL)
             continue;
-        assert(0);
+
+        removeFromLL(b);
+
+        // This should have been the only block in the list.
+        // Well, we could cache multiple blocks if we want, and maybe we should,
+        // but for now this routine only supports caching a single one, and would
+        // need to get updated:
+        assert(cache_heads[i] == NULL);
+
+        insertIntoLL(&heap->heads[i], b);
     }
 }

-void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
-    _collectIfNeeded(rounded_size);
-
-    LOCK_REGION(lock);
-
-    Block* cur = *prev;
-    assert(!cur || prev == cur->prev);
-    int scanned = 0;
-
-    // printf("alloc(%ld)\n", rounded_size);
-
-    // Block **full_prev = full_head;
-    while (true) {
-        // printf("cur = %p, prev = %p\n", cur, prev);
-        if (cur == NULL) {
-            Block* next = alloc_block(rounded_size, &cur->next);
-            // printf("allocated new block %p\n", next);
-            *prev = next;
-            next->prev = prev;
-            prev = &cur->next;
-            next->next = *full_head;
-            *full_head = NULL;
-            prev = full_head;
-            cur = next;
-        }
-
-        int i = 0;
-        uint64_t mask = 0;
-        for (; i < BITFIELD_ELTS; i++) {
-            mask = cur->isfree[i];
-            if (mask != 0L) {
-                break;
-            }
-        }
-
-        if (i == BITFIELD_ELTS) {
-            scanned++;
-            // printf("moving on\n");
-
-            Block* t = *prev = cur->next;
-            cur->next = NULL;
-            if (t)
-                t->prev = prev;
-            cur->prev = full_head;
-            cur->next = *full_head;
-            *full_head = cur;
-            cur = t;
-            continue;
-        }
-
-        // printf("scanned %d\n", scanned);
-        int first = __builtin_ctzll(mask);
-        assert(first < 64);
-        // printf("mask: %lx, first: %d\n", mask, first);
-        cur->isfree[i] ^= (1L << first);
-
-        int idx = first + i * 64;
-        // printf("Using index %d\n", idx);
-
-        void* rtn = &cur->atoms[idx];
-
-#ifndef NDEBUG
-        Block* b = Block::forPointer(rtn);
-        assert(b == cur);
-        int offset = (char*)rtn - (char*)b;
-        assert(offset % rounded_size == 0);
-#endif
-
-#ifndef NVALGRIND
-        // VALGRIND_MEMPOOL_ALLOC(cur, rtn, rounded_size);
-#endif
-        return rtn;
-    }
-}
+static void* allocFromBlock(Block* b) {
+    int i = 0;
+    uint64_t mask = 0;
+    for (; i < BITFIELD_ELTS; i++) {
+        mask = b->isfree[i];
+        if (mask != 0L) {
+            break;
+        }
+    }
+
+    if (i == BITFIELD_ELTS) {
+        return NULL;
+    }
+
+    int first = __builtin_ctzll(mask);
+    assert(first < 64);
+    assert(b->isfree[i] & (1L << first));
+    b->isfree[i] ^= (1L << first);
+    // printf("Marking %d:%d: %p=%lx\n", i, first, &b->isfree[i], b->isfree[i]);
+
+    int idx = first + i * 64;
+    void* rtn = &b->atoms[idx];
+    return rtn;
+}
+
+static Block* claimBlock(size_t rounded_size, Block** free_head) {
+    Block* free_block = *free_head;
+    if (free_block) {
+        removeFromLL(free_block);
+        return free_block;
+    }
+
+    return alloc_block(rounded_size, NULL);
+}
+
+void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
+    _collectIfNeeded(rounded_size);
+
+    Block** free_head = &heads[bucket_idx];
+    Block** full_head = &full_heads[bucket_idx];
+
+    ThreadBlockCache* cache = thread_caches.get();
+    Block** cache_head = &cache->cache_heads[bucket_idx];
+
+    static StatCounter sc_total("gc_total");
+    sc_total.log();
+
+    while (true) {
+        Block* cache_block = *cache_head;
+        if (cache_block) {
+            void* rtn = allocFromBlock(cache_block);
+            if (rtn)
+                return rtn;
+
+            removeFromLL(cache_block);
+        }
+
+        static StatCounter sc_fallback("gc_nocache");
+        sc_fallback.log();
+
+        LOCK_REGION(lock);
+
+        if (cache_block) {
+            insertIntoLL(full_head, cache_block);
+        }
+
+        assert(*cache_head == NULL);
+
+        // should probably be called allocBlock:
+        Block* myblock = claimBlock(rounded_size, &heads[bucket_idx]);
+        assert(myblock);
+        assert(!myblock->next);
+        assert(!myblock->prev);
+
+        insertIntoLL(cache_head, myblock);
+    }
+}
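
[Editor's note] On the list helpers this hunk adds: prev is a Block** rather than a Block*, pointing at whichever pointer currently points at the block (the list head, or the previous block's next field). That is what lets allocSmall shuttle a block between the global lists and a thread cache with the same two helpers, with no knowledge of which list the block is on. A self-contained sketch of the trick, with an invented Node type:

    #include <cassert>
    #include <cstdio>

    struct Node {
        Node* next = nullptr;
        Node** prev = nullptr;  // address of the pointer that points at us
    };

    static void insertIntoLL(Node** head, Node* n) {
        n->next = *head;
        if (n->next)
            n->next->prev = &n->next;
        *head = n;
        n->prev = head;
    }

    static void removeFromLL(Node* n) {
        if (n->next)
            n->next->prev = n->prev;
        *n->prev = n->next;  // works whether *prev is a head or a next field
        n->next = nullptr;
        n->prev = nullptr;
    }

    int main() {
        Node a, b;
        Node* head = nullptr;
        insertIntoLL(&head, &a);
        insertIntoLL(&head, &b);  // list: head -> b -> a
        removeFromLL(&b);         // unlink b without passing the head
        assert(head == &a && a.prev == &head && !a.next);
        printf("ok\n");
    }

This is the same representation as Linux's hlist: removal is O(1) and the head needs no sentinel node.
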
@@ -392,8 +419,20 @@ void Heap::freeUnmarked() {
     for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
         bytes_freed += freeChain(heads[bidx]);
         bytes_freed += freeChain(full_heads[bidx]);
+
+        while (Block* b = full_heads[bidx]) {
+            // these should be added at the end...
+            removeFromLL(b);
+            insertIntoLL(&heads[bidx], b);
+        }
     }

+    thread_caches.forEachValue([&bytes_freed](ThreadBlockCache* cache) {
+        for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
+            bytes_freed += freeChain(cache->cache_heads[bidx]);
+        }
+    });
+
     LargeObj* cur = large_head;
     while (cur) {
         void* p = cur->data;
......
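
[Editor's note] For the sweep above to be sound, the collector must be able to reach blocks parked in other threads' caches, since those blocks sit on no global list; that is what the thread_caches.forEachValue walk is for. A sketch, with invented names, of the kind of per-thread registry this implies (Pyston's actual thread_caches may be implemented differently, e.g. over pthread keys):

    #include <functional>
    #include <mutex>
    #include <thread>
    #include <unordered_map>

    template <class T> class PerThreadRegistry {
    public:
        // Lazily create this thread's value; the lock only guards the map.
        T* get() {
            std::lock_guard<std::mutex> g(lock_);
            T*& slot = values_[std::this_thread::get_id()];
            if (!slot)
                slot = new T();
            return slot;
        }

        // Visit every thread's value; assumes mutators are stopped, as during
        // a stop-the-world collection.
        void forEachValue(std::function<void(T*)> f) {
            std::lock_guard<std::mutex> g(lock_);
            for (auto& kv : values_)
                f(kv.second);
        }

    private:
        std::mutex lock_;
        std::unordered_map<std::thread::id, T*> values_;
    };

This sketch never reclaims entries for dead threads; the ThreadBlockCache destructor in the diff above (which returns the cached block to heap->heads) is exactly the cleanup that has to run when a thread goes away.
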
@@ -80,7 +80,7 @@ private:
     Block* full_heads[NUM_BUCKETS];
     LargeObj* large_head = NULL;

-    void* allocSmall(size_t rounded_size, Block** head, Block** full_head);
+    void* allocSmall(size_t rounded_size, int bucket_idx);
     void* allocLarge(size_t bytes);

     // DS_DEFINE_MUTEX(lock);
@@ -106,16 +106,16 @@ public:
         void* rtn;
         // assert(bytes >= 16);
         if (bytes <= 16)
-            rtn = allocSmall(16, &heads[0], &full_heads[0]);
+            rtn = allocSmall(16, 0);
         else if (bytes <= 32)
-            rtn = allocSmall(32, &heads[1], &full_heads[1]);
+            rtn = allocSmall(32, 1);
         else if (bytes > sizes[NUM_BUCKETS - 1])
             rtn = allocLarge(bytes);
         else {
             rtn = NULL;
             for (int i = 2; i < NUM_BUCKETS; i++) {
                 if (sizes[i] >= bytes) {
-                    rtn = allocSmall(sizes[i], &heads[i], &full_heads[i]);
+                    rtn = allocSmall(sizes[i], i);
                     break;
                 }
             }
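
[Editor's note] With a bucket index now passed instead of two list heads, the mapping from request size to bucket is worth spelling out. The selection logic above as a standalone function; the sizes[] table here is an illustrative assumption, not necessarily Pyston's actual table:

    #include <cstddef>
    #include <cstdio>

    static const size_t sizes[] = { 16, 32, 48, 64, 96, 128, 192, 256, 384, 512 };
    static const int NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);

    // Returns the bucket index, or -1 if the request goes to allocLarge().
    static int bucketFor(size_t bytes) {
        if (bytes <= 16)
            return 0;
        if (bytes <= 32)
            return 1;
        if (bytes > sizes[NUM_BUCKETS - 1])
            return -1;
        for (int i = 2; i < NUM_BUCKETS; i++) {
            if (sizes[i] >= bytes)
                return i;
        }
        return -1;  // unreachable: covered by the range check above
    }

    int main() {
        printf("%d %d %d\n", bucketFor(8), bucketFor(100), bucketFor(4096));
        // prints: 0 5 -1  (with the assumed table above)
    }
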
@@ -129,7 +129,9 @@ public:
     void free(void* ptr);

     // not thread safe:
     void* getAllocationFromInteriorPointer(void* ptr);
+
+    // not thread safe:
     void freeUnmarked();
 };
......