Commit 0b0b2ceb authored by Kevin Modzelewski

Turns out a large amount of thread contention was coming from these shared counters -- disable some of them and add some thread-local caching
parent 03066506
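
The core idea of the change is to stop having every thread hammer the same shared counter word. A minimal sketch of the pattern, with illustrative names only (this is not the code in the diff below): each thread accumulates into a thread-local variable and only occasionally flushes it into the shared atomic, so the contended write becomes rare.

    #include <atomic>

    static std::atomic<long> shared_count(0);   // the contended counter
    static __thread long local_count = 0;       // per-thread cache

    inline void logEvent(long n = 1) {
        local_count += n;
        if (local_count >= 1024) {              // flush threshold is arbitrary here
            shared_count.fetch_add(local_count, std::memory_order_relaxed);
            local_count = 0;
        }
    }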
......@@ -17,6 +17,7 @@ pyston_prof
pyston_profile
pyston_release
pyston_grwl
pyston_grwl_dbg
pyston_nosync
*.cache
......
......@@ -585,6 +585,7 @@ PASS_OBJS := $(PASS_SRCS:.cpp=.standalone.o)
$(call make_compile_config,,$(CXXFLAGS_DBG))
$(call make_compile_config,.release,$(CXXFLAGS_RELEASE))
$(call make_compile_config,.grwl,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl)
$(call make_compile_config,.grwl_dbg,$(CXXFLAGS_DBG) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl_dbg -UBINARY_STRIPPED_SUFFIX -DBINARY_STRIPPED_SUFFIX=)
$(call make_compile_config,.nosync,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=0 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_nosync)
$(UNITTEST_SRCS:.cpp=.o): CXXFLAGS += -isystem $(GTEST_DIR)/include
......@@ -658,6 +659,7 @@ $(call link,_dbg,$(OBJS),$(LDFLAGS),$(LLVM_DEPS))
$(call link,_debug,$(OBJS),$(LDFLAGS_DEBUG),$(LLVM_DEBUG_DEPS))
$(call link,_release,$(OPT_OBJS),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
$(call link,_grwl,stdlib.grwl.bc.o $(SRCS:.cpp=.grwl.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
$(call link,_grwl_dbg,stdlib.grwl_dbg.bc.o $(SRCS:.cpp=.grwl_dbg.o),$(LDFLAGS),$(LLVM_DEPS))
$(call link,_nosync,stdlib.nosync.bc.o $(SRCS:.cpp=.nosync.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
pyston_oprof: $(OPT_OBJS) codegen/profiling/oprofile.o $(LLVM_DEPS)
$(ECHO) Linking $@
......@@ -734,6 +736,9 @@ $$(call make_search,memcheck_gdb$1_%)
memleaks$1_%: %.py pyston$1 $$(RUN_DEPS)
$$(VALGRIND) --tool=memcheck --leak-check=full --leak-resolution=low --show-reachable=yes ./pyston$1 $$(ARGS) $$<
$$(call make_search,memleaks$1_%)
cachegrind$1_%: %.py pyston$1 $$(RUN_DEPS)
$$(VALGRIND) --tool=cachegrind ./pyston$1 $$(ARGS) $$<
$$(call make_search,cachegrind$1_%)
endif
.PHONY: perf$1_%
......@@ -759,6 +764,7 @@ perf_%: perf_release_%
$(call make_target,_dbg)
$(call make_target,_release)
$(call make_target,_grwl)
$(call make_target,_grwl_dbg)
$(call make_target,_nosync)
# "kill valgrind":
......
......@@ -20,7 +20,9 @@
namespace pyston {
#if !DISABLE_STATS
std::vector<long>* Stats::counts;
std::vector<std::atomic<long> >* Stats::threadsafe_counts;
std::unordered_map<int, std::string>* Stats::names;
StatCounter::StatCounter(const std::string& name) : id(Stats::getStatId(name)) {
}
......@@ -37,6 +39,8 @@ int Stats::getStatId(const std::string& name) {
Stats::names = &names;
static std::vector<long> counts;
Stats::counts = &counts;
static std::vector<std::atomic<long> > threadsafe_counts;
Stats::threadsafe_counts = &threadsafe_counts;
static std::unordered_map<std::string, int> made;
if (made.count(name))
......@@ -46,6 +50,7 @@ int Stats::getStatId(const std::string& name) {
names[rtn] = name;
made[name] = rtn;
counts.push_back(0);
threadsafe_counts.emplace_back(0);
return rtn;
}
......@@ -60,7 +65,10 @@ void Stats::dump() {
std::sort(pairs.begin(), pairs.end());
for (int i = 0; i < pairs.size(); i++) {
printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second]);
printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second] + (*threadsafe_counts)[pairs[i].second]);
}
}
#endif
}
......@@ -15,6 +15,7 @@
#ifndef PYSTON_CORE_STATS_H
#define PYSTON_CORE_STATS_H
#include <atomic>
#include <cstdio>
#include <string>
#include <unordered_map>
......@@ -24,15 +25,20 @@
namespace pyston {
#define DISABLE_STATS 0
#if !DISABLE_STATS
struct Stats {
private:
static std::vector<long>* counts;
static std::vector<std::atomic<long> >* threadsafe_counts;
static std::unordered_map<int, std::string>* names;
public:
static int getStatId(const std::string& name);
static void log(int id, int count = 1) { (*counts)[id] += count; }
static void threadsafe_log(int id, int count = 1) { (*threadsafe_counts)[id].fetch_add(count, std::memory_order_relaxed); }
static void dump();
};
......@@ -45,6 +51,7 @@ public:
StatCounter(const std::string& name);
void log(int count = 1) { Stats::log(id, count); }
void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
};
struct StatPerThreadCounter {
......@@ -55,7 +62,25 @@ public:
StatPerThreadCounter(const std::string& name);
void log(int count = 1) { Stats::log(id, count); }
void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
};
#else
struct Stats {
static void dump() {
printf("(Stats disabled)\n");
}
};
struct StatCounter {
StatCounter(const char* name) {}
void log(int count=1) {};
};
struct StatPerThreadCounter {
StatPerThreadCounter(const char* name) {}
void log(int count=1) {};
};
#endif
}
#endif
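
For context, the new Stats::threadsafe_log() path routes through the std::atomic vector with memory_order_relaxed, while the existing log() keeps doing a plain increment on the non-atomic vector. A hedged usage sketch (the counter name is made up):

    static StatCounter sc_example("example_event");   // hypothetical counter name

    void onEvent() {
        // log() does a plain (*counts)[id] += count, which is only safe while
        // threads are serialized (e.g. under the GIL); threadsafe_log() uses a
        // relaxed fetch_add so concurrent threads do not lose counts.
        sc_example.threadsafe_log();
    }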
......@@ -23,7 +23,9 @@
#include "core/common.h"
#include "core/options.h"
#include "core/stats.h"
#include "core/thread_utils.h"
#include "core/util.h"
extern "C" int start_thread(void* arg);
......@@ -372,7 +374,7 @@ void allowGLReadPreemption() {
}
}
#elif THREADING_USE_GRWL
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
enum class GRWLHeldState {
N,
......@@ -412,9 +414,15 @@ void releaseGLWrite() {
}
void promoteGL() {
Timer _t2("promoting", /*min_usec=*/10000);
// Note: this does *not* have the same semantics as normal promotion, on purpose.
releaseGLRead();
acquireGLWrite();
long promote_us = _t2.end();
static thread_local StatPerThreadCounter sc_promoting_us("grwl_promoting_us");
sc_promoting_us.log(promote_us);
}
void demoteGL() {
......@@ -426,18 +434,24 @@ static __thread int gl_check_count = 0;
void allowGLReadPreemption() {
assert(grwl_state == GRWLHeldState::R);
gl_check_count++;
if (gl_check_count < 10)
return;
gl_check_count = 0;
//gl_check_count++;
//if (gl_check_count < 10)
//return;
//gl_check_count = 0;
if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
return;
Timer _t2("preempted", /*min_usec=*/10000);
pthread_rwlock_unlock(&grwl);
// printf("waiters!\n");
sleep(0);
// The GRWL is a writer-preferred rwlock, so this next statement will block even
// if the lock is in read mode:
pthread_rwlock_rdlock(&grwl);
long preempt_us = _t2.end();
static thread_local StatPerThreadCounter sc_preempting_us("grwl_preempt_us");
sc_preempting_us.log(preempt_us);
}
#endif
......
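
The switch to PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP matters for the preemption check above: it asks glibc for a writer-preferring rwlock, so a waiting writer blocks new rdlock() calls. A sketch of the equivalent runtime initialization (glibc-specific; the lock name is illustrative):

    #include <pthread.h>

    static pthread_rwlock_t example_lock;

    void initExampleLock() {
        pthread_rwlockattr_t attr;
        pthread_rwlockattr_init(&attr);
        // Same policy as the static _NP initializer used in the diff.
        pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
        pthread_rwlock_init(&example_lock, &attr);
        pthread_rwlockattr_destroy(&attr);
    }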
......@@ -155,16 +155,10 @@ void runCollection() {
static StatCounter sc("gc_collections");
sc.log();
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ++ncollections);
// Timer _t2("promoting", /*min_usec=*/10000);
ncollections++;
threading::GLPromoteRegion _lock;
// long promote_us = _t2.end();
// static thread_local StatPerThreadCounter sc_promoting_us("gc_promoting_us");
// sc_promoting_us.log(promote_us);
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ncollections);
Timer _t("collecting", /*min_usec=*/10000);
......
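
runCollection() now takes the global lock in write mode via threading::GLPromoteRegion. The real class lives in core/thread_utils.h; a plausible RAII shape, shown only as an assumption, is:

    class GLPromoteRegion {
    public:
        GLPromoteRegion() { promoteGL(); }    // upgrade the read lock to write mode
        ~GLPromoteRegion() { demoteGL(); }    // drop back to read mode on scope exit
    };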
......@@ -33,17 +33,28 @@
namespace pyston {
namespace gc {
// extern unsigned numAllocs;
//#define ALLOCS_PER_COLLECTION 1000
unsigned bytesAllocatedSinceCollection;
static unsigned bytesAllocatedSinceCollection;
static __thread unsigned thread_bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 2000000
void _collectIfNeeded(size_t bytes) {
if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
bytesAllocatedSinceCollection = 0;
runCollection();
//bytesAllocatedSinceCollection = 0;
//threading::GLPromoteRegion _lock;
//runCollection();
threading::GLPromoteRegion _lock;
if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
runCollection();
bytesAllocatedSinceCollection = 0;
}
}
thread_bytesAllocatedSinceCollection += bytes;
if (thread_bytesAllocatedSinceCollection > ALLOCBYTES_PER_COLLECTION / 4) {
bytesAllocatedSinceCollection += thread_bytesAllocatedSinceCollection;
thread_bytesAllocatedSinceCollection = 0;
}
bytesAllocatedSinceCollection += bytes;
}
......@@ -233,8 +244,12 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
Block** cache_head = &cache->cache_heads[bucket_idx];
static StatCounter sc_total("gc_total");
sc_total.log();
//static __thread int gc_allocs = 0;
//if (++gc_allocs == 128) {
//static StatCounter sc_total("gc_allocs");
//sc_total.log(128);
//gc_allocs = 0;
//}
while (true) {
Block* cache_block = *cache_head;
......@@ -246,8 +261,9 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
removeFromLL(cache_block);
}
static StatCounter sc_fallback("gc_nocache");
sc_fallback.log();
// Not very useful to count the cache misses if we don't count the total attempts:
//static StatCounter sc_fallback("gc_allocs_cachemiss");
//sc_fallback.log();
LOCK_REGION(lock);
......@@ -263,6 +279,8 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
assert(!myblock->next);
assert(!myblock->prev);
//printf("%d claimed new block %p with %d objects\n", threading::gettid(), myblock, myblock->numObjects());
insertIntoLL(cache_head, myblock);
}
}
......
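
The reworked _collectIfNeeded() above combines two ideas: per-thread batching of allocation counts and a double check of the threshold after the write lock is acquired. A commented restatement of the hunk (same names and threshold as the diff; only the comments are new):

    void _collectIfNeeded(size_t bytes) {
        if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
            // Promoting to the write lock can block; another thread may finish a
            // collection in the meantime, so the threshold is re-checked inside.
            threading::GLPromoteRegion _lock;
            if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
                runCollection();
                bytesAllocatedSinceCollection = 0;
            }
        }

        // Batch per-thread allocation totals and only fold them into the shared
        // counter once they reach a quarter of the collection threshold, so the
        // shared word is touched far less often.
        thread_bytesAllocatedSinceCollection += bytes;
        if (thread_bytesAllocatedSinceCollection > ALLOCBYTES_PER_COLLECTION / 4) {
            bytesAllocatedSinceCollection += thread_bytesAllocatedSinceCollection;
            thread_bytesAllocatedSinceCollection = 0;
        }
    }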
......@@ -31,7 +31,7 @@ inline GCObjectHeader* headerFromObject(void* obj) {
#endif
}
#define BLOCK_SIZE 4096
#define BLOCK_SIZE (4*4096)
#define ATOM_SIZE 16
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
#define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
......
......@@ -855,9 +855,11 @@ extern "C" Box* getattr(Box* obj, const char* attr) {
slowpath_getattr.log();
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getattr__" + std::string(attr);
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
#endif
}
std::unique_ptr<Rewriter2> rewriter(
......@@ -2772,9 +2774,11 @@ extern "C" Box* getGlobal(BoxedModule* m, std::string* name) {
static StatCounter nopatch_getglobal("nopatch_getglobal");
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getglobal__" + *name;
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
#endif
}
{ /* anonymous scope to make sure destructors get run before we err out */
......