Commit 0b0b2ceb authored by Kevin Modzelewski

Turns out a large amount of thread contention was coming from these shared counters -- disable some of them and add some thread-local caching
parent 03066506
@@ -17,6 +17,7 @@ pyston_prof
 pyston_profile
 pyston_release
 pyston_grwl
+pyston_grwl_dbg
 pyston_nosync
 *.cache
...
@@ -585,6 +585,7 @@ PASS_OBJS := $(PASS_SRCS:.cpp=.standalone.o)
 $(call make_compile_config,,$(CXXFLAGS_DBG))
 $(call make_compile_config,.release,$(CXXFLAGS_RELEASE))
 $(call make_compile_config,.grwl,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl)
+$(call make_compile_config,.grwl_dbg,$(CXXFLAGS_DBG) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl_dbg -UBINARY_STRIPPED_SUFFIX -DBINARY_STRIPPED_SUFFIX=)
 $(call make_compile_config,.nosync,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=0 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_nosync)
 $(UNITTEST_SRCS:.cpp=.o): CXXFLAGS += -isystem $(GTEST_DIR)/include
@@ -658,6 +659,7 @@ $(call link,_dbg,$(OBJS),$(LDFLAGS),$(LLVM_DEPS))
 $(call link,_debug,$(OBJS),$(LDFLAGS_DEBUG),$(LLVM_DEBUG_DEPS))
 $(call link,_release,$(OPT_OBJS),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
 $(call link,_grwl,stdlib.grwl.bc.o $(SRCS:.cpp=.grwl.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
+$(call link,_grwl_dbg,stdlib.grwl_dbg.bc.o $(SRCS:.cpp=.grwl_dbg.o),$(LDFLAGS),$(LLVM_DEPS))
 $(call link,_nosync,stdlib.nosync.bc.o $(SRCS:.cpp=.nosync.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
 pyston_oprof: $(OPT_OBJS) codegen/profiling/oprofile.o $(LLVM_DEPS)
 	$(ECHO) Linking $@
@@ -734,6 +736,9 @@ $$(call make_search,memcheck_gdb$1_%)
 memleaks$1_%: %.py pyston$1 $$(RUN_DEPS)
 	$$(VALGRIND) --tool=memcheck --leak-check=full --leak-resolution=low --show-reachable=yes ./pyston$1 $$(ARGS) $$<
 $$(call make_search,memleaks$1_%)
+cachegrind$1_%: %.py pyston$1 $$(RUN_DEPS)
+	$$(VALGRIND) --tool=cachegrind ./pyston$1 $$(ARGS) $$<
+$$(call make_search,cachegrind$1_%)
 endif
 .PHONY: perf$1_%
@@ -759,6 +764,7 @@ perf_%: perf_release_%
 $(call make_target,_dbg)
 $(call make_target,_release)
 $(call make_target,_grwl)
+$(call make_target,_grwl_dbg)
 $(call make_target,_nosync)
 # "kill valgrind":
...
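The two Makefile additions mirror the existing configurations: `pyston_grwl_dbg` is the GRWL (no-GIL) threading mode compiled with `CXXFLAGS_DBG` so it can be debugged, and `cachegrind$1_%` follows the neighboring `memleaks` rule to run a test under valgrind's cachegrind tool. Going by that pattern, an invocation would look like `make cachegrind_release_raytrace` to profile a hypothetical `raytrace.py` under `./pyston_release` (the suffix selects the binary, the stem selects the script). Cachegrind fits this commit's theme, since shared-counter contention shows up directly as cache misses.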
@@ -20,7 +20,9 @@
 namespace pyston {
 
+#if !DISABLE_STATS
 std::vector<long>* Stats::counts;
+std::vector<std::atomic<long> >* Stats::threadsafe_counts;
 std::unordered_map<int, std::string>* Stats::names;
 
 StatCounter::StatCounter(const std::string& name) : id(Stats::getStatId(name)) {
 }
@@ -37,6 +39,8 @@ int Stats::getStatId(const std::string& name) {
     Stats::names = &names;
     static std::vector<long> counts;
     Stats::counts = &counts;
+    static std::vector<std::atomic<long> > threadsafe_counts;
+    Stats::threadsafe_counts = &threadsafe_counts;
     static std::unordered_map<std::string, int> made;
 
     if (made.count(name))
@@ -46,6 +50,7 @@ int Stats::getStatId(const std::string& name) {
     names[rtn] = name;
     made[name] = rtn;
     counts.push_back(0);
+    threadsafe_counts.emplace_back(0);
     return rtn;
 }
@@ -60,7 +65,10 @@ void Stats::dump() {
     std::sort(pairs.begin(), pairs.end());
 
     for (int i = 0; i < pairs.size(); i++) {
-        printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second]);
+        printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second] + (*threadsafe_counts)[pairs[i].second]);
     }
 }
+#endif
 
 }
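The scheme above keeps two parallel counter arrays: a plain `long` per stat for paths that are effectively single-threaded, and an atomic sibling for increments that can race, with `dump()` printing their sum. A minimal self-contained sketch of the same idea (names are illustrative, not Pyston's API; a `std::deque` stands in for the vector since `std::atomic` is not movable, which sidesteps reallocation issues):

```cpp
#include <atomic>
#include <cstdio>
#include <deque>

// Paired plain/atomic counters, summed at dump time.
struct Counters {
    std::deque<long> fast;               // unsynchronized fast-path counts
    std::deque<std::atomic<long>> safe;  // counts that may race across threads

    int add_stat() {
        fast.push_back(0);
        safe.emplace_back(0);            // deque never relocates elements, so
        return (int)fast.size() - 1;     // non-movable atomics are fine here
    }
    void log(int id, long n = 1) { fast[id] += n; }
    void threadsafe_log(int id, long n = 1) {
        // Relaxed ordering: we only need an eventual count, no synchronization.
        safe[id].fetch_add(n, std::memory_order_relaxed);
    }
    void dump() {
        for (size_t i = 0; i < fast.size(); i++)
            printf("stat %zu: %ld\n", i, fast[i] + safe[i].load(std::memory_order_relaxed));
    }
};
```

Note that even a relaxed `fetch_add` still bounces the counter's cache line between cores; the relaxed ordering only removes fence overhead. That is presumably why the rest of the commit disables or batches the hottest counters rather than just making them atomic.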
@@ -15,6 +15,7 @@
 #ifndef PYSTON_CORE_STATS_H
 #define PYSTON_CORE_STATS_H
 
+#include <atomic>
 #include <cstdio>
 #include <string>
 #include <unordered_map>
@@ -24,15 +25,20 @@
 namespace pyston {
 
+#define DISABLE_STATS 0
+
+#if !DISABLE_STATS
 struct Stats {
 private:
     static std::vector<long>* counts;
+    static std::vector<std::atomic<long> >* threadsafe_counts;
     static std::unordered_map<int, std::string>* names;
 
 public:
     static int getStatId(const std::string& name);
     static void log(int id, int count = 1) { (*counts)[id] += count; }
+    static void threadsafe_log(int id, int count = 1) { (*threadsafe_counts)[id].fetch_add(count, std::memory_order_relaxed); }
     static void dump();
 };
@@ -45,6 +51,7 @@ public:
     StatCounter(const std::string& name);
 
     void log(int count = 1) { Stats::log(id, count); }
+    void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
 };
 
 struct StatPerThreadCounter {
@@ -55,7 +62,25 @@ public:
     StatPerThreadCounter(const std::string& name);
 
     void log(int count = 1) { Stats::log(id, count); }
+    void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
+};
+
+#else
+
+struct Stats {
+    static void dump() {
+        printf("(Stats disabled)\n");
+    }
+};
+
+struct StatCounter {
+    StatCounter(const char* name) {}
+    void log(int count = 1){};
+};
+
+struct StatPerThreadCounter {
+    StatPerThreadCounter(const char* name) {}
+    void log(int count = 1){};
 };
+#endif
 
 }
 
 #endif
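With the `#else` branch, the header now compiles one of two interfaces behind the same names: real counters, or empty stubs whose calls inline away, so call sites build unchanged either way. A hypothetical call site, assuming this header is included:

```cpp
// Compiles against either branch: the string literal binds to the real
// constructor's const std::string& or the stub's const char*. With
// DISABLE_STATS=1 the log() call is an empty inline and vanishes entirely.
static StatCounter slowpath_foo("slowpath_foo");

void foo_slowpath() {
    slowpath_foo.log();
    // ... actual slow-path work ...
}
```

Only dynamic, per-name stats need explicit guards, because the stub `Stats` has no `getStatId`; the two `#if !DISABLE_STATS` hunks in the runtime code below exist for exactly that reason.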
@@ -23,7 +23,9 @@
 #include "core/common.h"
 #include "core/options.h"
+#include "core/stats.h"
 #include "core/thread_utils.h"
+#include "core/util.h"
 
 extern "C" int start_thread(void* arg);
@@ -372,7 +374,7 @@ void allowGLReadPreemption() {
     }
 }
 #elif THREADING_USE_GRWL
-static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
+static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
 
 enum class GRWLHeldState {
     N,
@@ -412,9 +414,15 @@ void releaseGLWrite() {
 }
 
 void promoteGL() {
+    Timer _t2("promoting", /*min_usec=*/10000);
+
     // Note: this is *not* the same semantics as normal promoting, on purpose.
     releaseGLRead();
     acquireGLWrite();
+
+    long promote_us = _t2.end();
+    static thread_local StatPerThreadCounter sc_promoting_us("grwl_promoting_us");
+    sc_promoting_us.log(promote_us);
 }
 
 void demoteGL() {
@@ -426,18 +434,24 @@ static __thread int gl_check_count = 0;
 void allowGLReadPreemption() {
     assert(grwl_state == GRWLHeldState::R);
 
-    gl_check_count++;
-    if (gl_check_count < 10)
-        return;
-    gl_check_count = 0;
+    //gl_check_count++;
+    //if (gl_check_count < 10)
+    //return;
+    //gl_check_count = 0;
 
     if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
         return;
 
+    Timer _t2("preempted", /*min_usec=*/10000);
     pthread_rwlock_unlock(&grwl);
-    // printf("waiters!\n");
-    sleep(0);
+    // The GRWL is a writer-prefered rwlock, so this next statement will block even
+    // if the lock is in read mode:
    pthread_rwlock_rdlock(&grwl);
+
+    long preempt_us = _t2.end();
+    static thread_local StatPerThreadCounter sc_preempting_us("grwl_preempt_us");
+    sc_preempting_us.log(preempt_us);
 }
 #endif
...
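The pivotal change here is the initializer: glibc's `PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP` makes the rwlock prefer waiting writers, so a reader that unlocks and immediately re-locks genuinely yields to a queued writer; that is what turns the unlock/relock pair into a usable preemption point and lets the old `sleep(0)` hack go away. A reduced sketch of the pattern, with Pyston's surrounding state tracking omitted:

```cpp
#include <atomic>
#include <pthread.h>

// Writer-preferred rwlock (glibc-specific _NP initializer): readers yield
// at explicit preemption points whenever a writer is queued.
static pthread_rwlock_t rwlock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
static std::atomic<int> writers_waiting(0);

void reader_preemption_point() {
    // Cheap check on the common path; relaxed is fine since missing one
    // round just delays the writer by one check interval.
    if (!writers_waiting.load(std::memory_order_relaxed))
        return;
    pthread_rwlock_unlock(&rwlock);
    // Writer preference means this rdlock blocks until the queued writer
    // has come and gone, instead of the reader barging straight back in.
    pthread_rwlock_rdlock(&rwlock);
}

void writer_acquire() {
    writers_waiting.fetch_add(1, std::memory_order_relaxed);
    pthread_rwlock_wrlock(&rwlock);
    writers_waiting.fetch_sub(1, std::memory_order_relaxed);
}
```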
@@ -155,16 +155,10 @@ void runCollection() {
     static StatCounter sc("gc_collections");
     sc.log();
 
-    if (VERBOSITY("gc") >= 2)
-        printf("Collection #%d\n", ++ncollections);
-
-    // Timer _t2("promoting", /*min_usec=*/10000);
-    threading::GLPromoteRegion _lock;
-    // long promote_us = _t2.end();
-    // static thread_local StatPerThreadCounter sc_promoting_us("gc_promoting_us");
-    // sc_promoting_us.log(promote_us);
+    ncollections++;
+
+    if (VERBOSITY("gc") >= 2)
+        printf("Collection #%d\n", ncollections);
 
     Timer _t("collecting", /*min_usec=*/10000);
...
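`threading::GLPromoteRegion _lock` disappears from `runCollection()` because the `_collectIfNeeded` hunk below moves it out to the caller, where the allocation threshold can be re-checked under the lock before collecting. `ncollections` is also now bumped unconditionally rather than inside the verbosity check, so the count stays correct at low verbosity.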
@@ -33,17 +33,28 @@
 namespace pyston {
 namespace gc {
 
-// extern unsigned numAllocs;
-//#define ALLOCS_PER_COLLECTION 1000
-unsigned bytesAllocatedSinceCollection;
+static unsigned bytesAllocatedSinceCollection;
+static __thread unsigned thread_bytesAllocatedSinceCollection;
 #define ALLOCBYTES_PER_COLLECTION 2000000
 
 void _collectIfNeeded(size_t bytes) {
     if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
-        bytesAllocatedSinceCollection = 0;
-        runCollection();
+        //bytesAllocatedSinceCollection = 0;
+        //threading::GLPromoteRegion _lock;
+        //runCollection();
+
+        threading::GLPromoteRegion _lock;
+        if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
+            runCollection();
+            bytesAllocatedSinceCollection = 0;
+        }
     }
-    bytesAllocatedSinceCollection += bytes;
+
+    thread_bytesAllocatedSinceCollection += bytes;
+    if (thread_bytesAllocatedSinceCollection > ALLOCBYTES_PER_COLLECTION / 4) {
+        bytesAllocatedSinceCollection += thread_bytesAllocatedSinceCollection;
+        thread_bytesAllocatedSinceCollection = 0;
+    }
 }
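This is the thread-local caching from the commit message: each thread accumulates allocation bytes in a `__thread` variable and only touches the shared counter once it has batched up a quarter of the collection threshold, so the contended cache line is written far less often. The shared threshold is also re-checked after taking the promote lock, so two threads racing past the first check don't both run a collection. A standalone sketch of the batching half (the threshold constant is illustrative):

```cpp
#include <cstddef>

// Per-thread batching of a shared byte count: the shared variable is
// written once per FLUSH_THRESHOLD bytes instead of once per allocation.
#define FLUSH_THRESHOLD (2000000 / 4)

static unsigned shared_bytes;          // only approximate; see note below
static __thread unsigned local_bytes;  // private to each thread, no locking

void note_allocation(size_t bytes) {
    local_bytes += bytes;
    if (local_bytes > FLUSH_THRESHOLD) {
        shared_bytes += local_bytes;   // rare, racy read-modify-write
        local_bytes = 0;
    }
}
```

The flush itself is an unsynchronized read-modify-write, so concurrent flushes can occasionally lose an update; for a heuristic that decides roughly when to collect, that imprecision is an acceptable trade for not taking a lock or an atomic on every allocation.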
@@ -233,8 +244,12 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
     Block** cache_head = &cache->cache_heads[bucket_idx];
 
-    static StatCounter sc_total("gc_total");
-    sc_total.log();
+    //static __thread int gc_allocs = 0;
+    //if (++gc_allocs == 128) {
+    //static StatCounter sc_total("gc_allocs");
+    //sc_total.log(128);
+    //gc_allocs = 0;
+    //}
 
     while (true) {
         Block* cache_block = *cache_head;
@@ -246,8 +261,9 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
             removeFromLL(cache_block);
         }
 
-        static StatCounter sc_fallback("gc_nocache");
-        sc_fallback.log();
+        // Not very useful to count the cache misses if we don't count the total attempts:
+        //static StatCounter sc_fallback("gc_allocs_cachemiss");
+        //sc_fallback.log();
 
         LOCK_REGION(lock);
@@ -263,6 +279,8 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
         assert(!myblock->next);
         assert(!myblock->prev);
 
+        //printf("%d claimed new block %p with %d objects\n", threading::gettid(), myblock, myblock->numObjects());
+
         insertIntoLL(cache_head, myblock);
     }
 }
...
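The always-on `gc_total`/`gc_nocache` counters sat on the hottest path in the allocator; they are removed here, with the comments preserving a sampling compromise that could be re-enabled later. In enabled form, that sampling idea looks roughly like this (a sketch assuming `core/stats.h` is included):

```cpp
// Sampled stat logging: touch the shared counter once per 128 allocations
// rather than on every one, cutting its write traffic by 128x.
static __thread int gc_allocs = 0;

void count_alloc_sampled() {
    if (++gc_allocs == 128) {
        static StatCounter sc_total("gc_allocs");
        sc_total.log(128);
        gc_allocs = 0;
    }
}
```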
@@ -31,7 +31,7 @@ inline GCObjectHeader* headerFromObject(void* obj) {
 #endif
 }
 
-#define BLOCK_SIZE 4096
+#define BLOCK_SIZE (4*4096)
 #define ATOM_SIZE 16
 static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
 #define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
...
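For scale: with `ATOM_SIZE` at 16, this takes a block from 4 KB (256 atoms) to 16 KB (1024 atoms), so a thread exhausts its cached block (and falls into the `LOCK_REGION(lock)` slow path of `allocSmall` above) roughly a quarter as often.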
@@ -855,9 +855,11 @@ extern "C" Box* getattr(Box* obj, const char* attr) {
     slowpath_getattr.log();
 
     if (VERBOSITY() >= 2) {
+#if !DISABLE_STATS
         std::string per_name_stat_name = "getattr__" + std::string(attr);
         int id = Stats::getStatId(per_name_stat_name);
         Stats::log(id);
+#endif
     }
 
     std::unique_ptr<Rewriter2> rewriter(
@@ -2772,9 +2774,11 @@ extern "C" Box* getGlobal(BoxedModule* m, std::string* name) {
     static StatCounter nopatch_getglobal("nopatch_getglobal");
 
     if (VERBOSITY() >= 2) {
+#if !DISABLE_STATS
         std::string per_name_stat_name = "getglobal__" + *name;
         int id = Stats::getStatId(per_name_stat_name);
         Stats::log(id);
+#endif
     }
 
     { /* anonymous scope to make sure destructors get run before we err out */
...
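Both guards exist because these per-name stats call `Stats::getStatId` at runtime, and the stats-disabled stub version of `Stats` deliberately has no such method; fixed-name `StatCounter` uses compile away on their own. If this pattern spread further, one hypothetical way to centralize the guard (not part of this commit) would be:

```cpp
// Hypothetical helper, assuming core/stats.h is included: call sites get
// dynamic per-name stats without repeating the #if at each use.
inline void logDynamicStat(const std::string& name, int count = 1) {
#if !DISABLE_STATS
    Stats::log(Stats::getStatId(name), count);
#endif
}
```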