Commit 0b0b2ceb authored by Kevin Modzelewski

Turns out a large amount of thread contention was coming from these shared counters -- disable some of them and add some thread-local caching
parent 03066506
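
The core idea of the change is to stop having every thread hammer the same shared counter word. A minimal sketch of the pattern, with illustrative names only (this is not the code in the diff below): each thread accumulates into a thread-local variable and only occasionally flushes it into the shared atomic, so the contended write becomes rare.

    #include <atomic>

    static std::atomic<long> shared_count(0);   // the contended counter
    static __thread long local_count = 0;       // per-thread cache

    inline void logEvent(long n = 1) {
        local_count += n;
        if (local_count >= 1024) {              // flush threshold is arbitrary here
            shared_count.fetch_add(local_count, std::memory_order_relaxed);
            local_count = 0;
        }
    }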
......@@ -17,6 +17,7 @@ pyston_prof
pyston_profile
pyston_release
pyston_grwl
pyston_grwl_dbg
pyston_nosync
*.cache
......
......@@ -585,6 +585,7 @@ PASS_OBJS := $(PASS_SRCS:.cpp=.standalone.o)
$(call make_compile_config,,$(CXXFLAGS_DBG))
$(call make_compile_config,.release,$(CXXFLAGS_RELEASE))
$(call make_compile_config,.grwl,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl)
$(call make_compile_config,.grwl_dbg,$(CXXFLAGS_DBG) -DTHREADING_USE_GRWL=1 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_grwl_dbg -UBINARY_STRIPPED_SUFFIX -DBINARY_STRIPPED_SUFFIX=)
$(call make_compile_config,.nosync,$(CXXFLAGS_RELEASE) -DTHREADING_USE_GRWL=0 -DTHREADING_USE_GIL=0 -UBINARY_SUFFIX -DBINARY_SUFFIX=_nosync)
$(UNITTEST_SRCS:.cpp=.o): CXXFLAGS += -isystem $(GTEST_DIR)/include
......@@ -658,6 +659,7 @@ $(call link,_dbg,$(OBJS),$(LDFLAGS),$(LLVM_DEPS))
$(call link,_debug,$(OBJS),$(LDFLAGS_DEBUG),$(LLVM_DEBUG_DEPS))
$(call link,_release,$(OPT_OBJS),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
$(call link,_grwl,stdlib.grwl.bc.o $(SRCS:.cpp=.grwl.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
$(call link,_grwl_dbg,stdlib.grwl_dbg.bc.o $(SRCS:.cpp=.grwl_dbg.o),$(LDFLAGS),$(LLVM_DEPS))
$(call link,_nosync,stdlib.nosync.bc.o $(SRCS:.cpp=.nosync.o),$(LDFLAGS_RELEASE),$(LLVM_RELEASE_DEPS))
pyston_oprof: $(OPT_OBJS) codegen/profiling/oprofile.o $(LLVM_DEPS)
$(ECHO) Linking $@
......@@ -734,6 +736,9 @@ $$(call make_search,memcheck_gdb$1_%)
memleaks$1_%: %.py pyston$1 $$(RUN_DEPS)
$$(VALGRIND) --tool=memcheck --leak-check=full --leak-resolution=low --show-reachable=yes ./pyston$1 $$(ARGS) $$<
$$(call make_search,memleaks$1_%)
cachegrind$1_%: %.py pyston$1 $$(RUN_DEPS)
$$(VALGRIND) --tool=cachegrind ./pyston$1 $$(ARGS) $$<
$$(call make_search,cachegrind$1_%)
endif
.PHONY: perf$1_%
......@@ -759,6 +764,7 @@ perf_%: perf_release_%
$(call make_target,_dbg)
$(call make_target,_release)
$(call make_target,_grwl)
$(call make_target,_grwl_dbg)
$(call make_target,_nosync)
# "kill valgrind":
......
......@@ -20,7 +20,9 @@
namespace pyston {
#if !DISABLE_STATS
std::vector<long>* Stats::counts;
std::vector<std::atomic<long> >* Stats::threadsafe_counts;
std::unordered_map<int, std::string>* Stats::names;
StatCounter::StatCounter(const std::string& name) : id(Stats::getStatId(name)) {
}
......@@ -37,6 +39,8 @@ int Stats::getStatId(const std::string& name) {
Stats::names = &names;
static std::vector<long> counts;
Stats::counts = &counts;
static std::vector<std::atomic<long> > threadsafe_counts;
Stats::threadsafe_counts = &threadsafe_counts;
static std::unordered_map<std::string, int> made;
if (made.count(name))
......@@ -46,6 +50,7 @@ int Stats::getStatId(const std::string& name) {
names[rtn] = name;
made[name] = rtn;
counts.push_back(0);
threadsafe_counts.emplace_back(0);
return rtn;
}
......@@ -60,7 +65,10 @@ void Stats::dump() {
std::sort(pairs.begin(), pairs.end());
for (int i = 0; i < pairs.size(); i++) {
printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second]);
printf("%s: %ld\n", pairs[i].first.c_str(), (*counts)[pairs[i].second] + (*threadsafe_counts)[pairs[i].second]);
}
}
#endif
}
......@@ -15,6 +15,7 @@
#ifndef PYSTON_CORE_STATS_H
#define PYSTON_CORE_STATS_H
#include <atomic>
#include <cstdio>
#include <string>
#include <unordered_map>
......@@ -24,15 +25,20 @@
namespace pyston {
#define DISABLE_STATS 0
#if !DISABLE_STATS
struct Stats {
private:
static std::vector<long>* counts;
static std::vector<std::atomic<long> >* threadsafe_counts;
static std::unordered_map<int, std::string>* names;
public:
static int getStatId(const std::string& name);
static void log(int id, int count = 1) { (*counts)[id] += count; }
static void threadsafe_log(int id, int count = 1) { (*threadsafe_counts)[id].fetch_add(count, std::memory_order_relaxed); }
static void dump();
};
......@@ -45,6 +51,7 @@ public:
StatCounter(const std::string& name);
void log(int count = 1) { Stats::log(id, count); }
void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
};
struct StatPerThreadCounter {
......@@ -55,7 +62,25 @@ public:
StatPerThreadCounter(const std::string& name);
void log(int count = 1) { Stats::log(id, count); }
void threadsafe_log(int count = 1) { Stats::threadsafe_log(id, count); }
};
#else
struct Stats {
static void dump() {
printf("(Stats disabled)\n");
}
};
struct StatCounter {
StatCounter(const char* name) {}
void log(int count=1) {};
};
struct StatPerThreadCounter {
StatPerThreadCounter(const char* name) {}
void log(int count=1) {};
};
#endif
}
#endif
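
For context, the new Stats::threadsafe_log() path routes through the std::atomic vector with memory_order_relaxed, while the existing log() keeps doing a plain increment on the non-atomic vector. A hedged usage sketch (the counter name is made up):

    static StatCounter sc_example("example_event");   // hypothetical counter name

    void onEvent() {
        // log() does a plain (*counts)[id] += count, which is only safe while
        // threads are serialized (e.g. under the GIL); threadsafe_log() uses a
        // relaxed fetch_add so concurrent threads do not lose counts.
        sc_example.threadsafe_log();
    }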
......@@ -23,7 +23,9 @@
#include "core/common.h"
#include "core/options.h"
#include "core/stats.h"
#include "core/thread_utils.h"
#include "core/util.h"
extern "C" int start_thread(void* arg);
......@@ -372,7 +374,7 @@ void allowGLReadPreemption() {
}
}
#elif THREADING_USE_GRWL
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
enum class GRWLHeldState {
N,
......@@ -412,9 +414,15 @@ void releaseGLWrite() {
}
void promoteGL() {
Timer _t2("promoting", /*min_usec=*/10000);
// Note: this does *not* have the same semantics as normal promotion, on purpose.
releaseGLRead();
acquireGLWrite();
long promote_us = _t2.end();
static thread_local StatPerThreadCounter sc_promoting_us("grwl_promoting_us");
sc_promoting_us.log(promote_us);
}
void demoteGL() {
......@@ -426,18 +434,24 @@ static __thread int gl_check_count = 0;
void allowGLReadPreemption() {
assert(grwl_state == GRWLHeldState::R);
gl_check_count++;
if (gl_check_count < 10)
return;
gl_check_count = 0;
//gl_check_count++;
//if (gl_check_count < 10)
//return;
//gl_check_count = 0;
if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
return;
Timer _t2("preempted", /*min_usec=*/10000);
pthread_rwlock_unlock(&grwl);
// printf("waiters!\n");
sleep(0);
// The GRWL is a writer-preferred rwlock, so this next statement will block even
// if the lock is in read mode:
pthread_rwlock_rdlock(&grwl);
long preempt_us = _t2.end();
static thread_local StatPerThreadCounter sc_preempting_us("grwl_preempt_us");
sc_preempting_us.log(preempt_us);
}
#endif
......
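
The switch to PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP matters for the preemption check above: it asks glibc for a writer-preferring rwlock, so a waiting writer blocks new rdlock() calls. A sketch of the equivalent runtime initialization (glibc-specific; the lock name is illustrative):

    #include <pthread.h>

    static pthread_rwlock_t example_lock;

    void initExampleLock() {
        pthread_rwlockattr_t attr;
        pthread_rwlockattr_init(&attr);
        // Same policy as the static _NP initializer used in the diff.
        pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
        pthread_rwlock_init(&example_lock, &attr);
        pthread_rwlockattr_destroy(&attr);
    }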
......@@ -155,16 +155,10 @@ void runCollection() {
static StatCounter sc("gc_collections");
sc.log();
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ++ncollections);
// Timer _t2("promoting", /*min_usec=*/10000);
ncollections++;
threading::GLPromoteRegion _lock;
// long promote_us = _t2.end();
// static thread_local StatPerThreadCounter sc_promoting_us("gc_promoting_us");
// sc_promoting_us.log(promote_us);
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ncollections);
Timer _t("collecting", /*min_usec=*/10000);
......
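
runCollection() now takes the global lock in write mode via threading::GLPromoteRegion. The real class lives in core/thread_utils.h; a plausible RAII shape, shown only as an assumption, is:

    class GLPromoteRegion {
    public:
        GLPromoteRegion() { promoteGL(); }    // upgrade the read lock to write mode
        ~GLPromoteRegion() { demoteGL(); }    // drop back to read mode on scope exit
    };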
......@@ -33,17 +33,28 @@
namespace pyston {
namespace gc {
// extern unsigned numAllocs;
//#define ALLOCS_PER_COLLECTION 1000
unsigned bytesAllocatedSinceCollection;
static unsigned bytesAllocatedSinceCollection;
static __thread unsigned thread_bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 2000000
void _collectIfNeeded(size_t bytes) {
if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
bytesAllocatedSinceCollection = 0;
runCollection();
//bytesAllocatedSinceCollection = 0;
//threading::GLPromoteRegion _lock;
//runCollection();
threading::GLPromoteRegion _lock;
if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
runCollection();
bytesAllocatedSinceCollection = 0;
}
}
thread_bytesAllocatedSinceCollection += bytes;
if (thread_bytesAllocatedSinceCollection > ALLOCBYTES_PER_COLLECTION / 4) {
bytesAllocatedSinceCollection += thread_bytesAllocatedSinceCollection;
thread_bytesAllocatedSinceCollection = 0;
}
bytesAllocatedSinceCollection += bytes;
}
......@@ -233,8 +244,12 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
Block** cache_head = &cache->cache_heads[bucket_idx];
static StatCounter sc_total("gc_total");
sc_total.log();
//static __thread int gc_allocs = 0;
//if (++gc_allocs == 128) {
//static StatCounter sc_total("gc_allocs");
//sc_total.log(128);
//gc_allocs = 0;
//}
while (true) {
Block* cache_block = *cache_head;
......@@ -246,8 +261,9 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
removeFromLL(cache_block);
}
static StatCounter sc_fallback("gc_nocache");
sc_fallback.log();
// Not very useful to count the cache misses if we don't count the total attempts:
//static StatCounter sc_fallback("gc_allocs_cachemiss");
//sc_fallback.log();
LOCK_REGION(lock);
......@@ -263,6 +279,8 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
assert(!myblock->next);
assert(!myblock->prev);
//printf("%d claimed new block %p with %d objects\n", threading::gettid(), myblock, myblock->numObjects());
insertIntoLL(cache_head, myblock);
}
}
......
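
The reworked _collectIfNeeded() above combines two ideas: per-thread batching of allocation counts and a double check of the threshold after the write lock is acquired. A commented restatement of the hunk (same names and threshold as the diff; only the comments are new):

    void _collectIfNeeded(size_t bytes) {
        if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
            // Promoting to the write lock can block; another thread may finish a
            // collection in the meantime, so the threshold is re-checked inside.
            threading::GLPromoteRegion _lock;
            if (bytesAllocatedSinceCollection >= ALLOCBYTES_PER_COLLECTION) {
                runCollection();
                bytesAllocatedSinceCollection = 0;
            }
        }

        // Batch per-thread allocation totals and only fold them into the shared
        // counter once they reach a quarter of the collection threshold, so the
        // shared word is touched far less often.
        thread_bytesAllocatedSinceCollection += bytes;
        if (thread_bytesAllocatedSinceCollection > ALLOCBYTES_PER_COLLECTION / 4) {
            bytesAllocatedSinceCollection += thread_bytesAllocatedSinceCollection;
            thread_bytesAllocatedSinceCollection = 0;
        }
    }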
......@@ -31,7 +31,7 @@ inline GCObjectHeader* headerFromObject(void* obj) {
#endif
}
#define BLOCK_SIZE 4096
#define BLOCK_SIZE (4*4096)
#define ATOM_SIZE 16
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
#define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
......
......@@ -855,9 +855,11 @@ extern "C" Box* getattr(Box* obj, const char* attr) {
slowpath_getattr.log();
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getattr__" + std::string(attr);
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
#endif
}
std::unique_ptr<Rewriter2> rewriter(
......@@ -2772,9 +2774,11 @@ extern "C" Box* getGlobal(BoxedModule* m, std::string* name) {
static StatCounter nopatch_getglobal("nopatch_getglobal");
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getglobal__" + *name;
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
#endif
}
{ /* anonymous scope to make sure destructors get run before we err out */
......