Merge pull request #676 from kmod/perf

An assortment of misc small perf changes

Merge pull request #676 from kmod/perf
An assortment of misc small perf changes
de36c6be · Kevin Modzelewski · 2e9a9e35 · ac3dedc2 · de36c6be · de36c6be
Commit de36c6be authored Jul 07, 2015 by Kevin Modzelewski
11 changed files
--- a/Makefile
+++ b/Makefile
@@ -108,11 +108,7 @@ else
 	LLVM_BUILD := $(LLVM_TRUNK_BUILD)
 endif

-ifeq ($(FORCE_TRUNK_BINARIES),1)
-	LLVM_BIN := $(LLVM_TRUNK_BUILD)/Release/bin
-else
-	LLVM_BIN := $(LLVM_BUILD)/Release/bin
-endif
+LLVM_BIN := ./build/Release/llvm/bin

 LLVM_LINK_LIBS := core mcjit native bitreader bitwriter ipo irreader debuginfodwarf instrumentation
 ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)

--- a/microbenchmarks/getattrfunc_ubench.py
+++ b/microbenchmarks/getattrfunc_ubench.py
+class C(object):  # empty class; f() below sets an attribute on an instance of it
+    pass
+
+def f():  # microbenchmark body: repeated getattr() calls on one object
+    g = getattr  # bind the builtin to a local name before entering the loop
+    c = C()
+    c.o = 1  # the attribute that every loop iteration looks up
+    for i in xrange(10000000):  # xrange: this script targets Python 2
+        g(c, "o")  # look up "o" through the local alias; the result is discarded
+f()
--- a/section_ordering.txt
+++ b/section_ordering.txt
@@ -11,6 +11,7 @@
 .text._ZN6pyston15objectNewNoArgsEPNS_10BoxedClassE
 .text._PyIndex_Check
 .text._ZN6pyston9threading21allowGLReadPreemptionEv
+.text._ZN6pyston9threading22_allowGLReadPreemptionEv
 .text._ZN6pyston9getOpNameEi
 .text._ZN6pyston8callFuncEPNS_17BoxedFunctionBaseEPNS_15CallRewriteArgsENS_11ArgPassSpecEPNS_3BoxES6_S6_PS6_PKSt6vectorIPKSsSaISA_EE
 .text._ZN6pyston2gc9GCVisitor5visitEPv

--- a/src/analysis/scoping_analysis.h
+++ b/src/analysis/scoping_analysis.h
@@ -15,6 +15,8 @@
 #ifndef PYSTON_ANALYSIS_SCOPINGANALYSIS_H
 #define PYSTON_ANALYSIS_SCOPINGANALYSIS_H

+#include "llvm/ADT/DenseMap.h"
+
 #include "core/common.h"
 #include "core/stringpool.h"

@@ -146,14 +148,14 @@ public:
 class ScopingAnalysis {
 public:
    struct ScopeNameUsage;
-    typedef std::unordered_map<AST*, ScopeNameUsage*> NameUsageMap;
+    typedef llvm::DenseMap<AST*, ScopeNameUsage*> NameUsageMap;

 private:
-    std::unordered_map<AST*, ScopeInfo*> scopes;
+    llvm::DenseMap<AST*, ScopeInfo*> scopes;
    AST_Module* parent_module;
    InternedStringPool* interned_strings;

-    std::unordered_map<AST*, AST*> scope_replacements;
+    llvm::DenseMap<AST*, AST*> scope_replacements;

    ScopeInfo* analyzeSubtree(AST* node);
    void processNameUsages(NameUsageMap* usages);

--- a/src/asm_writing/rewriter.cpp
+++ b/src/asm_writing/rewriter.cpp
@@ -186,8 +186,9 @@ void Rewriter::ConstLoader::moveImmediate(uint64_t val, assembler::Register dst_
 assembler::Register Rewriter::ConstLoader::findConst(uint64_t val, bool& found_value) {
    assert(rewriter->phase_emitting);

-    if (constToVar.count(val) > 0) {
-        RewriterVar* var = constToVar[val];
+    auto it = constToVar.find(val);
+    if (it != constToVar.end()) {
+        RewriterVar* var = it->second;
        for (Location l : var->locations) {
            if (l.type == Location::Register) {
                found_value = true;

--- a/src/core/threading.cpp
+++ b/src/core/threading.cpp
@@ -14,7 +14,6 @@

 #include "core/threading.h"

-#include <atomic>
 #include <cstdio>
 #include <cstdlib>
 #include <err.h>
@@ -481,7 +480,7 @@ extern "C" void endAllowThreads() noexcept {

 static pthread_mutex_t gil = PTHREAD_MUTEX_INITIALIZER;

-static std::atomic<int> threads_waiting_on_gil(0);
+std::atomic<int> threads_waiting_on_gil(0);
 static pthread_cond_t gil_acquired = PTHREAD_COND_INITIALIZER;

 extern "C" void PyEval_ReInitThreads() noexcept {
@@ -524,9 +523,6 @@ void releaseGLWrite() {
    pthread_mutex_unlock(&gil);
 }

-#define GIL_CHECK_INTERVAL 1000
-// Note: this doesn't need to be an atomic, since it should
-// only be accessed by the thread that holds the gil:
 int gil_check_count = 0;

 // TODO: this function is fair in that it forces a thread to give up the GIL
@@ -535,25 +531,8 @@ int gil_check_count = 0;
 // switching back and forth, and a third that never gets run.
 // We could enforce fairness by having a FIFO of events (implementd with mutexes?)
 // and make sure to always wake up the longest-waiting one.
-void allowGLReadPreemption() {
-#if ENABLE_SAMPLING_PROFILER
-    if (unlikely(sigprof_pending)) {
-        // Output multiple stacktraces if we received multiple signals
-        // between being able to handle it (such as being in LLVM or the GC),
-        // to try to fully account for that time.
-        while (sigprof_pending) {
-            _printStacktrace();
-            sigprof_pending--;
-        }
-    }
-#endif
-
-    // Double-checked locking: first read with no ordering constraint:
-    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
-        return;
-
-    gil_check_count++;
-    if (gil_check_count >= GIL_CHECK_INTERVAL) {
+void _allowGLReadPreemption() {
+    assert(gil_check_count >= GIL_CHECK_INTERVAL);
    gil_check_count = 0;

    // Double check this, since if we are wrong about there being a thread waiting on the gil,
@@ -565,7 +544,6 @@ void allowGLReadPreemption() {
    pthread_cond_wait(&gil_acquired, &gil);
    threads_waiting_on_gil--;
    pthread_cond_signal(&gil_acquired);
-    }
 }
 #elif THREADING_USE_GRWL
 static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;

--- a/src/core/threading.h
+++ b/src/core/threading.h
@@ -15,6 +15,7 @@
 #ifndef PYSTON_CORE_THREADING_H
 #define PYSTON_CORE_THREADING_H

+#include <atomic>
 #include <cstdint>
 #include <cstring>
 #include <ucontext.h>
@@ -81,7 +82,36 @@ void acquireGLRead();
 void releaseGLRead();
 void acquireGLWrite();
 void releaseGLWrite();
-void allowGLReadPreemption();
+void _allowGLReadPreemption();
+
+#define GIL_CHECK_INTERVAL 1000
+// Note: this doesn't need to be an atomic, since it should
+// only be accessed by the thread that holds the gil:
+extern int gil_check_count;
+extern std::atomic<int> threads_waiting_on_gil;
+inline void allowGLReadPreemption() { // Inline front end: does the cheap checks here and calls _allowGLReadPreemption() only when the interval is reached.
+#if ENABLE_SAMPLING_PROFILER
+    if (unlikely(sigprof_pending)) {
+        // Output multiple stacktraces if we received multiple signals
+        // between being able to handle it (such as being in LLVM or the GC),
+        // to try to fully account for that time.
+        while (sigprof_pending) {
+            _printStacktrace();
+            sigprof_pending--;
+        }
+    }
+#endif
+
+    // Double-checked locking: first read with no ordering constraint:
+    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
+        return; // nothing is waiting: return without touching gil_check_count
+
+    gil_check_count++; // only advances on calls where a waiter was observed
+    if (likely(gil_check_count < GIL_CHECK_INTERVAL))
+        return; // interval not reached yet
+
+    _allowGLReadPreemption(); // defined in threading.cpp; asserts the interval was reached and resets gil_check_count to 0
+}
 // Note: promoteGL is free to drop the lock and then reacquire
 void promoteGL();
 void demoteGL();

--- a/src/runtime/descr.cpp
+++ b/src/runtime/descr.cpp
@@ -393,9 +393,11 @@ Box* BoxedWrapperDescriptor::descr_get(Box* _self, Box* inst, Box* owner) noexce
    if (inst == None)
        return self;

-    if (!isSubclass(inst->cls, self->type))
+    if (!isSubclass(inst->cls, self->type)) {
        PyErr_Format(TypeError, "Descriptor '' for '%s' objects doesn't apply to '%s' object",
                     getFullNameOfClass(self->type).c_str(), getFullTypeName(inst).c_str());
+        return NULL;
+    }

    return new BoxedWrapperObject(self, inst);
 }

--- a/src/runtime/dict.cpp
+++ b/src/runtime/dict.cpp
@@ -249,6 +249,8 @@ extern "C" PyObject* PyDict_GetItem(PyObject* dict, PyObject* key) noexcept {
        return d->getOrNull(key);
    }

+    // XXX this would be easy to make much faster.
+
    // This path doesn't exist in CPython; we have it to support extension modules that do
    // something along the lines of PyDict_GetItem(PyModule_GetDict()):
    try {
@@ -304,6 +306,9 @@ extern "C" int PyDict_Next(PyObject* op, Py_ssize_t* ppos, PyObject** pkey, PyOb
 }

 extern "C" PyObject* PyDict_GetItemString(PyObject* dict, const char* key) noexcept {
+    if (dict->cls == attrwrapper_cls)
+        return unwrapAttrWrapper(dict)->getattr(key);
+
    Box* key_s;
    try {
        key_s = boxString(key);

--- a/src/runtime/inline/link_forcer.cpp
+++ b/src/runtime/inline/link_forcer.cpp
@@ -17,6 +17,7 @@

 #include "codegen/irgen/hooks.h"
 #include "core/ast.h"
+#include "core/threading.h"
 #include "core/types.h"
 #include "gc/heap.h"
 #include "runtime/complex.h"
@@ -138,6 +139,8 @@ void force() {
    FORCE(boxedLocalsGet);
    FORCE(boxedLocalsDel);

+    FORCE(threading::allowGLReadPreemption);
+
    // FORCE(listIter);
 }
 }

--- a/src/runtime/list.cpp
+++ b/src/runtime/list.cpp
@@ -594,6 +594,10 @@ Box* listIAdd(BoxedList* self, Box* _rhs) {

        int s1 = self->size;
        int s2 = rhs->size;
+
+        if (s2 == 0)
+            return self;
+
        self->ensure(s1 + s2);

        memcpy(self->elts->elts + s1, rhs->elts->elts, sizeof(rhs->elts->elts[0]) * s2);