Commit 1a6b1e0c authored by Kevin Modzelewski's avatar Kevin Modzelewski

Add scratch space to our runtime ics

(scratch space is pre-allocated stack space since the IC can't
allocate new space itself, in order to have unwinding work)

Allocate a fixed amount (currently: 40 bytes) of extra stack space
in our runtime ics.  This involves changing the function prologue+
epilogue to do more rsp adjustment, and modifying the .eh_frame sections
we generate.

One tricky thing is that we currently use frame pointer elimination
in our runtime ics, but the rest of the scratch space logic had assumed
the scratch would be rbp-relative, which I had to convert to rsp-relative.
parent c88e3b48
...@@ -95,19 +95,22 @@ void ICSlotRewrite::commit(CommitHook* hook) { ...@@ -95,19 +95,22 @@ void ICSlotRewrite::commit(CommitHook* hook) {
if (ic_entry == NULL) if (ic_entry == NULL)
return; return;
for (int i = 0; i < dependencies.size(); i++) {
ICInvalidator* invalidator = dependencies[i].first;
invalidator->addDependent(ic_entry);
}
uint8_t* slot_start = (uint8_t*)ic->start_addr + ic_entry->idx * ic->getSlotSize(); uint8_t* slot_start = (uint8_t*)ic->start_addr + ic_entry->idx * ic->getSlotSize();
uint8_t* continue_point = (uint8_t*)ic->continue_addr; uint8_t* continue_point = (uint8_t*)ic->continue_addr;
hook->finishAssembly(ic_entry, continue_point - slot_start); bool do_commit = hook->finishAssembly(ic_entry, continue_point - slot_start);
if (!do_commit)
return;
assert(assembler->isExactlyFull()); assert(assembler->isExactlyFull());
assert(!assembler->hasFailed()); assert(!assembler->hasFailed());
for (int i = 0; i < dependencies.size(); i++) {
ICInvalidator* invalidator = dependencies[i].first;
invalidator->addDependent(ic_entry);
}
// if (VERBOSITY()) printf("Commiting to %p-%p\n", start, start + ic->slot_size); // if (VERBOSITY()) printf("Commiting to %p-%p\n", start, start + ic->slot_size);
memcpy(slot_start, buf, ic->getSlotSize()); memcpy(slot_start, buf, ic->getSlotSize());
...@@ -129,18 +132,13 @@ int ICSlotRewrite::getSlotSize() { ...@@ -129,18 +132,13 @@ int ICSlotRewrite::getSlotSize() {
return ic->getSlotSize(); return ic->getSlotSize();
} }
int ICSlotRewrite::getFuncStackSize() { int ICSlotRewrite::getScratchRspOffset() {
return ic->stack_info.stack_size; assert(ic->stack_info.scratch_size);
} return ic->stack_info.scratch_rsp_offset;
int ICSlotRewrite::getScratchRbpOffset() {
assert(ic->stack_info.scratch_bytes);
return ic->stack_info.scratch_rbp_offset;
} }
int ICSlotRewrite::getScratchBytes() { int ICSlotRewrite::getScratchSize() {
assert(ic->stack_info.scratch_bytes); return ic->stack_info.scratch_size;
return ic->stack_info.scratch_bytes;
} }
TypeRecorder* ICSlotRewrite::getTypeRecorder() { TypeRecorder* ICSlotRewrite::getTypeRecorder() {
......
...@@ -48,7 +48,7 @@ public: ...@@ -48,7 +48,7 @@ public:
class CommitHook { class CommitHook {
public: public:
virtual ~CommitHook() {} virtual ~CommitHook() {}
virtual void finishAssembly(ICSlotInfo* picked_slot, int fastpath_offset) = 0; virtual bool finishAssembly(ICSlotInfo* picked_slot, int fastpath_offset) = 0;
}; };
private: private:
...@@ -67,9 +67,8 @@ public: ...@@ -67,9 +67,8 @@ public:
assembler::Assembler* getAssembler() { return assembler; } assembler::Assembler* getAssembler() { return assembler; }
int getSlotSize(); int getSlotSize();
int getFuncStackSize(); int getScratchRspOffset();
int getScratchRbpOffset(); int getScratchSize();
int getScratchBytes();
TypeRecorder* getTypeRecorder(); TypeRecorder* getTypeRecorder();
......
...@@ -952,18 +952,20 @@ void Rewriter::commit() { ...@@ -952,18 +952,20 @@ void Rewriter::commit() {
return; return;
} }
rewrite->commit(this);
if (assembler->hasFailed()) {
on_assemblyfail();
return;
}
finished = true; finished = true;
static StatCounter rewriter_commits("rewriter_commits"); static StatCounter rewriter_commits("rewriter_commits");
rewriter_commits.log(); rewriter_commits.log();
// TODO: have to check that we have enough room to write the final jmp
rewrite->commit(this);
assert(!assembler->hasFailed());
} }
void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) { bool Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
if (marked_inside_ic) { if (marked_inside_ic) {
void* mark_addr = &picked_slot->num_inside; void* mark_addr = &picked_slot->num_inside;
...@@ -977,6 +979,8 @@ void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) { ...@@ -977,6 +979,8 @@ void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
assembler->jmp(assembler::JumpDestination::fromStart(continue_offset)); assembler->jmp(assembler::JumpDestination::fromStart(continue_offset));
assembler->fillWithNops(); assembler->fillWithNops();
return !assembler->hasFailed();
} }
void Rewriter::commitReturning(RewriterVar* var) { void Rewriter::commitReturning(RewriterVar* var) {
...@@ -995,14 +999,14 @@ void Rewriter::addDependenceOn(ICInvalidator& invalidator) { ...@@ -995,14 +999,14 @@ void Rewriter::addDependenceOn(ICInvalidator& invalidator) {
Location Rewriter::allocScratch() { Location Rewriter::allocScratch() {
assertPhaseEmitting(); assertPhaseEmitting();
int scratch_bytes = rewrite->getScratchBytes(); int scratch_size = rewrite->getScratchSize();
for (int i = 0; i < scratch_bytes; i += 8) { for (int i = 0; i < scratch_size; i += 8) {
Location l(Location::Scratch, i); Location l(Location::Scratch, i);
if (vars_by_location.count(l) == 0) { if (vars_by_location.count(l) == 0) {
return l; return l;
} }
} }
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_bytes); RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_size);
} }
RewriterVar* Rewriter::add(RewriterVar* a, int64_t b, Location dest) { RewriterVar* Rewriter::add(RewriterVar* a, int64_t b, Location dest) {
...@@ -1042,9 +1046,9 @@ RewriterVar* Rewriter::allocate(int n) { ...@@ -1042,9 +1046,9 @@ RewriterVar* Rewriter::allocate(int n) {
int Rewriter::_allocate(RewriterVar* result, int n) { int Rewriter::_allocate(RewriterVar* result, int n) {
assert(n >= 1); assert(n >= 1);
int scratch_bytes = rewrite->getScratchBytes(); int scratch_size = rewrite->getScratchSize();
int consec = 0; int consec = 0;
for (int i = 0; i < scratch_bytes; i += 8) { for (int i = 0; i < scratch_size; i += 8) {
Location l(Location::Scratch, i); Location l(Location::Scratch, i);
if (vars_by_location.count(l) == 0) { if (vars_by_location.count(l) == 0) {
consec++; consec++;
...@@ -1056,8 +1060,8 @@ int Rewriter::_allocate(RewriterVar* result, int n) { ...@@ -1056,8 +1060,8 @@ int Rewriter::_allocate(RewriterVar* result, int n) {
// TODO should be a LEA instruction // TODO should be a LEA instruction
// In fact, we could do something like we do for constants and only load // In fact, we could do something like we do for constants and only load
// this when necessary, so it won't spill. Is that worth? // this when necessary, so it won't spill. Is that worth?
assembler->mov(assembler::RBP, r); assembler->mov(assembler::RSP, r);
assembler->add(assembler::Immediate(8 * a + rewrite->getScratchRbpOffset()), r); assembler->add(assembler::Immediate(8 * a + rewrite->getScratchRspOffset()), r);
// Put placeholders in so the array space doesn't get re-allocated. // Put placeholders in so the array space doesn't get re-allocated.
// This won't get collected, but that's fine. // This won't get collected, but that's fine.
...@@ -1074,7 +1078,7 @@ int Rewriter::_allocate(RewriterVar* result, int n) { ...@@ -1074,7 +1078,7 @@ int Rewriter::_allocate(RewriterVar* result, int n) {
consec = 0; consec = 0;
} }
} }
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_bytes); RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_size);
} }
RewriterVar* Rewriter::allocateAndCopy(RewriterVar* array_ptr, int n) { RewriterVar* Rewriter::allocateAndCopy(RewriterVar* array_ptr, int n) {
...@@ -1094,7 +1098,7 @@ void Rewriter::_allocateAndCopy(RewriterVar* result, RewriterVar* array_ptr, int ...@@ -1094,7 +1098,7 @@ void Rewriter::_allocateAndCopy(RewriterVar* result, RewriterVar* array_ptr, int
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp); assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp);
assembler->mov(tmp, assembler::Indirect(assembler::RBP, 8 * (offset + i) + rewrite->getScratchRbpOffset())); assembler->mov(tmp, assembler::Indirect(assembler::RSP, 8 * (offset + i) + rewrite->getScratchRspOffset()));
} }
array_ptr->bumpUse(); array_ptr->bumpUse();
...@@ -1121,7 +1125,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele ...@@ -1121,7 +1125,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele
int offset = _allocate(result, n_rest + 1); int offset = _allocate(result, n_rest + 1);
assembler::Register tmp = first_elem->getInReg(); assembler::Register tmp = first_elem->getInReg();
assembler->mov(tmp, assembler::Indirect(assembler::RBP, 8 * offset + rewrite->getScratchRbpOffset())); assembler->mov(tmp, assembler::Indirect(assembler::RSP, 8 * offset + rewrite->getScratchRspOffset()));
if (n_rest > 0) { if (n_rest > 0) {
assembler::Register src_ptr = rest_ptr->getInReg(); assembler::Register src_ptr = rest_ptr->getInReg();
...@@ -1131,7 +1135,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele ...@@ -1131,7 +1135,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele
for (int i = 0; i < n_rest; i++) { for (int i = 0; i < n_rest; i++) {
assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp); assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp);
assembler->mov(tmp, assembler->mov(tmp,
assembler::Indirect(assembler::RBP, 8 * (offset + i + 1) + rewrite->getScratchRbpOffset())); assembler::Indirect(assembler::RSP, 8 * (offset + i + 1) + rewrite->getScratchRspOffset()));
} }
rest_ptr->bumpUse(); rest_ptr->bumpUse();
} }
...@@ -1146,8 +1150,7 @@ assembler::Indirect Rewriter::indirectFor(Location l) { ...@@ -1146,8 +1150,7 @@ assembler::Indirect Rewriter::indirectFor(Location l) {
assert(l.type == Location::Scratch || l.type == Location::Stack); assert(l.type == Location::Scratch || l.type == Location::Stack);
if (l.type == Location::Scratch) if (l.type == Location::Scratch)
// TODO it can sometimes be more efficient to do RSP-relative addressing? return assembler::Indirect(assembler::RSP, rewrite->getScratchRspOffset() + l.scratch_offset);
return assembler::Indirect(assembler::RBP, rewrite->getScratchRbpOffset() + l.scratch_offset);
else else
return assembler::Indirect(assembler::RSP, l.stack_offset); return assembler::Indirect(assembler::RSP, l.stack_offset);
} }
...@@ -1414,6 +1417,18 @@ Rewriter::Rewriter(ICSlotRewrite* rewrite, int num_args, const std::vector<int>& ...@@ -1414,6 +1417,18 @@ Rewriter::Rewriter(ICSlotRewrite* rewrite, int num_args, const std::vector<int>&
this->live_outs.push_back(var); this->live_outs.push_back(var);
this->live_out_regs.push_back(dwarf_regnum); this->live_out_regs.push_back(dwarf_regnum);
} }
// Getting the scratch space location/size wrong could be disastrous and hard to track down,
// so here's a "forcefully check it" mode, which starts every inline cache by overwriting
// the entire scratch space.
bool VALIDATE_SCRATCH_SPACE = false;
if (VALIDATE_SCRATCH_SPACE) {
int scratch_size = rewrite->getScratchSize();
for (int i = 0; i < scratch_size; i += 8) {
assembler->movq(assembler::Immediate(0x12345678UL),
assembler::Indirect(assembler::RSP, i + rewrite->getScratchRspOffset()));
}
}
} }
Rewriter* Rewriter::createRewriter(void* rtn_addr, int num_args, const char* debug_name) { Rewriter* Rewriter::createRewriter(void* rtn_addr, int num_args, const char* debug_name) {
......
...@@ -384,7 +384,7 @@ private: ...@@ -384,7 +384,7 @@ private:
// Do the bookkeeping to say that var is no longer in location l // Do the bookkeeping to say that var is no longer in location l
void removeLocationFromVar(RewriterVar* var, Location l); void removeLocationFromVar(RewriterVar* var, Location l);
void finishAssembly(ICSlotInfo* picked_slot, int continue_offset) override; bool finishAssembly(ICSlotInfo* picked_slot, int continue_offset) override;
void _trap(); void _trap();
void _loadConst(RewriterVar* result, int64_t val, Location loc); void _loadConst(RewriterVar* result, int64_t val, Location loc);
......
...@@ -15,13 +15,18 @@ ...@@ -15,13 +15,18 @@
#ifndef PYSTON_ASMWRITING_TYPES_H #ifndef PYSTON_ASMWRITING_TYPES_H
#define PYSTON_ASMWRITING_TYPES_H #define PYSTON_ASMWRITING_TYPES_H
#include "core/common.h"
namespace pyston { namespace pyston {
struct StackInfo { struct StackInfo {
int stack_size; int scratch_size;
int scratch_rsp_offset;
int scratch_bytes; StackInfo(int scratch_size, int scratch_rsp_offset)
int scratch_rbp_offset; : scratch_size(scratch_size), scratch_rsp_offset(scratch_rsp_offset) {
assert(scratch_rsp_offset >= 0);
}
}; };
namespace assembler { namespace assembler {
......
...@@ -386,9 +386,6 @@ extern "C" PyObject* Py_InitModule4(const char* name, PyMethodDef* methods, cons ...@@ -386,9 +386,6 @@ extern "C" PyObject* Py_InitModule4(const char* name, PyMethodDef* methods, cons
Box* passthrough = static_cast<Box*>(self); Box* passthrough = static_cast<Box*>(self);
while (methods && methods->ml_name) { while (methods && methods->ml_name) {
if (VERBOSITY())
printf("Loading method %s\n", methods->ml_name);
RELEASE_ASSERT((methods->ml_flags & (~(METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O))) == 0, "%d", RELEASE_ASSERT((methods->ml_flags & (~(METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O))) == 0, "%d",
methods->ml_flags); methods->ml_flags);
module->giveAttr(methods->ml_name, module->giveAttr(methods->ml_name,
......
...@@ -1810,7 +1810,7 @@ private: ...@@ -1810,7 +1810,7 @@ private:
// Maybe if there are a ton of live variables it'd be nice to have them be // Maybe if there are a ton of live variables it'd be nice to have them be
// heap-allocated, or if we don't immediately return the result of the OSR? // heap-allocated, or if we don't immediately return the result of the OSR?
bool use_malloc = false; bool use_malloc = false;
if (false) { if (use_malloc) {
llvm::Value* n_bytes = getConstantInt((sorted_symbol_table.size() - 3) * sizeof(Box*), g.i64); llvm::Value* n_bytes = getConstantInt((sorted_symbol_table.size() - 3) * sizeof(Box*), g.i64);
llvm::Value* l_malloc = embedConstantPtr( llvm::Value* l_malloc = embedConstantPtr(
(void*)malloc, llvm::FunctionType::get(g.i8->getPointerTo(), g.i64, false)->getPointerTo()); (void*)malloc, llvm::FunctionType::get(g.i8->getPointerTo(), g.i64, false)->getPointerTo());
...@@ -1818,7 +1818,10 @@ private: ...@@ -1818,7 +1818,10 @@ private:
arg_array = emitter.getBuilder()->CreateBitCast(malloc_save, g.llvm_value_type_ptr->getPointerTo()); arg_array = emitter.getBuilder()->CreateBitCast(malloc_save, g.llvm_value_type_ptr->getPointerTo());
} else { } else {
llvm::Value* n_varargs = llvm::ConstantInt::get(g.i64, sorted_symbol_table.size() - 3, false); llvm::Value* n_varargs = llvm::ConstantInt::get(g.i64, sorted_symbol_table.size() - 3, false);
arg_array = emitter.getBuilder()->CreateAlloca(g.llvm_value_type_ptr, n_varargs); // TODO we have a number of allocas with non-overlapping lifetimes, that end up
// being redundant.
arg_array = new llvm::AllocaInst(g.llvm_value_type_ptr, n_varargs, "",
irstate->getLLVMFunction()->getEntryBlock().getFirstInsertionPt());
} }
} }
......
...@@ -248,9 +248,22 @@ void processStackmap(CompiledFunction* cf, StackMap* stackmap) { ...@@ -248,9 +248,22 @@ void processStackmap(CompiledFunction* cf, StackMap* stackmap) {
assert(pp->numICStackmapArgs() == 0); // don't do anything with these for now assert(pp->numICStackmapArgs() == 0); // don't do anything with these for now
std::unique_ptr<ICInfo> icinfo = registerCompiledPatchpoint( // We currently specify the scratch's location as an RSP offset, but LLVM gives it
start_addr, slowpath_start, end_addr, slowpath_rtn_addr, ic, // to us as an RBP offset. It's easy to convert between them if the function has a static
StackInfo({ stack_size, scratch_size, scratch_rbp_offset }), std::move(live_outs)); // stack size, but if the function doesn't have a fixed stack size (which happens if there
// is a non-static alloca), then we can't convert.
// Internally, it's easy enough to handle either rsp-relative or rbp-relative offsets
// for the scratch array, but there are some places that require the use of rsp-relative
// offsets, and we don't (yet) have the ability to specify on a per-patchpoint basis
// which one we want to use.
RELEASE_ASSERT(stack_size >= 0, "function does not have static stack size!");
// (rbp - rsp) == (stack_size - 8) -- the "-8" is from the value of rbp being pushed onto the stack
int scratch_rsp_offset = scratch_rbp_offset + (stack_size - 8);
std::unique_ptr<ICInfo> icinfo
= registerCompiledPatchpoint(start_addr, slowpath_start, end_addr, slowpath_rtn_addr, ic,
StackInfo(scratch_size, scratch_rsp_offset), std::move(live_outs));
assert(cf); assert(cf);
// TODO: unsafe. hard to use a unique_ptr here though. // TODO: unsafe. hard to use a unique_ptr here though.
......
...@@ -522,8 +522,9 @@ ExcInfo* getFrameExcInfo() { ...@@ -522,8 +522,9 @@ ExcInfo* getFrameExcInfo() {
*copy_from_exc = ExcInfo(None, None, None); *copy_from_exc = ExcInfo(None, None, None);
} }
assert(copy_from_exc->value); assert(gc::isValidGCObject(copy_from_exc->type));
assert(copy_from_exc->traceback); assert(gc::isValidGCObject(copy_from_exc->value));
assert(gc::isValidGCObject(copy_from_exc->traceback));
for (auto* ex : to_update) { for (auto* ex : to_update) {
*ex = *copy_from_exc; *ex = *copy_from_exc;
......
...@@ -42,25 +42,29 @@ namespace pyston { ...@@ -42,25 +42,29 @@ namespace pyston {
// //
// This template is generated from this C++ file: // This template is generated from this C++ file:
// //
// extern void foo(); // extern void foo(void*);
// int bar() { // int bar() {
// foo(); // char buf[N];
// foo(&buf);
// return 1; // return 1;
// } // }
// //
// (where N is the extra bytes of stack to allocate)
//
// objdump -s -j .eh_frame test // objdump -s -j .eh_frame test
// readelf -w test // readelf -w test
// //
#if RUNTIMEICS_OMIT_FRAME_PTR #if RUNTIMEICS_OMIT_FRAME_PTR
// clang++ test.cpp -o test -O3 -fomit-frame-pointer -c // clang++ test.cpp -o test -O3 -fomit-frame-pointer -c -DN=40
// The generated assembly is: // The generated assembly is:
// //
// 0: 50 push %rax // 0: 48 83 ec 28 sub $0x28,%rsp
// 1: e8 00 00 00 00 callq 6 <_Z3barv+0x6> // 4: 48 8d 3c 24 lea (%rsp),%rdi
// 6: b8 01 00 00 00 mov $0x1,%eax // 8: e8 00 00 00 00 callq d <_Z3barv+0xd>
// b: 5a pop %rdx // d: b8 01 00 00 00 mov $0x1,%eax
// c: c3 retq // 12: 48 83 c4 28 add $0x28,%rsp
// 16: c3 retq
// //
// (I believe the push/pop are for stack alignment) // (I believe the push/pop are for stack alignment)
// //
...@@ -84,24 +88,26 @@ static const char _eh_frame_template[] = ...@@ -84,24 +88,26 @@ static const char _eh_frame_template[] =
"\x00\x00\x00\x00" // pcrel offset to function address [to be filled in] "\x00\x00\x00\x00" // pcrel offset to function address [to be filled in]
"\x0d\x00\x00\x00" // function size [to be filled in] "\x0d\x00\x00\x00" // function size [to be filled in]
"\x00" // augmentation data (none) "\x00" // augmentation data (none)
"\x41\x0e\x10" "\x44\x0e\x30"
// Instructions: // Instructions:
// - DW_CFA_advance_loc: 1 to 00000001 // - DW_CFA_advance_loc: 4 to 00000004
// - DW_CFA_def_cfa_offset: 16 // - DW_CFA_def_cfa_offset: 48
"\x00\x00\x00\x00" // padding "\x00\x00\x00\x00" // padding
"\x00\x00\x00\x00" // terminator "\x00\x00\x00\x00" // terminator
; ;
#else #else
// clang++ test.cpp -o test -O3 -fno-omit-frame-pointer -c // clang++ test.cpp -o test -O3 -fno-omit-frame-pointer -c -DN=40
// The generated assembly is: // The generated assembly is:
//
// 0: 55 push %rbp // 0: 55 push %rbp
// 1: 48 89 e5 mov %rsp,%rbp // 1: 48 89 e5 mov %rsp,%rbp
// 4: e8 00 00 00 00 callq 9 <_Z3barv+0x9> // 4: 48 83 ec 30 sub $0x30,%rsp
// 9: b8 01 00 00 00 mov $0x1,%eax // 8: 48 8d 7d d0 lea -0x30(%rbp),%rdi
// e: 5d pop %rbp // c: e8 00 00 00 00 callq 11 <_Z3barv+0x11>
// f: c3 retq // 11: b8 01 00 00 00 mov $0x1,%eax
// 16: 48 83 c4 30 add $0x30,%rsp
// 1a: 5d pop %rbp
// 1b: c3 retq
// //
static const char _eh_frame_template[] = static const char _eh_frame_template[] =
// CIE // CIE
...@@ -152,31 +158,53 @@ static void writeTrivialEhFrame(void* eh_frame_addr, void* func_addr, uint64_t f ...@@ -152,31 +158,53 @@ static void writeTrivialEhFrame(void* eh_frame_addr, void* func_addr, uint64_t f
*size_ptr = func_size; *size_ptr = func_size;
} }
#if RUNTIMEICS_OMIT_FRAME_PTR
// If you change this, you *must* update the value in _eh_frame_template
// (set the -9'th byte to this value plus 8)
#define SCRATCH_BYTES 0x28
#else
#define SCRATCH_BYTES 0x30
#endif
RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) { RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) {
static StatCounter sc("runtime_ics_num"); static StatCounter sc("runtime_ics_num");
sc.log(); sc.log();
if (ENABLE_RUNTIME_ICS) { if (ENABLE_RUNTIME_ICS) {
assert(SCRATCH_BYTES >= 0);
assert(SCRATCH_BYTES < 0x80); // This would break both the instruction encoding and the dwarf encoding
assert(SCRATCH_BYTES % 8 == 0);
#if RUNTIMEICS_OMIT_FRAME_PTR #if RUNTIMEICS_OMIT_FRAME_PTR
static const int PROLOGUE_SIZE = 1;
#else
/* /*
* We emit a prologue since we want to align the stack pointer, * prologue:
* and also use RBP. * sub $0x28, %rsp # 48 83 ec 28
* It's not clear if we need to use RBP or not, since we emit the .eh_frame section anyway. *
* epilogue:
* add $0x28, %rsp # 48 83 c4 28
* retq # c3
* *
*/
static const int PROLOGUE_SIZE = 4;
static const int EPILOGUE_SIZE = 5;
assert(SCRATCH_BYTES % 16 == 8);
#else
/*
* The prologue looks like: * The prologue looks like:
* push %rbp # 55 * push %rbp # 55
* mov %rsp, %rbp # 48 89 e5 * mov %rsp, %rbp # 48 89 e5
* sub $0x30, %rsp # 48 83 ec 30
* *
* The epilogue is: * The epilogue is:
* add $0x30, %rsp # 48 83 c4 30
* pop %rbp # 5d * pop %rbp # 5d
* retq # c3 * retq # c3
*/ */
static const int PROLOGUE_SIZE = 4; static const int PROLOGUE_SIZE = 8;
static const int EPILOGUE_SIZE = 6;
assert(SCRATCH_BYTES % 16 == 0);
#endif #endif
static const int CALL_SIZE = 13; static const int CALL_SIZE = 13;
static const int EPILOGUE_SIZE = 2;
int patchable_size = num_slots * slot_size; int patchable_size = num_slots * slot_size;
int total_size = PROLOGUE_SIZE + patchable_size + CALL_SIZE + EPILOGUE_SIZE; int total_size = PROLOGUE_SIZE + patchable_size + CALL_SIZE + EPILOGUE_SIZE;
...@@ -198,23 +226,28 @@ RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) { ...@@ -198,23 +226,28 @@ RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) {
assert(p.first == pp_start + patchable_size); assert(p.first == pp_start + patchable_size);
assert(p.second == pp_end); assert(p.second == pp_end);
StackInfo stack_info(SCRATCH_BYTES, 0);
icinfo = registerCompiledPatchpoint(pp_start, pp_start + patchable_size, pp_end, pp_end, setup_info.get(), icinfo = registerCompiledPatchpoint(pp_start, pp_start + patchable_size, pp_end, pp_end, setup_info.get(),
StackInfo(), std::unordered_set<int>()); stack_info, std::unordered_set<int>());
assembler::Assembler prologue_assem((uint8_t*)addr, PROLOGUE_SIZE); assembler::Assembler prologue_assem((uint8_t*)addr, PROLOGUE_SIZE);
#if RUNTIMEICS_OMIT_FRAME_PTR #if RUNTIMEICS_OMIT_FRAME_PTR
prologue_assem.push(assembler::RAX); // If SCRATCH_BYTES is 8 or less, we could use more compact instruction encodings
// (push instead of sub), but it doesn't seem worth it for now.
prologue_assem.sub(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#else #else
prologue_assem.push(assembler::RBP); prologue_assem.push(assembler::RBP);
prologue_assem.mov(assembler::RSP, assembler::RBP); prologue_assem.mov(assembler::RSP, assembler::RBP);
prologue_assem.sub(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#endif #endif
assert(!prologue_assem.hasFailed()); assert(!prologue_assem.hasFailed());
assert(prologue_assem.isExactlyFull()); assert(prologue_assem.isExactlyFull());
assembler::Assembler epilogue_assem(pp_end, EPILOGUE_SIZE); assembler::Assembler epilogue_assem(pp_end, EPILOGUE_SIZE);
#if RUNTIMEICS_OMIT_FRAME_PTR #if RUNTIMEICS_OMIT_FRAME_PTR
epilogue_assem.pop(assembler::RDX); epilogue_assem.add(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#else #else
epilogue_assem.add(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
epilogue_assem.pop(assembler::RBP); epilogue_assem.pop(assembler::RBP);
#endif #endif
epilogue_assem.retq(); epilogue_assem.retq();
......
...@@ -20,6 +20,8 @@ for i in xrange(400): ...@@ -20,6 +20,8 @@ for i in xrange(400):
while nworkers >= MAX_WORKERS: while nworkers >= MAX_WORKERS:
cv.wait() cv.wait()
nworkers += 1
t = threading.Thread(target=worker) t = threading.Thread(target=worker)
t.start() t.start()
threads.append(t) threads.append(t)
......
# skip-if: True
# - this is blocking on some rewriter stuff
import urlparse import urlparse
print urlparse.urlparse("http://www.dropbox.com") print urlparse.urlparse("http://www.dropbox.com")
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment