Commit 1a6b1e0c authored by Kevin Modzelewski

Add scratch space to our runtime ics

(scratch space is stack space pre-allocated up front, since the IC can't
allocate new stack space itself without breaking unwinding)

Allocate a fixed amount (currently: 40 bytes) of extra stack space
in our runtime ics.  This involves changing the function prologue and
epilogue to do more rsp adjustment, and modifying the .eh_frame sections
we generate.

One tricky thing is that we currently use frame pointer elimination
in our runtime ics, but the rest of the scratch space logic had assumed
the scratch would be rbp-relative, so I had to convert it to be rsp-relative.
parent c88e3b48
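To make the addressing change concrete, here is a minimal sketch (not code
from this commit; names borrowed from the Rewriter hunks below) of how a
scratch slot is reached once the frame pointer is gone. Because the prologue
reserves a fixed amount with sub, rsp is constant for the whole body, so each
slot is a small positive rsp-relative displacement:

    // Illustrative only; the real version is Rewriter::indirectFor() below.
    assembler::Indirect scratchSlot(int scratch_rsp_offset, int slot_offset) {
        // was: assembler::Indirect(assembler::RBP, scratch_rbp_offset + slot_offset)
        return assembler::Indirect(assembler::RSP, scratch_rsp_offset + slot_offset);
    }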
......@@ -95,19 +95,22 @@ void ICSlotRewrite::commit(CommitHook* hook) {
if (ic_entry == NULL)
return;
for (int i = 0; i < dependencies.size(); i++) {
ICInvalidator* invalidator = dependencies[i].first;
invalidator->addDependent(ic_entry);
}
uint8_t* slot_start = (uint8_t*)ic->start_addr + ic_entry->idx * ic->getSlotSize();
uint8_t* continue_point = (uint8_t*)ic->continue_addr;
hook->finishAssembly(ic_entry, continue_point - slot_start);
bool do_commit = hook->finishAssembly(ic_entry, continue_point - slot_start);
if (!do_commit)
return;
assert(assembler->isExactlyFull());
assert(!assembler->hasFailed());
for (int i = 0; i < dependencies.size(); i++) {
ICInvalidator* invalidator = dependencies[i].first;
invalidator->addDependent(ic_entry);
}
// if (VERBOSITY()) printf("Committing to %p-%p\n", start, start + ic->slot_size);
memcpy(slot_start, buf, ic->getSlotSize());
......@@ -129,18 +132,13 @@ int ICSlotRewrite::getSlotSize() {
return ic->getSlotSize();
}
int ICSlotRewrite::getFuncStackSize() {
return ic->stack_info.stack_size;
}
int ICSlotRewrite::getScratchRbpOffset() {
assert(ic->stack_info.scratch_bytes);
return ic->stack_info.scratch_rbp_offset;
int ICSlotRewrite::getScratchRspOffset() {
assert(ic->stack_info.scratch_size);
return ic->stack_info.scratch_rsp_offset;
}
int ICSlotRewrite::getScratchBytes() {
assert(ic->stack_info.scratch_bytes);
return ic->stack_info.scratch_bytes;
int ICSlotRewrite::getScratchSize() {
return ic->stack_info.scratch_size;
}
TypeRecorder* ICSlotRewrite::getTypeRecorder() {
......
......@@ -48,7 +48,7 @@ public:
class CommitHook {
public:
virtual ~CommitHook() {}
virtual void finishAssembly(ICSlotInfo* picked_slot, int fastpath_offset) = 0;
virtual bool finishAssembly(ICSlotInfo* picked_slot, int fastpath_offset) = 0;
};
private:
......@@ -67,9 +67,8 @@ public:
assembler::Assembler* getAssembler() { return assembler; }
int getSlotSize();
int getFuncStackSize();
int getScratchRbpOffset();
int getScratchBytes();
int getScratchRspOffset();
int getScratchSize();
TypeRecorder* getTypeRecorder();
......
......@@ -952,18 +952,20 @@ void Rewriter::commit() {
return;
}
rewrite->commit(this);
if (assembler->hasFailed()) {
on_assemblyfail();
return;
}
finished = true;
static StatCounter rewriter_commits("rewriter_commits");
rewriter_commits.log();
// TODO: have to check that we have enough room to write the final jmp
rewrite->commit(this);
assert(!assembler->hasFailed());
}
void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
bool Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
if (marked_inside_ic) {
void* mark_addr = &picked_slot->num_inside;
......@@ -977,6 +979,8 @@ void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
assembler->jmp(assembler::JumpDestination::fromStart(continue_offset));
assembler->fillWithNops();
return !assembler->hasFailed();
}
void Rewriter::commitReturning(RewriterVar* var) {
......@@ -995,14 +999,14 @@ void Rewriter::addDependenceOn(ICInvalidator& invalidator) {
Location Rewriter::allocScratch() {
assertPhaseEmitting();
int scratch_bytes = rewrite->getScratchBytes();
for (int i = 0; i < scratch_bytes; i += 8) {
int scratch_size = rewrite->getScratchSize();
for (int i = 0; i < scratch_size; i += 8) {
Location l(Location::Scratch, i);
if (vars_by_location.count(l) == 0) {
return l;
}
}
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_bytes);
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_size);
}
RewriterVar* Rewriter::add(RewriterVar* a, int64_t b, Location dest) {
......@@ -1042,9 +1046,9 @@ RewriterVar* Rewriter::allocate(int n) {
int Rewriter::_allocate(RewriterVar* result, int n) {
assert(n >= 1);
int scratch_bytes = rewrite->getScratchBytes();
int scratch_size = rewrite->getScratchSize();
int consec = 0;
for (int i = 0; i < scratch_bytes; i += 8) {
for (int i = 0; i < scratch_size; i += 8) {
Location l(Location::Scratch, i);
if (vars_by_location.count(l) == 0) {
consec++;
......@@ -1056,8 +1060,8 @@ int Rewriter::_allocate(RewriterVar* result, int n) {
// TODO should be a LEA instruction
// In fact, we could do something like we do for constants and only load
// this when necessary, so it won't spill. Is that worth it?
assembler->mov(assembler::RBP, r);
assembler->add(assembler::Immediate(8 * a + rewrite->getScratchRbpOffset()), r);
assembler->mov(assembler::RSP, r);
assembler->add(assembler::Immediate(8 * a + rewrite->getScratchRspOffset()), r);
// Put placeholders in so the array space doesn't get re-allocated.
// This won't get collected, but that's fine.
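As an aside on the TODO above: the mov+add pair materializes rsp plus a
constant into a register, which is exactly what a single lea does. A hedged
sketch, assuming the Assembler were given a lea() helper (it is not shown to
have one here):

    // Hypothetical one-instruction replacement for the mov/add pair:
    //   lea (8 * a + scratch_rsp_offset)(%rsp), r
    assembler->lea(assembler::Indirect(assembler::RSP, 8 * a + rewrite->getScratchRspOffset()), r);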
......@@ -1074,7 +1078,7 @@ int Rewriter::_allocate(RewriterVar* result, int n) {
consec = 0;
}
}
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_bytes);
RELEASE_ASSERT(0, "Using all %d bytes of scratch!", scratch_size);
}
RewriterVar* Rewriter::allocateAndCopy(RewriterVar* array_ptr, int n) {
......@@ -1094,7 +1098,7 @@ void Rewriter::_allocateAndCopy(RewriterVar* result, RewriterVar* array_ptr, int
for (int i = 0; i < n; i++) {
assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp);
assembler->mov(tmp, assembler::Indirect(assembler::RBP, 8 * (offset + i) + rewrite->getScratchRbpOffset()));
assembler->mov(tmp, assembler::Indirect(assembler::RSP, 8 * (offset + i) + rewrite->getScratchRspOffset()));
}
array_ptr->bumpUse();
......@@ -1121,7 +1125,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele
int offset = _allocate(result, n_rest + 1);
assembler::Register tmp = first_elem->getInReg();
assembler->mov(tmp, assembler::Indirect(assembler::RBP, 8 * offset + rewrite->getScratchRbpOffset()));
assembler->mov(tmp, assembler::Indirect(assembler::RSP, 8 * offset + rewrite->getScratchRspOffset()));
if (n_rest > 0) {
assembler::Register src_ptr = rest_ptr->getInReg();
......@@ -1131,7 +1135,7 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele
for (int i = 0; i < n_rest; i++) {
assembler->mov(assembler::Indirect(src_ptr, 8 * i), tmp);
assembler->mov(tmp,
assembler::Indirect(assembler::RBP, 8 * (offset + i + 1) + rewrite->getScratchRbpOffset()));
assembler::Indirect(assembler::RSP, 8 * (offset + i + 1) + rewrite->getScratchRspOffset()));
}
rest_ptr->bumpUse();
}
......@@ -1146,8 +1150,7 @@ assembler::Indirect Rewriter::indirectFor(Location l) {
assert(l.type == Location::Scratch || l.type == Location::Stack);
if (l.type == Location::Scratch)
// TODO it can sometimes be more efficient to do RSP-relative addressing?
return assembler::Indirect(assembler::RBP, rewrite->getScratchRbpOffset() + l.scratch_offset);
return assembler::Indirect(assembler::RSP, rewrite->getScratchRspOffset() + l.scratch_offset);
else
return assembler::Indirect(assembler::RSP, l.stack_offset);
}
......@@ -1414,6 +1417,18 @@ Rewriter::Rewriter(ICSlotRewrite* rewrite, int num_args, const std::vector<int>&
this->live_outs.push_back(var);
this->live_out_regs.push_back(dwarf_regnum);
}
// Getting the scratch space location/size wrong could be disastrous and hard to track down,
// so here's a "forcefully check it" mode, which starts every inline cache by overwriting
// the entire scratch space.
bool VALIDATE_SCRATCH_SPACE = false;
if (VALIDATE_SCRATCH_SPACE) {
int scratch_size = rewrite->getScratchSize();
for (int i = 0; i < scratch_size; i += 8) {
assembler->movq(assembler::Immediate(0x12345678UL),
assembler::Indirect(assembler::RSP, i + rewrite->getScratchRspOffset()));
}
}
}
Rewriter* Rewriter::createRewriter(void* rtn_addr, int num_args, const char* debug_name) {
......
......@@ -384,7 +384,7 @@ private:
// Do the bookkeeping to say that var is no longer in location l
void removeLocationFromVar(RewriterVar* var, Location l);
void finishAssembly(ICSlotInfo* picked_slot, int continue_offset) override;
bool finishAssembly(ICSlotInfo* picked_slot, int continue_offset) override;
void _trap();
void _loadConst(RewriterVar* result, int64_t val, Location loc);
......
......@@ -15,13 +15,18 @@
#ifndef PYSTON_ASMWRITING_TYPES_H
#define PYSTON_ASMWRITING_TYPES_H
#include "core/common.h"
namespace pyston {
struct StackInfo {
int stack_size;
int scratch_size;
int scratch_rsp_offset;
int scratch_bytes;
int scratch_rbp_offset;
StackInfo(int scratch_size, int scratch_rsp_offset)
: scratch_size(scratch_size), scratch_rsp_offset(scratch_rsp_offset) {
assert(scratch_rsp_offset >= 0);
}
};
namespace assembler {
......
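For orientation, the two producers of the StackInfo struct above fill in the
rsp offset differently; an illustrative pairing, with values taken from the
hunks further down:

    // LLVM patchpoints: LLVM hands back an rbp-relative offset, which is
    // converted first (see the processStackmap hunk below).
    StackInfo jit_info(scratch_size, scratch_rbp_offset + (stack_size - 8));
    // Runtime ICs: the prologue's sub leaves the scratch right at %rsp.
    StackInfo ic_info(SCRATCH_BYTES, 0);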
......@@ -386,9 +386,6 @@ extern "C" PyObject* Py_InitModule4(const char* name, PyMethodDef* methods, cons
Box* passthrough = static_cast<Box*>(self);
while (methods && methods->ml_name) {
if (VERBOSITY())
printf("Loading method %s\n", methods->ml_name);
RELEASE_ASSERT((methods->ml_flags & (~(METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O))) == 0, "%d",
methods->ml_flags);
module->giveAttr(methods->ml_name,
......
......@@ -1810,7 +1810,7 @@ private:
// Maybe if there are a ton of live variables it'd be nice to have them be
// heap-allocated, or if we don't immediately return the result of the OSR?
bool use_malloc = false;
if (false) {
if (use_malloc) {
llvm::Value* n_bytes = getConstantInt((sorted_symbol_table.size() - 3) * sizeof(Box*), g.i64);
llvm::Value* l_malloc = embedConstantPtr(
(void*)malloc, llvm::FunctionType::get(g.i8->getPointerTo(), g.i64, false)->getPointerTo());
......@@ -1818,7 +1818,10 @@ private:
arg_array = emitter.getBuilder()->CreateBitCast(malloc_save, g.llvm_value_type_ptr->getPointerTo());
} else {
llvm::Value* n_varargs = llvm::ConstantInt::get(g.i64, sorted_symbol_table.size() - 3, false);
arg_array = emitter.getBuilder()->CreateAlloca(g.llvm_value_type_ptr, n_varargs);
// TODO we have a number of allocas with non-overlapping lifetimes, that end up
// being redundant.
arg_array = new llvm::AllocaInst(g.llvm_value_type_ptr, n_varargs, "",
irstate->getLLVMFunction()->getEntryBlock().getFirstInsertionPt());
}
}
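Why pin the alloca to the entry block? (Background, not stated in the diff:)
LLVM folds static allocas in the entry block into the function's fixed frame,
while an alloca emitted at the current insertion point can land in a block
that executes more than once, making the stack size dynamic and tripping the
static-stack-size requirement in the patchpoint hunk below. The two forms:

    // entry-block alloca: part of the fixed frame, stack size stays static
    new llvm::AllocaInst(g.llvm_value_type_ptr, n_varargs, "",
                         entry_block.getFirstInsertionPt());
    // builder-placed alloca: emitted wherever the builder currently points
    emitter.getBuilder()->CreateAlloca(g.llvm_value_type_ptr, n_varargs);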
......
......@@ -248,9 +248,22 @@ void processStackmap(CompiledFunction* cf, StackMap* stackmap) {
assert(pp->numICStackmapArgs() == 0); // don't do anything with these for now
std::unique_ptr<ICInfo> icinfo = registerCompiledPatchpoint(
start_addr, slowpath_start, end_addr, slowpath_rtn_addr, ic,
StackInfo({ stack_size, scratch_size, scratch_rbp_offset }), std::move(live_outs));
// We currently specify the scratch's location as an RSP offset, but LLVM gives it
// to us as an RBP offset. It's easy to convert between them if the function has a static
// stack size, but if the function doesn't have a fixed stack size (which happens if there
// is a non-static alloca), then we can't convert.
// Internally, it's easy enough to handle either rsp-relative or rbp-relative offsets
// for the scratch array, but there are some places that require the use of rsp-relative
// offsets, and we don't (yet) have the ability to specify on a per-patchpoint basis
// which one we want to use.
RELEASE_ASSERT(stack_size >= 0, "function does not have static stack size!");
// (rbp - rsp) == (stack_size - 8) -- the "-8" is from the value of rbp being pushed onto the stack
int scratch_rsp_offset = scratch_rbp_offset + (stack_size - 8);
std::unique_ptr<ICInfo> icinfo
= registerCompiledPatchpoint(start_addr, slowpath_start, end_addr, slowpath_rtn_addr, ic,
StackInfo(scratch_size, scratch_rsp_offset), std::move(live_outs));
assert(cf);
// TODO: unsafe. hard to use a unique_ptr here though.
......
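A worked instance of the conversion above, with made-up numbers: suppose
stack_size = 64 and LLVM reports scratch_rbp_offset = -40. Then
rbp - rsp == 64 - 8 == 56, so:

    scratch_rsp_offset = -40 + (64 - 8) = 16
    // i.e. -40(%rbp) and 16(%rsp) name the same slot in this frame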
......@@ -522,8 +522,9 @@ ExcInfo* getFrameExcInfo() {
*copy_from_exc = ExcInfo(None, None, None);
}
assert(copy_from_exc->value);
assert(copy_from_exc->traceback);
assert(gc::isValidGCObject(copy_from_exc->type));
assert(gc::isValidGCObject(copy_from_exc->value));
assert(gc::isValidGCObject(copy_from_exc->traceback));
for (auto* ex : to_update) {
*ex = *copy_from_exc;
......
......@@ -42,25 +42,29 @@ namespace pyston {
//
// This template is generated from this C++ file:
//
// extern void foo();
// extern void foo(void*);
// int bar() {
// foo();
// char buf[N];
// foo(&buf);
// return 1;
// }
//
// (where N is the extra bytes of stack to allocate)
//
// objdump -s -j .eh_frame test
// readelf -w test
//
#if RUNTIMEICS_OMIT_FRAME_PTR
// clang++ test.cpp -o test -O3 -fomit-frame-pointer -c
// clang++ test.cpp -o test -O3 -fomit-frame-pointer -c -DN=40
// The generated assembly is:
//
// 0: 50 push %rax
// 1: e8 00 00 00 00 callq 6 <_Z3barv+0x6>
// 6: b8 01 00 00 00 mov $0x1,%eax
// b: 5a pop %rdx
// c: c3 retq
// 0: 48 83 ec 28 sub $0x28,%rsp
// 4: 48 8d 3c 24 lea (%rsp),%rdi
// 8: e8 00 00 00 00 callq d <_Z3barv+0xd>
// d: b8 01 00 00 00 mov $0x1,%eax
// 12: 48 83 c4 28 add $0x28,%rsp
// 16: c3 retq
//
// (I believe the sub/add also keep the stack 16-byte aligned at the call)
//
......@@ -84,24 +88,26 @@ static const char _eh_frame_template[] =
"\x00\x00\x00\x00" // prcel offset to function address [to be filled in]
"\x0d\x00\x00\x00" // function size [to be filled in]
"\x00" // augmentation data (none)
"\x41\x0e\x10"
"\x44\x0e\x30"
// Instructions:
// - DW_CFA_advance_loc: 1 to 00000001
// - DW_CFA_def_cfa_offset: 16
// - DW_CFA_advance_loc: 4 to 00000004
// - DW_CFA_def_cfa_offset: 48
"\x00\x00\x00\x00" // padding
"\x00\x00\x00\x00" // terminator
;
#else
// clang++ test.cpp -o test -O3 -fno-omit-frame-pointer -c
// clang++ test.cpp -o test -O3 -fno-omit-frame-pointer -c -DN=40
// The generated assembly is:
//
// 0: 55 push %rbp
// 1: 48 89 e5 mov %rsp,%rbp
// 4: e8 00 00 00 00 callq 9 <_Z3barv+0x9>
// 9: b8 01 00 00 00 mov $0x1,%eax
// e: 5d pop %rbp
// f: c3 retq
// 4: 48 83 ec 30 sub $0x30,%rsp
// 8: 48 8d 7d d0 lea -0x30(%rbp),%rdi
// c: e8 00 00 00 00 callq 11 <_Z3barv+0x11>
// 11: b8 01 00 00 00 mov $0x1,%eax
// 16: 48 83 c4 30 add $0x30,%rsp
// 1a: 5d pop %rbp
// 1b: c3 retq
//
static const char _eh_frame_template[] =
// CIE
......@@ -152,31 +158,53 @@ static void writeTrivialEhFrame(void* eh_frame_addr, void* func_addr, uint64_t f
*size_ptr = func_size;
}
#if RUNTIMEICS_OMIT_FRAME_PTR
// If you change this, you *must* update the value in _eh_frame_template
// (set the -9'th byte to this value plus 8)
#define SCRATCH_BYTES 0x28
#else
#define SCRATCH_BYTES 0x30
#endif
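To spell out the coupling the comment above warns about: in the
omit-frame-pointer template, the bytes "\x44\x0e\x30" decode as
DW_CFA_advance_loc(4) followed by DW_CFA_def_cfa_offset(0x30), and 0x30 is
SCRATCH_BYTES plus the 8-byte return address. A hypothetical compile-time
guard (not in this commit) could tie them together:

    #if RUNTIMEICS_OMIT_FRAME_PTR
    // the "-9'th byte" of _eh_frame_template is the def_cfa_offset operand
    static_assert(SCRATCH_BYTES + 8 == 0x30, "update _eh_frame_template");
    #endif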
RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) {
static StatCounter sc("runtime_ics_num");
sc.log();
if (ENABLE_RUNTIME_ICS) {
assert(SCRATCH_BYTES >= 0);
assert(SCRATCH_BYTES < 0x80); // This would break both the instruction encoding and the dwarf encoding
assert(SCRATCH_BYTES % 8 == 0);
#if RUNTIMEICS_OMIT_FRAME_PTR
static const int PROLOGUE_SIZE = 1;
#else
/*
* We emit a prologue since we want to align the stack pointer,
* and also use RBP.
* It's not clear if we need to use RBP or not, since we emit the .eh_frame section anyway.
* prologue:
* sub $0x28, %rsp # 48 83 ec 28
*
* epilogue:
* add $0x28, %rsp # 48 83 c4 28
* retq # c3
*
*/
static const int PROLOGUE_SIZE = 4;
static const int EPILOGUE_SIZE = 5;
assert(SCRATCH_BYTES % 16 == 8);
#else
/*
* The prologue looks like:
* push %rbp # 55
* mov %rsp, %rbp # 48 89 e5
* push %rbp # 55
* mov %rsp, %rbp # 48 89 e5
* sub $0x30, %rsp # 48 83 ec 30
*
* The epilogue is:
* pop %rbp # 5d
* retq # c3
* add $0x30, %rsp # 48 83 c4 30
* pop %rbp # 5d
* retq # c3
*/
static const int PROLOGUE_SIZE = 4;
static const int PROLOGUE_SIZE = 8;
static const int EPILOGUE_SIZE = 6;
assert(SCRATCH_BYTES % 16 == 0);
#endif
static const int CALL_SIZE = 13;
static const int EPILOGUE_SIZE = 2;
int patchable_size = num_slots * slot_size;
int total_size = PROLOGUE_SIZE + patchable_size + CALL_SIZE + EPILOGUE_SIZE;
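The two alignment asserts above follow from the SysV x86-64 ABI: rsp is
congruent to 8 (mod 16) at function entry (the caller's call pushed an 8-byte
return address), and must be back to 0 (mod 16) at the patchpoint's call.
Checking both branches:

    without frame pointer:  8 - 0x28 = 8 - 40 = -32, and -32 % 16 == 0, so SCRATCH_BYTES % 16 == 8
    with frame pointer:     8 - 8 (push %rbp) - 0x30 = -48, and -48 % 16 == 0, so SCRATCH_BYTES % 16 == 0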
......@@ -198,23 +226,28 @@ RuntimeIC::RuntimeIC(void* func_addr, int num_slots, int slot_size) {
assert(p.first == pp_start + patchable_size);
assert(p.second == pp_end);
StackInfo stack_info(SCRATCH_BYTES, 0);
icinfo = registerCompiledPatchpoint(pp_start, pp_start + patchable_size, pp_end, pp_end, setup_info.get(),
StackInfo(), std::unordered_set<int>());
stack_info, std::unordered_set<int>());
assembler::Assembler prologue_assem((uint8_t*)addr, PROLOGUE_SIZE);
#if RUNTIMEICS_OMIT_FRAME_PTR
prologue_assem.push(assembler::RAX);
// If SCRATCH_BYTES is 8 or less, we could use more compact instruction encodings
// (push instead of sub), but it doesn't seem worth it for now.
prologue_assem.sub(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#else
prologue_assem.push(assembler::RBP);
prologue_assem.mov(assembler::RSP, assembler::RBP);
prologue_assem.sub(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#endif
assert(!prologue_assem.hasFailed());
assert(prologue_assem.isExactlyFull());
assembler::Assembler epilogue_assem(pp_end, EPILOGUE_SIZE);
#if RUNTIMEICS_OMIT_FRAME_PTR
epilogue_assem.pop(assembler::RDX);
epilogue_assem.add(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
#else
epilogue_assem.add(assembler::Immediate(SCRATCH_BYTES), assembler::RSP);
epilogue_assem.pop(assembler::RBP);
#endif
epilogue_assem.retq();
......
......@@ -20,6 +20,8 @@ for i in xrange(400):
while nworkers >= MAX_WORKERS:
cv.wait()
nworkers += 1
t = threading.Thread(target=worker)
t.start()
threads.append(t)
......
# skip-if: True
# - this is blocking on some rewriter stuff
import urlparse
print urlparse.urlparse("http://www.dropbox.com")