Commit e7174486 authored by Marius Wachtler's avatar Marius Wachtler

Merge pull request #1227 from undingen/bjit_opt2

major bjit improvements
parents 820dd0cd 9fd4924f
......@@ -755,6 +755,7 @@ void Assembler::incq(Indirect mem) {
}
assert(src_idx >= 0 && src_idx < 8);
bool needssib = (src_idx == 0b100);
if (rex)
emitRex(rex);
......@@ -763,8 +764,12 @@ void Assembler::incq(Indirect mem) {
assert(-0x80 <= mem.offset && mem.offset < 0x80);
if (mem.offset == 0) {
emitModRM(0b00, 0, src_idx);
if (needssib)
emitSIB(0b00, 0b100, src_idx);
} else {
emitModRM(0b01, 0, src_idx);
if (needssib)
emitSIB(0b00, 0b100, src_idx);
emitByte(mem.offset);
}
}
......
This diff is collapsed.
......@@ -16,6 +16,7 @@
#define PYSTON_ASMWRITING_REWRITER_H
#include <deque>
#include <forward_list>
#include <list>
#include <map>
#include <memory>
......@@ -192,8 +193,6 @@ public:
// if no action is specified it will assume the last action consumed the reference
void refConsumed(RewriterAction* action = NULL);
void refUsed();
// registerOwnedAttr tells the refcounter that a certain memory location holds a pointer
// to an owned reference. This must be paired with a call to deregisterOwnedAttr
// Call these right before emitting the store (for register) or decref (for deregister).
......@@ -237,11 +236,11 @@ private:
// /* some code */
// bumpUseLateIfNecessary();
// Calls bumpUse() right away when that is permitted. The bump is skipped for OWNED
// references and for vars that currently have a scratch allocation.
void bumpUseEarlyIfPossible() {
    const bool must_not_bump_yet = reftype == RefType::OWNED || hasScratchAllocation();
    if (must_not_bump_yet)
        return;
    bumpUse();
}
// Calls bumpUse() in exactly the cases where bumpUseEarlyIfPossible() does not:
// the var is an OWNED reference, or it has a scratch allocation.
//
// A single condition is used on purpose. Nesting this test inside an outer
// `reftype == RefType::OWNED` check would make the `hasScratchAllocation()` half
// unreachable, and non-OWNED vars with a scratch allocation would then be bumped
// neither early nor late.
void bumpUseLateIfNecessary() {
    if (reftype == RefType::OWNED || hasScratchAllocation())
        bumpUse();
}
......@@ -254,7 +253,7 @@ private:
// True when next_use equals uses.size() (presumably: every recorded use has been consumed -- confirm).
bool isDoneUsing() { return next_use == uses.size(); }
// True while the second element of scratch_allocation is positive.
bool hasScratchAllocation() const { return scratch_allocation.second > 0; }
// Resets scratch_allocation to (0, 0); hasScratchAllocation() returns false afterwards.
void resetHasScratchAllocation() { scratch_allocation = std::make_pair(0, 0); }
bool needsDecref();
bool needsDecref(int current_action_index);
// Indicates if this variable is an arg, and if so, what location the arg is from.
bool is_arg;
......@@ -339,8 +338,9 @@ public:
class RewriterAction {
public:
SmallFunction<56> action;
std::vector<RewriterVar*> consumed_refs;
SmallFunction<48> action;
std::forward_list<RewriterVar*> consumed_refs;
template <typename F> RewriterAction(F&& action) : action(std::forward<F>(action)) {}
......@@ -367,7 +367,33 @@ private:
protected:
// Allocates `bytes` bytes of data. The allocation will get freed when the rewriter gets freed.
void* regionAlloc(size_t bytes) { return allocator.Allocate(bytes, 16 /* alignment */); }
void* regionAlloc(size_t bytes, int alignment = 16) { return allocator.Allocate(bytes, alignment); }
// Typed variant: asks the allocator for room for `num_elements` objects of type T
// and returns a mutable view covering exactly those elements.
template <typename T> llvm::MutableArrayRef<T> regionAlloc(size_t num_elements) {
    T* storage = allocator.Allocate<T>(num_elements);
    return llvm::MutableArrayRef<T>(storage, num_elements);
}
// Concatenates any number of llvm::ArrayRef<RewriterVar*> into one contiguous array
// obtained from regionAlloc(), preserving the order in which the arrays were passed.
// Returns an empty MutableArrayRef (without allocating) when all inputs are empty.
template <typename... Args>
llvm::MutableArrayRef<RewriterVar*> regionAllocArgs(llvm::ArrayRef<RewriterVar*> arg1, Args... args) {
    // First pass: find out how much room is needed in total.
    size_t total_elements = 0;
    for (auto&& source : { arg1, args... })
        total_elements += source.size();

    if (!total_elements)
        return llvm::MutableArrayRef<RewriterVar*>();

    auto combined = regionAlloc<RewriterVar*>(total_elements);

    // Second pass: copy each non-empty input into the not-yet-filled tail of `combined`.
    auto unfilled = combined;
    for (auto&& source : { arg1, args... }) {
        if (source.empty())
            continue;
        memcpy(unfilled.data(), source.data(), source.size() * sizeof(RewriterVar*));
        unfilled = unfilled.slice(source.size());
    }

    // Every allocated slot must have been written.
    assert(unfilled.size() == 0);
    return combined;
}
// Helps generate the best code for loading a const integer value
// by keeping track of the last known value of every register and reusing it.
......@@ -432,6 +458,8 @@ protected:
bool needs_invalidation_support = true);
std::deque<RewriterAction> actions;
int current_action_idx; // in the emitting phase, gets set to the index of the currently executed action
template <typename F> RewriterAction* addAction(F&& action, llvm::ArrayRef<RewriterVar*> vars, ActionType type) {
assertPhaseCollecting();
for (RewriterVar* var : vars) {
......@@ -483,6 +511,8 @@ protected:
// Allocates a register. dest must be of type Register or AnyReg
// If otherThan is a register, guaranteed to not use that register.
assembler::Register allocReg(Location dest, Location otherThan = Location::any());
assembler::Register allocReg(Location dest, Location otherThan,
llvm::ArrayRef<assembler::Register> valid_registers);
assembler::XMMRegister allocXMMReg(Location dest, Location otherThan = Location::any());
// Allocates an 8-byte region in the scratch space
Location allocScratch();
......@@ -507,11 +537,13 @@ protected:
void _slowpathJump(bool condition_eq);
void _trap();
void _loadConst(RewriterVar* result, int64_t val);
void _setupCall(bool has_side_effects, llvm::ArrayRef<RewriterVar*> args, llvm::ArrayRef<RewriterVar*> args_xmm,
Location preserve = Location::any());
void _setupCall(bool has_side_effects, llvm::ArrayRef<RewriterVar*> args = {},
llvm::ArrayRef<RewriterVar*> args_xmm = {}, Location preserve = Location::any(),
llvm::ArrayRef<RewriterVar*> bump_if_possible = {});
// _call does not call bumpUse on its arguments:
void _call(RewriterVar* result, bool has_side_effects, void* func_addr, llvm::ArrayRef<RewriterVar*> args,
llvm::ArrayRef<RewriterVar*> args_xmm);
void _call(RewriterVar* result, bool has_side_effects, bool can_throw, void* func_addr,
llvm::ArrayRef<RewriterVar*> args, llvm::ArrayRef<RewriterVar*> args_xmm = {},
llvm::ArrayRef<RewriterVar*> vars_to_bump = {});
void _add(RewriterVar* result, RewriterVar* a, int64_t b, Location dest);
int _allocate(RewriterVar* result, int n);
void _allocateAndCopy(RewriterVar* result, RewriterVar* array, int n);
......@@ -565,6 +597,8 @@ protected:
#endif
}
llvm::ArrayRef<assembler::Register> allocatable_regs;
public:
// This should be called exactly once for each argument
RewriterVar* getArg(int argnum);
......@@ -606,16 +640,13 @@ public:
// 2) does not have any side-effects that would be user-visible if we bailed out from the middle of the
// inline cache. (Extra allocations don't count even though they're potentially visible if you look
// hard enough.)
RewriterVar* call(bool has_side_effects, void* func_addr, const RewriterVar::SmallVector& args,
const RewriterVar::SmallVector& args_xmm = RewriterVar::SmallVector());
RewriterVar* call(bool has_side_effects, void* func_addr);
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg0);
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg0, RewriterVar* arg1);
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg0, RewriterVar* arg1, RewriterVar* arg2);
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg0, RewriterVar* arg1, RewriterVar* arg2,
RewriterVar* arg3);
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg0, RewriterVar* arg1, RewriterVar* arg2,
RewriterVar* arg3, RewriterVar* arg4);
RewriterVar* call(bool has_side_effects, void* func_addr, llvm::ArrayRef<RewriterVar*> args = {},
llvm::ArrayRef<RewriterVar*> args_xmm = {}, llvm::ArrayRef<RewriterVar*> additional_uses = {});
// Convenience overload that accepts the call arguments individually (at least one
// RewriterVar*). It packs them into an llvm::ArrayRef<RewriterVar*> and calls call()
// again with that array plus an empty braced list as the fourth argument.
template <typename... Args>
RewriterVar* call(bool has_side_effects, void* func_addr, RewriterVar* arg1, Args... args) {
// The braced list is a temporary that only lives until the end of this full expression,
// so the ArrayRef must be built inline in the call and must not be stored in a local first.
return call(has_side_effects, func_addr, llvm::ArrayRef<RewriterVar*>({ arg1, args... }), {});
}
RewriterVar* add(RewriterVar* a, int64_t b, Location dest);
// Allocates n pointer-sized stack slots:
RewriterVar* allocate(int n);
......
......@@ -458,7 +458,7 @@ void ASTInterpreter::doStore(AST_Name* node, STOLEN(Value) value) {
ScopeInfo::VarScopeType vst = node->lookup_type;
if (vst == ScopeInfo::VarScopeType::GLOBAL) {
if (jit)
jit->emitSetGlobal(frame_info.globals, name.getBox(), value);
jit->emitSetGlobal(name.getBox(), value, getMD()->source->scoping->areGlobalsFromModule());
setGlobal(frame_info.globals, name.getBox(), value.o);
} else if (vst == ScopeInfo::VarScopeType::NAME) {
if (jit)
......@@ -471,13 +471,12 @@ void ASTInterpreter::doStore(AST_Name* node, STOLEN(Value) value) {
bool closure = vst == ScopeInfo::VarScopeType::CLOSURE;
if (jit) {
bool is_live = true;
// TODO: turn this optimization back on.
// if (!closure)
// is_live = source_info->getLiveness()->isLiveAtEnd(name, current_block);
if (!closure)
is_live = source_info->getLiveness()->isLiveAtEnd(name, current_block);
if (is_live)
jit->emitSetLocal(name, node->vreg, closure, value);
else
jit->emitSetBlockLocal(name, value);
jit->emitSetBlockLocal(name, node->vreg, value);
}
if (closure) {
......@@ -686,12 +685,12 @@ Value ASTInterpreter::visit_jump(AST_Jump* node) {
if (backedge)
++edgecount;
if (ENABLE_BASELINEJIT && backedge && edgecount == OSR_THRESHOLD_INTERPRETER && !jit && !node->target->code) {
if (ENABLE_BASELINEJIT && backedge && edgecount >= OSR_THRESHOLD_INTERPRETER && !jit && !node->target->code) {
should_jit = true;
startJITing(node->target);
}
if (backedge && edgecount == OSR_THRESHOLD_BASELINE) {
if (backedge && edgecount >= OSR_THRESHOLD_BASELINE) {
Box* rtn = doOSR(node);
if (rtn)
return Value(rtn, NULL);
......@@ -1173,12 +1172,9 @@ Value ASTInterpreter::createFunction(AST* node, AST_arguments* args, const std::
closure_var = jit->imm(0ul);
if (!passed_globals_var)
passed_globals_var = jit->imm(0ul);
rtn.var = jit->call(false, (void*)createFunctionFromMetadata, jit->imm(md), closure_var, passed_globals_var,
defaults_var, jit->imm(args->defaults.size()))->setType(RefType::OWNED);
for (auto d_var : defaults_vars) {
d_var->refUsed();
}
rtn.var = jit->call(false, (void*)createFunctionFromMetadata, { jit->imm(md), closure_var, passed_globals_var,
defaults_var, jit->imm(args->defaults.size()) },
{}, defaults_vars)->setType(RefType::OWNED);
}
rtn.o = createFunctionFromMetadata(md, closure, passed_globals, u.il);
......@@ -1661,7 +1657,7 @@ Value ASTInterpreter::visit_name(AST_Name* node) {
assert(!node->is_kill);
Value v;
if (jit)
v.var = jit->emitGetGlobal(frame_info.globals, node->id.getBox());
v.var = jit->emitGetGlobal(node->id.getBox());
v.o = getGlobal(frame_info.globals, node->id.getBox());
return v;
......@@ -1781,6 +1777,10 @@ int ASTInterpreterJitInterface::getBoxedLocalsOffset() {
return offsetof(ASTInterpreter, frame_info.boxedLocals);
}
// Returns the byte offset of the `created_closure` member inside ASTInterpreter.
int ASTInterpreterJitInterface::getCreatedClosureOffset() {
    const int created_closure_offset = offsetof(ASTInterpreter, created_closure);
    return created_closure_offset;
}
// Returns the byte offset of the `current_block` member inside ASTInterpreter.
int ASTInterpreterJitInterface::getCurrentBlockOffset() {
    const int current_block_offset = offsetof(ASTInterpreter, current_block);
    return current_block_offset;
}
......
......@@ -39,6 +39,7 @@ struct ASTInterpreterJitInterface {
static constexpr uint64_t osr_dummy_value = -1;
static int getBoxedLocalsOffset();
static int getCreatedClosureOffset();
static int getCurrentBlockOffset();
static int getCurrentInstOffset();
static int getEdgeCountOffset();
......
This diff is collapsed.
......@@ -23,6 +23,9 @@
namespace pyston {
// Passes MAP_32BIT to mmap when allocating the memory for the bjit code.
// This is nice for inspecting the generated asm because the debugger is then able to show the names of called C/C++ functions.
#define ENABLE_BASELINEJIT_MAP_32BIT 1
#define ENABLE_BASELINEJIT_ICS 1
class AST_stmt;
......@@ -70,8 +73,9 @@ class JitFragmentWriter;
// register or stack slot but we aren't if it outlives the block - we have to store it in the interpreter instance.
//
// We use the following callee-save regs to speed up the generated code:
// r13: pointer to ASTInterpreter instance
// r14: pointer to the vregs array
// r12, r15: temporary values
// r13: pointer to ASTInterpreter instance
// r14: pointer to the vregs array
//
// To execute a specific CFGBlock one has to call:
// CFGBlock* block;
......@@ -90,8 +94,10 @@ class JitFragmentWriter;
//
// Basic layout of generated code block is:
// entry_code:
// push %r15 ; save r15
// push %r14 ; save r14
// push %r13 ; save r13
// push %r12 ; save r12
// sub $0x118,%rsp ; setup scratch, 0x118 = scratch_size + 16 = space for two func args passed on the
// stack + 8 byte for stack alignment
// mov %rdi,%r13 ; copy the pointer to ASTInterpreter instance into r13
......@@ -107,8 +113,10 @@ class JitFragmentWriter;
// jne end_side_exit
// movabs $0x215bb60,%rax ; rax = CFGBlock* to interpret next (rax is the 1. return reg)
// add $0x118,%rsp ; restore stack pointer
// pop %r12 ; restore r12
// pop %r13 ; restore r13
// pop %r14 ; restore r14
// pop %r15 ; restore r15
// ret ; exit to the interpreter which will interpret the specified CFGBlock*
// end_side_exit:
// ....
......@@ -120,8 +128,10 @@ class JitFragmentWriter;
// in this case 0 which means we are finished
// movabs $0x1270014108,%rdx ; rdx must contain the Box* value to return
// add $0x118,%rsp ; restore stack pointer
// pop %r12 ; restore r12
// pop %r13 ; restore r13
// pop %r14 ; restore r14
// pop %r15 ; restore r15
// ret
//
// nth_JitFragment:
......@@ -140,8 +150,18 @@ public:
static constexpr int sp_adjustment = scratch_size + num_stack_args * 8 + 8 /* = alignment */;
private:
// Holder for the start address of a block of memory, exposed through get().
// The constructor and destructor are defined elsewhere; presumably they acquire and
// release the block -- confirm against their definitions.
//
// Copying is disabled: the type has a user-declared destructor and a raw pointer member,
// so an implicit copy would leave two objects whose destructors both act on the same address.
struct MemoryManager {
private:
    uint8_t* addr; // start address of the managed block

public:
    MemoryManager();
    ~MemoryManager();

    MemoryManager(const MemoryManager&) = delete;
    MemoryManager& operator=(const MemoryManager&) = delete;

    // Returns the start address of the managed block.
    uint8_t* get() { return addr; }
};
// the memory block contains the EH frame directly followed by the generated machine code.
std::unique_ptr<uint8_t[]> memory;
MemoryManager memory;
int entry_offset;
assembler::Assembler a;
bool is_currently_writing;
......@@ -234,7 +254,7 @@ public:
RewriterVar* emitGetBoxedLocal(BoxedString* s);
RewriterVar* emitGetBoxedLocals();
RewriterVar* emitGetClsAttr(RewriterVar* obj, BoxedString* s);
RewriterVar* emitGetGlobal(Box* global, BoxedString* s);
RewriterVar* emitGetGlobal(BoxedString* s);
RewriterVar* emitGetItem(AST_expr* node, RewriterVar* value, RewriterVar* slice);
RewriterVar* emitGetLocal(InternedString s, int vreg);
RewriterVar* emitGetPystonIter(RewriterVar* v);
......@@ -265,10 +285,10 @@ public:
void emitRaise3(RewriterVar* arg0, RewriterVar* arg1, RewriterVar* arg2);
void emitReturn(RewriterVar* v);
void emitSetAttr(AST_expr* node, RewriterVar* obj, BoxedString* s, STOLEN(RewriterVar*) attr);
void emitSetBlockLocal(InternedString s, STOLEN(RewriterVar*) v);
void emitSetBlockLocal(InternedString s, int vreg, STOLEN(RewriterVar*) v);
void emitSetCurrentInst(AST_stmt* node);
void emitSetExcInfo(RewriterVar* type, RewriterVar* value, RewriterVar* traceback);
void emitSetGlobal(Box* global, BoxedString* s, STOLEN(RewriterVar*) v);
void emitSetGlobal(BoxedString* s, STOLEN(RewriterVar*) v, bool are_globals_from_module);
void emitSetItemName(BoxedString* s, RewriterVar* v);
void emitSetItem(RewriterVar* target, RewriterVar* slice, RewriterVar* value);
void emitSetLocal(InternedString s, int vreg, bool set_closure, STOLEN(RewriterVar*) v);
......@@ -296,8 +316,9 @@ private:
RewriterVar* emitCallWithAllocatedArgs(void* func_addr, const llvm::ArrayRef<RewriterVar*> args,
const llvm::ArrayRef<RewriterVar*> additional_uses);
std::pair<RewriterVar*, RewriterAction*> emitPPCall(void* func_addr, llvm::ArrayRef<RewriterVar*> args,
int num_slots, int slot_size, AST* ast_node = NULL,
TypeRecorder* type_recorder = NULL);
unsigned char num_slots, unsigned short slot_size,
AST* ast_node = NULL, TypeRecorder* type_recorder = NULL,
llvm::ArrayRef<RewriterVar*> additional_uses = {});
static void assertNameDefinedHelper(const char* id);
static Box* callattrHelper(Box* obj, BoxedString* attr, CallattrFlags flags, TypeRecorder* type_recorder,
......@@ -308,8 +329,8 @@ private:
static Box* createTupleHelper(uint64_t num, Box** data);
static Box* exceptionMatchesHelper(Box* obj, Box* cls);
static Box* hasnextHelper(Box* b);
static Box* nonzeroHelper(Box* b);
static Box* notHelper(Box* b);
static BORROWED(Box*) nonzeroHelper(Box* b);
static BORROWED(Box*) notHelper(Box* b);
static Box* runtimeCallHelper(Box* obj, ArgPassSpec argspec, TypeRecorder* type_recorder, Box** args,
std::vector<BoxedString*>* keyword_names);
......@@ -317,7 +338,7 @@ private:
void _emitJump(CFGBlock* b, RewriterVar* block_next, ExitInfo& exit_info);
void _emitOSRPoint();
void _emitPPCall(RewriterVar* result, void* func_addr, llvm::ArrayRef<RewriterVar*> args, int num_slots,
int slot_size, AST* ast_node);
int slot_size, AST* ast_node, llvm::ArrayRef<RewriterVar*> vars_to_bump);
void _emitRecordType(RewriterVar* type_recorder_var, RewriterVar* obj_cls_var);
void _emitReturn(RewriterVar* v);
void _emitSideExit(STOLEN(RewriterVar*) var, RewriterVar* val_constant, CFGBlock* next_block,
......
......@@ -593,11 +593,7 @@ public:
assert(l.stack_second_offset % 8 == 0);
b = b_ptr[l.stack_second_offset / 8];
} else if (l.type == Location::Register) {
RELEASE_ASSERT(0, "untested");
// This branch should never get hit since we shouldn't generate Register locations,
// since we don't allow allocating callee-save registers.
// If we did, this code might be right:
// b = (Box*)get_cursor_reg(cursor, l.regnum);
b = (Box*)get_cursor_reg(cursor, l.regnum);
} else {
RELEASE_ASSERT(0, "not implemented");
}
......
......@@ -4870,32 +4870,32 @@ Box* callCLFunc(FunctionMetadata* md, CallRewriteArgs* rewrite_args, int num_out
} else {
// Hacky workaround: the rewriter can only pass arguments in registers, so use this helper function
// to unpack some of the additional arguments:
llvm::SmallVector<RewriterVar*, 4> additional_uses;
RewriterVar* arg_array = rewrite_args->rewriter->allocate(4);
arg_vec.push_back(arg_array);
if (num_output_args >= 1)
if (num_output_args >= 1) {
arg_array->setAttr(0, rewrite_args->arg1, RewriterVar::SetattrType::REF_USED);
if (num_output_args >= 2)
additional_uses.push_back(rewrite_args->arg1);
}
if (num_output_args >= 2) {
arg_array->setAttr(8, rewrite_args->arg2, RewriterVar::SetattrType::REF_USED);
if (num_output_args >= 3)
additional_uses.push_back(rewrite_args->arg2);
}
if (num_output_args >= 3) {
arg_array->setAttr(16, rewrite_args->arg3, RewriterVar::SetattrType::REF_USED);
if (num_output_args >= 4)
additional_uses.push_back(rewrite_args->arg3);
}
if (num_output_args >= 4) {
arg_array->setAttr(24, rewrite_args->args, RewriterVar::SetattrType::REF_USED);
additional_uses.push_back(rewrite_args->args);
}
if (S == CXX)
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, (void*)astInterpretHelper, arg_vec)
->setType(RefType::OWNED);
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, (void*)astInterpretHelper, arg_vec, {},
additional_uses)->setType(RefType::OWNED);
else
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, (void*)astInterpretHelperCapi, arg_vec)
->setType(RefType::OWNED);
if (num_output_args >= 1)
rewrite_args->arg1->refUsed();
if (num_output_args >= 2)
rewrite_args->arg2->refUsed();
if (num_output_args >= 3)
rewrite_args->arg3->refUsed();
if (num_output_args >= 4)
rewrite_args->args->refUsed();
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, (void*)astInterpretHelperCapi, arg_vec,
{}, additional_uses)->setType(RefType::OWNED);
}
rewrite_args->out_success = true;
......@@ -5732,8 +5732,8 @@ Box* compareInternal(Box* lhs, Box* rhs, int op_type, CompareRewriteArgs* rewrit
bool neg = (op_type == AST_TYPE::IsNot);
if (rewrite_args) {
RewriterVar* cmpres = rewrite_args->lhs->cmp(neg ? AST_TYPE::NotEq : AST_TYPE::Eq, rewrite_args->rhs,
rewrite_args->destination);
RewriterVar* cmpres
= rewrite_args->lhs->cmp(neg ? AST_TYPE::NotEq : AST_TYPE::Eq, rewrite_args->rhs, assembler::RDI);
rewrite_args->out_rtn
= rewrite_args->rewriter->call(false, (void*)boxBool, cmpres)->setType(RefType::OWNED);
rewrite_args->out_success = true;
......@@ -7302,6 +7302,9 @@ extern "C" Box* getGlobal(Box* globals, BoxedString* name) {
}
extern "C" void setGlobal(Box* globals, BoxedString* name, STOLEN(Box*) value) {
static StatCounter slowpath_setglobal("slowpath_setglobal");
slowpath_setglobal.log();
if (globals->cls == attrwrapper_cls) {
globals = unwrapAttrWrapper(globals);
RELEASE_ASSERT(globals->cls == module_cls, "%s", globals->cls->tp_name);
......
# fail-if: '-n' in EXTRA_JIT_ARGS or '-O' in EXTRA_JIT_ARGS
# expected: fail
# this only works in the interpreter and not in the bjit and llvm jit
class C(object):
def next(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment