Commit 06274389 authored by Kevin Modzelewski

Change some of the low-level initialization of patchpoints

Will make it easier to initialize non-ic patchpoints that
are coming up.
parent dbc15587
......@@ -17,6 +17,7 @@
#include <cstring>
#include "core/common.h"
#include "core/options.h"
namespace pyston {
namespace assembler {
......@@ -31,14 +32,14 @@ void Register::dump() const {
const int dwarf_to_gp[] = {
// http://www.x86-64.org/documentation/abi.pdf#page=57
0, // 0
2, // 1
0, // 0 -> rax
2, // 1 -> rdx
1, // 2 -> rcx
3, // 3 -> rbx
6, // 4
7, // 5
6, // 4 -> rsi
7, // 5 -> rdi
5, // 6 -> rbp
4, // 7
4, // 7 -> rsp
8, // 8 -> r8
9, // 9 -> r9
10, // 10 -> r10
......@@ -331,7 +332,6 @@ void Assembler::movsd(XMMRegister src, Indirect dest) {
int dest_idx = dest.base.regnum;
if (src_idx >= 8) {
trap();
rex |= REX_R;
src_idx -= 8;
}
......@@ -680,21 +680,21 @@ uint8_t* Assembler::emitCall(void* ptr, Register scratch) {
return addr;
}
void Assembler::emitBatchPush(StackInfo stack_info, const std::vector<GenericRegister>& to_push) {
assert(stack_info.has_scratch);
void Assembler::emitBatchPush(int scratch_rbp_offset, int scratch_size, const std::vector<GenericRegister>& to_push) {
int offset = 0;
for (const GenericRegister& r : to_push) {
assert(stack_info.scratch_bytes >= offset + 8);
Indirect next_slot(RBP, offset + stack_info.scratch_rbp_offset);
Indirect next_slot(RBP, offset + scratch_rbp_offset);
if (r.type == GenericRegister::GP) {
Register gp = r.gp;
assert(gp.regnum >= 0 && gp.regnum < 16);
assert(scratch_size >= offset + 8);
mov(gp, next_slot);
offset += 8;
} else if (r.type == GenericRegister::XMM) {
XMMRegister reg = r.xmm;
assert(scratch_size >= offset + 8);
movsd(reg, next_slot);
offset += 8;
} else {
......@@ -703,13 +703,12 @@ void Assembler::emitBatchPush(StackInfo stack_info, const std::vector<GenericReg
}
}
void Assembler::emitBatchPop(StackInfo stack_info, const std::vector<GenericRegister>& to_push) {
assert(stack_info.has_scratch);
void Assembler::emitBatchPop(int scratch_rbp_offset, int scratch_size, const std::vector<GenericRegister>& to_push) {
int offset = 0;
for (const GenericRegister& r : to_push) {
assert(stack_info.scratch_bytes >= offset + 8);
Indirect next_slot(RBP, offset + stack_info.scratch_rbp_offset);
assert(scratch_size >= offset + 8);
Indirect next_slot(RBP, offset + scratch_rbp_offset);
if (r.type == GenericRegister::GP) {
Register gp = r.gp;
......@@ -743,71 +742,5 @@ void Assembler::emitAnnotation(int num) {
cmp(RAX, Immediate(num));
nop();
}
// Rewrites a compiled patchpoint in place so it can serve as an inline cache:
// the LLVM-emitted "mov $imm64, %r11; callq *%r11" sequence at start_addr is
// erased (the fast path [start_addr, slowpath_start) becomes all nops) and the
// call is re-emitted in the slowpath region [slowpath_start, end_addr), with
// the live-out registers spilled to the scratch area before the call and
// restored after it.
// Returns the address immediately after the re-emitted call instruction (the
// slowpath return address).
uint8_t* initializePatchpoint2(uint8_t* start_addr, uint8_t* slowpath_start, uint8_t* end_addr, StackInfo stack_info,
const std::unordered_set<int>& live_outs) {
assert(start_addr < slowpath_start);
// Size of the expected "mov $imm64, %r11" (10 bytes) + "callq *%r11" (3 bytes).
static const int INITIAL_CALL_SIZE = 13;
assert(end_addr > slowpath_start + INITIAL_CALL_SIZE);
#ifndef NDEBUG
// if (VERBOSITY()) printf("initializing patchpoint at %p - %p\n", addr, addr + size);
// for (int i = 0; i < size; i++) {
// printf("%02x ", *(addr + i));
//}
// printf("\n");
// Check the exact form of the patchpoint call.
// It's important to make sure that the only live registers
// are the ones that are used as arguments; ie it wouldn't
// matter if the call happened on %r10 instead of %r11,
// but it would matter if there wasn't a mov immediately before
// the call, since then %r11 would be live and we couldn't
// use it as a temporary.
// mov $imm, %r11:
ASSERT(start_addr[0] == 0x49, "%x", start_addr[0]);
assert(start_addr[1] == 0xbb);
// 8 bytes of the addr
// callq *%r11:
assert(start_addr[10] == 0x41);
assert(start_addr[11] == 0xff);
assert(start_addr[12] == 0xd3);
// Everything after the call must be nop padding (0x90 or 0x0f 0x1f forms,
// possibly with 0x66/0x2e prefix bytes).
int i = INITIAL_CALL_SIZE;
while (*(start_addr + i) == 0x66 || *(start_addr + i) == 0x0f || *(start_addr + i) == 0x2e)
i++;
assert(*(start_addr + i) == 0x90 || *(start_addr + i) == 0x1f);
#endif
// The call target is the mov's 8-byte immediate at bytes [2, 10).
// NOTE(review): this is an unaligned type-punned load — works on x86-64 but
// formally UB; a memcpy would be the portable form.
void* call_addr = *(void**)&start_addr[2];
// Erase the original call; the fast path is pure nops until IC slots are written.
Assembler(start_addr, slowpath_start - start_addr).fillWithNops();
// Collect live-out registers that actually need saving: RSP and callee-save
// registers are preserved across the call anyway, so skip them.
std::vector<GenericRegister> regs_to_spill;
for (int dwarf_regnum : live_outs) {
GenericRegister ru = GenericRegister::fromDwarf(dwarf_regnum);
if (ru.type == GenericRegister::GP) {
if (ru.gp == RSP || ru.gp.isCalleeSave())
continue;
}
regs_to_spill.push_back(ru);
}
// Slowpath: spill live-outs to scratch, re-emit the call, restore, nop-pad.
Assembler assem(slowpath_start, end_addr - slowpath_start);
// if (regs_to_spill.size())
// assem.trap();
assem.emitBatchPush(stack_info, regs_to_spill);
uint8_t* rtn = assem.emitCall(call_addr, R11);
assem.emitBatchPop(stack_info, regs_to_spill);
assem.fillWithNops();
return rtn;
}
}
}
......@@ -115,12 +115,13 @@ public:
// Macros:
uint8_t* emitCall(void* func_addr, Register scratch);
void emitBatchPop(StackInfo stack_info, const std::vector<GenericRegister>& to_push);
void emitBatchPush(StackInfo stack_info, const std::vector<GenericRegister>& to_push);
void emitBatchPop(int scratch_rbp_offset, int scratch_size, const std::vector<GenericRegister>& to_push);
void emitBatchPush(int scratch_rbp_offset, int scratch_size, const std::vector<GenericRegister>& to_push);
void fillWithNops();
void fillWithNopsExcept(int bytes);
void emitAnnotation(int num);
uint8_t* curInstPointer() { return addr; }
bool isExactlyFull() { return addr == end_addr; }
};
......
......@@ -122,13 +122,11 @@ int ICSlotRewrite::getFuncStackSize() {
}
// RBP-relative offset of the scratch region reserved for this IC's spills.
int ICSlotRewrite::getScratchRbpOffset() {
assert(ic->stack_info.has_scratch);
assert(ic->stack_info.scratch_bytes);
return ic->stack_info.scratch_rbp_offset;
}
// Size in bytes of the scratch region reserved for this IC's spills.
int ICSlotRewrite::getScratchBytes() {
assert(ic->stack_info.has_scratch);
assert(ic->stack_info.scratch_bytes);
return ic->stack_info.scratch_bytes;
}
......@@ -198,18 +196,18 @@ ICInfo::ICInfo(void* start_addr, void* continue_addr, StackInfo stack_info, int
}
static std::unordered_map<void*, ICInfo*> ics_by_return_addr;
void registerCompiledPatchpoint(CompiledFunction* cf, uint8_t* start_addr, const ICSetupInfo* pp, StackInfo stack_info,
std::unordered_set<int> live_outs) {
int size = pp->totalSize();
uint8_t* end_addr = start_addr + size;
uint8_t* slowpath_addr = end_addr;
uint8_t* rtn_addr;
void registerCompiledPatchpoint(CompiledFunction* cf, uint8_t* start_addr, uint8_t* slowpath_start_addr,
uint8_t* continue_addr, uint8_t* slowpath_rtn_addr, const ICSetupInfo* ic,
StackInfo stack_info, std::unordered_set<int> live_outs) {
assert(slowpath_start_addr - start_addr >= ic->num_slots * ic->slot_size);
assert(slowpath_rtn_addr > slowpath_start_addr);
assert(slowpath_rtn_addr <= start_addr + ic->totalSize());
assembler::GenericRegister return_register;
assert(pp->getCallingConvention() == llvm::CallingConv::C
|| pp->getCallingConvention() == llvm::CallingConv::PreserveAll);
if (pp->hasReturnValue()) {
assert(ic->getCallingConvention() == llvm::CallingConv::C
|| ic->getCallingConvention() == llvm::CallingConv::PreserveAll);
if (ic->hasReturnValue()) {
static const int DWARF_RAX = 0;
// It's possible that the return value doesn't get used, in which case
// we can avoid copying back into RAX at the end
......@@ -222,46 +220,29 @@ void registerCompiledPatchpoint(CompiledFunction* cf, uint8_t* start_addr, const
return_register = assembler::RAX;
}
if (pp->getCallingConvention() != llvm::CallingConv::C) {
uint8_t* slowpath_start = start_addr + pp->num_slots * pp->slot_size;
rtn_addr = initializePatchpoint2(start_addr, slowpath_start, (uint8_t*)end_addr, stack_info, live_outs);
} else {
// for (int regnum : live_outs) {
//// LLVM has a bug where it incorrectly determines the set of liveouts;
//// so far it only seems to add additional ones to the set, which should
//// hopefully be safe.
//// Otherwise, I'd like to test here that it's only the registers
//// that we'd expect to be saved...
// ASSERT(regnum == 0 || regnum == 3 || regnum == 6 || regnum == 12 || regnum == 13 || regnum == 14 || regnum ==
// 15 || regnum == 7, "%d", regnum);
//}
initializePatchpoint(start_addr, size);
rtn_addr = slowpath_addr;
}
// we can let the user just slide down the nop section, but instead
// emit jumps to the end.
// Not sure if this is worth it or not?
for (int i = 0; i < pp->num_slots; i++) {
uint8_t* start = start_addr + i * pp->slot_size;
// std::unique_ptr<MCWriter> writer(createMCWriter(start, pp->slot_size * (pp->num_slots - i), 0));
for (int i = 0; i < ic->num_slots; i++) {
uint8_t* start = start_addr + i * ic->slot_size;
// std::unique_ptr<MCWriter> writer(createMCWriter(start, ic->slot_size * (ic->num_slots - i), 0));
// writer->emitNop();
// writer->emitGuardFalse();
std::unique_ptr<Assembler> writer(new Assembler(start, pp->slot_size));
std::unique_ptr<Assembler> writer(new Assembler(start, ic->slot_size));
writer->nop();
// writer->trap();
writer->jmp(JumpDestination::fromStart(pp->slot_size * (pp->num_slots - i)));
// writer->jmp(JumpDestination::fromStart(ic->slot_size * (ic->num_slots - i)));
writer->jmp(JumpDestination::fromStart(slowpath_start_addr - start));
}
ICInfo* ic = new ICInfo(start_addr, slowpath_addr, stack_info, pp->num_slots, pp->slot_size,
pp->getCallingConvention(), live_outs, return_register, pp->type_recorder);
ICInfo* icinfo = new ICInfo(start_addr, continue_addr, stack_info, ic->num_slots, ic->slot_size,
ic->getCallingConvention(), live_outs, return_register, ic->type_recorder);
ics_by_return_addr[rtn_addr] = ic;
ics_by_return_addr[slowpath_rtn_addr] = icinfo;
assert(cf);
cf->ics.push_back(ic);
cf->ics.push_back(icinfo);
}
ICInfo* getICInfo(void* rtn_addr) {
......
......@@ -129,8 +129,9 @@ public:
class ICSetupInfo;
class CompiledFunction;
void registerCompiledPatchpoint(CompiledFunction* cf, uint8_t* start_addr, const ICSetupInfo*, StackInfo stack_info,
std::unordered_set<int> live_outs);
void registerCompiledPatchpoint(CompiledFunction* cf, uint8_t* start_addr, uint8_t* slowpath_start_addr,
uint8_t* continue_addr, uint8_t* slowpath_rtn_addr, const ICSetupInfo*,
StackInfo stack_info, std::unordered_set<int> live_outs);
ICInfo* getICInfo(void* rtn_addr);
}
......
......@@ -791,63 +791,4 @@ public:
}
};
#endif
// Moves a patchpoint's initial call sequence ("mov $imm64, %r11; callq *%r11",
// CALL_SIZE bytes) from the beginning of the region [addr, addr + size) to its
// end, filling the vacated prefix with single-byte nops (0x90). This leaves
// the front of the patchpoint free to be overwritten later.
void initializePatchpoint(uint8_t* addr, int size) {
#define CALL_SIZE 13
#ifndef NDEBUG
assert(size >= CALL_SIZE);
// if (VERBOSITY()) printf("initializing patchpoint at %p - %p\n", addr, addr + size);
// for (int i = 0; i < size; i++) {
// printf("%02x ", *(addr + i));
//}
// printf("\n");
// Check the exact form of the patchpoint call.
// It's important to make sure that the only live registers
// are the ones that are used as arguments; ie it wouldn't
// matter if the call happened on %r10 instead of %r11,
// but it would matter if there wasn't a mov immediately before
// the call, since then %r11 would be live and we couldn't
// use it as a temporary.
// mov $imm, %r11:
ASSERT(addr[0] == 0x49, "%x", addr[0]);
assert(addr[1] == 0xbb);
// 8 bytes of the addr
// callq *%r11:
assert(addr[10] == 0x41);
assert(addr[11] == 0xff);
assert(addr[12] == 0xd3);
// Everything after the call must be nop padding (0x90 or 0x0f 0x1f forms,
// possibly with 0x66/0x2e prefix bytes).
int i = CALL_SIZE;
while (*(addr + i) == 0x66 || *(addr + i) == 0x0f || *(addr + i) == 0x2e)
i++;
assert(*(addr + i) == 0x90 || *(addr + i) == 0x1f);
#endif
// The source and destination ranges overlap whenever size < 2 * CALL_SIZE
// (only size >= CALL_SIZE is guaranteed), so use memmove: memcpy on
// overlapping ranges is undefined behavior.
memmove(addr + size - CALL_SIZE, addr, CALL_SIZE);
memset(addr, 0x90, size - CALL_SIZE);
}
/*
MCWriter* createMCWriter(uint8_t* addr, int size, int num_temp_regs) {
assert(num_temp_regs >= 0);
// The X86MCWriter will automatically use %r10 and %r11, so don't need
// to pass that along. But if the client requested more than two
// temporaries, err out.
assert(num_temp_regs <= 2 && "unsupported");
return new X86MCWriter(addr, size);
}
*/
}
......@@ -51,9 +51,6 @@ public:
virtual void emitCmp(AST_TYPE::AST_TYPE cmp_type, int lhs_argnum, int rhs_argnum, int dest_argnum) = 0;
virtual void emitToBool(int argnum, int dest_argnum) = 0;
};
void initializePatchpoint(uint8_t* addr, int size);
MCWriter* createMCWriter(uint8_t* addr, int size, int num_temp_regs);
}
#endif
......@@ -30,7 +30,11 @@ static const assembler::Register allocatable_regs[] = {
assembler::RDI, assembler::RSI, assembler::R8, assembler::R9, assembler::R10, assembler::R11,
// For now, cannot allocate callee-save registers since we do not restore them properly
// at potentially-unwinding callsites.
// at potentially-throwing callsites.
// Also, if we wanted to allow spilling of existing values in callee-save registers (which
// adding them to this list would by default enable), we would need to somehow tell our frame
// introspection code where we spilled them to.
//
// TODO fix that behavior, or create an unwinder that knows how to unwind through our
// inline caches.
/*
......@@ -1104,4 +1108,84 @@ RewriterVarUsage RewriterVarUsage::addUse() {
#ifndef NDEBUG
int RewriterVar::nvars = 0;
#endif
// Size of the LLVM-emitted "mov $imm64, %r11; callq *%r11" sequence.
static const int INITIAL_CALL_SIZE = 13;
static const int DWARF_RBP_REGNUM = 6;

// Reads the slowpath function pointer back out of an un-patched patchpoint.
// The patchpoint must still begin with the "mov $imm64, %r11; callq *%r11"
// sequence; the pointer is the mov's 8-byte immediate at bytes [2, 10).
void* extractSlowpathFunc(uint8_t* pp_addr) {
#ifndef NDEBUG
// mov $imm, %r11:
ASSERT(pp_addr[0] == 0x49, "%x", pp_addr[0]);
assert(pp_addr[1] == 0xbb);
// 8 bytes of the addr
// callq *%r11:
assert(pp_addr[10] == 0x41);
assert(pp_addr[11] == 0xff);
assert(pp_addr[12] == 0xd3);
// Everything after the call must be nop padding (0x90 or 0x0f 0x1f forms,
// possibly with 0x66/0x2e prefix bytes).
int i = INITIAL_CALL_SIZE;
while (*(pp_addr + i) == 0x66 || *(pp_addr + i) == 0x0f || *(pp_addr + i) == 0x2e)
i++;
assert(*(pp_addr + i) == 0x90 || *(pp_addr + i) == 0x1f);
#endif
// Use memcpy rather than "*(void**)&pp_addr[2]": pp_addr + 2 is not suitably
// aligned for a void*, and the type-punned dereference is undefined behavior.
void* call_addr;
memcpy(&call_addr, pp_addr + 2, sizeof(call_addr));
return call_addr;
}
// Carves an LLVM patchpoint region [start_addr, end_addr) into a nop'd fast
// path followed by a slowpath that calls slowpath_func, spilling the given
// live-out registers into the RBP-relative scratch area (scratch_offset,
// scratch_size bytes) around the call and restoring them afterwards.
// Returns (start_of_slowpath, return_address_of_the_slowpath_call).
std::pair<uint8_t*, uint8_t*> initializePatchpoint3(void* slowpath_func, uint8_t* start_addr, uint8_t* end_addr,
int scratch_offset, int scratch_size,
const std::unordered_set<int>& live_outs) {
assert(start_addr < end_addr);
// Upper-bound estimate of the slowpath's encoded size: the call sequence
// itself, plus the spill + restore instructions counted per register below.
int est_slowpath_size = INITIAL_CALL_SIZE;
std::vector<assembler::GenericRegister> regs_to_spill;
for (int dwarf_regnum : live_outs) {
assembler::GenericRegister ru = assembler::GenericRegister::fromDwarf(dwarf_regnum);
// emitCall below uses R11 as its scratch register, so R11 must not be live.
assert(!(ru.type == assembler::GenericRegister::GP && ru.gp == assembler::R11) && "We assume R11 is free!");
// RSP and callee-save registers are preserved across the call; skip them.
if (ru.type == assembler::GenericRegister::GP) {
if (ru.gp == assembler::RSP || ru.gp.isCalleeSave())
continue;
}
// Location(ru).dump();
regs_to_spill.push_back(ru);
if (ru.type == assembler::GenericRegister::GP)
est_slowpath_size += 14; // 7 bytes for a mov with 4-byte displacement, needed twice
else if (ru.type == assembler::GenericRegister::XMM)
est_slowpath_size += 18; // (up to) 9 bytes for a movsd with 4-byte displacement, needed twice
else
abort();
}
if (VERBOSITY())
printf("Have to spill %ld regs around the slowpath\n", regs_to_spill.size());
// TODO: some of these registers could already have been pushed via the frame saving code
// Place the slowpath at the very end of the region so the fast path gets the
// largest possible contiguous run of bytes.
uint8_t* slowpath_start = end_addr - est_slowpath_size;
ASSERT(slowpath_start >= start_addr, "Used more slowpath space than expected; change ICSetupInfo::totalSize()?");
// Fast path: nothing but nops until inline-cache code is rewritten into it.
assembler::Assembler _a(start_addr, slowpath_start - start_addr);
//_a.trap();
_a.fillWithNops();
// Slowpath: spill live-outs to scratch, call slowpath_func, restore, nop-pad.
assembler::Assembler assem(slowpath_start, end_addr - slowpath_start);
// if (regs_to_spill.size())
// assem.trap();
assem.emitBatchPush(scratch_offset, scratch_size, regs_to_spill);
uint8_t* rtn = assem.emitCall(slowpath_func, assembler::R11);
assem.emitBatchPop(scratch_offset, scratch_size, regs_to_spill);
assem.fillWithNops();
assert(!assem.hasFailed());
return std::make_pair(slowpath_start, rtn);
}
}
......@@ -312,6 +312,13 @@ public:
friend class RewriterVar;
friend class RewriterVarUsage;
};
void* extractSlowpathFunc(uint8_t* pp_addr);
// returns (start_of_slowpath, return_addr_of_slowpath_call)
std::pair<uint8_t*, uint8_t*> initializePatchpoint3(void* slowpath_func, uint8_t* start_addr, uint8_t* end_addr,
int scratch_offset, int scratch_size,
const std::unordered_set<int>& live_outs);
}
#endif
......@@ -20,7 +20,6 @@ namespace pyston {
// Stack-layout information for the function containing a patchpoint, used to
// locate the RBP-relative scratch region that IC code spills registers into.
struct StackInfo {
// Stack size of the enclosing frame.
int stack_size;
// Whether a scratch region was reserved for this patchpoint.
bool has_scratch;
// Size of the scratch region, in bytes.
int scratch_bytes;
// Offset of the scratch region, relative to RBP.
int scratch_rbp_offset;
};
......@@ -80,6 +79,8 @@ struct XMMRegister {
bool operator==(const XMMRegister& rhs) const { return regnum == rhs.regnum; }
bool operator!=(const XMMRegister& rhs) const { return !(*this == rhs); }
void dump() const { printf("XMM%d\n", regnum); }
};
const XMMRegister XMM0(0);
......@@ -129,6 +130,15 @@ struct GenericRegister {
constexpr GenericRegister(const Register r) : gp(r), type(GP) {}
constexpr GenericRegister(const XMMRegister r) : xmm(r), type(XMM) {}
// Print a description of the held register, dispatching on the register kind;
// aborts if the kind is unrecognized.
void dump() const {
switch (type) {
case GP:
gp.dump();
break;
case XMM:
xmm.dump();
break;
default:
abort();
}
}
static GenericRegister fromDwarf(int dwarf_regnum);
};
......
......@@ -32,7 +32,7 @@ int ICSetupInfo::totalSize() const {
int call_size = CALL_ONLY_SIZE;
if (getCallingConvention() != llvm::CallingConv::C) {
// 14 bytes per reg that needs to be spilled
call_size += 14 * 4;
call_size += 14 * 6;
}
return num_slots * slot_size + call_size;
}
......@@ -120,15 +120,49 @@ void processStackmap(CompiledFunction* cf, StackMap* stackmap) {
uint8_t* start_addr = (uint8_t*)pp->parentFunction()->code + r->offset;
uint8_t* end_addr = start_addr + pp->patchpointSize();
// TODO shouldn't have to do it this way
void* slowpath_func = extractSlowpathFunc(start_addr);
//*start_addr = 0xcc;
// start_addr++;
const ICSetupInfo* ic = pp->getICInfo();
if (ic == NULL)
if (ic == NULL) {
// We have to be using the C calling convention here, so we don't need to check the live outs
// or save them across the call.
initializePatchpoint3(slowpath_func, start_addr, end_addr, scratch_rbp_offset, scratch_size,
std::unordered_set<int>());
continue;
}
std::unordered_set<int> live_outs(extractLiveOuts(r, ic->getCallingConvention()));
registerCompiledPatchpoint(cf, start_addr, ic,
StackInfo({ stack_size, true, scratch_size, scratch_rbp_offset }),
std::move(live_outs));
if (ic->hasReturnValue()) {
assert(ic->getCallingConvention() == llvm::CallingConv::C
|| ic->getCallingConvention() == llvm::CallingConv::PreserveAll);
static const int DWARF_RAX = 0;
// It's possible that the return value doesn't get used, in which case
// we can avoid copying back into RAX at the end
if (live_outs.count(DWARF_RAX)) {
live_outs.erase(DWARF_RAX);
}
}
auto _p
= initializePatchpoint3(slowpath_func, start_addr, end_addr, scratch_rbp_offset, scratch_size, live_outs);
uint8_t* slowpath_start = _p.first;
uint8_t* slowpath_rtn_addr = _p.second;
ASSERT(slowpath_start - start_addr >= ic->num_slots * ic->slot_size,
"Used more slowpath space than expected; change ICSetupInfo::totalSize()?");
assert(pp->numICStackmapArgs() == 0); // don't do anything with these for now
registerCompiledPatchpoint(cf, start_addr, slowpath_start, end_addr, slowpath_rtn_addr, ic,
StackInfo({ stack_size, scratch_size, scratch_rbp_offset }), std::move(live_outs));
}
for (PatchpointInfo* pp : new_patchpoints) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment