Commit 76c42191 authored by Kevin Modzelewski's avatar Kevin Modzelewski

Merge pull request #822 from kmod/perf4

Improve dictionary performance
parents cd5a4d02 8813c42e
......@@ -94,7 +94,7 @@ struct _dictobject {
#endif
typedef struct {
PyObject_HEAD;
char _filler[SIZEOF_UNORDEREDMAP];
char _filler[24];
} PyDictObject;
// Pyston change: these are no longer static objects:
......
# Test the speed of using strings as dictionary keys.
# We internally optimize many things to not use dictionaries,
# but there are some times that we can't.
def f():
d = {'a':1}
for i in xrange(3000000):
d['a']
d['a']
d['a']
d['a']
d['a']
d['a']
d['a']
d['a']
d['a']
d['a']
f()
......@@ -712,22 +712,7 @@ void Assembler::cmp(Register reg1, Register reg2) {
}
void Assembler::cmp(Register reg, Immediate imm) {
int64_t val = imm.val;
assert((-1L << 31) <= val && val < (1L << 31) - 1);
int reg_idx = reg.regnum;
int rex = REX_W;
if (reg_idx >= 8) {
rex |= REX_B;
reg_idx -= 8;
}
assert(0 <= reg_idx && reg_idx < 8);
emitRex(rex);
emitByte(0x81);
emitModRM(0b11, 7, reg_idx);
emitInt(val, 4);
emitArith(imm, reg, OPCODE_CMP);
}
void Assembler::cmp(Indirect mem, Immediate imm) {
......
......@@ -73,7 +73,7 @@ private:
uint8_t* addr;
bool failed; // if the rewrite failed at the assembly-generation level for some reason
static const uint8_t OPCODE_ADD = 0b000, OPCODE_SUB = 0b101;
static const uint8_t OPCODE_ADD = 0b000, OPCODE_SUB = 0b101, OPCODE_CMP = 0b111;
static const uint8_t REX_B = 1, REX_X = 2, REX_R = 4, REX_W = 8;
#ifndef NDEBUG
......
......@@ -19,6 +19,7 @@
#include "codegen/unwinding.h"
#include "core/common.h"
#include "core/stats.h"
#include "runtime/types.h"
namespace pyston {
......@@ -1522,6 +1523,32 @@ void Rewriter::_allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_ele
assertConsistent();
}
void Rewriter::checkAndThrowCAPIException(RewriterVar* r, int64_t exc_val) {
STAT_TIMER(t0, "us_timer_rewriter", 10);
addAction([=]() { this->_checkAndThrowCAPIException(r, exc_val); }, { r }, ActionType::MUTATION);
}
void Rewriter::_checkAndThrowCAPIException(RewriterVar* r, int64_t exc_val) {
assembler->comment("_checkAndThrowCAPIException");
assembler::Register var_reg = r->getInReg();
if (exc_val == 0)
assembler->test(var_reg, var_reg);
else
assembler->cmp(var_reg, assembler::Immediate(exc_val));
{
assembler::ForwardJump jnz(*assembler, assembler::COND_NOT_ZERO);
assembler->mov(assembler::Immediate((void*)throwCAPIException), assembler::R11);
assembler->callq(assembler::R11);
}
r->bumpUse();
assertConsistent();
}
assembler::Indirect Rewriter::indirectFor(Location l) {
assert(l.type == Location::Scratch || l.type == Location::Stack);
......
......@@ -430,6 +430,7 @@ protected:
int _allocate(RewriterVar* result, int n);
void _allocateAndCopy(RewriterVar* result, RewriterVar* array, int n);
void _allocateAndCopyPlus1(RewriterVar* result, RewriterVar* first_elem, RewriterVar* rest, int n_rest);
void _checkAndThrowCAPIException(RewriterVar* r, int64_t exc_val);
// The public versions of these are in RewriterVar
void _addGuard(RewriterVar* var, RewriterVar* val_constant);
......@@ -514,6 +515,9 @@ public:
RewriterVar* allocateAndCopy(RewriterVar* array, int n);
RewriterVar* allocateAndCopyPlus1(RewriterVar* first_elem, RewriterVar* rest, int n_rest);
// This emits `if (r == exc_val) throwCAPIException()`
void checkAndThrowCAPIException(RewriterVar* r, int64_t exc_val = 0);
void abort();
void commit();
void commitReturning(RewriterVar* rtn);
......
......@@ -31,6 +31,29 @@ extern "C" void conservativeGCHandler(GCVisitor* v, Box* b) noexcept {
v->visitPotentialRange((void* const*)b, (void* const*)((char*)b + b->cls->tp_basicsize));
}
extern "C" void conservativeAndBasesGCHandler(GCVisitor* v, Box* b) noexcept {
// TODO: this function is expensive. We should try to make sure it doesn't get used
// that often, or to come up with a better approach.
// Call all the custom gc handlers defined anywhere in the hierarchy:
assert(PyTuple_CheckExact(b->cls->tp_mro));
for (auto c : *static_cast<BoxedTuple*>(b->cls->tp_mro)) {
if (!PyType_Check(c))
continue;
auto gc_visit = static_cast<BoxedClass*>(c)->gc_visit;
// Skip conservativeGCHandler since it's slow, and skip conservativeAndBasesGCHandler since
// it would cause an infinite loop:
if (gc_visit == conservativeGCHandler || gc_visit == conservativeAndBasesGCHandler)
continue;
gc_visit(v, b);
}
conservativeGCHandler(v, b);
}
static int check_num_args(PyObject* ob, int n) noexcept {
if (!PyTuple_CheckExact(ob)) {
PyErr_SetString(PyExc_SystemError, "PyArg_UnpackTuple() argument list is not a tuple");
......@@ -3224,7 +3247,7 @@ static Box* tppProxyToTpCall(Box* self, CallRewriteArgs* rewrite_args, ArgPassSp
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, (void*)self->cls->tp_call, rewrite_args->obj,
rewrite_args->arg1, rewrite_args->arg2);
if (S == CXX)
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
......@@ -3323,7 +3346,16 @@ extern "C" int PyType_Ready(PyTypeObject* cls) noexcept {
if (cls->tp_alloc == &PystonType_GenericAlloc)
cls->tp_alloc = &PyType_GenericAlloc;
cls->gc_visit = &conservativeGCHandler;
// If an extension class visits from a Pyston class that does custom visiting,
// the base class needs to call the parent's visit function in case it visits
// non-inline data. There's not an easy way to put in a function pointer here
// that defers to a specific class's gc_visit, even if it's a base class, since
// the gc_visit could get inherited by subclasses. For now just use an expensive
// function, conservativeAndBasesGCHandler
if (base->gc_visit != object_cls->gc_visit && base->gc_visit != &conservativeGCHandler)
cls->gc_visit = &conservativeAndBasesGCHandler;
else
cls->gc_visit = &conservativeGCHandler;
cls->is_user_defined = true;
......
......@@ -1508,7 +1508,7 @@ Value ASTInterpreter::visit_name(AST_Name* node) {
assert(getSymVRegMap()[node->id] == node->vreg);
Box* val = vregs[node->vreg];
if (val) {
assert(gc::isValidGCObject(val));
ASSERT(gc::isValidGCObject(val), "%s is %p", node->id.c_str(), val);
v.o = val;
return v;
}
......
......@@ -271,18 +271,15 @@ bool isValidGCObject(void* p) {
GCAllocation* al = global_heap.getAllocationFromInteriorPointer(p);
if (!al)
return false;
return al->user_data == p && (al->kind_id == GCKind::CONSERVATIVE_PYTHON || al->kind_id == GCKind::PYTHON);
return al->user_data == p && al->kind_id == GCKind::PYTHON;
}
void registerPythonObject(Box* b) {
assert(isValidGCMemory(b));
auto al = GCAllocation::fromUserData(b);
if (al->kind_id == GCKind::CONSERVATIVE) {
al->kind_id = GCKind::CONSERVATIVE_PYTHON;
} else {
assert(al->kind_id == GCKind::PYTHON);
}
assert(al->kind_id == GCKind::CONSERVATIVE || al->kind_id == GCKind::PYTHON);
al->kind_id = GCKind::PYTHON;
assert(b->cls);
if (hasOrderedFinalizer(b->cls)) {
......@@ -386,7 +383,7 @@ static __attribute__((always_inline)) void visitByGCKind(void* p, GCVisitor& vis
GCKind kind_id = al->kind_id;
if (kind_id == GCKind::UNTRACKED) {
// Nothing to do here.
} else if (kind_id == GCKind::CONSERVATIVE || kind_id == GCKind::CONSERVATIVE_PYTHON) {
} else if (kind_id == GCKind::CONSERVATIVE) {
uint32_t bytes = al->kind_data;
visitor.visitPotentialRange((void**)p, (void**)((char*)p + bytes));
} else if (kind_id == GCKind::PRECISE) {
......@@ -521,7 +518,7 @@ static void graphTraversalMarking(TraceStack& stack, GCVisitor& visitor) {
GCAllocation* al = GCAllocation::fromUserData(p);
#if TRACE_GC_MARKING
if (al->kind_id == GCKind::PYTHON || al->kind_id == GCKind::CONSERVATIVE_PYTHON)
if (al->kind_id == GCKind::PYTHON)
GC_TRACE_LOG("Looking at %s object %p\n", static_cast<Box*>(p)->cls->tp_name, p);
else
GC_TRACE_LOG("Looking at non-python allocation %p\n", p);
......@@ -685,11 +682,6 @@ static void markPhase() {
// pending finalization list.
orderFinalizers();
#if TRACE_GC_MARKING
fclose(trace_fp);
trace_fp = NULL;
#endif
#ifndef NVALGRIND
VALGRIND_ENABLE_ERROR_REPORTING;
#endif
......
......@@ -59,10 +59,6 @@ enum class GCKind : uint8_t {
// because it contains pointers into our heap or our heap points to these
// objects. These objects inherit from GCAllocatedRuntime.
RUNTIME = 5,
// A Python object where we don't have a way to visit precisely with a GC
// handler function. These are usually Python objects allocated in C extensions.
CONSERVATIVE_PYTHON = 6,
};
extern "C" void* gc_alloc(size_t nbytes, GCKind kind);
......
......@@ -170,7 +170,7 @@ __attribute__((always_inline)) bool _doFree(GCAllocation* al, std::vector<Box*>*
VALGRIND_ENABLE_ERROR_REPORTING;
#endif
if (alloc_kind == GCKind::PYTHON || alloc_kind == GCKind::CONSERVATIVE_PYTHON) {
if (alloc_kind == GCKind::PYTHON) {
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
#endif
......@@ -186,8 +186,7 @@ __attribute__((always_inline)) bool _doFree(GCAllocation* al, std::vector<Box*>*
return false;
}
ASSERT(!hasOrderedFinalizer(b->cls) || hasFinalized(al) || alloc_kind == GCKind::CONSERVATIVE_PYTHON, "%s",
getTypeName(b));
ASSERT(!hasOrderedFinalizer(b->cls) || hasFinalized(al), "%s", getTypeName(b));
if (b->cls->tp_dealloc != dealloc_null && b->cls->has_safe_tp_dealloc) {
gc_safe_destructors.log();
......@@ -273,17 +272,6 @@ void addStatistic(HeapStatistics* stats, GCAllocation* al, int nbytes) {
} else if (al->kind_id == GCKind::CONSERVATIVE) {
stats->conservative.nallocs++;
stats->conservative.nbytes += nbytes;
} else if (al->kind_id == GCKind::CONSERVATIVE_PYTHON) {
stats->conservative_python.nallocs++;
stats->conservative_python.nbytes += nbytes;
if (stats->collect_cls_stats) {
Box* b = (Box*)al->user_data;
auto& t = stats->by_cls[b->cls];
t.nallocs++;
t.nbytes += nbytes;
}
} else if (al->kind_id == GCKind::UNTRACKED) {
stats->untracked.nallocs++;
stats->untracked.nbytes += nbytes;
......@@ -509,7 +497,7 @@ SmallArena::Block** SmallArena::_freeChain(Block** head, std::vector<Box*>& weak
clearMark(al);
} else {
if (_doFree(al, &weakly_referenced)) {
GC_TRACE_LOG("freeing %p\n", al->user_data);
// GC_TRACE_LOG("freeing %p\n", al->user_data);
b->isfree.set(atom_idx);
#ifndef NDEBUG
memset(al->user_data, 0xbb, b->size - sizeof(GCAllocation));
......
......@@ -797,6 +797,10 @@ void throwCAPIException() {
}
void checkAndThrowCAPIException() {
// Log these since these are expensive and usually avoidable:
static StatCounter num_checkAndThrowCAPIException("num_checkAndThrowCAPIException");
num_checkAndThrowCAPIException.log();
Box* _type = cur_thread_state.curexc_type;
if (!_type)
assert(!cur_thread_state.curexc_value);
......@@ -1572,7 +1576,7 @@ Box* BoxedCApiFunction::tppCall(Box* _self, CallRewriteArgs* rewrite_args, ArgPa
}
if (rewrite_args) {
rewrite_args->rewriter->call(false, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
......
......@@ -387,7 +387,7 @@ Box* BoxedMethodDescriptor::tppCall(Box* _self, CallRewriteArgs* rewrite_args, A
throwCAPIException();
if (rewrite_args) {
rewrite_args->rewriter->call(false, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
......@@ -565,7 +565,7 @@ Box* BoxedWrapperObject::tppCall(Box* _self, CallRewriteArgs* rewrite_args, ArgP
rewrite_args->out_rtn = rewriter->call(
true, (void*)wk, r_obj, rewrite_args->arg1,
rewriter->loadConst((intptr_t)self->descr->wrapped, Location::forArg(2)), rewrite_args->arg2);
rewriter->call(false, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
} else if (flags == PyWrapperFlag_PYSTON || flags == 0) {
......@@ -577,7 +577,7 @@ Box* BoxedWrapperObject::tppCall(Box* _self, CallRewriteArgs* rewrite_args, ArgP
rewrite_args->out_rtn
= rewriter->call(true, (void*)wrapper, r_obj, rewrite_args->arg1,
rewriter->loadConst((intptr_t)self->descr->wrapped, Location::forArg(2)));
rewriter->call(false, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
} else {
......
......@@ -479,7 +479,7 @@ Box* dictSetdefault(BoxedDict* self, Box* k, Box* v) {
if (it != self->d.end())
return it->second;
self->d.insert(it, std::make_pair(k, v));
self->d.insert(std::make_pair(k, v));
return v;
}
......@@ -689,14 +689,10 @@ void BoxedDict::gcHandler(GCVisitor* v, Box* b) {
BoxedDict* d = (BoxedDict*)b;
// This feels like a cludge, but we need to find anything that
// the unordered_map might have allocated.
// Another way to handle this would be to rt_alloc the unordered_map
// as well, though that incurs extra memory dereferences which would
// be nice to avoid.
void** start = (void**)&d->d;
void** end = start + (sizeof(d->d) / 8);
v->visitPotentialRange(start, end);
for (auto p : d->d) {
v->visit(p.first);
v->visit(p.second);
}
}
void BoxedDictIterator::gcHandler(GCVisitor* v, Box* b) {
......@@ -708,7 +704,7 @@ void BoxedDictIterator::gcHandler(GCVisitor* v, Box* b) {
}
void BoxedDictView::gcHandler(GCVisitor* v, Box* b) {
assert(b->cls == dict_items_cls);
assert(b->cls == dict_items_cls || b->cls == dict_values_cls || b->cls == dict_keys_cls);
Box::gcHandler(v, b);
BoxedDictView* view = static_cast<BoxedDictView*>(b);
......
......@@ -1375,8 +1375,7 @@ Box* dataDescriptorInstanceSpecialCases(GetattrRewriteArgs* rewrite_args, BoxedS
/* has_side_effects */ true, (void*)getset_descr->get, rewrite_args->obj, r_closure);
if (descr->cls == capi_getset_cls)
// TODO I think we are supposed to check the return value?
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
......@@ -1434,7 +1433,7 @@ Box* getattrInternalEx(Box* obj, BoxedString* attr, GetattrRewriteArgs* rewrite_
auto r_rtn = rewrite_args->rewriter->call(true, (void*)obj->cls->tp_getattro, rewrite_args->obj, r_box);
if (S == CXX)
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(r_rtn);
else
rewrite_args->rewriter->call(false, (void*)ensureValidCapiReturn, r_rtn);
......@@ -2057,12 +2056,11 @@ bool dataDescriptorSetSpecialCases(Box* obj, Box* val, Box* descr, SetattrRewrit
args.push_back(r_obj);
args.push_back(r_val);
args.push_back(r_closure);
rewrite_args->rewriter->call(
RewriterVar* r_rtn = rewrite_args->rewriter->call(
/* has_side_effects */ true, (void*)getset_descr->set, args);
if (descr->cls == capi_getset_cls)
// TODO I think we are supposed to check the return value?
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(r_rtn, -1);
rewrite_args->out_success = true;
}
......@@ -2579,7 +2577,11 @@ template <ExceptionStyle S> BoxedInt* lenInternal(Box* obj, LenRewriteArgs* rewr
// support calling a RewriterVar (can only call fixed function addresses).
r_m->addAttrGuard(offsetof(PySequenceMethods, sq_length), (intptr_t)m->sq_length);
RewriterVar* r_n = rewrite_args->rewriter->call(true, (void*)m->sq_length, r_obj);
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
// Some CPython code seems to think that any negative return value means an exception,
// but the docs say -1. TODO it would be nice to just handle any negative value.
rewrite_args->rewriter->checkAndThrowCAPIException(r_n, -1);
RewriterVar* r_r = rewrite_args->rewriter->call(false, (void*)boxInt, r_n);
rewrite_args->out_success = true;
......@@ -3658,7 +3660,7 @@ Box* callCLFunc(CLFunction* f, CallRewriteArgs* rewrite_args, int num_output_arg
rewrite_args->out_rtn = rewrite_args->rewriter->call(true, func_ptr, arg_vec);
if (S == CXX && chosen_cf->exception_style == CAPI)
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(rewrite_args->out_rtn);
rewrite_args->out_success = true;
}
......@@ -3696,7 +3698,7 @@ Box* callCLFunc(CLFunction* f, CallRewriteArgs* rewrite_args, int num_output_arg
ASSERT(chosen_cf->spec->rtn_type->isFitBy(r->cls), "%s (%p) was supposed to return %s, but gave a %s",
g.func_addr_registry.getFuncNameAtAddress(chosen_cf->code, true, NULL).c_str(), chosen_cf->code,
chosen_cf->spec->rtn_type->debugName().c_str(), r->cls->tp_name);
assert(!PyErr_Occurred());
ASSERT(!PyErr_Occurred(), "%p", chosen_cf->code);
}
return r;
......@@ -4232,7 +4234,8 @@ Box* compareInternal(Box* lhs, Box* rhs, int op_type, CompareRewriteArgs* rewrit
// support calling a RewriterVar (can only call fixed function addresses).
r_sqm->addAttrGuard(offsetof(PySequenceMethods, sq_contains), (intptr_t)sqm->sq_contains);
RewriterVar* r_b = rewrite_args->rewriter->call(true, (void*)sqm->sq_contains, r_rhs, r_lhs);
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(r_b, -1);
// This could be inlined:
RewriterVar* r_r;
if (op_type == AST_TYPE::NotIn)
......@@ -4649,7 +4652,7 @@ Box* getitemInternal(Box* target, Box* slice, GetitemRewriteArgs* rewrite_args)
r_m->addAttrGuard(offsetof(PyMappingMethods, mp_subscript), (intptr_t)m->mp_subscript);
RewriterVar* r_rtn = rewrite_args->rewriter->call(true, (void*)m->mp_subscript, r_obj, r_slice);
if (S == CXX)
rewrite_args->rewriter->call(true, (void*)checkAndThrowCAPIException);
rewrite_args->rewriter->checkAndThrowCAPIException(r_rtn);
rewrite_args->out_success = true;
rewrite_args->out_rtn = r_rtn;
}
......
......@@ -128,6 +128,7 @@ int BoxedTuple::Resize(BoxedTuple** pv, size_t newsize) noexcept {
BoxedTuple* resized = new (newsize) BoxedTuple();
memmove(resized->elts, t->elts, sizeof(Box*) * t->size());
memset(resized->elts + t->size(), 0, sizeof(Box*) * (newsize - t->size()));
*pv = resized;
return 0;
......
......@@ -659,9 +659,22 @@ struct PyLt {
bool operator()(Box*, Box*) const;
};
struct PythonLevelEq {
static bool isEqual(Box* lhs, Box* rhs) {
if (lhs == rhs)
return true;
if (rhs == getEmptyKey() || rhs == getTombstoneKey())
return false;
return PyEq()(lhs, rhs);
}
static Box* getEmptyKey() { return (Box*)-1; }
static Box* getTombstoneKey() { return (Box*)-2; }
static unsigned getHashValue(Box* val) { return PyHasher()(val); }
};
class BoxedDict : public Box {
public:
typedef std::unordered_map<Box*, Box*, PyHasher, PyEq, StlCompatAllocator<std::pair<Box*, Box*>>> DictMap;
typedef llvm::DenseMap<Box*, Box*, PythonLevelEq> DictMap;
DictMap d;
......
......@@ -168,11 +168,8 @@ extern "C" void dumpEx(void* p, int levels) {
return;
}
if (al->kind_id == gc::GCKind::PYTHON || al->kind_id == gc::GCKind::CONSERVATIVE_PYTHON) {
if (al->kind_id == gc::GCKind::PYTHON)
printf("Python object (precisely scanned)\n");
else
printf("Python object (conservatively scanned)\n");
if (al->kind_id == gc::GCKind::PYTHON) {
printf("Python object\n");
Box* b = (Box*)p;
printf("Class: %s", getFullTypeName(b).c_str());
......@@ -224,7 +221,7 @@ extern "C" void dumpEx(void* p, int levels) {
if (isSubclass(b->cls, dict_cls)) {
BoxedDict* d = static_cast<BoxedDict*>(b);
printf("%ld elements\n", d->d.size());
printf("%d elements\n", d->d.size());
if (levels > 0) {
int i = 0;
......
......@@ -233,6 +233,5 @@ print sorted(l)
#recursive printing test
d = dict()
d['one'] = '1'
d['two'] = d
print d
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment