Commit d713267d authored by Marius Wachtler's avatar Marius Wachtler

string interning: make it slightly more space efficient

We used to store the string content twice.
This implementation makes use of the DenseSet::find_as functionality, which allows searching with a key of a different type.
I was not sure whether the special DenseMapInfo I had to create would cause any problems, so I chose to put it in a new source file
so that it does not get picked up anywhere else.
parent ce0a5ddd
......@@ -38,7 +38,7 @@ add_library(PYSTON_OBJECTS OBJECT ${OPTIONAL_SRCS}
codegen/baseline_jit.cpp
codegen/codegen.cpp
codegen/compvars.cpp
codegen/cpython_ast.cpp
codegen/cpython_ast.cpp
codegen/entry.cpp
codegen/gcbuilder.cpp
codegen/irgen.cpp
......@@ -102,6 +102,7 @@ add_library(PYSTON_OBJECTS OBJECT ${OPTIONAL_SRCS}
runtime/objmodel.cpp
runtime/set.cpp
runtime/str.cpp
runtime/str_interning.cpp
runtime/super.cpp
runtime/tuple.cpp
runtime/types.cpp
......
......@@ -369,62 +369,6 @@ template <ExceptionStyle S> Box* strAdd(BoxedString* lhs, Box* _rhs) noexcept(S
return new (lhs->size() + rhs->size()) BoxedString(lhs->s(), rhs->s());
}
static llvm::StringMap<BoxedString*> interned_strings;
static StatCounter num_interned_strings("num_interned_string");
// C API entry point: interns the NUL-terminated string `s` and hands back the
// interned string object. `s` must not be null (checked with RELEASE_ASSERT).
extern "C" PyObject* PyString_InternFromString(const char* s) noexcept {
    RELEASE_ASSERT(s, "");
    BoxedString* interned = internStringImmortal(s);
    return interned;
}
// Returns the interned BoxedString whose contents equal `s`, creating and
// registering it on first use. The returned object has had its refcount
// incremented for the caller.
BoxedString* internStringImmortal(llvm::StringRef s) noexcept {
    // A null slot means no string with these contents has been interned yet.
    BoxedString*& slot = interned_strings[s];
    if (slot == nullptr) {
        num_interned_strings.log();
        slot = boxString(s);
        // CPython would hand back a mortal string; our interned strings are immortal for now.
        slot->interned_state = SSTATE_INTERNED_IMMORTAL;
    }

    Py_INCREF(slot);
    return slot;
}
// C API entry point: interns the string referenced by *p in place.
// - Non-string (or null) input is a fatal error.
// - String subclasses and already-interned strings are left untouched.
// - If an equal string is already interned, *p is redirected to it (the old
//   object is decref'd, the interned one incref'd).
// - Otherwise the string itself is registered as the interned instance.
extern "C" void PyString_InternInPlace(PyObject** p) noexcept {
    BoxedString* str = (BoxedString*)*p;
    if (str == NULL || !PyString_Check(str))
        Py_FatalError("PyString_InternInPlace: strings only please!");

    // For a string subclass we cannot know what putting it into the interned
    // table might do, and an already-interned string needs no further work.
    if (!PyString_CheckExact(str) || PyString_CHECK_INTERNED(str))
        return;

    BoxedString*& slot = interned_strings[str->s()];
    if (!slot) {
        // TODO: do CPython's refcounting here
        num_interned_strings.log();
        slot = str;
        Py_INCREF(str);
        // CPython would make this mortal; our interned strings are immortal for now.
        str->interned_state = SSTATE_INTERNED_IMMORTAL;
        return;
    }

    // An equal string is already interned: swap the caller's reference over to it.
    Py_INCREF(slot);
    Py_DECREF(*p);
    *p = slot;
}
extern "C" void _Py_ReleaseInternedStrings() noexcept {
// printf("%ld interned strings\n", interned_strings.size());
for (const auto& p : interned_strings) {
Py_DECREF(p.second);
}
interned_strings.clear();
}
/* Format codes
* F_LJUST '-'
* F_SIGN '+'
......@@ -1613,34 +1557,45 @@ extern "C" size_t unicodeHashUnboxed(PyUnicodeObject* self) {
return x;
}
extern "C" size_t strHashUnboxed(BoxedString* self) {
assert(PyString_Check(self));
size_t strHashUnboxedStrRef(llvm::StringRef self) {
const char* p;
long x;
#ifdef Py_DEBUG
assert(_Py_HashSecret_Initialized);
#endif
if (self->hash != -1)
return self->hash;
long len = Py_SIZE(self);
long len = self.size();
/*
We make the hash of the empty string be 0, rather than using
(prefix ^ suffix), since this slightly obfuscates the hash secret
*/
if (len == 0) {
self->hash = 0;
return 0;
}
p = self->s().data();
p = self.data();
x = _Py_HashSecret.prefix;
x ^= *p << 7;
while (--len >= 0)
x = (1000003 * x) ^ *p++;
x ^= Py_SIZE(self);
x ^= self.size();
x ^= _Py_HashSecret.suffix;
if (x == -1)
x = -2;
return x;
}
// Returns the hash of the boxed string `self`, caching it in self->hash.
//
// self->hash == -1 is treated as "not computed yet": in that case the hash is
// computed from the string contents via strHashUnboxedStrRef(), stored in
// self->hash and returned. Any other cached value is returned directly.
extern "C" size_t strHashUnboxed(BoxedString* self) {
    assert(PyString_Check(self));

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif

    // Fast path: hash was already computed and cached on the object.
    if (self->hash != -1)
        return self->hash;

    long x = strHashUnboxedStrRef(self->s());
    self->hash = x;
    return x;
}
......
// Copyright (c) 2014-2016 Dropbox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "Python.h"
#include "runtime/types.h"
// DenseMapInfo specialization for BoxedString* keys that hashes and compares by
// string *contents* rather than by pointer identity. It additionally offers
// llvm::StringRef overloads of getHashValue/isEqual so that a table keyed by
// BoxedString* can be queried with a StringRef (e.g. through find_as) without
// first creating a BoxedString.
namespace llvm {
template <> struct DenseMapInfo<pyston::BoxedString*> {
    // Sentinel pointer marking a never-used bucket.
    static inline pyston::BoxedString* getEmptyKey() {
        const uintptr_t bits = static_cast<uintptr_t>(-1)
                               << PointerLikeTypeTraits<pyston::BoxedString*>::NumLowBitsAvailable;
        return reinterpret_cast<pyston::BoxedString*>(bits);
    }

    // Sentinel pointer marking a bucket whose entry was erased.
    static inline pyston::BoxedString* getTombstoneKey() {
        const uintptr_t bits = static_cast<uintptr_t>(-2)
                               << PointerLikeTypeTraits<pyston::BoxedString*>::NumLowBitsAvailable;
        return reinterpret_cast<pyston::BoxedString*>(bits);
    }

    // Both overloads must agree for equal contents so that StringRef lookups
    // land in the same bucket as the stored BoxedString*.
    static unsigned getHashValue(pyston::BoxedString* s) { return pyston::strHashUnboxed(s); }
    static unsigned getHashValue(llvm::StringRef s) { return pyston::strHashUnboxedStrRef(s); }

    // True for the two sentinel keys, which must never be dereferenced.
    static bool isSpecial(pyston::BoxedString* v) {
        if (v == getEmptyKey())
            return true;
        return v == getTombstoneKey();
    }

    // Sentinels are compared by pointer value; real strings by contents.
    static bool isEqual(pyston::BoxedString* lhs, pyston::BoxedString* rhs) {
        const bool any_sentinel = isSpecial(lhs) || isSpecial(rhs);
        return any_sentinel ? lhs == rhs : lhs->s() == rhs->s();
    }

    // A StringRef lookup key never matches a sentinel bucket.
    static bool isEqual(llvm::StringRef lhs, pyston::BoxedString* rhs) {
        return !isSpecial(rhs) && lhs == rhs->s();
    }
};
}
namespace pyston {
static llvm::DenseSet<BoxedString*> interned_strings;
static StatCounter num_interned_strings("num_interned_string");
// C API entry point: interns the NUL-terminated string `s` and hands back the
// interned string object. `s` must not be null (checked with RELEASE_ASSERT).
extern "C" PyObject* PyString_InternFromString(const char* s) noexcept {
    RELEASE_ASSERT(s, "");
    BoxedString* interned = internStringImmortal(s);
    return interned;
}
// Returns the interned BoxedString whose contents equal `s`, creating and
// registering it on first use. The returned object has had its refcount
// incremented for the caller.
BoxedString* internStringImmortal(llvm::StringRef s) noexcept {
    // Look the contents up directly by StringRef; no BoxedString is created for the query.
    auto existing = interned_strings.find_as(s);
    const bool already_interned = existing != interned_strings.end();
    if (already_interned)
        return incref(*existing);

    num_interned_strings.log();

    BoxedString* fresh = boxString(s);
    // CPython would hand back a mortal string; our interned strings are immortal for now.
    fresh->interned_state = SSTATE_INTERNED_IMMORTAL;
    interned_strings.insert(fresh);

    Py_INCREF(fresh);
    return fresh;
}
// C API entry point: interns the string referenced by *p in place.
// - Non-string (or null) input is a fatal error.
// - String subclasses and already-interned strings are left untouched.
// - If an equal string is already interned, *p is redirected to it (the old
//   object is decref'd, the interned one incref'd).
// - Otherwise the string itself is registered as the interned instance.
extern "C" void PyString_InternInPlace(PyObject** p) noexcept {
    BoxedString* str = (BoxedString*)*p;
    if (str == NULL || !PyString_Check(str))
        Py_FatalError("PyString_InternInPlace: strings only please!");

    // For a string subclass we cannot know what putting it into the interned
    // table might do, and an already-interned string needs no further work.
    if (!PyString_CheckExact(str) || PyString_CHECK_INTERNED(str))
        return;

    auto found = interned_strings.find(str);
    if (found == interned_strings.end()) {
        // TODO: do CPython's refcounting here
        num_interned_strings.log();
        interned_strings.insert(str);
        Py_INCREF(str);
        // CPython would make this mortal; our interned strings are immortal for now.
        str->interned_state = SSTATE_INTERNED_IMMORTAL;
        return;
    }

    // An equal string is already interned: swap the caller's reference over to it.
    BoxedString* canonical = *found;
    Py_INCREF(canonical);
    Py_DECREF(*p);
    *p = canonical;
}
extern "C" void _Py_ReleaseInternedStrings() noexcept {
// printf("%ld interned strings\n", interned_strings.size());
for (const auto& p : interned_strings) {
Py_DECREF(p);
}
interned_strings.clear();
}
}
......@@ -672,6 +672,7 @@ static_assert(offsetof(BoxedString, hash) == offsetof(PyStringObject, ob_shash),
static_assert(offsetof(BoxedString, interned_state) == offsetof(PyStringObject, ob_sstate), "");
static_assert(offsetof(BoxedString, s_data) == offsetof(PyStringObject, ob_sval), "");
size_t strHashUnboxedStrRef(llvm::StringRef str);
extern "C" size_t strHashUnboxed(BoxedString* self);
extern "C" int64_t hashUnboxed(Box* obj);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment