Commit a00d35fe authored by Marius Gedminas's avatar Marius Gedminas

Python 3: pickle bytestrings using SHORT_BINSTRING

This uses bytes_as_strings=True option introduced in zodbpickle 0.2 for
this purpose.

This way pickles produced on Python 3 are nearly the same as on Python 2.
There are some slight differences (Python 3 seems to perform more
memoizations which grows the size of some pickles by a couple of bytes),
but they're immaterial.

Now we can use zodbpickle's noload() on Python 3 to scan pickles for
persistent references.  We couldn't do that before, because Python 3
normally pickles byte strings as calls to codecs.encode(u'latin1-data',
'latin-1'), and noload() doesn't interpret the REDUCE opcode involved in
that representation.

Note that when you're pickling byte strings using bytes_as_strings=True,
you have to load them using encoding='bytes' (which breaks instances, so
cannot be used) or using errors='bytes' (which mean some bytestrings may
get unpickled as unicode instead).  I've tried hard to discover every
place that unpickles OIDs and added conversion to bytes in those places.

Applications dealing with binary data be prepared to handle bytestrings
that unexpectedly become unicode on unpickling.  That's the price of
Python 2 compatibility.
parent dc3ba252
...@@ -133,7 +133,7 @@ setup(name="ZODB", ...@@ -133,7 +133,7 @@ setup(name="ZODB",
'zc.lockfile', 'zc.lockfile',
'zdaemon >= 4.0.0a1', 'zdaemon >= 4.0.0a1',
'zope.interface', 'zope.interface',
] + (['zodbpickle'] if PY3 else []), ] + (['zodbpickle >= 0.2'] if PY3 else []),
zip_safe = False, zip_safe = False,
entry_points = """ entry_points = """
[console_scripts] [console_scripts]
......
...@@ -136,7 +136,7 @@ class PersistentReference(object): ...@@ -136,7 +136,7 @@ class PersistentReference(object):
# it. Fortunately, a class reference in a persistent # it. Fortunately, a class reference in a persistent
# reference is allowed to be a module+name tuple. # reference is allowed to be a module+name tuple.
self.data = self.oid, klass.args self.data = self.oid, klass.args
elif isinstance(data, bytes): elif isinstance(data, (bytes, str)):
self.oid = data self.oid = data
else: # a list else: # a list
reference_type = data[0] reference_type = data[0]
...@@ -162,6 +162,10 @@ class PersistentReference(object): ...@@ -162,6 +162,10 @@ class PersistentReference(object):
assert len(data) == 1, 'unknown reference format' assert len(data) == 1, 'unknown reference format'
self.oid = data[0] self.oid = data[0]
self.weak = True self.weak = True
if not isinstance(self.oid, (bytes, type(None))):
assert isinstance(self.oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
self.oid = self.oid.encode('ascii')
def __cmp__(self, other): def __cmp__(self, other):
if self is other or ( if self is other or (
......
...@@ -120,6 +120,11 @@ class ExportImport: ...@@ -120,6 +120,11 @@ class ExportImport:
if isinstance(ooid, tuple): if isinstance(ooid, tuple):
ooid, klass = ooid ooid, klass = ooid
if not isinstance(ooid, bytes):
assert isinstance(ooid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
ooid = ooid.encode('ascii')
if ooid in oids: if ooid in oids:
oid = oids[ooid] oid = oids[ooid]
else: else:
......
...@@ -33,6 +33,9 @@ checker = renormalizing.RENormalizing([ ...@@ -33,6 +33,9 @@ checker = renormalizing.RENormalizing([
# this changes all the offsets in iterator.test # this changes all the offsets in iterator.test
(re.compile('data.fs:207766'), 'data.fs:117080'), (re.compile('data.fs:207766'), 'data.fs:117080'),
(re.compile('data.fs:57991'), 'data.fs:35936'), (re.compile('data.fs:57991'), 'data.fs:35936'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile('data.fs:117679'), 'data.fs:117080'),
(re.compile('data.fs:36241'), 'data.fs:35936'),
]) ])
def pack_keep_old(): def pack_keep_old():
......
...@@ -20,11 +20,17 @@ try: ...@@ -20,11 +20,17 @@ try:
except ImportError: except ImportError:
# Python 3.x: can't use stdlib's pickle because # Python 3.x: can't use stdlib's pickle because
# http://bugs.python.org/issue6784 # http://bugs.python.org/issue6784
from zodbpickle.pickle import Pickler, dump, dumps import zodbpickle.pickle
from zodbpickle.pickle import Unpickler as _Unpickler, loads as _loads
from _compat_pickle import IMPORT_MAPPING, NAME_MAPPING from _compat_pickle import IMPORT_MAPPING, NAME_MAPPING
class Unpickler(_Unpickler): class Pickler(zodbpickle.pickle.Pickler):
def __init__(self, f, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
super(Pickler, self).__init__(f, protocol, bytes_as_strings=True)
class Unpickler(zodbpickle.pickle.Unpickler):
def __init__(self, f): def __init__(self, f):
super(Unpickler, self).__init__(f, encoding='ASCII', errors='bytes') super(Unpickler, self).__init__(f, encoding='ASCII', errors='bytes')
...@@ -38,11 +44,22 @@ except ImportError: ...@@ -38,11 +44,22 @@ except ImportError:
return super(Unpickler, self).find_class(modulename, name) return super(Unpickler, self).find_class(modulename, name)
return self.find_global(modulename, name) return self.find_global(modulename, name)
def dump(o, f, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
return zodbpickle.pickle.dump(o, f, protocol, bytes_as_strings=True)
def dumps(o, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
return zodbpickle.pickle.dumps(o, protocol, bytes_as_strings=True)
def loads(s): def loads(s):
return _loads(s, encoding='ASCII', errors='bytes') return zodbpickle.pickle.loads(s, encoding='ASCII', errors='bytes')
# XXX: overridable Unpickler.find_global as used in serialize.py?
# XXX: consistent spelling of inst_persistent_id/persistent_id? # XXX: consistent spelling of inst_persistent_id/persistent_id?
# e.g. StorageTestBase and probably elsewhere # e.g. StorageTestBase and probably elsewhere
......
...@@ -63,6 +63,11 @@ def prefix_minus_one(s): ...@@ -63,6 +63,11 @@ def prefix_minus_one(s):
num = str2num(s) num = str2num(s)
return num2str(num - 1) return num2str(num - 1)
def ensure_bytes(s):
# on Python 3 we might pickle bytes and unpickle unicode strings
return s.encode('ascii') if not isinstance(s, bytes) else s
class fsIndex(object): class fsIndex(object):
def __init__(self, data=None): def __init__(self, data=None):
...@@ -85,14 +90,19 @@ class fsIndex(object): ...@@ -85,14 +90,19 @@ class fsIndex(object):
def _setstate_0(self, state): def _setstate_0(self, state):
self.__dict__.clear() self.__dict__.clear()
self.__dict__.update(state) self.__dict__.update(state)
self._data = OOBTree([
(ensure_bytes(k), v)
for (k, v) in self._data.items()
])
def _setstate_1(self, state): def _setstate_1(self, state):
self._data = OOBTree([ self._data = OOBTree([
(k, fsBucket().fromString(v)) (ensure_bytes(k), fsBucket().fromString(ensure_bytes(v)))
for (k, v) in state['_data'] for (k, v) in state['_data']
]) ])
def __getitem__(self, key): def __getitem__(self, key):
assert isinstance(key, bytes)
return str2num(self._data[key[:6]][key[6:]]) return str2num(self._data[key[:6]][key[6:]])
def save(self, pos, fname): def save(self, pos, fname):
...@@ -110,6 +120,10 @@ class fsIndex(object): ...@@ -110,6 +120,10 @@ class fsIndex(object):
unpickler = Unpickler(f) unpickler = Unpickler(f)
pos = unpickler.load() pos = unpickler.load()
if not isinstance(pos, int): if not isinstance(pos, int):
# NB: this might contain OIDs that got unpickled
# into Unicode strings on Python 3; hope the caller
# will pipe the result to fsIndex().update() to normalize
# the keys
return pos # Old format return pos # Old format
index = class_() index = class_()
data = index._data data = index._data
...@@ -118,10 +132,11 @@ class fsIndex(object): ...@@ -118,10 +132,11 @@ class fsIndex(object):
if not v: if not v:
break break
k, v = v k, v = v
data[k] = fsBucket().fromString(v) data[ensure_bytes(k)] = fsBucket().fromString(ensure_bytes(v))
return dict(pos=pos, index=index) return dict(pos=pos, index=index)
def get(self, key, default=None): def get(self, key, default=None):
assert isinstance(key, bytes)
tree = self._data.get(key[:6], default) tree = self._data.get(key[:6], default)
if tree is default: if tree is default:
return default return default
...@@ -131,6 +146,7 @@ class fsIndex(object): ...@@ -131,6 +146,7 @@ class fsIndex(object):
return str2num(v) return str2num(v)
def __setitem__(self, key, value): def __setitem__(self, key, value):
assert isinstance(key, bytes)
value = num2str(value) value = num2str(value)
treekey = key[:6] treekey = key[:6]
tree = self._data.get(treekey) tree = self._data.get(treekey)
...@@ -140,6 +156,7 @@ class fsIndex(object): ...@@ -140,6 +156,7 @@ class fsIndex(object):
tree[key[6:]] = value tree[key[6:]] = value
def __delitem__(self, key): def __delitem__(self, key):
assert isinstance(key, bytes)
treekey = key[:6] treekey = key[:6]
tree = self._data.get(treekey) tree = self._data.get(treekey)
if tree is None: if tree is None:
...@@ -156,13 +173,14 @@ class fsIndex(object): ...@@ -156,13 +173,14 @@ class fsIndex(object):
def update(self, mapping): def update(self, mapping):
for k, v in mapping.items(): for k, v in mapping.items():
self[k] = v self[ensure_bytes(k)] = v
def has_key(self, key): def has_key(self, key):
v = self.get(key, self) v = self.get(key, self)
return v is not self return v is not self
def __contains__(self, key): def __contains__(self, key):
assert isinstance(key, bytes)
tree = self._data.get(key[:6]) tree = self._data.get(key[:6])
if tree is None: if tree is None:
return False return False
......
...@@ -489,7 +489,7 @@ class ObjectReader: ...@@ -489,7 +489,7 @@ class ObjectReader:
def _persistent_load(self, reference): def _persistent_load(self, reference):
if isinstance(reference, tuple): if isinstance(reference, tuple):
return self.load_persistent(*reference) return self.load_persistent(*reference)
elif isinstance(reference, bytes): elif isinstance(reference, (bytes, str)):
return self.load_oid(reference) return self.load_oid(reference)
else: else:
try: try:
...@@ -504,6 +504,11 @@ class ObjectReader: ...@@ -504,6 +504,11 @@ class ObjectReader:
# Quick instance reference. We know all we need to know # Quick instance reference. We know all we need to know
# to create the instance w/o hitting the db, so go for it! # to create the instance w/o hitting the db, so go for it!
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = self._cache.get(oid, None) obj = self._cache.get(oid, None)
if obj is not None: if obj is not None:
return obj return obj
...@@ -538,6 +543,10 @@ class ObjectReader: ...@@ -538,6 +543,10 @@ class ObjectReader:
def load_persistent_weakref(self, oid, database_name=None): def load_persistent_weakref(self, oid, database_name=None):
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = WeakRef.__new__(WeakRef) obj = WeakRef.__new__(WeakRef)
obj.oid = oid obj.oid = oid
if database_name is None: if database_name is None:
...@@ -556,6 +565,10 @@ class ObjectReader: ...@@ -556,6 +565,10 @@ class ObjectReader:
loaders['w'] = load_persistent_weakref loaders['w'] = load_persistent_weakref
def load_oid(self, oid): def load_oid(self, oid):
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = self._cache.get(oid, None) obj = self._cache.get(oid, None)
if obj is not None: if obj is not None:
return obj return obj
...@@ -632,15 +645,9 @@ def referencesf(p, oids=None): ...@@ -632,15 +645,9 @@ def referencesf(p, oids=None):
refs = [] refs = []
u = Unpickler(BytesIO(p)) u = Unpickler(BytesIO(p))
if sys.version_info[0] < 3: u.persistent_load = refs.append
u.persistent_load = refs
u.noload() u.noload()
u.noload() u.noload()
else:
# Py3: There is no `noload()` in Python 3.
u.persistent_load = refs.append
u.load()
u.load()
# Now we have a list of referencs. Need to convert to list of # Now we have a list of referencs. Need to convert to list of
# oids: # oids:
...@@ -651,12 +658,17 @@ def referencesf(p, oids=None): ...@@ -651,12 +658,17 @@ def referencesf(p, oids=None):
for reference in refs: for reference in refs:
if isinstance(reference, tuple): if isinstance(reference, tuple):
oid = reference[0] oid = reference[0]
elif isinstance(reference, bytes): elif isinstance(reference, (bytes, str)):
oid = reference oid = reference
else: else:
assert isinstance(reference, list) assert isinstance(reference, list)
continue continue
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
oids.append(oid) oids.append(oid)
return oids return oids
...@@ -675,15 +687,9 @@ def get_refs(a_pickle): ...@@ -675,15 +687,9 @@ def get_refs(a_pickle):
refs = [] refs = []
u = Unpickler(BytesIO(a_pickle)) u = Unpickler(BytesIO(a_pickle))
if sys.version_info[0] < 3: u.persistent_load = refs.append
u.persistent_load = refs
u.noload() u.noload()
u.noload() u.noload()
else:
# Py3: There is no `noload()` in Python 3.
u.persistent_load = refs.append
u.load()
u.load()
# Now we have a list of references. Need to convert to list of # Now we have a list of references. Need to convert to list of
# oids and class info: # oids and class info:
...@@ -692,13 +698,18 @@ def get_refs(a_pickle): ...@@ -692,13 +698,18 @@ def get_refs(a_pickle):
for reference in refs: for reference in refs:
if isinstance(reference, tuple): if isinstance(reference, tuple):
data = reference oid, klass = reference
elif isinstance(reference, bytes): elif isinstance(reference, (bytes, str)):
data = reference, None data, klass = reference, None
else: else:
assert isinstance(reference, list) assert isinstance(reference, list)
continue continue
result.append(data) if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
result.append((oid, klass))
return result return result
...@@ -58,6 +58,13 @@ class Object(object): ...@@ -58,6 +58,13 @@ class Object(object):
def getoid(self): def getoid(self):
return self._oid return self._oid
def __setstate__(self, state):
self.__dict__.clear()
self.__dict__.update(state)
if not isinstance(self._oid, bytes):
# Python 3
self._oid = self._oid.encode('ascii')
class C(Persistent): class C(Persistent):
pass pass
...@@ -89,7 +96,7 @@ def dumps(obj): ...@@ -89,7 +96,7 @@ def dumps(obj):
def pdumps(obj): def pdumps(obj):
s = BytesIO() s = BytesIO()
p = Pickler(s) p = Pickler(s, _protocol)
p.dump(obj) p.dump(obj)
p.dump(None) p.dump(None)
return s.getvalue() return s.getvalue()
......
...@@ -82,6 +82,8 @@ checker = renormalizing.RENormalizing([ ...@@ -82,6 +82,8 @@ checker = renormalizing.RENormalizing([
(re.compile(r'\bsize=65\b'), 'size=60'), (re.compile(r'\bsize=65\b'), 'size=60'),
(re.compile(r'\offset=206\b'), 'offset=201'), (re.compile(r'\offset=206\b'), 'offset=201'),
(re.compile(r'\bsize=156\b'), 'size=107'), (re.compile(r'\bsize=156\b'), 'size=107'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile(r'\bsize=113\b'), 'size=107'),
]) ])
...@@ -89,4 +91,5 @@ def test_suite(): ...@@ -89,4 +91,5 @@ def test_suite():
return doctest.DocTestSuite( return doctest.DocTestSuite(
setUp=zope.testing.setupstack.setUpDirectory, setUp=zope.testing.setupstack.setUpDirectory,
tearDown=zope.testing.setupstack.tearDown, tearDown=zope.testing.setupstack.tearDown,
optionflags=doctest.REPORT_NDIFF,
checker=ZODB.tests.util.checker + checker) checker=ZODB.tests.util.checker + checker)
...@@ -11,11 +11,11 @@ ...@@ -11,11 +11,11 @@
# FOR A PARTICULAR PURPOSE. # FOR A PARTICULAR PURPOSE.
# #
############################################################################## ##############################################################################
from pickle import Pickler, Unpickler
from ZODB.blob import Blob from ZODB.blob import Blob
from ZODB.DB import DB from ZODB.DB import DB
from ZODB.FileStorage import FileStorage from ZODB.FileStorage import FileStorage
from ZODB.tests.testConfig import ConfigTestBase from ZODB.tests.testConfig import ConfigTestBase
from ZODB._compat import Pickler, Unpickler
import os import os
if os.environ.get('USE_ZOPE_TESTING_DOCTEST'): if os.environ.get('USE_ZOPE_TESTING_DOCTEST'):
......
...@@ -183,8 +183,13 @@ checker = renormalizing.RENormalizing([ ...@@ -183,8 +183,13 @@ checker = renormalizing.RENormalizing([
(re.compile(r'(PersistentMapping|OOBTree) at 206\b'), r'\1 at 201'), (re.compile(r'(PersistentMapping|OOBTree) at 206\b'), r'\1 at 201'),
(re.compile(r'(OOBTree) at 404\b'), r'\1 at 350'), (re.compile(r'(OOBTree) at 404\b'), r'\1 at 350'),
(re.compile(r'(PersistentMapping|OOBTree) at 531\b'), r'\1 at 477'), (re.compile(r'(PersistentMapping|OOBTree) at 531\b'), r'\1 at 477'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile(r'(OOBTree) at 361\b'), r'\1 at 350'),
(re.compile(r'\boffset=440\b'), 'offset=429'),
(re.compile(r'(PersistentMapping|OOBTree) at 488\b'), r'\1 at 477'),
]) ])
def test_suite(): def test_suite():
return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker) return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker,
optionflags=doctest.REPORT_NDIFF)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment