Commit 1ec5ed82 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str_pickle: Fix it so that py3 can load what py2 saved and back

Since ebd18f3f (golang_str: bstr/ustr pickle support) bstr and ustr have
support for pickling. However in that patch I verified that it is
possible to dump and load back an object only on the same python
version, which missed that fact that a bstr pickled on py2 cannot be
loaded on py3:

on py2:

    (z-dev) kirr@deca:~/src/tools/go/pygolang$ ipython
    Python 2.7.18 (default, Apr 28 2021, 17:39:59)
    ...

    In [1]: from golang import *

    In [2]: s = bstr('мир') + b'\xff'

    In [3]: s
    Out[3]: b(b'мир\xff')

    In [5]: import pickle

    In [6]: p = pickle.dumps(1)

    In [7]: p
    Out[7]: 'I1\n.'

    In [8]: import pickletools

    In [9]: p = pickle.dumps(s, 1)

    In [10]: p
    Out[10]: 'ccopy_reg\n_reconstructor\nq\x00(cgolang._golang\n_pybstr\nq\x01h\x01U\x07\xd0\xbc\xd0\xb8\xd1\x80\xffq\x02tq\x03Rq\x04.'

    In [11]: pickletools.dis(p)
        0: c    GLOBAL     'copy_reg _reconstructor'
       25: q    BINPUT     0
       27: (    MARK
       28: c        GLOBAL     'golang._golang _pybstr'
       52: q        BINPUT     1
       54: h        BINGET     1
       56: U        SHORT_BINSTRING '\xd0\xbc\xd0\xb8\xd1\x80\xff'
       65: q        BINPUT     2
       67: t        TUPLE      (MARK at 27)
       68: q    BINPUT     3
       70: R    REDUCE
       71: q    BINPUT     4
       73: .    STOP
    highest protocol among opcodes = 1

on py3:

    (py39.venv) kirr@deca:~/src/tools/go/pygolang-master$ ipython
    Python 3.9.19+ (heads/3.9:40d77b93672, Apr 12 2024, 06:40:05)
    ...

    In [1]: from golang import *

    In [2]: import pickle

    In [3]: p = b'ccopy_reg\n_reconstructor\nq\x00(cgolang._golang\n_pybstr\nq\x01h\x01U\x07\xd0\xbc\xd0\xb8\xd1\x80\xffq\x02tq\x03Rq\x04.'

    In [4]: s = pickle.loads(p)
    ---------------------------------------------------------------------------
    UnicodeDecodeError                        Traceback (most recent call last)
    Cell In[4], line 1
    ----> 1 s = pickle.loads(p)

    UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)

which happens in the above example because pickling bstr relies on
SHORT_BINSTRING opcode which is not really handled well on py3.

-> Rework how bstr and ustr are pickled by fully taking control on what
we emit at which protocol level and how and asserting in tests that
pickling produces exactly the data, that is expected to be on the
output.

This way we know that pickling bstr/ustr works the same way on both py2
and py3 and, by also asserting that that data can be unpickled and into
the same string object, that both py2 and py3 can load what any of py2
or py3 saved.

For the reference the dump for above b(b'мир\xff') now becomes

    In [5]: p
    Out[5]: 'cgolang\nbstr\nq\x00(X\t\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbfq\x01tq\x02Rq\x03.'

    In [7]: pickletools.dis(p)
        0: c    GLOBAL     'golang bstr'
       13: q    BINPUT     0
       15: (    MARK
       16: X        BINUNICODE u'\u043c\u0438\u0440\udcff'
       30: q        BINPUT     1
       32: t        TUPLE      (MARK at 15)
       33: q    BINPUT     2
       35: R    REDUCE
       36: q    BINPUT     3
       38: .    STOP
    highest protocol among opcodes = 1

See comments in the code, and added golden vectors in the test for details.
parent 33576c9e
......@@ -1141,6 +1141,15 @@ if PY2:
# ---- adjust bstr/ustr classes after what cython generated ----
# change names of bstr/ustr to be e.g. "golang.bstr" instead of "golang._golang._bstr"
# this makes sure that unpickling saved bstr does not load via unpatched origin
# class, and is also generally good for saving pickle size and for reducing _golang exposure.
(<PyTypeObject*>pybstr).tp_name = "golang.bstr"
(<PyTypeObject*>pyustr).tp_name = "golang.ustr"
assert pybstr.__module__ == "golang"; assert pybstr.__name__ == "bstr"
assert pyustr.__module__ == "golang"; assert pyustr.__name__ == "ustr"
# for pybstr/pyustr cython generates .tp_dealloc that refer to bytes/unicode types directly.
# override that to refer to zbytes/zunicode to avoid infinite recursion on free
# when builtin bytes and unicode are replaced with bstr/ustr.
......
......@@ -28,26 +28,55 @@ if PY_MAJOR_VERSION >= 3:
else:
import copy_reg as pycopyreg
cdef object zbinary # = zodbpickle.binary | None
try:
import zodbpickle
except ImportError:
zbinary = None
else:
zbinary = zodbpickle.binary
# support for pickling bstr/ustr as standalone types.
#
# pickling is organized in such a way that
# - what is saved by py2 can be loaded correctly on both py2/py3, and similarly
# - what is saved by py3 can be loaded correctly on both py2/py3 as well.
cdef _bstr__reduce_ex__(self, protocol):
# override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls bytes(bstr-instance) to
# retrieve state, which gives bstr, not bytes. Fix state to be bytes ourselves.
if protocol >= 2:
return zbytes.__reduce_ex__(self, protocol)
return (
pycopyreg._reconstructor,
(self.__class__, self.__class__, _bdata(self))
)
# Ideally we want to emit bstr(BYTES), but BYTES is not available for
# protocol < 3. And for protocol < 3 emitting bstr(STRING) is not an
# option because plain py3 raises UnicodeDecodeError on loading arbitrary
# STRING data. However emitting bstr(UNICODE) works universally because
# pickle supports arbitrary unicode - including invalid unicode - out of
# the box and in exactly the same way on both py2 and py3. For the
# reference upstream py3 uses surrogatepass on encode/decode UNICODE data
# to achieve that.
if protocol < 3:
# use UNICODE for data
udata = _udata(pyu(self))
if protocol < 2:
return (self.__class__, (udata,)) # bstr UNICODE REDUCE
else:
return (pycopyreg.__newobj__,
(self.__class__, udata)) # bstr UNICODE NEWOBJ
else:
# use BYTES for data
bdata = _bdata(self)
if PY_MAJOR_VERSION < 3:
# the only way we can get here on py2 and protocol >= 3 is zodbpickle
# -> similarly to py3 save bdata as BYTES
assert zbinary is not None
bdata = zbinary(bdata)
return (
pycopyreg.__newobj__, # bstr BYTES NEWOBJ
(self.__class__, bdata))
cdef _ustr__reduce_ex__(self, protocol):
# override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls unicode(ustr-instance) to
# retrieve state, which gives ustr, not unicode. Fix state to be unicode ourselves.
if protocol >= 2:
return zunicode.__reduce_ex__(self, protocol)
return (
pycopyreg._reconstructor,
(self.__class__, self.__class__, _udata(self))
)
# emit ustr(UNICODE).
# TODO later we might want to switch to emitting ustr(BYTES)
# even if we do this, it should be backward compatible
if protocol < 2:
return (self.__class__, (_udata(self),))# ustr UNICODE REDUCE
else:
return (pycopyreg.__newobj__, # ustr UNICODE NEWOBJ
(self.__class__, _udata(self)))
......@@ -22,7 +22,7 @@ from __future__ import print_function, absolute_import
from golang import b, u, bstr, ustr
from golang.golang_str_test import xbytes
from pytest import fixture
from pytest import raises, fixture
import io, struct
import six
......@@ -70,18 +70,60 @@ def test_strings_pickle(pickle):
def diss(p): return xdiss(pickle2tools(pickle), p)
def dis(p): print(diss(p))
for proto in range(0, HIGHEST_PROTOCOL(pickle)+1):
p_bs = xdumps(pickle, bs, proto)
#dis(p_bs)
bs_ = xloads(pickle, p_bs)
assert type(bs_) is bstr
assert bs_ == bs
p_us = xdumps(pickle, us, proto)
#dis(p_us)
us_ = xloads(pickle, p_us)
assert type(us_) is ustr
assert us_ == us
# assert_pickle verifies that pickling obj results in dumps_ok
# and that unpickling results back in obj.
assert HIGHEST_PROTOCOL(pickle) <= 5
def assert_pickle(obj, proto, dumps_ok):
if proto > HIGHEST_PROTOCOL(pickle):
with raises(ValueError):
xdumps(pickle, obj, proto)
return
p = xdumps(pickle, obj, proto)
assert p == dumps_ok, diss(p)
#dis(p)
obj2 = xloads(pickle, p)
assert type(obj2) is type(obj)
assert obj2 == obj
_ = assert_pickle
_(bs, 0,
b"cgolang\nbstr\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # bstr(UNICODE)
_(us, 0,
b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.') # ustr(UNICODE)
_(bs, 1,
b'cgolang\nbstr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.')
# NOTE BINUNICODE ...edb3bf not ...ff
_(us, 1,
b'cgolang\nustr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE)
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.')
_(bs, 2,
b'cgolang\nbstr\nX\x09\x00\x00\x00' # bstr(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85\x81.')
_(us, 2,
b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE)
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.')
_(bs, 3,
b'cgolang\nbstr\nC\x07\xd0\xbc\xd0\xb8\xd1\x80\xff\x85\x81.') # bstr(SHORT_BINBYTES)
_(us, 3,
b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE)
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.')
for p in (4,5):
_(bs, p,
b'\x8c\x06golang\x8c\x04bstr\x93C\x07' # bstr(SHORT_BINBYTES)
b'\xd0\xbc\xd0\xb8\xd1\x80\xff\x85\x81.')
_(us, p,
b'\x8c\x06golang\x8c\x04ustr\x93\x8c\x09' # ustr(SHORT_BINUNICODE)
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.')
# ---- disassembly ----
......@@ -93,7 +135,7 @@ def xdiss(pickletools, p): # -> str
return out.getvalue()
# ---- loads and dumps ----
# ---- loads and normalized dumps ----
# xloads loads pickle p via pickle.loads
# it also verifies that .load and Unpickler.load give the same result.
......@@ -119,7 +161,10 @@ def xdumps(pickle, obj, proto, **kw):
assert type(p3) is bytes
assert p1 == p2 == p3
return p1
# remove not interesting parts: PROTO / FRAME header and unused PUTs
if proto >= 2:
assert p1.startswith(PROTO(proto))
return pickle_normalize(pickle2tools(pickle), p1)
def _xpickle_attr(pickle, name):
# on py3 pickle.py tries to import from C _pickle to optimize by default
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment