Commit 5aa1de72 authored by Kirill Smelkov

golang_str_pickle: Fix bstr to pickle/unpickle in forward-compatible way wrt upcoming UTF-8bk

In 1ec5ed82 (golang_str_pickle: Fix it so that py3 can load what py2
saved and back) we changed how bstr and ustr are pickled so that the
pickling process is explicit and that both py2/py3 can load what any of
py2/py3 saved. It all works ok for that.

However, for protocol < 3 bstr is pickled via unicode data, with
instructions to unpickle it as bstr(unicode-data). The idea is generally ok,
but given the planned introduction of UTF-8bk (see c0a53847
"golang_str: TODO UTF-8bk" for details), bstr data saved before the
UTF-8b -> UTF-8bk switch might be loaded in corrupt form after the
switch.

-> Take care to avoid that by explicitly instructing the pickle stream to
always load data saved before the switch to UTF-8bk as UTF-8b.
parent 9ef32517
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Copyright (C) 2018-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -376,4 +376,5 @@ from ._golang import \ ...@@ -376,4 +376,5 @@ from ._golang import \
pyustr as ustr, \ pyustr as ustr, \
pyuchr as uchr, \ pyuchr as uchr, \
pybiter as biter, \ pybiter as biter, \
pyuiter as uiter pyuiter as uiter, \
_butf8b
...@@ -2017,6 +2017,11 @@ cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors) ...@@ -2017,6 +2017,11 @@ cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors)
# UnicodeEncodeError: 'utf-8' codec can't encode character '\udc00' in position 0: surrogates not allowed # UnicodeEncodeError: 'utf-8' codec can't encode character '\udc00' in position 0: surrogates not allowed
# #
# (*) aka UTF-8b (see http://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html) # (*) aka UTF-8b (see http://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html)
#
# Call resulting encoding as UTF-8bk.
#
# TODO(kirr) adjust bstr pickling for protocol < 3 after switching bstr/ustr
# to decode/encode via UTF-8bk instead of UTF-8b.
from six import unichr # py2: unichr py3: chr from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,)) from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2023-2024 Nexedi SA and Contributors. # Copyright (C) 2023-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -53,12 +53,20 @@ cdef _bstr__reduce_ex__(self, protocol): ...@@ -53,12 +53,20 @@ cdef _bstr__reduce_ex__(self, protocol):
# to achieve that. # to achieve that.
if protocol < 3: if protocol < 3:
# use UNICODE for data # use UNICODE for data
udata = _udata(pyu(self)) #
if protocol < 2: # explicitly mark to unpickle via _butf8b because with the introduction
return (self.__class__, (udata,)) # bstr UNICODE REDUCE # of UTF-8bk the way bstr decodes unicode will change, and so if we
# would use `bstr UNICODE` for pickling it will result in corrupt data
# to be loaded after the switch to UTF-8bk.
#
# TODO pickle via bstr UNICODE REDUCE/NEWOBJ after switch from UTF-8b to UTF-8bk.
udata = _utf8_decode_surrogateescape(self)
if self.__class__ is pybstr:
return (_butf8b, # _butf8b UNICODE REDUCE
(udata,))
else: else:
return (pycopyreg.__newobj__, return (_butf8b, # _butf8b bstr UNICODE REDUCE
(self.__class__, udata)) # bstr UNICODE NEWOBJ (self.__class__, udata))
else: else:
# use BYTES for data # use BYTES for data
bdata = _bdata(self) bdata = _bdata(self)
...@@ -73,10 +81,25 @@ cdef _bstr__reduce_ex__(self, protocol): ...@@ -73,10 +81,25 @@ cdef _bstr__reduce_ex__(self, protocol):
cdef _ustr__reduce_ex__(self, protocol): cdef _ustr__reduce_ex__(self, protocol):
# emit ustr(UNICODE). # emit ustr(UNICODE).
# TODO later we might want to switch to emitting ustr(BYTES) # TODO after UTF-8bk we might want to switch to emitting ustr(BYTES)
# even if we do this, it should be backward compatible # even if we do this, it should be backward compatible
if protocol < 2: if protocol < 2:
return (self.__class__, (_udata(self),))# ustr UNICODE REDUCE return (self.__class__, (_udata(self),))# ustr UNICODE REDUCE
else: else:
return (pycopyreg.__newobj__, # ustr UNICODE NEWOBJ return (pycopyreg.__newobj__, # ustr UNICODE NEWOBJ
(self.__class__, _udata(self))) (self.__class__, _udata(self)))
# `_butf8b [bcls] udata` serves unpickling of bstr pickled with data
# represented via UTF-8b decoded unicode.
def _butf8b(*argv):
    """Rebuild a bstr from unicode data that was produced by UTF-8b decoding.

    This is the callable referenced from pickle streams as ``golang._butf8b``.
    It accepts two call forms:

        _butf8b(udata)         -> instance of pybstr
        _butf8b(bcls, udata)   -> instance of bcls

    In both forms udata is encoded back to bytes via
    _utf8_encode_surrogateescape and the result is wrapped with _pyb.

    Raises TypeError if called with anything other than 1 or 2 positional
    arguments.
    """
    cdef int nargs = len(argv)
    cdef object cls
    cdef object text

    if nargs == 2:
        # explicit class given: `_butf8b bcls udata`
        cls, text = argv
    elif nargs == 1:
        # short form: class defaults to plain pybstr
        cls, text = pybstr, argv[0]
    else:
        raise TypeError("_butf8b() takes 1 or 2 arguments; %d given" % nargs)

    return _pyb(cls, _utf8_encode_surrogateescape(text))
_butf8b.__module__ = "golang"
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2022-2024 Nexedi SA and Contributors. # Copyright (C) 2022-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -128,13 +128,13 @@ def test_strings_pickle_bstr_ustr(pickle): ...@@ -128,13 +128,13 @@ def test_strings_pickle_bstr_ustr(pickle):
_ = assert_pickle _ = assert_pickle
_(bs, 0, _(bs, 0,
b"cgolang\nbstr\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # bstr(UNICODE) b"cgolang\n_butf8b\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # _butf8b(UNICODE)
_(us, 0, _(us, 0,
b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.') # ustr(UNICODE) b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.') # ustr(UNICODE)
_(bs, 1, _(bs, 1,
b'cgolang\nbstr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE) b'cgolang\n_butf8b\n(X\x09\x00\x00\x00' # _butf8b(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.') b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.')
# NOTE BINUNICODE ...edb3bf not ...ff (see test_strings_pickle_loadsave_UNICODE for details) # NOTE BINUNICODE ...edb3bf not ...ff (see test_strings_pickle_loadsave_UNICODE for details)
...@@ -143,8 +143,8 @@ def test_strings_pickle_bstr_ustr(pickle): ...@@ -143,8 +143,8 @@ def test_strings_pickle_bstr_ustr(pickle):
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.') b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.')
_(bs, 2, _(bs, 2,
b'cgolang\nbstr\nX\x09\x00\x00\x00' # bstr(BINUNICODE) b'cgolang\n_butf8b\nX\x09\x00\x00\x00' # _butf8b(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85\x81.') b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85R.')
_(us, 2, _(us, 2,
b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE) b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment