golang_str_pickle: Fix bstr to pickle/unpickle in forward-compatible way wrt upcoming UTF-8bk

In 1ec5ed82 (golang_str_pickle: Fix it so that py3 can load what py2 saved and back) we changed how bstr and ustr are pickled so that the pickling process is explicit and so that both py2 and py3 can load what either py2 or py3 saved. That works fine for its purpose. However, for protocol < 3, bstr is pickled via unicode data, with instructions to unpickle it as bstr(unicode-data). The idea is generally ok, but given the planned introduction of UTF-8bk (see c0a53847 "golang_str: TODO UTF-8bk" for details), bstr data saved before the UTF-8b -> UTF-8bk switch might be loaded in corrupt form after the switch. -> Avoid that by explicitly instructing the pickle stream to always load data that was saved before the switch to UTF-8bk as UTF-8b.

golang_str_pickle: Fix bstr to pickle/unpickle in forward-compatible way wrt upcoming UTF-8bk
In 1ec5ed82 (golang_str_pickle: Fix it so that py3 can load what py2 saved and back) we changed how bstr and ustr are pickled so that the pickling process is explicit and so that both py2 and py3 can load what either py2 or py3 saved. That works fine for its purpose. However, for protocol < 3, bstr is pickled via unicode data, with instructions to unpickle it as bstr(unicode-data). The idea is generally ok, but given the planned introduction of UTF-8bk (see c0a53847 "golang_str: TODO UTF-8bk" for details), bstr data saved before the UTF-8b -> UTF-8bk switch might be loaded in corrupt form after the switch. -> Avoid that by explicitly instructing the pickle stream to always load data that was saved before the switch to UTF-8bk as UTF-8b.
5aa1de72 · Kirill Smelkov · 9ef32517 · 5aa1de72 · 5aa1de72 · 5aa1de72
Commit 5aa1de72 authored Feb 19, 2025 by Kirill Smelkov
4 changed files
--- a/golang/__init__.py
+++ b/golang/__init__.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024  Nexedi SA and Contributors.
+# Copyright (C) 2018-2025  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -376,4 +376,5 @@ from ._golang import    \
    pyustr      as ustr,    \
    pyuchr      as uchr,    \
    pybiter     as biter,   \
-    pyuiter     as uiter
+    pyuiter     as uiter,   \
+    _butf8b
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -2017,6 +2017,11 @@ cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors)
 #   UnicodeEncodeError: 'utf-8' codec can't encode character '\udc00' in position 0: surrogates not allowed
 #
 # (*) aka UTF-8b (see http://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html)
+#
+# Call resulting encoding as UTF-8bk.
+#
+# TODO(kirr) adjust bstr pickling for protocol < 3 after switching bstr/ustr
+# to decode/encode via UTF-8bk instead of UTF-8b.
 from six import unichr                      # py2: unichr       py3: chr
 from six import int2byte as bchr            # py2: chr          py3: lambda x: bytes((x,))

--- a/golang/_golang_str_pickle.pyx
+++ b/golang/_golang_str_pickle.pyx
 # -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024  Nexedi SA and Contributors.
+# Copyright (C) 2023-2025  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -53,12 +53,20 @@ cdef _bstr__reduce_ex__(self, protocol):
    # to achieve that.
    if protocol < 3:
        # use UNICODE for data
-        udata = _udata(pyu(self))
+        #
-        if protocol < 2:
+        # explicitly mark to unpickle via _butf8b because with the introduction
-            return (self.__class__, (udata,))   # bstr UNICODE REDUCE
+        # of UTF-8bk the way bstr decodes unicode will change, and so if we
+        # would use `bstr UNICODE` for pickling it will result in corrupt data
+        # to be loaded after the switch to UTF-8bk.
+        #
+        # TODO pickle via bstr UNICODE REDUCE/NEWOBJ after switch from UTF-8b to UTF-8bk.
+        udata = _utf8_decode_surrogateescape(self)
+        if self.__class__ is pybstr:
+            return (_butf8b,                    # _butf8b UNICODE REDUCE
+                    (udata,))
        else:
-            return (pycopyreg.__newobj__,
+            return (_butf8b,                    # _butf8b bstr UNICODE REDUCE
-                    (self.__class__, udata))    # bstr UNICODE NEWOBJ
+                    (self.__class__, udata))
    else:
        # use BYTES for data
        bdata = _bdata(self)
@@ -73,10 +81,25 @@ cdef _bstr__reduce_ex__(self, protocol):
 cdef _ustr__reduce_ex__(self, protocol):
    # emit ustr(UNICODE).
-    # TODO later we might want to switch to emitting ustr(BYTES)
+    # TODO after UTF-8bk we might want to switch to emitting ustr(BYTES)
    #      even if we do this, it should be backward compatible
    if protocol < 2:
        return (self.__class__, (_udata(self),))# ustr UNICODE REDUCE
    else:
        return (pycopyreg.__newobj__,           # ustr UNICODE NEWOBJ
                (self.__class__, _udata(self)))
+# `_butf8b [bcls] udata` serves unpickling of bstr pickled with data
+# represented via UTF-8b decoded unicode.
+def _butf8b(*argv):
+    cdef object bcls = pybstr
+    cdef object udata
+    cdef int l = len(argv)
+    if l == 1:
+        udata = argv[0]
+    elif l == 2:
+        bcls, udata = argv
+    else:
+        raise TypeError("_butf8b() takes 1 or 2 arguments; %d given" % l)
+    return _pyb(bcls, _utf8_encode_surrogateescape(udata))
+_butf8b.__module__ = "golang"
--- a/golang/golang_str_pickle_test.py
+++ b/golang/golang_str_pickle_test.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2022-2024  Nexedi SA and Contributors.
+# Copyright (C) 2022-2025  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -128,13 +128,13 @@ def test_strings_pickle_bstr_ustr(pickle):
    _ = assert_pickle
    _(bs, 0,
-             b"cgolang\nbstr\n(V\\u043c\\u0438\\u0440\\udcff\ntR.")         # bstr(UNICODE)
+             b"cgolang\n_butf8b\n(V\\u043c\\u0438\\u0440\\udcff\ntR.")      # _butf8b(UNICODE)
    _(us, 0,
             b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.')         # ustr(UNICODE)
    _(bs, 1,
-             b'cgolang\nbstr\n(X\x09\x00\x00\x00'                           # bstr(BINUNICODE)
+             b'cgolang\n_butf8b\n(X\x09\x00\x00\x00'                        # _butf8b(BINUNICODE)
                        b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.')
    # NOTE BINUNICODE ...edb3bf not ...ff  (see test_strings_pickle_loadsave_UNICODE for details)
@@ -143,8 +143,8 @@ def test_strings_pickle_bstr_ustr(pickle):
                        b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.')
    _(bs, 2,
-             b'cgolang\nbstr\nX\x09\x00\x00\x00'                            # bstr(BINUNICODE)
+             b'cgolang\n_butf8b\nX\x09\x00\x00\x00'                         # _butf8b(BINUNICODE)
-                        b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85\x81.')
+                        b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85R.')
    _(us, 2,
             b'cgolang\nustr\nX\x09\x00\x00\x00'                            # ustr(BINUNICODE)