golang_str: Implement bstr/ustr constructors

Both bstr and ustr constructors mimic the constructor of unicode (= str on py3) - an object is either stringified, or, if it provides the buffer interface and the constructor is invoked with optional encoding and/or errors arguments, decoded: # py2 class unicode(basestring) | unicode(object='') -> unicode object | unicode(string[, encoding[, errors]]) -> unicode object # py3 class str(object) | str(object='') -> str | str(bytes_or_buffer[, encoding[, errors]]) -> str Stringification of all bstr/ustr / unicode/bytes is handled automatically and means conversion to the created type via b or u. We follow unicode semantics for both ustr _and_ bstr, because bstr/ustr are intended to be used as strings.

golang_str: Implement bstr/ustr constructors
Both bstr and ustr constructors mimic the constructor of unicode (= str on py3) - an object is either stringified, or, if it provides the buffer interface and the constructor is invoked with optional encoding and/or errors arguments, decoded: # py2 class unicode(basestring) | unicode(object='') -> unicode object | unicode(string[, encoding[, errors]]) -> unicode object # py3 class str(object) | str(object='') -> str | str(bytes_or_buffer[, encoding[, errors]]) -> str Stringification of all bstr/ustr / unicode/bytes is handled automatically and means conversion to the created type via b or u. We follow unicode semantics for both ustr _and_ bstr, because bstr/ustr are intended to be used as strings.
781802d4 · Kirill Smelkov · 54c2a3cf · 781802d4 · 781802d4 · 781802d4
Commit 781802d4 authored Oct 06, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 243 additions and 25 deletions

README.rst README.rst +5 -3

golang/_golang_str.pyx golang/_golang_str.pyx +146 -20

golang/golang_str_test.py golang/golang_str_test.py +92 -2

No files found.
--- a/README.rst
+++ b/README.rst
@@ -233,8 +233,8 @@ Pygolang, similarly to Go, provides uniform UTF8-based approach to strings with
 the idea to make working with byte- and unicode- strings easy and transparently
 interoperable:
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to `unicode` [*]_.
+- `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to `bytes`.
+- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.
 The conversion, in both encoding and decoding, never fails and never loses
 information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
@@ -245,7 +245,9 @@ operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
 to `ustr`.  When the coercion happens, `bytes`, similarly to
 `bstr`, are also treated as UTF8-encoded strings.
-`bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For
+`bstr` and `ustr` are meant to be drop-in replacements for standard
+`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
+particular their constructors accept arbitrary objects and either convert or stringify them. For
 cases when no stringification is desired, and one only wants to convert
 `bstr`/`ustr` / `unicode`/`bytes`
 to Pygolang string, `b` and `u` provide a way to make sure an

--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -51,17 +51,10 @@ def pyb(s): # -> bstr
       See also: u, bstr/ustr.
    """
-    if type(s) is pybstr:
+    bs = _pyb(pybstr, s)
-        return s
+    if bs is None:
-    if isinstance(s, bytes):                    # py2: str      py3: bytes
-        pass
-    elif isinstance(s, unicode):                # py2: unicode  py3: str
-        s = _utf8_encode_surrogateescape(s)
-    else:
        raise TypeError("b: invalid type %s" % type(s))
+    return bs
-    return pybstr(s)
 def pyu(s): # -> ustr
    """u converts object to ustr.
@@ -81,17 +74,41 @@ def pyu(s): # -> ustr
       See also: b, bstr/ustr.
    """
-    if type(s) is pyustr:
+    us = _pyu(pyustr, s)
+    if us is None:
+        raise TypeError("u: invalid type %s" % type(s))
+    return us
+cdef _pyb(bcls, s): # -> ~bstr | None
+    if type(s) is bcls:
        return s
-    if isinstance(s, unicode):                  # py2: unicode  py3: str
+    if isinstance(s, bytes):
-        pass
+        if type(s) is not bytes:
-    elif isinstance(s, bytes):                  # py2: str      py3: bytes
+            s = _bdata(s)
+    elif isinstance(s, unicode):
+        s = _utf8_encode_surrogateescape(s)
+    else:
+        return None
+    assert type(s) is bytes
+    return bytes.__new__(bcls, s)
+cdef _pyu(ucls, s): # -> ~ustr | None
+    if type(s) is ucls:
+        return s
+    if isinstance(s, unicode):
+        if type(s) is not unicode:
+            s = _udata(s)
+    elif isinstance(s, bytes):
        s = _utf8_decode_surrogateescape(s)
    else:
-        raise TypeError("u: invalid type %s" % type(s))
+        return None
-    return pyustr(s)
+    assert type(s) is unicode
+    return unicode.__new__(ucls, s)
 # _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
@@ -136,7 +153,7 @@ cdef __pystr(object obj): # -> ~str
 class pybstr(bytes):
    """bstr is byte-string.
-    It is based on bytes and can automatically convert to unicode.
+    It is based on bytes and can automatically convert to/from unicode.
    The conversion never fails and never loses information:
        bstr → ustr → bstr
@@ -147,6 +164,15 @@ class pybstr(bytes):
    When the coercion happens, bytes, similarly to bstr, are also
    treated as UTF8-encoded strings.
+    bstr constructor accepts arbitrary objects and stringifies them:
+    - if encoding and/or errors is specified, the object must provide buffer
+      interface. The data in the buffer is decoded according to provided
+      encoding/errors and further encoded via UTF-8 into bstr.
+    - if the object is bstr/ustr / unicode/bytes - it is converted
+      to bstr. See b for details.
+    - otherwise bstr will have string representation of the object.
    See also: b, ustr/u.
    """
@@ -154,6 +180,18 @@ class pybstr(bytes):
    # won't be needed after switch to -> `cdef class`
    __slots__ = ()
+    def __new__(cls, object='', encoding=None, errors=None):
+        # encoding or errors  ->  object must expose buffer interface
+        if not (encoding is None and errors is None):
+            object = _buffer_decode(object, encoding, errors)
+        # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented
+        object = _bstringify(object)
+        assert isinstance(object, (unicode, bytes)), object
+        bobj = _pyb(cls, object)
+        assert bobj is not None
+        return bobj
    def __bytes__(self):    return self
    def __unicode__(self):  return pyu(self)
@@ -191,10 +229,11 @@ class pybstr(bytes):
    def __ge__(a, b):   return bytes.__ge__(a, _pyb_coerce(b))
-cdef class pyustr(unicode):
+# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
+class pyustr(unicode):
    """ustr is unicode-string.
-    It is based on unicode and can automatically convert to bytes.
+    It is based on unicode and can automatically convert to/from bytes.
    The conversion never fails and never loses information:
        ustr → bstr → ustr
@@ -205,9 +244,29 @@ cdef class pyustr(unicode):
    When the coercion happens, bytes, similarly to bstr, are also
    treated as UTF8-encoded strings.
+    ustr constructor, similarly to the one in bstr, accepts arbitrary objects
+    and stringifies them. Please refer to bstr and u documentation for details.
    See also: u, bstr/b.
    """
+    # don't allow to set arbitrary attributes.
+    # won't be needed after switch to -> `cdef class`
+    __slots__ = ()
+    def __new__(cls, object='', encoding=None, errors=None):
+        # encoding or errors  ->  object must expose buffer interface
+        if not (encoding is None and errors is None):
+            object = _buffer_decode(object, encoding, errors)
+        # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented
+        object = _bstringify(object)
+        assert isinstance(object, (unicode, bytes)), object
+        uobj = _pyu(cls, object)
+        assert uobj is not None
+        return uobj
    def __bytes__(self):    return pyb(self)
    def __unicode__(self):  return self
@@ -312,6 +371,37 @@ def pyqq(obj):
    return qobj
+# ---- _bstringify ----
+# _bstringify returns string representation of obj.
+# it is similar to unicode(obj).
+cdef _bstringify(object obj): # -> unicode|bytes
+    if type(obj) in (pybstr, pyustr, bytes, unicode):
+        return obj
+    if PY_MAJOR_VERSION >= 3:
+        return unicode(obj)
+    else:
+        # on py2 mimic manually what unicode(·) does on py3
+        # the reason we do it manually is because if we try just
+        # unicode(obj), and obj's __str__ returns UTF-8 bytestring, it will
+        # fail with UnicodeDecodeError. Similarly if we unconditionally do
+        # str(obj), it will fail if obj's __str__ returns unicode.
+        if hasattr(obj, '__unicode__'):
+            return obj.__unicode__()
+        elif hasattr(obj, '__str__'):
+            # (u'β').__str__() gives UnicodeEncodeError, but unicode has no
+            # .__unicode__ method. Work around it to handle custom unicode
+            # subclasses that do not override __str__.
+            if type(obj).__str__ is unicode.__str__:
+                return unicode(obj)
+            return obj.__str__()
+        else:
+            return repr(obj)
 # py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
 # This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
 # both arguments to Unicode - interpreting them as being unequal`, and that
@@ -381,6 +471,42 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3
        return pyfunctools.partial(self.func, obj)
+# ---- misc ----
+# _buffer_py2 returns buffer(obj) on py2 / fails on py3
+cdef object _buffer_py2(object obj):
+    IF PY2:                 # cannot `if PY_MAJOR_VERSION < 3` because then cython errors
+        return buffer(obj)  # "undeclared name not builtin: buffer"
+    ELSE:
+        raise AssertionError("must be called only on py2")
+# _buffer_decode decodes buf to unicode according to encoding and errors.
+#
+# buf must expose buffer interface.
+# encoding/errors can be None meaning to use default utf-8/strict.
+cdef unicode _buffer_decode(buf, encoding, errors):
+    if encoding is None: encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
+    if errors   is None: errors   = 'strict'
+    if _XPyObject_CheckOldBuffer(buf):
+        buf = _buffer_py2(buf)
+    else:
+        buf = memoryview(buf)
+    return bytearray(buf).decode(encoding, errors)
+cdef extern from "Python.h":
+    """
+    static int _XPyObject_CheckOldBuffer(PyObject *o) {
+    #if PY_MAJOR_VERSION >= 3
+        // no old-style buffers on py3
+        return 0;
+    #else
+        return PyObject_CheckReadBuffer(o);
+    #endif
+    }
+    """
+    bint _XPyObject_CheckOldBuffer(object o)
 # ---- UTF-8 encode/decode ----
 from six import unichr                      # py2: unichr       py3: chr
@@ -472,7 +598,7 @@ def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
 def _utf8_encode_surrogateescape(s): # -> bytes
    assert isinstance(s, unicode)
    if PY_MAJOR_VERSION >= 3:
-        return s.encode('UTF-8', 'surrogateescape')
+        return unicode.encode(s, 'UTF-8', 'surrogateescape')
    # py2 does not have surrogateescape error handler, and even if we
    # provide one, builtin unicode.encode() does not treat

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -28,8 +28,20 @@ from golang.strconv_test import byterange
 from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
 from pytest import raises, mark, skip
 import sys
+import six
 from six import text_type as unicode
 from six.moves import range as xrange
+import array
+# buftypes lists types with buffer interface that we will test against.
+buftypes = [
+        bytearray,
+        memoryview,
+        lambda x: array.array('B', x),
+]
+if six.PY2:
+    buftypes.append(buffer) # no buffer on py3
 # verify b/u and bstr/ustr basics.
@@ -123,6 +135,17 @@ def test_strings_basic():
    with raises(TypeError): b(object())
    with raises(TypeError): u(object())
+    # bstr/ustr - similarly to str - accept arbitrary objects
+    _ = bstr();         assert type(_) is bstr;  assert _ == ''
+    _ = ustr();         assert type(_) is ustr;  assert _ == ''
+    _ = bstr(123);      assert type(_) is bstr;  assert _ == '123'
+    _ = ustr(123);      assert type(_) is ustr;  assert _ == '123'
+    _ = bstr([1,'b']);  assert type(_) is bstr;  assert _ == "[1, 'b']"
+    _ = ustr([1,'b']);  assert type(_) is ustr;  assert _ == "[1, 'b']"
+    obj = object()
+    _ = bstr(obj);      assert type(_) is bstr;  assert _ == str(obj)  # <object ...>
+    _ = ustr(obj);      assert type(_) is ustr;  assert _ == str(obj)  # <object ...>
    b_  = xbytes    ("мир");  assert type(b_) is bytes
    u_  = xunicode  ("мир");  assert type(u_) is unicode
@@ -130,17 +153,46 @@ def test_strings_basic():
    # b/u from unicode
    bs = b(u_);    assert isinstance(bs, bytes);    assert type(bs) is bstr
    us = u(u_);    assert isinstance(us, unicode);  assert type(us) is ustr
+    _ = bstr(u_);  assert type(_) is bstr;  assert _ == "мир"
+    _ = ustr(u_);  assert type(_) is ustr;  assert _ == "мир"
    # b/u from bytes
    _ = b(b_);     assert type(_) is bstr;  assert _ == "мир"
    _ = u(b_);     assert type(_) is ustr;  assert _ == "мир"
+    _ = bstr(b_);  assert type(_) is bstr;  assert _ == "мир"
+    _ = ustr(b_);  assert type(_) is ustr;  assert _ == "мир"
    # TODO also handle bytearray?
+    # bstr/ustr from bytes/buffer with encoding
+    k8mir_bytes = u"мир".encode('koi8-r')
+    for tbuf in [bytes] + buftypes:
+        k8mir = tbuf(k8mir_bytes)
+        _ = bstr(k8mir, 'koi8-r');  assert type(_) is bstr;  assert _ == "мир"
+        _ = ustr(k8mir, 'koi8-r');  assert type(_) is ustr;  assert _ == "мир"
+        with raises(UnicodeDecodeError): bstr(k8mir, 'ascii')
+        with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
+        _ = bstr(k8mir, 'ascii', 'replace');  assert type(_) is bstr;  assert _ == u'\ufffd\ufffd\ufffd'
+        _ = ustr(k8mir, 'ascii', 'replace');  assert type(_) is ustr;  assert _ == u'\ufffd\ufffd\ufffd'
+        # no encoding -> utf8 with surrogateescape for bytes,  stringify for the rest
+        k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
+        k8mir_strok = k8mir_usurrogateescape
+        if not tbuf in (bytes,):
+            k8mir_strok = str(k8mir)  # e.g. '<memory at ...>' for memoryview
+        _ = bstr(k8mir);  assert type(_) is bstr;  assert _ == k8mir_strok
+        _ = ustr(k8mir);  assert type(_) is ustr;  assert _ == k8mir_strok
+        # encoding specified -> treat it precisely
+        with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
+        with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
+        with raises(UnicodeDecodeError): bstr(k8mir, encoding='utf-8')
+        with raises(UnicodeDecodeError): ustr(k8mir, encoding='utf-8')
+        with raises(UnicodeDecodeError): bstr(k8mir, errors='strict')
+        with raises(UnicodeDecodeError): ustr(k8mir, errors='strict')
    # b(b(·)) = identity,   u(u(·)) = identity
-    assert b(bs) is bs
+    assert b(bs) is bs;  assert bstr(bs) is bs
-    assert u(us) is us
+    assert u(us) is us;  assert ustr(us) is us
    # bytes(b(·)) = identity,   unicode(u(·)) = identity
    assert bytes  (bs) is bs
@@ -274,6 +326,44 @@ def test_strings_print():
    assertDoc(outok, stdout)
+# verify behaviour of bstr|ustr subclasses.
+@mark.parametrize('tx', (unicode, bstr, ustr))
+def test_strings_subclasses(tx):
+    x = xstr(u'мир', tx);  assert type(x) is tx
+    # subclass without __str__
+    class MyStr(tx):
+        pass
+    xx = MyStr(x);  assert type(xx) is MyStr
+    _  = tx(xx);    assert type(_)  is tx   ; assert _ == x  # e.g. unicode(MyStr) -> unicode, not MyStr
+    _  = bstr(xx);  assert type(_)  is bstr ; assert _ == 'мир'
+    _  = ustr(xx);  assert type(_)  is ustr ; assert _ == 'мир'
+    _  = b(xx);     assert type(_)  is bstr ; assert _ == 'мир'
+    _  = u(xx);     assert type(_)  is ustr ; assert _ == 'мир'
+    # subclass with __str__
+    class MyStr(tx):
+        def __str__(self): return u'αβγ'
+        __unicode__ = __str__
+    xx = MyStr(x);  assert type(xx) is MyStr
+    _  = tx(xx);    assert type(_)  is tx   ; assert _ == u'αβγ' # unicode(MyStr) -> u'αβγ', not 'мир'
+    _  = bstr(xx);  assert type(_)  is bstr ; assert _ == u'αβγ'
+    _  = ustr(xx);  assert type(_)  is ustr ; assert _ == u'αβγ'
+    _  = b(xx);     assert type(_)  is bstr ; assert _ == u'мир' # b(MyStr) -> 'мир', not 'αβγ'
+    _  = u(xx);     assert type(_)  is ustr ; assert _ == u'мир'
+    # non-subclass with __str__  (for completeness)
+    class MyObj(object):
+        def __str__(self):
+            return 'myobj'
+    xx = MyObj();   assert type(xx) is MyObj
+    _  = tx(xx);    assert type(_)  is tx   ; assert _ == 'myobj'
+    _  = bstr(xx);  assert type(_)  is bstr ; assert _ == 'myobj'
+    _  = ustr(xx);  assert type(_)  is ustr ; assert _ == 'myobj'
+    with raises(TypeError): b(xx)   # NOTE b/u reports "conversion failure"
+    with raises(TypeError): u(xx)
 def test_qq():
    # NOTE qq is also tested as part of strconv.quote