golang_str: bstr/ustr encode/decode

So far we've overridden almost all string methods that bstr/ustr inherited from bytes and unicode. However, 2 of the methods remained intact until now: unicode.encode() and bytes.decode(). Let's override them too for completeness:

- we want ustr.encode() to follow the signature of unicode.encode and for ustr.encode('utf-8') to return bstr.
- for consistency we also want ustr.encode() to return the same type regardless of which encoding/errors pair is used in the arguments.
- => ustr.encode() always returns bstr.

- we want bstr.decode() to follow the signature of bytes.decode and for bstr.decode('utf-8') to return ustr.
- for consistency we also want bstr.decode() to return the same type regardless of which encoding/errors pair is used in the arguments.
- => bstr.decode() always returns ustr.

So ustr.encode() -> bstr and bstr.decode() -> ustr.

Let's implement this by carrying out the encoding/decoding process internally, similarly to regular bytes and unicode, and wrapping the result into the corresponding pygolang type at the end.

golang_str: bstr/ustr encode/decode
So far we've overridden almost all string methods that bstr/ustr inherited from bytes and unicode. However, 2 of the methods remained intact until now: unicode.encode() and bytes.decode(). Let's override them too for completeness:

- we want ustr.encode() to follow the signature of unicode.encode and for ustr.encode('utf-8') to return bstr.
- for consistency we also want ustr.encode() to return the same type regardless of which encoding/errors pair is used in the arguments.
- => ustr.encode() always returns bstr.

- we want bstr.decode() to follow the signature of bytes.decode and for bstr.decode('utf-8') to return ustr.
- for consistency we also want bstr.decode() to return the same type regardless of which encoding/errors pair is used in the arguments.
- => bstr.decode() always returns ustr.

So ustr.encode() -> bstr and bstr.decode() -> ustr.

Let's implement this by carrying out the encoding/decoding process internally, similarly to regular bytes and unicode, and wrapping the result into the corresponding pygolang type at the end.
023907ee · Kirill Smelkov · 0985c583 · 023907ee · 023907ee
Commit 023907ee authored Oct 09, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 110 additions and 0 deletions

golang/_golang_str.pyx golang/_golang_str.pyx +42 -0

golang/golang_str_test.py golang/golang_str_test.py +68 -0

No files found.
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -407,6 +407,27 @@ class pybstr(bytes):
        return pyu(self).__format__(format_spec)


+    # encode/decode
+    def decode(self, encoding=None, errors=None):  # same signature as bytes.decode; result is always wrapped with pyu
+        if encoding is None and errors is None:    # called with no arguments at all -> lenient defaults
+            encoding = 'utf-8'             # NOTE always UTF-8, not sys.getdefaultencoding
+            errors   = 'surrogateescape'   # so that a bare .decode() does not raise on invalid UTF-8
+        else:                                      # at least one argument was given explicitly
+            if encoding is None:  encoding = 'utf-8'
+            if errors   is None:  errors   = 'strict'   # explicit encoding without errors -> strict
+
+        if encoding == 'utf-8'  and  errors == 'surrogateescape':
+            x = _utf8_decode_surrogateescape(self)      # dedicated helper for the utf-8/surrogateescape pair
+        else:
+            x = bytes.decode(self, encoding, errors)    # every other pair is delegated to plain bytes.decode
+        return pyu(x)                                   # wrap whatever was decoded via pyu
+
+    if PY_MAJOR_VERSION < 3:
+        # white out (hide) the encode method inherited from bytes, so that bstr instances expose no .encode
+        # TODO ideally whiteout it in such a way that bstr.encode (lookup on the class itself) also raises AttributeError
+        encode = property(doc='bstr has no encode')  # property without a getter: reading it on an instance raises AttributeError
+
+
    # all other string methods

    def capitalize(self):                       return pyb(pyu(self).capitalize())
@@ -673,6 +694,27 @@ class pyustr(unicode):
        return pyu(unicode.__format__(self, format_spec))


+    # encode/decode
+    def encode(self, encoding=None, errors=None):  # same signature as unicode.encode; result is always wrapped with pyb
+        if encoding is None and errors is None:    # called with no arguments at all -> lenient defaults
+            encoding = 'utf-8'             # NOTE always UTF-8, not sys.getdefaultencoding
+            errors   = 'surrogateescape'   # so that a bare .encode() does not raise on escaped surrogates
+        else:                                      # at least one argument was given explicitly
+            if encoding is None:  encoding = 'utf-8'
+            if errors   is None:  errors   = 'strict'   # explicit encoding without errors -> strict
+
+        if encoding == 'utf-8'  and  errors == 'surrogateescape':
+            x = _utf8_encode_surrogateescape(self)      # dedicated helper for the utf-8/surrogateescape pair
+        else:
+            x = unicode.encode(self, encoding, errors)  # every other pair is delegated to plain unicode.encode
+        return pyb(x)                                   # wrap whatever was encoded via pyb
+
+    if PY_MAJOR_VERSION < 3:
+        # white out (hide) the decode method inherited from unicode, so that ustr instances expose no .decode
+        # TODO ideally whiteout it in such a way that ustr.decode (lookup on the class itself) also raises AttributeError
+        decode = property(doc='ustr has no decode')  # property without a getter: reading it on an instance raises AttributeError
+
+
    # all other string methods

    def capitalize(self):   return pyu(unicode.capitalize(self))

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -610,6 +610,74 @@ def test_strings_iter():
    assert list(XIter()) == ['м','и','р','у',' ','м','и','р']


+# verify .encode/.decode .
+def test_strings_encodedecode():  # checks ustr.encode() -> bstr and bstr.decode() -> ustr, incl. defaults and error cases
+    us = u('мир')
+    bs = b('май')
+
+    # TODO also raise AttributeError on .encode/.decode lookup on classes
+    assert     hasattr(us, 'encode')   ;   assert     hasattr(ustr, 'encode')
+    assert not hasattr(bs, 'encode')  #;   assert not hasattr(bstr, 'encode')
+    assert not hasattr(us, 'decode')  #;   assert not hasattr(ustr, 'decode')
+    assert     hasattr(bs, 'decode')   ;   assert     hasattr(bstr, 'decode')
+
+    _ = us.encode();         assert type(_) is bstr;  assert _bdata(_) == xbytes('мир')
+    _ = us.encode('utf-8');  assert type(_) is bstr;  assert _bdata(_) == xbytes('мир')
+    _ = bs.decode();         assert type(_) is ustr;  assert _udata(_) == u'май'
+    _ = bs.decode('utf-8');  assert type(_) is ustr;  assert _udata(_) == u'май'
+
+    # !utf-8: encodings other than UTF-8 (koi8-r, cp1251)
+    k8mir = u'мир'.encode('koi8-r')                     # raw koi8-r bytes; not valid UTF-8 (see decode errors below)
+    b_k8mir = b(k8mir)
+    assert type(b_k8mir) is bstr
+    assert _bdata(b_k8mir) == k8mir
+    assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
+
+    _ = b_k8mir.decode('koi8-r')
+    assert type(_) is ustr
+    assert _udata(_) == u'мир'
+
+    b_cpmir = us.encode('cp1251')
+    assert type(b_cpmir) is bstr
+    assert _bdata(b_cpmir) == u'мир'.encode('cp1251')
+    assert _bdata(b_cpmir) == b'\xec\xe8\xf0'
+
+    # decode/encode errors
+    u_k8mir = b_k8mir.decode()                          # no decode error with
+    assert type(u_k8mir) is ustr                        # default parameters
+    assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'     # each undecodable byte 0xXX became U+DCXX
+    _ = b_k8mir.decode('utf-8', 'surrogateescape')      # no decode error with
+    assert type(_) is ustr                              # explicit utf-8/surrogateescape
+    assert _udata(_) == _udata(u_k8mir)
+
+    with raises(UnicodeDecodeError):  # decode error if encoding is explicitly specified
+        b_k8mir.decode('utf-8')
+    with raises(UnicodeDecodeError):
+        b_k8mir.decode('utf-8', 'strict')
+    with raises(UnicodeDecodeError):
+        b_k8mir.decode('ascii')
+
+    with raises(UnicodeEncodeError):
+        us.encode('ascii')
+
+    _ = u_k8mir.encode()                                # no encode error with
+    assert type(_) is bstr                              # default parameters
+    assert _bdata(_) == k8mir                           # escaped surrogates round-trip to the original bytes
+    _ = u_k8mir.encode('utf-8', 'surrogateescape')      # no encode error with
+    assert type(_) is bstr                              # explicit utf-8/surrogateescape
+    assert _bdata(_) == k8mir
+
+    # on py2 unicode.encode accepts surrogate pairs and does not complain
+    # TODO(?) manually implement encode/py2 and reject surrogate pairs by default
+    if six.PY3:
+        with raises(UnicodeEncodeError):  # encode error if encoding is explicitly specified
+            u_k8mir.encode('utf-8')
+        with raises(UnicodeEncodeError):
+            u_k8mir.encode('utf-8', 'strict')
+    with raises(UnicodeEncodeError):
+        u_k8mir.encode('ascii')
+
+
 # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
 @mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr))
 def test_strings_ops1(tx):