Commit 023907ee authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: bstr/ustr encode/decode

So far we've overridden almost all string methods that bstr/ustr
inherit from bytes and unicode. However, two of the methods remained
intact until now: unicode.encode() and bytes.decode(). Let's override
them too for completeness:

- we want ustr.encode() to follow signature of unicode.encode and for ustr.encode('utf-8') to return bstr.
- for consistency we also want ustr.encode() to return the same type
  regardless of which encoding/errors pair is used in the arguments.
- => ustr.encode() always returns bstr.
- we want bstr.decode() to follow signature of bytes.decode and for bstr.decode('utf-8') to return ustr.
- for consistency we also want bstr.decode() to return the same type
  regardless of which encoding/errors pair is used in the arguments.
- => bstr.decode() always returns ustr.

So  ustr.encode() -> bstr  and  bstr.decode() -> ustr.

Let's implement this by carrying out the encoding/decoding process
internally, similarly to regular bytes and unicode, and wrapping the
result into the corresponding pygolang type at the end.
parent 0985c583
......@@ -407,6 +407,27 @@ class pybstr(bytes):
return pyu(self).__format__(format_spec)
# encode/decode
def decode(self, encoding=None, errors=None):
    """Decode self and return the result wrapped via pyu().

    Signature follows bytes.decode. Defaults are resolved as follows:

    - neither argument given  -> 'utf-8' + 'surrogateescape'
      (always UTF-8, never sys.getdefaultencoding);
    - only one argument given -> the missing one becomes 'utf-8'
      or 'strict' respectively.

    The 'utf-8'/'surrogateescape' pair is handled by
    _utf8_decode_surrogateescape; any other combination is delegated
    to bytes.decode.
    """
    # errors default depends on whether the caller picked an encoding,
    # so resolve it first, while encoding still holds the caller's value.
    if errors is None:
        errors = 'surrogateescape' if encoding is None else 'strict'
    if encoding is None:
        encoding = 'utf-8'

    if encoding == 'utf-8' and errors == 'surrogateescape':
        return pyu(_utf8_decode_surrogateescape(self))
    return pyu(bytes.decode(self, encoding, errors))
if PY_MAJOR_VERSION < 3:
    # On py2 an .encode attribute is inherited from bytes. Shadow it with a
    # getter-less property so that looking it up on an instance fails.
    # TODO ideally do this in a way that makes bstr.encode (lookup on the
    #      class itself) raise AttributeError as well.
    encode = property(doc='bstr has no encode')
# all other string methods
def capitalize(self): return pyb(pyu(self).capitalize())
......@@ -673,6 +694,27 @@ class pyustr(unicode):
return pyu(unicode.__format__(self, format_spec))
# encode/decode
def encode(self, encoding=None, errors=None):
    """Encode self and return the result wrapped via pyb().

    Signature follows unicode.encode. Defaults are resolved as follows:

    - neither argument given  -> 'utf-8' + 'surrogateescape'
      (always UTF-8, never sys.getdefaultencoding);
    - only one argument given -> the missing one becomes 'utf-8'
      or 'strict' respectively.

    The 'utf-8'/'surrogateescape' pair is handled by
    _utf8_encode_surrogateescape; any other combination is delegated
    to unicode.encode.
    """
    # errors default depends on whether the caller picked an encoding,
    # so resolve it first, while encoding still holds the caller's value.
    if errors is None:
        errors = 'surrogateescape' if encoding is None else 'strict'
    if encoding is None:
        encoding = 'utf-8'

    if encoding == 'utf-8' and errors == 'surrogateescape':
        return pyb(_utf8_encode_surrogateescape(self))
    return pyb(unicode.encode(self, encoding, errors))
if PY_MAJOR_VERSION < 3:
    # On py2 a .decode attribute is inherited from unicode. Shadow it with a
    # getter-less property so that looking it up on an instance fails.
    # TODO ideally do this in a way that makes ustr.decode (lookup on the
    #      class itself) raise AttributeError as well.
    decode = property(doc='ustr has no decode')
# all other string methods
def capitalize(self): return pyu(unicode.capitalize(self))
......
......@@ -610,6 +610,74 @@ def test_strings_iter():
assert list(XIter()) == ['м','и','р','у',' ','м','и','р']
# verify .encode/.decode .
def test_strings_encodedecode():
# Checks that ustr.encode() always yields bstr and bstr.decode() always
# yields ustr, for default arguments, explicit utf-8, non-utf-8 codecs,
# and for data that is not valid in the requested encoding.
us = u('мир')
bs = b('май')
# attribute presence: ustr has only .encode, bstr has only .decode
# (the class-level negative checks are commented out pending the TODO below)
# TODO also raise AttributeError on .encode/.decode lookup on classes
assert hasattr(us, 'encode') ; assert hasattr(ustr, 'encode')
assert not hasattr(bs, 'encode') #; assert not hasattr(bstr, 'encode')
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode')
assert hasattr(bs, 'decode') ; assert hasattr(bstr, 'decode')
# default arguments and explicit 'utf-8' give the same data and result type
_ = us.encode(); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май'
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май'
# !utf-8
# koi8-r bytes wrapped into bstr keep their raw data and decode back to ustr
k8mir = u'мир'.encode('koi8-r')
b_k8mir = b(k8mir)
assert type(b_k8mir) is bstr
assert _bdata(b_k8mir) == k8mir
assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
_ = b_k8mir.decode('koi8-r')
assert type(_) is ustr
assert _udata(_) == u'мир'
# encoding ustr to cp1251 matches what plain unicode.encode produces
b_cpmir = us.encode('cp1251')
assert type(b_cpmir) is bstr
assert _bdata(b_cpmir) == u'мир'.encode('cp1251')
assert _bdata(b_cpmir) == b'\xec\xe8\xf0'
# decode/encode errors
# koi8-r data is not valid utf-8: with default arguments each byte becomes
# a \udcXX surrogate instead of raising
u_k8mir = b_k8mir.decode() # no decode error with
assert type(u_k8mir) is ustr # default parameters
assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
_ = b_k8mir.decode('utf-8', 'surrogateescape') # no decode error with
assert type(_) is ustr # explicit utf-8/surrogateescape
assert _udata(_) == _udata(u_k8mir)
with raises(UnicodeDecodeError): # decode error if encoding is explicitly specified
b_k8mir.decode('utf-8')
with raises(UnicodeDecodeError):
b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError):
b_k8mir.decode('ascii')
with raises(UnicodeEncodeError):
us.encode('ascii')
# encoding the surrogate-carrying string back restores the original koi8-r bytes
_ = u_k8mir.encode() # no encode error with
assert type(_) is bstr # default parameters
assert _bdata(_) == k8mir
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with
assert type(_) is bstr # explicit utf-8/surrogateescape
assert _bdata(_) == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain
# TODO(?) manually implement encode/py2 and reject surrogate pairs by default
if six.PY3:
with raises(UnicodeEncodeError): # encode error if encoding is explicitly specified
u_k8mir.encode('utf-8')
with raises(UnicodeEncodeError):
u_k8mir.encode('utf-8', 'strict')
with raises(UnicodeEncodeError):
u_k8mir.encode('ascii')
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
@mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr))
def test_strings_ops1(tx):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment