Commit da4b857b authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Add ustr.decode for symmetry with bstr.decode and because gpy2 breaks without it

Without a working unicode.decode, gpython/py2 with unicode replaced by ustr
fails when running ERP5 as follows:

    $ /srv/slapgrid/slappart49/t/ekg/i/5/bin/runTestSuite --help
    No handlers could be found for logger "SecurityInfo"
    Traceback (most recent call last):
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/.runTestSuite.pyexe", line 296, in <module>
        main()
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 484, in main
        pymain(argv, init)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 292, in pymain
        run(mmain)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 192, in run
        _execfile(filepath, mmain.__dict__)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 339, in _execfile
        six.exec_(code, globals, locals)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/six-1.16.0-py2.7.egg/six.py", line 735, in exec_
        exec("""exec _code_ in _globs_, _locs_""")
      File "<string>", line 1, in <module>
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/runTestSuite", line 10, in <module>
        from Products.ERP5Type.tests.runTestSuite import main; sys.exit(main())
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/__init__.py", line 96, in <module>
        from . import ZopePatch
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/ZopePatch.py", line 75, in <module>
        from Products.ERP5Type.patches import ZopePageTemplateUtils
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/patches/ZopePageTemplateUtils.py", line 58, in <module>
        convertToUnicode(u'', 'text/xml', ())
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/Zope-4.8.9+slapospatched002-py2.7.egg/Products/PageTemplates/utils.py", line 73, in convertToUnicode
        return source.decode(encoding), encoding
    AttributeError: unreadable attribute

And in general, if we treat bstr and ustr as two different
representations of the same entity, then since we have bstr.decode,
ustr.decode is also needed for symmetry, with both operations converting
the bytes representation of the string into unicode.

Now there is full symmetry between bstr/ustr and encode/decode. Quoting the updated encode/decode text:

    Encode encodes unicode representation of the string into bytes, leaving string domain.
    Decode decodes bytes   representation of the string into ustr, staying inside string domain.

    Both bstr and ustr are accepted by encode and decode treating them as two
    different representations of the same entity.

    On encoding, for bstr, the string representation is first converted to
    unicode and encoded to bytes from there. For ustr unicode representation
    of the string is directly encoded.

    On decoding, for ustr, the string representation is first converted to
    bytes and decoded to unicode from there. For bstr bytes representation of
    the string is directly decoded.
parent 6f26b32c
......@@ -460,25 +460,31 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# encode/decode
#
# Encoding strings - both bstr and ustr - convert type to bytes leaving string domain.
# Encode encodes unicode representation of the string into bytes, leaving string domain.
# Decode decodes bytes representation of the string into ustr, staying inside string domain.
#
# Encode treats bstr and ustr as string, encoding unicode representation of
# the string to bytes. For bstr it means that the string representation is
# first converted to unicode and encoded to bytes from there. For ustr
# unicode representation of the string is directly encoded.
# Both bstr and ustr are accepted by encode and decode treating them as two
# different representations of the same entity.
#
# Decoding strings is not provided. However for bstr the decode is provided
# treating input data as raw bytes and producing ustr as the result.
# On encoding, for bstr, the string representation is first converted to
# unicode and encoded to bytes from there. For ustr unicode representation
# of the string is directly encoded.
#
# On decoding, for ustr, the string representation is first converted to
# bytes and decoded to unicode from there. For bstr bytes representation of
# the string is directly decoded.
#
# NOTE __bytes__ and encode are the only operations that leave string domain.
def encode(self, encoding=None, errors=None): # -> bytes
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
return _bdata(self)
# on py2 e.g. bytes.encode('string-escape') works on bytes directly
if PY_MAJOR_VERSION < 3:
codec = pycodecs.lookup(encoding)
if not codec._is_text_encoding or \
encoding in ('string-escape',): # string-escape also works on bytes
codec = _pycodecs_lookup_binary(encoding)
if codec is not None:
return codec.encode(self, errors)[0]
return pyu(self).encode(encoding, errors)
......@@ -795,15 +801,23 @@ cdef class _pyustr(unicode):
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_encode_surrogateescape(self)
else:
x = zunicode.encode(self, encoding, errors)
return x
return _utf8_encode_surrogateescape(self)
# on py2 e.g. 'string-escape' works on bytes
if PY_MAJOR_VERSION < 3:
# whiteout decode inherited from unicode
# TODO ideally whiteout it in such a way that ustr.decode also raises AttributeError
decode = property(doc='ustr has no decode')
codec = _pycodecs_lookup_binary(encoding)
if codec is not None:
return codec.encode(pyb(self), errors)[0]
return zunicode.encode(self, encoding, errors)
def decode(self, encoding=None, errors=None): # -> ustr | bstr for encodings like string-escape
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
return pyu(self)
return pyb(self).decode(encoding, errors)
# all other string methods
......@@ -1891,6 +1905,15 @@ cdef extern from "Python.h":
"""
bint _XPyMapping_Check(object o)
# _pycodecs_lookup_binary returns the codec corresponding to encoding if that
# codec works on binary input. Examples of such codecs are the string-escape
# and hex encodings.
#
# The codec is looked up via pycodecs.lookup and is returned only when it is
# not flagged as a text encoding, or when the encoding is 'string-escape'.
# Otherwise None is returned, so that the caller can fall back to regular text
# encoding/decoding. A failure of the lookup itself is not handled here and
# reaches the caller (LookupError for an unknown encoding, per the signature
# note below).
cdef _pycodecs_lookup_binary(encoding): # -> codec | None (text) | LookupError (no such encoding)
codec = pycodecs.lookup(encoding)
if not codec._is_text_encoding or \
encoding in ('string-escape',): # string-escape also works on bytes
return codec
return None
# ---- UTF-8 encode/decode ----
......
......@@ -653,58 +653,61 @@ def test_strings_encodedecode():
us = u('мир')
bs = b('май')
_ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир')
_ = bs.encode(); assert type(_) is bytes; assert _ == xbytes('май')
_ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май')
# encode does obj.encode and makes sure result type is bytes
def encode(obj, *argv):
_ = obj.encode(*argv)
assert type(_) is bytes
return _
# TODO also raise AttributeError on .encode/.decode lookup on classes
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode')
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май'
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май'
# decode does obj.decode and makes sure result type is ustr
def decode(obj, *argv):
_ = obj.decode(*argv)
assert type(_) is ustr
return _
# !utf-8
k8mir = u'мир'.encode('koi8-r')
b_k8mir = b(k8mir)
assert type(b_k8mir) is bstr
assert _bdata(b_k8mir) == k8mir
assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
_ = encode(us); assert _ == xbytes('мир')
_ = encode(us, 'utf-8'); assert _ == xbytes('мир')
_ = encode(bs); assert _ == xbytes('май')
_ = encode(bs, 'utf-8'); assert _ == xbytes('май')
_ = b_k8mir.decode('koi8-r')
assert type(_) is ustr
assert _udata(_) == u'мир'
_ = decode(us); assert _udata(_) == u'мир'
_ = decode(us, 'utf-8'); assert _udata(_) == u'мир'
_ = decode(bs); assert _udata(_) == u'май'
_ = decode(bs, 'utf-8'); assert _udata(_) == u'май'
cpmir = us.encode('cp1251')
assert type(cpmir) is bytes
assert cpmir == u'мир'.encode('cp1251')
assert cpmir == b'\xec\xe8\xf0'
# !utf-8
k8mir = u'мир'.encode('koi8-r'); assert k8mir == b'\xcd\xc9\xd2'
b_k8mir = b(k8mir); assert type(b_k8mir) is bstr; assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
u_k8mir = u(k8mir); assert type(u_k8mir) is ustr; assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
# decode/encode errors
u_k8mir = b_k8mir.decode() # no decode error with
assert type(u_k8mir) is ustr # default parameters
assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
_ = b_k8mir.decode('utf-8', 'surrogateescape') # no decode error with
assert type(_) is ustr # explicit utf-8/surrogateescape
assert _udata(_) == _udata(u_k8mir)
with raises(UnicodeDecodeError): # decode error if encoding is explicitly specified
b_k8mir.decode('utf-8')
with raises(UnicodeDecodeError):
b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError):
b_k8mir.decode('ascii')
_ = decode(b_k8mir, 'koi8-r'); assert _udata(_) == u'мир'
_ = decode(u_k8mir, 'koi8-r'); assert _udata(_) == u'мир'
with raises(UnicodeEncodeError):
us.encode('ascii')
_ = encode(us, 'cp1251'); assert _ == u'мир'.encode('cp1251'); assert _ == b'\xec\xe8\xf0'
_ = encode(bs, 'cp1251'); assert _ == u'май'.encode('cp1251'); assert _ == b'\xec\xe0\xe9'
_ = u_k8mir.encode() # no encode error with
assert type(_) is bytes # default parameters
assert _ == k8mir
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with
assert type(_) is bytes # explicit utf-8/surrogateescape
# decode/encode errors
_ = decode(b_k8mir); assert _ == u_k8mir # no decode error with default parameters
_ = decode(b_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape
assert _ == u_k8mir
_ = decode(u_k8mir); assert _ == u_k8mir
_ = decode(u_k8mir, 'utf-8', 'surrogateescape'); assert _ == u_k8mir
with raises(UnicodeDecodeError): b_k8mir.decode('utf-8') # decode error on unmatching explicit encoding
with raises(UnicodeDecodeError): u_k8mir.decode('utf-8')
with raises(UnicodeDecodeError): b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError): u_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError): b_k8mir.decode('ascii')
with raises(UnicodeDecodeError): u_k8mir.decode('ascii')
with raises(UnicodeEncodeError): us.encode('ascii') # encode error if target encoding cannot represent string
with raises(UnicodeEncodeError): bs.encode('ascii')
_ = encode(u_k8mir); assert _ == k8mir # no encode error with default parameters
_ = encode(u_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape
assert _ == k8mir
_ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode
assert type(_) is bytes
_ = encode(b_k8mir); assert _ == k8mir # bstr.encode = bstr -> ustr -> encode
_ = encode(b_k8mir, 'utf-8', 'surrogateescape')
assert _ == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain
......@@ -722,19 +725,28 @@ def test_strings_encodedecode():
# verify that this exact semantic is preserved
if six.PY3:
with raises(LookupError): bs.decode('hex')
with raises(LookupError): us.decode('hex')
with raises(LookupError): bs.decode('string-escape')
with raises(LookupError): us.decode('string-escape')
else:
_ = bs.decode('string-escape'); assert type(_) is bstr; assert _ == bs
_ = us.decode('string-escape'); assert type(_) is bstr; assert _ == us
_ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = u(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
_ = u('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
# similarly for bytes.encode
if six.PY3:
with raises(LookupError): bs.encode('hex')
with raises(LookupError): us.encode('hex')
with raises(LookupError): bs.encode('string-escape')
with raises(LookupError): us.encode('string-escape')
else:
_ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9'
_ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
_ = encode(bs, 'hex'); assert _ == b'd0bcd0b0d0b9'
_ = encode(us, 'hex'); assert _ == b'd0bcd0b8d180'
_ = encode(bs, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
_ = encode(us, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb8\xd1\x80'
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment