Commit 93e9c25a authored by Kirill Smelkov

X golang_str: Add ustr.decode for symmetry with bstr.decode and because gpy2 breaks without it

Without working unicode.decode gpy2 fails when running ERP5 as follows:

    $ /srv/slapgrid/slappart49/t/ekg/i/5/bin/runTestSuite --help
    No handlers could be found for logger "SecurityInfo"
    Traceback (most recent call last):
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/.runTestSuite.pyexe", line 296, in <module>
        main()
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 484, in main
        pymain(argv, init)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 292, in pymain
        run(mmain)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 192, in run
        _execfile(filepath, mmain.__dict__)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 339, in _execfile
        six.exec_(code, globals, locals)
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/six-1.16.0-py2.7.egg/six.py", line 735, in exec_
        exec("""exec _code_ in _globs_, _locs_""")
      File "<string>", line 1, in <module>
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/runTestSuite", line 10, in <module>
        from Products.ERP5Type.tests.runTestSuite import main; sys.exit(main())
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/__init__.py", line 96, in <module>
        from . import ZopePatch
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/ZopePatch.py", line 75, in <module>
        from Products.ERP5Type.patches import ZopePageTemplateUtils
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/patches/ZopePageTemplateUtils.py", line 58, in <module>
        convertToUnicode(u'', 'text/xml', ())
      File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/Zope-4.8.9+slapospatched002-py2.7.egg/Products/PageTemplates/utils.py", line 73, in convertToUnicode
        return source.decode(encoding), encoding
    AttributeError: unreadable attribute

and in general, if we treat both bstr and ustr as two different
representations of the same entity, then, since we have bstr.decode, having
ustr.decode is also needed for symmetry, with both operations converting
the bytes representation of the string into unicode.

Now there is full symmetry between bstr/ustr and encode/decode. Quoting the updated encode/decode text:

    Encode encodes unicode representation of the string into bytes, leaving string domain.
    Decode decodes bytes   representation of the string into ustr, staying inside string domain.

    Both bstr and ustr are accepted by encode and decode treating them as two
    different representations of the same entity.

    On encoding, for bstr, the string representation is first converted to
    unicode and encoded to bytes from there. For ustr unicode representation
    of the string is directly encoded.

    On decoding, for ustr, the string representation is first converted to
    bytes and decoded to unicode from there. For bstr bytes representation of
    the string is directly decoded.
parent abf3dcec
Pipeline #34475 failed with stage
in 0 seconds
...@@ -528,25 +528,31 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 ...@@ -528,25 +528,31 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# encode/decode # encode/decode
# #
# Encoding strings - both bstr and ustr - convert type to bytes leaving string domain. # Encode encodes unicode representation of the string into bytes, leaving string domain.
# Decode decodes bytes representation of the string into ustr, staying inside string domain.
# #
# Encode treats bstr and ustr as string, encoding unicode representation of # Both bstr and ustr are accepted by encode and decode treating them as two
# the string to bytes. For bstr it means that the string representation is # different representations of the same entity.
# first converted to unicode and encoded to bytes from there. For ustr
# unicode representation of the string is directly encoded.
# #
# Decoding strings is not provided. However for bstr the decode is provided # On encoding, for bstr, the string representation is first converted to
# treating input data as raw bytes and producing ustr as the result. # unicode and encoded to bytes from there. For ustr unicode representation
# of the string is directly encoded.
#
# On decoding, for ustr, the string representation is first converted to
# bytes and decoded to unicode from there. For bstr bytes representation of
# the string is directly decoded.
# #
# NOTE __bytes__ and encode are the only operations that leave string domain. # NOTE __bytes__ and encode are the only operations that leave string domain.
def encode(self, encoding=None, errors=None): # -> bytes def encode(self, encoding=None, errors=None): # -> bytes
encoding, errors = _encoding_with_defaults(encoding, errors) encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
return _bdata(self)
# on py2 e.g. bytes.encode('string-escape') works on bytes directly # on py2 e.g. bytes.encode('string-escape') works on bytes directly
if PY_MAJOR_VERSION < 3: if PY_MAJOR_VERSION < 3:
codec = pycodecs.lookup(encoding) codec = _pycodecs_lookup_binary(encoding)
if not codec._is_text_encoding or \ if codec is not None:
encoding in ('string-escape',): # string-escape also works on bytes
return codec.encode(self, errors)[0] return codec.encode(self, errors)[0]
return pyu(self).encode(encoding, errors) return pyu(self).encode(encoding, errors)
...@@ -894,15 +900,23 @@ cdef class _pyustr(unicode): ...@@ -894,15 +900,23 @@ cdef class _pyustr(unicode):
encoding, errors = _encoding_with_defaults(encoding, errors) encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape': if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_encode_surrogateescape(self) return _utf8_encode_surrogateescape(self)
else:
x = zunicode.encode(self, encoding, errors)
return x
# on py2 e.g. 'string-escape' works on bytes
if PY_MAJOR_VERSION < 3: if PY_MAJOR_VERSION < 3:
# whiteout decode inherited from unicode codec = _pycodecs_lookup_binary(encoding)
# TODO ideally whiteout it in such a way that ustr.decode also raises AttributeError if codec is not None:
decode = property(doc='ustr has no decode') return codec.encode(pyb(self), errors)[0]
return zunicode.encode(self, encoding, errors)
def decode(self, encoding=None, errors=None): # -> ustr | bstr for encodings like string-escape
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
return pyu(self)
return pyb(self).decode(encoding, errors)
# all other string methods # all other string methods
...@@ -2161,6 +2175,15 @@ cdef extern from "Python.h": ...@@ -2161,6 +2175,15 @@ cdef extern from "Python.h":
""" """
bint _XPyMapping_Check(object o) bint _XPyMapping_Check(object o)
# _pycodecs_lookup_binary returns codec corresponding to encoding if the codec works on binary input.
# example of such codecs are string-escape and hex encodings.
cdef _pycodecs_lookup_binary(encoding): # -> codec | None (text) | LookupError (no such encoding)
codec = pycodecs.lookup(encoding)
if not codec._is_text_encoding or \
encoding in ('string-escape',): # string-escape also works on bytes
return codec
return None
# ---- UTF-8 encode/decode ---- # ---- UTF-8 encode/decode ----
...@@ -2426,8 +2449,6 @@ cdef _patch_str(): ...@@ -2426,8 +2449,6 @@ cdef _patch_str():
# XXX explain # XXX explain
bpreserve_slots = upreserve_slots = ("maketrans",) bpreserve_slots = upreserve_slots = ("maketrans",)
if PY_MAJOR_VERSION < 3:
upreserve_slots += ("decode",)
# patch unicode to be pyustr. This patches # patch unicode to be pyustr. This patches
# - unicode (py2) # - unicode (py2)
......
...@@ -657,58 +657,61 @@ def test_strings_encodedecode(): ...@@ -657,58 +657,61 @@ def test_strings_encodedecode():
us = u('мир') us = u('мир')
bs = b('май') bs = b('май')
_ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир') # encode does obj.encode and makes sure result type is bytes
_ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир') def encode(obj, *argv):
_ = bs.encode(); assert type(_) is bytes; assert _ == xbytes('май') _ = obj.encode(*argv)
_ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май') assert type(_) is bytes
return _
# TODO also raise AttributeError on .encode/.decode lookup on classes # decode does obj.decode and makes sure result type is ustr
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode') def decode(obj, *argv):
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май' _ = obj.decode(*argv)
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май' assert type(_) is ustr
return _
# !utf-8 _ = encode(us); assert _ == xbytes('мир')
k8mir = u'мир'.encode('koi8-r') _ = encode(us, 'utf-8'); assert _ == xbytes('мир')
b_k8mir = b(k8mir) _ = encode(bs); assert _ == xbytes('май')
assert type(b_k8mir) is bstr _ = encode(bs, 'utf-8'); assert _ == xbytes('май')
assert _bdata(b_k8mir) == k8mir
assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
_ = b_k8mir.decode('koi8-r') _ = decode(us); assert _udata(_) == u'мир'
assert type(_) is ustr _ = decode(us, 'utf-8'); assert _udata(_) == u'мир'
assert _udata(_) == u'мир' _ = decode(bs); assert _udata(_) == u'май'
_ = decode(bs, 'utf-8'); assert _udata(_) == u'май'
cpmir = us.encode('cp1251') # !utf-8
assert type(cpmir) is bytes k8mir = u'мир'.encode('koi8-r'); assert k8mir == b'\xcd\xc9\xd2'
assert cpmir == u'мир'.encode('cp1251') b_k8mir = b(k8mir); assert type(b_k8mir) is bstr; assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
assert cpmir == b'\xec\xe8\xf0' u_k8mir = u(k8mir); assert type(u_k8mir) is ustr; assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
# decode/encode errors _ = decode(b_k8mir, 'koi8-r'); assert _udata(_) == u'мир'
u_k8mir = b_k8mir.decode() # no decode error with _ = decode(u_k8mir, 'koi8-r'); assert _udata(_) == u'мир'
assert type(u_k8mir) is ustr # default parameters
assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
_ = b_k8mir.decode('utf-8', 'surrogateescape') # no decode error with
assert type(_) is ustr # explicit utf-8/surrogateescape
assert _udata(_) == _udata(u_k8mir)
with raises(UnicodeDecodeError): # decode error if encoding is explicitly specified
b_k8mir.decode('utf-8')
with raises(UnicodeDecodeError):
b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError):
b_k8mir.decode('ascii')
with raises(UnicodeEncodeError): _ = encode(us, 'cp1251'); assert _ == u'мир'.encode('cp1251'); assert _ == b'\xec\xe8\xf0'
us.encode('ascii') _ = encode(bs, 'cp1251'); assert _ == u'май'.encode('cp1251'); assert _ == b'\xec\xe0\xe9'
_ = u_k8mir.encode() # no encode error with # decode/encode errors
assert type(_) is bytes # default parameters _ = decode(b_k8mir); assert _ == u_k8mir # no decode error with default parameters
assert _ == k8mir _ = decode(b_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with assert _ == u_k8mir
assert type(_) is bytes # explicit utf-8/surrogateescape _ = decode(u_k8mir); assert _ == u_k8mir
_ = decode(u_k8mir, 'utf-8', 'surrogateescape'); assert _ == u_k8mir
with raises(UnicodeDecodeError): b_k8mir.decode('utf-8') # decode error on unmatching explicit encoding
with raises(UnicodeDecodeError): u_k8mir.decode('utf-8')
with raises(UnicodeDecodeError): b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError): u_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError): b_k8mir.decode('ascii')
with raises(UnicodeDecodeError): u_k8mir.decode('ascii')
with raises(UnicodeEncodeError): us.encode('ascii') # encode error if target encoding cannot represent string
with raises(UnicodeEncodeError): bs.encode('ascii')
_ = encode(u_k8mir); assert _ == k8mir # no encode error with default parameters
_ = encode(u_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape
assert _ == k8mir assert _ == k8mir
_ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode _ = encode(b_k8mir); assert _ == k8mir # bstr.encode = bstr -> ustr -> encode
assert type(_) is bytes _ = encode(b_k8mir, 'utf-8', 'surrogateescape')
assert _ == k8mir assert _ == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain # on py2 unicode.encode accepts surrogate pairs and does not complain
...@@ -726,19 +729,28 @@ def test_strings_encodedecode(): ...@@ -726,19 +729,28 @@ def test_strings_encodedecode():
# verify that this exact semantic is preserved # verify that this exact semantic is preserved
if six.PY3: if six.PY3:
with raises(LookupError): bs.decode('hex') with raises(LookupError): bs.decode('hex')
with raises(LookupError): us.decode('hex')
with raises(LookupError): bs.decode('string-escape') with raises(LookupError): bs.decode('string-escape')
with raises(LookupError): us.decode('string-escape')
else: else:
_ = bs.decode('string-escape'); assert type(_) is bstr; assert _ == bs _ = bs.decode('string-escape'); assert type(_) is bstr; assert _ == bs
_ = us.decode('string-escape'); assert type(_) is bstr; assert _ == us
_ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" _ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = u(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" _ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
_ = u('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
# similarly for bytes.encode # similarly for bytes.encode
if six.PY3: if six.PY3:
with raises(LookupError): bs.encode('hex') with raises(LookupError): bs.encode('hex')
with raises(LookupError): us.encode('hex')
with raises(LookupError): bs.encode('string-escape') with raises(LookupError): bs.encode('string-escape')
with raises(LookupError): us.encode('string-escape')
else: else:
_ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9' _ = encode(bs, 'hex'); assert _ == b'd0bcd0b0d0b9'
_ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9' _ = encode(us, 'hex'); assert _ == b'd0bcd0b8d180'
_ = encode(bs, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
_ = encode(us, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb8\xd1\x80'
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr. # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment