Commit 2bb971ba authored by Kirill Smelkov

X golang_str: Adjust bstr/ustr .encode() and .__bytes__ to leave string domain into bytes

Initially I implemented things in such a way that both (b|u)str.__bytes__
and ustr.encode() returned bstr. My reasoning was that bstr is based on
bytes, so it is ok to return it.

However this logic did not pass backward compatibility test: for example
when LXML is imported it does

    cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")

and under gpython it breaks with

      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/bin/runwsgi", line 4, in <module>
        from Products.ERP5.bin.zopewsgi import runwsgi; sys.exit(runwsgi())
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5/__init__.py", line 36, in <module>
        from Products.ERP5Type.Utils import initializeProduct, updateGlobals
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/__init__.py", line 42, in <module>
        from .patches import pylint
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/patches/pylint.py", line 524, in <module>
        __import__(module_name, fromlist=[module_name], level=0))
      File "src/lxml/sax.py", line 18, in init lxml.sax
      File "src/lxml/etree.pyx", line 154, in init lxml.etree
    TypeError: Expected bytes, got golang.bstr

The breakage highlights a thinko in my previous reasoning: yes, bstr is based on
bytes, but bstr has different semantics compared to bytes: even though e.g.
__getitem__ works the same way as for bytes on py2, it works differently from
bytes on py3. So if a program on py3 does bytes(x) or x.encode(), it
expects the result to have the bytes semantics of the current python, which is
not the case if the result is bstr.

-> Fix that by adjusting .encode() and .__bytes__() to produce bytes type of
   current python and leave string domain.

For some time I was contemplating introducing a third type, e.g. bvec, also
based on bytes but with bytes semantics, whose bvec.decode would return back
into the pygolang strings domain. But because bytes semantics differ
between py2 and py3, a bvec provided by pygolang would need to behave
differently depending on the current python version, which is undesirable.

In the end, by leaving into native bytes, the "bytes inconsistency" problem
remains with std python, while pygolang targets only fixing the strings
inconsistency between py2 and py3 by providing the same semantics for
bstr and ustr on all python versions.

It also does no harm that bytes.decode() returns std unicode instead of str:
for programs that run under unpatched python we have u() to convert the result
to ustr, while under gpython std unicode is actually ustr, which keeps
bytes.decode() behaviour quite ok.

P.S. we enable bstr.encode for consistency and because under py2, if not
enabled, it will break when running pytest under gpython in

          File ".../_pytest/assertion/rewrite.py", line 352, in <module>
            RN = "\r\n".encode("utf-8")
        AttributeError: unreadable attribute
parent 28c353f8
...@@ -106,6 +106,7 @@ from cython cimport no_gc ...@@ -106,6 +106,7 @@ from cython cimport no_gc
from libc.stdio cimport FILE from libc.stdio cimport FILE
from golang cimport strconv from golang cimport strconv
import codecs as pycodecs
import string as pystring import string as pystring
import types as pytypes import types as pytypes
import functools as pyfunctools import functools as pyfunctools
...@@ -343,9 +344,12 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 ...@@ -343,9 +344,12 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↑ _pybstr__new__() . # _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↑ _pybstr__new__() .
def __bytes__(self): return pyb(self) # see __str__ # __bytes__ converts string to bytes leaving string domain.
def __unicode__(self): return pyu(self) # NOTE __bytes__ and encode are the only operations that leave string domain.
# NOTE __bytes__ is used only by py3 and only for `bytes(obj)` and `b'%s/%b' % obj`.
def __bytes__(self): return _bdata(self) # -> bytes
def __unicode__(self): return pyu(self)
def __str__(self): def __str__(self):
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
return pyu(self) return pyu(self)
...@@ -482,13 +486,32 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 ...@@ -482,13 +486,32 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# encode/decode # encode/decode
def decode(self, encoding=None, errors=None): #
if encoding is None and errors is None: # Encoding strings - both bstr and ustr - convert type to bytes leaving string domain.
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding #
errors = 'surrogateescape' # Encode treats bstr and ustr as string, encoding unicode representation of
else: # the string to bytes. For bstr it means that the string representation is
if encoding is None: encoding = 'utf-8' # first converted to unicode and encoded to bytes from there. For ustr
if errors is None: errors = 'strict' # unicode representation of the string is directly encoded.
#
# Decoding strings is not provided. However for bstr the decode is provided
# treating input data as raw bytes and producing ustr as the result.
#
# NOTE __bytes__ and encode are the only operations that leave string domain.
# encode encodes the string to bytes, leaving the string domain.
#
# Missing encoding/errors are filled in by _encoding_with_defaults.
def encode(self, encoding=None, errors=None): # -> bytes
encoding, errors = _encoding_with_defaults(encoding, errors)
# on py2 e.g. bytes.encode('string-escape') works on bytes directly
if PY_MAJOR_VERSION < 3:
codec = pycodecs.lookup(encoding)
# codecs that are not text encodings are applied to the raw bytes as is,
# without going through the unicode representation first
if not codec._is_text_encoding or \
encoding in ('string-escape',): # string-escape also works on bytes
return codec.encode(self, errors)[0]
# text encoding: convert to unicode representation and encode from there
return pyu(self).encode(encoding, errors)
def decode(self, encoding=None, errors=None): # -> ustr | bstr on py2 for encodings like string-escape
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape': if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_decode_surrogateescape(self) x = _utf8_decode_surrogateescape(self)
...@@ -499,11 +522,6 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 ...@@ -499,11 +522,6 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
return pyb(x) return pyb(x)
return pyu(x) return pyu(x)
if PY_MAJOR_VERSION < 3:
# whiteout encode inherited from bytes
# TODO ideally whiteout it in such a way that bstr.encode also raises AttributeError
encode = property(doc='bstr has no encode')
# all other string methods # all other string methods
...@@ -667,9 +685,11 @@ cdef class _pyustr(unicode): ...@@ -667,9 +685,11 @@ cdef class _pyustr(unicode):
# _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↑ _pyustr__new__() . # _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↑ _pyustr__new__() .
def __bytes__(self): return pyb(self) # __bytes__ converts string to bytes leaving string domain.
def __unicode__(self): return pyu(self) # see __str__ # see bstr.__bytes__ for more details.
def __bytes__(self): return _bdata(pyb(self)) # -> bytes
def __unicode__(self): return pyu(self) # see __str__
def __str__(self): def __str__(self):
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
return pyu(self) # = self or pyustr if it was subclass return pyu(self) # = self or pyustr if it was subclass
...@@ -793,20 +813,15 @@ cdef class _pyustr(unicode): ...@@ -793,20 +813,15 @@ cdef class _pyustr(unicode):
return pyu(zunicode.__format__(self, format_spec)) return pyu(zunicode.__format__(self, format_spec))
# encode/decode # encode/decode (see bstr for details)
def encode(self, encoding=None, errors=None): def encode(self, encoding=None, errors=None): # -> bytes
if encoding is None and errors is None: encoding, errors = _encoding_with_defaults(encoding, errors)
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
if encoding == 'utf-8' and errors == 'surrogateescape': if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_encode_surrogateescape(self) x = _utf8_encode_surrogateescape(self)
else: else:
x = zunicode.encode(self, encoding, errors) x = zunicode.encode(self, encoding, errors)
return pyb(x) return x
if PY_MAJOR_VERSION < 3: if PY_MAJOR_VERSION < 3:
# whiteout decode inherited from unicode # whiteout decode inherited from unicode
...@@ -1987,6 +2002,18 @@ cdef extern from "Python.h": ...@@ -1987,6 +2002,18 @@ cdef extern from "Python.h":
# ---- UTF-8 encode/decode ---- # ---- UTF-8 encode/decode ----
# _encoding_with_defaults returns encoding and errors substituted with defaults
# as needed for functions like ustr.encode and bstr.decode .
#
# The defaults depend on what the caller specified:
#
#   - neither encoding nor errors given -> ('utf-8', 'surrogateescape')
#   - otherwise a missing encoding becomes 'utf-8' and missing errors become 'strict'
cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors)
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
# at least one argument was given explicitly: fill in only the missing one
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
return (encoding, errors)
# TODO(kirr) adjust UTF-8 encode/decode surrogateescape(*) a bit so that not # TODO(kirr) adjust UTF-8 encode/decode surrogateescape(*) a bit so that not
# only bytes -> unicode -> bytes is always identity for any bytes (this is # only bytes -> unicode -> bytes is always identity for any bytes (this is
# already true), but also that unicode -> bytes -> unicode is also always true # already true), but also that unicode -> bytes -> unicode is also always true
...@@ -2238,7 +2265,6 @@ cdef _patch_str(): ...@@ -2238,7 +2265,6 @@ cdef _patch_str():
# XXX explain # XXX explain
bpreserve_slots = upreserve_slots = ("maketrans",) bpreserve_slots = upreserve_slots = ("maketrans",)
if PY_MAJOR_VERSION < 3: if PY_MAJOR_VERSION < 3:
bpreserve_slots += ("encode",) # @property'ies
upreserve_slots += ("decode",) upreserve_slots += ("decode",)
# patch unicode to be pyustr. This patches # patch unicode to be pyustr. This patches
......
...@@ -231,13 +231,15 @@ def test_strings_basic(): ...@@ -231,13 +231,15 @@ def test_strings_basic():
assert b(bs) is bs; assert bstr(bs) is bs assert b(bs) is bs; assert bstr(bs) is bs
assert u(us) is us; assert ustr(us) is us assert u(us) is us; assert ustr(us) is us
# bytes(b(·)) = identity, unicode(u(·)) = identity # unicode(u(·)) = identity
assert bytes (bs) is bs
assert unicode(us) is us assert unicode(us) is us
# unicode(b) -> u, bytes(u) -> b # unicode(b) -> u
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир" _ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# bytes(b|u) -> bytes
_ = bytes(bs); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
_ = bytes(us); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
# bytearray(b|u) -> bytearray # bytearray(b|u) -> bytearray
_ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80' _ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
...@@ -651,14 +653,13 @@ def test_strings_encodedecode(): ...@@ -651,14 +653,13 @@ def test_strings_encodedecode():
us = u('мир') us = u('мир')
bs = b('май') bs = b('май')
_ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир')
_ = bs.encode(); assert type(_) is bytes; assert _ == xbytes('май')
_ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май')
# TODO also raise AttributeError on .encode/.decode lookup on classes # TODO also raise AttributeError on .encode/.decode lookup on classes
assert hasattr(us, 'encode') ; assert hasattr(ustr, 'encode')
assert not hasattr(bs, 'encode') #; assert not hasattr(bstr, 'encode')
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode') assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode')
assert hasattr(bs, 'decode') ; assert hasattr(bstr, 'decode')
_ = us.encode(); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май' _ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май'
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май' _ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май'
...@@ -673,10 +674,10 @@ def test_strings_encodedecode(): ...@@ -673,10 +674,10 @@ def test_strings_encodedecode():
assert type(_) is ustr assert type(_) is ustr
assert _udata(_) == u'мир' assert _udata(_) == u'мир'
b_cpmir = us.encode('cp1251') cpmir = us.encode('cp1251')
assert type(b_cpmir) is bstr assert type(cpmir) is bytes
assert _bdata(b_cpmir) == u'мир'.encode('cp1251') assert cpmir == u'мир'.encode('cp1251')
assert _bdata(b_cpmir) == b'\xec\xe8\xf0' assert cpmir == b'\xec\xe8\xf0'
# decode/encode errors # decode/encode errors
u_k8mir = b_k8mir.decode() # no decode error with u_k8mir = b_k8mir.decode() # no decode error with
...@@ -697,11 +698,14 @@ def test_strings_encodedecode(): ...@@ -697,11 +698,14 @@ def test_strings_encodedecode():
us.encode('ascii') us.encode('ascii')
_ = u_k8mir.encode() # no encode error with _ = u_k8mir.encode() # no encode error with
assert type(_) is bstr # default parameters assert type(_) is bytes # default parameters
assert _bdata(_) == k8mir assert _ == k8mir
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with _ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with
assert type(_) is bstr # explicit utf-8/surrogateescape assert type(_) is bytes # explicit utf-8/surrogateescape
assert _bdata(_) == k8mir assert _ == k8mir
_ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode
assert type(_) is bytes
assert _ == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain # on py2 unicode.encode accepts surrogate pairs and does not complain
# TODO(?) manually implement encode/py2 and reject surrogate pairs by default # TODO(?) manually implement encode/py2 and reject surrogate pairs by default
...@@ -724,6 +728,14 @@ def test_strings_encodedecode(): ...@@ -724,6 +728,14 @@ def test_strings_encodedecode():
_ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" _ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" _ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
# similarly for bytes.encode
if six.PY3:
with raises(LookupError): bs.encode('hex')
with raises(LookupError): bs.encode('string-escape')
else:
_ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9'
_ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr. # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
@mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr)) @mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr))
...@@ -1418,6 +1430,8 @@ def test_strings_mod_and_format(): ...@@ -1418,6 +1430,8 @@ def test_strings_mod_and_format():
M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2 M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2
# vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument # vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument
# even though it is not 100% we are ok here, because customizing bytes or unicode is very exotic # even though it is not 100% we are ok here, because customizing bytes or unicode is very exotic
#
# XXX the code in bytesobject.c::format_obj tells different -> recheck.
if six.PY2: if six.PY2:
M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2 M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2
M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2] M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2]
...@@ -1884,8 +1898,8 @@ def test_strings_subclasses(tx): ...@@ -1884,8 +1898,8 @@ def test_strings_subclasses(tx):
# for bstr/ustr __bytes__/__unicode__ return *str, never MyStr # for bstr/ustr __bytes__/__unicode__ return *str, never MyStr
# (builtin unicode has no __bytes__/__unicode__) # (builtin unicode has no __bytes__/__unicode__)
if tx is not unicode: if tx is not unicode:
_ = xx.__bytes__(); assert type(_) is bstr; assert _ == 'мир' _ = xx.__bytes__(); assert type(_) is bytes; assert _ == xbytes('мир')
_ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир' _ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир'
# subclass with __str__ # subclass with __str__
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment