Commit 781802d4 authored by Kirill Smelkov

golang_str: Implement bstr/ustr constructors

Both bstr and ustr constructors mimic the constructor of unicode (= str on
py3): an object is either stringified, or - when the constructor is invoked
with the optional encoding and/or errors arguments - decoded, in which case
the object must provide the buffer interface:

    # py2
    class unicode(basestring)
     |  unicode(object='') -> unicode object
     |  unicode(string[, encoding[, errors]]) -> unicode object

    # py3
    class str(object)
     |  str(object='') -> str
     |  str(bytes_or_buffer[, encoding[, errors]]) -> str

Stringification of bstr/ustr / unicode/bytes objects is handled
automatically: such objects are converted to the created type via b or u.

We follow unicode semantics for both ustr _and_ bstr, because bstr/ustr
are intended to be used as strings.
parent 54c2a3cf
...@@ -233,8 +233,8 @@ Pygolang, similarly to Go, provides uniform UTF8-based approach to strings with ...@@ -233,8 +233,8 @@ Pygolang, similarly to Go, provides uniform UTF8-based approach to strings with
the idea to make working with byte- and unicode- strings easy and transparently the idea to make working with byte- and unicode- strings easy and transparently
interoperable: interoperable:
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to `unicode` [*]_. - `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to `bytes`. - `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.
The conversion, in both encoding and decoding, never fails and never looses The conversion, in both encoding and decoding, never fails and never looses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
...@@ -245,7 +245,9 @@ operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce ...@@ -245,7 +245,9 @@ operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to to `ustr`. When the coercion happens, `bytes`, similarly to
`bstr`, are also treated as UTF8-encoded strings. `bstr`, are also treated as UTF8-encoded strings.
`bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For `bstr` and `ustr` are meant to be drop-in replacements for standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes` `bstr`/`ustr` / `unicode`/`bytes`
to Pygolang string, `b` and `u` provide way to make sure an to Pygolang string, `b` and `u` provide way to make sure an
......
...@@ -51,17 +51,10 @@ def pyb(s): # -> bstr ...@@ -51,17 +51,10 @@ def pyb(s): # -> bstr
See also: u, bstr/ustr. See also: u, bstr/ustr.
""" """
if type(s) is pybstr: bs = _pyb(pybstr, s)
return s if bs is None:
if isinstance(s, bytes): # py2: str py3: bytes
pass
elif isinstance(s, unicode): # py2: unicode py3: str
s = _utf8_encode_surrogateescape(s)
else:
raise TypeError("b: invalid type %s" % type(s)) raise TypeError("b: invalid type %s" % type(s))
return bs
return pybstr(s)
def pyu(s): # -> ustr def pyu(s): # -> ustr
"""u converts object to ustr. """u converts object to ustr.
...@@ -81,17 +74,41 @@ def pyu(s): # -> ustr ...@@ -81,17 +74,41 @@ def pyu(s): # -> ustr
See also: b, bstr/ustr. See also: b, bstr/ustr.
""" """
if type(s) is pyustr: us = _pyu(pyustr, s)
if us is None:
raise TypeError("u: invalid type %s" % type(s))
return us
cdef _pyb(bcls, s): # -> ~bstr | None
if type(s) is bcls:
return s return s
if isinstance(s, unicode): # py2: unicode py3: str if isinstance(s, bytes):
pass if type(s) is not bytes:
elif isinstance(s, bytes): # py2: str py3: bytes s = _bdata(s)
elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s)
else:
return None
assert type(s) is bytes
return bytes.__new__(bcls, s)
cdef _pyu(ucls, s): # -> ~ustr | None
if type(s) is ucls:
return s
if isinstance(s, unicode):
if type(s) is not unicode:
s = _udata(s)
elif isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s) s = _utf8_decode_surrogateescape(s)
else: else:
raise TypeError("u: invalid type %s" % type(s)) return None
return pyustr(s) assert type(s) is unicode
return unicode.__new__(ucls, s)
# _pyb_coerce coerces x from `b op x` to be used in operation with pyb. # _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
...@@ -136,7 +153,7 @@ cdef __pystr(object obj): # -> ~str ...@@ -136,7 +153,7 @@ cdef __pystr(object obj): # -> ~str
class pybstr(bytes): class pybstr(bytes):
"""bstr is byte-string. """bstr is byte-string.
It is based on bytes and can automatically convert to unicode. It is based on bytes and can automatically convert to/from unicode.
The conversion never fails and never looses information: The conversion never fails and never looses information:
bstr → ustr → bstr bstr → ustr → bstr
...@@ -147,6 +164,15 @@ class pybstr(bytes): ...@@ -147,6 +164,15 @@ class pybstr(bytes):
When the coercion happens, bytes, similarly to bstr, are also When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
bstr constructor accepts arbitrary objects and stringify them:
- if encoding and/or errors is specified, the object must provide buffer
interface. The data in the buffer is decoded according to provided
encoding/errors and further encoded via UTF-8 into bstr.
- if the object is bstr/ustr / unicode/bytes - it is converted
to bstr. See b for details.
- otherwise bstr will have string representation of the object.
See also: b, ustr/u. See also: b, ustr/u.
""" """
...@@ -154,6 +180,18 @@ class pybstr(bytes): ...@@ -154,6 +180,18 @@ class pybstr(bytes):
# won't be needed after switch to -> `cdef class` # won't be needed after switch to -> `cdef class`
__slots__ = () __slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
    # explicit encoding and/or errors means "decode": object must then
    # expose buffer interface and is decoded to unicode first.
    if encoding is not None or errors is not None:
        object = _buffer_decode(object, encoding, errors)

    # stringify what we have now. Exact bstr/ustr / unicode/bytes objects
    # are passed through by _bstringify unchanged.
    s = _bstringify(object)
    assert isinstance(s, (unicode, bytes)), s

    # convert the string to cls. _pyb gives None only for non-string input,
    # which the assert above already excluded.
    bobj = _pyb(cls, s)
    assert bobj is not None
    return bobj
def __bytes__(self): return self def __bytes__(self): return self
def __unicode__(self): return pyu(self) def __unicode__(self): return pyu(self)
...@@ -191,10 +229,11 @@ class pybstr(bytes): ...@@ -191,10 +229,11 @@ class pybstr(bytes):
def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b)) def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b))
cdef class pyustr(unicode): # XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
class pyustr(unicode):
"""ustr is unicode-string. """ustr is unicode-string.
It is based on unicode and can automatically convert to bytes. It is based on unicode and can automatically convert to/from bytes.
The conversion never fails and never looses information: The conversion never fails and never looses information:
ustr → bstr → ustr ustr → bstr → ustr
...@@ -205,9 +244,29 @@ cdef class pyustr(unicode): ...@@ -205,9 +244,29 @@ cdef class pyustr(unicode):
When the coercion happens, bytes, similarly to bstr, are also When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
ustr constructor, similarly to the one in bstr, accepts arbitrary objects
and stringify them. Please refer to bstr and u documentation for details.
See also: u, bstr/b. See also: u, bstr/b.
""" """
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
    # explicit encoding and/or errors means "decode": object must then
    # expose buffer interface and is decoded to unicode first.
    if encoding is not None or errors is not None:
        object = _buffer_decode(object, encoding, errors)

    # stringify what we have now. Exact bstr/ustr / unicode/bytes objects
    # are passed through by _bstringify unchanged.
    s = _bstringify(object)
    assert isinstance(s, (unicode, bytes)), s

    # convert the string to cls. _pyu gives None only for non-string input,
    # which the assert above already excluded.
    uobj = _pyu(cls, s)
    assert uobj is not None
    return uobj
def __bytes__(self): return pyb(self) def __bytes__(self): return pyb(self)
def __unicode__(self): return self def __unicode__(self): return self
...@@ -312,6 +371,37 @@ def pyqq(obj): ...@@ -312,6 +371,37 @@ def pyqq(obj):
return qobj return qobj
# ---- _bstringify ----
# _bstringify returns string representation of obj.
#
# It behaves like unicode(obj), except that objects whose type is exactly
# bstr/ustr / bytes/unicode are returned as-is, without any conversion.
cdef _bstringify(object obj): # -> unicode|bytes
    if type(obj) in (pybstr, pyustr, bytes, unicode):
        return obj

    if PY_MAJOR_VERSION >= 3:
        return unicode(obj)

    # py2: emulate by hand what unicode(·) does on py3.
    # Plain unicode(obj) raises UnicodeDecodeError when obj.__str__ returns
    # UTF-8 bytestring, and unconditional str(obj) fails when obj.__str__
    # returns unicode - so dispatch on the methods obj actually provides.
    if hasattr(obj, '__unicode__'):
        return obj.__unicode__()

    if not hasattr(obj, '__str__'):
        return repr(obj)

    # (u'β').__str__() raises UnicodeEncodeError, yet unicode has no
    # .__unicode__ method. Handle unicode subclasses that keep the
    # inherited __str__ by converting them via unicode(·) instead.
    if type(obj).__str__ is unicode.__str__:
        return unicode(obj)
    return obj.__str__()
# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr. # py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert # This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
# both arguments to Unicode - interpreting them as being unequal`, and that # both arguments to Unicode - interpreting them as being unequal`, and that
...@@ -381,6 +471,42 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3 ...@@ -381,6 +471,42 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3
return pyfunctools.partial(self.func, obj) return pyfunctools.partial(self.func, obj)
# ---- misc ----
# _buffer_py2 returns buffer(obj) on py2 / fails on py3
#
# The py2/py3 selection uses Cython compile-time IF/ELSE (on PY2, which is
# defined outside of this block), not a runtime check: the py3 build must
# never see the name `buffer`.
cdef object _buffer_py2(object obj):
    IF PY2: # cannot `if PY_MAJOR_VERSION < 3` because then cython errors
        return buffer(obj) # "undeclared name not builtin: buffer"
    ELSE:
        # reaching here means a caller did not check for py2 beforehand
        raise AssertionError("must be called only on py2")
# _buffer_decode returns unicode decoded from the data exposed by buf.
#
# buf must provide buffer interface.
# encoding=None means utf-8; errors=None means strict.
cdef unicode _buffer_decode(buf, encoding, errors):
    if encoding is None:
        encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
    if errors is None:
        errors = 'strict'

    # old-style (py2) buffers are wrapped via buffer(); everything else goes
    # through memoryview. Either way the data is copied into a bytearray,
    # whose .decode does the actual work.
    view = _buffer_py2(buf) if _XPyObject_CheckOldBuffer(buf) else memoryview(buf)
    return bytearray(view).decode(encoding, errors)
# _XPyObject_CheckOldBuffer reports whether o passes PyObject_CheckReadBuffer
# on py2. On py3 it always returns 0, because there are no old-style buffers.
# The helper is defined in the verbatim C code below, so that the py2-only
# C-API call sits behind a C preprocessor check.
cdef extern from "Python.h":
    """
static int _XPyObject_CheckOldBuffer(PyObject *o) {
#if PY_MAJOR_VERSION >= 3
// no old-style buffers on py3
return 0;
#else
return PyObject_CheckReadBuffer(o);
#endif
}
    """
    bint _XPyObject_CheckOldBuffer(object o)
# ---- UTF-8 encode/decode ---- # ---- UTF-8 encode/decode ----
from six import unichr # py2: unichr py3: chr from six import unichr # py2: unichr py3: chr
...@@ -472,7 +598,7 @@ def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode ...@@ -472,7 +598,7 @@ def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
def _utf8_encode_surrogateescape(s): # -> bytes def _utf8_encode_surrogateescape(s): # -> bytes
assert isinstance(s, unicode) assert isinstance(s, unicode)
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
return s.encode('UTF-8', 'surrogateescape') return unicode.encode(s, 'UTF-8', 'surrogateescape')
# py2 does not have surrogateescape error handler, and even if we # py2 does not have surrogateescape error handler, and even if we
# provide one, builtin unicode.encode() does not treat # provide one, builtin unicode.encode() does not treat
......
...@@ -28,8 +28,20 @@ from golang.strconv_test import byterange ...@@ -28,8 +28,20 @@ from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises, mark, skip from pytest import raises, mark, skip
import sys import sys
import six
from six import text_type as unicode from six import text_type as unicode
from six.moves import range as xrange from six.moves import range as xrange
import array
# buftypes lists factories of objects with buffer interface that we will test
# against. Each entry is called with bytes data and returns a buffer-like
# object exposing that data.
buftypes = [bytearray, memoryview]
buftypes.append(lambda x: array.array('B', x))
if six.PY2:
    buftypes.append(buffer) # no buffer on py3
# verify b/u and bstr/ustr basics. # verify b/u and bstr/ustr basics.
...@@ -123,6 +135,17 @@ def test_strings_basic(): ...@@ -123,6 +135,17 @@ def test_strings_basic():
with raises(TypeError): b(object()) with raises(TypeError): b(object())
with raises(TypeError): u(object()) with raises(TypeError): u(object())
# bstr/ustr - similarly to str - accept arbitrary objects
_ = bstr(); assert type(_) is bstr; assert _ == ''
_ = ustr(); assert type(_) is ustr; assert _ == ''
_ = bstr(123); assert type(_) is bstr; assert _ == '123'
_ = ustr(123); assert type(_) is ustr; assert _ == '123'
_ = bstr([1,'b']); assert type(_) is bstr; assert _ == "[1, 'b']"
_ = ustr([1,'b']); assert type(_) is ustr; assert _ == "[1, 'b']"
obj = object()
_ = bstr(obj); assert type(_) is bstr; assert _ == str(obj) # <object ...>
_ = ustr(obj); assert type(_) is ustr; assert _ == str(obj) # <object ...>
b_ = xbytes ("мир"); assert type(b_) is bytes b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode u_ = xunicode ("мир"); assert type(u_) is unicode
...@@ -130,17 +153,46 @@ def test_strings_basic(): ...@@ -130,17 +153,46 @@ def test_strings_basic():
# b/u from unicode # b/u from unicode
bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr
us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr
_ = bstr(u_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(u_); assert type(_) is ustr; assert _ == "мир"
# b/u from bytes # b/u from bytes
_ = b(b_); assert type(_) is bstr; assert _ == "мир" _ = b(b_); assert type(_) is bstr; assert _ == "мир"
_ = u(b_); assert type(_) is ustr; assert _ == "мир" _ = u(b_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(b_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray? # TODO also handle bytearray?
# bstr/ustr from bytes/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes] + buftypes:
k8mir = tbuf(k8mir_bytes)
_ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир"
_ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир"
with raises(UnicodeDecodeError): bstr(k8mir, 'ascii')
with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
_ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd'
_ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd'
# no encoding -> utf8 with surrogateescape for bytes, stringify for the rest
k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
k8mir_strok = k8mir_usurrogateescape
if not tbuf in (bytes,):
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
# encoding specified -> treat it precisely
with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, errors='strict')
with raises(UnicodeDecodeError): ustr(k8mir, errors='strict')
# b(b(·)) = identity, u(u(·)) = identity # b(b(·)) = identity, u(u(·)) = identity
assert b(bs) is bs assert b(bs) is bs; assert bstr(bs) is bs
assert u(us) is us assert u(us) is us; assert ustr(us) is us
# bytes(b(·)) = identity, unicode(u(·)) = identity # bytes(b(·)) = identity, unicode(u(·)) = identity
assert bytes (bs) is bs assert bytes (bs) is bs
...@@ -274,6 +326,44 @@ def test_strings_print(): ...@@ -274,6 +326,44 @@ def test_strings_print():
assertDoc(outok, stdout) assertDoc(outok, stdout)
# verify behaviour of bstr|ustr subclasses.
#
# tx is the base string type being subclassed. Three kinds of objects are
# checked: a subclass that keeps the inherited __str__, a subclass that
# overrides __str__/__unicode__, and a plain object with its own __str__.
# For each of them the constructors tx/bstr/ustr must stringify the object,
# while b/u must convert the underlying string data or fail with TypeError.
@mark.parametrize('tx', (unicode, bstr, ustr))
def test_strings_subclasses(tx):
    # NOTE(review): xstr is defined outside of this block; presumably it
    # creates 'мир' as an object of type tx - the assert right after it checks the type.
    x = xstr(u'мир', tx); assert type(x) is tx

    # subclass without __str__
    class MyStr(tx):
        pass
    xx = MyStr(x); assert type(xx) is MyStr
    _ = tx(xx); assert type(_) is tx ; assert _ == x # e.g. unicode(MyStr) -> unicode, not MyStr
    _ = bstr(xx); assert type(_) is bstr ; assert _ == 'мир'
    _ = ustr(xx); assert type(_) is ustr ; assert _ == 'мир'
    _ = b(xx); assert type(_) is bstr ; assert _ == 'мир'
    _ = u(xx); assert type(_) is ustr ; assert _ == 'мир'

    # subclass with __str__
    # (MyStr is deliberately redefined; constructors must now honour __str__,
    # while b/u must still see the original string data)
    class MyStr(tx):
        def __str__(self): return u'αβγ'
        __unicode__ = __str__
    xx = MyStr(x); assert type(xx) is MyStr
    _ = tx(xx); assert type(_) is tx ; assert _ == u'αβγ' # unicode(MyStr) -> u'αβγ', not 'мир'
    _ = bstr(xx); assert type(_) is bstr ; assert _ == u'αβγ'
    _ = ustr(xx); assert type(_) is ustr ; assert _ == u'αβγ'
    _ = b(xx); assert type(_) is bstr ; assert _ == u'мир' # b(MyStr) -> 'мир', not 'αβγ'
    _ = u(xx); assert type(_) is ustr ; assert _ == u'мир'

    # non-subclass with __str__ (for completeness)
    class MyObj(object):
        def __str__(self):
            return 'myobj'
    xx = MyObj(); assert type(xx) is MyObj
    _ = tx(xx); assert type(_) is tx ; assert _ == 'myobj'
    _ = bstr(xx); assert type(_) is bstr ; assert _ == 'myobj'
    _ = ustr(xx); assert type(_) is ustr ; assert _ == 'myobj'
    with raises(TypeError): b(xx) # NOTE b/u reports "conversion failure"
    with raises(TypeError): u(xx)
def test_qq(): def test_qq():
# NOTE qq is also tested as part of strconv.quote # NOTE qq is also tested as part of strconv.quote
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment