Commit d7e55bb0 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Teach b/u to accept objects with buffer interface

And to convert them to bstr/ustr, decoding buffer data as if it were
bytes. This is needed if e.g. we have data in mmap or numpy.ndarray, and
want to convert the data to string. The conversion is always explicit, via
a call to b/u. For bstr/ustr constructors, we preserve their behaviour
to match the unicode constructor: they do not convert automatically, but
instead stringify the object, e.g. as shown below:

    In [1]: bdata = b'hello 123'

    In [2]: mview = memoryview(bdata)

    In [3]: str(mview)
    Out[3]: '<memory at 0x7fb226b26700>'	# NOTE _not_ b'hello 123'
parent e4d5cb21
...@@ -249,8 +249,8 @@ to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to ...@@ -249,8 +249,8 @@ to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in `str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`/`bytearray` `bstr`/`ustr` / `unicode`/`bytes`/`bytearray`, or an object with `buffer`
to Pygolang string, `b` and `u` provide way to make sure an interface [*]_, to Pygolang string, `b` and `u` provide way to make sure an
object is either `bstr` or `ustr` correspondingly. object is either `bstr` or `ustr` correspondingly.
Usage example:: Usage example::
...@@ -258,10 +258,12 @@ Usage example:: ...@@ -258,10 +258,12 @@ Usage example::
s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'. s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'.
def f(s): def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes or bytearray. s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
... # (*) the decoding never fails nor looses information. ... # (*) the decoding never fails nor looses information.
.. [*] `unicode` on Python2, `str` on Python3. .. [*] `unicode` on Python2, `str` on Python3.
.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as a UTF-8 encoded string.
| Notice that only explicit conversion through `b` and `u` accepts objects with buffer interface. Automatic coercion does not.
Import Import
......
...@@ -26,6 +26,7 @@ from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnico ...@@ -26,6 +26,7 @@ from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnico
from cpython cimport PyUnicode_DecodeUTF8 from cpython cimport PyUnicode_DecodeUTF8
from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
from cpython cimport PyObject_CheckBuffer
cdef extern from "Python.h": cdef extern from "Python.h":
void PyType_Modified(PyTypeObject *) void PyType_Modified(PyTypeObject *)
...@@ -44,7 +45,7 @@ def pyb(s): # -> bstr ...@@ -44,7 +45,7 @@ def pyb(s): # -> bstr
"""b converts object to bstr. """b converts object to bstr.
- For bstr the same object is returned. - For bstr the same object is returned.
- For bytes or bytearray the data is - For bytes, bytearray, or object with buffer interface, the data is
preserved as-is and only result type is changed to bstr. preserved as-is and only result type is changed to bstr.
- For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds. - For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.
...@@ -66,7 +67,7 @@ def pyu(s): # -> ustr ...@@ -66,7 +67,7 @@ def pyu(s): # -> ustr
- For ustr the same object is returned. - For ustr the same object is returned.
- For unicode the data is preserved as-is and only result type is changed to ustr. - For unicode the data is preserved as-is and only result type is changed to ustr.
- For bstr, bytes or bytearray the data is UTF-8 decoded. - For bstr, bytes, bytearray, or object with buffer interface, the data is UTF-8 decoded.
The decoding always succeeds and input The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF. surrogate codes ranging from U+DC80 to U+DCFF.
...@@ -95,9 +96,8 @@ cdef _pyb(bcls, s): # -> ~bstr | None ...@@ -95,9 +96,8 @@ cdef _pyb(bcls, s): # -> ~bstr | None
elif isinstance(s, unicode): elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s) s = _utf8_encode_surrogateescape(s)
else: else:
if isinstance(s, bytearray): s = _ifbuffer_data(s) # bytearray and buffer
s = bytes(s) if s is None:
else:
return None return None
assert type(s) is bytes assert type(s) is bytes
...@@ -111,8 +111,9 @@ cdef _pyu(ucls, s): # -> ~ustr | None ...@@ -111,8 +111,9 @@ cdef _pyu(ucls, s): # -> ~ustr | None
if type(s) is not unicode: if type(s) is not unicode:
s = _udata(s) s = _udata(s)
else: else:
if isinstance(s, bytearray): _ = _ifbuffer_data(s) # bytearray and buffer
s = bytes(s) if _ is not None:
s = _
if isinstance(s, bytes): if isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s) s = _utf8_decode_surrogateescape(s)
else: else:
...@@ -121,6 +122,19 @@ cdef _pyu(ucls, s): # -> ~ustr | None ...@@ -121,6 +122,19 @@ cdef _pyu(ucls, s): # -> ~ustr | None
assert type(s) is unicode assert type(s) is unicode
return unicode.__new__(ucls, s) return unicode.__new__(ucls, s)
# _ifbuffer_data extracts bytes out of obj if obj exposes buffer interface.
#
# Objects passing PyObject_CheckBuffer are handled first; objects passing
# _XPyObject_CheckOldBuffer (old-style buffer, py2-only) are handled second.
# None is returned if obj passes neither check.
cdef _ifbuffer_data(obj): # -> bytes|None
    if PyObject_CheckBuffer(obj):
        # py2: bytes(memoryview) returns '<memory at ...>', so there the
        # data is taken out through an intermediate bytearray.
        if PY_MAJOR_VERSION < 3:
            return bytes(bytearray(obj))
        return bytes(obj)

    if _XPyObject_CheckOldBuffer(obj): # old-style buffer, py2-only
        return bytes(_buffer_py2(obj))

    return None
# _pyb_coerce coerces x from `b op x` to be used in operation with pyb. # _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
cdef _pyb_coerce(x): # -> bstr|bytes cdef _pyb_coerce(x): # -> bstr|bytes
......
...@@ -128,7 +128,7 @@ def test_strings_basic(): ...@@ -128,7 +128,7 @@ def test_strings_basic():
assert ub_tunicode_ == tunicode assert ub_tunicode_ == tunicode
# b/u accept only ~bytes/~unicode/bytearray # b/u accept only ~bytes/~unicode/bytearray/buffer
with raises(TypeError): b() with raises(TypeError): b()
with raises(TypeError): u() with raises(TypeError): u()
with raises(TypeError): b(123) with raises(TypeError): b(123)
...@@ -172,6 +172,15 @@ def test_strings_basic(): ...@@ -172,6 +172,15 @@ def test_strings_basic():
_ = bstr(ba_); assert type(_) is bstr; assert _ == "мир" _ = bstr(ba_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(ba_); assert type(_) is ustr; assert _ == "мир" _ = ustr(ba_); assert type(_) is ustr; assert _ == "мир"
# b/u from buffer
for tbuf in buftypes:
bbuf_ = tbuf(b_)
bbuf_std_str = str(bbuf_) # e.g. '<memory at ...>' for memoryview
_ = b(bbuf_); assert type(_) is bstr; assert _ == "мир"
_ = u(bbuf_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(bbuf_); assert type(_) is bstr; assert _ == bbuf_std_str # NOTE not 'мир'
_ = ustr(bbuf_); assert type(_) is ustr; assert _ == bbuf_std_str
# bstr/ustr from bytes/bytearray/buffer with encoding # bstr/ustr from bytes/bytearray/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r') k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes, bytearray] + buftypes: for tbuf in [bytes, bytearray] + buftypes:
...@@ -189,6 +198,8 @@ def test_strings_basic(): ...@@ -189,6 +198,8 @@ def test_strings_basic():
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok _ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok _ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
_ = b (k8mir); assert type(_) is bstr; assert _ == k8mir_usurrogateescape # always surrogateescape
_ = u (k8mir); assert type(_) is ustr; assert _ == k8mir_usurrogateescape
# encoding specified -> treat it precisely # encoding specified -> treat it precisely
with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8') with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8') with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
...@@ -284,6 +295,31 @@ def test_strings_ops2(tx, ty): ...@@ -284,6 +295,31 @@ def test_strings_ops2(tx, ty):
assert y > x assert y > x
# verify string operations like `x + y` for x being bstr/ustr and y being a
# type unsupported for coercion.
#
# The test is parametrized over tx in (bstr, ustr) and ty in buftypes:
# x is built from u'мир' via xstr(..., tx) and y wraps b'123' via ty.
@mark.parametrize('tx', (bstr, ustr))
@mark.parametrize('ty', buftypes)
def test_strings_ops2_bufreject(tx, ty):
x = xstr(u'мир', tx)
y = ty(b'123')
# equality with x on the left must give a plain bool result, not raise.
assert (x == y) is False # see test_strings_ops2_eq_any
assert (x != y) is True
# ordering comparisons with x on the left are expected to raise TypeError.
with raises(TypeError): x >= y
with raises(TypeError): x <= y
with raises(TypeError): x > y
with raises(TypeError): x < y
# `y > x` does not raise when x is bstr (= provides buffer):
# the two statements below are only evaluated; their result is not checked.
y == x # not raises TypeError - see test_strings_ops2_eq_any
y != x #
# NOTE(review): indentation is lost in this view; presumably all four
# `y OP x` checks below belong to this `if` - confirm against the original file.
if tx is not bstr:
with raises(TypeError): y >= x
with raises(TypeError): y <= x
with raises(TypeError): y > x
with raises(TypeError): y < x
# verify string operations like `x == *` for x being bstr/ustr. # verify string operations like `x == *` for x being bstr/ustr.
# Those operations must succeed for any hashable type or else bstr/ustr could # Those operations must succeed for any hashable type or else bstr/ustr could
# not be used as dict keys. # not be used as dict keys.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment