Commit 8a240b5b authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Fix ustr to provide buffer interface, like bstr already does

Kazuhiko reports that using base64.b64encode with ustr fails on py3:

    >>> base64.b64encode(b('a'))
    b'YQ=='
    >>> base64.b64encode(u('a'))
    Traceback (most recent call last):
      File "<console>", line 1, in <module>
      File "/*/lib/python3.8/base64.py", line 58, in b64encode
        encoded = binascii.b2a_base64(s, newline=False)
    TypeError: a bytes-like object is required, not 'pyustr'

which uncovers a thinking bug of mine: initially in 105d03d4 (golang_str: Add
test for memoryview(bstr)) I made only bstr provide the buffer interface, while
ustr did not provide it, on the mistaken assumption that it contains unicode
characters, not binary data. But to fully respect the promise that ustr can be
automatically converted to bytes, ustr should also provide the buffer
interface so that things like PyArg_Parse("s#") or PyArg_Parse("y") could
accept it.

While PyArg_Parse("s#") is not yet completely fixed to work with this patch, as
it still reports UnicodeEncodeError for ustr corresponding to non-UTF8 data,
adding the buffer interface to ustr is still a step in the right direction
because of the way e.g. base64.b64encode(u) is implemented:

    base64.b64encode(x)     ->  binascii.b2a_base64(x)

    binascii.b2a_base64(u)  ->  py2: PyArg_ParseTuple('s*', u)  ->  _PyUnicode_AsDefaultEncodedString(u)
                                py3: PyObject_GetBuffer(u)      ->  u.tp_as_buffer.bf_getbuffer

Here we see that on py3 it tries to retrieve the object's data via
.tp_as_buffer.bf_getbuffer, and if no buffer interface is provided that
will fail. But we can't let base64.b64encode(ustr) fail if
base64.b64encode(bstr) works ok, because both bstr and ustr represent the
same string entity, just in two different forms.

-> So teach ustr to provide the buffer interface so that e.g. memoryview starts
   to work on it and observes the corresponding bytes data. This fixes
   base64.b64encode(ustr) on py3 and also fixes t_hash/py2, and y, y_star and
   y_hash test_strings_capi_getargs_to_cstr cases on py3.

Note: the original unicode on py2 has:

    .bf_getreadbuf      -> []wchar  for     []UCS                                   ; used by buffer(u)
    .bf_getcharbuffer   -> []byte   for     encode([]UCS, sys.defaultencoding)      ; used by t#  and PyObject_AsCharBuffer
    .bf_getbuffer = 0                                                               ; used by memoryview(u)

and on py3:

    .tp_as_buffer = 0

/reported-by @kazuhiko
/reported-at nexedi/pygolang!21 (comment 172595)
parent 99b9c59b
......@@ -30,17 +30,17 @@ from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
from cpython.iterobject cimport PySeqIter_New
from cpython cimport PyThreadState_GetDict, PyDict_SetItem
from cpython cimport PyObject_CheckBuffer
from cpython cimport PyObject_CheckBuffer, PyBuffer_FillInfo, Py_SIZE
cdef extern from "Python.h":
PyTypeObject PyBytes_Type
ctypedef struct PyBytesObject:
pass
char *ob_sval
cdef extern from "Python.h":
PyTypeObject PyUnicode_Type
ctypedef struct PyUnicodeObject:
pass
PyObject *defenc # NOTE py2 only; ~ .utf8 on py3
cdef extern from "Python.h":
"""
......@@ -1006,6 +1006,63 @@ cdef class _pyustr(unicode):
return t
# buffer interface so that ustr can be automatically converted to bytes for
# e.g. PyArg_Parse("s#") and memoryview.
def __getbuffer__(self, Py_buffer *buf, int flags):
# Export a read-only view over the bytes form of this ustr:
#   - pyb(self) yields the bytes object whose data is exposed;
#   - that bytes object (not self) is passed to PyBuffer_FillInfo as the
#     exporting object, together with its ob_sval data pointer and size;
#   - the `1` argument marks the buffer as read-only.
# TODO py2: use .defenc directly if present (via _pyustr_getbuf)
# TODO py3: use .utf8 directly if present
bself = pyb(self)
bbself = <PyBytesObject*>bself
PyBuffer_FillInfo(buf, bself, bbself.ob_sval, Py_SIZE(bself), 1, flags)
# keep .bf_releasebuffer = NULL
# e.g. for t# py2 rejects conversion if it is !NULL with
# "argument ... must be string or pinned buffer"
# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Python/getargs.c#L1356-L1391
#def __releasebuffer__(self, Py_buffer *buf):
# pass
# old-style buffer - used by py2
IF PY2:
# Both the read-buffer and the char-buffer slots delegate to _pyustr_getbuf,
# so they expose the same bytes data for segment `idx`.
def __getreadbuffer__(self, Py_ssize_t idx, void **pptr):
return _pyustr_getbuf(self, idx, pptr)
def __getcharbuffer__(self, Py_ssize_t idx, char **pptr):
return _pyustr_getbuf(self, idx, <void**>pptr)
# Always reports exactly one segment. When lenp is not NULL it receives the
# byte length that _pyustr_getbuf returns for segment 0.
def __getsegcount__(self, Py_ssize_t *lenp):
cdef void *_
if lenp != NULL:
lenp[0] = _pyustr_getbuf(self, 0, &_)
return 1
IF PY2:
# _pyustr_getbuf returns pointer to bytes data that correspond to ustr content.
#
# On success pptr[0] is set to the data pointer and the byte length is
# returned; any idx other than 0 raises SystemError.
#
# its definition is kept outside pyustr class because
# vtab is still created even with `@staticmethod cdef ...`
# https://github.com/cython/cython/issues/5337
# so we work it around via out-of-class definition
cdef Py_ssize_t _pyustr_getbuf(self, Py_ssize_t idx, void **pptr) except -1:
# only a single segment (idx 0) exists
if idx != 0:
raise SystemError("accessing non-existent string segment")
uself = <PyUnicodeObject*>self
cdef PyObject* xbcopy = uself.defenc
if xbcopy == NULL:
# no cached bytes yet: compute them via pyb(self) and store the object
# in .defenc; the extra Py_INCREF accounts for the reference now held
# by the .defenc slot.
bcopy = pyb(self)
Py_INCREF(bcopy)
xbcopy = <PyObject*>bcopy
uself.defenc = xbcopy
else:
# reuse the bytes object already stored in .defenc
bcopy = <object>xbcopy
assert isinstance(bcopy, bytes)
# hand out the pointer to the bytes object's internal data
pptr[0] = (<PyBytesObject*>xbcopy).ob_sval
return Py_SIZE(bcopy)
# hand-made _pyustr.__new__ (workaround for https://github.com/cython/cython/issues/799)
cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL:
argv = ()
......
......@@ -34,6 +34,7 @@ from six import text_type as unicode, unichr
from six.moves import range as xrange
import gc, re, pickle, copy, types
import array, collections
import base64
# buftypes lists types with buffer interface that we will test against.
......@@ -313,26 +314,43 @@ def test_strings_refcount():
# verify memoryview(bstr|ustr).
def test_strings_memoryview():
bs = b('мир')
us = u('май')
with raises(TypeError):
memoryview(us)
m = memoryview(bs)
assert len(m) == 6
_ = (memoryview,)
if six.PY2:
# also verify buffer() on py2
def mbuffer(x):
return memoryview(buffer(x))
_ += (mbuffer,)
@mark.parametrize('tx', (bytes, bstr, ustr))
@mark.parametrize('mview', _)
def test_strings_memoryview(tx, mview):
# NOTE memoryview/buffer work for both bstr and ustr. In particular
# memoryview(ustr) does not raise TypeError and instead returns memoryview
# for bytes-representation of ustr.
x = xstr(xbytes('мир')+b'\xff', tx) # note: invalid utf-8
m = mview(x)
assert m.format == 'B'
assert m.itemsize == 1
assert m.ndim == 1
assert m.strides == (1,)
assert m.readonly
assert m.shape == (7,)
assert len(m) == 7
def _(i): # returns m[i] as int
x = m[i]
mi = m[i]
if six.PY2: # on py2 memoryview[i] returns bytechar
x = ord(x)
return x
mi = ord(mi)
return mi
assert _(0) == 0xd0
assert _(1) == 0xbc
assert _(2) == 0xd0
assert _(3) == 0xb8
assert _(4) == 0xd1
assert _(5) == 0x80
assert _(6) == 0xff
# memoryview/buffer must be read-only
with raises(TypeError, match="cannot modify read-only memory"):
m[0] = m[0]
# verify that ord on bstr/ustr works as expected.
......@@ -2062,7 +2080,7 @@ if six.PY3:
@mark.parametrize('fmt', _)
def test_strings_capi_getargs_to_cstr(tx, fmt):
if six.PY2:
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 't_hash'):
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
# UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-3: ordinal not in range(128)
xfail("TODO: py2: PyArg_Parse(%s) vs ustr" % fmt)
......@@ -2077,9 +2095,8 @@ def test_strings_capi_getargs_to_cstr(tx, fmt):
# TODO we will try to handle this later
xfail("TODO: py3: PyArg_Parse(%s) vs bstr" % fmt)
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 'y', 'y_star', 'y_hash'):
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
# UnicodeEncodeError: 'utf-8' codec can't encode character '\udcff' in position 3: surrogates not allowed
# TypeError: a bytes-like object is required, not 'golang.ustr'
xfail("TODO: py3: PyArg_Parse(%s) vs ustr" % fmt)
bmirf = xbytes('мир') + b'\xff' # invalid UTF-8 to make sure conversion
......@@ -2739,6 +2756,21 @@ def test_strings_cmp_wrt_distutils_LooseVersion(tx):
assert not (l < x)
# base64.b64encode(ustr) used to raise TypeError.
# https://lab.nexedi.com/nexedi/pygolang/merge_requests/21#note_172595
@mark.parametrize('tx', (bytes, bstr, ustr))
def test_strings_base64(tx):
if six.PY2 and tx is ustr:
# PyArg_Parse('s*', u) -> _PyUnicode_AsDefaultEncodedString(u)
# -> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-3: ordinal not in range(128)
#
# even if default encoding is utf-8 (gpython) the result is 0LzQuNGA7bO
xfail("TODO: py2: ustr -> default encoded bstr")
# the trailing b'\xff' byte is not valid UTF-8; the result must still have type tx
x = xstr(u'мир', tx) + b'\xff' ; assert type(x) is tx
# expected value is the base64 of the 7 bytes d0 bc d0 b8 d1 80 ff
assert base64.b64encode(x) == b'0LzQuNGA/w=='
# ---- benchmarks ----
# utf-8 decoding
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment