golang_str: Fix ustr to provide buffer interface, like bstr already does

Kazuhiko reports that using base64.b64encode with ustr fails on py3:

    >>> base64.b64encode(b('a'))
    b'YQ=='
    >>> base64.b64encode(u('a'))
    Traceback (most recent call last):
      File "<console>", line 1, in <module>
      File "/*/lib/python3.8/base64.py", line 58, in b64encode
        encoded = binascii.b2a_base64(s, newline=False)
    TypeError: a bytes-like object is required, not 'pyustr'

which uncovers a thought bug of mine: initially in 105d03d4 (golang_str: Add test for memoryview(bstr)) I made only bstr provide the buffer interface, while ustr did not provide it, on the mistaken assumption that it contains unicode characters, not binary data.

But to fully respect the promise that ustr can be automatically converted to bytes, ustr should also provide the buffer interface, so that things like PyArg_Parse("s#") or PyArg_Parse("y") can accept it. While PyArg_Parse("s#") is not yet completely fixed by this patch, as it still reports UnicodeEncodeError for ustr corresponding to non-UTF8 data, adding the buffer interface to ustr is still a step in the right direction because of the way e.g. base64.b64encode(u) is implemented:

    base64.b64encode(x) -> binascii.b2a_base64(x)

    binascii.b2a_base64(u) ->
        py2: PyArg_ParseTuple('s*', u) -> _PyUnicode_AsDefaultEncodedString(u)
        py3: PyObject_GetBuffer(u)     -> u.tp_as_buffer.bf_getbuffer

Here we see that on py3 it tries to retrieve the object's data via .tp_as_buffer.bf_getbuffer, and if there is no buffer interface provided that will fail. But we can't let base64.b64encode(ustr) fail if base64.b64encode(bstr) works ok, because both bstr and ustr represent the same string entity, just in two different forms.

-> So teach ustr to provide the buffer interface so that e.g. memoryview starts to work on it and observes the corresponding bytes data. This fixes base64.b64encode(ustr) on py3 and also fixes t_hash/py2, and the y, y_star and y_hash test_strings_capi_getargs_to_cstr cases on py3.
Note: the original unicode on py2 has:

    .bf_getreadbuf    -> []wchar for []UCS                              ; used by buffer(u)
    .bf_getcharbuffer -> []byte  for encode([]UCS, sys.defaultencoding) ; used by t# and PyObject_AsCharBuffer
    .bf_getbuffer     = 0                                               ; used by memoryview(u)

and on py3:

    .tp_as_buffer = 0

/reported-by @kazuhiko
/reported-at nexedi/pygolang!21 (comment 172595)

golang_str: Fix ustr to provide buffer interface, like bstr already does
Kazuhiko reports that using base64.b64encode with ustr fails on py3:

    >>> base64.b64encode(b('a'))
    b'YQ=='
    >>> base64.b64encode(u('a'))
    Traceback (most recent call last):
      File "<console>", line 1, in <module>
      File "/*/lib/python3.8/base64.py", line 58, in b64encode
        encoded = binascii.b2a_base64(s, newline=False)
    TypeError: a bytes-like object is required, not 'pyustr'

which uncovers a thought bug of mine: initially in 105d03d4 (golang_str: Add test for memoryview(bstr)) I made only bstr provide the buffer interface, while ustr did not provide it, on the mistaken assumption that it contains unicode characters, not binary data.

But to fully respect the promise that ustr can be automatically converted to bytes, ustr should also provide the buffer interface, so that things like PyArg_Parse("s#") or PyArg_Parse("y") can accept it. While PyArg_Parse("s#") is not yet completely fixed by this patch, as it still reports UnicodeEncodeError for ustr corresponding to non-UTF8 data, adding the buffer interface to ustr is still a step in the right direction because of the way e.g. base64.b64encode(u) is implemented:

    base64.b64encode(x) -> binascii.b2a_base64(x)

    binascii.b2a_base64(u) ->
        py2: PyArg_ParseTuple('s*', u) -> _PyUnicode_AsDefaultEncodedString(u)
        py3: PyObject_GetBuffer(u)     -> u.tp_as_buffer.bf_getbuffer

Here we see that on py3 it tries to retrieve the object's data via .tp_as_buffer.bf_getbuffer, and if there is no buffer interface provided that will fail. But we can't let base64.b64encode(ustr) fail if base64.b64encode(bstr) works ok, because both bstr and ustr represent the same string entity, just in two different forms.

-> So teach ustr to provide the buffer interface so that e.g. memoryview starts to work on it and observes the corresponding bytes data. This fixes base64.b64encode(ustr) on py3 and also fixes t_hash/py2, and the y, y_star and y_hash test_strings_capi_getargs_to_cstr cases on py3.
Note: the original unicode on py2 has:

    .bf_getreadbuf    -> []wchar for []UCS                              ; used by buffer(u)
    .bf_getcharbuffer -> []byte  for encode([]UCS, sys.defaultencoding) ; used by t# and PyObject_AsCharBuffer
    .bf_getbuffer     = 0                                               ; used by memoryview(u)

and on py3:

    .tp_as_buffer = 0

/reported-by @kazuhiko
/reported-at nexedi/pygolang!21 (comment 172595)
8a240b5b · Kirill Smelkov · 99b9c59b · 8a240b5b · 8a240b5b
Commit 8a240b5b authored Dec 22, 2024 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 107 additions and 18 deletions

golang/_golang_str.pyx golang/_golang_str.pyx +60 -3

golang/golang_str_test.py golang/golang_str_test.py +47 -15

No files found.
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -30,17 +30,17 @@ from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
 from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
 from cpython.iterobject cimport PySeqIter_New
 from cpython cimport PyThreadState_GetDict, PyDict_SetItem
-from cpython cimport PyObject_CheckBuffer
+from cpython cimport PyObject_CheckBuffer, PyBuffer_FillInfo, Py_SIZE

 cdef extern from "Python.h":
    PyTypeObject PyBytes_Type
    ctypedef struct PyBytesObject:
-        pass
+        char *ob_sval

 cdef extern from "Python.h":
    PyTypeObject PyUnicode_Type
    ctypedef struct PyUnicodeObject:
-        pass
+        PyObject   *defenc  # NOTE py2 only; ~ .utf8 on py3

 cdef extern from "Python.h":
    """
@@ -1006,6 +1006,63 @@ cdef class _pyustr(unicode):
        return t


+    # buffer interface so that ustr can be automatically converted to bytes for
+    # e.g. PyArg_Parse("s#") and memoryview.
+    def __getbuffer__(self, Py_buffer *buf, int flags):
+        # TODO py2: use .defenc directly if present   (via _pyustr_getbuf)
+        # TODO py3: use .utf8   directly if present
+        bself = pyb(self)
+        bbself = <PyBytesObject*>bself
+
+        PyBuffer_FillInfo(buf, bself, bbself.ob_sval, Py_SIZE(bself), 1, flags)
+
+    # keep .bf_releasebuffer = NULL
+    # e.g. for t# py2 rejects conversion if it is !NULL with
+    #   "argument ... must be string or pinned buffer"
+    # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Python/getargs.c#L1356-L1391
+    #def __releasebuffer__(self, Py_buffer *buf):
+    #    pass
+
+    # old-style buffer - used by py2
+    IF PY2:
+        def __getreadbuffer__(self, Py_ssize_t idx, void **pptr):
+            return _pyustr_getbuf(self, idx, pptr)
+
+        def __getcharbuffer__(self, Py_ssize_t idx, char **pptr):
+            return _pyustr_getbuf(self, idx, <void**>pptr)
+
+        def __getsegcount__(self, Py_ssize_t *lenp):
+            cdef void *_
+            if lenp != NULL:
+                lenp[0] = _pyustr_getbuf(self, 0, &_)
+            return 1
+
+IF PY2:
+    # _pyustr_getbuf returns pointer to bytes data that correspond to ustr content.
+    #
+    # its definition is kept outside pyustr class because
+    # vtab is still created even with `@staticmethod cdef ...`
+    # https://github.com/cython/cython/issues/5337
+    # so we work it around via out-of-class definition
+    cdef Py_ssize_t _pyustr_getbuf(self, Py_ssize_t idx, void **pptr) except -1:
+        if idx != 0:
+            raise SystemError("accessing non-existent string segment")
+
+        uself  = <PyUnicodeObject*>self
+        cdef PyObject* xbcopy = uself.defenc
+        if xbcopy == NULL:
+            bcopy = pyb(self)
+            Py_INCREF(bcopy)
+            xbcopy = <PyObject*>bcopy
+            uself.defenc = xbcopy
+        else:
+            bcopy = <object>xbcopy
+        assert isinstance(bcopy, bytes)
+
+        pptr[0] = (<PyBytesObject*>xbcopy).ob_sval
+        return Py_SIZE(bcopy)
+
+
 # hand-made _pyustr.__new__  (workaround for https://github.com/cython/cython/issues/799)
 cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL:
    argv = ()

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -34,6 +34,7 @@ from six import text_type as unicode, unichr
 from six.moves import range as xrange
 import gc, re, pickle, copy, types
 import array, collections
+import base64


 # buftypes lists types with buffer interface that we will test against.
@@ -313,26 +314,43 @@ def test_strings_refcount():


 # verify memoryview(bstr|ustr).
-def test_strings_memoryview():
-    bs = b('мир')
-    us = u('май')
-
-    with raises(TypeError):
-        memoryview(us)
-
-    m = memoryview(bs)
-    assert len(m) == 6
+_ = (memoryview,)
+if six.PY2:
+    # also verify buffer() on py2
+    def mbuffer(x):
+        return memoryview(buffer(x))
+    _ += (mbuffer,)
+@mark.parametrize('tx', (bytes, bstr, ustr))
+@mark.parametrize('mview', _)
+def test_strings_memoryview(tx, mview):
+    # NOTE memoryview/buffer work for both bstr and ustr. In particular
+    # memoryview(ustr) does not raise TypeError and instead returns memoryview
+    # for bytes-representation of ustr.
+    x = xstr(xbytes('мир')+b'\xff', tx)     # note: invalid utf-8
+    m = mview(x)
+    assert m.format         == 'B'
+    assert m.itemsize       == 1
+    assert m.ndim           == 1
+    assert m.strides        == (1,)
+    assert m.readonly
+    assert m.shape == (7,)
+    assert len(m) == 7
    def _(i): # returns m[i] as int
-        x = m[i]
+        mi = m[i]
        if six.PY2: # on py2 memoryview[i] returns bytechar
-            x = ord(x)
-        return x
+            mi = ord(mi)
+        return mi
    assert _(0) == 0xd0
    assert _(1) == 0xbc
    assert _(2) == 0xd0
    assert _(3) == 0xb8
    assert _(4) == 0xd1
    assert _(5) == 0x80
+    assert _(6) == 0xff
+
+    # memoryview/buffer must be read-only
+    with raises(TypeError, match="cannot modify read-only memory"):
+        m[0] = m[0]


 # verify that ord on bstr/ustr works as expected.
@@ -2062,7 +2080,7 @@ if six.PY3:
 @mark.parametrize('fmt', _)
 def test_strings_capi_getargs_to_cstr(tx, fmt):
    if six.PY2:
-        if tx is ustr  and  fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 't_hash'):
+        if tx is ustr  and  fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
            # UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-3: ordinal not in range(128)
            xfail("TODO: py2: PyArg_Parse(%s) vs ustr" % fmt)

@@ -2077,9 +2095,8 @@ def test_strings_capi_getargs_to_cstr(tx, fmt):
            # TODO we will try to handle this later
            xfail("TODO: py3: PyArg_Parse(%s) vs bstr" % fmt)

-        if tx is ustr  and  fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 'y', 'y_star', 'y_hash'):
+        if tx is ustr  and  fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
            # UnicodeEncodeError: 'utf-8' codec can't encode character '\udcff' in position 3: surrogates not allowed
-            # TypeError: a bytes-like object is required, not 'golang.ustr'
            xfail("TODO: py3: PyArg_Parse(%s) vs ustr" % fmt)

    bmirf = xbytes('мир') + b'\xff'                         # invalid UTF-8 to make sure conversion
@@ -2739,6 +2756,21 @@ def test_strings_cmp_wrt_distutils_LooseVersion(tx):
    assert not (l < x)


+# base64.b64encode(ustr) used to raise TypeError.
+# https://lab.nexedi.com/nexedi/pygolang/merge_requests/21#note_172595
+@mark.parametrize('tx', (bytes, bstr, ustr))
+def test_strings_base64(tx):
+    if six.PY2 and tx is ustr:
+        # PyArg_Parse('s*', u) -> _PyUnicode_AsDefaultEncodedString(u)
+        #   -> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-3: ordinal not in range(128)
+        #
+        # even if default encoding is utf-8 (gpython) the result is 0LzQuNGA7bO
+        xfail("TODO: py2: ustr -> default encoded bstr")
+    x = xstr(u'мир', tx) + b'\xff'  ; assert type(x) is tx
+    assert base64.b64encode(x) == b'0LzQuNGA/w=='
+
+
+
 # ---- benchmarks ----

 # utf-8 decoding