X on bstr/ustr ; Almost ready

c9648c44 · Kirill Smelkov · 7b72d418 · c9648c44 · c9648c44 · c9648c44
Commit c9648c44 authored Jul 21, 2022 by Kirill Smelkov
12 changed files
--- a/README.rst
+++ b/README.rst
@@ -10,7 +10,7 @@ Package `golang` provides Go-like features for Python:
 - `func` allows to define methods separate from class.
 - `defer` allows to schedule a cleanup from the main control flow.
 - `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
+- `b`, `u` and `bstr`/`ustr` provide uniform UTF8-based approach to strings.
 - `gimport` allows to import python modules by full path in a Go workspace.

 Package `golang.pyx` provides__ similar features for Cython/nogil.
@@ -229,19 +229,60 @@ __ https://www.python.org/dev/peps/pep-3134/
 Strings
 -------

-`b` and `u` provide way to make sure an object is either bytes or unicode.
-`b(obj)` converts str/unicode/bytes obj to UTF-8 encoded bytestring, while
-`u(obj)` converts str/unicode/bytes obj to unicode string. For example::
+Pygolang, similarly to Go, provides a uniform UTF8-based approach to strings, with
+the idea of making byte- and unicode-strings easy to work with and transparently
+interoperable:

-   b("привет мир")   # -> gives bytes corresponding to UTF-8 encoding of "привет мир".
+- `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
+- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.

-   def f(s):
-      s = u(s)       # make sure s is unicode, decoding as UTF-8(*) if it was bytes.
-      ...            # (*) but see below about lack of decode errors.
+The conversion, in both encoding and decoding, never fails and never loses
+information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
+even if bytes data is not valid UTF-8.
+
+Semantically `bstr` is an array of bytes, while `ustr` is an array of
+unicode-characters. Accessing their elements by `[index]` yields byte and
+unicode character correspondingly [*]_. Iterating them, however, yields unicode
+characters for both `bstr` and `ustr`. In practice `bstr` is enough 99% of the
+time, and `ustr` only needs to be used for random access to string characters.
+See `Strings, bytes, runes and characters in Go`__ for an overview of this approach.
+
+__ https://blog.golang.org/strings
+
+Operations between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
+operations between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
+to `ustr`.  When the coercion happens, `bytes` and `bytearray`, similarly to
+`bstr`, are also treated as UTF8-encoded strings.
+
+`bstr` and `ustr` are meant to be drop-in replacements for standard
+`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
+particular their constructors accept arbitrary objects and either convert or stringify them. For
+cases when no stringification is desired, and one only wants to convert
+`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`, or an object with `buffer`
+interface [*]_, to a Pygolang string, `b` and `u` provide a way to make sure an
+object is either `bstr` or `ustr` correspondingly.

-The conversion in both encoding and decoding never fails and never looses
-information: `b(u(·))` and `u(b(·))` are always identity for bytes and unicode
-correspondingly, even if bytes input is not valid UTF-8.
+Usage example::
+
+   s  = b('привет')     # s is bstr corresponding to UTF-8 encoding of 'привет'.
+   s += ' мир'          # s is b('привет мир')
+   for c in s:          # c will iterate through
+        ...             #     [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')]
+
+   # the following gives b('привет мир труд май')
+   b('привет %s %s %s') % (u'мир',                  # raw unicode
+                           u'труд'.encode('utf-8'), # raw bytes
+                           u('май'))                # ustr
+
+   def f(s):
+      s = u(s)          # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
+      ...               # (*) the decoding never fails nor loses information.
+
+.. [*] `unicode` on Python2, `str` on Python3.
+.. [*] | ordinal of such byte and unicode character can be obtained via regular `ord`.
+       | For completeness `bbyte` and `uchr` are also provided for constructing 1-byte `bstr` and 1-character `ustr` from ordinal.
+.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string.
+       | Notice that only explicit conversion through `b`/`u` and `bstr`/`ustr` accepts objects with buffer interface. Automatic coercion does not.


 Import

--- a/golang/__init__.py
+++ b/golang/__init__.py
@@ -24,7 +24,7 @@
 - `func` allows to define methods separate from class.
 - `defer` allows to schedule a cleanup from the main control flow.
 - `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
+- `b`, `u` and `bstr`/`ustr` provide uniform UTF8-based approach to strings.
 - `gimport` allows to import python modules by full path in a Go workspace.

 See README for thorough overview.
@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import
 __version__ = "0.1"

 __all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
-           'recover', 'func', 'error', 'b', 'u', 'gimport']
+           'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport']

 from golang._gopath import gimport  # make gimport available from golang
 import inspect, sys
@@ -316,4 +316,16 @@ from ._golang import    \
    pypanic     as panic,   \
    pyerror     as error,   \
    pyb         as b,       \
-    pyu         as u
+    pybstr      as bstr,    \
+    pybbyte     as bbyte,   \
+    pyu         as u,       \
+    pyustr      as ustr,    \
+    pyuchr      as uchr
+
+# import golang.strconv into _golang from here to work around cyclic golang ↔ strconv dependency
+def _():
+    from . import _golang
+    from . import strconv
+    _golang.pystrconv = strconv
+_()
+del _
--- a/golang/_golang.pyx
+++ b/golang/_golang.pyx
@@ -2,8 +2,9 @@
 # cython: language_level=2
 # cython: binding=False
 # cython: c_string_type=str, c_string_encoding=utf8
+# cython: auto_pickle=False
 # distutils: language = c++
-# distutils: depends = libgolang.h os/signal.h
+# distutils: depends = libgolang.h os/signal.h _golang_str.pyx
 #
 # Copyright (C) 2018-2022  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
@@ -808,151 +809,7 @@ cdef DType parse_dtype(dtype) except <DType>-1:

 # ---- strings ----

-from golang import strconv as pystrconv
-
-def pyb(s): # -> bytes
-    """b converts str/unicode/bytes s to UTF-8 encoded bytestring.
-
-       Bytes input is preserved as-is:
-
-          b(bytes_input) == bytes_input
-
-       Unicode input is UTF-8 encoded. The encoding always succeeds.
-       b is reverse operation to u - the following invariant is always true:
-
-          b(u(bytes_input)) == bytes_input
-
-       TypeError is raised if type(s) is not one of the above.
-
-       See also: u.
-    """
-    bs, _ = pystrconv._bstr(s)
-    return bs
-
-def pyu(s): # -> unicode
-    """u converts str/unicode/bytes s to unicode string.
-
-       Unicode input is preserved as-is:
-
-          u(unicode_input) == unicode_input
-
-       Bytes input is UTF-8 decoded. The decoding always succeeds and input
-       information is not lost: non-valid UTF-8 bytes are decoded into
-       surrogate codes ranging from U+DC80 to U+DCFF.
-       u is reverse operation to b - the following invariant is always true:
-
-          u(b(unicode_input)) == unicode_input
-
-       TypeError is raised if type(s) is not one of the above.
-
-       See also: b.
-    """
-    us, _ = pystrconv._ustr(s)
-    return us
-
-# qq is substitute for %q, which is missing in python.
-#
-# (python's automatic escape uses smartquotes quoting with either ' or ").
-#
-# like %s, %q automatically converts its argument to string.
-def pyqq(obj):
-    # make sure obj is text | bytes
-    # py2: unicode | str
-    # py3: str     | bytes
-    if not isinstance(obj, (unicode, bytes)):
-        obj = str(obj)
-
-    qobj = pystrconv.quote(obj)
-
-    # `printf('%s', qq(obj))` should work. For this make sure qobj is always
-    # a-la str type (unicode on py3, bytes on py2), that can be transparently
-    # converted to unicode or bytes as needed.
-    if PY_MAJOR_VERSION >= 3:
-        qobj = _pyunicode(pyu(qobj))
-    else:
-        qobj = _pystr(pyb(qobj))
-
-    return qobj
-
-
-# XXX cannot `cdef class`: github.com/cython/cython/issues/711
-class _pystr(bytes):
-    """_str is like bytes but can be automatically converted to Python unicode
-    string via UTF-8 decoding.
-
-    The decoding never fails nor looses information - see u for details.
-    """
-
-    # don't allow to set arbitrary attributes.
-    # won't be needed after switch to -> `cdef class`
-    __slots__ = ()
-
-
-    # __bytes__ - no need
-    def __unicode__(self):  return pyu(self)
-
-    def __str__(self):
-        if PY_MAJOR_VERSION >= 3:
-            return pyu(self)
-        else:
-            return self
-
-
-cdef class _pyunicode(unicode):
-    """_unicode is like unicode(py2)|str(py3) but can be automatically converted
-    to bytes via UTF-8 encoding.
-
-    The encoding always succeeds - see b for details.
-    """
-
-    def __bytes__(self):    return pyb(self)
-    # __unicode__ - no need
-
-    def __str__(self):
-        if PY_MAJOR_VERSION >= 3:
-            return self
-        else:
-            return pyb(self)
-
-# initialize .tp_print for _pystr so that this type could be printed.
-# If we don't - printing it will result in `RuntimeError: print recursion`
-# because str of this type never reaches real bytes or unicode.
-# Do it only on python2, because python3 does not use tp_print at all.
-# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
-IF PY2:
-    # NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
-    from libc.stdio cimport FILE
-    cdef extern from "Python.h":
-        ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
-        ctypedef struct PyTypeObject:
-            printfunc tp_print
-        cdef PyTypeObject *Py_TYPE(object)
-
-    cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
-        o = <bytes>obj
-        o = bytes(buffer(o))  # change tp_type to bytes instead of _pystr
-        return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
-
-    Py_TYPE(_pystr()).tp_print = _pystr_tp_print
-
-
-# __pystr converts obj to str of current python:
-#
-#   - to bytes,   via b, if running on py2, or
-#   - to unicode, via u, if running on py3.
-#
-# It is handy to use __pystr when implementing __str__ methods.
-#
-# NOTE __pystr is currently considered to be internal function and should not
-# be used by code outside of pygolang.
-#
-# XXX we should be able to use _pystr, but py3's str verify that it must have
-# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
-cdef __pystr(object obj):
-    if PY_MAJOR_VERSION >= 3:
-        return pyu(obj)
-    else:
-        return pyb(obj)
+include "_golang_str.pyx"


 # ---- error ----

--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
--- a/golang/golang_test.py
+++ b/golang/golang_test.py
@@ -21,17 +21,14 @@
 from __future__ import print_function, absolute_import

 from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
-        defer, recover, u, b
-from golang.gcompat import qq
+        defer, recover, u
 from golang import sync
-from golang.strconv_test import byterange
 from pytest import raises, mark, fail
 from _pytest._code import Traceback
 from os.path import dirname
 import os, sys, inspect, importlib, traceback, doctest
 from subprocess import Popen, PIPE
 import six
-from six import text_type as unicode
 from six.moves import range as xrange
 import gc, weakref, warnings
 import re
@@ -1705,114 +1702,7 @@ def bench_defer(b):


 # test_error lives in errors_test.py
-
-
-# verify b, u
-def test_strings():
-    testv = (
-        # bytes          <->            unicode
-        (b'',                           u''),
-        (b'hello',                      u'hello'),
-        (b'hello\nworld',               u'hello\nworld'),
-        (b'\xd0\xbc\xd0\xb8\xd1\x80',   u'мир'),
-
-        # invalid utf-8
-        (b'\xd0',                       u'\udcd0'),
-        (b'a\xd0b',                     u'a\udcd0b'),
-        # invalid utf-8 with byte < 0x80
-        (b'\xe2\x28\xa1',               u'\udce2(\udca1'),
-
-        # more invalid utf-8
-        # https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
-        (b"\xc3\x28",                   u'\udcc3('),        # Invalid 2 Octet Sequence
-        (b"\xa0\xa1",                   u'\udca0\udca1'),   # Invalid Sequence Identifier
-        (b"\xe2\x82\xa1",               u'\u20a1'),         # Valid 3 Octet Sequence '₡'
-        (b"\xe2\x28\xa1",               u'\udce2(\udca1'),  # Invalid 3 Octet Sequence (in 2nd Octet)
-        (b"\xe2\x82\x28",               u'\udce2\udc82('),  # Invalid 3 Octet Sequence (in 3rd Octet)
-        (b"\xf0\x90\x8c\xbc",           u'\U0001033c'),     # Valid 4 Octet Sequence '𐌼'
-        (b"\xf0\x28\x8c\xbc",           u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
-        (b"\xf0\x90\x28\xbc",           u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
-        (b"\xf0\x28\x8c\x28",           u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
-        (b"\xf8\xa1\xa1\xa1\xa1",                           # Valid 5 Octet Sequence (but not Unicode!)
-                                        u'\udcf8\udca1\udca1\udca1\udca1'),
-        (b"\xfc\xa1\xa1\xa1\xa1\xa1",                       # Valid 6 Octet Sequence (but not Unicode!)
-                                        u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
-
-        # surrogate
-        (b'\xed\xa0\x80',               u'\udced\udca0\udc80'),
-
-        # x00 - x1f
-        (byterange(0,32),
-         u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
-         u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
-
-        # non-printable utf-8
-        (b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
-                                        u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
-
-        # some characters with U >= 0x10000
-        (b'\xf0\x9f\x99\x8f',           u'\U0001f64f'),    # 🙏
-        (b'\xf0\x9f\x9a\x80',           u'\U0001f680'),    # 🚀
-    )
-
-    for tbytes, tunicode in testv:
-        assert b(tbytes)   == tbytes
-        assert u(tunicode) == tunicode
-
-        assert b(tunicode) == tbytes
-        assert u(tbytes)   == tunicode
-
-        assert b(u(tbytes))     == tbytes
-        assert u(b(tunicode))   == tunicode
-
-
-    # invalid types
-    with raises(TypeError): b(1)
-    with raises(TypeError): u(1)
-    with raises(TypeError): b(object())
-    with raises(TypeError): u(object())
-
-    # TODO also handle bytearray?
-
-    # b(b(·)) = identity
-    _ = b(u'миру мир 123')
-    assert isinstance(_, bytes)
-    assert b(_) is _
-
-    # u(u(·)) = identity
-    _ = u(u'мир труд май')
-    assert isinstance(_, unicode)
-    assert u(_) is _
-
-# verify print for _pystr and _pyunicode
-def test_strings_print():
-    outok = readfile(dir_testprog + "/golang_test_str.txt")
-    retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
-                                cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
-    assert retcode == 0, (stdout, stderr)
-    assert stderr == b""
-    assertDoc(outok, stdout)
-
-
-def test_qq():
-    # NOTE qq is also tested as part of strconv.quote
-
-    # qq(any) returns string type
-    assert isinstance(qq(b('мир')), str)    # qq(b) -> str (bytes·py2, unicode·py3)
-    assert isinstance(qq( u'мир'),  str)    # qq(u) -> str (bytes·py2, unicode·py3)
-
-    # however what qq returns can be mixed with both unicode and bytes
-    assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !')   # b % qq(b)
-    assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !')   # b % qq(u) -> b
-    assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !')   # u % qq(u)
-    assert u'hello %s !' % qq(b('мир')) ==  u'hello "мир" !'    # u % qq(b) -> u
-
-    # custom attributes cannot be injected to what qq returns
-    x = qq('мир')
-    if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
-        with raises(AttributeError):
-            x.hello = 1
-
+# strings tests live in golang_str_test.py

 # ---- misc ----


--- a/golang/strconv.py
+++ b/golang/strconv.py
--- a/golang/strconv_test.py
+++ b/golang/strconv_test.py
--- a/golang/testprog/golang_test_str.py
+++ b/golang/testprog/golang_test_str.py
@@ -18,9 +18,9 @@
 #
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.
-"""This program helps to verify _pystr and _pyunicode.
+"""This program helps to verify b, u and underlying bstr and ustr.

-It complements golang_test.test_strings.
+It complements golang_str_test.test_strings_print.
 """

 from __future__ import print_function, absolute_import
@@ -31,8 +31,17 @@ from golang.gcompat import qq
 def main():
    sb = b("привет b")
    su = u("привет u")
+    print("print(b):", sb)
+    print("print(u):", su)
    print("print(qq(b)):", qq(sb))
    print("print(qq(u)):", qq(su))
+    print("print(repr(b)):", repr(sb))
+    print("print(repr(u)):", repr(su))
+
+    # py2: print(dict) calls PyObject_Print(flags=0) for both keys and values,
+    #      not with flags=Py_PRINT_RAW used by default almost everywhere else.
+    #      this way we can verify whether bstr.tp_print handles flags correctly.
+    print("print({b: u}):", {sb: su})


 if __name__ == '__main__':

--- a/golang/testprog/golang_test_str.txt
+++ b/golang/testprog/golang_test_str.txt
+print(b): привет b
+print(u): привет u
 print(qq(b)): "привет b"
 print(qq(u)): "привет u"
+print(repr(b)): b('привет b')
+print(repr(u)): u('привет u')
+print({b: u}): {b('привет b'): u('привет u')}
--- a/gpython/gpython_test.py
+++ b/gpython/gpython_test.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2019-2021  Nexedi SA and Contributors.
+# Copyright (C) 2019-2022  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -71,6 +71,10 @@ def test_golang_builtins():
    assert error  is golang.error
    assert b      is golang.b
    assert u      is golang.u
+    assert bstr   is golang.bstr
+    assert ustr   is golang.ustr
+    assert bbyte  is golang.bbyte
+    assert uchr   is golang.uchr

    # indirectly verify golang.__all__
    for k in golang.__all__:

--- a/setup.py
+++ b/setup.py
@@ -239,7 +239,8 @@ setup(

    ext_modules = [
                    Ext('golang._golang',
-                        ['golang/_golang.pyx']),
+                        ['golang/_golang.pyx'],
+                        depends = ['golang/_golang_str.pyx']),

                    Ext('golang.runtime._runtime_thread',
                        ['golang/runtime/_runtime_thread.pyx']),