Commit c9648c44 authored by Kirill Smelkov's avatar Kirill Smelkov

X on bstr/ustr ; Almost ready

parent 7b72d418
......@@ -10,7 +10,7 @@ Package `golang` provides Go-like features for Python:
- `func` allows to define methods separate from class.
- `defer` allows to schedule a cleanup from the main control flow.
- `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
- `b`, `u` and `bstr`/`ustr` provide uniform UTF8-based approach to strings.
- `gimport` allows to import python modules by full path in a Go workspace.
Package `golang.pyx` provides__ similar features for Cython/nogil.
......@@ -229,19 +229,60 @@ __ https://www.python.org/dev/peps/pep-3134/
Strings
-------
`b` and `u` provide way to make sure an object is either bytes or unicode.
`b(obj)` converts str/unicode/bytes obj to UTF-8 encoded bytestring, while
`u(obj)` converts str/unicode/bytes obj to unicode string. For example::
Pygolang, similarly to Go, provides uniform UTF8-based approach to strings with
the idea to make working with byte- and unicode- strings easy and transparently
interoperable:
b("привет мир") # -> gives bytes corresponding to UTF-8 encoding of "привет мир".
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.
def f(s):
s = u(s) # make sure s is unicode, decoding as UTF-8(*) if it was bytes.
... # (*) but see below about lack of decode errors.
The conversion, in both encoding and decoding, never fails and never looses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8.
Semantically `bstr` is array of bytes, while `ustr` is array of
unicode-characters. Accessing their elements by `[index]` yields byte and
unicode character correspondingly [*]_. Iterating them, however, yields unicode
characters for both `bstr` and `ustr`. In practice `bstr` is enough 99% of the
time, and `ustr` only needs to be used for random access to string characters.
See `Strings, bytes, runes and characters in Go`__ for overview of this approach.
__ https://blog.golang.org/strings
Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
`bstr` and `ustr` are meant to be drop-in replacements for standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`, or an object with `buffer`
interface [*]_, to Pygolang string, `b` and `u` provide way to make sure an
object is either `bstr` or `ustr` correspondingly.
The conversion in both encoding and decoding never fails and never looses
information: `b(u(·))` and `u(b(·))` are always identity for bytes and unicode
correspondingly, even if bytes input is not valid UTF-8.
Usage example::
s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'.
s += ' мир' # s is b('привет мир')
for c in s: # c will iterate through
... # [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')]
# the following gives b('привет мир труд май')
b('привет %s %s %s') % (u'мир', # raw unicode
u'труд'.encode('utf-8'), # raw bytes
u('май')) # ustr
def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
... # (*) the decoding never fails nor looses information.
.. [*] `unicode` on Python2, `str` on Python3.
.. [*] | ordinal of such byte and unicode character can be obtained via regular `ord`.
| For completeness `bbyte` and `uchr` are also provided for constructing 1-byte `bstr` and 1-character `ustr` from ordinal.
.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string.
| Notice that only explicit conversion through `b`/`u` and `bstr`/`ustr` accept objects with buffer interface. Automatic coercion does not.
Import
......
......@@ -24,7 +24,7 @@
- `func` allows to define methods separate from class.
- `defer` allows to schedule a cleanup from the main control flow.
- `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
- `b`, `u` and `bstr`/`ustr` provide uniform UTF8-based approach to strings.
- `gimport` allows to import python modules by full path in a Go workspace.
See README for thorough overview.
......@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import
__version__ = "0.1"
__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
'recover', 'func', 'error', 'b', 'u', 'gimport']
'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport']
from golang._gopath import gimport # make gimport available from golang
import inspect, sys
......@@ -316,4 +316,16 @@ from ._golang import \
pypanic as panic, \
pyerror as error, \
pyb as b, \
pyu as u
pybstr as bstr, \
pybbyte as bbyte, \
pyu as u, \
pyustr as ustr, \
pyuchr as uchr
# import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency
def _():
from . import _golang
from . import strconv
_golang.pystrconv = strconv
_()
del _
......@@ -2,8 +2,9 @@
# cython: language_level=2
# cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8
# cython: auto_pickle=False
# distutils: language = c++
# distutils: depends = libgolang.h os/signal.h
# distutils: depends = libgolang.h os/signal.h _golang_str.pyx
#
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
......@@ -808,151 +809,7 @@ cdef DType parse_dtype(dtype) except <DType>-1:
# ---- strings ----
from golang import strconv as pystrconv
def pyb(s): # -> bytes
"""b converts str/unicode/bytes s to UTF-8 encoded bytestring.
Bytes input is preserved as-is:
b(bytes_input) == bytes_input
Unicode input is UTF-8 encoded. The encoding always succeeds.
b is reverse operation to u - the following invariant is always true:
b(u(bytes_input)) == bytes_input
TypeError is raised if type(s) is not one of the above.
See also: u.
"""
bs, _ = pystrconv._bstr(s)
return bs
def pyu(s): # -> unicode
"""u converts str/unicode/bytes s to unicode string.
Unicode input is preserved as-is:
u(unicode_input) == unicode_input
Bytes input is UTF-8 decoded. The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
u is reverse operation to b - the following invariant is always true:
u(b(unicode_input)) == unicode_input
TypeError is raised if type(s) is not one of the above.
See also: b.
"""
us, _ = pystrconv._ustr(s)
return us
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = str(obj)
qobj = pystrconv.quote(obj)
# `printf('%s', qq(obj))` should work. For this make sure qobj is always
# a-la str type (unicode on py3, bytes on py2), that can be transparently
# converted to unicode or bytes as needed.
if PY_MAJOR_VERSION >= 3:
qobj = _pyunicode(pyu(qobj))
else:
qobj = _pystr(pyb(qobj))
return qobj
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
"""_str is like bytes but can be automatically converted to Python unicode
string via UTF-8 decoding.
The decoding never fails nor looses information - see u for details.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
# __bytes__ - no need
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
cdef class _pyunicode(unicode):
"""_unicode is like unicode(py2)|str(py3) but can be automatically converted
to bytes via UTF-8 encoding.
The encoding always succeeds - see b for details.
"""
def __bytes__(self): return pyb(self)
# __unicode__ - no need
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
# NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
from libc.stdio cimport FILE
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct PyTypeObject:
printfunc tp_print
cdef PyTypeObject *Py_TYPE(object)
cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
o = <bytes>obj
o = bytes(buffer(o)) # change tp_type to bytes instead of _pystr
return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# __pystr converts obj to str of current python:
#
# - to bytes, via b, if running on py2, or
# - to unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verify that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
include "_golang_str.pyx"
# ---- error ----
......
This diff is collapsed.
This diff is collapsed.
......@@ -21,17 +21,14 @@
from __future__ import print_function, absolute_import
from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
defer, recover, u, b
from golang.gcompat import qq
defer, recover, u
from golang import sync
from golang.strconv_test import byterange
from pytest import raises, mark, fail
from _pytest._code import Traceback
from os.path import dirname
import os, sys, inspect, importlib, traceback, doctest
from subprocess import Popen, PIPE
import six
from six import text_type as unicode
from six.moves import range as xrange
import gc, weakref, warnings
import re
......@@ -1705,114 +1702,7 @@ def bench_defer(b):
# test_error lives in errors_test.py
# verify b, u
def test_strings():
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert b(tbytes) == tbytes
assert u(tunicode) == tunicode
assert b(tunicode) == tbytes
assert u(tbytes) == tunicode
assert b(u(tbytes)) == tbytes
assert u(b(tunicode)) == tunicode
# invalid types
with raises(TypeError): b(1)
with raises(TypeError): u(1)
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# TODO also handle bytearray?
# b(b(·)) = identity
_ = b(u'миру мир 123')
assert isinstance(_, bytes)
assert b(_) is _
# u(u(·)) = identity
_ = u(u'мир труд май')
assert isinstance(_, unicode)
assert u(_) is _
# verify print for _pystr and _pyunicode
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) returns string type
assert isinstance(qq(b('мир')), str) # qq(b) -> str (bytes·py2, unicode·py3)
assert isinstance(qq( u'мир'), str) # qq(u) -> str (bytes·py2, unicode·py3)
# however what qq returns can be mixed with both unicode and bytes
assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !') # b % qq(b)
assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !') # b % qq(u) -> b
assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !') # u % qq(u)
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# custom attributes cannot be injected to what qq returns
x = qq('мир')
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
# strings tests live in golang_str_test.py
# ---- misc ----
......
This diff is collapsed.
This diff is collapsed.
......@@ -18,9 +18,9 @@
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""This program helps to verify _pystr and _pyunicode.
"""This program helps to verify b, u and underlying bstr and ustr.
It complements golang_test.test_strings.
It complements golang_str_test.test_strings_print.
"""
from __future__ import print_function, absolute_import
......@@ -31,8 +31,17 @@ from golang.gcompat import qq
def main():
sb = b("привет b")
su = u("привет u")
print("print(b):", sb)
print("print(u):", su)
print("print(qq(b)):", qq(sb))
print("print(qq(u)):", qq(su))
print("print(repr(b)):", repr(sb))
print("print(repr(u)):", repr(su))
# py2: print(dict) calls PyObject_Print(flags=0) for both keys and values,
# not with flags=Py_PRINT_RAW used by default almost everywhere else.
# this way we can verify whether bstr.tp_print handles flags correctly.
print("print({b: u}):", {sb: su})
if __name__ == '__main__':
......
print(b): привет b
print(u): привет u
print(qq(b)): "привет b"
print(qq(u)): "привет u"
print(repr(b)): b('привет b')
print(repr(u)): u('привет u')
print({b: u}): {b('привет b'): u('привет u')}
# -*- coding: utf-8 -*-
# Copyright (C) 2019-2021 Nexedi SA and Contributors.
# Copyright (C) 2019-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -71,6 +71,10 @@ def test_golang_builtins():
assert error is golang.error
assert b is golang.b
assert u is golang.u
assert bstr is golang.bstr
assert ustr is golang.ustr
assert bbyte is golang.bbyte
assert uchr is golang.uchr
# indirectly verify golang.__all__
for k in golang.__all__:
......
......@@ -239,7 +239,8 @@ setup(
ext_modules = [
Ext('golang._golang',
['golang/_golang.pyx']),
['golang/_golang.pyx'],
depends = ['golang/_golang_str.pyx']),
Ext('golang.runtime._runtime_thread',
['golang/runtime/_runtime_thread.pyx']),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment