Commit e72a459f authored by Kirill Smelkov's avatar Kirill Smelkov

golang: Move strings-related code to _golang_str "submodule"

We are going to significantly extend py-strings related functionality soon
- to the point where the amount of strings-related code will be
approximately the same as the amount of all other python-related code
inside the golang module.

-> First move everything related to py strings to dedicated
_golang_str.pyx as a preparatory step.

Keep that new file included from _golang.pyx instead of making it a real
new module, because we want strings functionality to be provided by the
golang main namespace itself, and to ease internal code interdependencies.

Plain code movement.

/reviewed-by @jerome
/reviewed-at !18
parent 7b72d418
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# cython: binding=False # cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8 # cython: c_string_type=str, c_string_encoding=utf8
# distutils: language = c++ # distutils: language = c++
# distutils: depends = libgolang.h os/signal.h # distutils: depends = libgolang.h os/signal.h _golang_str.pyx
# #
# Copyright (C) 2018-2022 Nexedi SA and Contributors. # Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
...@@ -808,151 +808,7 @@ cdef DType parse_dtype(dtype) except <DType>-1: ...@@ -808,151 +808,7 @@ cdef DType parse_dtype(dtype) except <DType>-1:
# ---- strings ---- # ---- strings ----
from golang import strconv as pystrconv include "_golang_str.pyx"
def pyb(s): # -> bytes
"""b converts str/unicode/bytes s to UTF-8 encoded bytestring.
Bytes input is preserved as-is:
b(bytes_input) == bytes_input
Unicode input is UTF-8 encoded. The encoding always succeeds.
b is reverse operation to u - the following invariant is always true:
b(u(bytes_input)) == bytes_input
TypeError is raised if type(s) is not one of the above.
See also: u.
"""
bs, _ = pystrconv._bstr(s)
return bs
def pyu(s): # -> unicode
"""u converts str/unicode/bytes s to unicode string.
Unicode input is preserved as-is:
u(unicode_input) == unicode_input
Bytes input is UTF-8 decoded. The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
u is reverse operation to b - the following invariant is always true:
u(b(unicode_input)) == unicode_input
TypeError is raised if type(s) is not one of the above.
See also: b.
"""
us, _ = pystrconv._ustr(s)
return us
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = str(obj)
qobj = pystrconv.quote(obj)
# `printf('%s', qq(obj))` should work. For this make sure qobj is always
# a-la str type (unicode on py3, bytes on py2), that can be transparently
# converted to unicode or bytes as needed.
if PY_MAJOR_VERSION >= 3:
qobj = _pyunicode(pyu(qobj))
else:
qobj = _pystr(pyb(qobj))
return qobj
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
"""_str is like bytes but can be automatically converted to Python unicode
string via UTF-8 decoding.
The decoding never fails nor looses information - see u for details.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
# __bytes__ - no need
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
cdef class _pyunicode(unicode):
"""_unicode is like unicode(py2)|str(py3) but can be automatically converted
to bytes via UTF-8 encoding.
The encoding always succeeds - see b for details.
"""
def __bytes__(self): return pyb(self)
# __unicode__ - no need
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
# NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
from libc.stdio cimport FILE
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct PyTypeObject:
printfunc tp_print
cdef PyTypeObject *Py_TYPE(object)
cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
o = <bytes>obj
o = bytes(buffer(o)) # change tp_type to bytes instead of _pystr
return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# __pystr converts obj to str of current python:
#
# - to bytes, via b, if running on py2, or
# - to unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verify that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
# ---- error ---- # ---- error ----
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""_golang_str.pyx complements _golang.pyx and keeps everything related to strings.
It is included from _golang.pyx .
"""
from golang import strconv as pystrconv
def pyb(s): # -> bytes
    """Return s as UTF-8 encoded bytestring; s may be str, unicode or bytes.

    Bytes input is returned as-is:

        b(bytes_input) == bytes_input

    Unicode input is UTF-8 encoded, and that encoding always succeeds.

    b is the inverse of u, so the following invariant always holds:

        b(u(bytes_input)) == bytes_input

    TypeError is raised if s is of any other type.

    See also: u.
    """
    # _bstr yields a pair; only its first element - the bytestring - is needed.
    encoded, _ignored = pystrconv._bstr(s)
    return encoded
def pyu(s): # -> unicode
    """Return s as unicode string; s may be str, unicode or bytes.

    Unicode input is returned as-is:

        u(unicode_input) == unicode_input

    Bytes input is UTF-8 decoded. The decoding always succeeds and loses no
    information: bytes that are not valid UTF-8 are decoded into surrogate
    codes in the range U+DC80 - U+DCFF.

    u is the inverse of b, so the following invariant always holds:

        u(b(unicode_input)) == unicode_input

    TypeError is raised if s is of any other type.

    See also: b.
    """
    # _ustr yields a pair; only its first element - the unicode string - is needed.
    decoded, _ignored = pystrconv._ustr(s)
    return decoded
# __pystr converts obj to the str type of the running python:
#
# - to bytes,   via b, when running on py2, or
# - to unicode, via u, when running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be an internal function and should
# not be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verifies that the result
# has Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
    return pyu(obj) if PY_MAJOR_VERSION >= 3 else pyb(obj)
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
    """_str is like bytes, but can be automatically converted to Python
    unicode string via UTF-8 decoding.

    The decoding never fails nor loses information - see u for details.
    """

    # Empty __slots__ forbids setting arbitrary attributes on instances.
    # It won't be needed after the switch to `cdef class`.
    __slots__ = ()

    # __bytes__ - no need
    def __unicode__(self):
        return pyu(self)

    def __str__(self):
        # py2: str is bytes, so the object already is its own str form.
        if PY_MAJOR_VERSION < 3:
            return self
        return pyu(self)
cdef class _pyunicode(unicode):
    """_unicode is like unicode(py2)|str(py3), but can be automatically
    converted to bytes via UTF-8 encoding.

    The encoding always succeeds - see b for details.
    """

    def __bytes__(self):
        return pyb(self)

    # __unicode__ - no need
    def __str__(self):
        # py2: str is bytes, so go through b.
        if PY_MAJOR_VERSION < 3:
            return pyb(self)
        return self
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
    # NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
    from libc.stdio cimport FILE
    cdef extern from "Python.h":
        # signature of the tp_print slot: (object, output FILE, int) -> int, -1 on error
        ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
        # only the tp_print field is declared - it is the only one used below
        ctypedef struct PyTypeObject:
            printfunc tp_print
        cdef PyTypeObject *Py_TYPE(object)

    # _pystr_tp_print is the tp_print implementation installed on _pystr.
    #
    # It copies obj into an object created by plain bytes(...) and forwards
    # the call, with the same f and nesting, to tp_print of that copy's type.
    cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
        o = <bytes>obj
        o = bytes(buffer(o))    # change tp_type to bytes instead of _pystr
        return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)

    # install the slot on the type of a throw-away _pystr instance, i.e. on _pystr
    Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
    # anything that is not text | bytes is converted to string first
    #   py2: unicode | str
    #   py3: str | bytes
    is_string = isinstance(obj, (unicode, bytes))
    if not is_string:
        obj = str(obj)

    quoted = pystrconv.quote(obj)

    # `printf('%s', qq(obj))` should work. For this the result is always of
    # a-la str type (unicode on py3, bytes on py2), that can be transparently
    # converted to unicode or bytes as needed.
    if PY_MAJOR_VERSION >= 3:
        return _pyunicode(pyu(quoted))
    return _pystr(pyb(quoted))
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from __future__ import print_function, absolute_import
from golang import b, u
from golang.gcompat import qq
from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises
import sys
from six import text_type as unicode
# verify b, u
def test_strings():
    """Check b/u conversions, their round-trips, type errors and idempotence."""
    # each case is a (bytes, unicode) pair that must convert into one another
    cases = (
        # bytes          <->            unicode
        (b'',                           u''),
        (b'hello',                      u'hello'),
        (b'hello\nworld',               u'hello\nworld'),
        (b'\xd0\xbc\xd0\xb8\xd1\x80',   u'мир'),

        # invalid utf-8
        (b'\xd0',                       u'\udcd0'),
        (b'a\xd0b',                     u'a\udcd0b'),
        # invalid utf-8 with byte < 0x80
        (b'\xe2\x28\xa1',               u'\udce2(\udca1'),

        # more invalid utf-8
        # https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
        (b"\xc3\x28",                   u'\udcc3('),        # Invalid 2 Octet Sequence
        (b"\xa0\xa1",                   u'\udca0\udca1'),   # Invalid Sequence Identifier
        (b"\xe2\x82\xa1",               u'\u20a1'),         # Valid 3 Octet Sequence '₡'
        (b"\xe2\x28\xa1",               u'\udce2(\udca1'),  # Invalid 3 Octet Sequence (in 2nd Octet)
        (b"\xe2\x82\x28",               u'\udce2\udc82('),  # Invalid 3 Octet Sequence (in 3rd Octet)
        (b"\xf0\x90\x8c\xbc",           u'\U0001033c'),     # Valid 4 Octet Sequence '𐌼'
        (b"\xf0\x28\x8c\xbc",           u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
        (b"\xf0\x90\x28\xbc",           u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
        (b"\xf0\x28\x8c\x28",           u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
        (b"\xf8\xa1\xa1\xa1\xa1",                           # Valid 5 Octet Sequence (but not Unicode!)
                                        u'\udcf8\udca1\udca1\udca1\udca1'),
        (b"\xfc\xa1\xa1\xa1\xa1\xa1",                       # Valid 6 Octet Sequence (but not Unicode!)
                                        u'\udcfc\udca1\udca1\udca1\udca1\udca1'),

        # surrogate
        (b'\xed\xa0\x80',               u'\udced\udca0\udc80'),

        # x00 - x1f
        (byterange(0,32),
         u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
         u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),

        # non-printable utf-8
        (b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
         u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),

        # some characters with U >= 0x10000
        (b'\xf0\x9f\x99\x8f',           u'\U0001f64f'),     # 🙏
        (b'\xf0\x9f\x9a\x80',           u'\U0001f680'),     # 🚀
    )

    for raw, text in cases:
        # same-type input is preserved
        assert b(raw)  == raw
        assert u(text) == text

        # cross-type conversion
        assert b(text) == raw
        assert u(raw)  == text

        # round-trips
        assert b(u(raw))  == raw
        assert u(b(text)) == text

    # invalid types
    for bad in (1, object()):
        with raises(TypeError):
            b(bad)
        with raises(TypeError):
            u(bad)

    # TODO also handle bytearray?

    # b(b(·)) = identity
    bdata = b(u'миру мир 123')
    assert isinstance(bdata, bytes)
    assert b(bdata) is bdata

    # u(u(·)) = identity
    udata = u(u'мир труд май')
    assert isinstance(udata, unicode)
    assert u(udata) is udata
# verify print for _pystr and _pyunicode
def test_strings_print():
    """Run golang_test_str.py and compare its stdout with golang_test_str.txt."""
    expected = readfile(dir_testprog + "/golang_test_str.txt")

    rc, out, err = _pyrun(["golang_test_str.py"],
                          cwd=dir_testprog, stdout=PIPE, stderr=PIPE)

    # the program must exit cleanly and print nothing on stderr
    assert rc == 0, (out, err)
    assert err == b""
    assertDoc(expected, out)
def test_qq():
    """Check the type of what qq returns and how it mixes with bytes/unicode."""
    # NOTE qq is also tested as part of strconv.quote

    # qq(any) returns string type:
    # qq(b) -> str, qq(u) -> str  (bytes·py2, unicode·py3)
    for arg in (b('мир'), u'мир'):
        assert isinstance(qq(arg), str)

    # however what qq returns can be mixed with both unicode and bytes
    bexpect = b('hello "мир" !')
    assert b'hello %s !' % qq(b('мир')) == bexpect              # b % qq(b)
    assert b'hello %s !' % qq(u('мир')) == bexpect              # b % qq(u) -> b
    assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !')   # u % qq(u)
    assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !'     # u % qq(b) -> u

    # custom attributes cannot be injected to what qq returns
    quoted = qq('мир')
    if 'PyPy' not in sys.version:   # https://foss.heptapod.net/pypy/pypy/issues/2763
        with raises(AttributeError):
            quoted.hello = 1
...@@ -21,17 +21,14 @@ ...@@ -21,17 +21,14 @@
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \ from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
defer, recover, u, b defer, recover, u
from golang.gcompat import qq
from golang import sync from golang import sync
from golang.strconv_test import byterange
from pytest import raises, mark, fail from pytest import raises, mark, fail
from _pytest._code import Traceback from _pytest._code import Traceback
from os.path import dirname from os.path import dirname
import os, sys, inspect, importlib, traceback, doctest import os, sys, inspect, importlib, traceback, doctest
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
import six import six
from six import text_type as unicode
from six.moves import range as xrange from six.moves import range as xrange
import gc, weakref, warnings import gc, weakref, warnings
import re import re
...@@ -1705,114 +1702,7 @@ def bench_defer(b): ...@@ -1705,114 +1702,7 @@ def bench_defer(b):
# test_error lives in errors_test.py # test_error lives in errors_test.py
# strings tests live in golang_str_test.py
# verify b, u
def test_strings():
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert b(tbytes) == tbytes
assert u(tunicode) == tunicode
assert b(tunicode) == tbytes
assert u(tbytes) == tunicode
assert b(u(tbytes)) == tbytes
assert u(b(tunicode)) == tunicode
# invalid types
with raises(TypeError): b(1)
with raises(TypeError): u(1)
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# TODO also handle bytearray?
# b(b(·)) = identity
_ = b(u'миру мир 123')
assert isinstance(_, bytes)
assert b(_) is _
# u(u(·)) = identity
_ = u(u'мир труд май')
assert isinstance(_, unicode)
assert u(_) is _
# verify print for _pystr and _pyunicode
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) returns string type
assert isinstance(qq(b('мир')), str) # qq(b) -> str (bytes·py2, unicode·py3)
assert isinstance(qq( u'мир'), str) # qq(u) -> str (bytes·py2, unicode·py3)
# however what qq returns can be mixed with both unicode and bytes
assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !') # b % qq(b)
assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !') # b % qq(u) -> b
assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !') # u % qq(u)
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# custom attributes cannot be injected to what qq returns
x = qq('мир')
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
# ---- misc ---- # ---- misc ----
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
"""This program helps to verify _pystr and _pyunicode. """This program helps to verify _pystr and _pyunicode.
It complements golang_test.test_strings. It complements golang_str_test.test_strings_print.
""" """
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
......
...@@ -239,7 +239,8 @@ setup( ...@@ -239,7 +239,8 @@ setup(
ext_modules = [ ext_modules = [
Ext('golang._golang', Ext('golang._golang',
['golang/_golang.pyx']), ['golang/_golang.pyx'],
depends = ['golang/_golang_str.pyx']),
Ext('golang.runtime._runtime_thread', Ext('golang.runtime._runtime_thread',
['golang/runtime/_runtime_thread.pyx']), ['golang/runtime/_runtime_thread.pyx']),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment