Commit e72a459f authored by Kirill Smelkov's avatar Kirill Smelkov

golang: Move strings-related code to _golang_str "submodule"

We are going to significantly extend py-strings related functionality soon
- to the point where amount of strings related code will be
approximately the same compared to the amount of all other
python-related code inside golang module.

-> First move everything related to py strings to dedicated
_golang_str.pyx as a preparatory step.

Keep that new file included from _golang.pyx instead of being real new
module, because we want strings functionality to be provided by golang
main namespace itself, and to ease internal code interdependencies.

Plain code movement.

/reviewed-by @jerome
/reviewed-at !18
parent 7b72d418
......@@ -3,7 +3,7 @@
# cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8
# distutils: language = c++
# distutils: depends = libgolang.h os/signal.h
# distutils: depends = libgolang.h os/signal.h _golang_str.pyx
#
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
......@@ -808,151 +808,7 @@ cdef DType parse_dtype(dtype) except <DType>-1:
# ---- strings ----
from golang import strconv as pystrconv
def pyb(s): # -> bytes
"""b converts str/unicode/bytes s to UTF-8 encoded bytestring.
Bytes input is preserved as-is:
b(bytes_input) == bytes_input
Unicode input is UTF-8 encoded. The encoding always succeeds.
b is reverse operation to u - the following invariant is always true:
b(u(bytes_input)) == bytes_input
TypeError is raised if type(s) is not one of the above.
See also: u.
"""
bs, _ = pystrconv._bstr(s)
return bs
def pyu(s): # -> unicode
"""u converts str/unicode/bytes s to unicode string.
Unicode input is preserved as-is:
u(unicode_input) == unicode_input
Bytes input is UTF-8 decoded. The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
u is reverse operation to b - the following invariant is always true:
u(b(unicode_input)) == unicode_input
TypeError is raised if type(s) is not one of the above.
See also: b.
"""
us, _ = pystrconv._ustr(s)
return us
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = str(obj)
qobj = pystrconv.quote(obj)
# `printf('%s', qq(obj))` should work. For this make sure qobj is always
# a-la str type (unicode on py3, bytes on py2), that can be transparently
# converted to unicode or bytes as needed.
if PY_MAJOR_VERSION >= 3:
qobj = _pyunicode(pyu(qobj))
else:
qobj = _pystr(pyb(qobj))
return qobj
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
"""_str is like bytes but can be automatically converted to Python unicode
string via UTF-8 decoding.
The decoding never fails nor looses information - see u for details.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
# __bytes__ - no need
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
cdef class _pyunicode(unicode):
"""_unicode is like unicode(py2)|str(py3) but can be automatically converted
to bytes via UTF-8 encoding.
The encoding always succeeds - see b for details.
"""
def __bytes__(self): return pyb(self)
# __unicode__ - no need
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
# NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
from libc.stdio cimport FILE
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct PyTypeObject:
printfunc tp_print
cdef PyTypeObject *Py_TYPE(object)
cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
o = <bytes>obj
o = bytes(buffer(o)) # change tp_type to bytes instead of _pystr
return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# __pystr converts obj to str of current python:
#
# - to bytes, via b, if running on py2, or
# - to unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verify that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
include "_golang_str.pyx"
# ---- error ----
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""_golang_str.pyx complements _golang.pyx and keeps everything related to strings.
It is included from _golang.pyx .
"""
from golang import strconv as pystrconv
def pyb(s): # -> bytes
"""b converts str/unicode/bytes s to UTF-8 encoded bytestring.
Bytes input is preserved as-is:
b(bytes_input) == bytes_input
Unicode input is UTF-8 encoded. The encoding always succeeds.
b is reverse operation to u - the following invariant is always true:
b(u(bytes_input)) == bytes_input
TypeError is raised if type(s) is not one of the above.
See also: u.
"""
bs, _ = pystrconv._bstr(s)
return bs
def pyu(s): # -> unicode
"""u converts str/unicode/bytes s to unicode string.
Unicode input is preserved as-is:
u(unicode_input) == unicode_input
Bytes input is UTF-8 decoded. The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
u is reverse operation to b - the following invariant is always true:
u(b(unicode_input)) == unicode_input
TypeError is raised if type(s) is not one of the above.
See also: b.
"""
us, _ = pystrconv._ustr(s)
return us
# __pystr converts obj to str of current python:
#
# - to bytes, via b, if running on py2, or
# - to unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verify that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
"""_str is like bytes but can be automatically converted to Python unicode
string via UTF-8 decoding.
The decoding never fails nor looses information - see u for details.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
# __bytes__ - no need
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
cdef class _pyunicode(unicode):
"""_unicode is like unicode(py2)|str(py3) but can be automatically converted
to bytes via UTF-8 encoding.
The encoding always succeeds - see b for details.
"""
def __bytes__(self): return pyb(self)
# __unicode__ - no need
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
# NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
from libc.stdio cimport FILE
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct PyTypeObject:
printfunc tp_print
cdef PyTypeObject *Py_TYPE(object)
cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
o = <bytes>obj
o = bytes(buffer(o)) # change tp_type to bytes instead of _pystr
return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = str(obj)
qobj = pystrconv.quote(obj)
# `printf('%s', qq(obj))` should work. For this make sure qobj is always
# a-la str type (unicode on py3, bytes on py2), that can be transparently
# converted to unicode or bytes as needed.
if PY_MAJOR_VERSION >= 3:
qobj = _pyunicode(pyu(qobj))
else:
qobj = _pystr(pyb(qobj))
return qobj
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from __future__ import print_function, absolute_import
from golang import b, u
from golang.gcompat import qq
from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises
import sys
from six import text_type as unicode
# verify b, u
def test_strings():
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert b(tbytes) == tbytes
assert u(tunicode) == tunicode
assert b(tunicode) == tbytes
assert u(tbytes) == tunicode
assert b(u(tbytes)) == tbytes
assert u(b(tunicode)) == tunicode
# invalid types
with raises(TypeError): b(1)
with raises(TypeError): u(1)
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# TODO also handle bytearray?
# b(b(·)) = identity
_ = b(u'миру мир 123')
assert isinstance(_, bytes)
assert b(_) is _
# u(u(·)) = identity
_ = u(u'мир труд май')
assert isinstance(_, unicode)
assert u(_) is _
# verify print for _pystr and _pyunicode
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) returns string type
assert isinstance(qq(b('мир')), str) # qq(b) -> str (bytes·py2, unicode·py3)
assert isinstance(qq( u'мир'), str) # qq(u) -> str (bytes·py2, unicode·py3)
# however what qq returns can be mixed with both unicode and bytes
assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !') # b % qq(b)
assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !') # b % qq(u) -> b
assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !') # u % qq(u)
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# custom attributes cannot be injected to what qq returns
x = qq('мир')
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
......@@ -21,17 +21,14 @@
from __future__ import print_function, absolute_import
from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
defer, recover, u, b
from golang.gcompat import qq
defer, recover, u
from golang import sync
from golang.strconv_test import byterange
from pytest import raises, mark, fail
from _pytest._code import Traceback
from os.path import dirname
import os, sys, inspect, importlib, traceback, doctest
from subprocess import Popen, PIPE
import six
from six import text_type as unicode
from six.moves import range as xrange
import gc, weakref, warnings
import re
......@@ -1705,114 +1702,7 @@ def bench_defer(b):
# test_error lives in errors_test.py
# verify b, u
def test_strings():
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert b(tbytes) == tbytes
assert u(tunicode) == tunicode
assert b(tunicode) == tbytes
assert u(tbytes) == tunicode
assert b(u(tbytes)) == tbytes
assert u(b(tunicode)) == tunicode
# invalid types
with raises(TypeError): b(1)
with raises(TypeError): u(1)
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# TODO also handle bytearray?
# b(b(·)) = identity
_ = b(u'миру мир 123')
assert isinstance(_, bytes)
assert b(_) is _
# u(u(·)) = identity
_ = u(u'мир труд май')
assert isinstance(_, unicode)
assert u(_) is _
# verify print for _pystr and _pyunicode
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) returns string type
assert isinstance(qq(b('мир')), str) # qq(b) -> str (bytes·py2, unicode·py3)
assert isinstance(qq( u'мир'), str) # qq(u) -> str (bytes·py2, unicode·py3)
# however what qq returns can be mixed with both unicode and bytes
assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !') # b % qq(b)
assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !') # b % qq(u) -> b
assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !') # u % qq(u)
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# custom attributes cannot be injected to what qq returns
x = qq('мир')
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
# strings tests live in golang_str_test.py
# ---- misc ----
......
......@@ -20,7 +20,7 @@
# See https://www.nexedi.com/licensing for rationale and options.
"""This program helps to verify _pystr and _pyunicode.
It complements golang_test.test_strings.
It complements golang_str_test.test_strings_print.
"""
from __future__ import print_function, absolute_import
......
......@@ -239,7 +239,8 @@ setup(
ext_modules = [
Ext('golang._golang',
['golang/_golang.pyx']),
['golang/_golang.pyx'],
depends = ['golang/_golang_str.pyx']),
Ext('golang.runtime._runtime_thread',
['golang/runtime/_runtime_thread.pyx']),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment