Commit 54c2a3cf authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Teach bstr/ustr to compare wrt any string with automatic coercion

So that e.g. `bstr == <any string type>` works. We want `bstr == ustr`
to work because we intend those types to be interoperable. We also want
e.g. `bstr == "a_string"` to work because we want bstr to be
interoperable with standard strings. In general we want to have full
automatic interoperability with all string types, so that e.g. `bstr == X`
works for X being any of bstr, ustr, unicode, bytes (and later bytearray).

For now we add support only for comparison operators. But later, we
will be adding support for e.g. +, string methods, etc - and in all
those operations we will be following the same approach: to have
automatic interoperability with all string types out of the box.

The text added to README reflects this.

The patch to unicode.tp_richcompare on py2 illustrates our approach of
adjusting builtin types when absolutely needed. In this particular case
the original builtin unicode.__eq__(unicode, bstr) always returns False
for non-ASCII bstr, despite bstr having a .__unicode__() method. Our
adjustment is non-intrusive - we adjust unicode behaviour only wrt bstr,
and it stays exactly the same as before wrt all other types.

In any case we do that with care, and add a test which verifies that the
behaviour of what we patched stays unaffected when used outside of the
bstr/ustr context.
parent 34667355
...@@ -240,6 +240,11 @@ The conversion, in both encoding and decoding, never fails and never looses ...@@ -240,6 +240,11 @@ The conversion, in both encoding and decoding, never fails and never looses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8. even if bytes data is not valid UTF-8.
Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
`bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For `bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes` `bstr`/`ustr` / `unicode`/`bytes`
......
...@@ -24,7 +24,8 @@ It is included from _golang.pyx . ...@@ -24,7 +24,8 @@ It is included from _golang.pyx .
from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
from cpython cimport PyUnicode_DecodeUTF8 from cpython cimport PyUnicode_DecodeUTF8
from cpython cimport PyTypeObject, Py_TYPE from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
cdef extern from "Python.h": cdef extern from "Python.h":
void PyType_Modified(PyTypeObject *) void PyType_Modified(PyTypeObject *)
...@@ -93,6 +94,25 @@ def pyu(s): # -> ustr ...@@ -93,6 +94,25 @@ def pyu(s): # -> ustr
return pyustr(s) return pyustr(s)
# _pyb_coerce prepares x, the right operand of `b op x`, to be used in
# operation with pyb.
#
# bytes (and so bstr, which subclasses bytes) are passed through as is;
# unicode (and so ustr, which subclasses unicode) is converted via pyb.
# Any other type is rejected with TypeError.
cdef _pyb_coerce(x):    # -> bstr|bytes
    if isinstance(x, bytes):
        return x
    if isinstance(x, unicode):
        return pyb(x)
    raise TypeError("b: coerce: invalid type %s" % type(x))
# _pyu_coerce prepares x, the right operand of `u op x`, to be used in
# operation with pyu.
#
# unicode (and so ustr, which subclasses unicode) is passed through as is;
# bytes (and so bstr, which subclasses bytes) are converted via pyu.
# Any other type is rejected with TypeError.
cdef _pyu_coerce(x):    # -> ustr|unicode
    if isinstance(x, unicode):
        return x
    if isinstance(x, bytes):
        return pyu(x)
    raise TypeError("u: coerce: invalid type %s" % type(x))
# __pystr converts obj to ~str of current python: # __pystr converts obj to ~str of current python:
# #
# - to ~bytes, via b, if running on py2, or # - to ~bytes, via b, if running on py2, or
...@@ -123,6 +143,10 @@ class pybstr(bytes): ...@@ -123,6 +143,10 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
Operations in between bstr and ustr/unicode / bytes coerce to bstr.
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
See also: b, ustr/u. See also: b, ustr/u.
""" """
...@@ -141,6 +165,32 @@ class pybstr(bytes): ...@@ -141,6 +165,32 @@ class pybstr(bytes):
return self return self
def __hash__(self):
    # unicode text and its UTF-8 encoded bytes generally hash differently,
    # so hash(bstr) cannot match both hash(bytes) and hash(unicode) at once.
    # -> follow the str type of the running python, so that bstr can be used
    #    as dictionary key interchangeably with native str.
    if PY_MAJOR_VERSION < 3:
        return bytes.__hash__(self)
    return hash(pyu(self))
# == != < > <= >=
# NOTE == and != are special: they never raise, whatever the type of the
# other operand is, so that bstr could be used as dict key.
def __eq__(a, b):
    try:
        other = _pyb_coerce(b)
    except TypeError:
        # b is not a string -> it cannot be equal to a
        return False
    else:
        return bytes.__eq__(a, other)
# != is the negation of ==, and so also never raises.
def __ne__(a, b):
    return not a.__eq__(b)

# ordering comparisons coerce b to bytes first; TypeError raised by
# _pyb_coerce for a non-string b is propagated to the caller.
def __lt__(a, b):
    return bytes.__lt__(a, _pyb_coerce(b))

def __gt__(a, b):
    return bytes.__gt__(a, _pyb_coerce(b))

def __le__(a, b):
    return bytes.__le__(a, _pyb_coerce(b))

def __ge__(a, b):
    return bytes.__ge__(a, _pyb_coerce(b))
cdef class pyustr(unicode): cdef class pyustr(unicode):
"""ustr is unicode-string. """ustr is unicode-string.
...@@ -151,6 +201,10 @@ cdef class pyustr(unicode): ...@@ -151,6 +201,10 @@ cdef class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
Operations in between ustr and bstr/bytes / unicode coerce to ustr.
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
See also: u, bstr/b. See also: u, bstr/b.
""" """
...@@ -164,6 +218,29 @@ cdef class pyustr(unicode): ...@@ -164,6 +218,29 @@ cdef class pyustr(unicode):
return pyb(self) return pyb(self)
def __hash__(self):
    # hash follows the str type of the running python:
    # unicode hash on py3, hash of pyb(self) on py2.
    # (see pybstr.__hash__ for why we stick to hash of current str)
    if PY_MAJOR_VERSION < 3:
        return hash(pyb(self))
    return unicode.__hash__(self)
# == != < > <= >=
# NOTE == and != are special: they never raise, whatever the type of the
# other operand is, so that ustr could be used as dict key.
def __eq__(a, b):
    try:
        other = _pyu_coerce(b)
    except TypeError:
        # b is not a string -> it cannot be equal to a
        return False
    else:
        return unicode.__eq__(a, other)
# != is the negation of ==, and so also never raises.
def __ne__(a, b):
    return not a.__eq__(b)

# ordering comparisons coerce b to unicode first; TypeError raised by
# _pyu_coerce for a non-string b is propagated to the caller.
def __lt__(a, b):
    return unicode.__lt__(a, _pyu_coerce(b))

def __gt__(a, b):
    return unicode.__gt__(a, _pyu_coerce(b))

def __le__(a, b):
    return unicode.__le__(a, _pyu_coerce(b))

def __ge__(a, b):
    return unicode.__ge__(a, _pyu_coerce(b))
# _bdata/_udata retrieve raw data from bytes/unicode. # _bdata/_udata retrieve raw data from bytes/unicode.
def _bdata(obj): # -> bytes def _bdata(obj): # -> bytes
assert isinstance(obj, bytes) assert isinstance(obj, bytes)
...@@ -235,6 +312,44 @@ def pyqq(obj): ...@@ -235,6 +312,44 @@ def pyqq(obj):
return qobj return qobj
# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
# both arguments to Unicode - interpreting them as being unequal`, and that
# further `a == b` returns False even if `b == a` gives True.
#
# NOTE there is no need to do the same for ustr, because ustr inherits from
# unicode and can be always natively converted to unicode by python itself.
cdef richcmpfunc _unicode_tp_richcompare = Py_TYPE(u'').tp_richcompare
# _unicode_tp_xrichcompare is the adjusted variant of unicode.tp_richcompare:
# it declines, via NotImplemented, to compare with bstr and delegates to the
# saved original tp_richcompare for every other type of b.
cdef object _unicode_tp_xrichcompare(object a, object b, int op):
    if not isinstance(b, pybstr):
        return _unicode_tp_richcompare(a, b, op)
    return NotImplemented
# _unicode_x__<op>__ are the functions installed as unicode.__<op>__ .
#
# They go through the adjusted _unicode_tp_xrichcompare, not through the saved
# original _unicode_tp_richcompare, so that an explicit call, e.g.
# unicode.__eq__(u, bstr), returns NotImplemented exactly like the
# tp_richcompare slot does for `u == bstr`. For b of any other type
# _unicode_tp_xrichcompare delegates to the original, so behaviour there
# is unchanged.
cdef object _unicode_x__eq__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_EQ)
cdef object _unicode_x__ne__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_NE)
cdef object _unicode_x__lt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LT)
cdef object _unicode_x__gt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GT)
cdef object _unicode_x__le__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LE)
cdef object _unicode_x__ge__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GE)
if PY_MAJOR_VERSION < 3:
    # install the adjusted richcompare into unicode and into those of its
    # subclasses, existing at this point, that still use unicode's original
    # tp_richcompare.
    def _():
        cdef PyTypeObject* t
        for pyt in [unicode] + unicode.__subclasses__():
            assert isinstance(pyt, type)
            t = <PyTypeObject*>pyt
            # a type with its own tp_richcompare is left untouched
            if t.tp_richcompare != _unicode_tp_richcompare:
                continue
            t.tp_richcompare = _unicode_tp_xrichcompare
            # update the comparison entries of the type's __dict__ as well
            _patch_slot(t, "__eq__", _unicode_x__eq__)
            _patch_slot(t, "__ne__", _unicode_x__ne__)
            _patch_slot(t, "__lt__", _unicode_x__lt__)
            _patch_slot(t, "__gt__", _unicode_x__gt__)
            _patch_slot(t, "__le__", _unicode_x__le__)
            _patch_slot(t, "__ge__", _unicode_x__ge__)
    _()
# _patch_slot installs func_or_descr into typ's __dict__ as name. # _patch_slot installs func_or_descr into typ's __dict__ as name.
# #
# if func_or_descr is descriptor (has __get__), it is installed as is. # if func_or_descr is descriptor (has __get__), it is installed as is.
......
...@@ -26,7 +26,7 @@ from golang._golang import _udata, _bdata ...@@ -26,7 +26,7 @@ from golang._golang import _udata, _bdata
from golang.gcompat import qq from golang.gcompat import qq
from golang.strconv_test import byterange from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises from pytest import raises, mark, skip
import sys import sys
from six import text_type as unicode from six import text_type as unicode
from six.moves import range as xrange from six.moves import range as xrange
...@@ -132,8 +132,8 @@ def test_strings_basic(): ...@@ -132,8 +132,8 @@ def test_strings_basic():
us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr
# b/u from bytes # b/u from bytes
_ = b(b_); assert type(_) is bstr _ = b(b_); assert type(_) is bstr; assert _ == "мир"
_ = u(b_); assert type(_) is ustr _ = u(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray? # TODO also handle bytearray?
...@@ -147,14 +147,19 @@ def test_strings_basic(): ...@@ -147,14 +147,19 @@ def test_strings_basic():
assert unicode(us) is us assert unicode(us) is us
# unicode(b) -> u, bytes(u) -> b # unicode(b) -> u, bytes(u) -> b
_ = unicode(bs); assert type(_) is ustr _ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr _ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# b(u(·)), u(b(·)) # b(u(·)), u(b(·))
_ = b(us); assert type(_) is bstr _ = b(us); assert type(_) is bstr; assert _ == "мир"
_ = u(bs); assert type(_) is ustr _ = u(bs); assert type(_) is ustr; assert _ == "мир"
_ = bstr(us); assert type(_) is bstr _ = bstr(us); assert type(_) is bstr; assert _ == "мир"
_ = ustr(bs); assert type(_) is ustr _ = ustr(bs); assert type(_) is ustr; assert _ == "мир"
# hash of b/u is made to be equal to hash of current str
# (it cannot be equal to hash(b'мир') and hash(u'мир') at the same time as those hashes differ)
assert hash(us) == hash("мир"); assert us == "мир"
assert hash(bs) == hash("мир"); assert bs == "мир"
# str # str
_ = str(us); assert isinstance(_, str); assert _ == "мир" _ = str(us); assert isinstance(_, str); assert _ == "мир"
...@@ -167,6 +172,98 @@ def test_strings_basic(): ...@@ -167,6 +172,98 @@ def test_strings_basic():
with raises(AttributeError): with raises(AttributeError):
bs.hello = 1 bs.hello = 1
# verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr and ustr. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled
# only by builtin python code and might be rejected.
@mark.parametrize('tx', (bytes, unicode, bstr, ustr))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr))
def test_strings_ops2(tx, ty):
    # a pair of two different std types, e.g. regular bytes vs regular
    # unicode, is handled only by builtin python code -> skip it.
    std_types = {bytes, unicode}
    both_std = (tx in std_types) and (ty in std_types)
    if both_std and (tx is not ty):
        skip()

    # ---- operands with the same text: == != <= >= < > ----
    x = xstr(u'мир', tx)
    assert type(x) is tx
    y = xstr(u'мир', ty)
    assert type(y) is ty

    assert x == y
    assert y == x
    assert not (x != y)
    assert not (y != x)
    assert x >= y
    assert y >= x
    assert x <= y
    assert y <= x
    assert not (x > y)
    assert not (y > x)
    assert not (x < y)
    assert not (y < x)

    # ---- operands with different text, x sorting before y ----
    x = xstr(u'hello ', tx)
    y = xstr(u'мир', ty)

    assert not (x == y)
    assert not (y == x)
    assert x != y
    assert y != x
    assert not (x >= y)
    assert y >= x
    assert x <= y
    assert not (y <= x)
    assert x < y
    assert not (y < x)
    assert not (x > y)
    assert y > x
# verify string operations like `x == *` for x being bstr/ustr.
# Those operations must succeed for any hashable type or else bstr/ustr could
# not be used as dict keys.
@mark.parametrize('tx', (bstr, ustr))
def test_strings_ops2_eq_any(tx):
    # grow x until its hash is an integer that hashes to itself
    # (positive int32 will have this property)
    x = xstr(u'мир', tx)
    hx = hash(x)
    while hash(hx) != hx:
        x += xstr('!', tx)
        hx = hash(x)

    # assertNE asserts that (x==y) is False and (x!=y) is True.
    # it also asserts that ordering comparisons, e.g. x < y, raise TypeError.
    def assertNE(y):
        assert (x == y) is False
        assert (x != y) is True
        with raises(TypeError):
            x >= y
        with raises(TypeError):
            x <= y
        with raises(TypeError):
            x > y
        with raises(TypeError):
            x < y

    assertNE(None)
    assertNE(0)
    assertNE(1)
    assertNE(2)

    # an integer with the very same hash as x is still not equal to x
    assert hash(x) == hx
    assert hash(hx) == hx
    assertNE(hx)
    d = {x: 1, hx: 2}   # creating dict will fail if `x == hx` raises TypeError
    assert d[x] == 1
    assert d[hx] == 2

    assertNE(())
    assertNE((1,))
    assertNE((x,))

    # == wrt non-hashable type also succeeds following std python where e.g. 's' == [1] gives False
    l = [1]
    with raises(TypeError):
        hash(l)
    assertNE(l)
# verify print for bstr/ustr. # verify print for bstr/ustr.
def test_strings_print(): def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt") outok = readfile(dir_testprog + "/golang_test_str.txt")
...@@ -191,6 +288,30 @@ def test_qq(): ...@@ -191,6 +288,30 @@ def test_qq():
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# ----------------------------------------
# verify that what we patched stay unaffected when
# called outside of bstr/ustr context.
def test_strings_patched_transparently():
    u_ = xunicode("мир")
    assert type(u_) is unicode

    # comparing regular unicode with itself gives regular bool results
    assert (u_ == u_) is True
    assert (u_ != u_) is False
    assert (u_ <  u_) is False
    assert (u_ >  u_) is False
    assert (u_ <= u_) is True
    assert (u_ >= u_) is True

    # the same for a different regular unicode, in both operand orders
    u2 = xunicode("май")
    assert type(u2) is unicode

    assert (u_ == u2) is False
    assert (u2 == u_) is False

    assert (u_ != u2) is True
    assert (u2 != u_) is True

    assert (u_ <  u2) is False
    assert (u2 <  u_) is True

    assert (u_ >  u2) is True
    assert (u2 >  u_) is False

    assert (u_ <= u2) is False
    assert (u2 <= u_) is True

    assert (u_ >= u2) is True
    assert (u2 >= u_) is False
# ---- benchmarks ---- # ---- benchmarks ----
# utf-8 decoding # utf-8 decoding
...@@ -224,3 +345,17 @@ def bench_bencode(b): ...@@ -224,3 +345,17 @@ def bench_bencode(b):
# unicode correspondingly to function name. # unicode correspondingly to function name.
def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x
def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x
# xstr returns string of the specified type typ built from text.
#
# typ must be one of bytes, unicode, bstr or ustr.
def xstr(text, typ):
    converters = {
        bytes:      xbytes,
        unicode:    xunicode,
        bstr:       b,
        ustr:       u,
    }
    s = converters[typ](text)
    assert type(s) is typ
    return s
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment