Commit 04be919b authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: bstr/ustr index access

Implement access to bstr/ustr by [index] and by slice. Such [index]
access - similarly to standard str - returns the same bstr/ustr
type with one character:

  - ustr[i] returns ustr with one unicode character taken from i'th character of original string, while
  - bstr[i] returns bstr with one byte taken from i'th byte of original bytestring.

This follows str/unicode semantics on both py2/py3, bytes semantics on
py2, but diverges from bytes semantics on py3. I originally tried to
follow bytes/py3 semantics - for bstr to return an integer instead of
a 1-byte character, but later found several compatibility breakages due
to it. I contemplated this divergence for a long time and finally
decided to follow string semantics for both ustr and bstr. This
preserves backward compatibility with Python2 and also allows bstr
to be practically a drop-in replacement for the str type.

To get the ordinal corresponding to a retrieved character, one can use
the standard `ord`, e.g. as in `ord(bstr[i])`. This will always return an
integer for all of bstr/ustr/str/unicode. Similarly to the standard `chr`
and `unichr`, we also provide two utility functions - `uchr` and `bbyte` -
to create a 1-character ustr and a 1-byte bstr respectively.
parent 105d03d4
...@@ -240,6 +240,10 @@ The conversion, in both encoding and decoding, never fails and never looses ...@@ -240,6 +240,10 @@ The conversion, in both encoding and decoding, never fails and never looses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8. even if bytes data is not valid UTF-8.
Semantically `bstr` is array of bytes, while `ustr` is array of
unicode-characters. Accessing their elements by `[index]` yields byte and
unicode character correspondingly [*]_.
Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
...@@ -262,6 +266,8 @@ Usage example:: ...@@ -262,6 +266,8 @@ Usage example::
... # (*) the decoding never fails nor looses information. ... # (*) the decoding never fails nor looses information.
.. [*] `unicode` on Python2, `str` on Python3. .. [*] `unicode` on Python2, `str` on Python3.
.. [*] | ordinal of such byte and unicode character can be obtained via regular `ord`.
| For completeness `bbyte` and `uchr` are also provided for constructing 1-byte `bstr` and 1-character `ustr` from ordinal.
.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string. .. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string.
| Notice that only explicit conversion through `b` and `u` accept objects with buffer interface. Automatic coercion does not. | Notice that only explicit conversion through `b` and `u` accept objects with buffer interface. Automatic coercion does not.
......
...@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import ...@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import
__version__ = "0.1" __version__ = "0.1"
__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic', __all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'gimport'] 'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport']
from golang._gopath import gimport # make gimport available from golang from golang._gopath import gimport # make gimport available from golang
import inspect, sys import inspect, sys
...@@ -317,8 +317,10 @@ from ._golang import \ ...@@ -317,8 +317,10 @@ from ._golang import \
pyerror as error, \ pyerror as error, \
pyb as b, \ pyb as b, \
pybstr as bstr, \ pybstr as bstr, \
pybbyte as bbyte, \
pyu as u, \ pyu as u, \
pyustr as ustr pyustr as ustr, \
pyuchr as uchr
# import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency # import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency
def _(): def _():
......
...@@ -174,6 +174,15 @@ cdef __pystr(object obj): # -> ~str ...@@ -174,6 +174,15 @@ cdef __pystr(object obj): # -> ~str
return pyb(obj) return pyb(obj)
def pybbyte(int i): # -> 1-byte bstr
    """bbyte(i) returns 1-byte bstr with ordinal i.

    The byte is materialized via a 1-element bytearray, so an ordinal
    outside of the 0..255 range is rejected with ValueError.
    """
    onebyte = bytearray((i,))
    return pyb(onebyte)
def pyuchr(int i): # -> 1-character ustr
    """uchr(i) returns 1-character ustr with unicode ordinal i.

    It is the ustr counterpart of standard unichr: the character is
    created by unichr and then converted to ustr.
    """
    onechar = unichr(i)
    return pyu(onechar)
# XXX cannot `cdef class`: github.com/cython/cython/issues/711 # XXX cannot `cdef class`: github.com/cython/cython/issues/711
class pybstr(bytes): class pybstr(bytes):
"""bstr is byte-string. """bstr is byte-string.
...@@ -185,6 +194,9 @@ class pybstr(bytes): ...@@ -185,6 +194,9 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
Semantically bstr is array of bytes. Accessing its elements by [index]
yields byte character.
Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr. Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
...@@ -253,6 +265,21 @@ class pybstr(bytes): ...@@ -253,6 +265,21 @@ class pybstr(bytes):
def __le__(a, b): return bytes.__le__(a, _pyb_coerce(b)) def __le__(a, b): return bytes.__le__(a, _pyb_coerce(b))
def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b)) def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
    """self[idx] returns bstr for both slice and single-index access.

    Slice access yields the selected bytes as bstr.
    Index access yields 1-byte bstr, not an integer, on both py2 and py3.
    """
    item = bytes.__getitem__(self, idx)

    # bytes[i] returns 1-character bytestring(py2) or int(py3).
    # Only the py3 non-slice case gives an int, which has to be turned
    # back into 1-byte bytestring; everything else is already bytes.
    if type(idx) is not slice and PY_MAJOR_VERSION >= 3:
        return pybbyte(item)

    return pyb(item)
# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799 # XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
class pyustr(unicode): class pyustr(unicode):
...@@ -265,6 +292,9 @@ class pyustr(unicode): ...@@ -265,6 +292,9 @@ class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
ustr is similar to standard unicode type - accessing its
elements by [index] yields unicode characters.
Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr. Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
...@@ -324,6 +354,12 @@ class pyustr(unicode): ...@@ -324,6 +354,12 @@ class pyustr(unicode):
def __le__(a, b): return unicode.__le__(a, _pyu_coerce(b)) def __le__(a, b): return unicode.__le__(a, _pyu_coerce(b))
def __ge__(a, b): return unicode.__ge__(a, _pyu_coerce(b)) def __ge__(a, b): return unicode.__ge__(a, _pyu_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
    """self[idx] returns ustr for both slice and single-index access.

    The element or slice is taken with standard unicode indexing and
    then converted to ustr.
    """
    piece = unicode.__getitem__(self, idx)
    return pyu(piece)
# _bdata/_udata retrieve raw data from bytes/unicode. # _bdata/_udata retrieve raw data from bytes/unicode.
def _bdata(obj): # -> bytes def _bdata(obj): # -> bytes
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
import golang import golang
from golang import b, u, bstr, ustr from golang import b, u, bstr, ustr, bbyte, uchr
from golang._golang import _udata, _bdata from golang._golang import _udata, _bdata
from golang.gcompat import qq from golang.gcompat import qq
from golang.strconv_test import byterange from golang.strconv_test import byterange
...@@ -29,7 +29,7 @@ from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE ...@@ -29,7 +29,7 @@ from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises, mark, skip from pytest import raises, mark, skip
import sys import sys
import six import six
from six import text_type as unicode from six import text_type as unicode, unichr
from six.moves import range as xrange from six.moves import range as xrange
import array import array
...@@ -271,6 +271,119 @@ def test_strings_memoryview(): ...@@ -271,6 +271,119 @@ def test_strings_memoryview():
assert _(5) == 0x80 assert _(5) == 0x80
# verify that ord on bstr/ustr works as expected.
def test_strings_ord():
    """Check that builtin ord works on 1-element bstr/ustr and rejects the rest."""
    # empty and 2-element strings are not accepted by ord
    with raises(TypeError):
        ord(b(''))
    with raises(TypeError):
        ord(u(''))
    with raises(TypeError):
        ord(b('ab'))
    with raises(TypeError):
        ord(u('ab'))

    # ASCII character: 1 byte and 1 unicode character
    assert ord(b('a')) == 97
    assert ord(u('a')) == 97

    # non-ASCII character: 1 unicode character, but more than 1 byte
    with raises(TypeError):
        ord(b('м'))  # 2 bytes, not 1
    assert ord(u('м')) == 1084

    # every byte value roundtrips through 1-byte bstr
    for code in range(0x100):
        onebyte = b(bytearray([code]))
        assert len(onebyte) == 1
        assert ord(onebyte) == code

    # every ordinal of 2-bytes range roundtrips through 1-character ustr
    for code in range(0x10000):
        onechar = u(unichr(code))
        assert len(onechar) == 1
        assert ord(onechar) == code
# verify bbyte.
def test_strings_bbyte():
    """Check that bbyte creates 1-byte bstr and rejects out-of-range ordinals."""
    # ordinals just outside of byte range on both sides are rejected
    with raises(ValueError):
        bbyte(-1)
    with raises(ValueError):
        bbyte(0x100)

    # all valid byte values
    for code in range(0x100):
        onebyte = bbyte(code)
        assert type(onebyte) is bstr
        assert len(onebyte) == 1
        assert ord(onebyte) == code
        assert onebyte == bytearray([code])
# verify uchr.
def test_strings_uchr():
    """Check that uchr creates 1-character ustr and mirrors standard unichr."""
    # negative ordinal is rejected by standard unichr, and uchr must reject
    # it the same way. Previously only unichr was checked here, so uchr
    # itself was never exercised with an invalid ordinal.
    with raises(ValueError):
        unichr(-1)
    with raises(ValueError):
        uchr(-1)

    # upper limit depends on whether python was built with ucs as 2-bytes or 4-bytes long
    # but at least it all should work for small 2-bytes range
    for i in range(0x10000):
        ui = uchr(i)
        assert type(ui) is ustr
        assert len(ui) == 1
        assert ord(ui) == i
        assert ui == unichr(i)
# verify strings access by index.
def test_strings_index():
    # Every access on bstr/ustr is mirrored on the corresponding standard
    # string (b_/u_) and the two results are compared, so the standard types
    # serve as the reference for the content while the result type is
    # verified to be bstr/ustr.
    us = u("миру мир"); u_ = u"миру мир"
    bs = b("миру мир"); b_ = xbytes("миру мир")

    # the text is 8 unicode characters and 15 bytes
    assert len(us) == 8; assert len(u_) == 8
    assert len(bs) == 15; assert len(b_) == 15

    # u/unicode [idx] -> unicode character
    def uidx(i):
        x = us[i]; assert type(x) is ustr
        y = u_[i]; assert type(y) is unicode
        assert x == y
        return x
    for i, x in enumerate(['м','и','р','у',' ','м','и','р']):
        assert uidx(i) == x

    # b/bytes [idx] -> bytechar of byte value @ position idx
    def bidx(i):
        x = bs[i]; assert type(x) is bstr; assert len(x) == 1
        y = b_[i]
        if six.PY3:
            y = bytes([y]) # on py3 bytes[i] returns int instead of 1-byte string
        assert type(y) is bytes; assert len(y) == 1
        assert x == y
        return x
    for i, x in enumerate([0xd0,0xbc,0xd0,0xb8,0xd1,0x80,0xd1,0x83,0x20,0xd0,0xbc,0xd0,0xb8,0xd1,0x80]):
        assert bidx(i) == bytearray([x])

    # u/unicode [:] -> unicode string
    # USlice forwards `_[key]` to both us and u_, so the slice expressions
    # below are written only once.
    class USlice:
        def __getitem__(self, key):
            x = us[key]; assert type(x) is ustr
            y = u_[key]; assert type(y) is unicode
            assert x == y
            return x
        # NOTE(review): presumably needed because py2 consults len() when
        # resolving negative indices of simple slices - confirm.
        def __len__(self): # py2
            x = len(us)
            y = len(u_)
            assert x == y
            return x
    _ = USlice()
    assert _[:] == u"миру мир"
    assert _[1:] == u"иру мир"
    assert _[:-1] == u"миру ми"
    assert _[2:5] == u"ру "
    assert _[1:-1:2]== u"иум"

    # b/bytes [:] -> bytestring
    # BSlice is the same forwarding helper for bs and b_.
    class BSlice:
        def __getitem__(self, key):
            x = bs[key]; assert type(x) is bstr
            y = b_[key]; assert type(y) is bytes
            assert x == y
            return x
        def __len__(self): # py2
            x = len(bs)
            y = len(b_)
            assert x == y
            return x
    _ = BSlice()
    assert _[:] == "миру мир"
    # slice boundaries below deliberately fall in the middle of 2-byte
    # UTF-8 sequences: slicing of bstr works on bytes, not on characters
    assert _[1:] == b'\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1\x80'
    assert _[:-1] == b'\xd0\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1'
    assert _[3:12] == b'\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0'
    assert _[1:-1:2]== b'\xbc\xb8\x80\x83\xd0\xd0\xd1'
# verify string operations like `x + y` for all combinations of pairs from # verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std # bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled # python types, e.g. (bytes, unicode), because those combinations are handled
......
...@@ -73,6 +73,8 @@ def test_golang_builtins(): ...@@ -73,6 +73,8 @@ def test_golang_builtins():
assert u is golang.u assert u is golang.u
assert bstr is golang.bstr assert bstr is golang.bstr
assert ustr is golang.ustr assert ustr is golang.ustr
assert bbyte is golang.bbyte
assert uchr is golang.uchr
# indirectly verify golang.__all__ # indirectly verify golang.__all__
for k in golang.__all__: for k in golang.__all__:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment