Commit 496d3224 authored by Stefan Behnel

fix surrogates in Unicode literals in Python 3.3 (the UTF-8 codec rejects them explicitly)

parent 02a5f1e0
...@@ -24,6 +24,13 @@ Features added ...@@ -24,6 +24,13 @@ Features added
Bugs fixed Bugs fixed
---------- ----------
* Surrogate code points in Unicode string literals failed to compile and/or
load in CPython 3.3. To work around this change introduced by CPython,
Cython switched from UTF-8 to Python Unicode escapes ('\u0101') internally
for storing literal Unicode strings in C code. This may add a slight
initialisation overhead if a large number of non-Latin1 characters are
used in the code.
Other changes Other changes
------------- -------------
......
...@@ -991,7 +991,7 @@ class GlobalState(object): ...@@ -991,7 +991,7 @@ class GlobalState(object):
def get_string_const(self, text, py_version=None): def get_string_const(self, text, py_version=None):
# return a C string constant, creating a new one if necessary # return a C string constant, creating a new one if necessary
if text.is_unicode: if text.is_unicode:
byte_string = text.utf8encode() byte_string = text.escapeencode()
else: else:
byte_string = text.byteencode() byte_string = text.byteencode()
try: try:
...@@ -1006,7 +1006,7 @@ class GlobalState(object): ...@@ -1006,7 +1006,7 @@ class GlobalState(object):
# return a Python string constant, creating a new one if necessary # return a Python string constant, creating a new one if necessary
py3str_cstring = None py3str_cstring = None
if is_str and unicode_value is not None \ if is_str and unicode_value is not None \
and unicode_value.utf8encode() != text.byteencode(): and unicode_value.escapeencode() != text.byteencode():
py3str_cstring = self.get_string_const(unicode_value, py_version=3) py3str_cstring = self.get_string_const(unicode_value, py_version=3)
c_string = self.get_string_const(text, py_version=2) c_string = self.get_string_const(text, py_version=2)
else: else:
......
...@@ -12,6 +12,8 @@ else: ...@@ -12,6 +12,8 @@ else:
_unicode, _str, _bytes = unicode, str, str _unicode, _str, _bytes = unicode, str, str
IS_PYTHON3 = False IS_PYTHON3 = False
IS_PYTHON24 = sys.version_info[:2] < (2,5)
empty_bytes = _bytes() empty_bytes = _bytes()
empty_unicode = _unicode() empty_unicode = _unicode()
...@@ -126,6 +128,13 @@ class EncodedString(_unicode): ...@@ -126,6 +128,13 @@ class EncodedString(_unicode):
assert self.encoding is None assert self.encoding is None
return self.encode("UTF-8") return self.encode("UTF-8")
def escapeencode(self):
assert self.encoding is None
if IS_PYTHON24:
# work around bug in Py24 encoder
return self.replace(u'\\', u'\\\\').encode('unicode_escape')
return self.encode('unicode_escape')
def is_unicode(self): def is_unicode(self):
return self.encoding is None return self.encoding is None
is_unicode = property(is_unicode) is_unicode = property(is_unicode)
...@@ -147,6 +156,9 @@ class BytesLiteral(_bytes): ...@@ -147,6 +156,9 @@ class BytesLiteral(_bytes):
def utf8encode(self): def utf8encode(self):
assert False, "this is not a unicode string: %r" % self assert False, "this is not a unicode string: %r" % self
def escapeencode(self):
assert False, "this is not a unicode string: %r" % self
def __str__(self): def __str__(self):
"""Fake-decode the byte string to unicode to support % """Fake-decode the byte string to unicode to support %
formatting of unicode strings. formatting of unicode strings.
...@@ -165,6 +177,8 @@ char_from_escape_sequence = { ...@@ -165,6 +177,8 @@ char_from_escape_sequence = {
r'\v' : u'\v', r'\v' : u'\v',
}.get }.get
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
def _to_escape_sequence(s): def _to_escape_sequence(s):
if s in '\n\r\t': if s in '\n\r\t':
return repr(s)[1:-1] return repr(s)[1:-1]
...@@ -176,19 +190,22 @@ def _to_escape_sequence(s): ...@@ -176,19 +190,22 @@ def _to_escape_sequence(s):
# within a character sequence, oct passes much better than hex # within a character sequence, oct passes much better than hex
return ''.join(['\\%03o' % ord(c) for c in s]) return ''.join(['\\%03o' % ord(c) for c in s])
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32))) def _build_specials_replacer():
_c_special_replacements = [(orig.encode('ASCII'),
_to_escape_sequence(orig).encode('ASCII'))
for orig in _c_special ]
def _build_specials_test():
subexps = [] subexps = []
replacements = {}
for special in _c_special: for special in _c_special:
regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special]) regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
subexps.append(regexp) subexps.append(regexp)
return re.compile('|'.join(subexps).encode('ASCII')).search replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
def replace_specials(m):
return replacements[m.group(1)]
def replace(s):
return sub(replace_specials, s)
return replace
_has_specials = _build_specials_test() _replace_specials = _build_specials_replacer()
def escape_char(c): def escape_char(c):
if IS_PYTHON3: if IS_PYTHON3:
...@@ -210,10 +227,7 @@ def escape_byte_string(s): ...@@ -210,10 +227,7 @@ def escape_byte_string(s):
encoded as ISO-8859-1, will result in the correct byte sequence encoded as ISO-8859-1, will result in the correct byte sequence
being written. being written.
""" """
if _has_specials(s): s = _replace_specials(s)
for special, replacement in _c_special_replacements:
if special in s:
s = s.replace(special, replacement)
try: try:
return s.decode("ASCII") # trial decoding: plain ASCII => done return s.decode("ASCII") # trial decoding: plain ASCII => done
except UnicodeDecodeError: except UnicodeDecodeError:
......
...@@ -17,7 +17,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { ...@@ -17,7 +17,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
while (t->p) { while (t->p) {
#if PY_MAJOR_VERSION < 3 #if PY_MAJOR_VERSION < 3
if (t->is_unicode) { if (t->is_unicode) {
*t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
} else if (t->intern) { } else if (t->intern) {
*t->p = PyString_InternFromString(t->s); *t->p = PyString_InternFromString(t->s);
} else { } else {
...@@ -25,12 +25,13 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { ...@@ -25,12 +25,13 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
} }
#else /* Python 3+ has unicode identifiers */ #else /* Python 3+ has unicode identifiers */
if (t->is_unicode | t->is_str) { if (t->is_unicode | t->is_str) {
if (t->intern) { if (unlikely(t->encoding)) {
*t->p = PyUnicode_InternFromString(t->s);
} else if (t->encoding) {
*t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
} else { } else {
*t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
}
if (t->intern && likely(*t->p)) {
PyUnicode_InternInPlace(t->p);
} }
} else { } else {
*t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
......
...@@ -17,6 +17,10 @@ __doc__ = br""" ...@@ -17,6 +17,10 @@ __doc__ = br"""
u'\x03g\xf8\uf8d2S\xf8k ik' u'\x03g\xf8\uf8d2S\xf8k ik'
>>> f >>> f
u'\xf8' u'\xf8'
>>> g
u'\udc00'
>>> h
u'\ud800'
>>> add >>> add
u'S\xf8k ik\xfc\xd6\xe4abc' u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null >>> null
...@@ -36,6 +40,10 @@ __doc__ = br""" ...@@ -36,6 +40,10 @@ __doc__ = br"""
10 10
>>> len(f) >>> len(f)
1 1
>>> len(g)
1
>>> len(h)
1
>>> len(add) >>> len(add)
12 12
>>> len(null) >>> len(null)
...@@ -63,6 +71,10 @@ __doc__ = br""" ...@@ -63,6 +71,10 @@ __doc__ = br"""
True True
>>> f == u'\\xf8' # unescaped by Python >>> f == u'\\xf8' # unescaped by Python
True True
>>> g == u'\\udc00' # unescaped by Python (required by doctest)
True
>>> h == u'\\ud800' # unescaped by Python (required by doctest)
True
>>> add == u'Søk ik' + u'üÖä' + 'abc' >>> add == u'Søk ik' + u'üÖä' + 'abc'
True True
>>> null == u'\\x00' # unescaped by Python (required by doctest) >>> null == u'\\x00' # unescaped by Python (required by doctest)
...@@ -93,6 +105,8 @@ c = u'Søk ik' ...@@ -93,6 +105,8 @@ c = u'Søk ik'
d = u'üÖä' d = u'üÖä'
e = u'\x03\x67\xf8\uf8d2Søk ik' e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8' f = u'\xf8'
g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate
add = u'Søk ik' + u'üÖä' + u'abc' add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00' null = u'\x00'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment