From 297d7c2bf3f023ddea8cc98080d6998efee4007f Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Tue, 23 Apr 2019 22:39:11 +0200
Subject: [PATCH] Fix a compiler crash when non-ASCII characters appear in unprefixed strings in "3str" parsing mode.

---
 Cython/Compiler/Parsing.py                |  3 +-
 tests/run/cython3.pyx                     | 42 ++++++++++++++-
 tests/run/cython3_no_unicode_literals.pyx | 63 +++++++++++++++++++++++
 3 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 010870fa9..56884f0a2 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -958,7 +958,8 @@ def p_string_literal(s, kind_override=None):
             error(pos, u"invalid character literal: %r" % bytes_value)
     else:
         bytes_value, unicode_value = chars.getstrings()
-        if is_python3_source and has_non_ascii_literal_characters:
+        if (has_non_ascii_literal_characters
+                and is_python3_source and Future.unicode_literals in s.context.future_directives):
             # Python 3 forbids literal non-ASCII characters in byte strings
             if kind == 'b':
                 s.error("bytes can only contain ASCII literal characters.", pos=pos)
diff --git a/tests/run/cython3.pyx b/tests/run/cython3.pyx
index 0e8b97f78..335a2cfe8 100644
--- a/tests/run/cython3.pyx
+++ b/tests/run/cython3.pyx
@@ -21,7 +21,8 @@ True
 """
 
 import sys
-if sys.version_info[0] >= 3:
+IS_PY2 = sys.version_info[0] < 3
+if not IS_PY2:
     __doc__ = __doc__.replace(" u'", " '")
 
 def locals_function(a, b=2):
@@ -312,6 +313,45 @@ def unicode_literals():
     return ustring
 
 
+def non_ascii_unprefixed_str():
+    u"""
+    >>> s = non_ascii_unprefixed_str()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    3
+    """
+    s = 'ø\x20\u0020'
+    assert isinstance(s, unicode)
+    return s
+
+
+def non_ascii_raw_str():
+    u"""
+    >>> s = non_ascii_raw_str()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = r'ø\x20\u0020'
+    assert isinstance(s, unicode)
+    return s
+
+
+def non_ascii_raw_prefixed_unicode():
+    u"""
+    >>> s = non_ascii_raw_prefixed_unicode()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = ru'ø\x20\u0020'
+    assert isinstance(s, unicode)
+    return s
+
+
 def str_type_is_unicode():
     """
     >>> str_type, s = str_type_is_unicode()
diff --git a/tests/run/cython3_no_unicode_literals.pyx b/tests/run/cython3_no_unicode_literals.pyx
index ba6143931..c9690b305 100644
--- a/tests/run/cython3_no_unicode_literals.pyx
+++ b/tests/run/cython3_no_unicode_literals.pyx
@@ -13,6 +13,10 @@ b = 2
 x = 'abc'
 """
 
+import sys
+IS_PY2 = sys.version_info[0] < 3
+
+
 def locals_function(a, b=2):
     x = 'abc'
     return locals()
@@ -64,6 +68,65 @@ def no_unicode_literals():
     return str_string
 
 
+def non_ascii_str():
+    u"""
+    >>> s = 'ø\\x20\\u0020'
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 9 or len(s))  # first is 2-char bytes in Py2, hex escape is resolved
+    True
+    >>> print(IS_PY2 or len(s) == 3 or len(s))  # 3 unicode characters in Py3
+    True
+
+    >>> s = non_ascii_str()
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 9 or len(s))  # first is 2-char bytes in Py2, hex escape is resolved
+    True
+    >>> print(IS_PY2 or len(s) == 3 or len(s))  # 3 unicode characters in Py3
+    True
+    """
+    s = 'ø\x20\u0020'
+    assert isinstance(s, str)
+    assert (IS_PY2 and isinstance(s, bytes)) or (not IS_PY2 and isinstance(s, unicode))
+    return s
+
+
+def non_ascii_raw_str():
+    u"""
+    >>> s = r'ø\\x20\\u0020'
+    >>> print(not IS_PY2 or len(s) == 12 or len(s))  # Py2 (first character is two bytes)
+    True
+    >>> print(IS_PY2 or len(s) == 11 or len(s))  # Py3 (unicode string)
+    True
+
+    >>> s = non_ascii_raw_str()
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 12 or len(s))  # Py2 (first character is two bytes)
+    True
+    >>> print(IS_PY2 or len(s) == 11 or len(s))  # Py3 (unicode string)
+    True
+    """
+    s = r'ø\x20\u0020'
+    assert isinstance(s, str)
+    assert (IS_PY2 and isinstance(s, bytes)) or (not IS_PY2 and isinstance(s, unicode))
+    return s
+
+
+def non_ascii_raw_unicode():
+    u"""
+    >>> s = non_ascii_raw_unicode()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = ru'ø\x20\u0020'
+    assert isinstance(s, unicode)
+    return s
+
+
 def str_type_is_str():
     """
     >>> str_type, s = str_type_is_str()
-- 
2.30.9