From 297d7c2bf3f023ddea8cc98080d6998efee4007f Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Tue, 23 Apr 2019 22:39:11 +0200
Subject: [PATCH] Fix a compiler crash when non-ASCII characters appear in
 unprefixed strings in "3str" parsing mode.

---
 Cython/Compiler/Parsing.py                |  3 +-
 tests/run/cython3.pyx                     | 42 ++++++++++++++-
 tests/run/cython3_no_unicode_literals.pyx | 63 +++++++++++++++++++++++
 3 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 010870fa9..56884f0a2 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -958,7 +958,8 @@ def p_string_literal(s, kind_override=None):
             error(pos, u"invalid character literal: %r" % bytes_value)
     else:
         bytes_value, unicode_value = chars.getstrings()
-        if is_python3_source and has_non_ascii_literal_characters:
+        if (has_non_ascii_literal_characters
+                and is_python3_source and Future.unicode_literals in s.context.future_directives):
             # Python 3 forbids literal non-ASCII characters in byte strings
             if kind == 'b':
                 s.error("bytes can only contain ASCII literal characters.", pos=pos)
diff --git a/tests/run/cython3.pyx b/tests/run/cython3.pyx
index 0e8b97f78..335a2cfe8 100644
--- a/tests/run/cython3.pyx
+++ b/tests/run/cython3.pyx
@@ -21,7 +21,8 @@ True
 """
 
 import sys
-if sys.version_info[0] >= 3:
+IS_PY2 = sys.version_info[0] < 3
+if not IS_PY2:
     __doc__ = __doc__.replace(" u'", " '")
 
 def locals_function(a, b=2):
@@ -312,6 +313,45 @@ def unicode_literals():
     return ustring
 
 
+def non_ascii_unprefixed_str():
+    u"""
+    >>> s = non_ascii_unprefixed_str()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    3
+    """
+    s = 'ø\x20\u0020'  # unprefixed, non-raw literal: the doctest expects 3 characters, i.e. both escapes resolved
+    assert isinstance(s, unicode)  # the unprefixed literal must be a unicode string here
+    return s
+
+
+def non_ascii_raw_str():
+    u"""
+    >>> s = non_ascii_raw_str()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = r'ø\x20\u0020'  # raw literal: escapes stay verbatim, so 1 + 4 + 6 = 11 characters as the doctest expects
+    assert isinstance(s, unicode)  # raw but otherwise unprefixed: still expected to be a unicode string here
+    return s
+
+
+def non_ascii_raw_prefixed_unicode():
+    u"""
+    >>> s = non_ascii_raw_prefixed_unicode()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = ru'ø\x20\u0020'  # explicit raw + unicode prefix: the doctest expects 11 characters, escapes kept verbatim
+    assert isinstance(s, unicode)  # the 'u' prefix must yield a unicode string
+    return s
+
+
 def str_type_is_unicode():
     """
     >>> str_type, s = str_type_is_unicode()
diff --git a/tests/run/cython3_no_unicode_literals.pyx b/tests/run/cython3_no_unicode_literals.pyx
index ba6143931..c9690b305 100644
--- a/tests/run/cython3_no_unicode_literals.pyx
+++ b/tests/run/cython3_no_unicode_literals.pyx
@@ -13,6 +13,10 @@ b = 2
 x = 'abc'
 """
 
+import sys
+IS_PY2 = sys.version_info[0] < 3
+
+
 def locals_function(a, b=2):
     x = 'abc'
     return locals()
@@ -64,6 +68,65 @@ def no_unicode_literals():
     return str_string
 
 
+def non_ascii_str():
+    u"""
+    >>> s = 'ø\\x20\\u0020'
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 9 or len(s))  # first is 2-char bytes in Py2, hex escape is resolved
+    True
+    >>> print(IS_PY2 or len(s) == 3 or len(s))      # 3 unicode characters in Py3
+    True
+
+    >>> s = non_ascii_str()
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 9 or len(s))  # first is 2-char bytes in Py2, hex escape is resolved
+    True
+    >>> print(IS_PY2 or len(s) == 3 or len(s))      # 3 unicode characters in Py3
+    True
+    """
+    s = 'ø\x20\u0020'  # same literal as in the doctest; its length depends on the Python major version (9 vs. 3)
+    assert isinstance(s, str)  # the unprefixed literal must be the native 'str' type
+    assert (IS_PY2 and isinstance(s, bytes)) or (not IS_PY2 and isinstance(s, unicode))  # bytes on Py2, unicode on Py3
+    return s
+
+
+def non_ascii_raw_str():
+    u"""
+    >>> s = r'ø\\x20\\u0020'
+    >>> print(not IS_PY2 or len(s) == 12 or len(s))  # Py2 (first character is two bytes)
+    True
+    >>> print(IS_PY2 or len(s) == 11 or len(s))      # Py3 (unicode string)
+    True
+
+    >>> s = non_ascii_raw_str()
+    >>> isinstance(s, str)
+    True
+    >>> print(not IS_PY2 or len(s) == 12 or len(s))  # Py2 (first character is two bytes)
+    True
+    >>> print(IS_PY2 or len(s) == 11 or len(s))      # Py3 (unicode string)
+    True
+    """
+    s = r'ø\x20\u0020'  # raw variant of the literal: the doctest expects 12 items on Py2 and 11 on Py3
+    assert isinstance(s, str)  # the raw, otherwise unprefixed literal must be the native 'str' type
+    assert (IS_PY2 and isinstance(s, bytes)) or (not IS_PY2 and isinstance(s, unicode))  # bytes on Py2, unicode on Py3
+    return s
+
+
+def non_ascii_raw_unicode():
+    u"""
+    >>> s = non_ascii_raw_unicode()
+    >>> isinstance(s, bytes)
+    False
+    >>> len(s)
+    11
+    """
+    s = ru'ø\x20\u0020'  # explicit raw + unicode prefix: the doctest expects 11 characters on every Python version
+    assert isinstance(s, unicode)  # the 'u' prefix must yield a unicode string
+    return s
+
+
 def str_type_is_str():
     """
     >>> str_type, s = str_type_is_str()
-- 
2.30.9