From 5703194d0ed3112ce78a67014e495ff1887c9e68 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Mon, 11 Sep 2017 17:11:37 +0200
Subject: [PATCH] Prevent character escape sequences from being resolved in raw
 f-strings (fr"..."). Also fix some error reporting issues along the way.
 Update test_fstring.py test file from Py3.7.

---
 CHANGES.rst                |  3 +++
 Cython/Compiler/Parsing.py | 46 +++++++++++++++++++++-----------------
 tests/run/test_fstring.pyx | 46 ++++++++++++++++++++++++++++----------
 3 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index ad0f9ddbb..744a7fae6 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -80,6 +80,9 @@ Bugs fixed
 * Compile time evaluations of (partially) constant f-strings could show incorrect
   results.
 
+* Escape sequences in raw f-strings (``fr'...'``) were resolved instead of passing
+  them through as expected.
+
 * Some ref-counting issues in buffer error handling have been resolved.
 
 Other changes
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index bf8c5a2a6..5d6b11f81 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -11,8 +11,8 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
                bytes_literal=object, StringEncoding=object,
                FileSourceDescriptor=object, lookup_unicodechar=object, unicode_category=object,
                Future=object, Options=object, error=object, warning=object,
-               Builtin=object, ModuleNode=object, Utils=object,
-               re=object, sys=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
+               Builtin=object, ModuleNode=object, Utils=object, _unicode=object, _bytes=object,
+               re=object, sys=object, _parse_escape_sequences=object, _parse_escape_sequences_raw=object,
                partial=object, reduce=object, _IS_PY3=cython.bint, _IS_2BYTE_UNICODE=cython.bint)
 
 from io import StringIO
@@ -1013,22 +1013,25 @@ def _append_escape_sequence(kind, builder, escape_sequence, s):
         builder.append(escape_sequence)
 
 
-_parse_escape_sequences = re.compile(
+_parse_escape_sequences_raw, _parse_escape_sequences = [re.compile((
     # escape sequences:
-    br'(\\(?:'
-    br'[\\abfnrtv"\'{]|'
-    br'[0-7]{2,3}|'
-    br'N\{[^}]*\}|'
-    br'x[0-9a-fA-F]{2}|'
-    br'u[0-9a-fA-F]{4}|'
-    br'U[0-9a-fA-F]{8}|'
-    br'[NuU]|'  # detect invalid escape sequences that do not match above
+    br'(\\(?:' +
+    (br'\\?' if is_raw else (
+        br'[\\abfnrtv"\'{]|'
+        br'[0-7]{2,3}|'
+        br'N\{[^}]*\}|'
+        br'x[0-9a-fA-F]{2}|'
+        br'u[0-9a-fA-F]{4}|'
+        br'U[0-9a-fA-F]{8}|'
+        br'[NxuU]|'  # detect invalid escape sequences that do not match above
+    )) +
     br')?|'
     # non-escape sequences:
     br'\{\{?|'
     br'\}\}?|'
-    br'[^\\{}]+)'.decode('us-ascii')
-).match
+    br'[^\\{}]+)'
+    ).decode('us-ascii')).match
+    for is_raw in (True, False)]
 
 
 def p_f_string(s, unicode_value, pos, is_raw):
@@ -1038,13 +1041,15 @@ def p_f_string(s, unicode_value, pos, is_raw):
     next_start = 0
     size = len(unicode_value)
     builder = StringEncoding.UnicodeLiteralBuilder()
+    error_pos = list(pos)  # [src, line, column]
+    _parse_seq = _parse_escape_sequences_raw if is_raw else _parse_escape_sequences
 
     while next_start < size:
         end = next_start
-        match = _parse_escape_sequences(unicode_value, next_start)
+        error_pos[2] = pos[2] + end  # FIXME: handle newlines in string
+        match = _parse_seq(unicode_value, next_start)
         if match is None:
-            error_pos = (pos[0], pos[1] + end, pos[2])  # FIXME: handle newlines in string
-            error(error_pos, "Invalid escape sequence")
+            error(tuple(error_pos), "Invalid escape sequence")
 
         next_start = match.end()
         part = match.group()
@@ -1068,8 +1073,7 @@ def p_f_string(s, unicode_value, pos, is_raw):
             if part == '}}':
                 builder.append('}')
             else:
-                error_pos = (pos[0], pos[1] + end, pos[2])  # FIXME: handle newlines in string
-                s.error("f-string: single '}' is not allowed", pos=error_pos)
+                s.error("f-string: single '}' is not allowed", pos=tuple(error_pos))
         else:
             builder.append(part)
 
@@ -1134,12 +1138,12 @@ def p_f_string_expr(s, unicode_value, pos, starting_index, is_raw):
     expr_pos = (pos[0], pos[1], pos[2] + starting_index + 2)  # TODO: find exact code position (concat, multi-line, ...)
 
     if not expr_str.strip():
-        error(pos, "empty expression not allowed in f-string")
+        error(expr_pos, "empty expression not allowed in f-string")
 
     if terminal_char == '!':
         i += 1
         if i + 2 > size:
-            error(pos, "invalid conversion char at end of string")
+            error(expr_pos, "invalid conversion char at end of string")
         else:
             conversion_char = unicode_value[i]
             i += 1
@@ -1152,7 +1156,7 @@ def p_f_string_expr(s, unicode_value, pos, starting_index, is_raw):
         start_format_spec = i + 1
         while True:
             if i >= size:
-                s.error("missing '}' in format specifier")
+                s.error("missing '}' in format specifier", pos=expr_pos)
             c = unicode_value[i]
             if not in_triple_quotes and not in_string:
                 if c == '{':
diff --git a/tests/run/test_fstring.pyx b/tests/run/test_fstring.pyx
index 8c46aea0a..309696c28 100644
--- a/tests/run/test_fstring.pyx
+++ b/tests/run/test_fstring.pyx
@@ -35,9 +35,10 @@ class TestCase(CythonTest):
             if exception_type is SyntaxError:
                 try:
                     self.fragment(str)
-                    assert held_errors(), "Invalid Cython code failed to raise SyntaxError: %s" % str
                 except CompileError:
                     assert True
+                else:
+                    assert held_errors(), "Invalid Cython code failed to raise SyntaxError: %r" % str
                 finally:
                     release_errors(ignore=True)
             else:
@@ -46,7 +47,7 @@ class TestCase(CythonTest):
                 except exception_type:
                     assert True
                 else:
-                    assert False, "Invalid Cython code failed to raise %s: %s" % (exception_type, str)
+                    assert False, "Invalid Cython code failed to raise %s: %r" % (exception_type, str)
                 finally:
                     if error_stack:
                         release_errors(ignore=True)
@@ -141,18 +142,9 @@ f'{a * x()}'"""
         self.assertTrue(g.__doc__ is None)
 
     def __test_literal_eval(self):
-        # With no expressions, an f-string is okay.
-        self.assertEqual(ast.literal_eval("f'x'"), 'x')
-        self.assertEqual(ast.literal_eval("f'x' 'y'"), 'xy')
-
-        # But this should raise an error.
         with self.assertRaisesRegex(ValueError, 'malformed node or string'):
             ast.literal_eval("f'x'")
 
-        # As should this, which uses a different ast node
-        with self.assertRaisesRegex(ValueError, 'malformed node or string'):
-            ast.literal_eval("f'{3}'")
-
     def __test_ast_compile_time_concat(self):
         x = ['']
 
@@ -354,6 +346,10 @@ f'{a * x()}'"""
                              "f'{10:{ }}'",
                              "f' { } '",
 
+                             # The Python parser also ignores the following
+                             # whitespace characters in addition to a space.
+                             "f'''{\t\f\r\n}'''",
+
                              # Catch the empty expression before the
                              #  invalid conversion.
                              "f'{!x}'",
@@ -374,6 +370,12 @@ f'{a * x()}'"""
                              "f'{:x'",
                              ])
 
+        # A different error message is raised for other whitespace characters.
+        self.assertAllRaise(SyntaxError, 'invalid character in identifier',
+                            ["f'''{\xa0}'''",
+                             #"\xa0",
+                             ])
+
     def test_parens_in_expressions(self):
         self.assertEqual(f'{3,}', '(3,)')
 
@@ -435,6 +437,20 @@ f'{a * x()}'"""
         self.assertEqual(f'2\x203', '2 3')
         self.assertEqual(f'\x203', ' 3')
 
+        #with self.assertWarns(DeprecationWarning):  # invalid escape sequence
+        #    value = cy_eval(r"f'\{6*7}'")
+        #self.assertEqual(value, '\\42')
+        self.assertEqual(f'\\{6*7}', '\\42')
+        self.assertEqual(fr'\{6*7}', '\\42')
+
+        AMPERSAND = 'spam'
+        # Get the right unicode character (&), or pick up local variable
+        # depending on the number of backslashes.
+        self.assertEqual(f'\N{AMPERSAND}', '&')
+        self.assertEqual(f'\\N{AMPERSAND}', '\\Nspam')
+        self.assertEqual(fr'\N{AMPERSAND}', '\\Nspam')
+        self.assertEqual(f'\\\N{AMPERSAND}', '\\&')
+
     def test_misformed_unicode_character_name(self):
         # These test are needed because unicode names are parsed
         # differently inside f-strings.
@@ -808,7 +824,8 @@ f'{a * x()}'"""
 
     def test_errors(self):
         # see issue 26287
-        self.assertAllRaise((TypeError, ValueError), 'non-empty',  # TypeError in Py3.4+
+        exc = ValueError if sys.version_info < (3, 4) else TypeError
+        self.assertAllRaise(exc, 'unsupported',
                             [r"f'{(lambda: 0):x}'",
                              r"f'{(0,):x}'",
                              ])
@@ -832,6 +849,11 @@ f'{a * x()}'"""
         self.assertEqual(f'{d["foo"]}', 'bar')
         self.assertEqual(f"{d['foo']}", 'bar')
 
+    def __test_backslash_char(self):
+        # Check eval of a backslash followed by a control char.
+        # See bpo-30682: this used to raise an assert in pydebug mode.
+        self.assertEqual(cy_eval('f"\\\n"'), '')
+        self.assertEqual(cy_eval('f"\\\r"'), '')
 
 if __name__ == '__main__':
     unittest.main()
-- 
2.30.9