Optimise %-formatting of strings into f-strings when possible.

c7a1e68b · Stefan Behnel · 7e653adb · c7a1e68b · c7a1e68b · c7a1e68b
Commit c7a1e68b authored Feb 25, 2018 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 113 additions and 2 deletions

CHANGES.rst CHANGES.rst +2 -0

Cython/Compiler/Optimize.py Cython/Compiler/Optimize.py +74 -1

tests/run/fstring.pyx tests/run/fstring.pyx +37 -1

No files found.
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -63,6 +63,8 @@ Features added

 * Formatting C enum values in f-strings is faster, as well as some other special cases.

+* String formatting with the '%' operator is optimised into f-strings in simple cases.
+
 * Subscripting (item access) is faster in some cases.

 * Some ``bytearray`` operations have been optimised similar to ``bytes``.

--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
 from __future__ import absolute_import

+import re
 import sys
 import copy
 import codecs
@@ -29,7 +30,7 @@ from . import Options

 from .Code import UtilityCode, TempitaUtilityCode
 from .StringEncoding import EncodedString, bytes_literal, encoded_string
-from .Errors import error
+from .Errors import error, warning
 from .ParseTreeTransforms import SkipDeclarations

 try:
@@ -4250,6 +4251,78 @@ class ConstantFolding(Visitor.VisitorTransform, SkipDeclarations):
                sequence_node.mult_factor = factor
        return sequence_node

+    def visit_ModNode(self, node):
+        self.visitchildren(node)
+        if isinstance(node.operand1, ExprNodes.UnicodeNode) and isinstance(node.operand2, ExprNodes.TupleNode):
+            if not node.operand2.mult_factor:
+                fstring = self._build_fstring(node.operand1.pos, node.operand1.value, node.operand2.args)
+                if fstring is not None:
+                    return fstring
+        return node
+
+    # Pattern passed to re.split() by _build_fstring() to cut a %-format string into
+    # literal text and '%...' placeholder tokens.  The whole placeholder is one capturing
+    # group, so re.split() keeps the placeholders in its result list.
+    # The final '.' also matches characters that are not supported format types (and the
+    # second '%' of '%%'); the caller decides what to do with each token.
+    _parse_string_format_regex = (
+        u'(%(?:'            # %...
+        u'(?:[0-9]+|[ ])?'  # width (optional) or space prefix fill character (optional)
+        u'(?:[.][0-9]+)?'   # precision (optional)
+        u')?.)'             # format type (or something different for unsupported formats)
+    )
+
+    def _build_fstring(self, pos, ustring, format_args):
+        # Issues formatting warnings instead of errors since we really only catch a few errors by accident.
+        args = iter(format_args)
+        substrings = []
+        can_be_optimised = True
+        for s in re.split(self._parse_string_format_regex, ustring):
+            if not s:
+                continue
+            if s == u'%%':
+                substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString(u'%'), constant_result=u'%'))
+                continue
+            if s[0] != u'%':
+                if s[-1] == u'%':
+                    warning(pos, "Incomplete format: '...%s'" % s[-3:], level=1)
+                    can_be_optimised = False
+                substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString(s), constant_result=s))
+                continue
+            format_type = s[-1]
+            try:
+                arg = next(args)
+            except StopIteration:
+                warning(pos, "Too few arguments for format placeholders", level=1)
+                can_be_optimised = False
+                break
+            if format_type in u'srfdoxX':
+                format_spec = s[1:]
+                if format_type in u'doxX' and u'.' in format_spec:
+                    # Precision is not allowed for integers in format(), but ok in %-formatting.
+                    can_be_optimised = False
+                elif format_type in u'rs':
+                    format_spec = format_spec[:-1]
+                substrings.append(ExprNodes.FormattedValueNode(
+                    arg.pos, value=arg,
+                    conversion_char=format_type if format_type in u'rs' else None,
+                    format_spec=ExprNodes.UnicodeNode(
+                        pos, value=EncodedString(format_spec), constant_result=format_spec)
+                        if format_spec else None,
+                ))
+            else:
+                # keep it simple for now ...
+                can_be_optimised = False
+
+        if not can_be_optimised:
+            # Print all warnings we can find before finally giving up here.
+            return None
+
+        try:
+            next(args)
+        except StopIteration: pass
+        else:
+            warning(pos, "Too many arguments for format placeholders", level=1)
+            return None
+
+        node = ExprNodes.JoinedStrNode(pos, values=substrings)
+        return self.visit_JoinedStrNode(node)
+
    def visit_FormattedValueNode(self, node):
        self.visitchildren(node)
        conversion_char = node.conversion_char or 's'

--- a/tests/run/fstring.pyx
+++ b/tests/run/fstring.pyx
 # mode: run
-# tag: f_strings, pep498
+# tag: f_strings, pep498, werror

 ####
 # Cython specific PEP 498 tests in addition to test_fstring.pyx from CPython
@@ -444,3 +444,39 @@ def format_decoded_bytes(bytes value):
    U-xyz
    """
    return f"U-{value.decode('utf-8')}"
+
+
+# Formats a C int, a unicode string and a generic object with a '%' format string
+# applied to a tuple of arguments.  The decorators assert that no ModNode/AddNode
+# is found in the tree and that FormattedValueNode/JoinedStrNode are, i.e. that the
+# '%' operation was turned into an f-string.  The doctest compares the function
+# result with the same '%' expression evaluated by the doctest itself.
+@cython.test_fail_if_path_exists(
+    "//AddNode",
+    "//ModNode",
+)
+@cython.test_assert_path_exists(
+    "//FormattedValueNode",
+    "//JoinedStrNode",
+)
+def generated_fstring(int i, unicode u not None, o):
+    """
+    >>> i, u, o = 11, u'xyz', [1]
+    >>> print(((
+    ...     u"(i) %s-%.3s-%r-%.3r-%d-%3d-%o-%04o-%x-%4x-%X-%03X-%.1f-%04.2f %% "
+    ...     u"(u) %s-%.2s-%r-%.7r %% "
+    ...     u"(o) %s-%.2s-%r-%.2r"
+    ... ) % (
+    ...     i, i, i, i, i, i, i, i, i, i, i, i, i, i,
+    ...     u, u, u, u,
+    ...     o, o, o, o,
+    ... )).replace("-u'xyz'", "-'xyz'"))
+    (i) 11-11-11-11-11- 11-13-0013-b-   b-B-00B-11.0-11.00 % (u) xyz-xy-'xyz'-'xyz' % (o) [1]-[1-[1]-[1
+
+    >>> print(generated_fstring(i, u, o).replace("-u'xyz'", "-'xyz'"))
+    (i) 11-11-11-11-11- 11-13-0013-b-   b-B-00B-11.0-11.00 % (u) xyz-xy-'xyz'-'xyz' % (o) [1]-[1-[1]-[1
+    """
+    return (
+        u"(i) %s-%.3s-%r-%.3r-%d-%3d-%o-%04o-%x-%4x-%X-%03X-%.1f-%04.2f %% "
+        u"(u) %s-%.2s-%r-%.7r %% "
+        u"(o) %s-%.2s-%r-%.2r"
+    ) % (
+        i, i, i, i, i, i, i, i, i, i, i, i, i, i,
+        u, u, u, u,
+        o, o, o, o,
+    )