adapt 'unicode' usage to Py2/Py3

595502fc · Stefan Behnel · e2922b0c
Commit 595502fc authored Jul 26, 2015 by Stefan Behnel
18 changed files
--- a/Cython/Build/Dependencies.py
+++ b/Cython/Build/Dependencies.py
@@ -56,7 +56,7 @@ if sys.version_info[0] < 3:
    if _fs_encoding is None:
        _fs_encoding = sys.getdefaultencoding()
    def encode_filename_in_py2(filename):
-        if isinstance(filename, unicode):
+        if not isinstance(filename, bytes):
            return filename.encode(_fs_encoding)
        return filename
 else:

--- a/Cython/Build/Inline.py
+++ b/Cython/Build/Inline.py
@@ -27,7 +27,7 @@ IS_PY3 = sys.version_info >= (3, 0)
 # A utility function to convert user-supplied ASCII strings to unicode.
 if sys.version_info[0] < 3:
    def to_unicode(s):
-        if not isinstance(s, unicode):
+        if isinstance(s, bytes):
            return s.decode('ascii')
        else:
            return s

--- a/Cython/CodeWriter.py
+++ b/Cython/CodeWriter.py
@@ -6,8 +6,11 @@ The output is in a strict format, no whitespace or comments from the input
 is preserved (and it could not be as it is not present in the code tree).
 """
-from Cython.Compiler.Visitor import TreeVisitor
+from __future__ import absolute_import, print_function
-from Cython.Compiler.ExprNodes import *
+from .Compiler.Visitor import TreeVisitor
+from .Compiler.ExprNodes import *
 class LinesResult(object):
    def __init__(self):
@@ -497,7 +500,7 @@ class CodeWriter(DeclarationWriter):
 class PxdWriter(DeclarationWriter):
    def __call__(self, node):
-        print u'\n'.join(self.write(node).lines)
+        print(u'\n'.join(self.write(node).lines))
        return node
    def visit_CFuncDefNode(self, node):
@@ -516,5 +519,3 @@ class PxdWriter(DeclarationWriter):
    def visit_StatNode(self, node):
        pass
--- a/Cython/Compiler/Annotate.py
+++ b/Cython/Compiler/Annotate.py
@@ -218,7 +218,7 @@ class AnnotationCCodeWriter(CCodeWriter):
        def annotate(match):
            group_name = match.lastgroup
            calls[group_name] += 1
-            return ur"<span class='%s'>%s</span>" % (
+            return u"<span class='%s'>%s</span>" % (
                group_name, match.group(group_name))
        lines = self._htmlify_code(cython_code).splitlines()
@@ -275,22 +275,22 @@ class AnnotationCCodeWriter(CCodeWriter):
        return outlist
-_parse_code = re.compile(
+_parse_code = re.compile((
-    ur'(?P<refnanny>__Pyx_X?(?:GOT|GIVE)REF|__Pyx_RefNanny[A-Za-z]+)|'
+    br'(?P<refnanny>__Pyx_X?(?:GOT|GIVE)REF|__Pyx_RefNanny[A-Za-z]+)|'
-    ur'(?P<trace>__Pyx_Trace[A-Za-z]+)|'
+    br'(?P<trace>__Pyx_Trace[A-Za-z]+)|'
-    ur'(?:'
+    br'(?:'
-    ur'(?P<pyx_macro_api>__Pyx_[A-Z][A-Z_]+)|'
+    br'(?P<pyx_macro_api>__Pyx_[A-Z][A-Z_]+)|'
-    ur'(?P<pyx_c_api>__Pyx_[A-Z][a-z_][A-Za-z_]*)|'
+    br'(?P<pyx_c_api>__Pyx_[A-Z][a-z_][A-Za-z_]*)|'
-    ur'(?P<py_macro_api>Py[A-Z][a-z]+_[A-Z][A-Z_]+)|'
+    br'(?P<py_macro_api>Py[A-Z][a-z]+_[A-Z][A-Z_]+)|'
-    ur'(?P<py_c_api>Py[A-Z][a-z]+_[A-Z][a-z][A-Za-z_]*)'
+    br'(?P<py_c_api>Py[A-Z][a-z]+_[A-Z][a-z][A-Za-z_]*)'
-    ur')(?=\()|'       # look-ahead to exclude subsequent '(' from replacement
+    br')(?=\()|'       # look-ahead to exclude subsequent '(' from replacement
-    ur'(?P<error_goto>(?:(?<=;) *if .* +)?\{__pyx_filename = .*goto __pyx_L\w+;\})'
+    br'(?P<error_goto>(?:(?<=;) *if .* +)?\{__pyx_filename = .*goto __pyx_L\w+;\})'
-).sub
+).decode('ascii')).sub
 _replace_pos_comment = re.compile(
    # this matches what Cython generates as code line marker comment
-    ur'^\s*/\*(?:(?:[^*]|\*[^/])*\n)+\s*\*/\s*\n',
+    br'^\s*/\*(?:(?:[^*]|\*[^/])*\n)+\s*\*/\s*\n'.decode('ascii'),
    re.M
 ).sub

--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -163,7 +163,7 @@ class UtilityCodeBase(object):
        if ext in ('.pyx', '.py', '.pxd', '.pxi'):
            comment = '#'
            strip_comments = partial(re.compile(r'^\s*#.*').sub, '')
-            rstrip = unicode.rstrip
+            rstrip = str.rstrip
        else:
            comment = '/'
            strip_comments = partial(re.compile(r'^\s*//.*|/\*[^*]*\*/').sub, '')
@@ -819,7 +819,7 @@ class PyObjectConst(object):
 cython.declare(possible_unicode_identifier=object, possible_bytes_identifier=object,
               replace_identifier=object, find_alphanums=object)
-possible_unicode_identifier = re.compile(ur"(?![0-9])\w+$", re.U).match
+possible_unicode_identifier = re.compile(br"(?![0-9])\w+$".decode('ascii'), re.U).match
 possible_bytes_identifier = re.compile(r"(?![0-9])\w+$".encode('ASCII')).match
 replace_identifier = re.compile(r'[^a-zA-Z0-9_]+').sub
 find_alphanums = re.compile('([a-zA-Z0-9]+)').findall
@@ -876,10 +876,10 @@ class StringConst(object):
        if identifier:
            intern = True
        elif identifier is None:
-            if isinstance(text, unicode):
+            if isinstance(text, bytes):
-                intern = bool(possible_unicode_identifier(text))
-            else:
                intern = bool(possible_bytes_identifier(text))
+            else:
+                intern = bool(possible_unicode_identifier(text))
        else:
            intern = False
        if intern:
@@ -2298,9 +2298,8 @@ class PyxCodeWriter(object):
    def getvalue(self):
        result = self.buffer.getvalue()
-        if not isinstance(result, unicode):
+        if isinstance(result, bytes):
            result = result.decode(self.encoding)
        return result
    def putln(self, line, context=None):

--- a/Cython/Compiler/Errors.py
+++ b/Cython/Compiler/Errors.py
@@ -4,6 +4,11 @@
 from __future__ import absolute_import
+try:
+    from __builtin__ import basestring as any_string_type
+except ImportError:
+    any_string_type = (bytes, str)
 import sys
 from ..Utils import open_new_file
@@ -21,7 +26,7 @@ class PyrexWarning(Exception):
 def context(position):
    source = position[0]
-    assert not (isinstance(source, unicode) or isinstance(source, str)), (
+    assert not (isinstance(source, any_string_type)), (
        "Please replace filename strings with Scanning.FileSourceDescriptor instances %r" % source)
    try:
        F = source.get_lines()
@@ -167,7 +172,7 @@ def report_error(err):
 def error(position, message):
-    #print "Errors.error:", repr(position), repr(message) ###
+    #print("Errors.error:", repr(position), repr(message)) ###
    if position is None:
        raise InternalError(message)
    err = CompileError(position, message)

--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -45,12 +45,12 @@ from .DebugFlags import debug_disposal_code, debug_temp_alloc, \
 try:
    from __builtin__ import basestring
 except ImportError:
-    basestring = str # Python 3
+    # Python 3
+    basestring = str
-try:
+    any_string_type = (bytes, str)
-    from builtins import bytes
+else:
-except ImportError:
+    # Python 2
-    bytes = str # Python 2
+    any_string_type = (bytes, unicode)
 if sys.version_info[0] >= 3:
@@ -1216,7 +1216,7 @@ class FloatNode(ConstNode):
    def get_constant_c_result_code(self):
        strval = self.value
-        assert isinstance(strval, (str, unicode))
+        assert isinstance(strval, basestring)
        cmpval = repr(float(strval))
        if cmpval == 'nan':
            return "(Py_HUGE_VAL * 0)"
@@ -10907,8 +10907,8 @@ class CmpNode(object):
    def calculate_cascaded_constant_result(self, operand1_result):
        func = compile_time_binary_operators[self.operator]
        operand2_result = self.operand2.constant_result
-        if (isinstance(operand1_result, (bytes, unicode)) and
+        if (isinstance(operand1_result, any_string_type) and
-                isinstance(operand2_result, (bytes, unicode)) and
+                isinstance(operand2_result, any_string_type) and
                type(operand1_result) != type(operand2_result)):
            # string comparison of different types isn't portable
            return

--- a/Cython/Compiler/ParseTreeTransforms.py
+++ b/Cython/Compiler/ParseTreeTransforms.py
@@ -6,7 +6,7 @@ import cython
 cython.declare(PyrexTypes=object, Naming=object, ExprNodes=object, Nodes=object,
               Options=object, UtilNodes=object, LetNode=object,
               LetRefNode=object, TreeFragment=object, EncodedString=object,
-               error=object, warning=object, copy=object)
+               error=object, warning=object, copy=object, _unicode=object)
 from . import PyrexTypes
 from . import Naming
@@ -19,7 +19,7 @@ from .Visitor import VisitorTransform, TreeVisitor
 from .Visitor import CythonTransform, EnvTransform, ScopeTrackingTransform
 from .UtilNodes import LetNode, LetRefNode, ResultRefNode
 from .TreeFragment import TreeFragment
-from .StringEncoding import EncodedString
+from .StringEncoding import EncodedString, _unicode
 from .Errors import error, warning, CompileError, InternalError
 from .Code import UtilityCode
@@ -663,7 +663,7 @@ class InterpretCompilerDirectives(CythonTransform, SkipDeclarations):
        self.parallel_directives = {}
        directives = copy.deepcopy(Options.directive_defaults)
        for key, value in compilation_directive_defaults.items():
-            directives[unicode(key)] = copy.deepcopy(value)
+            directives[_unicode(key)] = copy.deepcopy(value)
        self.directives = directives
    def check_directive_scope(self, pos, directive, scope):

--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -7,7 +7,7 @@ from __future__ import absolute_import
 import cython
 cython.declare(make_lexicon=object, lexicon=object,
-               any_string_prefix=unicode, IDENT=unicode,
+               any_string_prefix=cython.unicode, IDENT=cython.unicode,
               print_function=object, error=object, warning=object,
               os=object, platform=object)

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -8,10 +8,10 @@ import re
 import sys
 if sys.version_info[0] >= 3:
-    _unicode, _str, _bytes = str, str, bytes
+    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
 else:
-    _unicode, _str, _bytes = unicode, str, str
+    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False
 empty_bytes = _bytes()
@@ -39,13 +39,13 @@ class UnicodeLiteralBuilder(object):
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
-                self.chars.append( unichr((char_number // 1024) + 0xD800) )
+                self.chars.append( _unichr((char_number // 1024) + 0xD800) )
-                self.chars.append( unichr((char_number  % 1024) + 0xDC00) )
+                self.chars.append( _unichr((char_number  % 1024) + 0xDC00) )
            else:
-                self.chars.append( unichr(char_number) )
+                self.chars.append( _unichr(char_number) )
    else:
        def append_charval(self, char_number):
-            self.chars.append( unichr(char_number) )
+            self.chars.append( _unichr(char_number) )
    def append_uescape(self, char_number, escape_string):
        self.append_charval(char_number)
@@ -71,7 +71,7 @@ class BytesLiteralBuilder(object):
        self.chars.append(characters)
    def append_charval(self, char_number):
-        self.chars.append( unichr(char_number).encode('ISO-8859-1') )
+        self.chars.append( _unichr(char_number).encode('ISO-8859-1') )
    def append_uescape(self, char_number, escape_string):
        self.append(escape_string)
@@ -311,4 +311,4 @@ def encode_pyunicode_string(s):
    if utf16 == utf32:
        utf16 = []
-    return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))
+    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
--- a/Cython/Compiler/TreeFragment.py
+++ b/Cython/Compiler/TreeFragment.py
@@ -17,6 +17,7 @@ from . import PyrexTypes
 from .Visitor import VisitorTransform
 from .Nodes import Node, StatListNode
 from .ExprNodes import NameNode
+from .StringEncoding import _unicode
 from . import Parsing
 from . import Main
 from . import UtilNodes
@@ -59,7 +60,7 @@ def parse_from_strings(name, code, pxds={}, level=None, initial_pos=None,
    # to use a unicode string so that code fragments don't have to bother
    # with encoding. This means that test code passed in should not have an
    # encoding header.
-    assert isinstance(code, unicode), "unicode code snippets only please"
+    assert isinstance(code, _unicode), "unicode code snippets only please"
    encoding = "UTF-8"
    module_name = name
@@ -198,7 +199,7 @@ def copy_code_tree(node):
    return TreeCopier()(node)
-_match_indent = re.compile(ur"^ *").match
+_match_indent = re.compile(u"^ *").match
 def strip_common_indent(lines):
@@ -214,7 +215,7 @@ class TreeFragment(object):
    def __init__(self, code, name=None, pxds={}, temps=[], pipeline=[], level=None, initial_pos=None):
        if not name:
            name = "(tree fragment)"
-        if isinstance(code, unicode):
+        if isinstance(code, _unicode):
            def fmt(x): return u"\n".join(strip_common_indent(x.split(u"\n")))
            fmt_code = fmt(code)

--- a/Cython/Plex/Machines.py
+++ b/Cython/Plex/Machines.py
@@ -17,6 +17,11 @@ try:
 except ImportError:
    from sys import maxint
+try:
+    unichr
+except NameError:
+    unichr = chr
 LOWEST_PRIORITY = -maxint

--- a/Cython/Runtime/refnanny.pyx
+++ b/Cython/Runtime/refnanny.pyx
+# cython: language_level=3
 from cpython.ref cimport PyObject, Py_INCREF, Py_DECREF, Py_XDECREF, Py_XINCREF
 from cpython.exc cimport PyErr_Fetch, PyErr_Restore
 from cpython.pystate cimport PyThreadState_Get
@@ -72,7 +74,7 @@ cdef void report_unraisable(object e=None):
        if e is None:
            import sys
            e = sys.exc_info()[1]
-        print u"refnanny raised an exception: %s" % e
+        print(u"refnanny raised an exception: %s" % e)
    except:
        pass # We absolutely cannot exit with an exception
@@ -159,9 +161,10 @@ cdef void FinishContext(PyObject** ctx):
            context = <Context>ctx[0]
            errors = context.end()
            if errors:
-                print u"%s: %s()" % (context.filename.decode('latin1'),
+                print(u"%s: %s()" % (
-                                     context.name.decode('latin1'))
+                    context.filename.decode('latin1'),
-                print errors
+                    context.name.decode('latin1')))
+                print(errors)
            context = None
        except:
            report_unraisable()

--- a/Cython/Tempita/_tempita.py
+++ b/Cython/Tempita/_tempita.py
@@ -43,7 +43,7 @@ import tokenize
 from io import StringIO
 from ._looper import looper
-from .compat3 import bytes, basestring_, next, is_unicode, coerce_text
+from .compat3 import bytes, unicode, basestring_, next, is_unicode, coerce_text
 __all__ = ['TemplateError', 'Template', 'sub', 'HTMLTemplate',
           'sub_html', 'html', 'bunch']

--- a/Cython/Tempita/compat3.py
+++ b/Cython/Tempita/compat3.py
 import sys
-__all__ = ['b', 'basestring_', 'bytes', 'next', 'is_unicode']
+__all__ = ['b', 'basestring_', 'bytes', 'unicode', 'next', 'is_unicode']
 if sys.version < "3":
    b = bytes = str
    basestring_ = basestring
+    unicode = unicode
 else:
    def b(s):
@@ -13,6 +14,7 @@ else:
        return bytes(s)
    basestring_ = (bytes, str)
    bytes = bytes
+    unicode = str
 text = str
 if sys.version < "3":

--- a/Cython/Tests/TestJediTyper.py
+++ b/Cython/Tests/TestJediTyper.py
@@ -20,7 +20,7 @@ TOOLS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..',
 @contextmanager
 def _tempfile(code):
    code = dedent(code)
-    if isinstance(code, unicode):
+    if not isinstance(code, bytes):
        code = code.encode('utf8')
    with NamedTemporaryFile(suffix='.py') as f:

--- a/Cython/Utility/MemoryView.pyx
+++ b/Cython/Utility/MemoryView.pyx
@@ -131,8 +131,8 @@ cdef class array:
        if itemsize <= 0:
            raise ValueError("itemsize <= 0 for cython.array")
-        if isinstance(format, unicode):
+        if not isinstance(format, bytes):
-            format = (<unicode>format).encode('ASCII')
+            format = format.encode('ASCII')
        self._format = format  # keep a reference to the byte string
        self.format = self._format

--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -187,15 +187,14 @@ def path_exists(path):
 # file name encodings
 def decode_filename(filename):
-    if isinstance(filename, unicode):
+    if isinstance(filename, bytes):
-        return filename
+        try:
-    try:
+            filename_encoding = sys.getfilesystemencoding()
-        filename_encoding = sys.getfilesystemencoding()
+            if filename_encoding is None:
-        if filename_encoding is None:
+                filename_encoding = sys.getdefaultencoding()
-            filename_encoding = sys.getdefaultencoding()
+            filename = filename.decode(filename_encoding)
-        filename = filename.decode(filename_encoding)
+        except UnicodeDecodeError:
-    except UnicodeDecodeError:
+            pass
-        pass
    return filename
 # support for source file encoding detection