Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cython
Commits
ca8763a2
Commit
ca8763a2
authored
Sep 02, 2019
by
da-woods
Committed by
Stefan Behnel
Sep 02, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Handle normalization of unicode identifiers (GH-3096)
parent
00c1dc96
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
177 additions
and
7 deletions
+177
-7
Cython/Compiler/Lexicon.py
Cython/Compiler/Lexicon.py
+1
-1
Cython/Compiler/Scanning.py
Cython/Compiler/Scanning.py
+8
-0
Cython/Utils.py
Cython/Utils.py
+2
-2
runtests.py
runtests.py
+16
-4
tests/errors/unicode_identifiers_e1.pyx
tests/errors/unicode_identifiers_e1.pyx
+8
-0
tests/errors/unicode_identifiers_e2.pyx
tests/errors/unicode_identifiers_e2.pyx
+9
-0
tests/errors/unicode_identifiers_e3.pyx
tests/errors/unicode_identifiers_e3.pyx
+11
-0
tests/errors/unicode_identifiers_e4.pyx
tests/errors/unicode_identifiers_e4.pyx
+13
-0
tests/run/unicode_identifiers.pyx
tests/run/unicode_identifiers.pyx
+28
-0
tests/run/unicode_identifiers_normalization.srctree
tests/run/unicode_identifiers_normalization.srctree
+81
-0
No files found.
Cython/Compiler/Lexicon.py
View file @
ca8763a2
...
@@ -85,7 +85,7 @@ def make_lexicon():
...
@@ -85,7 +85,7 @@ def make_lexicon():
comment
=
Str
(
"#"
)
+
Rep
(
AnyBut
(
"
\
n
"
))
comment
=
Str
(
"#"
)
+
Rep
(
AnyBut
(
"
\
n
"
))
return
Lexicon
([
return
Lexicon
([
(
name
,
IDENT
),
(
name
,
Method
(
'normalize_ident'
)
),
(
intliteral
,
Method
(
'strip_underscores'
,
symbol
=
'INT'
)),
(
intliteral
,
Method
(
'strip_underscores'
,
symbol
=
'INT'
)),
(
fltconst
,
Method
(
'strip_underscores'
,
symbol
=
'FLOAT'
)),
(
fltconst
,
Method
(
'strip_underscores'
,
symbol
=
'FLOAT'
)),
(
imagconst
,
Method
(
'strip_underscores'
,
symbol
=
'IMAG'
)),
(
imagconst
,
Method
(
'strip_underscores'
,
symbol
=
'IMAG'
)),
...
...
Cython/Compiler/Scanning.py
View file @
ca8763a2
...
@@ -12,6 +12,7 @@ cython.declare(make_lexicon=object, lexicon=object,
...
@@ -12,6 +12,7 @@ cython.declare(make_lexicon=object, lexicon=object,
import
os
import
os
import
platform
import
platform
from
unicodedata
import
normalize
from
..
import
Utils
from
..
import
Utils
from
..Plex.Scanners
import
Scanner
from
..Plex.Scanners
import
Scanner
...
@@ -341,6 +342,13 @@ class PyrexScanner(Scanner):
...
@@ -341,6 +342,13 @@ class PyrexScanner(Scanner):
self
.
sy
=
''
self
.
sy
=
''
self
.
next
()
self
.
next
()
def
normalize_ident
(
self
,
text
):
try
:
text
.
encode
(
'ascii'
)
# really just name.isascii but supports Python 2 and 3
except
UnicodeEncodeError
:
text
=
normalize
(
'NFKC'
,
text
)
self
.
produce
(
IDENT
,
text
)
def
commentline
(
self
,
text
):
def
commentline
(
self
,
text
):
if
self
.
parse_comments
:
if
self
.
parse_comments
:
self
.
produce
(
'commentline'
,
text
)
self
.
produce
(
'commentline'
,
text
)
...
...
Cython/Utils.py
View file @
ca8763a2
...
@@ -216,7 +216,7 @@ def decode_filename(filename):
...
@@ -216,7 +216,7 @@ def decode_filename(filename):
_match_file_encoding
=
re
.
compile
(
br"(\
w*codi
ng)[:=]\
s*([-
\w.]+)"
).
search
_match_file_encoding
=
re
.
compile
(
br"(\
w*codi
ng)[:=]\
s*([-
\w.]+)"
).
search
def
detect_opened_file_encoding
(
f
):
def
detect_opened_file_encoding
(
f
,
default
=
'UTF-8'
):
# PEPs 263 and 3120
# PEPs 263 and 3120
# Most of the time the first two lines fall in the first couple of hundred chars,
# Most of the time the first two lines fall in the first couple of hundred chars,
# and this bulk read/split is much faster.
# and this bulk read/split is much faster.
...
@@ -236,7 +236,7 @@ def detect_opened_file_encoding(f):
...
@@ -236,7 +236,7 @@ def detect_opened_file_encoding(f):
m
=
_match_file_encoding
(
lines
[
1
])
m
=
_match_file_encoding
(
lines
[
1
])
if
m
:
if
m
:
return
m
.
group
(
2
).
decode
(
'iso8859-1'
)
return
m
.
group
(
2
).
decode
(
'iso8859-1'
)
return
"UTF-8"
return
default
def
skip_bom
(
f
):
def
skip_bom
(
f
):
...
...
runtests.py
View file @
ca8763a2
...
@@ -545,9 +545,14 @@ class build_ext(_build_ext):
...
@@ -545,9 +545,14 @@ class build_ext(_build_ext):
class ErrorWriter(object):
class ErrorWriter(object):
match_error = re.compile(r'(warning:)?(?:.*:)?
\
s*([-
0
-9]+)
\
s*:
\
s*([-0-9]+)
\
s*:
\
s*(.*)').match
match_error = re.compile(r'(warning:)?(?:.*:)?
\
s*([-
0
-9]+)
\
s*:
\
s*([-0-9]+)
\
s*:
\
s*(.*)').match
def __init__(self):
def __init__(self
, encoding=None
):
self.output = []
self.output = []
self.write = self.output.append
self.encoding = encoding
def write(self, value):
if self.encoding:
value = value.encode('ISO-8859-1').decode(self.encoding)
self.output.append(value)
def _collect(self):
def _collect(self):
s = ''.join(self.output)
s = ''.join(self.output)
...
@@ -1002,6 +1007,13 @@ class CythonCompileTestCase(unittest.TestCase):
...
@@ -1002,6 +1007,13 @@ class CythonCompileTestCase(unittest.TestCase):
def split_source_and_output(self, test_directory, module, workdir):
def split_source_and_output(self, test_directory, module, workdir):
source_file = self.find_module_source_file(os.path.join(test_directory, module) + '.pyx')
source_file = self.find_module_source_file(os.path.join(test_directory, module) + '.pyx')
from Cython.Utils import detect_opened_file_encoding
with io_open(source_file, 'rb') as f:
# encoding is passed to ErrorWriter but not used on the source
# since it is sometimes deliberately wrong
encoding = detect_opened_file_encoding(f, default=None)
with io_open(source_file, 'r', encoding='ISO-8859-1') as source_and_output:
with io_open(source_file, 'r', encoding='ISO-8859-1') as source_and_output:
error_writer = warnings_writer = None
error_writer = warnings_writer = None
out = io_open(os.path.join(workdir, module + os.path.splitext(source_file)[1]),
out = io_open(os.path.join(workdir, module + os.path.splitext(source_file)[1]),
...
@@ -1010,10 +1022,10 @@ class CythonCompileTestCase(unittest.TestCase):
...
@@ -1010,10 +1022,10 @@ class CythonCompileTestCase(unittest.TestCase):
for line in source_and_output:
for line in source_and_output:
if line.startswith("_ERRORS"):
if line.startswith("_ERRORS"):
out.close()
out.close()
out = error_writer = ErrorWriter()
out = error_writer = ErrorWriter(
encoding=encoding
)
elif line.startswith("_WARNINGS"):
elif line.startswith("_WARNINGS"):
out.close()
out.close()
out = warnings_writer = ErrorWriter()
out = warnings_writer = ErrorWriter(
encoding=encoding
)
else:
else:
out.write(line)
out.write(line)
finally:
finally:
...
...
tests/errors/unicode_identifiers_e1.pyx
0 → 100644
View file @
ca8763a2
# -*- coding: utf-8 -*-
# mode: error
★
1
=
5
# invalid start symbol
_ERRORS
=
u"""
4:0: Unrecognized character
"""
tests/errors/unicode_identifiers_e2.pyx
0 → 100644
View file @
ca8763a2
# -*- coding: utf-8 -*-
# mode: error
class
MyClass
₡
:
# invalid continue symbol
pass
_ERRORS
=
u"""
4:13: Unrecognized character
"""
tests/errors/unicode_identifiers_e3.pyx
0 → 100644
View file @
ca8763a2
# -*- coding: utf-8 -*-
# mode: error
def
f
():
a
=
1
́
b
=
2
# looks like an indentation error but is actually a combining accent as the first letter of column 4
c
=
3
_ERRORS
=
u"""
6:4: Unrecognized character
"""
tests/errors/unicode_identifiers_e4.pyx
0 → 100644
View file @
ca8763a2
# -*- coding: utf-8 -*-
# mode: error
cdef
class
C
:
# these two symbols "\u1e69" and "\u1e9b\u0323" normalize to the same thing
# so the two attributes can't coexist
cdef
int
ṩ
omething
cdef
double
ẛ̣
omething
_ERRORS
=
u"""
7:13: Previous declaration is here
8:16: 'ṩomething' redeclared
"""
tests/run/unicode_identifiers.pyx
View file @
ca8763a2
...
@@ -49,6 +49,12 @@ if sys.version_info[0]>2:
...
@@ -49,6 +49,12 @@ if sys.version_info[0]>2:
10
10
>>> NormalClassΓΓ().εxciting_function(None).__qualname__
>>> NormalClassΓΓ().εxciting_function(None).__qualname__
'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
Do kwargs work?
>>> unicode_kwarg(αrg=5)
5
>>> unicode_kwarg_from_cy()
1
"""
"""
else
:
else
:
__doc__
=
""
__doc__
=
""
...
@@ -184,6 +190,28 @@ class NormalClassΓΓ(Γναμε2):
...
@@ -184,6 +190,28 @@ class NormalClassΓΓ(Γναμε2):
pass
pass
return nestεd
return nestεd
def unicode_kwarg(*,αrg):
return αrg
def unicode_kwarg_from_cy():
return unicode_kwarg(αrg=1)
cdef class NormalizeAttrCdef:
"""
Python
normalizes
identifier
names
before
they
are
used
;
therefore
ﬁ
and
fi
should
access
the
same
attribute
.
A
more
comprehensive
version
of
this
is
in
"unicode_identifiers_normalization.srctree"
comparing
the
behaviour
to
Python
.
The
version
here
shows
it
behaves
the
same
in
a
cdef
class
and
is
tested
with
Python
2
>>>
NormalizeAttrCdef
().
get
()
5
"""
cdef int ﬁ # note unicode ligature symbol
def __init__(self):
self.fi = 5
def get(self):
return self.fi
if sys.version_info[0]<=2:
if sys.version_info[0]<=2:
# These symbols are causing problems for doctest
# These symbols are causing problems for doctest
del NormalClassΓΓ
del NormalClassΓΓ
...
...
tests/run/unicode_identifiers_normalization.srctree
0 → 100644
View file @
ca8763a2
# -*- coding: utf-8 -*-
# mode: run
# tag: pure3.0, pep3131
PYTHON build_tests.py
# show behaviour in Python mode
PYTHON -m doctest test0.py
PYTHON -m doctest test1.py
PYTHON -m doctest test2.py
PYTHON setup.py build_ext --inplace
# test in Cython mode
PYTHON -c "import doctest; import test0 as m; exit(doctest.testmod(m)[0])"
PYTHON -c "import doctest; import test1 as m; exit(doctest.testmod(m)[0])"
PYTHON -c "import doctest; import test2 as m; exit(doctest.testmod(m)[0])"
########## setup.py #########
from Cython.Build.Dependencies import cythonize
from distutils.core import setup
setup(
ext_modules = cythonize("test*.py"),
)
######### build_tests.py ########
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import unicodedata
# a few pairs of unicode strings that should be equivalent after normalization
string_pairs = [("ﬁ", "fi"), # ligature and two letters
("a\u0301", '\u00e1'), # a with acute accent with combining character or as 1 character
("α\u0334\u0362", "α\u0362\u0334") # alpha with a pair of combining characters
# in a different order. No single character to normalize to
]
# Show that the pairs genuinely aren't equal before normalization
for sp in string_pairs:
assert sp[0] != sp[1]
assert unicodedata.normalize('NFKC', sp[0]) == unicodedata.normalize('NFKC', sp[1])
# some code that accesses the identifiers through the two different names
# contains doctests
example_code = [
"""
class C:
'''
>>> C().get()
True
'''
def __init__(self):
self.{0} = True
def get(self):
return self.{1}
""", """
def pass_through({0}):
'''
>>> pass_through(True)
True
'''
return {1}
""", """
import cython
{0} = True
def test():
'''
>>> test()
True
'''
return {1}
"""]
for idx in range(len(example_code)):
with open("test{0}.py".format(idx),"w") as f:
if sys.version_info[0] > 2:
f.write("# -*- coding: utf-8 -*-\n")
f.write(example_code[idx].format(*string_pairs[idx]))
else:
f.write("\n") # code isn't Python 2 compatible - write a dummy file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment