Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
nexedi
cython
Commits
70ea30b6
Commit
70ea30b6
authored
Apr 22, 2008
by
Stefan Behnel
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
source code encoding support (PEP 263) and UTF-8 default source encoding (PEP 3120)
parent
2986d78b
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
162 additions
and
82 deletions
+162
-82
Cython/Compiler/ExprNodes.py
Cython/Compiler/ExprNodes.py
+27
-3
Cython/Compiler/Main.py
Cython/Compiler/Main.py
+20
-4
Cython/Compiler/ModuleNode.py
Cython/Compiler/ModuleNode.py
+1
-1
Cython/Compiler/Nodes.py
Cython/Compiler/Nodes.py
+11
-7
Cython/Compiler/Parsing.py
Cython/Compiler/Parsing.py
+33
-58
Cython/Compiler/PyrexTypes.py
Cython/Compiler/PyrexTypes.py
+57
-3
Cython/Compiler/Scanning.py
Cython/Compiler/Scanning.py
+2
-1
Cython/Compiler/Symtab.py
Cython/Compiler/Symtab.py
+11
-5
No files found.
Cython/Compiler/ExprNodes.py
View file @
70ea30b6
...
...
@@ -18,6 +18,29 @@ from Cython.Debugging import print_call_chain
from
DebugFlags
import
debug_disposal_code
,
debug_temp_alloc
,
\
debug_coercion
class
EncodedString
(
unicode
):
# unicode string subclass to keep track of the original encoding.
# 'encoding' is None for unicode strings and the source encoding
# otherwise
encoding
=
None
def
byteencode
(
self
):
assert
self
.
encoding
is
not
None
return
self
.
encode
(
self
.
encoding
)
def
utf8encode
(
self
):
assert
self
.
encoding
is
None
return
self
.
encode
(
"UTF-8"
)
def
is_unicode
(
self
):
return
self
.
encoding
is
None
is_unicode
=
property
(
is_unicode
)
# def __eq__(self, other):
# return unicode.__eq__(self, other) and \
# getattr(other, 'encoding', '') == self.encoding
class
ExprNode
(
Node
):
# subexprs [string] Class var holding names of subexpr node attrs
# type PyrexType Type of the result
...
...
@@ -696,15 +719,16 @@ class StringNode(ConstNode):
type
=
PyrexTypes
.
c_char_ptr_type
def
compile_time_value
(
self
,
denv
):
return
eval
(
'"%s"'
%
self
.
value
)
return
self
.
value
def
analyse_types
(
self
,
env
):
self
.
entry
=
env
.
add_string_const
(
self
.
value
)
def
coerce_to
(
self
,
dst_type
,
env
):
if
dst_type
.
is_int
:
if
not
self
.
type
.
is_pyobject
and
len
(
self
.
value
)
==
1
:
return
CharNode
(
self
.
pos
,
value
=
self
.
value
)
if
not
self
.
type
.
is_pyobject
and
len
(
self
.
entry
.
init
)
==
1
:
# we use the *encoded* value here
return
CharNode
(
self
.
pos
,
value
=
self
.
entry
.
init
)
else
:
error
(
self
.
pos
,
"Only coerce single-character ascii strings can be used as ints."
)
return
self
...
...
Cython/Compiler/Main.py
View file @
70ea30b6
...
...
@@ -2,12 +2,11 @@
# Cython Top Level
#
import
os
,
sys
,
re
import
os
,
sys
,
re
,
codecs
if
sys
.
version_info
[:
2
]
<
(
2
,
2
):
print
>>
sys
.
stderr
,
"Sorry, Cython requires Python 2.2 or later"
sys
.
exit
(
1
)
import
os
from
time
import
time
import
Version
from
Scanning
import
PyrexScanner
...
...
@@ -138,10 +137,27 @@ class Context:
self
.
modules
[
name
]
=
scope
return
scope
match_file_encoding
=
re
.
compile
(
"coding[:=]
\
s*([-
\
w.]+)"
).
search
def
detect_file_encoding
(
self
,
source_filename
):
# PEPs 263 and 3120
f
=
codecs
.
open
(
source_filename
,
"rU"
,
encoding
=
"UTF-8"
)
try
:
for
line_no
,
line
in
enumerate
(
f
):
encoding
=
self
.
match_file_encoding
(
line
)
if
encoding
:
return
encoding
.
group
(
1
)
if
line_no
==
1
:
break
finally
:
f
.
close
()
return
"UTF-8"
def
parse
(
self
,
source_filename
,
type_names
,
pxd
,
full_module_name
):
# Parse the given source file and return a parse tree.
f
=
open
(
source_filename
,
"rU"
)
s
=
PyrexScanner
(
f
,
source_filename
,
encoding
=
self
.
detect_file_encoding
(
source_filename
)
f
=
codecs
.
open
(
source_filename
,
"rU"
,
encoding
=
encoding
)
s
=
PyrexScanner
(
f
,
source_filename
,
source_encoding
=
encoding
,
type_names
=
type_names
,
context
=
self
)
try
:
tree
=
Parsing
.
p_module
(
s
,
pxd
,
full_module_name
)
...
...
Cython/Compiler/ModuleNode.py
View file @
70ea30b6
...
...
@@ -1270,7 +1270,7 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
entry
.
pystring_cname
,
entry
.
cname
,
entry
.
cname
,
isinstance
(
entry
.
init
,
unicode
)
entry
.
type
.
is_unicode
))
code
.
putln
(
"{0, 0, 0, 0}"
)
...
...
Cython/Compiler/Nodes.py
View file @
70ea30b6
...
...
@@ -1199,7 +1199,7 @@ class DefNode(FuncDefNode):
# args [CArgDeclNode] formal arguments
# star_arg PyArgDeclNode or None * argument
# starstar_arg PyArgDeclNode or None ** argument
# doc
s
tring or None
# doc
EncodedS
tring or None
# body StatListNode
#
# The following subnode is constructed internally
...
...
@@ -1358,12 +1358,15 @@ class DefNode(FuncDefNode):
entry
.
pymethdef_cname
=
\
Naming
.
pymethdef_prefix
+
prefix
+
name
if
not
Options
.
docstrings
:
self
.
entry
.
doc
=
None
entry
.
doc
=
None
else
:
if
Options
.
embed_pos_in_docstring
:
entry
.
doc
=
'File: %s (starting at line %s)'
%
relative_position
(
self
.
pos
)
doc
=
u
'File: %s (starting at line %s)'
%
relative_position
(
self
.
pos
)
if
not
self
.
doc
is
None
:
entry
.
doc
=
entry
.
doc
+
'
\
\
n'
+
self
.
doc
doc
=
doc
+
u'
\
\
n'
+
self
.
doc
doc
=
ExprNodes
.
EncodedString
(
doc
)
doc
.
encoding
=
self
.
doc
.
encoding
entry
.
doc
=
doc
else
:
entry
.
doc
=
self
.
doc
entry
.
doc_cname
=
\
...
...
@@ -1920,8 +1923,9 @@ class PyClassDefNode(StatNode, BlockNode):
self
.
dict
=
ExprNodes
.
DictNode
(
pos
,
key_value_pairs
=
[])
if
self
.
doc
and
Options
.
docstrings
:
if
Options
.
embed_pos_in_docstring
:
doc
=
'File: %s (starting at line %s)'
%
relative_position
(
self
.
pos
)
doc
=
doc
+
'
\
\
n'
+
self
.
doc
doc
=
u'File: %s (starting at line %s)'
%
relative_position
(
self
.
pos
)
doc
=
ExprNodes
.
EncodedString
(
doc
+
'u
\
\
n'
+
self
.
doc
)
doc
.
encoding
=
self
.
doc
.
encoding
doc_node
=
ExprNodes
.
StringNode
(
pos
,
value
=
doc
)
else
:
doc_node
=
None
...
...
@@ -2073,7 +2077,7 @@ class PropertyNode(StatNode):
# Definition of a property in an extension type.
#
# name string
# doc
s
tring or None Doc string
# doc
EncodedS
tring or None Doc string
# body StatListNode
child_attrs
=
[
"body"
]
...
...
Cython/Compiler/Parsing.py
View file @
70ea30b6
...
...
@@ -281,8 +281,10 @@ def p_call(s, function):
if
not
arg
.
is_name
:
s
.
error
(
"Expected an identifier before '='"
,
pos
=
arg
.
pos
)
encoded_name
=
ExprNodes
.
EncodedString
(
arg
.
name
)
encoded_name
.
encoding
=
s
.
source_encoding
keyword
=
ExprNodes
.
StringNode
(
arg
.
pos
,
value
=
arg
.
name
)
value
=
encoded_
name
)
arg
=
p_simple_expr
(
s
)
keyword_args
.
append
((
keyword
,
arg
))
else
:
...
...
@@ -459,7 +461,7 @@ def p_atom(s):
value
=
s
.
systring
[:
-
1
]
s
.
next
()
return
ExprNodes
.
ImagNode
(
pos
,
value
=
value
)
elif
sy
==
'
STRING'
or
sy
==
'
BEGIN_STRING'
:
elif
sy
==
'BEGIN_STRING'
:
kind
,
value
=
p_cat_string_literal
(
s
)
if
kind
==
'c'
:
return
ExprNodes
.
CharNode
(
pos
,
value
=
value
)
...
...
@@ -500,7 +502,12 @@ def p_name(s, name):
elif
isinstance
(
value
,
float
):
return
ExprNodes
.
FloatNode
(
pos
,
value
=
rep
)
elif
isinstance
(
value
,
str
):
return
ExprNodes
.
StringNode
(
pos
,
value
=
rep
[
1
:
-
1
])
sval
=
ExprNodes
.
EncodedString
(
rep
[
1
:
-
1
])
sval
.
encoding
=
value
.
encoding
return
ExprNodes
.
StringNode
(
pos
,
value
=
sval
)
elif
isinstance
(
value
,
unicode
):
sval
=
ExprNodes
.
EncodedString
(
rep
[
2
:
-
1
])
return
ExprNodes
.
StringNode
(
pos
,
value
=
sval
)
else
:
error
(
pos
,
"Invalid type for compile-time constant: %s"
%
value
.
__class__
.
__name__
)
...
...
@@ -508,21 +515,25 @@ def p_name(s, name):
def
p_cat_string_literal
(
s
):
# A sequence of one or more adjacent string literals.
# Returns (kind, value) where kind in ('', 'c', 'r')
# Returns (kind, value) where kind in ('', 'c', 'r'
, 'u'
)
kind
,
value
=
p_string_literal
(
s
)
if
kind
!=
'c'
:
strings
=
[
value
]
while
s
.
sy
==
'
STRING'
or
s
.
sy
==
'
BEGIN_STRING'
:
while
s
.
sy
==
'BEGIN_STRING'
:
next_kind
,
next_value
=
p_string_literal
(
s
)
if
next_kind
==
'c'
:
self
.
error
(
"Cannot concatenate char literal with another string or char literal"
)
elif
next_kind
==
'u'
:
kind
=
'u'
strings
.
append
(
next_value
)
value
=
''
.
join
(
strings
)
value
=
ExprNodes
.
EncodedString
(
u''
.
join
(
strings
)
)
if
kind
!=
'u'
:
value
.
encoding
=
s
.
source_encoding
return
kind
,
value
def
p_opt_string_literal
(
s
):
if
s
.
sy
==
'
STRING'
or
s
.
sy
==
'
BEGIN_STRING'
:
if
s
.
sy
==
'BEGIN_STRING'
:
return
p_string_literal
(
s
)
else
:
return
None
...
...
@@ -530,10 +541,6 @@ def p_opt_string_literal(s):
def
p_string_literal
(
s
):
# A single string or char literal.
# Returns (kind, value) where kind in ('', 'c', 'r', 'u')
if
s
.
sy
==
'STRING'
:
value
=
unquote
(
s
.
systring
)
s
.
next
()
return
value
# s.sy == 'BEGIN_STRING'
pos
=
s
.
position
()
#is_raw = s.systring[:1].lower() == "r"
...
...
@@ -549,8 +556,6 @@ def p_string_literal(s):
systr
=
s
.
systring
if
len
(
systr
)
==
1
and
systr
in
"'
\
"
\
n
"
:
chars
.
append
(
'
\
\
'
)
if
kind
==
'u'
and
not
isinstance
(
systr
,
unicode
):
systr
=
systr
.
decode
(
"UTF-8"
)
chars
.
append
(
systr
)
elif
sy
==
'ESCAPE'
:
systr
=
s
.
systring
...
...
@@ -572,7 +577,8 @@ def p_string_literal(s):
elif
c
in
'ux'
:
if
kind
==
'u'
:
try
:
chars
.
append
(
systr
.
decode
(
'unicode_escape'
))
chars
.
append
(
systr
.
encode
(
"ASCII"
).
decode
(
'unicode_escape'
))
except
UnicodeDecodeError
:
s
.
error
(
"Invalid unicode escape '%s'"
%
systr
,
pos
=
pos
)
...
...
@@ -593,50 +599,12 @@ def p_string_literal(s):
"Unexpected token %r:%r in string literal"
%
(
sy
,
s
.
systring
))
s
.
next
()
value
=
''
.
join
(
chars
)
value
=
ExprNodes
.
EncodedString
(
u''
.
join
(
chars
)
)
if
kind
!=
'u'
:
value
.
encoding
=
s
.
source_encoding
#print "p_string_literal: value =", repr(value) ###
return
kind
,
value
def
unquote
(
s
):
is_raw
=
0
if
s
[:
1
].
lower
()
==
"r"
:
is_raw
=
1
s
=
s
[
1
:]
q
=
s
[:
3
]
if
q
==
'"""'
or
q
==
"'''"
:
s
=
s
[
3
:
-
3
]
else
:
s
=
s
[
1
:
-
1
]
if
is_raw
:
s
=
s
.
replace
(
'
\
\
'
,
'
\
\
\
\
'
)
s
=
s
.
replace
(
'
\
n
'
,
'
\
\
\
n
'
)
else
:
# Split into double quotes, newlines, escape sequences
# and spans of regular chars
l1
=
re
.
split
(
r'((?:\\[0-7]{1,3})|(?:\\x[0-9A-Fa-f]{2})|(?:\\.)|(?:\\\n)|(?:\n)|")'
,
s
)
#print "unquote: l1 =", l1 ###
l2
=
[]
for
item
in
l1
:
if
item
==
'"'
or
item
==
'
\
n
'
:
l2
.
append
(
'
\
\
'
+
item
)
elif
item
==
'
\
\
\
n
'
:
pass
elif
item
[:
1
]
==
'
\
\
'
:
if
len
(
item
)
==
2
:
if
item
[
1
]
in
'"
\
\
abfnrtv'
:
l2
.
append
(
item
)
else
:
l2
.
append
(
item
[
1
])
elif
item
[
1
:
2
]
==
'x'
:
l2
.
append
(
'
\
\
x0'
+
item
[
2
:])
else
:
# octal escape
l2
.
append
(
item
)
else
:
l2
.
append
(
item
)
s
=
""
.
join
(
l2
)
return
s
# list_display ::= "[" [listmaker] "]"
# listmaker ::= expression ( list_for | ( "," expression )* [","] )
# list_iter ::= list_for | list_if
...
...
@@ -946,6 +914,8 @@ def p_import_statement(s):
ExprNodes
.
StringNode
(
pos
,
value
=
"*"
)])
else
:
name_list
=
None
dotted_name
=
ExprNodes
.
EncodedString
(
dotted_name
)
dotted_name
.
encoding
=
s
.
source_encoding
stat
=
Nodes
.
SingleAssignmentNode
(
pos
,
lhs
=
ExprNodes
.
NameNode
(
pos
,
name
=
as_name
or
target_name
),
...
...
@@ -984,14 +954,18 @@ def p_from_import_statement(s):
imported_name_strings
=
[]
items
=
[]
for
(
name_pos
,
name
,
as_name
)
in
imported_names
:
encoded_name
=
ExprNodes
.
EncodedString
(
name
)
encoded_name
.
encoding
=
s
.
source_encoding
imported_name_strings
.
append
(
ExprNodes
.
StringNode
(
name_pos
,
value
=
name
))
ExprNodes
.
StringNode
(
name_pos
,
value
=
encoded_
name
))
items
.
append
(
(
name
,
ExprNodes
.
NameNode
(
name_pos
,
name
=
as_name
or
name
)))
import_list
=
ExprNodes
.
ListNode
(
imported_names
[
0
][
0
],
args
=
imported_name_strings
)
dotted_name
=
ExprNodes
.
EncodedString
(
dotted_name
)
dotted_name
.
encoding
=
s
.
source_encoding
return
Nodes
.
FromImportStatNode
(
pos
,
module
=
ExprNodes
.
ImportNode
(
dotted_name_pos
,
module_name
=
ExprNodes
.
StringNode
(
dotted_name_pos
,
...
...
@@ -1996,7 +1970,8 @@ def p_class_statement(s):
# s.sy == 'class'
pos
=
s
.
position
()
s
.
next
()
class_name
=
p_ident
(
s
)
class_name
=
ExprNodes
.
EncodedString
(
p_ident
(
s
)
)
class_name
.
encoding
=
s
.
source_encoding
if
s
.
sy
==
'('
:
s
.
next
()
base_list
=
p_simple_expr_list
(
s
)
...
...
@@ -2113,7 +2088,7 @@ def p_property_decl(s):
return
Nodes
.
PropertyNode
(
pos
,
name
=
name
,
doc
=
doc
,
body
=
body
)
def
p_doc_string
(
s
):
if
s
.
sy
==
'
STRING'
or
s
.
sy
==
'
BEGIN_STRING'
:
if
s
.
sy
==
'BEGIN_STRING'
:
_
,
result
=
p_cat_string_literal
(
s
)
if
s
.
sy
!=
'EOF'
:
s
.
expect_newline
(
"Syntax error in doc string"
)
...
...
Cython/Compiler/PyrexTypes.py
View file @
70ea30b6
...
...
@@ -37,6 +37,7 @@ class PyrexType(BaseType):
# is_enum boolean Is a C enum type
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
# is_unicode boolean Is a UTF-8 encoded C char * type
# is_returncode boolean Is used only to signal exceptions
# is_error boolean Is the dummy error type
# has_attributes boolean Has C dot-selectable attributes
...
...
@@ -83,6 +84,7 @@ class PyrexType(BaseType):
is_enum
=
0
is_typedef
=
0
is_string
=
0
is_unicode
=
0
is_returncode
=
0
is_error
=
0
has_attributes
=
0
...
...
@@ -875,19 +877,49 @@ class CEnumType(CType):
return
self
.
base_declaration_code
(
public_decl
(
base
,
dll_linkage
),
entity_code
)
def
_escape_byte_string
(
s
):
try
:
s
.
decode
(
"ASCII"
)
return
s
except
UnicodeDecodeError
:
pass
l
=
[]
append
=
l
.
append
for
c
in
s
:
o
=
ord
(
c
)
if
o
>=
128
:
append
(
'
\
\
x%X'
%
o
)
else
:
append
(
c
)
return
''
.
join
(
l
)
class
CStringType
:
# Mixin class for C string types.
is_string
=
1
is_unicode
=
0
to_py_function
=
"PyString_FromString"
from_py_function
=
"PyString_AsString"
exception_value
=
"NULL"
def
literal_code
(
self
,
value
):
if
isinstance
(
value
,
unicode
):
value
=
value
.
encode
(
"UTF-8"
)
return
'"%s"'
%
value
assert
isinstance
(
value
,
str
)
return
'"%s"'
%
_escape_byte_string
(
value
)
class
CUTF8StringType
:
# Mixin class for C unicode types.
is_string
=
1
is_unicode
=
1
to_py_function
=
"PyUnicode_DecodeUTF8"
exception_value
=
"NULL"
def
literal_code
(
self
,
value
):
assert
isinstance
(
value
,
str
)
return
'"%s"'
%
_escape_byte_string
(
value
)
class
CCharArrayType
(
CStringType
,
CArrayType
):
...
...
@@ -900,6 +932,16 @@ class CCharArrayType(CStringType, CArrayType):
CArrayType
.
__init__
(
self
,
c_char_type
,
size
)
class
CUTF8CharArrayType
(
CUTF8StringType
,
CArrayType
):
# C 'char []' type.
parsetuple_format
=
"s"
pymemberdef_typecode
=
"T_STRING_INPLACE"
def
__init__
(
self
,
size
):
CArrayType
.
__init__
(
self
,
c_char_type
,
size
)
class
CCharPtrType
(
CStringType
,
CPtrType
):
# C 'char *' type.
...
...
@@ -910,6 +952,16 @@ class CCharPtrType(CStringType, CPtrType):
CPtrType
.
__init__
(
self
,
c_char_type
)
class
CUTF8CharPtrType
(
CUTF8StringType
,
CPtrType
):
# C 'char *' type, encoded in UTF-8.
parsetuple_format
=
"s"
pymemberdef_typecode
=
"T_STRING"
def
__init__
(
self
):
CPtrType
.
__init__
(
self
,
c_char_type
)
class
ErrorType
(
PyrexType
):
# Used to prevent propagation of error messages.
...
...
@@ -974,7 +1026,9 @@ c_longdouble_type = CFloatType(8)
c_null_ptr_type
=
CNullPtrType
(
c_void_type
)
c_char_array_type
=
CCharArrayType
(
None
)
c_utf8_char_array_type
=
CUTF8CharArrayType
(
None
)
c_char_ptr_type
=
CCharPtrType
()
c_utf8_char_ptr_type
=
CUTF8CharPtrType
()
c_char_ptr_ptr_type
=
CPtrType
(
c_char_ptr_type
)
c_int_ptr_type
=
CPtrType
(
c_int_type
)
...
...
Cython/Compiler/Scanning.py
View file @
70ea30b6
...
...
@@ -212,7 +212,7 @@ class PyrexScanner(Scanner):
resword_dict
=
build_resword_dict
()
def
__init__
(
self
,
file
,
filename
,
parent_scanner
=
None
,
type_names
=
None
,
context
=
None
):
type_names
=
None
,
context
=
None
,
source_encoding
=
None
):
Scanner
.
__init__
(
self
,
get_lexicon
(),
file
,
filename
)
if
parent_scanner
:
self
.
context
=
parent_scanner
.
context
...
...
@@ -226,6 +226,7 @@ class PyrexScanner(Scanner):
self
.
compile_time_env
=
initial_compile_time_env
()
self
.
compile_time_eval
=
1
self
.
compile_time_expr
=
0
self
.
source_encoding
=
source_encoding
self
.
trace
=
trace_scanner
self
.
indentation_stack
=
[
0
]
self
.
indentation_char
=
None
...
...
Cython/Compiler/Symtab.py
View file @
70ea30b6
...
...
@@ -438,7 +438,13 @@ class Scope:
def
add_string_const
(
self
,
value
):
# Add an entry for a string constant.
cname
=
self
.
new_const_cname
()
entry
=
Entry
(
""
,
cname
,
c_char_array_type
,
init
=
value
)
if
value
.
is_unicode
:
c_type
=
c_utf8_char_array_type
value
=
value
.
utf8encode
()
else
:
c_type
=
c_char_array_type
value
=
value
.
byteencode
()
entry
=
Entry
(
""
,
cname
,
c_type
,
init
=
value
)
entry
.
used
=
1
self
.
const_entries
.
append
(
entry
)
return
entry
...
...
@@ -460,7 +466,7 @@ class Scope:
# Python identifier, it will be interned.
if
not
entry
.
pystring_cname
:
value
=
entry
.
init
if
identifier_pattern
.
match
(
value
)
and
isinstance
(
value
,
str
):
if
not
entry
.
type
.
is_unicode
and
identifier_pattern
.
match
(
value
):
entry
.
pystring_cname
=
self
.
intern
(
value
)
entry
.
is_interned
=
1
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment