Commit 9f1f8025 authored by Stefan Behnel

make encoding detection code a bit faster if the first two source file lines...

make encoding detection code a bit faster if the first two source file lines are longer than 250 bytes
parent ad356fd1
...@@ -189,36 +189,41 @@ def decode_filename(filename): ...@@ -189,36 +189,41 @@ def decode_filename(filename):
_match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
def detect_file_encoding(source_filename): def detect_file_encoding(source_filename):
f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore') f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
try: try:
return detect_opened_file_encoding(f) return detect_opened_file_encoding(f)
finally: finally:
f.close() f.close()
def detect_opened_file_encoding(f): def detect_opened_file_encoding(f):
# PEPs 263 and 3120 # PEPs 263 and 3120
# Most of the time the first two lines fall in the first 250 chars, # Most of the time the first two lines fall in the first 250 chars,
# and this bulk read/split is much faster. # and this bulk read/split is much faster.
lines = f.read(250).split("\n") lines = f.read(250).split(u"\n")
if len(lines) > 2: if len(lines) > 1:
m = _match_file_encoding(lines[0]) or _match_file_encoding(lines[1]) m = _match_file_encoding(lines[0])
if m: if m:
return m.group(1) return m.group(1)
else: elif len(lines) > 2:
return "UTF-8" m = _match_file_encoding(lines[1])
else: if m:
# Fallback to one-char-at-a-time detection. return m.group(1)
f.seek(0) else:
chars = [] return "UTF-8"
for i in range(2): # Fallback to one-char-at-a-time detection.
f.seek(0)
chars = []
for i in range(2):
c = f.read(1)
while c and c != u'\n':
chars.append(c)
c = f.read(1) c = f.read(1)
while c and c != u'\n': encoding = _match_file_encoding(u''.join(chars))
chars.append(c) if encoding:
c = f.read(1) return encoding.group(1)
encoding = _match_file_encoding(u''.join(chars))
if encoding:
return encoding.group(1)
return "UTF-8" return "UTF-8"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment