Commit 4a9737a9 authored by Vincent Pelletier

Toward Python3: unicode literals

Costs 30% performance on pypy.
parent b068f82d
...@@ -26,7 +26,8 @@ ...@@ -26,7 +26,8 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# #
############################################################################## ##############################################################################
from __future__ import print_function, division, absolute_import from __future__ import print_function, division, absolute_import, \
unicode_literals
from cgi import escape from cgi import escape
from collections import defaultdict, Counter from collections import defaultdict, Counter
from datetime import datetime, timedelta, date from datetime import datetime, timedelta, date
...@@ -37,6 +38,7 @@ import argparse ...@@ -37,6 +38,7 @@ import argparse
import bz2 import bz2
import calendar import calendar
import codecs import codecs
import functools
import gzip import gzip
import httplib import httplib
import itertools import itertools
...@@ -54,18 +56,58 @@ import traceback ...@@ -54,18 +56,58 @@ import traceback
def getResource(name, encoding='utf-8'): def getResource(name, encoding='utf-8'):
return pkgutil.get_data(__name__, name).decode(encoding) return pkgutil.get_data(__name__, name).decode(encoding)
def _wrapOpen(func):
    """Wrap a file opener so that it accepts ``encoding`` and ``errors``.

    ``func`` is any callable returning a file object (it is used below on
    ``gzip.open`` and ``bz2.BZ2File``). The returned callable forwards all
    positional and remaining keyword arguments to ``func`` unchanged.

    - Without ``encoding`` (or with ``encoding=None``), the file object
      returned by ``func`` is handed back as-is.
    - With ``encoding``, the file object is wrapped in a
      ``codecs.StreamReaderWriter`` using that codec and the given ``errors``
      policy (default ``'strict'``); its ``encoding`` attribute is set to the
      requested encoding name.

    Raises ``LookupError`` (from ``codecs.lookup``) when the encoding is
    unknown; this happens before ``func`` is called.
    """
    @functools.wraps(func)
    def wrapper(*args, **kw):
        encoding = kw.pop('encoding', None)
        errors = kw.pop('errors', 'strict')
        if encoding is None:
            # codecs.lookup(None) raises TypeError, so the pass-through case
            # must be decided before any codec lookup.
            return func(*args, **kw)
        # Resolve the codec before opening: a bad encoding name then fails
        # without leaving an opened file behind.
        info = codecs.lookup(encoding)
        file_object = func(*args, **kw)
        srw = codecs.StreamReaderWriter(
            file_object,
            info.streamreader,
            info.streamwriter,
            errors,
        )
        srw.encoding = encoding
        return srw
    return wrapper
# Pick the gzip/bz2 openers, the read mode and (when available) the lzma
# module, so that later code can call opener(name, _read_mode, encoding=...)
# the same way on every supported interpreter.
lzma = None
gzip_open = gzip.open
if sys.version_info >= (3, 3):
    try:
        import lzma
    except ImportError:
        # Interpreter built without lzma support: keep lzma as None so the
        # module stays importable and later code can treat lzma as absent.
        pass
    bz2_open = bz2.open
    _read_mode = 'rt'
else:
    # Older interpreters: route open() through codecs.open so that calls
    # passing encoding=... work, and give the compressed-file openers the
    # same keyword via _wrapOpen.
    open = codecs.open
    gzip_open = _wrapOpen(gzip_open)
    bz2_open = _wrapOpen(bz2.BZ2File)
    _read_mode = 'r'
FILE_OPENER_LIST = [ FILE_OPENER_LIST = [
(gzip.open, IOError), (gzip_open, IOError),
(bz2.BZ2File, IOError), (bz2_open, IOError),
] ]
if lzma is None:
try: try:
from backports import lzma from backports import lzma
except ImportError: except ImportError:
pass pass
else: if lzma is not None:
FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError)) FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError))
# XXX: what encoding ? apache doesn't document one, but requests are supposed
# to be urlencoded, so pure ascii. Are timestamps localised ?
INPUT_ENCODING = 'ascii'
if sys.version_info < (3, ):
    def unquoteToHtml(x):
        """Percent-decode ``x`` and return it HTML-escaped, as unicode.

        ``x`` is encoded to ASCII bytes before unquoting, and the unquoted
        bytes are decoded as UTF-8. Bytes which are not valid UTF-8 become
        U+FFFD instead of raising, so one malformed escape (e.g. ``%FF``)
        cannot abort the caller.
        """
        return escape(unquote(x.encode('ascii')).decode('utf-8', 'replace'))
else:
    def unquoteToHtml(x):
        """Percent-decode ``x`` and return it HTML-escaped."""
        return escape(unquote(x))
MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar', MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1)) 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
...@@ -351,7 +393,7 @@ class GenericSiteStats(object): ...@@ -351,7 +393,7 @@ class GenericSiteStats(object):
reverse=True)[:N_SLOWEST]: reverse=True)[:N_SLOWEST]:
append('<tr>') append('<tr>')
append(data.asHTML(self.threshold)) append(data.asHTML(self.threshold))
append('<td class="text">%s</td></tr>' % unquoteToHtml(url, encoding)) append('<td class="text">%s</td></tr>' % unquoteToHtml(url))
append('</table>') append('</table>')
append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>' append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>'
'<th>user agent</th></tr>') '<th>user agent</th></tr>')
...@@ -413,8 +455,8 @@ class GenericSiteStats(object): ...@@ -413,8 +455,8 @@ class GenericSiteStats(object):
append('<td>%s</td><td class="text">%s</td>' append('<td>%s</td><td class="text">%s</td>'
'<td class="text">%s</td>' % ( '<td class="text">%s</td>' % (
getHitForUrl(referer_counter), getHitForUrl(referer_counter),
unquoteToHtml(url, encoding), unquoteToHtml(url),
'<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer, encoding)) '<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer))
for referer, hit in referer_counter.most_common( for referer, hit in referer_counter.most_common(
N_REFERRER_PER_ERROR_URL)), N_REFERRER_PER_ERROR_URL)),
)) ))
...@@ -931,9 +973,6 @@ period_parser = { ...@@ -931,9 +973,6 @@ period_parser = {
), ),
} }
unquoteToHtml = lambda x, encoding: escape(unquote(x).decode(encoding,
'replace'))
apdex_y_scale_dict = { apdex_y_scale_dict = {
'linear': None, 'linear': None,
'log': 'log100To0', 'log': 'log100To0',
...@@ -980,8 +1019,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict, ...@@ -980,8 +1019,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
key=lambda x: site_caption_dict[x[0]]))) key=lambda x: site_caption_dict[x[0]])))
html_site_caption_dict = {} html_site_caption_dict = {}
for i, (site_id, _) in site_list: for i, (site_id, _) in site_list:
html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id], html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id])
encoding)
if len(per_site) > 1: if len(per_site) > 1:
out.write('<h2>Index</h2><ol>') out.write('<h2>Index</h2><ol>')
for i, (site_id, _) in site_list: for i, (site_id, _) in site_list:
...@@ -1084,17 +1122,6 @@ format_generator = { ...@@ -1084,17 +1122,6 @@ format_generator = {
'json': (asJSON, 'ascii'), 'json': (asJSON, 'ascii'),
} }
# XXX: monkey-patching json module to emit strings instead of unicode objects.
# Because strings are faster, (30% overall performance hit moving to unicode
# objects), and only ASCII is expected (urlencoded is ASCII).
# Subclassing JSONDecoder is not enough as object parser uses scanstring
# directly.
original_scanstring = json.decoder.scanstring
def _scanstring(*args, **kw):
string, end = original_scanstring(*args, **kw)
return string.encode('ascii'), end
json.decoder.scanstring = _scanstring
def main(): def main():
parser = ShlexArgumentParser(description='Compute Apdex out of ' parser = ShlexArgumentParser(description='Compute Apdex out of '
'apache-style log files', fromfile_prefix_chars='@') 'apache-style log files', fromfile_prefix_chars='@')
...@@ -1246,7 +1273,7 @@ def main(): ...@@ -1246,7 +1273,7 @@ def main():
if state_file_name == '-': if state_file_name == '-':
state_file = sys.stdin state_file = sys.stdin
else: else:
state_file = open(state_file_name) state_file = open(state_file_name, encoding='ascii')
with state_file: with state_file:
load_start = time.time() load_start = time.time()
state = json.load(state_file) state = json.load(state_file)
...@@ -1289,7 +1316,7 @@ def main(): ...@@ -1289,7 +1316,7 @@ def main():
logfile = sys.stdin logfile = sys.stdin
else: else:
for opener, exc in FILE_OPENER_LIST: for opener, exc in FILE_OPENER_LIST:
logfile = opener(filename) logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING)
try: try:
logfile.readline() logfile.readline()
except exc: except exc:
...@@ -1298,7 +1325,7 @@ def main(): ...@@ -1298,7 +1325,7 @@ def main():
logfile.seek(0) logfile.seek(0)
break break
else: else:
logfile = open(filename) logfile = open(filename, _read_mode, encoding=INPUT_ENCODING)
lineno = 0 lineno = 0
for lineno, line in enumerate(logfile, 1): for lineno, line in enumerate(logfile, 1):
if show_progress and lineno % 5000 == 0: if show_progress and lineno % 5000 == 0:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment