Commit c8ace6fb authored by Vincent Pelletier

Document support for non-ASCII urls.

Also, display them decoded in report.
parent 58e82c25
......@@ -31,8 +31,9 @@ from collections import defaultdict, Counter
from datetime import datetime, timedelta
from functools import partial
from operator import itemgetter
from urllib import splittype, splithost
from urllib import splittype, splithost, unquote
import argparse
import codecs
import gzip
import httplib
import itertools
......@@ -282,7 +283,7 @@ class GenericSiteStats(object):
def getApdexData(self):
return getDataPoints(self.apdex)
def asHTML(self, date_format, placeholder_delta, graph_period,
def asHTML(self, date_format, placeholder_delta, graph_period, encoding,
stat_filter=lambda x: x):
result = []
append = result.append
......@@ -300,7 +301,7 @@ class GenericSiteStats(object):
reverse=True)[:N_SLOWEST]:
append('<tr>')
append(data.asHTML(self.threshold))
append('<td class="text">%s</td></tr>' % escape(url))
append('<td class="text">%s</td></tr>' % unquoteToHtml(url, encoding))
append('</table>')
column_set = set()
filtered_status = defaultdict(partial(defaultdict, int))
......@@ -357,9 +358,12 @@ class GenericSiteStats(object):
append('<td>%s</td><td class="text">%s</td>'
'<td class="text">%s</td>' % (
getHitForUrl(referer_counter),
escape(url),
'<br/>'.join('%i: %s' % (hit, escape(referer)) for referer, hit in sorted(
referer_counter.iteritems(), key=ITEMGETTER1, reverse=True
unquoteToHtml(url, encoding),
'<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer, encoding))
for referer, hit in sorted(
referer_counter.iteritems(),
key=ITEMGETTER1,
reverse=True,
)[:N_REFERRER_PER_ERROR_URL]),
))
append('</tr>')
......@@ -424,7 +428,7 @@ class ERP5SiteStats(GenericSiteStats):
return sorted(((date, apdex, hit)
for date, (apdex, hit) in date_dict.iteritems()), key=ITEMGETTER0)
def asHTML(self, date_format, placeholder_delta, graph_period,
def asHTML(self, date_format, placeholder_delta, graph_period, encoding,
stat_filter=lambda x: x):
result = []
append = result.append
......@@ -490,7 +494,7 @@ class ERP5SiteStats(GenericSiteStats):
apdexAsColumns(filtered_no_module)
append('</tr></table>')
append(super(ERP5SiteStats, self).asHTML(date_format,
placeholder_delta, graph_period, stat_filter=stat_filter))
placeholder_delta, graph_period, encoding, stat_filter=stat_filter))
return '\n'.join(result)
DURATION_US_FORMAT = '%D'
......@@ -634,14 +638,17 @@ period_parser = {
),
}
def asHTML(out, per_site, args, default_site, period_parameter_dict, stats):
# Turn a urlencoded value taken from the logs (url, referer, site id) into text
# suitable for the HTML report: percent-decode `x`, decode the resulting bytes
# using `encoding`, then pass the unicode result through escape().
# Decoding uses the 'replace' error handler: access logs routinely contain
# values whose percent-decoded bytes are not valid in the report encoding, and
# a single such value must not abort the whole report with UnicodeDecodeError.
# Undecodable bytes show up as U+FFFD in the report instead.
unquoteToHtml = lambda x, encoding: escape(
  unquote(x).decode(encoding, 'replace'))
def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
stats):
period = period_parameter_dict['period']
decimator = period_parameter_dict['decimator']
date_format = period_parameter_dict['date_format']
placeholder_delta = period_parameter_dict['placeholder_delta']
graph_period = period_parameter_dict['graph_period']
out.write('<!DOCTYPE html>\n<html><head><meta charset="utf-8">'
'<title>Stats</title>')
out.write('<!DOCTYPE html>\n<html><head><meta charset="%s">'
'<title>Stats</title>' % encoding)
js_embed = getattr(args, 'js_embed', True)
js_path = getattr(args, 'js', None)
if js_embed:
......@@ -680,7 +687,7 @@ def asHTML(out, per_site, args, default_site, period_parameter_dict, stats):
for site_id, data in per_site.iteritems():
if site_id is None:
site_id = default_site
out.write('<h1>Site: %s</h1>' % site_id)
out.write('<h1>Site: %s</h1>' % unquoteToHtml(site_id, encoding))
out.write(
graphPair(
prepareDataForGraph(
......@@ -693,7 +700,7 @@ def asHTML(out, per_site, args, default_site, period_parameter_dict, stats):
)
)
out.write(data.asHTML(date_format, placeholder_delta, graph_period,
decimator))
encoding, decimator))
end_stat_time = time.time()
if args.stats:
out.write('<h1>Parsing stats</h1><table class="stats">')
......@@ -726,7 +733,7 @@ def asHTML(out, per_site, args, default_site, period_parameter_dict, stats):
out.write('</body></html>')
format_generator = {
'html': asHTML,
'html': (asHTML, 'utf-8'),
}
def main():
......@@ -770,7 +777,10 @@ def main():
group = parser.add_argument_group('site matching', 'Earlier arguments take '
'precedence. For example: --skip-base "/foo/bar(/|$|\\?)" '
'--base "/foo(/|$|\\?)" generates stats for /foo, excluding /foo/bar. '
'Arguments (except for -d/--default) are interpreted as Python regexes.')
'Arguments (except for -d/--default) are interpreted as Python regexes. '
'Literal values are expected urlencoded. For example: '
'--base "/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91(/|$|\\?)" matches '
'"/\xe6\x96\x87\xe5\xad\x97\xe5\x8c\x96\xe3\x81\x91" ("mojibake").')
group.add_argument('-d', '--default',
help='Caption for lines matching no prefix, or skip them if not provided.')
group.add_argument('--base', dest='path', default=[], nargs='+',
......@@ -936,12 +946,13 @@ def main():
all_lines += lineno
sys.stderr.write('%i\n' % lineno)
end_parsing_time = time.time()
generator, out_encoding = format_generator[args.format]
if args.out == '-':
out = sys.stdout
else:
out = open(args.out, 'w')
out = codecs.open(args.out, 'w', encoding=out_encoding)
with out:
format_generator[args.format](out, per_site, args, default_site, {
generator(out, out_encoding, per_site, args, default_site, {
'period': period,
'decimator': decimator,
'date_format': date_format,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment