- use some templating system instead of hardcoded html strings
- provide some form of raw data output, not just html
- allow user to specify min & max dates
- automatically select period from log data ?
......@@ -66,6 +66,7 @@ N_REFERRER_PER_ERROR_URL = 5
# Shared item accessors for index 0 and index 1, built once at module level.
# NOTE(review): presumably operator.itemgetter - the import is outside this
# excerpt, confirm.
ITEMGETTER0 = itemgetter(0)
ITEMGETTER1 = itemgetter(1)
def statusIsError(status):
    """Tell whether a status string denotes an error.

    status: status code as text (for example '404').

    Returns True when the first character of `status` sorts after '3'
    (so '4xx', '5xx', ...), False otherwise - including for an empty string.
    """
    # Slice instead of indexing: an empty status yields False rather than
    # raising IndexError. Non-empty input behaves exactly as before.
    return status[:1] > '3'
......@@ -260,6 +261,18 @@ class GenericSiteStats(object):
self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
def rescale(self, convert, getDuration):
# Re-key this site's per-date statistics after a change of period.
# convert: callable applied to each existing date key to obtain its new key.
# getDuration: stored on the instance and handed to the APDEXStats factory of
# the replacement apdex mapping.
# NOTE(review): leading indentation is not visible in this excerpt; the nesting
# of the statements below must be confirmed against the full file.
self.getDuration = getDuration
# Status counts: old dates converting to the same new date are summed.
for status, date_dict in self.status.iteritems():
new_date_dict = defaultdict(int)
for date, status_count in date_dict.iteritems():
new_date_dict[convert(date)] += status_count
self.status[status] = new_date_dict
new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration))
# NOTE(review): no body is visible for the loop below in this excerpt, so
# nothing shown here copies `data` into new_apdex before it replaces
# self.apdex. The statement merging each entry under convert(date) appears to
# have been dropped - confirm against the full file.
for date, data in self.apdex.iteritems():
self.apdex = new_apdex
def accumulate(self, match, url_match, date):
if url_match is None:
......@@ -381,6 +394,20 @@ class ERP5SiteStats(GenericSiteStats):
defaultdict, partial(APDEXStats, threshold, getDuration))))
self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration))
def rescale(self, convert, getDuration):
# Re-key the ERP5-specific per-date statistics (self.module and
# self.no_module) after letting the parent class rescale its own.
# convert: forwarded to the parent implementation.
# getDuration: forwarded to the parent and to each new APDEXStats factory.
# NOTE(review): leading indentation is not visible in this excerpt; the nesting
# of the statements below must be confirmed against the full file.
super(ERP5SiteStats, self).rescale(convert, getDuration)
threshold = self.threshold
for document_dict in self.module.itervalues():
for is_document, date_dict in document_dict.iteritems():
new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration))
# NOTE(review): no body is visible for the loop below in this excerpt, so
# new_date_dict is never filled before being stored. The statement moving
# `data` under its converted date appears to have been dropped - confirm
# against the full file.
for date, data in date_dict.iteritems():
document_dict[is_document] = new_date_dict
new_no_module = defaultdict(partial(APDEXStats, threshold, getDuration))
# NOTE(review): same situation as above - this loop has no visible body.
for date, data in self.no_module.iteritems():
self.no_module = new_no_module
def accumulate(self, match, url_match, date):
split = self.suffix('url')).split('?', 1)[0].split('/')
if split and split[0].endswith('_module'):
......@@ -527,6 +554,9 @@ def _weekStringAsQuarterString(timestamp):
year, month, _ = timestamp.split('/')
return '%s/%02i' % (year, int(month) / 3 * 3 + 1)
def _roundWeek(dt):
# NOTE(review): the argument of dt.replace() below is truncated in this excerpt
# (nothing precedes "/ 7 * 7 + 1"), so the line is not valid Python as shown.
# Presumably a keyword argument rounding one field of `dt`, by analogy with
# _round6Hour - restore it from the full file rather than guessing.
return dt.replace( / 7 * 7 + 1)
def _asDayString(timestamp):
dt, _ = timestamp.split(' ')
day, month, year = dt.split(':', 1)[0].split('/')
......@@ -539,6 +569,9 @@ def _as6HourString(timestamp):
return '%s/%02i/%s %02i' % (year, MONTH_VALUE_DICT[month], day,
int(hour) / 6 * 6)
def _round6Hour(dt):
return dt.replace(hour=dt.hour / 6 * 6)
def _hourAsWeekString(timestamp):
dt = datetime.strptime(timestamp, '%Y/%m/%d %H')
return (dt - timedelta(dt.weekday())).date().strftime('%Y/%m/%d')
......@@ -559,6 +592,8 @@ def _asHourString(timestamp):
# datetime.datetime instance
# - period during which a placeholder point will be added if there is no data
# point
# - callable rounding a datetime.datetime instance so that, once formatted with
# the given format string, it is a valid graph-granularity date for the period
period_parser = {
'year': (
......@@ -567,6 +602,7 @@ period_parser = {
# Longest month: 31 days
lambda x: x,
'quarter': (
......@@ -576,6 +612,7 @@ period_parser = {
'7 days',
'month': (
......@@ -584,6 +621,7 @@ period_parser = {
# Longest day: 24 hours + 1h DST (never more ?)
timedelta(seconds=3600 * 25),
lambda x: x,
'week': (
......@@ -591,6 +629,7 @@ period_parser = {
'6 hours',
'%Y/%m/%d %H',
timedelta(seconds=3600 * 6),
'day': (
......@@ -599,6 +638,7 @@ period_parser = {
'%Y/%m/%d %H',
# Longest hour: 60 * 60 seconds + 1 leap second.
lambda x: x,
......@@ -623,8 +663,12 @@ def main():
'Default: %(default).2fs')
group.add_argument('-e', '--error-detail', action='store_true',
help='Include detailed report (url & referers) for error statuses.')
group.add_argument('-p', '--period', default='day', choices=period_parser,
help='Periodicity of sampling buckets. Default: %(default)r')
group.add_argument('-p', '--period', choices=period_parser,
help='Periodicity of sampling buckets. Default: (decide from data). '
'Performance note: leaving out this parameter reduces parsing '
'performance, as each period increase requires re-dispatching already '
'processed data. To mitigate this, provide earliest and latest log '
'files before all others (ex: log0 log3 log1 log2).')
group.add_argument('-s', '--stats', action='store_true',
help='Enable parsing stats (time spent parsing input, time spent '
'generating output, ...)')
......@@ -690,8 +734,23 @@ def main():
assert not key, key
matchline = re.compile(line_regex).match
matchrequest = REQUEST_PATTERN.match
asDate, decimator, graph_period, date_format, placeholder_delta = \
if args.period is None:
next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
sorted(period_parser.iteritems(), key=lambda x: x[1][4])).next
period, to_next_period = next_period_data()
earliest_date = latest_date = None
def getNextPeriod():
# datetime is slow (compared to string operations), but not many choices
return (datetime.strptime(earliest_date, date_format) + to_next_period
def rescale(x):
result = round_date(datetime.strptime(x, old_date_format)).strftime(date_format)
return result
to_next_period = None
period = args.period
asDate, decimator, graph_period, date_format, placeholder_delta, \
round_date = period_parser[period]
site_list = args.path
default_site = args.default
if default_site is None:
......@@ -757,6 +816,30 @@ def main():
skipped_lines += 1
date = asDate('timestamp'))
if to_next_period is not None:
if date > latest_date: # '' > None is True
latest_date = date
if date < earliest_date or earliest_date is None:
earliest_date = date
next_period = getNextPeriod()
if latest_date > next_period:
while latest_date > next_period:
period, to_next_period = next_period_data()
next_period = getNextPeriod()
except StopIteration:
print >> sys.stderr, 'Increasing period to', period, '...',
old_date_format = date_format
asDate, decimator, graph_period, date_format, placeholder_delta, \
round_date = period_parser[period]
period_increase_start = time.time()
print old_date_format, date_format
for site_data in per_site.itervalues():
site_data.rescale(rescale, getDuration)
print >> sys.stderr, 'done (%s)' % timedelta(seconds=time.time()
- period_increase_start)
date = asDate('timestamp'))
site_data = per_site[site]
except KeyError:
......@@ -793,7 +876,7 @@ def main():
'<table class="stats">')
for caption, value in (
('apdex threshold', '%.2fs' % args.apdex),
('period', args.period),
('period', args.period or (period + ' (auto)')),
out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % (
caption, value))
