From 75fdd631b6aced60fedd5975d417f3b06dccb97a Mon Sep 17 00:00:00 2001
From: Vincent Pelletier <vincent@nexedi.com>
Date: Wed, 3 Apr 2013 14:36:40 +0200
Subject: [PATCH] Add optional detailed error analysis.
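
When --error-detail (-e) is enabled, each site accumulates, per error
status and per url, the list of referers which led to that url. The
report then gains an "Error detail" table listing, for each error
status, the N_ERROR_URL most frequently hit urls, each with its
N_REFERRER_PER_ERROR_URL most frequent referers.

As the XXX comment notes, this can eat memory when errors happen on
many different urls, hence the feature defaults to disabled.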

---
 apachedex/__init__.py | 59 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 7 deletions(-)
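
A minimal standalone sketch of the aggregation this patch performs,
on hypothetical sample data (Python 2, like the module; the real code
keeps raw referer lists while parsing and only builds Counters at
report time):

    from collections import defaultdict, Counter
    from operator import itemgetter

    # status -> url -> list of referers, as built by accumulate()
    error_url_count = defaultdict(lambda: defaultdict(list))
    for status, url, referer in [
          ('404', '/missing', '/home'),
          ('404', '/missing', '/home'),
          ('404', '/missing', '/search'),
          ('500', '/crash', '-'),
        ]:
      error_url_count[status][url].append(referer)

    # Report time: per status, keep the most-hit urls and, per url,
    # the most frequent referers (top 10 urls / top 5 referers here,
    # mirroring N_ERROR_URL and N_REFERRER_PER_ERROR_URL).
    for status, url_dict in sorted(error_url_count.iteritems()):
      url_list = sorted(
        ((url, Counter(referer_list))
          for url, referer_list in url_dict.iteritems()),
        key=lambda x: sum(x[1].itervalues()), reverse=True)[:10]
      for url, referer_counter in url_list:
        print status, sum(referer_counter.itervalues()), url, sorted(
          referer_counter.iteritems(), key=itemgetter(1), reverse=True)[:5]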

diff --git a/apachedex/__init__.py b/apachedex/__init__.py
index 70f7e61..e7de674 100755
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -33,7 +33,7 @@
 # - provide some form of raw data output, not just html
 # - allow user to specify min & max dates
 from cgi import escape
-from collections import defaultdict
+from collections import defaultdict, Counter
 from datetime import datetime, tzinfo, timedelta
 from functools import partial
 from operator import itemgetter
@@ -68,7 +68,10 @@ US_PER_S = 10 ** 6
 
 N_SLOWEST = 20
 N_SLOWEST_THRESHOLD = N_SLOWEST * 4
+N_ERROR_URL = 10
+N_REFERRER_PER_ERROR_URL = 5
 ITEMGETTER0 = itemgetter(0)
+ITEMGETTER1 = itemgetter(1)
 APDEX_TOLERATING_COEF = 4
 
 def statusIsError(status):
@@ -124,15 +127,17 @@ class APDEXStats(object):
     return 0
 
 class GenericSiteStats(object):
-  def __init__(self, threshold, prefix=1):
+  def __init__(self, threshold, prefix=1, error_detail=False):
     self.threshold = threshold
     self.prefix = prefix
+    self.error_detail = error_detail
     self.status = defaultdict(partial(defaultdict, int))
+    if error_detail:
+      self.error_url_count = defaultdict(partial(defaultdict, list))
     self.slowest_list = [(-1, None, None, None)]
     self.apdex = defaultdict(partial(APDEXStats, threshold))
 
   def accumulate(self, match, url_match, date):
-    self.status[match.group('status')][date] += 1
     self.apdex[date].accumulate(match)
     duration = int(match.group('duration'))
     if url_match is None:
@@ -145,6 +150,11 @@ class GenericSiteStats(object):
         match.group('referer')))
       if len(slowest_list) > N_SLOWEST_THRESHOLD:
         self._housekeeping()
+    status = match.group('status')
+    self.status[status][date] += 1
+    if self.error_detail and statusIsError(status):
+      # XXX: can eat memory if there are many errors on many different urls
+      self.error_url_count[status][url].append(match.group('referer'))
 
   def _housekeeping(self):
     slowest_list = self.slowest_list
@@ -196,7 +206,37 @@ class GenericSiteStats(object):
       for date in column_list:
         append(hitTd(data_dict[date], status))
       append('</tr>')
-    append('</table><h2>Slowest pages</h2><table><tr><th>duration (s)</th>'
+    append('</table>')
+    if self.error_detail:
+      def getHitForUrl(referer_counter):
+        return sum(referer_counter.itervalues())
+      filtered_status_url = {}
+      for status, url_dict in self.error_url_count.iteritems():
+        filtered_status_url[status] = sorted(
+          ((key, Counter(value)) for key, value in url_dict.iteritems()),
+          key=lambda x: getHitForUrl(x[1]), reverse=True)[:N_ERROR_URL]
+      append('<h3>Error detail</h3><table><tr><th>status</th><th>hit</th>'
+        '<th>url</th><th>referers</th></tr>')
+      for status, url_list in sorted(filtered_status_url.iteritems(),
+          key=ITEMGETTER0):
+        append('<tr><th rowspan="%s">%s</th>' % (len(url_list), status))
+        first_url = True
+        for url, referer_counter in url_list:
+          if first_url:
+            first_url = False
+          else:
+            append('<tr>')
+          append('<td>%s</td><td class="text">%s</td>'
+            '<td class="text">%s</td>' % (
+            getHitForUrl(referer_counter),
+            escape(url),
+            '<br/>'.join('%i: %s' % (hit, escape(referer))
+              for referer, hit in sorted(referer_counter.iteritems(),
+                key=ITEMGETTER1, reverse=True)[:N_REFERRER_PER_ERROR_URL]),
+          ))
+          append('</tr>')
+      append('</table>')
+    append('<h2>Slowest pages</h2><table><tr><th>duration (s)</th>'
       '<th>date</th><th>url</th><th>referer</th></tr>')
     for duration, timestamp, url, referer in reversed(self.slowest_list):
       if timestamp is None:
@@ -221,8 +261,9 @@ class ERP5SiteStats(GenericSiteStats):
   - If a line belongs to a module and has at least 2 slashes after module,
     count line as belonging to a document of that module
   """
-  def __init__(self, threshold, prefix=1):
-    super(ERP5SiteStats, self).__init__(threshold, prefix=prefix)
+  def __init__(self, threshold, prefix=1, error_detail=False):
+    super(ERP5SiteStats, self).__init__(threshold, prefix=prefix,
+      error_detail=error_detail)
     # Key levels:
     # - module id (string)
     # - is document (bool)
@@ -355,6 +396,8 @@ def main():
   parser.add_argument('-a', '--apdex', default=US_PER_S, type=int,
     help='First threshold for Apdex computation, in microseconds. '
       'Default: %(default)r')
+  parser.add_argument('-e', '--error-detail', action='store_true',
+    help='Include a detailed report (url & referers) for error statuses.')
   parser.add_argument('-d', '--default',
     help='Caption for lines matching no prefix, or skip them if not provided.')
   parser.add_argument('--base', dest='site_list', default=[],
@@ -418,6 +461,7 @@ def main():
   infile_list = args.logfile
   quiet = args.quiet
   threshold = args.apdex
+  error_detail = args.error_detail
   file_count = len(infile_list)
   per_site = {}
   hit_per_day = defaultdict(int)
@@ -469,7 +513,8 @@ def main():
       try:
         site_data = per_site[site]
       except KeyError:
-        site_data = per_site[site] = action(threshold)
+        site_data = per_site[site] = action(threshold,
+          error_detail=error_detail)
       site_data.accumulate(match, url_match, date)
     all_lines += lineno
   end_parsing_time = time.time()
-- 
2.30.9