Add optional detailed error analysis.

75fdd631 · Vincent Pelletier · 724784ae · 75fdd631
Commit 75fdd631 authored Apr 03, 2013 by Vincent Pelletier
Show whitespace changes
Inline Side-by-side

Showing with 52 additions and 7 deletions

apachedex/__init__.py apachedex/__init__.py +52 -7

No files found.
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -33,7 +33,7 @@
 # - provide some form of raw data output, not just html
 # - allow user to specify min & max dates
 from cgi import escape
-from collections import defaultdict
+from collections import defaultdict, Counter
 from datetime import datetime, tzinfo, timedelta
 from functools import partial
 from operator import itemgetter
@@ -68,7 +68,10 @@ US_PER_S = 10 ** 6
 N_SLOWEST = 20
 N_SLOWEST_THRESHOLD = N_SLOWEST * 4
+N_ERROR_URL = 10
+N_REFERRER_PER_ERROR_URL = 5
 ITEMGETTER0 = itemgetter(0)
+ITEMGETTER1 = itemgetter(1)
 APDEX_TOLERATING_COEF = 4
 def statusIsError(status):
@@ -124,15 +127,17 @@ class APDEXStats(object):
    return 0
 class GenericSiteStats(object):
-  def __init__(self, threshold, prefix=1):
+  def __init__(self, threshold, prefix=1, error_detail=False):
    self.threshold = threshold
    self.prefix = prefix
+    self.error_detail = error_detail
    self.status = defaultdict(partial(defaultdict, int))
+    if error_detail:
+      self.error_url_count = defaultdict(partial(defaultdict, list))
    self.slowest_list = [(-1, None, None, None)]
    self.apdex = defaultdict(partial(APDEXStats, threshold))
  def accumulate(self, match, url_match, date):
-    self.status[match.group('status')][date] += 1
    self.apdex[date].accumulate(match)
    duration = int(match.group('duration'))
    if url_match is None:
@@ -145,6 +150,11 @@ class GenericSiteStats(object):
        match.group('referer')))
      if len(slowest_list) > N_SLOWEST_THRESHOLD:
        self._housekeeping()
+    status = match.group('status')
+    self.status[status][date] += 1
+    if self.error_detail and statusIsError(status):
+      # XXX: keeps one referer per error hit; can eat memory if there are many errors or many different urls
+      self.error_url_count[status][url].append(match.group('referer'))
  def _housekeeping(self):
    slowest_list = self.slowest_list
@@ -196,7 +206,37 @@ class GenericSiteStats(object):
      for date in column_list:
        append(hitTd(data_dict[date], status))
      append('</tr>')
-    append('</table><h2>Slowest pages</h2><table><tr><th>duration (s)</th>'
+    append('</table>')
+    if self.error_detail:
+      def getHitForUrl(referer_counter):
+        return sum(referer_counter.itervalues())
+      filtered_status_url = defaultdict(partial(defaultdict, dict))
+      for status, url_dict in self.error_url_count.iteritems():
+        filtered_status_url[status] = sorted(
+          ((key, Counter(value)) for key, value in url_dict.iteritems()),
+          key=lambda x: getHitForUrl(x[1]), reverse=True)[:N_ERROR_URL]
+      append('<h3>Error detail</h3><table><tr><th>status</th><th>hit</th>'
+        '<th>url</th><th>referers</th></tr>')
+      for status, url_list in sorted(filtered_status_url.iteritems(),
+          key=ITEMGETTER0):
+        append('<tr><th rowspan="%s">%s</th>' % (len(url_list), status))
+        first_url = True
+        for url, referer_counter in url_list:
+          if first_url:
+            first_url = False
+          else:
+            append('<tr>')
+          append('<td>%s</td><td class="text">%s</td>'
+            '<td class="text">%s</td>' % (
+            getHitForUrl(referer_counter),
+            url,
+            '<br/>'.join('%i: %s' % (hit, referer) for referer, hit in sorted(
+              referer_counter.iteritems(), key=ITEMGETTER1, reverse=True
+            )[:N_REFERRER_PER_ERROR_URL]),
+          ))
+          append('</tr>')
+      append('</table>')
+    append('<h2>Slowest pages</h2><table><tr><th>duration (s)</th>'
      '<th>date</th><th>url</th><th>referer</th></tr>')
    for duration, timestamp, url, referer in reversed(self.slowest_list):
      if timestamp is None:
@@ -221,8 +261,9 @@ class ERP5SiteStats(GenericSiteStats):
  - If a line belongs to a module and has at least 2 slashes after module,
    count line as belonging to a document of that module
  """
-  def __init__(self, threshold, prefix=1):
+  def __init__(self, threshold, prefix=1, error_detail=False):
-    super(ERP5SiteStats, self).__init__(threshold, prefix=prefix)
+    super(ERP5SiteStats, self).__init__(threshold, prefix=prefix,
+      error_detail=error_detail)
    # Key levels:
    # - module id (string)
    # - is document (bool)
@@ -355,6 +396,8 @@ def main():
  parser.add_argument('-a', '--apdex', default=US_PER_S, type=int,
    help='First threshold for Apdex computation, in microseconds. '
      'Default: %(default)r')
+  parser.add_argument('-e', '--error-detail', action='store_true',
+    help='Include detailed report (url & referers) for error statuses.')
  parser.add_argument('-d', '--default',
    help='Caption for lines matching no prefix, or skip them if not provided.')
  parser.add_argument('--base', dest='site_list', default=[],
@@ -418,6 +461,7 @@ def main():
  infile_list = args.logfile
  quiet = args.quiet
  threshold = args.apdex
+  error_detail = args.error_detail
  file_count = len(infile_list)
  per_site = {}
  hit_per_day = defaultdict(int)
@@ -469,7 +513,8 @@ def main():
      try:
        site_data = per_site[site]
      except KeyError:
-        site_data = per_site[site] = action(threshold)
+        site_data = per_site[site] = action(threshold,
+          error_detail=error_detail)
      site_data.accumulate(match, url_match, date)
    all_lines += lineno
  end_parsing_time = time.time()