#!/usr/bin/python
##############################################################################
#
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
#                    Vincent Pelletier <vincent@nexedi.com>
#                    Sebastien Robin <seb@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################
from datetime import date
from os import path
import rpy2.robjects as robjects
import os
from optparse import OptionParser
# Shortcut to the embedded R interpreter; it is called below with strings of
# R source code to evaluate.
r = robjects.r

# Help text handed to OptionParser in main(); optparse substitutes %prog with
# the name of the running script.
usage = """
  Usage:
    %prog [OPTION] file1.csv [file2.csv [...]]
  Result:
    Generates, in current directory, a graph per csv column in out-type format.
    Their name is composed of:
    - csv file basename (without extension)
    - csv column title
    - the ratio of present points (100 to 000). The higher the number, the
      more the plot will be complete (less holes, longer timespan coverage).
    - out-type extension

  CSV files must have been generated by parse_timing_log.py tool.
"""

class CSVFile(object):
  """Column-oriented, in-memory view of a delimited text file.

  The first line of the file provides the column titles. Every following
  non-blank line is a data row whose first cell is kept verbatim (main() uses
  it as a date string) and whose other cells are evaluated with computeExpr().

  Attributes:
    column_list: column titles, in file order.
    column_dict: maps a column title to the list of its cell values
      (computeExpr() results, hence None for empty cells, except for the
      first column which holds stripped strings).
    ratio_dict: maps a column title to the percentage (0 to 100) of data rows
      holding a non-empty value in that column. Columns without any value
      have no entry.
    value_max: maps a column title to its highest value. Only values greater
      than 0 are ever recorded, so a column of non-positive values has no
      entry.
  """

  def __init__(self, file_name, field_delim=','):
    """Load and parse file_name.

    file_name: path of the file to read.
    field_delim: string separating the cells of a line.

    Raises whatever open() and computeExpr() raise, and IndexError when a
    data row has more cells than the header line.
    """
    self.column_dict = column_dict = {}
    self.column_list = column_list = []
    self.ratio_dict = ratio_dict = {}
    self.value_max = value_max = {}
    line_num = 0
    next_ord = 0
    csv_file = open(file_name, 'r')
    # try/finally (rather than "with") keeps this loadable by the oldest
    # interpreters this script may run on, while still guaranteeing that the
    # file is closed even when parsing fails.
    try:
      for title in csv_file.readline().split(field_delim):
        title = title.strip().strip('"')
        if title in column_dict:
          # Duplicate title: rename the column to the first unused number.
          # Keys are strings, so candidates must be compared as strings,
          # otherwise a column literally titled "0" would be overwritten.
          candidate = next_ord
          while str(candidate) in column_dict:
            candidate += 1
          next_ord = candidate + 1
          title = str(candidate)
        column_dict[title] = []
        column_list.append(title)
      for line in csv_file.readlines():
        if not line.strip():
          # A blank line carries no data: counting it would add an empty
          # date to the first column only and lower every presence ratio.
          continue
        line_num += 1
        for x, cell in enumerate(line.split(field_delim)):
          cell = cell.strip()
          key = column_list[x]
          if x != 0:
            cell = computeExpr(cell)
            if cell is not None:
              ratio_dict[key] = ratio_dict.get(key, 0) + 1
              if cell > value_max.get(key, 0):
                value_max[key] = cell
          column_dict[key].append(cell)
    finally:
      csv_file.close()
    # Turn the per-column value counts into percentages of the row count.
    # ratio_dict is empty when there is no data row, so no division by zero.
    line_num = float(line_num) / 100
    for key in ratio_dict:
      ratio_dict[key] /= line_num

  def getColumn(self, column_id):
    """Return the list of values of the column at position column_id."""
    return self.column_dict[self.column_list[column_id]]

  def iterColumns(self, start=0, stop=None):
    """Iterate over columns whose position is in the start:stop slice.

    Yields (title, value list, highest value, presence percentage) tuples;
    the last two default to 0 for columns which have no recorded value.
    """
    # Slicing with stop=None already means "up to the end".
    selected_title_list = self.column_list[start:stop]
    return ((title, self.column_dict[title], self.value_max.get(title, 0),
             self.ratio_dict.get(title, 0)) for title in selected_title_list)

def computeExpr(expr):
  """Evaluate a '=x/y' cell and return x / y as a float.

  expr: cell content; only the '=x/y' form, with integer x and y, is
    supported. An empty (or None) cell stands for a missing value.

  Returns None for an empty cell, the quotient as a float otherwise.

  Raises ValueError when expr is not of the '=x/y' form, and
  ZeroDivisionError when y is 0.
  """
  if not expr:
    return None
  # Explicit check instead of an assert: asserts are stripped by "python -O",
  # which would let a cell such as '12/4' be silently evaluated as 2/4.
  if expr[0] != '=':
    raise ValueError('Unsupported expression %r, expected "=x/y"' % (expr, ))
  num, denom = expr[1:].split('/')
  return float(int(num)) / int(denom)

def main():
  """Plot every data column of the CSV files given on the command line.

  For each file, the first column provides the x axis labels and each other
  column is drawn by R into its own image, written in the current directory
  as <csv basename>_<column title>_<presence percentage>.<out-type>.

  Command line options:
    --with-regression: also draw a B-spline regression of the values.
    --ignored-quantity: drop the points whose value is among that many
      highest values of the column.
    --out-type: name of the R graphics device to use, also used as file
      extension.
    --minimal-non-empty-values-ratio: skip columns whose fraction (0 to 1) of
      plotted points over total rows is below this value.

  NOTE(review): file names and column titles are interpolated unescaped into
  R source code, so a single quote in any of them breaks the generated R
  statement.
  """
  parser = OptionParser(usage)
  parser.add_option("--with-regression", action="store_true",
    dest="regression_enabled", help="enable B-spline regression")
  parser.add_option("--ignored-quantity", type="int", dest="ignored_quantity",
    help="ignore IGNORED_QUANTITY higher values that might make a graph totally unusable")
  parser.add_option("--out-type", type="string", default="png",
    help="can be %default (default) or svg")
  parser.add_option("--minimal-non-empty-values-ratio", type="float",
    dest="minimal_non_empty_ratio", default=None,
    help="graph with ratio of non empty values with lesser than value, then graph is ignored")
  (options, file_name_list) = parser.parse_args()

  current_dir = os.getcwd()
  for file_name in file_name_list:
    # Single-argument print calls behave the same as print statements on
    # Python 2 and keep this function parseable by Python 3.
    print('Loading %s...' % (file_name, ))
    csv_file = CSVFile(file_name)

    date_string_list = csv_file.getColumn(0)
    if not date_string_list:
      # Without any data row there is no x axis to build (the knot list
      # below needs at least one label position) and nothing to plot.
      print('No data in %s...' % (file_name, ))
      continue
    # The plotting functions do not smartly pick how many x values to label,
    # so label about 20 evenly spaced rows to keep the x axis readable.
    # x_label_value_list holds the labelled row indexes, date_list the
    # matching labels.
    factor = 1
    if len(date_string_list) > 20:
      factor = len(date_string_list) // 20
    date_list = []
    x_label_value_list = []
    for i, date_string in enumerate(date_string_list):
      if i % factor == 0:
        x_label_value_list.append(i)
        # Labels are the date with its '/'-separated parts in reverse order.
        date_split = date_string.replace('"', '').split('/')
        date_split.reverse()
        date_list.append('/'.join(date_split))
    max_x = len(date_string_list)
    # Knots are used for the B-spline regression. Three additional knots are
    # needed at the beginning and at the end in order to have the right basis.
    knot_list = [x_label_value_list[0]] * 3 + x_label_value_list \
        + [max_x] * 4
    robjects.globalenv["x_label_value_list"] = robjects.FloatVector(
        x_label_value_list)
    # Sent as an explicit numeric vector, like the label positions, instead
    # of relying on how rpy2 converts a plain Python list.
    robjects.globalenv["knot_list"] = robjects.FloatVector(knot_list)
    r("x_label <- c(%s)" % ','.join(['"%s"' % x for x in date_list]))
    # Import the splines library in R
    if options.regression_enabled:
      r("library(splines)")
    # Base name of the output files: it only depends on the input file, so
    # compute it once. splitext() keeps the leading dot of the extension,
    # hence plain concatenation when the extension must be put back.
    out_file, out_ext = path.splitext(path.basename(file_name))
    if out_ext != '.csv':
      out_file = out_file + out_ext
    # Now go through all data columns and store one out-type file for each
    for title, column, value_max, ratio in csv_file.iterColumns(start=1):
      out_file_name = '%s_%s_%03i.%s' % (out_file, title.replace('%', ''),
          ratio, options.out_type)
      # First find the highest values, which the user may want to drop
      ignored_value_set = set()
      if options.ignored_quantity not in (None, 0):
        present_value_list = sorted(
            [value for value in column if value is not None])
        ignored_value_set = set(
            present_value_list[-options.ignored_quantity:])
      # Then build the lists of points to display; x is the row index
      x_data = []
      y_data = []
      for i, value in enumerate(column):
        if value is not None and value not in ignored_value_set:
          x_data.append(i)
          y_data.append(value)
      if not x_data:
        print('Nothing to plot for %s...' % (out_file_name, ))
        continue
      if options.minimal_non_empty_ratio is not None:
        # column cannot be empty here, as at least one point was kept
        if float(len(x_data)) / len(column) < options.minimal_non_empty_ratio:
          print('Not enough values to plot for %s...' % (out_file_name, ))
          continue
      robjects.globalenv["y_data"] = robjects.FloatVector(y_data)
      robjects.globalenv["x_data"] = robjects.FloatVector(x_data)
      # If there is no more than one distinct value, regression is useless
      display_column_regression = options.regression_enabled and \
          len(set(y_data)) > 1
      regression_string = ''
      # Calculate a B-spline regression in order to give a clear overview
      # of the trend of chaotic values.
      if display_column_regression:
        r("bx <- splineDesign(knot_list, x_data)")
        r("fitted_model <- lm(y_data ~ bx)")
        regression_string = ', fitted_model$fit'
      # Define where to store the graph and the format of the image
      r("""%s(file='%s/%s', width=800, height=600)""" % (options.out_type,
        current_dir, out_file_name))
      # Increase the room left for the bottom axis labels (x)
      r("""par(mar=c(9, 4, 4, 2) + 0.1)""")
      # Plot the graph itself
      r("""matplot(x_data, cbind(y_data %s), type='ll',
                lty=1, main='%s (average display time per day)',
                xlab='', ylab='time (s)', xaxt='n')""" % (
                  regression_string, title))
      r("""axis(1, at=x_label_value_list, lab=x_label, las=2)""")
      # Close the graphics device, which finishes the out-type file
      r("""dev.off()""")

      print('Saving %s...' % (out_file_name, ))

# Only run when executed as a script, not when imported as a module.
if __name__ == '__main__':
  main()