Commit 1e854294 authored by Łukasz Nowak's avatar Łukasz Nowak
parent 28b54557
No related merge requests found
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
# Vincent Pelletier <>
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import os
import sys
import imp
import gzip
import getopt
from time import time
# Use tiny_profiler when it can be imported; otherwise fall back to no-op
# stand-ins so the names below always exist.
try:
    from tiny_profiler import profiler_decorator, profiler_report
except ImportError:
    def profiler_decorator(funct):
        """Return funct unchanged (no profiling available)."""
        return funct

    def profiler_report():
        """Do nothing (no profiling data to report)."""
        pass
# Command-line help text, written to stderr when option parsing fails.
usage = """
Usage: [--prefix <prefix>] --config <config> [--debug]
  [--no-average] [--sum] [--load <file>] [--save <file>]
  [--decimate <int>] [file_1 [file_2 [...]]]

Either --prefix or --save must be given.

--prefix <prefix>
  <prefix> is a string which is used to prefix result file names.
  If omitted, no CSV will be generated.

--no-average
  Disable the generation of CSV files with average values.

--sum
  Generate CSV files with time sum as values.
  They are named like average files, with "_sum.csv" in place of "_avg.csv".
  Ignored if --prefix was not given.

--load <file>
  Load internal data dict from given file before processing any given file.
  If it's given multiple times, the content of all those files will be merged.

--save <file>
  Save internal data dict to given file after processing all given files.

--config <config>
  <config> is a python script defining 3 values:
   - a method called "processLine"
   - a compiled regex called "LINE_PATTERN"
   - a date list sort key computation function called "date_key"

--debug
  Display missed and skipped lines.

--decimate <int>
  Instead of generating a line per measure, generate one line per <int>
  measures.
  Measures left over by the integer division of the number of measures by
  the decimate value are all put in the last output line.

file_1 ...
  Log files to process.
  Order in which files are given does not matter.
  Files can be gzip or plain text.

Output files:
  CSV, one file per distinct processLine return value, one line per log day,
  one column per measure.
  First line contains column titles.
  First column contains measure date (first recognisable date in current file).
  Each other cell contains:
    =<value sum>/<value count>
  Example:
    =434/125
  Which means an average of 3.472s over 125 values.
  Empty cells means that there are no values for that measure in current file.
  Strings are surrounded by double quotes (").
  Fields are separated by commas (,).
"""
def parseFile(filename, measure_dict):
    """Parse one log file and add its measures to measure_dict.

    filename
        Path of the log file. It is first opened as gzip; on IOError it is
        reopened as plain text.
    measure_dict
        Dictionary updated in place: every used line appends int(duration)
        to measure_dict[filter_id][result][date].

    Uses the module-level LINE_PATTERN, processLine and debug values.
    Progress, unparseable lines and final statistics are written to stderr.
    Returns None.
    """
    line_number = 0
    match_count = 0
    skip_count = 0
    try:
        logfile = gzip.open(filename, 'r')
        try:
            # The first read is part of the gzip attempt, presumably because
            # a non-gzip file is only rejected once data is actually read.
            line = logfile.readline()
        except IOError:
            logfile.close()
            raise
    except IOError:
        logfile = open(filename, 'r')
        line = logfile.readline()
    begin = time()
    try:
        while line != '':
            line_number += 1
            if line_number % 5000 == 0:
                # Progress counter, overwritten in place thanks to '\r'.
                sys.stderr.write('%i\r' % (line_number, ))
            match_list = LINE_PATTERN.findall(line)
            if len(match_list) != 1:
                sys.stderr.write('Unparseable line: %s:%i %r\n' % (
                    filename, line_number, line))
            else:
                match = match_list[0]
                result, filter_id, date, duration = processLine(
                    match, filename, line_number)
                # Possible result values & meaning:
                #  False: try next filter_method
                #  True: ignore & skip to next line
                #  (string): use & skip to next line
                if result is False:
                    if debug:
                        sys.stderr.write('? %s:%i %r\n' % (
                            filename, line_number, match))
                elif result is True:
                    if debug:
                        sys.stderr.write('- %s:%i %r\n' % (
                            filename, line_number, match))
                    skip_count += 1
                else:
                    measure_dict.setdefault(filter_id, {}).setdefault(
                        result, {}).setdefault(date, []).append(int(duration))
                    match_count += 1
            line = logfile.readline()
    finally:
        logfile.close()
    sys.stderr.write('%i\n' % (line_number, ))
    if line_number > 0:
        duration = time() - begin
        # A coarse clock can report a zero duration on small files.
        if duration > 0:
            line_rate = line_number / duration
        else:
            line_rate = 0
        handled_count = match_count + skip_count
        sys.stderr.write(
            "Matched %i lines (%.2f%%), %i skipped (%.2f%%), "
            "%i unmatched (%.2f%%) in %.2fs (%i lines per second).\n" % (
                match_count,
                (float(match_count) / line_number) * 100,
                skip_count,
                (float(skip_count) / line_number) * 100,
                line_number - handled_count,
                (1 - (float(handled_count) / line_number)) * 100,
                duration,
                line_rate,
            ))
# Default values of the command-line options (see usage).
debug = False
outfile_prefix = None
configuration = None
do_average = True
do_sum = False
load_file_name_list = []
save_file_name = None
decimate_count = 1

try:
    opts, file_list = getopt.getopt(sys.argv[1:], '', [
        'debug', 'config=', 'prefix=', 'no-average', 'sum', 'load=', 'save=',
        'decimate='])
except getopt.GetoptError:
    # sys.exc_info() keeps this handler valid on every Python version.
    reason = sys.exc_info()[1]
    sys.stderr.write('%s\n' % (reason, ))
    sys.stderr.write('%s\n' % (usage, ))
    # Without parsed options there is nothing left to do.
    sys.exit(1)

for name, value in opts:
    if name == '--debug':
        debug = True
    elif name == '--config':
        configuration = value
    elif name == '--prefix':
        outfile_prefix = value
    elif name == '--no-average':
        do_average = False
    elif name == '--sum':
        do_sum = True
    elif name == '--load':
        # May be given several times: every file is loaded and merged.
        load_file_name_list.append(value)
    elif name == '--save':
        save_file_name = value
    elif name == '--decimate':
        decimate_count = int(value)
if configuration is None:
    raise ValueError('--config is mandatory')

# Import the configuration script as a module, looking first in its own
# directory and then along sys.path.
config_file = os.path.splitext(os.path.basename(configuration))[0]
config_path = [os.path.dirname(os.path.abspath(configuration))] + sys.path
file, path, description = imp.find_module(config_file, config_path)
try:
    module = imp.load_module(config_file, file, path, description)
finally:
    # imp.load_module() does not close the file found by imp.find_module().
    if file is not None:
        file.close()

# Values the configuration script provides (see usage, --config).
LINE_PATTERN = module.LINE_PATTERN
processLine = module.processLine
date_key = module.date_key
file_count = len(file_list)
file_number = 0
# measure_dict[filter_id][result][date] -> list of int durations.
measure_dict = {}

# Merge every previously saved result (--load) into measure_dict.
for load_file_name in load_file_name_list:
    load_file = open(load_file_name)
    try:
        # NOTE(review): eval() executes whatever the file contains; only
        # load files written by --save from a trusted source.
        temp_measure_dict = eval(load_file.read(), {})
    finally:
        load_file.close()
    if not isinstance(temp_measure_dict, dict):
        raise ValueError('%r does not contain a dict' % (load_file_name, ))
    for filter_id, result_dict in temp_measure_dict.iteritems():
        for result, date_dict in result_dict.iteritems():
            for date, duration_list in date_dict.iteritems():
                measure_dict.setdefault(filter_id, {}).setdefault(
                    result, {}).setdefault(date, []).extend(duration_list)
    sys.stderr.write('Previous processing result restored from %r\n' % (
        load_file_name, ))

# Parse every log file given on the command line.
for filename in file_list:
    file_number += 1
    sys.stderr.write('Processing %s [%i/%i]...\n' % (
        filename, file_number, file_count))
    parseFile(filename, measure_dict)

# Save the merged result (--save) as a Python literal that the --load branch
# above can eval() back.
if save_file_name is not None:
    save_file = open(save_file_name, 'w')
    try:
        save_file.write(repr(measure_dict))
    finally:
        save_file.close()
    sys.stderr.write('Processing result saved to %r\n' % (save_file_name, ))
if outfile_prefix is not None:
    ## Generate a list of all measures and a 2-levels dictionary with date as
    ## key and measure dictionary as value
    measure_id_list = []
    append = measure_id_list.append
    # sheet_dict[match_id] -> list of (result_id, measure_id): one CSV file
    # per match_id, one column per result_id.
    sheet_dict = {}
    # line_dict[date][measure_id] -> list of durations: one CSV line per date.
    line_dict = {}
    for match_id, match_dict in measure_dict.iteritems():
        for result_id, result_dict in match_dict.iteritems():
            measure_id = (match_id, result_id)
            sheet_dict.setdefault(match_id, []).append((result_id, measure_id))
            # Presumably what the bound "append" above was meant for.
            append(measure_id)
            for date, measure_list in result_dict.iteritems():
                first_level_dict = line_dict.setdefault(date, {})
                assert measure_id not in first_level_dict
                first_level_dict[measure_id] = measure_list
    # Dates are ordered with the key function from the configuration script.
    date_list = sorted(line_dict, key=date_key)
def render_cell(value_list, format):
    """Render the content of one CSV cell.

    value_list
        A list or tuple of numbers, or any other value (such as the empty
        string used for cells without data).
    format
        A %-format string which may use the "sum" and "count" keys.

    Returns format filled with the sum and length of value_list when it is
    a list or tuple, and value_list itself otherwise.
    """
    if isinstance(value_list, (list, tuple)):
        return format % {'sum': sum(value_list), 'count': len(value_list)}
    return value_list
def renderOutput(data_format, filename_suffix):
    """Write one CSV file per entry of sheet_dict.

    data_format
        Cell format handed to render_cell ("sum" and "count" keys).
    filename_suffix
        Last part of the file name:
        <outfile_prefix>_<sheet_id>_<filename_suffix>.csv

    Uses the module-level sheet_dict, line_dict, date_list, outfile_prefix
    and decimate_count values. Each output line merges decimate_count
    consecutive dates and is labelled with the last of them; dates left over
    at the end go into one final line. Returns None.
    """
    for sheet_id, sheet_column_list in sheet_dict.iteritems():
        outfile_name = '%s_%s_%s.csv' % (
            outfile_prefix, sheet_id, filename_suffix)
        sys.stderr.write('Writing to %r...\n' % (outfile_name, ))
        outfile = open(outfile_name, 'w')
        try:
            def write_row(row_date, row_dict):
                # One CSV line: quoted date, then one cell per sheet column;
                # columns without data get an empty cell.
                cell_list = [
                    render_cell(row_dict.get(x[1], ''), data_format)
                    for x in sheet_column_list]
                outfile.write('"%s",%s\n' % (row_date, ','.join(cell_list)))

            outfile.write('"date",%s\n' % (
                ','.join(['"%s"' % (x[0], ) for x in sheet_column_list]), ))
            decimate_dict = {}
            decimate = 0
            for date in date_list:
                for key, value in line_dict[date].iteritems():
                    decimate_dict.setdefault(key, []).extend(value)
                decimate += 1
                if decimate == decimate_count:
                    write_row(date, decimate_dict)
                    decimate_dict = {}
                    decimate = 0
            if len(decimate_dict):
                # Dates left over after the last complete group.
                write_row(date, decimate_dict)
        finally:
            outfile.close()
# CSV files are only generated when --prefix was given; --no-average and
# --sum select which kinds are written.
if outfile_prefix is not None:
    if do_average:
        renderOutput('=%(sum)i/%(count)i', 'avg')
    if do_sum:
        renderOutput('=%(sum)i', 'sum')
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment