Commit 039da94e authored by Vincent Pelletier

Add optional median computation.

For every measure, display the median in addition to the existing values
(score, average, max).
This is optional because it requires an amount of RAM proportional to the
number of hits.
parent 6a4d6f5c
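The median is computed lazily at rendering time from the full list of per-hit durations kept when --enable-median is passed. Below is a minimal standalone sketch of that computation (the helper name is hypothetical and US_PER_S is restated here on the assumption that it is the usual microseconds-per-second factor; neither is taken verbatim from the module):

US_PER_S = 10 ** 6  # assumed microseconds-per-second factor, mirroring the module constant

def median_seconds(duration_list):
  # Durations are accumulated in microseconds; the report displays seconds.
  duration_list = sorted(duration_list)
  if not duration_list:
    return 0
  half = len(duration_list) >> 1
  if len(duration_list) & 1:
    median = duration_list[half]
  else:
    # Even number of samples: average the two middle values.
    median = (duration_list[half - 1] + duration_list[half]) / 2
  return median / US_PER_S

assert median_seconds([3_000_000, 1_000_000, 2_000_000]) == 2.0
assert median_seconds([1_000_000, 2_000_000, 3_000_000, 4_000_000]) == 2.5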
@@ -91,6 +91,12 @@ FILE_OPENER_LIST = [
 INPUT_ENCODING = 'ascii'
 INPUT_ENCODING_ERROR_HANDLER = 'replace'
+class _NullList(list):
+  @staticmethod
+  def append(_):
+    pass
+NULL_LIST = _NullList()
 MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
   'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
@@ -260,9 +266,11 @@ class APDEXStats:
     'duration_total',
     'duration_max',
     'getDuration',
+    'duration_list',
+    'enable_median',
   )
-  def __init__(self, threshold, getDuration):
+  def __init__(self, threshold, getDuration, enable_median):
     threshold *= US_PER_S
     self.threshold = threshold
     self.threshold4 = threshold * APDEX_TOLERATING_COEF
@@ -272,6 +280,12 @@ class APDEXStats:
     self.duration_total = 0
     self.duration_max = 0
     self.getDuration = getDuration
+    self.enable_median = enable_median
+    self.duration_list = (
+      []
+      if enable_median else
+      NULL_LIST
+    )
   def accumulate(self, match):
     duration = self.getDuration(match)
@@ -283,12 +297,15 @@ class APDEXStats:
     elif duration <= self.threshold4:
       self.apdex_4 += 1
     self.hit += 1
+    self.duration_list.append(duration)
   def accumulateFrom(self, other):
     for attribute in ('apdex_1', 'apdex_4', 'hit', 'duration_total'):
       setattr(self, attribute,
         getattr(self, attribute) + getattr(other, attribute))
     self.duration_max = max(self.duration_max, other.duration_max)
+    if self.enable_median:
+      self.duration_list = itertools.chain(self.duration_list, other.duration_list)
   def getApdex(self):
     if self.hit:
@@ -304,9 +321,16 @@ class APDEXStats:
     return float(self.duration_max) / US_PER_S
   @staticmethod
-  def asHTMLHeader(overall=False):
+  def asHTMLHeader(overall=False, enable_median=False):
     return (
-      '<th>apdex</th><th>hits</th><th>avg (s)</th>'
+      '<th>apdex</th>'
+      '<th>hits</th>'
+      '<th>avg (s)</th>' +
+      (
+        '<th>med (s)</th>'
+        if enable_median else
+        ''
+      ) +
       '<th' + (' class="overall_right"' if overall else '') + '>max (s)</th>'
     )
@@ -328,15 +352,35 @@ class APDEXStats:
       extra_right_class = 'overall_right'
     else:
       extra_right_class = ''
+    if self.enable_median:
+      duration_list = sorted(self.duration_list)
+      if duration_list:
+        duration_list_len = len(duration_list)
+        half_duration_list_len = duration_list_len >> 1
+        if duration_list_len & 1:
+          median = duration_list[half_duration_list_len]
+        else:
+          median = (
+            duration_list[half_duration_list_len - 1] +
+            duration_list[half_duration_list_len]
+          ) / 2
+        median /= US_PER_S
+      else:
+        median = 0
+      median_string = f'<td class="{getClassForDuration(median, threshold)} {extra_class}">{median:.2f}</td>'
+    else:
+      median_string = ''
     return (
       f'<td style="{apdex_style}" class="{extra_class} group_left">{round(apdex * 100)}%</td>'
       f'<td class="{extra_class}">{hit}</td>'
-      f'<td class="{getClassForDuration(average, threshold)} {extra_class}">{average:.2f}</td>'
+      f'<td class="{getClassForDuration(average, threshold)} {extra_class}">{average:.2f}</td>' +
+      median_string +
       f'<td class="{getClassForDuration(maximum, threshold)} {extra_class} group_right {extra_right_class}">{maximum:.2f}</td>'
     )
   _IGNORE_IN_STATE = (
     'getDuration',
+    'duration_list',
   )
   @classmethod
@@ -344,9 +388,10 @@ class APDEXStats:
     result = cls(
       threshold=0,
       getDuration=getDuration,
+      enable_median=False,
     )
-    for key in self.__slots__:
-      if key in self._IGNORE_IN_STATE:
+    for key in cls.__slots__:
+      if key in cls._IGNORE_IN_STATE:
         continue
       try:
         value = state[key]
@@ -377,6 +422,7 @@ class GenericSiteStats:
     suffix,
     error_detail=False,
     user_agent_detail=False,
+    enable_median=False,
     # Non-generic parameters
     **_
   ):
@@ -387,10 +433,11 @@ class GenericSiteStats:
     if error_detail:
       # status -> url -> referrer -> count
       self.error_url_count = defaultdict(partial(defaultdict, Counter))
-    self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
-    self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
+    self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration, enable_median))
+    self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration, enable_median))
     self.user_agent_detail = user_agent_detail
     self.user_agent_counter = Counter()
+    self.enable_median = enable_median
   def rescale(self, convert, getDuration):
     for status, date_dict in self.status.items():
@@ -398,7 +445,7 @@ class GenericSiteStats:
       for value_date, status_count in date_dict.items():
         new_date_dict[convert(value_date)] += status_count
       self.status[status] = new_date_dict
-    new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration))
+    new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration, self.enable_median))
     for value_date, data in self.apdex.items():
       new_apdex[convert(value_date)].accumulateFrom(data)
     self.apdex = new_apdex
@@ -431,15 +478,15 @@ class GenericSiteStats:
   ): # pylint: disable=unused-argument
     result = []
     append = result.append
-    apdex = APDEXStats(self.threshold, None)
+    apdex = APDEXStats(self.threshold, None, self.enable_median)
     for data in self.apdex.values():
       apdex.accumulateFrom(data)
     append('<h2>Overall</h2><table class="stats"><tr>')
-    append(APDEXStats.asHTMLHeader())
+    append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
     append('</tr><tr>')
     append(apdex.asHTML(self.threshold))
     append('</tr></table><h2>Hottest pages</h2><table class="stats"><tr>')
-    append(APDEXStats.asHTMLHeader())
+    append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
     append('<th>url</th></tr>')
     for url, data in sorted(self.url_apdex.items(), key=lambda x: x[1].getAverage() * x[1].hit,
       reverse=True)[:n_hottest_pages]:
@@ -521,6 +568,8 @@ class GenericSiteStats:
       suffix=suffix,
       error_detail=error_detail,
       user_agent_detail=state.get('user_agent_detail', True),
+      # json format does not support median, due to how large they can get
+      enable_median=False,
     )
     if error_detail:
       error_url_count = result.error_url_count
@@ -586,6 +635,7 @@ class ERP5SiteStats(GenericSiteStats):
     suffix,
     error_detail=False,
     user_agent_detail=False,
+    enable_median=False,
     erp5_expand_other=False,
   ):
     super().__init__(
@@ -594,6 +644,7 @@ class ERP5SiteStats(GenericSiteStats):
       suffix,
       error_detail=error_detail,
       user_agent_detail=user_agent_detail,
+      enable_median=enable_median,
     )
     self.expand_other = erp5_expand_other
@@ -602,35 +653,54 @@ class ERP5SiteStats(GenericSiteStats):
     # - module id (string)
     # - is document (bool)
     # - date (string)
-    self.module = defaultdict(partial(defaultdict, partial(
-      defaultdict, partial(APDEXStats, threshold, getDuration))))
+    self.module = defaultdict(
+      partial(
+        defaultdict,
+        partial(
+          defaultdict,
+          partial(APDEXStats, threshold, getDuration, enable_median),
+        ),
+      ),
+    )
     # Key levels:
     # - id (string)
     # => 'other' only if expand_other == False
     # - date (string)
-    self.no_module = defaultdict(partial(
-      defaultdict, partial(APDEXStats, threshold, getDuration)))
+    self.no_module = defaultdict(
+      partial(
+        defaultdict,
+        partial(APDEXStats, threshold, getDuration, enable_median),
+      ),
+    )
-    self.site_search = defaultdict(partial(APDEXStats, threshold, getDuration))
+    self.site_search = defaultdict(
+      partial(APDEXStats, threshold, getDuration, enable_median),
+    )
   def rescale(self, convert, getDuration):
     super().rescale(convert, getDuration)
     threshold = self.threshold
     for document_dict in self.module.values():
       for is_document, date_dict in document_dict.items():
-        new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration))
+        new_date_dict = defaultdict(
+          partial(APDEXStats, threshold, getDuration, self.enable_median),
+        )
         for value_date, data in date_dict.items():
           new_date_dict[convert(value_date)].accumulateFrom(data)
         document_dict[is_document] = new_date_dict
     for id_, date_dict in self.no_module.items():
-      new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration))
+      new_date_dict = defaultdict(
+        partial(APDEXStats, threshold, getDuration, self.enable_median),
+      )
       for value_date, data in date_dict.items():
         new_date_dict[convert(value_date)].accumulateFrom(data)
       self.no_module[id_] = new_date_dict
-    attribute = defaultdict(partial(APDEXStats, threshold, getDuration))
+    attribute = defaultdict(
+      partial(APDEXStats, threshold, getDuration, self.enable_median),
+    )
     for value_date, data in self.site_search.items():
       attribute[convert(value_date)].accumulateFrom(data)
     self.site_search = attribute
@@ -662,13 +732,25 @@ class ERP5SiteStats(GenericSiteStats):
     append('<h2>Stats per module</h2><table class="stats stats_erp5"><tr>'
       '<th rowspan="2" colspan="3">module</th>'
       '<th colspan="4" class="overall_right">overall</th>')
-    module_document_overall = defaultdict(partial(APDEXStats, self.threshold,
-      None))
-    filtered_module = defaultdict(partial(defaultdict, partial(
-      defaultdict, partial(APDEXStats, self.threshold, None))))
-    other_overall = APDEXStats(self.threshold, None)
-    filtered_no_module = defaultdict(partial(
-      defaultdict, partial(APDEXStats, self.threshold, None)))
+    module_document_overall = defaultdict(
+      partial(APDEXStats, self.threshold, None, self.enable_median),
+    )
+    filtered_module = defaultdict(
+      partial(
+        defaultdict,
+        partial(
+          defaultdict,
+          partial(APDEXStats, self.threshold, None, self.enable_median),
+        ),
+      ),
+    )
+    other_overall = APDEXStats(self.threshold, None, self.enable_median)
+    filtered_no_module = defaultdict(
+      partial(
+        defaultdict,
+        partial(APDEXStats, self.threshold, None, self.enable_median),
+      ),
+    )
     column_set = set()
     for key, data_dict in self.no_module.items():
       filtered_id_dict = filtered_no_module[key]
@@ -676,8 +758,9 @@ class ERP5SiteStats(GenericSiteStats):
         filtered_id_dict[stat_filter(value_date)].accumulateFrom(value)
         other_overall.accumulateFrom(value)
       column_set.update(filtered_id_dict)
-    filtered_site_search = defaultdict(partial(APDEXStats, self.threshold,
-      None))
+    filtered_site_search = defaultdict(
+      partial(APDEXStats, self.threshold, None, self.enable_median),
+    )
     for value_date, value in self.site_search.items():
       filtered_site_search[stat_filter(value_date)].accumulateFrom(value)
     column_set.update(filtered_site_search)
@@ -695,10 +778,13 @@ class ERP5SiteStats(GenericSiteStats):
       append(f'<th colspan="4">{column}</th>')
     append('</tr><tr>')
     for i in range(len(column_list) + 1):
-      append(APDEXStats.asHTMLHeader(i == 0))
+      append(APDEXStats.asHTMLHeader(
+        overall=i == 0,
+        enable_median=self.enable_median,
+      ))
     append('</tr>')
     def apdexAsColumns(data_dict):
-      data_total = APDEXStats(self.threshold, None)
+      data_total = APDEXStats(self.threshold, None, self.enable_median)
       for data in data_dict.values():
         data_total.accumulateFrom(data)
       append(data_total.asHTML(self.threshold, True))
@@ -754,7 +840,7 @@ class ERP5SiteStats(GenericSiteStats):
       append('</tr>')
     append('</table><h2>Per-level overall</h2><table class="stats"><tr>'
       '<th>level</th>')
-    append(APDEXStats.asHTMLHeader())
+    append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
     append('</tr><tr><th>other</th>')
     append(other_overall.asHTML(self.threshold))
     append('</tr><tr><th>site search</th>')
@@ -1169,6 +1255,7 @@ def asHTML(
     ('apdex threshold', f'{args.apdex:.2f}s'),
     ('period', args.period or (period + ' (auto)')),
     ('timezone', args.to_timezone or "(input's)"),
+    ('median', ('enabled' if args.enable_median else 'disabled')),
   ):
     out.write(f'<tr><th class="text">{caption}</th><td>{value}</td></tr>')
   out.write(f'</table><h2>Hits per {period}</h2><table class="stats">'
@@ -1253,8 +1340,8 @@ def asJSON(out, encoding, per_site, *_): # pylint: disable=unused-argument
   json.dump([(x, y.asJSONState()) for x, y in per_site.items()], out)
 format_generator = {
-  'html': (asHTML, 'utf-8'),
-  'json': (asJSON, 'ascii'),
+  'html': (asHTML, 'utf-8', True),
+  'json': (asJSON, 'ascii', False),
 }
 ZERO_TIMEDELTA = timedelta(0, 0)
@@ -1335,6 +1422,9 @@ def main():
     'Useful when migrating from one configuration/software package to '
     'another while keeping results comparable even if the latter has a '
     'much larger (and possibly non-configurable) total request timeout.')
+  parser.add_argument('--enable-median', action='store_true',
+    help='Enable median computation. Increases memory use. Forcibly '
+    'disabled when state files are used, either as input or output.')
   group = parser.add_argument_group('generated content (all formats)')
   group.add_argument('-a', '--apdex', default=1.0, type=float,
@@ -1413,6 +1503,8 @@ def main():
   else:
     parser.error('Neither %D nor %T are present in logformat, apdex '
       'cannot be computed.')
+  generator, out_encoding, enable_median = format_generator[args.format]
+  args.enable_median = enable_median = enable_median and args.enable_median and not args.state_file
   if args.duration_cap:
     def getDuration( # pylint: disable=function-redefined
       match,
@@ -1662,6 +1754,7 @@ def main():
           error_detail=error_detail,
           user_agent_detail=user_agent_detail,
           erp5_expand_other=erp5_expand_other,
+          enable_median=enable_median,
         )
       try:
         site_data.accumulate(match, url_match, hit_date)
@@ -1674,7 +1767,6 @@ def main():
     if show_progress:
       print(lineno, file=sys.stderr)
   end_parsing_time = time.time()
-  generator, out_encoding = format_generator[args.format]
   if args.out == '-':
     out = sys.stdout
     out.reconfigure(encoding=out_encoding)
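For reference, here is a standalone sketch (not part of the patch) of the _NullList trick introduced at the top of the diff: when median computation is disabled, durations are appended to a shared list whose append() is a no-op, so the hot accumulate() path stays branch-free and memory does not grow with the number of hits. The make_duration_store helper is hypothetical, added only for this illustration.

class _NullList(list):
  @staticmethod
  def append(_):
    pass  # silently discard the sample

NULL_LIST = _NullList()

def make_duration_store(enable_median):
  # Mirrors APDEXStats.__init__: keep samples only when medians are wanted.
  return [] if enable_median else NULL_LIST

store = make_duration_store(enable_median=False)
for duration in (120, 80, 300):
  store.append(duration)
assert len(store) == 0  # nothing retained, so memory stays constant

store = make_duration_store(enable_median=True)
for duration in (120, 80, 300):
  store.append(duration)
assert sorted(store) == [80, 120, 300]  # samples kept for the median pass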