Commit 1a40ddfe authored by Mikolaj Wawrzyniak's avatar Mikolaj Wawrzyniak

Calculate intersetion of redis hll metrics

In order to enable better analysis of collected usage metrics
we should enable obtaining value of intersection of any metrics.
As a first iteration we focus on Redis HLL counter
based metrics.
parent 7b798038
#- name: unique name of aggregated metric
# operator: aggregation operator. Valid values are:
# - "ANY": counts unique elements that were observed triggering any of following events
# - "ALL": counts unique elements that were observed triggering all of following events
# events: list of events names to aggregate into metric. All events in this list must have the same 'redis_slot' and 'aggregation' attributes
# see from lib/gitlab/usage_data_counters/known_events.yml for the list of valid events.
---
- name: product_analytics_test_aggregated_metrics
operator: ANY
events: ['i_search_total', 'i_search_advanced', 'i_search_paid']
- name: product_analytics_test_combined_events
operator: ALL
events: ['i_search_total', 'i_search_advanced', 'i_search_paid']
......@@ -17,8 +17,10 @@ module Gitlab
KNOWN_EVENTS_PATH = File.expand_path('known_events/*.yml', __dir__)
ALLOWED_AGGREGATIONS = %i(daily weekly).freeze
UNION_OF_AGGREGATED_METRICS = 'ANY'
INTERSECTION_OF_AGGREGATED_METRICS = 'ALL'
ALLOWED_METRICS_AGGREGATIONS = [UNION_OF_AGGREGATED_METRICS, INTERSECTION_OF_AGGREGATED_METRICS].freeze
AGGREGATED_METRICS_PATH = File.expand_path('aggregated_metrics/*.yml', __dir__)
ALLOWED_METRICS_AGGREGATIONS = %w[ANY].freeze
# Track event on entity_id
# Increment a Redis HLL counter for unique event_name and entity_id
......@@ -113,9 +115,79 @@ module Gitlab
private
def calculate_count_for_aggregation(aggregation, start_date:, end_date:)
validate_aggregation_operator!(aggregation[:operator])
case aggregation[:operator]
when UNION_OF_AGGREGATED_METRICS
calculate_events_union(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
when INTERSECTION_OF_AGGREGATED_METRICS
calculate_events_intersections(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
else
raise UnknownAggregationOperator, "Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}"
end
end
# calculate intersection of 'n' sets based on inclusion exclusion principle https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
# this method will be extracted to dedicated module with https://gitlab.com/gitlab-org/gitlab/-/issues/273391
def calculate_events_intersections(event_names:, start_date:, end_date:, subset_powers_cache: Hash.new({}))
# calculate power of intersection of all given metrics from inclusion exclusion principle
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C|) =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
# calculate each components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ...
subset_powers_data = subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
# calculate last component of the equation |A & B & C & D| = .... - |A + B + C + D|
power_of_union_of_all_events = begin
subset_powers_cache[event_names.size][event_names.join('_+_')] ||= \
calculate_events_union(event_names: event_names, start_date: start_date, end_date: end_date)
end
# in order to determine if part of equation (|A & B & C|, |A & B & C & D|), that represents the intersection that we need to calculate,
# is positive or negative in particular equation we need to determine if number of subsets is even or odd. Please take a look at two examples below
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + |A & B & C| =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
subset_powers_size_even = subset_powers_data.size.even?
# sum all components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ... =>
sum_of_all_subset_powers = sum_subset_powers(subset_powers_data, subset_powers_size_even)
# add last component of the equation |A & B & C & D| = sum_of_all_subset_powers - |A + B + C + D|
sum_of_all_subset_powers + (subset_powers_size_even ? power_of_union_of_all_events : -power_of_union_of_all_events)
end
def sum_subset_powers(subset_powers_data, subset_powers_size_even)
sum_without_sign = subset_powers_data.to_enum.with_index.sum do |value, index|
(index + 1).odd? ? value : -value
end
count_unique_events(event_names: aggregation[:events], start_date: start_date, end_date: end_date) do |events|
(subset_powers_size_even ? -1 : 1) * sum_without_sign
end
def subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
subset_sizes = (1..(event_names.size - 1))
subset_sizes.map do |subset_size|
if subset_size > 1
# calculate sum of powers of intersection between each subset (with given size) of metrics: #|A + B + C + D| = ... - (|A & B| + |A & C| + .. + |C & D|)
event_names.combination(subset_size).sum do |events_subset|
subset_powers_cache[subset_size][events_subset.join('_&_')] ||= \
calculate_events_intersections(event_names: events_subset, start_date: start_date, end_date: end_date, subset_powers_cache: subset_powers_cache)
end
else
# calculate sum of powers of each set (metric) alone #|A + B + C + D| = (|A| + |B| + |C| + |D|) - ...
event_names.sum do |event|
subset_powers_cache[subset_size][event] ||= \
unique_events(event_names: event, start_date: start_date, end_date: end_date)
end
end
end
end
def calculate_events_union(event_names:, start_date:, end_date:)
count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date) do |events|
raise SlotMismatch, events unless events_in_same_slot?(events)
raise AggregationMismatch, events unless events_same_aggregation?(events)
end
......@@ -226,12 +298,6 @@ module Gitlab
end.flatten
end
def validate_aggregation_operator!(operator)
return true if ALLOWED_METRICS_AGGREGATIONS.include?(operator)
raise UnknownAggregationOperator.new("Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}")
end
def weekly_redis_keys(events:, start_date:, end_date:)
weeks = end_date.to_date.cweek - start_date.to_date.cweek
weeks = 1 if weeks == 0
......
......@@ -277,29 +277,23 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
end
end
context 'aggregated metrics' do
context 'aggregated_metrics_data' do
let(:known_events) do
[
{ name: 'event1_slot', redis_slot: "slot", category: 'category1', aggregation: "weekly" },
{ name: 'event2_slot', redis_slot: "slot", category: 'category2', aggregation: "weekly" },
{ name: 'event3', category: 'category2', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event1_slot event2_slot], operator: "ANY" },
{ name: 'gmau_2', events: %w[event3], operator: "ANY" }
{ name: 'event3_slot', redis_slot: "slot", category: 'category3', aggregation: "weekly" },
{ name: 'event5_slot', redis_slot: "slot", category: 'category4', aggregation: "weekly" },
{ name: 'event4', category: 'category2', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
before do
allow(described_class).to receive(:known_events).and_return(known_events)
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
end
shared_examples 'aggregated_metrics_data' do
context 'no combination is tracked' do
context 'no aggregated metrics is defined' do
it 'returns empty hash' do
allow(described_class).to receive(:aggregated_metrics).and_return([])
......@@ -307,14 +301,51 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
end
end
context 'there are some combinations defined' do
it 'returns the number of unique events for all known events' do
results = {
'gmau_1' => 2,
'gmau_2' => 3
}
context 'there are aggregated metrics defined' do
before do
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
end
context 'with ALL operator' do
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event1_slot event2_slot], operator: "ALL" },
{ name: 'gmau_2', events: %w[event1_slot event2_slot event3_slot], operator: "ALL" },
{ name: 'gmau_3', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ALL" },
{ name: 'gmau_4', events: %w[event4], operator: "ALL" }
].map(&:with_indifferent_access)
end
expect(aggregated_metrics_data).to eq(results)
it 'returns the number of unique events for all known events' do
results = {
'gmau_1' => 3,
'gmau_2' => 2,
'gmau_3' => 1,
'gmau_4' => 3
}
expect(aggregated_metrics_data).to eq(results)
end
end
context 'with ANY operator' do
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event3_slot event5_slot], operator: "ANY" },
{ name: 'gmau_2', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ANY" },
{ name: 'gmau_3', events: %w[event4], operator: "ANY" }
].map(&:with_indifferent_access)
end
it 'returns the number of unique events for all known events' do
results = {
'gmau_1' => 2,
'gmau_2' => 3,
'gmau_3' => 3
}
expect(aggregated_metrics_data).to eq(results)
end
end
end
end
......@@ -324,16 +355,22 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity2, 'event1_slot', 2.days.ago)
described_class.track_event(entity3, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 2.days.ago)
described_class.track_event(entity2, 'event2_slot', 3.days.ago)
described_class.track_event(entity3, 'event2_slot', 3.days.ago)
described_class.track_event(entity1, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event5_slot', 3.days.ago)
# events out of time scope
described_class.track_event(entity3, 'event2_slot', 8.days.ago)
# events in different slots
described_class.track_event(entity1, 'event3', 2.days.ago)
described_class.track_event(entity2, 'event3', 2.days.ago)
described_class.track_event(entity4, 'event3', 2.days.ago)
described_class.track_event(entity1, 'event4', 2.days.ago)
described_class.track_event(entity2, 'event4', 2.days.ago)
described_class.track_event(entity4, 'event4', 2.days.ago)
end
it_behaves_like 'aggregated_metrics_data'
......@@ -342,21 +379,58 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
describe '.aggregated_metrics_monthly_data' do
subject(:aggregated_metrics_data) { described_class.aggregated_metrics_monthly_data }
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 10.days.ago)
described_class.track_event(entity3, 'event2_slot', 4.weeks.ago.advance(days: 1))
it_behaves_like 'aggregated_metrics_data' do
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity2, 'event1_slot', 2.days.ago)
described_class.track_event(entity3, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 2.days.ago)
described_class.track_event(entity2, 'event2_slot', 3.days.ago)
described_class.track_event(entity3, 'event2_slot', 3.days.ago)
described_class.track_event(entity1, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event3_slot', 10.days.ago)
described_class.track_event(entity2, 'event5_slot', 4.weeks.ago.advance(days: 1))
# events out of time scope
described_class.track_event(entity1, 'event5_slot', 4.weeks.ago.advance(days: -1))
# events in different slots
described_class.track_event(entity1, 'event4', 2.days.ago)
described_class.track_event(entity2, 'event4', 2.days.ago)
described_class.track_event(entity4, 'event4', 2.days.ago)
end
end
# events out of time scope
described_class.track_event(entity3, 'event2_slot', 4.weeks.ago.advance(days: -1))
context 'Redis calls' do
let(:aggregated_metrics) do
[
{ name: 'gmau_3', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ALL" }
].map(&:with_indifferent_access)
end
# events in different slots
described_class.track_event(entity1, 'event3', 2.days.ago)
described_class.track_event(entity2, 'event3', 2.days.ago)
described_class.track_event(entity4, 'event3', 2.days.ago)
end
let(:known_events) do
[
{ name: 'event1_slot', redis_slot: "slot", category: 'category1', aggregation: "weekly" },
{ name: 'event2_slot', redis_slot: "slot", category: 'category2', aggregation: "weekly" },
{ name: 'event3_slot', redis_slot: "slot", category: 'category3', aggregation: "weekly" },
{ name: 'event5_slot', redis_slot: "slot", category: 'category4', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
it_behaves_like 'aggregated_metrics_data'
it 'caches intermediate operations' do
allow(described_class).to receive(:known_events).and_return(known_events)
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
4.downto(1) do |subset_size|
known_events.combination(subset_size).each do |events|
keys = described_class.send(:weekly_redis_keys, events: events, start_date: 4.weeks.ago.to_date, end_date: Date.current)
expect(Gitlab::Redis::HLL).to receive(:count).with(keys: keys).once.and_return(0)
end
end
subject
end
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment