Commit 3e99032d authored by Catalin Irimie's avatar Catalin Irimie

Update formula for PG-HyperLogLog small cardinality estimates

Based on the original HLL paper and referenced linear probabilistic
counting paper, this should be m*log(m/V) instead of using the
alpha(m) constant for small cardinality, as linear counting is
used instead.
parent 6bd1a6f6
...@@ -65,8 +65,7 @@ module Gitlab ...@@ -65,8 +65,7 @@ module Gitlab
).to_i ).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS * TOTAL_BUCKETS * Math.log(TOTAL_BUCKETS.to_f / num_zero_buckets)
Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
else else
num_uniques num_uniques
end end
......
...@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_1) { 'metric_1' } let(:metric_1) { 'metric_1' }
let(:metric_2) { 'metric_2' } let(:metric_2) { 'metric_2' }
let(:metric_names) { [metric_1, metric_2] } let(:metric_names) { [metric_1, metric_2] }
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
describe 'metric calculations' do describe 'metric calculations' do
before do before do
...@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end end
it 'returns the number of unique events in the union of all metrics' do it 'returns the number of unique events in the union of all metrics' do
expect(calculate_metrics_union.round(2)).to eq(3.12) expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(3)
end end
context 'when there is no aggregated data saved' do context 'when there is no aggregated data saved' do
...@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] } let(:metric_names) { [metric_1] }
it 'returns the number of unique events for that metric' do it 'returns the number of unique events for that metric' do
expect(calculate_metrics_union.round(2)).to eq(2.08) expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(2)
end end
end end
end end
...@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end end
it 'returns the number of common events in the intersection of all metrics' do it 'returns the number of common events in the intersection of all metrics' do
expect(calculate_metrics_intersections.round(2)).to eq(1.04) expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(1)
end end
context 'when there is no aggregated data saved' do context 'when there is no aggregated data saved' do
...@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] } let(:metric_names) { [metric_1] }
it 'returns the number of common/unique events for the intersection of that metric' do it 'returns the number of common/unique events for the intersection of that metric' do
expect(calculate_metrics_intersections.round(2)).to eq(2.08) expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(2)
end end
end end
end end
......
...@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do ...@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do
end.new(time_frame: 'all') end.new(time_frame: 'all')
end end
it 'calculates a correct result', quarantine: 'https://gitlab.com/gitlab-org/gitlab/-/issues/348139' do it 'calculates a correct result' do
expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3) expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3)
end end
......
...@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do
# build_needs set: ['1', '2', '3', '4', '5'] # build_needs set: ['1', '2', '3', '4', '5']
# ci_build set ['a', 'b'] # ci_build set ['a', 'b']
# with them, current implementation is expected to consistently report # with them, current implementation is expected to consistently report
# 5.217656147118495 and 2.0809220082170614 values # the same static values
# This test suite is expected to assure, that HyperLogLog implementation # This test suite is expected to assure, that HyperLogLog implementation
# behaves consistently between changes made to other parts of codebase. # behaves consistently between changes made to other parts of codebase.
# In case of fine tuning or changes to HyperLogLog algorithm implementation # In case of fine tuning or changes to HyperLogLog algorithm implementation
...@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do
let(:model) { Ci::BuildNeed } let(:model) { Ci::BuildNeed }
let(:column) { :name } let(:column) { :name }
let(:build_needs_estimated_cardinality) { 5.217656147118495 } let(:build_needs_estimated_cardinality) { 5.024574181542231 }
let(:ci_builds_estimated_cardinality) { 2.0809220082170614 } let(:ci_builds_estimated_cardinality) { 2.003916452421793 }
before do before do
allow(model.connection).to receive(:transaction_open?).and_return(false) allow(model.connection).to receive(:transaction_open?).and_return(false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment