Commit a898ae32 authored by Mikołaj Wawrzyniak's avatar Mikołaj Wawrzyniak

Merge branch 'cat-update-pg-hll-formula' into 'master'

Update formula for PG-HyperLogLog small cardinality estimates

See merge request gitlab-org/gitlab!76617
parents dadec8cb 3e99032d
...@@ -65,8 +65,7 @@ module Gitlab ...@@ -65,8 +65,7 @@ module Gitlab
).to_i ).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS * TOTAL_BUCKETS * Math.log(TOTAL_BUCKETS.to_f / num_zero_buckets)
Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
else else
num_uniques num_uniques
end end
......
...@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_1) { 'metric_1' } let(:metric_1) { 'metric_1' }
let(:metric_2) { 'metric_2' } let(:metric_2) { 'metric_2' }
let(:metric_names) { [metric_1, metric_2] } let(:metric_names) { [metric_1, metric_2] }
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
describe 'metric calculations' do describe 'metric calculations' do
before do before do
...@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end end
it 'returns the number of unique events in the union of all metrics' do it 'returns the number of unique events in the union of all metrics' do
expect(calculate_metrics_union.round(2)).to eq(3.12) expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(3)
end end
context 'when there is no aggregated data saved' do context 'when there is no aggregated data saved' do
...@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] } let(:metric_names) { [metric_1] }
it 'returns the number of unique events for that metric' do it 'returns the number of unique events for that metric' do
expect(calculate_metrics_union.round(2)).to eq(2.08) expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(2)
end end
end end
end end
...@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end end
it 'returns the number of common events in the intersection of all metrics' do it 'returns the number of common events in the intersection of all metrics' do
expect(calculate_metrics_intersections.round(2)).to eq(1.04) expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(1)
end end
context 'when there is no aggregated data saved' do context 'when there is no aggregated data saved' do
...@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_ ...@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] } let(:metric_names) { [metric_1] }
it 'returns the number of common/unique events for the intersection of that metric' do it 'returns the number of common/unique events for the intersection of that metric' do
expect(calculate_metrics_intersections.round(2)).to eq(2.08) expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(2)
end end
end end
end end
......
...@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do ...@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do
end.new(time_frame: 'all') end.new(time_frame: 'all')
end end
it 'calculates a correct result', quarantine: 'https://gitlab.com/gitlab-org/gitlab/-/issues/348139' do it 'calculates a correct result' do
expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3) expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3)
end end
......
...@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do
# build_needs set: ['1', '2', '3', '4', '5'] # build_needs set: ['1', '2', '3', '4', '5']
# ci_build set ['a', 'b'] # ci_build set ['a', 'b']
# with them, current implementation is expected to consistently report # with them, current implementation is expected to consistently report
# 5.217656147118495 and 2.0809220082170614 values # the same static values
# This test suite is expected to assure, that HyperLogLog implementation # This test suite is expected to assure, that HyperLogLog implementation
# behaves consistently between changes made to other parts of codebase. # behaves consistently between changes made to other parts of codebase.
# In case of fine tuning or changes to HyperLogLog algorithm implementation # In case of fine tuning or changes to HyperLogLog algorithm implementation
...@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do
let(:model) { Ci::BuildNeed } let(:model) { Ci::BuildNeed }
let(:column) { :name } let(:column) { :name }
let(:build_needs_estimated_cardinality) { 5.217656147118495 } let(:build_needs_estimated_cardinality) { 5.024574181542231 }
let(:ci_builds_estimated_cardinality) { 2.0809220082170614 } let(:ci_builds_estimated_cardinality) { 2.003916452421793 }
before do before do
allow(model.connection).to receive(:transaction_open?).and_return(false) allow(model.connection).to receive(:transaction_open?).and_return(false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment