Merge branch...

Merge branch 'mwaw/296169-usage-data-hll-count-for-estimate_batch_distinct_count-outside-expected-error-range' into 'master' Use fixed dataset in estimate_batch_distinct_count test suite to avoid flaky test results See merge request gitlab-org/gitlab!51207

Merge branch...
Merge branch 'mwaw/296169-usage-data-hll-count-for-estimate_batch_distinct_count-outside-expected-error-range' into 'master' Use fixed dataset in estimate_batch_distinct_count test suite to avoid flaky test results See merge request gitlab-org/gitlab!51207
acf9b5df · Gabriel Mazetto · e05636ae · 1a4916aa · acf9b5df
Commit acf9b5df authored Jan 12, 2021 by Gabriel Mazetto
Show whitespace changes
Inline Side-by-side

Showing with 38 additions and 45 deletions

spec/lib/gitlab/utils/usage_data_spec.rb spec/lib/gitlab/utils/usage_data_spec.rb +38 -45

No files found.
--- a/spec/lib/gitlab/utils/usage_data_spec.rb
+++ b/spec/lib/gitlab/utils/usage_data_spec.rb
@@ -58,76 +58,69 @@ RSpec.describe Gitlab::Utils::UsageData do
      expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
    end
-    context 'quasi integration test for different counting parameters', quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/296169' } do
+    context 'quasi integration test for different counting parameters' do
-      let_it_be(:user) { create(:user, email: 'email1@domain.com') }
+      # The HyperLogLog algorithm (http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
-      let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }
+      # used in estimate_batch_distinct_count produces probabilistic
+      # estimates of the number of unique values present in a dataset; because of that, its results
-      let(:model) { Issue }
+      # are always off by some small factor from the real value. However, for a given
-      let(:column) { :author_id }
+      # dataset it provides a consistent and deterministic result. In the following context
+      # the analyzed sets consist of these values:
-      context 'different distribution of relation records' do
+      # build_needs set: ['1', '2', '3', '4', '5']
-        [10, 100, 100_000].each do |spread|
+      # ci_build set: ['a', 'b']
-          context "records are spread within #{spread}" do
+      # With these sets, the current implementation is expected to consistently report
-            before do
+      # the values 5.217656147118495 and 2.0809220082170614, respectively.
-              ids = (1..spread).to_a.sample(10)
+      # This test suite is expected to ensure that the HyperLogLog implementation
-              create_list(:issue, 10).each_with_index do |issue, i|
+      # behaves consistently across changes made to other parts of the codebase.
-                issue.id = ids[i]
+      # In case of fine-tuning or changes to the HyperLogLog algorithm implementation,
-              end
+      # one should run an in-depth analysis of accuracy with the supplementary rake tasks
-            end
+      # currently under implementation at https://gitlab.com/gitlab-org/gitlab/-/merge_requests/51118
+      # and adjust the values used in this context accordingly.
-            it 'counts table' do
+      let_it_be(:build) { create(:ci_build, name: 'a') }
-              expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
+      let_it_be(:another_build) { create(:ci_build, name: 'b') }
-            end
-          end
+      let(:model) { Ci::BuildNeed }
-        end
+      let(:column) { :name }
-      end
+      let(:build_needs_estimated_cardinality) { 5.217656147118495 }
+      let(:ci_builds_estimated_cardinality) { 2.0809220082170614 }
      context 'different counting parameters' do
        before_all do
-          create_list(:issue, 3, author: user)
+          1.upto(3) { |i| create(:ci_build_need, name: i, build: build) }
-          create_list(:issue, 2, author: another_user)
+          4.upto(5) { |i| create(:ci_build_need, name: i, build: another_build) }
-        end
-        it 'counts table' do
-          expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
-        end
-        it 'counts with column field' do
-          expect(described_class.estimate_batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
        end
-        it 'counts with :id field' do
+        it 'counts with symbol passed in column argument' do
-          expect(described_class.estimate_batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
+          expect(described_class.estimate_batch_distinct_count(model, column)).to eq(build_needs_estimated_cardinality)
        end
-        it 'counts with "id" field' do
+        it 'counts with string passed in column argument' do
-          expect(described_class.estimate_batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
+          expect(described_class.estimate_batch_distinct_count(model, column.to_s)).to eq(build_needs_estimated_cardinality)
        end
-        it 'counts with table.column field' do
+        it 'counts with table.column passed in column argument' do
-          expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
+          expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to eq(build_needs_estimated_cardinality)
        end
-        it 'counts with Arel column' do
+        it 'counts with Arel passed in column argument' do
-          expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
+          expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to eq(build_needs_estimated_cardinality)
        end
        it 'counts over joined relations' do
-          expect(described_class.estimate_batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
+          expect(described_class.estimate_batch_distinct_count(model.joins(:build), "ci_builds.name")).to eq(ci_builds_estimated_cardinality)
        end
        it 'counts with :column field with batch_size of 50K' do
-          expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
+          expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to eq(build_needs_estimated_cardinality)
        end
        it 'counts with different number of batches and aggregates total result' do
          stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
-          [1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
+          [1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, column, batch_size: i)).to eq(build_needs_estimated_cardinality) }
        end
        it 'counts with a start and finish' do
-          expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
+          expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to eq(build_needs_estimated_cardinality)
        end
      end
    end