Merge branch 'mwaw/extract_postgres_hll_bucktes_class' into 'master'

Extract Buckets out of BatchDistinctCounter. See merge request gitlab-org/gitlab!49344

Merge branch 'mwaw/extract_postgres_hll_bucktes_class' into 'master'
Extract Buckets out of BatchDistinctCounter. See merge request gitlab-org/gitlab!49344
281fda79 · Alper Akgun · 065c88d7 · c111cb39 · 281fda79 · 281fda79
Commit 281fda79 authored Dec 23, 2020 by Alper Akgun
6 changed files
--- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -16,9 +16,9 @@ module Gitlab
      # Grouped relations are NOT supported yet.
      #
      # @example Usage
-      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
-      #    .estimate_distinct_count(
+      #    .execute(
      #      batch_size: 1_000,
      #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
      #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
@@ -30,7 +30,6 @@ module Gitlab
      #  for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
      class BatchDistinctCounter
        ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
-        FALLBACK = -1
        MIN_REQUIRED_BATCH_SIZE = 750
        SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
        MAX_DATA_VOLUME = 4_000_000_000
@@ -38,8 +37,10 @@ module Gitlab
        # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
        DEFAULT_BATCH_SIZE = 10_000
+        ZERO_OFFSET = 1
+        BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
        BIT_31_MASK = "B'0#{'1' * 31}'"
-        BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+        BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
        # @example source_query
        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
        #   FROM %{relation}
@@ -48,73 +49,58 @@ module Gitlab
        #   AND %{column} IS NOT NULL
        BUCKETED_DATA_SQL = <<~SQL
          WITH hashed_attributes AS (%{source_query})
-          SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+          SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
            (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
          FROM hashed_attributes
          GROUP BY 1
        SQL
-        TOTAL_BUCKETS_NUMBER = 512
+        WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
        def initialize(relation, column = nil)
          @relation = relation
          @column = column || relation.primary_key
        end
-        def unwanted_configuration?(finish, batch_size, start)
+        # Executes counter that iterates over database source and returns Gitlab::Database::PostgresHll::Buckets
-          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+        # that can be used to estimate the number of unique elements in the analysed set
-            (finish - start) >= MAX_DATA_VOLUME ||
+        #
-            start > finish
+        # @param batch_size maximum number of rows that will be analysed by a single database query
-        end
+        # @param start initial pkey range
+        # @param finish final pkey range
-        def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+        # @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate number of unique elements
+        def execute(batch_size: nil, start: nil, finish: nil)
          raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
          batch_size ||= DEFAULT_BATCH_SIZE
          start = actual_start(start)
          finish = actual_finish(finish)
-          raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
+          raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
-          return FALLBACK if unwanted_configuration?(finish, batch_size, start)
          batch_start = start
-          hll_blob = {}
+          hll_buckets = Buckets.new
          while batch_start <= finish
            begin
-              hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+              hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
              batch_start += batch_size
            end
            sleep(SLEEP_TIME_IN_SECONDS)
          end
-          estimate_cardinality(hll_blob)
+          hll_buckets
        end
        private
-        # arbitrary values that are present in #estimate_cardinality
+        def unwanted_configuration?(start, finish, batch_size)
-        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
-        # article, they are not representing any entity and serves as tune value
+            (finish - start) >= MAX_DATA_VOLUME ||
-        # for the whole equation
+            start > finish || start < 0 || finish < 0
-        def estimate_cardinality(hll_blob)
-          num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-          num_uniques = (
-            ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
-              (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
-          ).to_i
-          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
-            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
-              Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
-          else
-            num_uniques
-          end
        end
-        def hll_blob_for_batch(start, finish)
+        def hll_buckets_for_batch(start, finish)
          @relation
            .connection
            .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })

--- a/lib/gitlab/database/postgres_hll/buckets.rb
+++ b/lib/gitlab/database/postgres_hll/buckets.rb
+# frozen_string_literal: true
+module Gitlab
+  module Database
+    module PostgresHll
+      # Buckets class represents a data structure built with the HyperLogLog algorithm
+      # that models data distribution in the analysed set. This representation can then be used
+      # for the following purposes:
+      #   1. Estimating the number of unique elements that this structure represents
+      #   2. Merging with another Buckets structure to later estimate the number of unique elements in the union of the two
+      #      represented data sets
+      #   3. Serializing the Buckets structure to JSON format, which can be stored in various persistence layers
+      #
+      # @example Usage
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
+      # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation.
+      #  Like all probabilistic algorithms it has an ERROR RATE margin that can affect values;
+      #  for the given implementation no error rate higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
+      #  and in most cases it is lower. However, if the exact value is necessary, other tools have to be used.
+      class Buckets
+        # Number of buckets the estimate assumes; bucket numbers absent from the
+        # underlying hash are counted as empty buckets.
+        TOTAL_BUCKETS = 512
+        # @param buckets [Hash] bucket number => bucket value. The hash is copied, so
+        #   later merges never modify (or fail on) the caller's object.
+        def initialize(buckets = {})
+          @buckets = buckets.dup
+        end
+        # Estimates the number of unique elements represented by the current buckets.
+        # The result is memoized until the next #merge_hash! call.
+        #
+        # @return [Numeric] estimated number of unique elements
+        def estimated_distinct_count
+          @estimated_distinct_count ||= estimate_cardinality
+        end
+        # Merges another bucket hash into this instance. When a bucket number is
+        # present on both sides, the larger of the two values is kept.
+        #
+        # @param other_buckets_hash [Hash] bucket number => bucket value
+        # @return [Buckets] self, so that calls can be chained
+        def merge_hash!(other_buckets_hash)
+          buckets.merge!(other_buckets_hash) { |_key, old, new| new > old ? new : old }
+          # the memoized estimate describes the pre-merge buckets, so drop it
+          @estimated_distinct_count = nil
+          self
+        end
+        # Serializes the underlying bucket hash to JSON.
+        #
+        # @return [String] bucket hash serialized to JSON
+        def to_json(_ = nil)
+          buckets.to_json
+        end
+        private
+        attr_reader :buckets
+        # arbitrary values that are present in #estimate_cardinality
+        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+        # article, they do not represent any entity and serve as tuning values
+        # for the whole equation
+        def estimate_cardinality
+          correction = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS)
+          num_zero_buckets = TOTAL_BUCKETS - buckets.size
+          num_uniques = (
+            ((TOTAL_BUCKETS**2) * correction) /
+            (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
+          ).to_i
+          # with empty buckets left and a low raw estimate, use the alternative
+          # formula based on the share of empty buckets
+          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
+            (correction * (TOTAL_BUCKETS *
+              Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
+          else
+            num_uniques
+          end
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/utils/usage_data.rb
+++ b/lib/gitlab/utils/usage_data.rb
@@ -61,7 +61,10 @@ module Gitlab
      end
      def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
-        Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
+        Gitlab::Database::PostgresHll::BatchDistinctCounter
+          .new(relation, column)
+          .execute(batch_size: batch_size, start: start, finish: finish)
+          .estimated_distinct_count
      rescue ActiveRecord::StatementInvalid
        FALLBACK
      # catch all rescue should be removed as a part of feature flag rollout issue

--- a/spec/lib/gitlab/database/postgres_hll/batch_distinct_counter_spec.rb
+++ b/spec/lib/gitlab/database/postgres_hll/batch_distinct_counter_spec.rb
@@ -24,107 +24,48 @@ RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
    allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(in_transaction)
  end
-  context 'different distribution of relation records' do
-    [10, 100, 100_000].each do |spread|
-      context "records are spread within #{spread}" do
-        before do
-          ids = (1..spread).to_a.sample(10)
-          create_list(:issue, 10).each_with_index do |issue, i|
-            issue.id = ids[i]
-          end
-        end
-        it 'counts table' do
-          expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(10)
-        end
-      end
-    end
-  end
  context 'unit test for different counting parameters' do
    before_all do
      create_list(:issue, 3, author: user)
      create_list(:issue, 2, author: another_user)
    end
-    describe '#estimate_distinct_count' do
+    describe '#execute' do
-      it 'counts table' do
+      it 'builds hll buckets' do
-        expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(5)
+        expect(described_class.new(model).execute).to be_an_instance_of(Gitlab::Database::PostgresHll::Buckets)
-      end
-      it 'counts with column field' do
-        expect(described_class.new(model, column).estimate_distinct_count).to be_within(error_rate).percent_of(2)
-      end
-      it 'counts with :id field' do
-        expect(described_class.new(model, :id).estimate_distinct_count).to be_within(error_rate).percent_of(5)
-      end
-      it 'counts with "id" field' do
-        expect(described_class.new(model, "id").estimate_distinct_count).to be_within(error_rate).percent_of(5)
-      end
-      it 'counts with table.column field' do
-        expect(described_class.new(model, "#{model.table_name}.#{column}").estimate_distinct_count).to be_within(error_rate).percent_of(2)
-      end
-      it 'counts with Arel column' do
-        expect(described_class.new(model, model.arel_table[column]).estimate_distinct_count).to be_within(error_rate).percent_of(2)
-      end
-      it 'counts over joined relations' do
-        expect(described_class.new(model.joins(:author), "users.email").estimate_distinct_count).to be_within(error_rate).percent_of(2)
-      end
-      it 'counts with :column field with batch_size of 50K' do
-        expect(described_class.new(model, column).estimate_distinct_count(batch_size: 50_000)).to be_within(error_rate).percent_of(2)
-      end
-      it 'will not count table with a batch size less than allowed' do
-        expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
-      end
-      it 'counts with different number of batches and aggregates total result' do
-        stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
-        [1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
-      end
-      it 'counts with a start and finish' do
-        expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
      end
-      it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
+      it "defaults batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
        min_id = model.minimum(:id)
        batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)
        expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
-        described_class.new(model).estimate_distinct_count
+        described_class.new(model).execute
      end
      context 'when a transaction is open' do
        let(:in_transaction) { true }
        it 'raises an error' do
-          expect { described_class.new(model, column).estimate_distinct_count }.to raise_error('BatchCount can not be run inside a transaction')
+          expect { described_class.new(model, column).execute }.to raise_error('BatchCount can not be run inside a transaction')
        end
      end
      context 'disallowed configurations' do
        let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }
-        it 'returns fallback if start is bigger than finish' do
+        it 'raises WRONG_CONFIGURATION_ERROR if start is bigger than finish' do
-          expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
+          expect { described_class.new(model, column).execute(start: 1, finish: 0) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end
-        it 'returns fallback if data volume exceeds upper limit' do
+        it 'raises WRONG_CONFIGURATION_ERROR if data volume exceeds upper limit' do
          large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
-          expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
+          expect { described_class.new(model, column).execute(start: 1, finish: large_finish) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end
-        it 'returns fallback if batch size is less than min required' do
+        it 'raises WRONG_CONFIGURATION_ERROR if batch size is less than min required' do
-          expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
+          expect { described_class.new(model, column).execute(batch_size: small_batch_size) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end
      end
    end

--- a/spec/lib/gitlab/database/postgres_hll/buckets_spec.rb
+++ b/spec/lib/gitlab/database/postgres_hll/buckets_spec.rb
+# frozen_string_literal: true
+require 'spec_helper'
+RSpec.describe Gitlab::Database::PostgresHll::Buckets do
+  # HyperLogLog is a probabilistic algorithm, so estimates are compared within this error margin
+  let(:error_rate) do
+    Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE
+  end
+  let(:five_buckets) { { 121 => 2, 126 => 1, 141 => 1, 383 => 1, 56 => 1 } }
+  let(:two_buckets) { { 141 => 1, 56 => 1 } }
+  describe '#estimated_distinct_count' do
+    it 'provides estimated cardinality', :aggregate_failures do
+      five_estimate = described_class.new(five_buckets).estimated_distinct_count
+      two_estimate = described_class.new(two_buckets).estimated_distinct_count
+      expect(five_estimate).to be_within(error_rate).percent_of(5)
+      expect(two_estimate).to be_within(error_rate).percent_of(2)
+      # an explicitly empty hash and the default argument must both estimate zero
+      [described_class.new({}), described_class.new].each do |empty_buckets|
+        expect(empty_buckets.estimated_distinct_count).to eq 0
+      end
+    end
+  end
+  describe '#merge_hash!' do
+    let(:left_hash) { { 1 => 1, 2 => 3 } }
+    let(:right_hash) { { 1 => 2, 2 => 1 } }
+    it 'merges two hashes together into union of two sets' do
+      merged_json = described_class.new(left_hash).merge_hash!(right_hash).to_json
+      expected_json = described_class.new(1 => 2, 2 => 3).to_json
+      expect(merged_json).to eq expected_json
+    end
+  end
+  describe '#to_json' do
+    it 'serialize HyperLogLog buckets as hash' do
+      serialized = described_class.new(1 => 5).to_json
+      expect(serialized).to eq '{"1":5}'
+    end
+  end
+end
--- a/spec/lib/gitlab/utils/usage_data_spec.rb
+++ b/spec/lib/gitlab/utils/usage_data_spec.rb
@@ -38,32 +38,123 @@ RSpec.describe Gitlab::Utils::UsageData do
  end
  describe '#estimate_batch_distinct_count' do
+    let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
    let(:relation) { double(:relation) }
+    before do
+      allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
+    end
    it 'delegates counting to counter class instance' do
+      buckets = instance_double(Gitlab::Database::PostgresHll::Buckets)
      expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
-        expect(instance).to receive(:estimate_distinct_count)
+        expect(instance).to receive(:execute)
                              .with(batch_size: nil, start: nil, finish: nil)
-                              .and_return(5)
+                              .and_return(buckets)
      end
+      expect(buckets).to receive(:estimated_distinct_count).and_return(5)
      expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
    end
+    context 'quasi integration test for different counting parameters' do
+      let_it_be(:user) { create(:user, email: 'email1@domain.com') }
+      let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }
+      let(:model) { Issue }
+      let(:column) { :author_id }
+      context 'different distribution of relation records' do
+        [10, 100, 100_000].each do |spread|
+          context "records are spread within #{spread}" do
+            before do
+              ids = (1..spread).to_a.sample(10)
+              create_list(:issue, 10).each_with_index do |issue, i|
+                issue.id = ids[i]
+              end
+            end
+            it 'counts table' do
+              expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
+            end
+          end
+        end
+      end
+      context 'different counting parameters' do
+        before_all do
+          create_list(:issue, 3, author: user)
+          create_list(:issue, 2, author: another_user)
+        end
+        it 'counts table' do
+          expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
+        end
+        it 'counts with column field' do
+          expect(described_class.estimate_batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
+        end
+        it 'counts with :id field' do
+          expect(described_class.estimate_batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
+        end
+        it 'counts with "id" field' do
+          expect(described_class.estimate_batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
+        end
+        it 'counts with table.column field' do
+          expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
+        end
+        it 'counts with Arel column' do
+          expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
+        end
+        it 'counts over joined relations' do
+          expect(described_class.estimate_batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
+        end
+        it 'counts with :column field with batch_size of 50K' do
+          expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
+        end
+        it 'counts with different number of batches and aggregates total result' do
+          stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
+          [1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
+        end
+        it 'counts with a start and finish' do
+          expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
+        end
+      end
+    end
+    describe 'error handling' do
+      before do
+        stub_const("Gitlab::Utils::UsageData::FALLBACK", 3)
+        stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 4)
+      end
+      it 'returns fallback if counter raises WRONG_CONFIGURATION_ERROR' do
+        expect(described_class.estimate_batch_distinct_count(relation, 'id', start: 1, finish: 0)).to eq 3
+      end
      it 'returns default fallback value when counting fails due to database error' do
-      stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
        allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
-      expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
+        expect(described_class.estimate_batch_distinct_count(relation)).to eq(3)
      end
      it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
        error = StandardError.new('')
-      stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
        allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)
        expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
-      expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
+        expect(described_class.estimate_batch_distinct_count(relation)).to eq(4)
+      end
    end
  end