Commit c111cb39 authored by Mikołaj Wawrzyniak, committed by Alper Akgun

Extract Buckets out of BatchDistinctCounter

To enable more flexible operations over HLL buckets, we need to extract
a dedicated class to model their behaviour.
parent fd79577a
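Concretely, the counter now returns a Buckets instance instead of a bare integer, so intermediate results can be merged, estimated, and serialized independently. A minimal sketch of the flow this enables, using only the API introduced in this commit (the bucket hashes are illustrative):

    # A sketch of what the extraction enables; bucket hashes are illustrative.
    buckets = Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1)

    buckets.merge_hash!(141 => 2, 300 => 1) # bucket-wise max, i.e. the union of the two sets
    buckets.estimated_distinct_count        # probabilistic estimate for the merged set
    buckets.to_json                         # => e.g. '{"141":2,"56":1,"300":1}', persistable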
...@@ -16,9 +16,9 @@ module Gitlab
      # Grouped relations are NOT supported yet.
      #
      # @example Usage
-     #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+     #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
-     #    .estimate_distinct_count(
+     #    .execute(
      #      batch_size: 1_000,
      #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
      #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
...@@ -30,7 +30,6 @@ module Gitlab
      # for most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
      class BatchDistinctCounter
        ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
-       FALLBACK = -1
        MIN_REQUIRED_BATCH_SIZE = 750
        SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
        MAX_DATA_VOLUME = 4_000_000_000
...@@ -38,8 +37,10 @@ module Gitlab
        # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
        DEFAULT_BATCH_SIZE = 10_000
+       ZERO_OFFSET = 1
+       BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
        BIT_31_MASK = "B'0#{'1' * 31}'"
-       BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+       BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"

        # @example source_query
        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
        #   FROM %{relation}
...@@ -48,73 +49,58 @@ module Gitlab
        #   AND %{column} IS NOT NULL
        BUCKETED_DATA_SQL = <<~SQL
          WITH hashed_attributes AS (%{source_query})
-         SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+         SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
            (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
          FROM hashed_attributes
          GROUP BY 1
        SQL
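An aside on what this query computes per row: the low bits of the 32-bit md5 hash select one of 512 buckets, and the remaining 31 bits contribute a "highest set bit" rank, of which the query keeps the maximum per bucket (via MIN on the value before the log). A plain-Ruby mirror of the same arithmetic, under the constants defined above; bucket_for is a hypothetical helper for illustration, not part of the commit:

    require 'digest'

    TOTAL_BUCKETS = 512 # mirrors Buckets::TOTAL_BUCKETS
    ZERO_OFFSET   = 1

    # Hypothetical single-value mirror of BUCKETED_DATA_SQL.
    def bucket_for(value)
      hash = Digest::MD5.hexdigest(value.to_s)[0, 8].to_i(16)   # first 32 bits of md5, like ('X' || md5(...))::bit(32)
      bucket_num = hash & (TOTAL_BUCKETS - ZERO_OFFSET)          # low 9 bits pick one of 512 buckets
      bit_31 = hash & 0x7fffffff                                 # BIT_31_MASK: drop the top bit
      # Rank of the highest set bit, 1..31; the SQL has no zero guard,
      # so rank 32 stands in here for the all-zero case.
      bucket_hash = bit_31.zero? ? 32 : 31 - Math.log2(bit_31).floor
      [bucket_num, bucket_hash]
    end

    bucket_for(42) # => [bucket id in 0..511, rank in 1..32]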
-       TOTAL_BUCKETS_NUMBER = 512
+       WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)

        def initialize(relation, column = nil)
          @relation = relation
          @column = column || relation.primary_key
        end

-       def unwanted_configuration?(finish, batch_size, start)
-         batch_size <= MIN_REQUIRED_BATCH_SIZE ||
-           (finish - start) >= MAX_DATA_VOLUME ||
-           start > finish
-       end
-
-       def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+       # Executes the counter, which iterates over the database source and returns Gitlab::Database::PostgresHll::Buckets
+       # that can be used to estimate the number of unique elements in the analysed set
+       #
+       # @param batch_size maximal number of rows that will be analysed by a single database query
+       # @param start initial pkey range
+       # @param finish final pkey range
+       # @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate the number of unique elements
+       def execute(batch_size: nil, start: nil, finish: nil)
          raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?

          batch_size ||= DEFAULT_BATCH_SIZE
          start = actual_start(start)
          finish = actual_finish(finish)

-         raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
-         return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+         raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)

          batch_start = start
-         hll_blob = {}
+         hll_buckets = Buckets.new

          while batch_start <= finish
            begin
-             hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) { |_key, old, new| new > old ? new : old }
+             hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
              batch_start += batch_size
            end
            sleep(SLEEP_TIME_IN_SECONDS)
          end

-         estimate_cardinality(hll_blob)
+         hll_buckets
        end

        private

-       # arbitrary values that are present in #estimate_cardinality
-       # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
-       # article, they are not representing any entity and serves as tune value
-       # for the whole equation
-       def estimate_cardinality(hll_blob)
-         num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-
-         num_uniques = (
-           ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
-           (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
-         ).to_i
-
-         if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
-           ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
-             Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
-         else
-           num_uniques
-         end
+       def unwanted_configuration?(start, finish, batch_size)
+         batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+           (finish - start) >= MAX_DATA_VOLUME ||
+           start > finish || start < 0 || finish < 0
        end

-       def hll_blob_for_batch(start, finish)
+       def hll_buckets_for_batch(start, finish)
          @relation
            .connection
            .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
...
# frozen_string_literal: true

module Gitlab
  module Database
    module PostgresHll
      # Buckets class represents a data structure built with the HyperLogLog algorithm
      # that models the distribution of data in the analysed set. This representation can then be used
      # for the following purposes:
      #   1. Estimating the number of unique elements that this structure represents
      #   2. Merging with another Buckets structure to later estimate the number of unique elements in the union of the two
      #      represented data sets
      #   3. Serializing the Buckets structure to JSON format, which can be stored in various persistence layers
      #
      # @example Usage
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
      #
      # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation
      #  Like all probabilistic algorithms it has an ERROR RATE margin that can affect values;
      #  for the given implementation no value higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
      #  and for most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
      class Buckets
        TOTAL_BUCKETS = 512

        def initialize(buckets = {})
          @buckets = buckets
        end

        # Estimates the number of unique elements in the analysed set based on the underlying HyperLogLog structure.
        #
        # @return [Float] estimated number of unique elements
        def estimated_distinct_count
          @estimated_distinct_count ||= estimate_cardinality
        end

        # Updates the instance's underlying HyperLogLog structure by merging it with another HyperLogLog structure
        #
        # @param other_buckets_hash hash with a HyperLogLog structure representation
        def merge_hash!(other_buckets_hash)
          buckets.merge!(other_buckets_hash) { |_key, old, new| new > old ? new : old }
        end

        # Serializes the instance's underlying HyperLogLog structure to JSON format, which can be stored in various persistence layers
        #
        # @return [String] HyperLogLog data structure serialized to JSON
        def to_json(_ = nil)
          buckets.to_json
        end

        private

        attr_accessor :buckets

        # the arbitrary values present in #estimate_cardinality
        # are sourced from the https://www.sisense.com/blog/hyperloglog-in-pure-sql/
        # article; they do not represent any entity and serve as tuning values
        # for the whole equation
        def estimate_cardinality
          num_zero_buckets = TOTAL_BUCKETS - buckets.size

          num_uniques = (
            ((TOTAL_BUCKETS**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS))) /
            (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
          ).to_i

          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
              Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
          else
            num_uniques
          end
        end
      end
    end
  end
end
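To make the constants concrete, here is the arithmetic the estimator performs for the two-bucket example from the @example tag above. This is a worked sketch, not part of the commit; variable names are illustrative:

    m     = 512                          # TOTAL_BUCKETS
    alpha = 0.7213 / (1 + 1.079 / m)     # ≈ 0.7198, the tuning constant

    buckets = { 141 => 1, 56 => 1 }
    zero    = m - buckets.size           # 510 empty buckets
    raw     = (alpha * m**2 / (zero + buckets.values.sum { |h| 2.0**-h })).to_i
    # raw ≈ 369; since zero > 0 and raw < 2.5 * m, the small-range branch applies
    estimate = alpha * m * Math.log2(m.to_f / zero)
    # => ≈ 2.08, within the documented ~5% error of the true count of 2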
...@@ -61,7 +61,10 @@ module Gitlab
      end

      def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
-       Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
+       Gitlab::Database::PostgresHll::BatchDistinctCounter
+         .new(relation, column)
+         .execute(batch_size: batch_size, start: start, finish: finish)
+         .estimated_distinct_count
      rescue ActiveRecord::StatementInvalid
        FALLBACK
      # catch all rescue should be removed as a part of feature flag rollout issue
...
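For callers, nothing changes in the happy path: the helper still returns a plain number. A usage sketch mirroring the specs (Issue and :author_id come from the test data; the second call shows the failure path):

    # Illustrative caller of the reworked helper.
    Gitlab::Utils::UsageData.estimate_batch_distinct_count(Issue, :author_id)
    # => estimated distinct author count

    # WRONG_CONFIGURATION_ERROR subclasses ActiveRecord::StatementInvalid, so a bad
    # start/finish/batch_size is swallowed by the same rescue and FALLBACK is returned.
    Gitlab::Utils::UsageData.estimate_batch_distinct_count(Issue, :author_id, start: 1, finish: 0)
    # => FALLBACK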
...@@ -24,107 +24,48 @@ RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
    allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(in_transaction)
  end

- context 'different distribution of relation records' do
-   [10, 100, 100_000].each do |spread|
-     context "records are spread within #{spread}" do
-       before do
-         ids = (1..spread).to_a.sample(10)
-         create_list(:issue, 10).each_with_index do |issue, i|
-           issue.id = ids[i]
-         end
-       end
-
-       it 'counts table' do
-         expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(10)
-       end
-     end
-   end
- end
-
  context 'unit test for different counting parameters' do
    before_all do
      create_list(:issue, 3, author: user)
      create_list(:issue, 2, author: another_user)
    end

-   describe '#estimate_distinct_count' do
-     it 'counts table' do
-       expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(5)
-     end
+   describe '#execute' do
+     it 'builds hll buckets' do
+       expect(described_class.new(model).execute).to be_an_instance_of(Gitlab::Database::PostgresHll::Buckets)
+     end

-     it 'counts with column field' do
-       expect(described_class.new(model, column).estimate_distinct_count).to be_within(error_rate).percent_of(2)
-     end
-
-     it 'counts with :id field' do
-       expect(described_class.new(model, :id).estimate_distinct_count).to be_within(error_rate).percent_of(5)
-     end
-
-     it 'counts with "id" field' do
-       expect(described_class.new(model, "id").estimate_distinct_count).to be_within(error_rate).percent_of(5)
-     end
-
-     it 'counts with table.column field' do
-       expect(described_class.new(model, "#{model.table_name}.#{column}").estimate_distinct_count).to be_within(error_rate).percent_of(2)
-     end
-
-     it 'counts with Arel column' do
-       expect(described_class.new(model, model.arel_table[column]).estimate_distinct_count).to be_within(error_rate).percent_of(2)
-     end
-
-     it 'counts over joined relations' do
-       expect(described_class.new(model.joins(:author), "users.email").estimate_distinct_count).to be_within(error_rate).percent_of(2)
-     end
-
-     it 'counts with :column field with batch_size of 50K' do
-       expect(described_class.new(model, column).estimate_distinct_count(batch_size: 50_000)).to be_within(error_rate).percent_of(2)
-     end
-
-     it 'will not count table with a batch size less than allowed' do
-       expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
-     end
-
-     it 'counts with different number of batches and aggregates total result' do
-       stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
-
-       [1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
-     end
-
-     it 'counts with a start and finish' do
-       expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
-     end
-
-     it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
+     it "defaults batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
        min_id = model.minimum(:id)
        batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)

        expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original

-       described_class.new(model).estimate_distinct_count
+       described_class.new(model).execute
      end

      context 'when a transaction is open' do
        let(:in_transaction) { true }

        it 'raises an error' do
-         expect { described_class.new(model, column).estimate_distinct_count }.to raise_error('BatchCount can not be run inside a transaction')
+         expect { described_class.new(model, column).execute }.to raise_error('BatchCount can not be run inside a transaction')
        end
      end

      context 'disallowed configurations' do
        let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }

-       it 'returns fallback if start is bigger than finish' do
-         expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
+       it 'raises WRONG_CONFIGURATION_ERROR if start is bigger than finish' do
+         expect { described_class.new(model, column).execute(start: 1, finish: 0) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end

-       it 'returns fallback if data volume exceeds upper limit' do
+       it 'raises WRONG_CONFIGURATION_ERROR if data volume exceeds upper limit' do
          large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1

-         expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
+         expect { described_class.new(model, column).execute(start: 1, finish: large_finish) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end

-       it 'returns fallback if batch size is less than min required' do
-         expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
+       it 'raises WRONG_CONFIGURATION_ERROR if batch size is less than min required' do
+         expect { described_class.new(model, column).execute(batch_size: small_batch_size) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
        end
      end
    end
...
# frozen_string_literal: true

require 'spec_helper'

RSpec.describe Gitlab::Database::PostgresHll::Buckets do
  let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data with a given error margin
  let(:buckets_hash_5) { { 121 => 2, 126 => 1, 141 => 1, 383 => 1, 56 => 1 } }
  let(:buckets_hash_2) { { 141 => 1, 56 => 1 } }

  describe '#estimated_distinct_count' do
    it 'provides estimated cardinality', :aggregate_failures do
      expect(described_class.new(buckets_hash_5).estimated_distinct_count).to be_within(error_rate).percent_of(5)
      expect(described_class.new(buckets_hash_2).estimated_distinct_count).to be_within(error_rate).percent_of(2)
      expect(described_class.new({}).estimated_distinct_count).to eq 0
      expect(described_class.new.estimated_distinct_count).to eq 0
    end
  end

  describe '#merge_hash!' do
    let(:hash_a) { { 1 => 1, 2 => 3 } }
    let(:hash_b) { { 1 => 2, 2 => 1 } }

    it 'merges two hashes into the union of the two sets' do
      expect(described_class.new(hash_a).merge_hash!(hash_b).to_json).to eq described_class.new(1 => 2, 2 => 3).to_json
    end
  end

  describe '#to_json' do
    it 'serializes HyperLogLog buckets to JSON' do
      expect(described_class.new(1 => 5).to_json).to eq '{"1":5}'
    end
  end
end
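The merge expectation above rests on the core HyperLogLog property that the register vector of a union equals the element-wise max of the two register vectors. A quick illustrative check with a hypothetical helper applying the same max-merge rule as Buckets#merge_hash!:

    # Element-wise max merge, the same rule Buckets#merge_hash! applies.
    def merge(a, b)
      a.merge(b) { |_bucket, x, y| [x, y].max }
    end

    merge({ 1 => 1, 2 => 3 }, { 1 => 2, 2 => 1 }) # => { 1 => 2, 2 => 3 }, as the spec expects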
...@@ -38,32 +38,123 @@ RSpec.describe Gitlab::Utils::UsageData do
  end

  describe '#estimate_batch_distinct_count' do
+   let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data with a given error margin
    let(:relation) { double(:relation) }

+   before do
+     allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
+   end
+
    it 'delegates counting to counter class instance' do
+     buckets = instance_double(Gitlab::Database::PostgresHll::Buckets)
+
      expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
-       expect(instance).to receive(:estimate_distinct_count)
+       expect(instance).to receive(:execute)
          .with(batch_size: nil, start: nil, finish: nil)
-         .and_return(5)
+         .and_return(buckets)
      end
+     expect(buckets).to receive(:estimated_distinct_count).and_return(5)

      expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
    end

+   context 'quasi integration test for different counting parameters' do
+     let_it_be(:user) { create(:user, email: 'email1@domain.com') }
+     let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }
+
+     let(:model) { Issue }
+     let(:column) { :author_id }
+
+     context 'different distribution of relation records' do
+       [10, 100, 100_000].each do |spread|
+         context "records are spread within #{spread}" do
+           before do
+             ids = (1..spread).to_a.sample(10)
+             create_list(:issue, 10).each_with_index do |issue, i|
+               issue.id = ids[i]
+             end
+           end
+
+           it 'counts table' do
+             expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
+           end
+         end
+       end
+     end
+
+     context 'different counting parameters' do
+       before_all do
+         create_list(:issue, 3, author: user)
+         create_list(:issue, 2, author: another_user)
+       end
+
+       it 'counts table' do
+         expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
+       end
+
+       it 'counts with column field' do
+         expect(described_class.estimate_batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
+       end
+
+       it 'counts with :id field' do
+         expect(described_class.estimate_batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
+       end
+
+       it 'counts with "id" field' do
+         expect(described_class.estimate_batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
+       end
+
+       it 'counts with table.column field' do
+         expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
+       end
+
+       it 'counts with Arel column' do
+         expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
+       end
+
+       it 'counts over joined relations' do
+         expect(described_class.estimate_batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
+       end
+
+       it 'counts with :column field with batch_size of 50K' do
+         expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
+       end
+
+       it 'counts with different number of batches and aggregates total result' do
+         stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
+
+         [1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
+       end
+
+       it 'counts with a start and finish' do
+         expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
+       end
+     end
+   end
+
+   describe 'error handling' do
+     before do
+       stub_const("Gitlab::Utils::UsageData::FALLBACK", 3)
+       stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 4)
+     end
+
+     it 'returns fallback if counter raises WRONG_CONFIGURATION_ERROR' do
+       expect(described_class.estimate_batch_distinct_count(relation, 'id', start: 1, finish: 0)).to eq 3
+     end
+
      it 'returns default fallback value when counting fails due to database error' do
-       stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
        allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))

-       expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
+       expect(described_class.estimate_batch_distinct_count(relation)).to eq(3)
      end

      it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
        error = StandardError.new('')
-       stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
        allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)

        expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
-       expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
+       expect(described_class.estimate_batch_distinct_count(relation)).to eq(4)
      end
+   end
  end
...