Commit c111cb39 authored by Mikołaj Wawrzyniak's avatar Mikołaj Wawrzyniak Committed by Alper Akgun

Extract Buckets out of BatchDistinctCounter

To enable more flexible operations over HLL buckets, we need to extract
a dedicated class to model their behaviour.
parent fd79577a
......@@ -16,9 +16,9 @@ module Gitlab
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# .execute(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
......@@ -30,7 +30,6 @@ module Gitlab
# in most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
......@@ -38,8 +37,10 @@ module Gitlab
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
ZERO_OFFSET = 1
BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
......@@ -48,73 +49,58 @@ module Gitlab
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
# Stores the relation to analyse and the column whose distinct values are counted.
#
# @param relation object to count over; it must respond to `primary_key` when no column is given
# @param column attribute to count distinct values of; falls back to the relation's primary key
def initialize(relation, column = nil)
  @relation = relation
  @column = column
  # No (or falsy) column supplied: count over the primary key instead.
  @column ||= relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
# Executes the counter, which iterates over the database source and returns a Gitlab::Database::PostgresHll::Buckets instance
# that can be used to estimate the number of unique elements in the analysed set
#
# @param batch_size maximal number of rows that will be analysed by single database query
# @param start initial pkey range
# @param finish final pkey range
# @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate number of unique elements
def execute(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
batch_start = start
hll_blob = {}
hll_buckets = Buckets.new
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
hll_buckets
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
# Tells whether the requested batch configuration must be rejected.
#
# @param start lower bound of the analysed range
# @param finish upper bound of the analysed range
# @param batch_size number of rows per batch
# @return [Boolean] true when the batch size is not above MIN_REQUIRED_BATCH_SIZE,
#   the range spans MAX_DATA_VOLUME or more, the range is inverted, or a bound is negative
def unwanted_configuration?(start, finish, batch_size)
  return true if batch_size <= MIN_REQUIRED_BATCH_SIZE
  return true if (finish - start) >= MAX_DATA_VOLUME
  return true if start > finish

  start < 0 || finish < 0
end
def hll_blob_for_batch(start, finish)
def hll_buckets_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
......
# frozen_string_literal: true
module Gitlab
module Database
module PostgresHll
# The Buckets class represents a data structure built with the HyperLogLog algorithm
# that models data distribution in the analysed set. This representation can then be used
# for the following purposes:
# 1. Estimating the number of unique elements that this structure represents
# 2. Merging with another Buckets structure to later estimate the number of unique elements in the sum of
#    the two represented data sets
# 3. Serializing the Buckets structure to JSON format, which can be stored in various persistence layers
#
# @example Usage
#   ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
#   ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
#
# @example Merging (#merge_hash! returns the merged Hash, so keep a reference to the instance)
#   buckets = ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1)
#   buckets.merge_hash!(141 => 1, 56 => 5)
#   buckets.estimated_distinct_count
#
# @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value
#   for the supplied relation. Like all probabilistic algorithms it has an ERROR RATE margin that can affect values;
#   for this implementation no value higher than 5.3% was reported
#   (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
#   and in most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
class Buckets
  TOTAL_BUCKETS = 512

  # @param buckets [Hash] HyperLogLog structure representation (bucket number => bucket value)
  def initialize(buckets = {})
    # Work on a copy so that #merge_hash! never mutates the hash owned by the caller.
    @buckets = buckets.dup
  end

  # Based on the HyperLogLog structure, estimates the number of unique elements in the analysed set.
  # The result is memoized until the structure is changed by #merge_hash!.
  #
  # @return [Float, Integer] estimated number of unique elements (Float when the estimate is derived
  #   from the share of empty buckets, Integer otherwise)
  def estimated_distinct_count
    @estimated_distinct_count ||= estimate_cardinality
  end

  # Updates the underlying HyperLogLog structure by merging it with another HyperLogLog structure.
  # For bucket numbers present in both structures the greater value is kept.
  #
  # @param other_buckets_hash [Hash] hash with a HyperLogLog structure representation
  # @return [Hash] the merged underlying structure
  def merge_hash!(other_buckets_hash)
    # The structure is about to change, so a previously memoized estimate would be stale.
    @estimated_distinct_count = nil

    buckets.merge!(other_buckets_hash) { |_key, old, new| [old, new].max }
  end

  # Serializes the underlying HyperLogLog structure to JSON format, which can be stored in various persistence layers
  #
  # @return [String] HyperLogLog data structure serialized to JSON
  def to_json(_ = nil)
    buckets.to_json
  end

  private

  attr_reader :buckets

  # The numeric constants used in this method are sourced from the
  # https://www.sisense.com/blog/hyperloglog-in-pure-sql/ article;
  # they do not represent any entity and serve as tuning values for the whole equation.
  def estimate_cardinality
    tuning_factor = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS)
    num_zero_buckets = TOTAL_BUCKETS - buckets.size

    num_uniques = (
      ((TOTAL_BUCKETS**2) * tuning_factor) /
      (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
    ).to_i

    # With empty buckets left and a low raw estimate, the estimate is derived
    # from the share of empty buckets instead.
    if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
      tuning_factor * (TOTAL_BUCKETS * Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets))
    else
      num_uniques
    end
  end
end
end
end
end
......@@ -61,7 +61,10 @@ module Gitlab
end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
Gitlab::Database::PostgresHll::BatchDistinctCounter
.new(relation, column)
.execute(batch_size: batch_size, start: start, finish: finish)
.estimated_distinct_count
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
......
......@@ -24,107 +24,48 @@ RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(in_transaction)
end
context 'different distribution of relation records' do
[10, 100, 100_000].each do |spread|
context "records are spread within #{spread}" do
before do
ids = (1..spread).to_a.sample(10)
create_list(:issue, 10).each_with_index do |issue, i|
issue.id = ids[i]
end
end
it 'counts table' do
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(10)
end
end
end
end
context 'unit test for different counting parameters' do
before_all do
create_list(:issue, 3, author: user)
create_list(:issue, 2, author: another_user)
end
describe '#estimate_distinct_count' do
it 'counts table' do
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with column field' do
expect(described_class.new(model, column).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :id field' do
expect(described_class.new(model, :id).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with "id" field' do
expect(described_class.new(model, "id").estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with table.column field' do
expect(described_class.new(model, "#{model.table_name}.#{column}").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with Arel column' do
expect(described_class.new(model, model.arel_table[column]).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts over joined relations' do
expect(described_class.new(model.joins(:author), "users.email").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :column field with batch_size of 50K' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: 50_000)).to be_within(error_rate).percent_of(2)
end
it 'will not count table with a batch size less than allowed' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
end
it 'counts with a start and finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
describe '#execute' do
it 'builds hll buckets' do
expect(described_class.new(model).execute).to be_an_instance_of(Gitlab::Database::PostgresHll::Buckets)
end
it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
it "defaults batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
min_id = model.minimum(:id)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)
expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
described_class.new(model).estimate_distinct_count
described_class.new(model).execute
end
context 'when a transaction is open' do
let(:in_transaction) { true }
it 'raises an error' do
expect { described_class.new(model, column).estimate_distinct_count }.to raise_error('BatchCount can not be run inside a transaction')
expect { described_class.new(model, column).execute }.to raise_error('BatchCount can not be run inside a transaction')
end
end
context 'disallowed configurations' do
let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }
it 'returns fallback if start is bigger than finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
it 'raises WRONG_CONFIGURATION_ERROR if start is bigger than finish' do
expect { described_class.new(model, column).execute(start: 1, finish: 0) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
it 'returns fallback if data volume exceeds upper limit' do
it 'raises WRONG_CONFIGURATION_ERROR if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
expect { described_class.new(model, column).execute(start: 1, finish: large_finish) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
it 'returns fallback if batch size is less than min required' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
it 'raises WRONG_CONFIGURATION_ERROR if batch size is less than min required' do
expect { described_class.new(model, column).execute(batch_size: small_batch_size) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
end
end
......
# frozen_string_literal: true
require 'spec_helper'
# Unit specs for the HyperLogLog Buckets structure: estimation, merging and JSON serialization.
RSpec.describe Gitlab::Database::PostgresHll::Buckets do
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
# Fixture with five occupied buckets; the estimate is expected to be close to 5.
let(:buckets_hash_5) { { 121 => 2, 126 => 1, 141 => 1, 383 => 1, 56 => 1 } }
# Fixture with two occupied buckets; the estimate is expected to be close to 2.
let(:buckets_hash_2) { { 141 => 1, 56 => 1 } }
describe '#estimated_distinct_count' do
it 'provides estimated cardinality', :aggregate_failures do
expect(described_class.new(buckets_hash_5).estimated_distinct_count).to be_within(error_rate).percent_of(5)
expect(described_class.new(buckets_hash_2).estimated_distinct_count).to be_within(error_rate).percent_of(2)
# An empty structure (explicit or default) is expected to estimate zero elements.
expect(described_class.new({}).estimated_distinct_count).to eq 0
expect(described_class.new.estimated_distinct_count).to eq 0
end
end
describe '#merge_hash!' do
let(:hash_a) { { 1 => 1, 2 => 3 } }
let(:hash_b) { { 1 => 2, 2 => 1 } }
it 'merges two hashes together into union of two sets' do
# For keys present in both hashes the larger value is expected to be kept.
expect(described_class.new(hash_a).merge_hash!(hash_b).to_json).to eq described_class.new(1 => 2, 2 => 3).to_json
end
end
describe '#to_json' do
it 'serialize HyperLogLog buckets as hash' do
# Integer bucket numbers are expected to appear as string keys in the JSON output.
expect(described_class.new(1 => 5).to_json).to eq '{"1":5}'
end
end
end
......@@ -38,32 +38,123 @@ RSpec.describe Gitlab::Utils::UsageData do
end
describe '#estimate_batch_distinct_count' do
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let(:relation) { double(:relation) }
before do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
end
it 'delegates counting to counter class instance' do
buckets = instance_double(Gitlab::Database::PostgresHll::Buckets)
expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
expect(instance).to receive(:execute)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
.and_return(buckets)
end
expect(buckets).to receive(:estimated_distinct_count).and_return(5)
expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
end
context 'quasi integration test for different counting parameters' do
let_it_be(:user) { create(:user, email: 'email1@domain.com') }
let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }
let(:model) { Issue }
let(:column) { :author_id }
context 'different distribution of relation records' do
[10, 100, 100_000].each do |spread|
context "records are spread within #{spread}" do
before do
ids = (1..spread).to_a.sample(10)
create_list(:issue, 10).each_with_index do |issue, i|
issue.id = ids[i]
end
end
it 'counts table' do
expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
end
end
end
end
context 'different counting parameters' do
before_all do
create_list(:issue, 3, author: user)
create_list(:issue, 2, author: another_user)
end
it 'counts table' do
expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
end
it 'counts with column field' do
expect(described_class.estimate_batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
end
it 'counts with :id field' do
expect(described_class.estimate_batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
end
it 'counts with "id" field' do
expect(described_class.estimate_batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
end
it 'counts with table.column field' do
expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
end
it 'counts with Arel column' do
expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
end
it 'counts over joined relations' do
expect(described_class.estimate_batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
end
it 'counts with :column field with batch_size of 50K' do
expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
end
it 'counts with a start and finish' do
expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
end
end
end
describe 'error handling' do
before do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 3)
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 4)
end
it 'returns fallback if counter raises WRONG_CONFIGURATION_ERROR' do
expect(described_class.estimate_batch_distinct_count(relation, 'id', start: 1, finish: 0)).to eq 3
end
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(3)
end
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(4)
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment