Commit 281fda79 authored by Alper Akgun's avatar Alper Akgun

Merge branch 'mwaw/extract_postgres_hll_bucktes_class' into 'master'

Extract Buckets out of BatchDistinctCounter

See merge request gitlab-org/gitlab!49344
parents 065c88d7 c111cb39
......@@ -16,9 +16,9 @@ module Gitlab
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# .execute(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
......@@ -30,7 +30,6 @@ module Gitlab
# in most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
......@@ -38,8 +37,10 @@ module Gitlab
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
ZERO_OFFSET = 1
BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
......@@ -48,73 +49,58 @@ module Gitlab
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
# Executes the counter, which iterates over the database source and returns a Gitlab::Database::PostgresHll::Buckets
# instance that can be used to estimate the number of unique elements in the analysed set
#
# @param batch_size maximal number of rows that will be analysed by single database query
# @param start initial pkey range
# @param finish final pkey range
# @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate number of unique elements
def execute(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
batch_start = start
hll_blob = {}
hll_buckets = Buckets.new
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
hll_buckets
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
def unwanted_configuration?(start, finish, batch_size)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish || start < 0 || finish < 0
end
def hll_blob_for_batch(start, finish)
def hll_buckets_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
......
# frozen_string_literal: true
module Gitlab
  module Database
    module PostgresHll
      # Buckets represents the data structure built with the HyperLogLog algorithm
      # that models data distribution in the analysed set. This representation can then be used
      # for the following purposes
      #  1. Estimating the number of unique elements that this structure represents
      #  2. Merging with another Buckets structure to later estimate the number of unique elements
      #     in the sum of the two represented data sets
      #  3. Serializing the Buckets structure to JSON format, that can be stored in various persistence layers
      #
      # @example Usage
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
      #
      # @example Merging (merge_hash! returns the merged Hash, so it is not chained with the estimate)
      #  buckets = ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1)
      #  buckets.merge_hash!(141 => 1, 56 => 5)
      #  buckets.estimated_distinct_count
      #
      # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value
      #  for the supplied relation. Like all probabilistic algorithms it has an ERROR RATE margin that can affect values;
      #  for this implementation no higher value was reported
      #  (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%,
      #  in most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
      class Buckets
        TOTAL_BUCKETS = 512

        # @param buckets [Hash] HyperLogLog structure representation, bucket number => bucket hash value
        def initialize(buckets = {})
          # Work on a copy so that later merges never mutate the hash owned by the caller.
          @buckets = buckets.dup
        end

        # Based on the HyperLogLog structure estimates the number of unique elements in the analysed set.
        # The value is memoized until the structure is changed by #merge_hash!.
        #
        # @return [Numeric] Estimated number of unique elements (Float or Integer, depending on
        #   which branch of the estimation is used)
        def estimated_distinct_count
          @estimated_distinct_count ||= estimate_cardinality
        end

        # Updates the underlying HyperLogLog structure by merging it with another HyperLogLog structure;
        # for buckets present on both sides the greater value is kept.
        #
        # @param other_buckets_hash [Hash] hash with HyperLogLog structure representation
        # @return [Hash] the merged underlying buckets hash
        def merge_hash!(other_buckets_hash)
          # The structure is about to change, so a previously memoized estimate would be stale.
          @estimated_distinct_count = nil

          buckets.merge!(other_buckets_hash) { |_key, old, new| new > old ? new : old }
        end

        # Serializes the underlying HyperLogLog structure to JSON format, that can be stored in various persistence layers
        #
        # @return [String] HyperLogLog data structure serialized to JSON
        def to_json(_ = nil)
          buckets.to_json
        end

        private

        attr_reader :buckets

        # arbitrary values that are present in #estimate_cardinality
        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
        # article, they do not represent any entity and serve as tuning values
        # for the whole equation
        def estimate_cardinality
          # Tuning factor shared by both branches of the estimation below.
          alpha = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS)
          num_zero_buckets = TOTAL_BUCKETS - buckets.size

          num_uniques = (
            ((TOTAL_BUCKETS**2) * alpha) /
            (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
          ).to_i

          # When some buckets are empty and the raw estimate is low, the estimate is
          # derived from the share of empty buckets instead.
          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
            alpha * (TOTAL_BUCKETS * Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets))
          else
            num_uniques
          end
        end
      end
    end
  end
end
......@@ -61,7 +61,10 @@ module Gitlab
end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
Gitlab::Database::PostgresHll::BatchDistinctCounter
.new(relation, column)
.execute(batch_size: batch_size, start: start, finish: finish)
.estimated_distinct_count
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
......
......@@ -24,107 +24,48 @@ RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(in_transaction)
end
context 'different distribution of relation records' do
[10, 100, 100_000].each do |spread|
context "records are spread within #{spread}" do
before do
ids = (1..spread).to_a.sample(10)
create_list(:issue, 10).each_with_index do |issue, i|
issue.id = ids[i]
end
end
it 'counts table' do
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(10)
end
end
end
end
context 'unit test for different counting parameters' do
before_all do
create_list(:issue, 3, author: user)
create_list(:issue, 2, author: another_user)
end
describe '#estimate_distinct_count' do
it 'counts table' do
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with column field' do
expect(described_class.new(model, column).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :id field' do
expect(described_class.new(model, :id).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with "id" field' do
expect(described_class.new(model, "id").estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with table.column field' do
expect(described_class.new(model, "#{model.table_name}.#{column}").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with Arel column' do
expect(described_class.new(model, model.arel_table[column]).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts over joined relations' do
expect(described_class.new(model.joins(:author), "users.email").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :column field with batch_size of 50K' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: 50_000)).to be_within(error_rate).percent_of(2)
end
it 'will not count table with a batch size less than allowed' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
end
it 'counts with a start and finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
describe '#execute' do
it 'builds hll buckets' do
expect(described_class.new(model).execute).to be_an_instance_of(Gitlab::Database::PostgresHll::Buckets)
end
it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
it "defaults batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
min_id = model.minimum(:id)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)
expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
described_class.new(model).estimate_distinct_count
described_class.new(model).execute
end
context 'when a transaction is open' do
let(:in_transaction) { true }
it 'raises an error' do
expect { described_class.new(model, column).estimate_distinct_count }.to raise_error('BatchCount can not be run inside a transaction')
expect { described_class.new(model, column).execute }.to raise_error('BatchCount can not be run inside a transaction')
end
end
context 'disallowed configurations' do
let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }
it 'returns fallback if start is bigger than finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
it 'raises WRONG_CONFIGURATION_ERROR if start is bigger than finish' do
expect { described_class.new(model, column).execute(start: 1, finish: 0) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
it 'returns fallback if data volume exceeds upper limit' do
it 'raises WRONG_CONFIGURATION_ERROR if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
expect { described_class.new(model, column).execute(start: 1, finish: large_finish) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
it 'returns fallback if batch size is less than min required' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
it 'raises WRONG_CONFIGURATION_ERROR if batch size is less than min required' do
expect { described_class.new(model, column).execute(batch_size: small_batch_size) }.to raise_error(described_class::WRONG_CONFIGURATION_ERROR)
end
end
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHll::Buckets do
  # HyperLogLog is a probabilistic algorithm which provides estimated data,
  # so estimates are only compared within the given error margin
  let(:tolerance) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }

  # Bucket structures representing sets of 5 and 2 unique elements
  let(:five_uniques) { { 121 => 2, 126 => 1, 141 => 1, 383 => 1, 56 => 1 } }
  let(:two_uniques) { { 141 => 1, 56 => 1 } }

  describe '#estimated_distinct_count' do
    it 'provides estimated cardinality', :aggregate_failures do
      estimate_of_five = described_class.new(five_uniques).estimated_distinct_count
      expect(estimate_of_five).to be_within(tolerance).percent_of(5)

      estimate_of_two = described_class.new(two_uniques).estimated_distinct_count
      expect(estimate_of_two).to be_within(tolerance).percent_of(2)

      # Both an explicit empty hash and the default argument describe an empty set
      expect(described_class.new({}).estimated_distinct_count).to eq(0)
      expect(described_class.new.estimated_distinct_count).to eq(0)
    end
  end

  describe '#merge_hash!' do
    let(:hash_a) { { 1 => 1, 2 => 3 } }
    let(:hash_b) { { 1 => 2, 2 => 1 } }

    it 'merges two hashes together into union of two sets' do
      merged = described_class.new(hash_a).merge_hash!(hash_b)
      expected_union = described_class.new(1 => 2, 2 => 3)

      expect(merged.to_json).to eq(expected_union.to_json)
    end
  end

  describe '#to_json' do
    it 'serialize HyperLogLog buckets as hash' do
      serialized = described_class.new(1 => 5).to_json

      expect(serialized).to eq('{"1":5}')
    end
  end
end
......@@ -38,32 +38,123 @@ RSpec.describe Gitlab::Utils::UsageData do
end
describe '#estimate_batch_distinct_count' do
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let(:relation) { double(:relation) }
before do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
end
it 'delegates counting to counter class instance' do
buckets = instance_double(Gitlab::Database::PostgresHll::Buckets)
expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
expect(instance).to receive(:execute)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
.and_return(buckets)
end
expect(buckets).to receive(:estimated_distinct_count).and_return(5)
expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
end
context 'quasi integration test for different counting parameters' do
let_it_be(:user) { create(:user, email: 'email1@domain.com') }
let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }
let(:model) { Issue }
let(:column) { :author_id }
context 'different distribution of relation records' do
[10, 100, 100_000].each do |spread|
context "records are spread within #{spread}" do
before do
ids = (1..spread).to_a.sample(10)
create_list(:issue, 10).each_with_index do |issue, i|
issue.id = ids[i]
end
end
it 'counts table' do
expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
end
end
end
end
context 'different counting parameters' do
before_all do
create_list(:issue, 3, author: user)
create_list(:issue, 2, author: another_user)
end
it 'counts table' do
expect(described_class.estimate_batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
end
it 'counts with column field' do
expect(described_class.estimate_batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
end
it 'counts with :id field' do
expect(described_class.estimate_batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
end
it 'counts with "id" field' do
expect(described_class.estimate_batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
end
it 'counts with table.column field' do
expect(described_class.estimate_batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
end
it 'counts with Arel column' do
expect(described_class.estimate_batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
end
it 'counts over joined relations' do
expect(described_class.estimate_batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
end
it 'counts with :column field with batch_size of 50K' do
expect(described_class.estimate_batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.estimate_batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
end
it 'counts with a start and finish' do
expect(described_class.estimate_batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
end
end
end
describe 'error handling' do
before do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 3)
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 4)
end
it 'returns fallback if counter raises WRONG_CONFIGURATION_ERROR' do
expect(described_class.estimate_batch_distinct_count(relation, 'id', start: 1, finish: 0)).to eq 3
end
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(3)
end
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(4)
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment