Commit f048036e authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch 'mwaw/use_postgres_hll_batch_counting_in_usage_ping' into 'master'

Apply Postgres HLL distributed batch counter to Usage Ping secure pipelines metrics

See merge request gitlab-org/gitlab!48233
parents 91fd7b72 7ead5a08
---
name: postgres_hll_batch_counting
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/48233
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/285485
milestone: '13.7'
type: development
group: group::product analytics
default_enabled: false
...@@ -396,9 +396,36 @@ module EE ...@@ -396,9 +396,36 @@ module EE
def count_secure_pipelines(time_period) def count_secure_pipelines(time_period)
return {} if time_period.blank? return {} if time_period.blank?
pipelines_with_secure_jobs = {}
# HLL batch counting always iterate over pkey of
# given relation, while ordinary batch count
# iterated over counted attribute, one-to-many joins
# can break batch size limitation, and lead to
# time outing batch queries, to avoid that
# different join strategy is used for HLL counter
if ::Feature.enabled?(:postgres_hll_batch_counting)
relation = ::Security::Scan.where(time_period).group(:created_at)
start = relation.select('MIN(id) as min_id').order('min_id ASC').first&.min_id
finish = relation.select('MAX(id) as max_id').order('max_id DESC').first&.max_id
::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Security::Scan.joins(:build)
.where(ci_builds: { status: 'success', retried: [nil, false] })
.where('security_scans.scan_type = ?', scan_type)
.where(security_scans: time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] =
if start && finish
estimate_batch_distinct_count(relation, :commit_id, batch_size: 1000, start: start, finish: finish)
else
0
end
end
else
start = ::Ci::Pipeline.minimum(:id) start = ::Ci::Pipeline.minimum(:id)
finish = ::Ci::Pipeline.maximum(:id) finish = ::Ci::Pipeline.maximum(:id)
pipelines_with_secure_jobs = {}
::Security::Scan.scan_types.each do |name, scan_type| ::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Ci::Build.joins(:security_scans) relation = ::Ci::Build.joins(:security_scans)
...@@ -407,6 +434,7 @@ module EE ...@@ -407,6 +434,7 @@ module EE
.where(time_period) .where(time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = distinct_count(relation, :commit_id, start: start, finish: finish, batch: false) pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = distinct_count(relation, :commit_id, start: start, finish: finish, batch: false)
end end
end
pipelines_with_secure_jobs pipelines_with_secure_jobs
end end
......
This diff is collapsed.
...@@ -28,13 +28,14 @@ module Gitlab ...@@ -28,13 +28,14 @@ module Gitlab
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3% # for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used. # for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class PostgresHllBatchDistinctCounter class PostgresHllBatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1 FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 1_250 MIN_REQUIRED_BATCH_SIZE = 750
MAX_ALLOWED_LOOPS = 10_000
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705 # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 100_000 DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'" BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'" BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
...@@ -49,7 +50,7 @@ module Gitlab ...@@ -49,7 +50,7 @@ module Gitlab
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num, SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes FROM hashed_attributes
GROUP BY 1 ORDER BY 1 GROUP BY 1
SQL SQL
TOTAL_BUCKETS_NUMBER = 512 TOTAL_BUCKETS_NUMBER = 512
...@@ -61,7 +62,7 @@ module Gitlab ...@@ -61,7 +62,7 @@ module Gitlab
def unwanted_configuration?(finish, batch_size, start) def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE || batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) / batch_size >= MAX_ALLOWED_LOOPS || (finish - start) >= MAX_DATA_VOLUME ||
start > finish start > finish
end end
...@@ -101,7 +102,7 @@ module Gitlab ...@@ -101,7 +102,7 @@ module Gitlab
num_uniques = ( num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) / ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} ) (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i ).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
......
...@@ -25,6 +25,13 @@ module Gitlab ...@@ -25,6 +25,13 @@ module Gitlab
relation.select(relation.all.table[column].sum).to_sql relation.select(relation.all.table[column].sum).to_sql
end end
# For estimated distinct count use exact query instead of hll
# buckets query, because it can't be used to obtain estimations without
# supplementary ruby code present in Gitlab::Database::PostgresHllBatchDistinctCounter
def estimate_batch_distinct_count(relation, column = nil, *rest)
raw_sql(relation, column, :distinct)
end
private private
def raw_sql(relation, column, distinct = nil) def raw_sql(relation, column, distinct = nil)
......
...@@ -38,6 +38,7 @@ module Gitlab ...@@ -38,6 +38,7 @@ module Gitlab
extend self extend self
FALLBACK = -1 FALLBACK = -1
DISTRIBUTED_HLL_FALLBACK = -2
def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil) def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil)
if batch if batch
...@@ -59,6 +60,17 @@ module Gitlab ...@@ -59,6 +60,17 @@ module Gitlab
FALLBACK FALLBACK
end end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
# https://gitlab.com/gitlab-org/gitlab/-/issues/285485
rescue StandardError => error
Gitlab::ErrorTracking.track_and_raise_for_dev_exception(error)
DISTRIBUTED_HLL_FALLBACK
end
def sum(relation, column, batch_size: nil, start: nil, finish: nil) def sum(relation, column, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish) Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid rescue ActiveRecord::StatementInvalid
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
let_it_be(:error_rate) { 4.9 } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK } let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(::Gitlab::Database::BatchCounter::MIN_REQUIRED_BATCH_SIZE) } let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
let(:model) { Issue } let(:model) { Issue }
let(:column) { :author_id } let(:column) { :author_id }
...@@ -118,8 +118,8 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do ...@@ -118,8 +118,8 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback) expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end end
it 'returns fallback if loops more than allowed' do it 'returns fallback if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1 large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback) expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end end
......
...@@ -37,6 +37,36 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -37,6 +37,36 @@ RSpec.describe Gitlab::Utils::UsageData do
end end
end end
describe '#estimate_batch_distinct_count' do
let(:relation) { double(:relation) }
it 'delegates counting to counter class instance' do
expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
end
expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
end
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
end
describe '#sum' do describe '#sum' do
let(:relation) { double(:relation) } let(:relation) { double(:relation) }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment