Commit 0f3ff95f authored by Adam Hegyi's avatar Adam Hegyi

Introducing DistinctCount helper class

Introducing DistinctCount helper class and allow UNION to skip removing
the ordering.
parent d729746f
---
name: loose_index_scan_for_distinct_values
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/55985
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/324210
milestone: '13.10'
type: development
group: group::optimize
default_enabled: false
......@@ -88,11 +88,16 @@ module Gitlab
batch_start = start
while batch_start < finish
begin
batch_end = [batch_start + batch_size, finish].min
batch_relation = build_relation_batch(batch_start, batch_end, mode)
begin
results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
op_args = @operation_args
if @operation == :count && @operation_args.blank? && use_loose_index_scan_for_distinct_values?(mode)
op_args = [Gitlab::Database::LooseIndexScanDistinctCount::COLUMN_ALIAS]
end
results = merge_results(results, batch_relation.send(@operation, *op_args)) # rubocop:disable GitlabSecurity/PublicSend
batch_start = batch_end
rescue ActiveRecord::QueryCanceled => error
# retry with a safe batch size & warmer cache
......@@ -102,6 +107,18 @@ module Gitlab
log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error)
return FALLBACK
end
rescue Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError => error
Gitlab::AppJsonLogger
.error(
event: 'batch_count',
relation: @relation.table_name,
operation: @operation,
operation_args: @operation_args,
mode: mode,
message: "LooseIndexScanDistinctCount column error: #{error.message}"
)
return FALLBACK
end
sleep(SLEEP_TIME_IN_SECONDS)
......@@ -123,8 +140,12 @@ module Gitlab
private
def build_relation_batch(start, finish, mode)
if use_loose_index_scan_for_distinct_values?(mode)
Gitlab::Database::LooseIndexScanDistinctCount.new(@relation, @column).build_query(from: start, to: finish)
else
@relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
end
end
def batch_size_for_mode_and_operation(mode, operation)
return DEFAULT_SUM_BATCH_SIZE if operation == :sum
......@@ -165,6 +186,14 @@ module Gitlab
message: "Query has been canceled with message: #{error.message}"
)
end
def use_loose_index_scan_for_distinct_values?(mode)
Feature.enabled?(:loose_index_scan_for_distinct_values) && not_group_by_query? && mode == :distinct
end
def not_group_by_query?
!@relation.is_a?(ActiveRecord::Relation) || @relation.group_values.blank?
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
# This class builds efficient batched distinct query by using loose index scan.
# Consider the following example:
# > Issue.distinct(:project_id).where(project_id: (1...100)).count
#
# Note: there is an index on project_id
#
# This query will read each element in the index matching the project_id filter.
# If for a project_id has 100_000 issues, all 100_000 elements will be read.
#
# A loose index scan will read only one entry from the index for each project_id to reduce the number of disk reads.
#
# Usage:
#
# Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).count(from: 1, to: 100)
#
# The query will return the number of distinct projects_ids between 1 and 100
#
# Getting the Arel query:
#
# Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).build_query(from: 1, to: 100)
class LooseIndexScanDistinctCount
COLUMN_ALIAS = 'distinct_count_column'
ColumnConfigurationError = Class.new(StandardError)
def initialize(scope, column)
if scope.is_a?(ActiveRecord::Relation)
@scope = scope
@model = scope.model
else
@scope = scope.where({})
@model = scope
end
@column = transform_column(column)
end
def count(from:, to:)
build_query(from: from, to: to).count(COLUMN_ALIAS)
end
def build_query(from:, to:) # rubocop:disable Metrics/AbcSize
cte = Gitlab::SQL::RecursiveCTE.new(:counter_cte, union_args: { remove_order: false })
table = model.arel_table
cte << @scope
.dup
.select(column.as(COLUMN_ALIAS))
.where(column.gteq(from))
.where(column.lt(to))
.order(column)
.limit(1)
inner_query = @scope
.dup
.where(column.gt(cte.table[COLUMN_ALIAS]))
.where(column.lt(to))
.select(column.as(COLUMN_ALIAS))
.order(column)
.limit(1)
cte << cte.table
.project(Arel::Nodes::Grouping.new(Arel.sql(inner_query.to_sql)).as(COLUMN_ALIAS))
.where(cte.table[COLUMN_ALIAS].lt(to))
model
.with
.recursive(cte.to_arel)
.from(cte.alias_to(table))
.unscope(where: :source_type)
.unscope(where: model.inheritance_column) # Remove STI query, not needed here
end
private
attr_reader :column, :model
# Transforms the column so it can be used in Arel expressions
#
# 'table.column' => 'table.column'
# 'column' => 'table_name.column'
# :column => 'table_name.column'
# Arel::Attributes::Attribute => name of the column
def transform_column(column)
if column.is_a?(String) || column.is_a?(Symbol)
column_as_string = column.to_s
column_as_string = "#{model.table_name}.#{column_as_string}" unless column_as_string.include?('.')
Arel.sql(column_as_string)
elsif column.is_a?(Arel::Attributes::Attribute)
column
else
raise ColumnConfigurationError.new("Cannot transform the column: #{column.inspect}, please provide the column name as string")
end
end
end
end
end
......@@ -23,9 +23,11 @@ module Gitlab
attr_reader :table
# name - The name of the CTE as a String or Symbol.
def initialize(name)
# union_args - The arguments supplied to Gitlab::SQL::Union class when building inner recursive query
def initialize(name, union_args: {})
@table = Arel::Table.new(name)
@queries = []
@union_args = union_args
end
# Adds a query to the body of the CTE.
......@@ -37,7 +39,7 @@ module Gitlab
# Returns the Arel relation for this CTE.
def to_arel
sql = Arel::Nodes::SqlLiteral.new(Union.new(@queries).to_sql)
sql = Arel::Nodes::SqlLiteral.new(Union.new(@queries, **@union_args).to_sql)
Arel::Nodes::As.new(table, Arel::Nodes::Grouping.new(sql))
end
......
......@@ -8,6 +8,9 @@ module Gitlab
# ORDER BYs are dropped from the relations as the final sort order is not
# guaranteed any way.
#
# remove_order: false option can be used in special cases where the
# ORDER BY is necessary for the query.
#
# Example usage:
#
# union = Gitlab::SQL::Union.new([user.personal_projects, user.projects])
......@@ -15,9 +18,10 @@ module Gitlab
#
# Project.where("id IN (#{sql})")
class SetOperator
def initialize(relations, remove_duplicates: true)
def initialize(relations, remove_duplicates: true, remove_order: true)
@relations = relations
@remove_duplicates = remove_duplicates
@remove_order = remove_order
end
def self.operator_keyword
......@@ -30,7 +34,9 @@ module Gitlab
# By using "unprepared_statements" we remove the usage of placeholders
# (thus fixing this problem), at a slight performance cost.
fragments = ActiveRecord::Base.connection.unprepared_statement do
relations.map { |rel| rel.reorder(nil).to_sql }.reject(&:blank?)
relations.map do |rel|
remove_order ? rel.reorder(nil).to_sql : rel.to_sql
end.reject(&:blank?)
end
if fragments.any?
......@@ -47,7 +53,7 @@ module Gitlab
private
attr_reader :relations, :remove_duplicates
attr_reader :relations, :remove_duplicates, :remove_order
end
end
end
......@@ -4,8 +4,8 @@ module Gitlab
module SQL
# Class for building SQL UNION statements.
#
# ORDER BYs are dropped from the relations as the final sort order is not
# guaranteed any way.
# By default ORDER BYs are dropped from the relations as the final sort
# order is not guaranteed any way.
#
# Example usage:
#
......
......@@ -270,6 +270,8 @@ RSpec.describe Gitlab::Database::BatchCount do
end
it "defaults the batch size to #{Gitlab::Database::BatchCounter::DEFAULT_DISTINCT_BATCH_SIZE}" do
stub_feature_flags(loose_index_scan_for_distinct_values: false)
min_id = model.minimum(:id)
relation = instance_double(ActiveRecord::Relation)
allow(model).to receive_message_chain(:select, public_send: relation)
......@@ -315,6 +317,7 @@ RSpec.describe Gitlab::Database::BatchCount do
end
end
context 'when the loose_index_scan_for_distinct_values feature flag is off' do
it_behaves_like 'when batch fetch query is canceled' do
let(:mode) { :distinct }
let(:operation) { :count }
......@@ -322,6 +325,77 @@ RSpec.describe Gitlab::Database::BatchCount do
let(:column) { nil }
subject { described_class.method(:batch_distinct_count) }
before do
stub_feature_flags(loose_index_scan_for_distinct_values: false)
end
end
end
context 'when the loose_index_scan_for_distinct_values feature flag is on' do
let(:mode) { :distinct }
let(:operation) { :count }
let(:operation_args) { nil }
let(:column) { nil }
let(:batch_size) { 10_000 }
subject { described_class.method(:batch_distinct_count) }
before do
stub_feature_flags(loose_index_scan_for_distinct_values: true)
end
it 'reduces batch size by half and retry fetch' do
too_big_batch_relation_mock = instance_double(ActiveRecord::Relation)
count_method = double(send: 1)
allow(too_big_batch_relation_mock).to receive(:send).and_raise(ActiveRecord::QueryCanceled)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: 0, to: batch_size).and_return(too_big_batch_relation_mock)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: 0, to: batch_size / 2).and_return(count_method)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: batch_size / 2, to: batch_size).and_return(count_method)
subject.call(model, column, batch_size: batch_size, start: 0, finish: batch_size - 1)
end
context 'when all retries fail' do
let(:batch_count_query) { 'SELECT COUNT(id) FROM relation WHERE id BETWEEN 0 and 1' }
before do
relation = instance_double(ActiveRecord::Relation)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).and_return(relation)
allow(relation).to receive(:send).and_raise(ActiveRecord::QueryCanceled.new('query timed out'))
allow(relation).to receive(:to_sql).and_return(batch_count_query)
end
it 'logs failing query' do
expect(Gitlab::AppJsonLogger).to receive(:error).with(
event: 'batch_count',
relation: model.table_name,
operation: operation,
operation_args: operation_args,
start: 0,
mode: mode,
query: batch_count_query,
message: 'Query has been canceled with message: query timed out'
)
expect(subject.call(model, column, batch_size: batch_size, start: 0)).to eq(-1)
end
end
context 'when LooseIndexScanDistinctCount raises error' do
let(:column) { :creator_id }
let(:error_class) { Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError }
it 'rescues ColumnConfigurationError' do
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive(:new).and_raise(error_class.new('error message'))
expect(Gitlab::AppJsonLogger).to receive(:error).with(a_hash_including(message: 'LooseIndexScanDistinctCount column error: error message'))
expect(subject.call(Project, column, batch_size: 10_000, start: 0)).to eq(-1)
end
end
end
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::LooseIndexScanDistinctCount do
context 'counting distinct users' do
let_it_be(:user) { create(:user) }
let_it_be(:other_user) { create(:user) }
let(:column) { :creator_id }
before_all do
create_list(:project, 3, creator: user)
create_list(:project, 1, creator: other_user)
end
subject(:count) { described_class.new(Project, :creator_id).count(from: Project.minimum(:creator_id), to: Project.maximum(:creator_id) + 1) }
it { is_expected.to eq(2) }
context 'when STI model is queried' do
it 'does not raise error' do
expect { described_class.new(Group, :owner_id).count(from: 0, to: 1) }.not_to raise_error
end
end
context 'when model with default_scope is queried' do
it 'does not raise error' do
expect { described_class.new(GroupMember, :id).count(from: 0, to: 1) }.not_to raise_error
end
end
context 'when the fully qualified column is given' do
let(:column) { 'projects.creator_id' }
it { is_expected.to eq(2) }
end
context 'when AR attribute is given' do
let(:column) { Project.arel_table[:creator_id] }
it { is_expected.to eq(2) }
end
context 'when invalid value is given for the column' do
let(:column) { Class.new }
it { expect { described_class.new(Group, column) }.to raise_error(Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError) }
end
context 'when null values are present' do
before do
create_list(:project, 2).each { |p| p.update_column(:creator_id, nil) }
end
it { is_expected.to eq(2) }
end
end
context 'counting STI models' do
let!(:groups) { create_list(:group, 3) }
let!(:namespaces) { create_list(:namespace, 2) }
let(:max_id) { Namespace.maximum(:id) + 1 }
it 'counts groups' do
count = described_class.new(Group, :id).count(from: 0, to: max_id)
expect(count).to eq(3)
end
end
end
......@@ -43,4 +43,33 @@ RSpec.shared_examples 'SQL set operator' do |operator_keyword|
expect(set_operator.to_sql).to eq('NULL')
end
end
describe 'remove_order parameter' do
let(:scopes) do
[
User.where(id: 1).order(id: :desc).limit(1),
User.where(id: 2).order(id: :asc).limit(1)
]
end
subject(:union_query) { described_class.new(scopes, remove_order: remove_order).to_sql }
context 'when remove_order: true' do
let(:remove_order) { true }
it 'removes the ORDER BY from the query' do
expect(union_query).not_to include('ORDER BY "users"."id" DESC')
expect(union_query).not_to include('ORDER BY "users"."id" ASC')
end
end
context 'when remove_order: false' do
let(:remove_order) { false }
it 'does not remove the ORDER BY from the query' do
expect(union_query).to include('ORDER BY "users"."id" DESC')
expect(union_query).to include('ORDER BY "users"."id" ASC')
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment