Commit 83f0798e authored by Stan Hu's avatar Stan Hu

Merge branch 'ab-approximate-counts' into 'master'

Approximate table counts based on TABLESAMPLE

See merge request gitlab-org/gitlab-ce!22650
parents eeb0e98d 9c059a02
---
title: Approximate counting strategy with TABLESAMPLE.
merge_request: 22650
author:
type: performance
# frozen_string_literal: true # frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC. # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. # We can optimize this by using various strategies for approximate counting.
#
# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
#
# However, since statistics are not always up to date, we also implement a table sampling strategy
# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
module Gitlab module Gitlab
module Database module Database
module Count module Count
...@@ -20,68 +25,30 @@ module Gitlab ...@@ -20,68 +25,30 @@ module Gitlab
end end
# Takes in an array of models and returns a Hash for the approximate # Takes in an array of models and returns a Hash for the approximate
# counts for them. If the model's table has not been vacuumed or # counts for them.
# analyzed recently, simply run the Model.count to get the data. #
# Various count strategies can be specified that are executed in
# sequence until all tables have an approximate count attached
# or we run out of strategies.
#
# Note that not all strategies are available on all supported RDBMS.
# #
# @param [Array] # @param [Array]
# @return [Hash] of Model -> count mapping # @return [Hash] of Model -> count mapping
def self.approximate_counts(models) def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
table_to_model_map = models.each_with_object({}) do |model, hash| strategies.each_with_object({}) do |strategy, counts_by_model|
hash[model.table_name] = model if strategy.enabled?
end models_with_missing_counts = models - counts_by_model.keys
table_names = table_to_model_map.keys
counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
# Convert table -> count to Model -> count break if models_with_missing_counts.empty?
counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
model = table_to_model_map[pair.first]
hash[model] = pair.second
end
missing_tables = table_names - counts_by_table_name.keys counts = strategy.new(models_with_missing_counts).count
missing_tables.each do |table| counts.each do |model, count|
model = table_to_model_map[table] counts_by_model[model] = count
counts_by_model[model] = model.count end
end
end end
counts_by_model
end
# Returns a hash of the table names that have recently updated tuples.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def self.reltuples_from_recently_updated(table_names)
query = postgresql_estimate_query(table_names)
rows = []
# Querying tuple stats only works on the primary. Due to load
# balancing, we need to ensure this query hits the load balancer. The
# easiest way to do this is to start a transaction.
ActiveRecord::Base.transaction do
rows = ActiveRecord::Base.connection.select_all(query)
end
rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
rescue *CONNECTION_ERRORS
{}
end
# Generates the PostgreSQL query to return the tuples for tables
# that have been vacuumed or analyzed in the last hour.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def self.postgresql_estimate_query(table_names)
time = "to_timestamp(#{1.hour.ago.to_i})"
<<~SQL
SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
SQL
end end
end end
end end
......
# frozen_string_literal: true
module Gitlab
module Database
module Count
# This strategy performs an exact count on the model.
#
# This is guaranteed to be accurate, however it also scans the
# whole table. Hence, there are no guarantees with respect
# to runtime.
#
# Note that for very large tables, this may even timeout.
class ExactCountStrategy
attr_reader :models
def initialize(models)
@models = models
end
def count
models.each_with_object({}) do |model, data|
data[model] = model.count
end
end
def self.enabled?
true
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
module Count
class PgClass < ActiveRecord::Base
self.table_name = 'pg_class'
end
# This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
#
# Specifically, it relies on the column reltuples in said table. An additional
# check is performed to make sure statistics were updated within the last hour.
#
# Otherwise, this strategy skips tables with outdated statistics.
#
# There are no guarantees with respect to the accuracy of this strategy. Runtime
# however is guaranteed to be "fast", because it only looks up statistics.
class ReltuplesCountStrategy
attr_reader :models
def initialize(models)
@models = models
end
# Returns a hash of the table names that have recently updated tuples.
#
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def count
size_estimates
rescue *CONNECTION_ERRORS
{}
end
def self.enabled?
Gitlab::Database.postgresql?
end
private
def table_names
models.map(&:table_name)
end
def size_estimates(check_statistics: true)
table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
# Querying tuple stats only works on the primary. Due to load balancing, the
# easiest way to do this is to start a transaction.
ActiveRecord::Base.transaction do
get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
model = table_to_model[row.table_name]
data[model] = row.estimate
end
end
end
# Generates the PostgreSQL query to return the tuples for tables
# that have been vacuumed or analyzed in the last hour.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def get_statistics(table_names, check_statistics: true)
time = 1.hour.ago
query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
.where(relname: table_names)
.select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
if check_statistics
query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
time, time, time, time)
end
query
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
module Count
# A tablesample count executes in two phases:
# * Estimate table sizes based on reltuples.
# * Based on the estimate:
# * If the table is considered 'small', execute an exact relation count.
# * Otherwise, count on a sample of the table using TABLESAMPLE.
#
# The size of the sample is chosen in a way that we always roughly scan
# the same amount of rows (see TABLESAMPLE_ROW_TARGET).
#
# There are no guarantees with respect to the accuracy of the result or runtime.
class TablesampleCountStrategy < ReltuplesCountStrategy
EXACT_COUNT_THRESHOLD = 10_000
TABLESAMPLE_ROW_TARGET = 10_000
def count
estimates = size_estimates(check_statistics: false)
models.each_with_object({}) do |model, count_by_model|
count = perform_count(model, estimates[model])
count_by_model[model] = count if count
end
rescue *CONNECTION_ERRORS
{}
end
def self.enabled?
Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
end
private
def perform_count(model, estimate)
# If we estimate 0, we may not have statistics at all. Don't use them.
return nil unless estimate && estimate > 0
if estimate < EXACT_COUNT_THRESHOLD
# The table is considered small, the assumption here is that
# the exact count will be fast anyways.
model.count
else
# The table is considered large, let's only count on a sample.
tablesample_count(model, estimate)
end
end
def tablesample_count(model, estimate)
portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
inverse = 1 / portion
query = <<~SQL
SELECT (COUNT(*)*#{inverse})::integer AS count
FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
SQL
rows = ActiveRecord::Base.connection.select_all(query)
Integer(rows.first['count'])
end
end
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::ExactCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
subject { described_class.new(models).count }
describe '#count' do
it 'counts all models' do
expect(models).to all(receive(:count).and_call_original)
expect(subject).to eq({ Project => 3, Identity => 1 })
end
end
describe '.enabled?' do
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is enabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_truthy
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::ReltuplesCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
subject { described_class.new(models).count }
describe '#count', :postgresql do
context 'when reltuples is up to date' do
before do
ActiveRecord::Base.connection.execute('ANALYZE projects')
ActiveRecord::Base.connection.execute('ANALYZE identities')
end
it 'uses statistics to do the count' do
models.each { |model| expect(model).not_to receive(:count) }
expect(subject).to eq({ Project => 3, Identity => 1 })
end
end
context 'insufficient permissions' do
it 'returns an empty hash' do
allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
expect(subject).to eq({})
end
end
end
describe '.enabled?' do
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is disabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_falsey
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::TablesampleCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
let(:strategy) { described_class.new(models) }
subject { strategy.count }
describe '#count', :postgresql do
let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
before do
allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
end
context 'for tables with an estimated small size' do
it 'performs an exact count' do
expect(Identity).to receive(:count).and_call_original
expect(subject).to include({ Identity => 1 })
end
end
context 'for tables with an estimated large size' do
it 'performs a tablesample count' do
expect(Project).not_to receive(:count)
result = subject
expect(result[Project]).to eq(3)
end
end
context 'insufficient permissions' do
it 'returns an empty hash' do
allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
expect(subject).to eq({})
end
end
end
describe '.enabled?' do
before do
stub_feature_flags(tablesample_counts: true)
end
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is disabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_falsey
end
end
end
...@@ -8,63 +8,51 @@ describe Gitlab::Database::Count do ...@@ -8,63 +8,51 @@ describe Gitlab::Database::Count do
let(:models) { [Project, Identity] } let(:models) { [Project, Identity] }
describe '.approximate_counts' do context '.approximate_counts' do
context 'with MySQL' do context 'selecting strategies' do
context 'when reltuples have not been updated' do let(:strategies) { [double('s1', enabled?: true), double('s2', enabled?: false)] }
it 'counts all models the normal way' do
expect(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(Project).to receive(:count).and_call_original it 'uses only enabled strategies' do
expect(Identity).to receive(:count).and_call_original expect(strategies[0]).to receive(:new).and_return(double('strategy1', count: {}))
expect(strategies[1]).not_to receive(:new)
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) described_class.approximate_counts(models, strategies: strategies)
end
end end
end end
context 'with PostgreSQL', :postgresql do context 'fallbacks' do
describe 'when reltuples have not been updated' do subject { described_class.approximate_counts(models, strategies: strategies) }
it 'counts all models the normal way' do
expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({})
expect(Project).to receive(:count).and_call_original let(:strategies) do
expect(Identity).to receive(:count).and_call_original [
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) double('s1', enabled?: true, new: first_strategy),
end double('s2', enabled?: true, new: second_strategy)
]
end end
describe 'no permission' do let(:first_strategy) { double('first strategy', count: {}) }
it 'falls back to standard query' do let(:second_strategy) { double('second strategy', count: {}) }
allow(described_class).to receive(:postgresql_estimate_query).and_raise(PG::InsufficientPrivilege)
expect(Project).to receive(:count).and_call_original it 'gets results from first strategy' do
expect(Identity).to receive(:count).and_call_original expect(strategies[0]).to receive(:new).with(models).and_return(first_strategy)
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) expect(first_strategy).to receive(:count)
end
subject
end end
describe 'when some reltuples have been updated' do it 'gets more results from second strategy if some counts are missing' do
it 'counts projects in the fast way' do expect(first_strategy).to receive(:count).and_return({ Project => 3 })
expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({ 'projects' => 3 }) expect(strategies[1]).to receive(:new).with([Identity]).and_return(second_strategy)
expect(second_strategy).to receive(:count).and_return({ Identity => 1 })
expect(Project).not_to receive(:count).and_call_original expect(subject).to eq({ Project => 3, Identity => 1 })
expect(Identity).to receive(:count).and_call_original
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
end end
describe 'when all reltuples have been updated' do it 'does not get more results as soon as all counts are present' do
before do expect(first_strategy).to receive(:count).and_return({ Project => 3, Identity => 1 })
ActiveRecord::Base.connection.execute('ANALYZE projects') expect(strategies[1]).not_to receive(:new)
ActiveRecord::Base.connection.execute('ANALYZE identities')
end
it 'counts models with the standard way' do
expect(Project).not_to receive(:count)
expect(Identity).not_to receive(:count)
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) subject
end
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment