Merge branch 'ab-approximate-counts' into 'master'

Approximate table counts based on TABLESAMPLE

See merge request gitlab-org/gitlab-ce!22650

Merge branch 'ab-approximate-counts' into 'master'
Approximate table counts based on TABLESAMPLE See merge request gitlab-org/gitlab-ce!22650
83f0798e · Stan Hu · eeb0e98d · 9c059a02 · 83f0798e · 83f0798e
Commit 83f0798e authored Dec 04, 2018 by Stan Hu
9 changed files
--- a/changelogs/unreleased/ab-approximate-counts.yml
+++ b/changelogs/unreleased/ab-approximate-counts.yml
+---
+title: Approximate counting strategy with TABLESAMPLE.
+merge_request: 22650
+author:
+type: performance
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
 # frozen_string_literal: true
 # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
 module Gitlab
  module Database
    module Count
@@ -20,68 +25,30 @@ module Gitlab
        end
      # Takes in an array of models and returns a Hash for the approximate
-      # counts for them.  If the model's table has not been vacuumed or
+      # counts for them.
-      # analyzed recently, simply run the Model.count to get the data.
+      #
+      # Various count strategies can be specified that are executed in
+      # sequence until all tables have an approximate count attached
+      # or we run out of strategies.
+      #
+      # Note that not all strategies are available on all supported RDBMS.
      #
      # @param [Array]
      # @return [Hash] of Model -> count mapping
-      def self.approximate_counts(models)
+      def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
-        table_to_model_map = models.each_with_object({}) do |model, hash|
+        strategies.each_with_object({}) do |strategy, counts_by_model|
-          hash[model.table_name] = model
+          if strategy.enabled?
-        end
+            models_with_missing_counts = models - counts_by_model.keys
-        table_names = table_to_model_map.keys
-        counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
-        # Convert table -> count to Model -> count
+            break if models_with_missing_counts.empty?
-        counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
-          model = table_to_model_map[pair.first]
-          hash[model] = pair.second
-        end
-        missing_tables = table_names - counts_by_table_name.keys
+            counts = strategy.new(models_with_missing_counts).count
-        missing_tables.each do |table|
+            counts.each do |model, count|
-          model = table_to_model_map[table]
+              counts_by_model[model] = count
-          counts_by_model[model] = model.count
+            end
+          end
        end
-        counts_by_model
-      end
-      # Returns a hash of the table names that have recently updated tuples.
-      #
-      # @param [Array] table names
-      # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
-      def self.reltuples_from_recently_updated(table_names)
-        query = postgresql_estimate_query(table_names)
-        rows = []
-        # Querying tuple stats only works on the primary. Due to load
-        # balancing, we need to ensure this query hits the load balancer.  The
-        # easiest way to do this is to start a transaction.
-        ActiveRecord::Base.transaction do
-          rows = ActiveRecord::Base.connection.select_all(query)
-        end
-        rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
-      rescue *CONNECTION_ERRORS
-        {}
-      end
-      # Generates the PostgreSQL query to return the tuples for tables
-      # that have been vacuumed or analyzed in the last hour.
-      #
-      # @param [Array] table names
-      # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
-      def self.postgresql_estimate_query(table_names)
-        time = "to_timestamp(#{1.hour.ago.to_i})"
-        <<~SQL
-          SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
-          LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
-          WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
-          AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
-        SQL
      end
    end
  end

--- a/lib/gitlab/database/count/exact_count_strategy.rb
+++ b/lib/gitlab/database/count/exact_count_strategy.rb
+# frozen_string_literal: true
+module Gitlab
+  module Database
+    module Count
+      # This strategy performs an exact count on the model.
+      #
+      # This is guaranteed to be accurate, however it also scans the
+      # whole table. Hence, there are no guarantees with respect
+      # to runtime.
+      #
+      # Note that for very large tables, this may even time out.
+      class ExactCountStrategy
+        attr_reader :models
+        def initialize(models)
+          @models = models
+        end
+        def count
+          models.each_with_object({}) do |model, data|
+            data[model] = model.count
+          end
+        end
+        def self.enabled?
+          true
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/database/count/reltuples_count_strategy.rb
+++ b/lib/gitlab/database/count/reltuples_count_strategy.rb
+# frozen_string_literal: true
+module Gitlab
+  module Database
+    module Count
+      class PgClass < ActiveRecord::Base
+        self.table_name = 'pg_class'
+      end
+      # This strategy counts based on PostgreSQL's statistics in pg_class.
+      #
+      # Specifically, it relies on the column reltuples in said table. An additional check
+      # against pg_stat_user_tables makes sure statistics were updated within the last hour.
+      #
+      # Otherwise, this strategy skips tables with outdated statistics.
+      #
+      # There are no guarantees with respect to the accuracy of this strategy. Runtime
+      # however is guaranteed to be "fast", because it only looks up statistics.
+      class ReltuplesCountStrategy
+        attr_reader :models
+        def initialize(models)
+          @models = models
+        end
+        # Returns estimated counts for the models whose tables have recently updated statistics.
+        #
+        # @return [Hash] Model to count mapping (e.g. { Project => 5, User => 100 }), empty on connection errors
+        def count
+          size_estimates
+        rescue *CONNECTION_ERRORS
+          {}
+        end
+        def self.enabled?
+          Gitlab::Database.postgresql?
+        end
+        private
+        def table_names
+          models.map(&:table_name)
+        end
+        def size_estimates(check_statistics: true)
+          table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
+          # Querying tuple stats only works on the primary. Due to load balancing, the
+          # easiest way to make sure the query runs there is to start a transaction.
+          ActiveRecord::Base.transaction do
+            get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
+              model = table_to_model[row.table_name]
+              data[model] = row.estimate
+            end
+          end
+        end
+        # Generates the PostgreSQL query to return the tuple estimates for the given tables. Unless
+        # check_statistics is false, only tables vacuumed or analyzed in the last hour are included.
+        #
+        # @param [Array] table names
+        # @return [ActiveRecord::Relation] rows exposing table_name and estimate (e.g. 'projects', 5)
+        def get_statistics(table_names, check_statistics: true)
+          time = 1.hour.ago
+          query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
+            .where(relname: table_names)
+            .select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
+          if check_statistics
+            query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
+                                time, time, time, time)
+          end
+          query
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/database/count/tablesample_count_strategy.rb
+++ b/lib/gitlab/database/count/tablesample_count_strategy.rb
+# frozen_string_literal: true
+module Gitlab
+  module Database
+    module Count
+      # A tablesample count executes in two phases:
+      # * Estimate table sizes based on reltuples.
+      # * Based on the estimate:
+      #   * If the table is considered 'small', execute an exact relation count.
+      #   * Otherwise, count on a sample of the table using TABLESAMPLE.
+      #
+      # The size of the sample is chosen in a way that we always roughly scan
+      # the same number of rows (see TABLESAMPLE_ROW_TARGET).
+      #
+      # There are no guarantees with respect to the accuracy of the result or runtime.
+      class TablesampleCountStrategy < ReltuplesCountStrategy
+        EXACT_COUNT_THRESHOLD = 10_000
+        TABLESAMPLE_ROW_TARGET = 10_000
+        def count
+          estimates = size_estimates(check_statistics: false)
+          models.each_with_object({}) do |model, count_by_model|
+            count = perform_count(model, estimates[model])
+            count_by_model[model] = count if count
+          end
+        rescue *CONNECTION_ERRORS
+          {}
+        end
+        def self.enabled?
+          Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
+        end
+        private
+        def perform_count(model, estimate)
+          # If we estimate 0, we may not have statistics at all. Don't use them.
+          return nil unless estimate && estimate > 0
+          if estimate < EXACT_COUNT_THRESHOLD
+            # The table is considered small, the assumption here is that
+            # the exact count will be fast anyways.
+            model.count
+          else
+            # The table is considered large, let's only count on a sample.
+            tablesample_count(model, estimate)
+          end
+        end
+        def tablesample_count(model, estimate)
+          portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
+          inverse = 1 / portion
+          query = <<~SQL
+            SELECT (COUNT(*)*#{inverse})::integer AS count
+            FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
+          SQL
+          rows = ActiveRecord::Base.connection.select_all(query)
+          Integer(rows.first['count'])
+        end
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb
+++ b/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb
+require 'spec_helper'
+describe Gitlab::Database::Count::ExactCountStrategy do
+  before do
+    create_list(:project, 3)
+    create(:identity)
+  end
+  let(:models) { [Project, Identity] }
+  subject { described_class.new(models).count }
+  describe '#count' do
+    it 'counts all models' do
+      expect(models).to all(receive(:count).and_call_original)
+      expect(subject).to eq({ Project => 3, Identity => 1 })
+    end
+  end
+  describe '.enabled?' do
+    it 'is enabled for PostgreSQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+      expect(described_class.enabled?).to be_truthy
+    end
+    it 'is enabled for MySQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+      expect(described_class.enabled?).to be_truthy
+    end
+  end
+end
--- a/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb
+++ b/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb
+require 'spec_helper'
+describe Gitlab::Database::Count::ReltuplesCountStrategy do
+  before do
+    create_list(:project, 3)
+    create(:identity)
+  end
+  let(:models) { [Project, Identity] }
+  subject { described_class.new(models).count }
+  describe '#count', :postgresql do
+    context 'when reltuples is up to date' do
+      before do
+        ActiveRecord::Base.connection.execute('ANALYZE projects')
+        ActiveRecord::Base.connection.execute('ANALYZE identities')
+      end
+      it 'uses statistics to do the count' do
+        models.each { |model| expect(model).not_to receive(:count) }
+        expect(subject).to eq({ Project => 3, Identity => 1 })
+      end
+    end
+    context 'insufficient permissions' do
+      it 'returns an empty hash' do
+        allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
+        expect(subject).to eq({})
+      end
+    end
+  end
+  describe '.enabled?' do
+    it 'is enabled for PostgreSQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+      expect(described_class.enabled?).to be_truthy
+    end
+    it 'is disabled for MySQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+      expect(described_class.enabled?).to be_falsey
+    end
+  end
+end
--- a/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb
+++ b/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb
+require 'spec_helper'
+describe Gitlab::Database::Count::TablesampleCountStrategy do
+  before do
+    create_list(:project, 3)
+    create(:identity)
+  end
+  let(:models) { [Project, Identity] }
+  let(:strategy) { described_class.new(models) }
+  subject { strategy.count }
+  describe '#count', :postgresql do
+    let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
+    let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
+    before do
+      allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
+    end
+    context 'for tables with an estimated small size' do
+      it 'performs an exact count' do
+        expect(Identity).to receive(:count).and_call_original
+        expect(subject).to include({ Identity => 1 })
+      end
+    end
+    context 'for tables with an estimated large size' do
+      it 'performs a tablesample count' do
+        expect(Project).not_to receive(:count)
+        result = subject
+        expect(result[Project]).to eq(3)
+      end
+    end
+    context 'insufficient permissions' do
+      it 'returns an empty hash' do
+        allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
+        expect(subject).to eq({})
+      end
+    end
+  end
+  describe '.enabled?' do
+    before do
+      stub_feature_flags(tablesample_counts: true)
+    end
+    it 'is enabled for PostgreSQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+      expect(described_class.enabled?).to be_truthy
+    end
+    it 'is disabled for MySQL' do
+      allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+      expect(described_class.enabled?).to be_falsey
+    end
+  end
+end
--- a/spec/lib/gitlab/database/count_spec.rb
+++ b/spec/lib/gitlab/database/count_spec.rb
@@ -8,63 +8,51 @@ describe Gitlab::Database::Count do
  let(:models) { [Project, Identity] }
-  describe '.approximate_counts' do
+  context '.approximate_counts' do
-    context 'with MySQL' do
+    context 'selecting strategies' do
-      context 'when reltuples have not been updated' do
+      let(:strategies) { [double('s1', enabled?: true), double('s2', enabled?: false)] }
-        it 'counts all models the normal way' do
-          expect(Gitlab::Database).to receive(:postgresql?).and_return(false)
-          expect(Project).to receive(:count).and_call_original
+      it 'uses only enabled strategies' do
-          expect(Identity).to receive(:count).and_call_original
+        expect(strategies[0]).to receive(:new).and_return(double('strategy1', count: {}))
+        expect(strategies[1]).not_to receive(:new)
-          expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
+        described_class.approximate_counts(models, strategies: strategies)
-        end
      end
    end
-    context 'with PostgreSQL', :postgresql do
+    context 'fallbacks' do
-      describe 'when reltuples have not been updated' do
+      subject { described_class.approximate_counts(models, strategies: strategies) }
-        it 'counts all models the normal way' do
-          expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({})
-          expect(Project).to receive(:count).and_call_original
+      let(:strategies) do
-          expect(Identity).to receive(:count).and_call_original
+        [
-          expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
+          double('s1', enabled?: true, new: first_strategy),
-        end
+          double('s2', enabled?: true, new: second_strategy)
+        ]
      end
-      describe 'no permission' do
+      let(:first_strategy) { double('first strategy', count: {}) }
-        it 'falls back to standard query' do
+      let(:second_strategy) { double('second strategy', count: {}) }
-          allow(described_class).to receive(:postgresql_estimate_query).and_raise(PG::InsufficientPrivilege)
-          expect(Project).to receive(:count).and_call_original
+      it 'gets results from first strategy' do
-          expect(Identity).to receive(:count).and_call_original
+        expect(strategies[0]).to receive(:new).with(models).and_return(first_strategy)
-          expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
+        expect(first_strategy).to receive(:count)
-        end
+        subject
      end
-      describe 'when some reltuples have been updated' do
+      it 'gets more results from second strategy if some counts are missing' do
-        it 'counts projects in the fast way' do
+        expect(first_strategy).to receive(:count).and_return({ Project => 3 })
-          expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({ 'projects' => 3 })
+        expect(strategies[1]).to receive(:new).with([Identity]).and_return(second_strategy)
+        expect(second_strategy).to receive(:count).and_return({ Identity => 1 })
-          expect(Project).not_to receive(:count).and_call_original
+        expect(subject).to eq({ Project => 3, Identity => 1 })
-          expect(Identity).to receive(:count).and_call_original
-          expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
-        end
      end
-      describe 'when all reltuples have been updated' do
+      it 'does not get more results as soon as all counts are present' do
-        before do
+        expect(first_strategy).to receive(:count).and_return({ Project => 3, Identity => 1 })
-          ActiveRecord::Base.connection.execute('ANALYZE projects')
+        expect(strategies[1]).not_to receive(:new)
-          ActiveRecord::Base.connection.execute('ANALYZE identities')
-        end
-        it 'counts models with the standard way' do
-          expect(Project).not_to receive(:count)
-          expect(Identity).not_to receive(:count)
-          expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
+        subject
-        end
      end
    end
  end