Commit 964d9f43 authored by Stan Hu's avatar Stan Hu

Merge branch 'background-migrations-system-load' into 'master'

Respond to DB health in background migrations

See merge request gitlab-org/gitlab-ce!20720
parents b4415c01 1e5192cc
# frozen_string_literal: true

module Postgresql
  class ReplicationSlot < ActiveRecord::Base
    self.table_name = 'pg_replication_slots'

    # Returns true if the lag observed across all replication slots exceeds a
    # given threshold.
    #
    # max - The maximum replication lag size, in bytes. Based on GitLab.com
    #       statistics it takes between 1 and 5 seconds to replicate around
    #       100 MB of data.
    def self.lag_too_great?(max = 100.megabytes)
      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"

      # We force the use of a transaction here so the query always goes to the
      # primary, even when using the EE DB load balancer.
      sizes = transaction { pluck(lag_function) }
      too_great = sizes.count { |size| size >= max }

      # If too many replicas are falling behind too much, the availability of a
      # GitLab instance might suffer. To prevent this from happening we require
      # at least 1 replica to have data recent enough.
      if sizes.any? && too_great.positive?
        (sizes.length - too_great) <= 1
      else
        false
      end
    end
  end
end
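As a worked example with made-up sizes: suppose three replication slots report lag of 125 MB, 0 and 0 against the default 100 MB threshold.

sizes     = [125.megabytes, 0, 0]
too_great = sizes.count { |size| size >= 100.megabytes } # => 1

# Two replicas are still within the threshold, so more than one replica has
# recent enough data and the check returns false:
(sizes.length - too_great) <= 1 # => false

# With [125.megabytes, 110.megabytes, 0] only one replica is up to date, so
# lag_too_great? would return true and background migrations back off.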
......@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
  # The minimum amount of time between processing two jobs of the same migration
  # class.
  #
  # This interval is set to 5 minutes so autovacuuming and other maintenance
  # related tasks have plenty of time to clean up after a migration has been
  # performed.
  MIN_INTERVAL = 5.minutes.to_i
  # This interval is set to 2 or 5 minutes so autovacuuming and other
  # maintenance related tasks have plenty of time to clean up after a migration
  # has been performed.
  def self.minimum_interval
    if enable_health_check?
      2.minutes.to_i
    else
      5.minutes.to_i
    end
  end

  def self.enable_health_check?
    Rails.env.development? ||
      Rails.env.test? ||
      Feature.enabled?('background_migration_health_check')
  end
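Callers are expected to use this method instead of the old constant; a small usage sketch, where `ExampleMigration` is a placeholder class name:

# 120 seconds in development/test or when the background_migration_health_check
# feature flag is enabled, 300 seconds otherwise:
interval = BackgroundMigrationWorker.minimum_interval

# Space the second batch of a hypothetical migration one interval further out:
BackgroundMigrationWorker.perform_in(2 * interval, 'ExampleMigration', [1, 100])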
  # Performs the background migration.
  #
......@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
      # running a migration of this class or we ran one recently. In this case
      # we'll reschedule the job in such a way that it is picked up again around
      # the time the lease expires.
      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
      self.class
        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
    end
  end
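For instance, if another process still holds the lease with 90 seconds left, the job is pushed back by roughly that long; without a known TTL it falls back to the minimum interval. A sketch with made-up numbers and a placeholder migration name:

ttl = 90 # seconds remaining on the existing lease, as reported by lease.ttl

BackgroundMigrationWorker.perform_in(
  ttl || BackgroundMigrationWorker.minimum_interval,
  'ExampleMigration',
  [1, 100]
)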
......@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
      [true, nil]
    else
      lease = lease_for(class_name)
      [lease.try_obtain, lease.ttl]
      perform = !!lease.try_obtain

      # If we managed to acquire the lease but the DB is not healthy, then we
      # want to simply reschedule our job and try again _after_ the lease
      # expires.
      if perform && !healthy_database?
        database_unhealthy_counter.increment

        perform = false
      end

      [perform, lease.ttl]
    end
  end

  def lease_for(class_name)
    Gitlab::ExclusiveLease
      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
  end

  def lease_key_for(class_name)
    "#{self.class.name}:#{class_name}"
  end

  def always_perform?
    Rails.env.test?
  end

  # Returns true if the database is healthy enough to allow the migration to be
  # performed.
  #
  # class_name - The name of the background migration that we might want to
  #              run.
  def healthy_database?
    return true unless self.class.enable_health_check?

    return true unless Gitlab::Database.postgresql?

    !Postgresql::ReplicationSlot.lag_too_great?
  end

  def database_unhealthy_counter
    Gitlab::Metrics.counter(
      :background_migration_database_health_reschedules,
      'The number of times a background migration is rescheduled because the database is unhealthy.'
    )
  end
end
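Put together, a rough trace of a run against an unhealthy primary looks like this (the lag result is assumed for illustration):

worker = BackgroundMigrationWorker.new

# The health check only applies when it is enabled and the database is
# PostgreSQL; it then defers to the replication slot lag:
Postgresql::ReplicationSlot.lag_too_great? # => true (assumed)
worker.healthy_database?                   # => false

# perform_and_ttl increments the background_migration_database_health_reschedules
# counter and returns [false, lease.ttl], so #perform re-enqueues the job with
# perform_in instead of running the migration.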
......@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
example, you can use background migrations to migrate data so that instead of
storing data in a single JSON column the data is stored in a separate table.

If the database cluster is considered to be in an unhealthy state, background
migrations automatically reschedule themselves for a later point in time.
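The check sits behind the `background_migration_health_check` feature flag; a minimal sketch of enabling it from a Rails console, assuming the standard `Feature` API:

Feature.enable('background_migration_health_check')

# With the flag on, jobs are spaced BackgroundMigrationWorker.minimum_interval
# (2 minutes) apart, and a job that finds the database unhealthy re-enqueues
# itself instead of running.
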
## When To Use Background Migrations
>**Note:**
......
......@@ -46,7 +46,11 @@ module Gitlab
    # arguments - The arguments to pass to the background migration's "perform"
    # method.
    def self.perform(class_name, arguments)
      const_get(class_name).new.perform(*arguments)
      migration_class_for(class_name).new.perform(*arguments)
    end

    def self.migration_class_for(class_name)
      const_get(class_name)
    end
  end
end
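A rough illustration of how a job is dispatched through this module; `ExampleMigration` is a placeholder and not part of this change:

module Gitlab
  module BackgroundMigration
    class ExampleMigration
      def perform(start_id, stop_id)
        # work on the rows between start_id and stop_id
      end
    end
  end
end

# migration_class_for resolves the name within the Gitlab::BackgroundMigration
# namespace; perform then instantiates it and forwards the arguments:
Gitlab::BackgroundMigration.perform('ExampleMigration', [1, 100])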
......@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
      # To not overload the worker too much we enforce a minimum interval both
      # when scheduling and performing jobs.
      if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
        delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
      if delay_interval < BackgroundMigrationWorker.minimum_interval
        delay_interval = BackgroundMigrationWorker.minimum_interval
      end

      model_class.each_batch(of: batch_size) do |relation, index|
......
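As a worked example (the caller's value is made up), a delay below the minimum is clamped, so with the health check enabled batches land roughly 2, 4, 6, ... minutes out:

delay_interval = 30.seconds

if delay_interval < BackgroundMigrationWorker.minimum_interval
  delay_interval = BackgroundMigrationWorker.minimum_interval # => 120 seconds
end

# Batch n is then scheduled n * delay_interval from now: 120, 240, 360, ...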
......@@ -27,11 +27,11 @@ describe NormalizeLdapExternUids, :migration, :sidekiq do
        migrate!

        expect(BackgroundMigrationWorker.jobs[0]['args']).to eq([described_class::MIGRATION, [1, 2]])
        expect(BackgroundMigrationWorker.jobs[0]['at']).to eq(5.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[0]['at']).to eq(2.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[1]['args']).to eq([described_class::MIGRATION, [3, 4]])
        expect(BackgroundMigrationWorker.jobs[1]['at']).to eq(10.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[1]['at']).to eq(4.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[2]['args']).to eq([described_class::MIGRATION, [5, 5]])
        expect(BackgroundMigrationWorker.jobs[2]['at']).to eq(15.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[2]['at']).to eq(6.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs.size).to eq 3
      end
    end
......
# frozen_string_literal: true

require 'spec_helper'

describe Postgresql::ReplicationSlot, :postgresql do
  describe '.lag_too_great?' do
    it 'returns true when replication lag is too great' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([125.megabytes])

      expect(described_class.lag_too_great?).to eq(true)
    end

    it 'returns false when more than one replica is up to date enough' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([125.megabytes, 0.megabytes, 0.megabytes])

      expect(described_class.lag_too_great?).to eq(false)
    end

    it 'returns false when replication lag is not too great' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([0.megabytes])

      expect(described_class.lag_too_great?).to eq(false)
    end
  end
end
......@@ -3,6 +3,12 @@ require 'spec_helper'
describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
  let(:worker) { described_class.new }

  describe '.minimum_interval' do
    it 'returns 2 minutes' do
      expect(described_class.minimum_interval).to eq(2.minutes.to_i)
    end
  end

  describe '.perform' do
    it 'performs a background migration' do
      expect(Gitlab::BackgroundMigration)
......@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state d
      worker.perform('Foo', [10, 20])
    end

    it 'reschedules a migration if the database is not healthy' do
      allow(worker)
        .to receive(:always_perform?)
        .and_return(false)

      allow(worker)
        .to receive(:healthy_database?)
        .and_return(false)

      expect(described_class)
        .to receive(:perform_in)
        .with(a_kind_of(Numeric), 'Foo', [10, 20])

      worker.perform('Foo', [10, 20])
    end
  end

  describe '#healthy_database?' do
    context 'using MySQL', :mysql do
      it 'returns true' do
        expect(worker.healthy_database?).to eq(true)
      end
    end

    context 'using PostgreSQL', :postgresql do
      context 'when replication lag is too great' do
        it 'returns false' do
          allow(Postgresql::ReplicationSlot)
            .to receive(:lag_too_great?)
            .and_return(true)

          expect(worker.healthy_database?).to eq(false)
        end
      end

      context 'when replication lag is small enough' do
        it 'returns true' do
          allow(Postgresql::ReplicationSlot)
            .to receive(:lag_too_great?)
            .and_return(false)

          expect(worker.healthy_database?).to eq(true)
        end
      end
    end
  end
end