Commit 964d9f43 authored by Stan Hu's avatar Stan Hu

Merge branch 'background-migrations-system-load' into 'master'

Respond to DB health in background migrations

See merge request gitlab-org/gitlab-ce!20720
parents b4415c01 1e5192cc
# frozen_string_literal: true

module Postgresql
  class ReplicationSlot < ActiveRecord::Base
    self.table_name = 'pg_replication_slots'

    # Returns true if the lag observed across all replication slots exceeds a
    # given threshold.
    #
    # max - The maximum replication lag size, in bytes. Based on GitLab.com
    #       statistics it takes between 1 and 5 seconds to replicate around
    #       100 MB of data.
    def self.lag_too_great?(max = 100.megabytes)
      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"

      # We force the use of a transaction here so the query always goes to the
      # primary, even when using the EE DB load balancer.
      sizes = transaction { pluck(lag_function) }
      too_great = sizes.count { |size| size >= max }

      # If too many replicas are falling behind too much, the availability of a
      # GitLab instance might suffer. To prevent this from happening we require
      # at least 1 replica to have data recent enough.
      if sizes.any? && too_great.positive?
        (sizes.length - too_great) <= 1
      else
        false
      end
    end
  end
end
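As a worked example with made-up sizes: suppose three replication slots report lag of 125 MB, 0 and 0 against the default 100 MB threshold.

sizes     = [125.megabytes, 0, 0]
too_great = sizes.count { |size| size >= 100.megabytes } # => 1

# Two replicas are still within the threshold, so more than one replica has
# recent enough data and the check returns false:
(sizes.length - too_great) <= 1 # => false

# With [125.megabytes, 110.megabytes, 0] only one replica is up to date, so
# lag_too_great? would return true and background migrations back off.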
......@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
  # The minimum amount of time between processing two jobs of the same migration
  # class.
  #
  # This interval is set to 5 minutes so autovacuuming and other maintenance
  # related tasks have plenty of time to clean up after a migration has been
  # performed.
  MIN_INTERVAL = 5.minutes.to_i
  # This interval is set to 2 or 5 minutes so autovacuuming and other
  # maintenance related tasks have plenty of time to clean up after a migration
  # has been performed.
  def self.minimum_interval
    if enable_health_check?
      2.minutes.to_i
    else
      5.minutes.to_i
    end
  end

  def self.enable_health_check?
    Rails.env.development? ||
      Rails.env.test? ||
      Feature.enabled?('background_migration_health_check')
  end
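Callers are expected to use this method instead of the old constant; a small usage sketch, where `ExampleMigration` is a placeholder class name:

# 120 seconds in development/test or when the background_migration_health_check
# feature flag is enabled, 300 seconds otherwise:
interval = BackgroundMigrationWorker.minimum_interval

# Space the second batch of a hypothetical migration one interval further out:
BackgroundMigrationWorker.perform_in(2 * interval, 'ExampleMigration', [1, 100])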
  # Performs the background migration.
  #
......@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
      # running a migration of this class or we ran one recently. In this case
      # we'll reschedule the job in such a way that it is picked up again around
      # the time the lease expires.
      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
      self.class
        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
    end
  end
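For instance, if another process still holds the lease with 90 seconds left, the job is pushed back by roughly that long; without a known TTL it falls back to the minimum interval. A sketch with made-up numbers and a placeholder migration name:

ttl = 90 # seconds remaining on the existing lease, as reported by lease.ttl

BackgroundMigrationWorker.perform_in(
  ttl || BackgroundMigrationWorker.minimum_interval,
  'ExampleMigration',
  [1, 100]
)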
......@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
      [true, nil]
    else
      lease = lease_for(class_name)
      [lease.try_obtain, lease.ttl]
      perform = !!lease.try_obtain

      # If we managed to acquire the lease but the DB is not healthy, then we
      # want to simply reschedule our job and try again _after_ the lease
      # expires.
      if perform && !healthy_database?
        database_unhealthy_counter.increment

        perform = false
      end

      [perform, lease.ttl]
    end
  end

  def lease_for(class_name)
    Gitlab::ExclusiveLease
      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
  end

  def lease_key_for(class_name)
    "#{self.class.name}:#{class_name}"
  end

  def always_perform?
    Rails.env.test?
  end

  # Returns true if the database is healthy enough to allow the migration to be
  # performed.
  #
  # class_name - The name of the background migration that we might want to
  #              run.
  def healthy_database?
    return true unless self.class.enable_health_check?

    return true unless Gitlab::Database.postgresql?

    !Postgresql::ReplicationSlot.lag_too_great?
  end

  def database_unhealthy_counter
    Gitlab::Metrics.counter(
      :background_migration_database_health_reschedules,
      'The number of times a background migration is rescheduled because the database is unhealthy.'
    )
  end
end
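Put together, a rough trace of a run against an unhealthy primary looks like this (the lag result is assumed for illustration):

worker = BackgroundMigrationWorker.new

# The health check only applies when it is enabled and the database is
# PostgreSQL; it then defers to the replication slot lag:
Postgresql::ReplicationSlot.lag_too_great? # => true (assumed)
worker.healthy_database?                   # => false

# perform_and_ttl increments the background_migration_database_health_reschedules
# counter and returns [false, lease.ttl], so #perform re-enqueues the job with
# perform_in instead of running the migration.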
......@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
example, you can use background migrations to migrate data so that instead of
storing data in a single JSON column the data is stored in a separate table.

If the database cluster is considered to be in an unhealthy state, background
migrations automatically reschedule themselves for a later point in time.
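The check sits behind the `background_migration_health_check` feature flag; a minimal sketch of enabling it from a Rails console, assuming the standard `Feature` API:

Feature.enable('background_migration_health_check')

# With the flag on, jobs are spaced BackgroundMigrationWorker.minimum_interval
# (2 minutes) apart, and a job that finds the database unhealthy re-enqueues
# itself instead of running.
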
## When To Use Background Migrations
>**Note:**
......
......@@ -46,7 +46,11 @@ module Gitlab
    # arguments - The arguments to pass to the background migration's "perform"
    # method.
    def self.perform(class_name, arguments)
      const_get(class_name).new.perform(*arguments)
      migration_class_for(class_name).new.perform(*arguments)
    end

    def self.migration_class_for(class_name)
      const_get(class_name)
    end
  end
end
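A rough illustration of how a job is dispatched through this module; `ExampleMigration` is a placeholder and not part of this change:

module Gitlab
  module BackgroundMigration
    class ExampleMigration
      def perform(start_id, stop_id)
        # work on the rows between start_id and stop_id
      end
    end
  end
end

# migration_class_for resolves the name within the Gitlab::BackgroundMigration
# namespace; perform then instantiates it and forwards the arguments:
Gitlab::BackgroundMigration.perform('ExampleMigration', [1, 100])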
......@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
      # To not overload the worker too much we enforce a minimum interval both
      # when scheduling and performing jobs.
      if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
        delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
      if delay_interval < BackgroundMigrationWorker.minimum_interval
        delay_interval = BackgroundMigrationWorker.minimum_interval
      end

      model_class.each_batch(of: batch_size) do |relation, index|
......
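As a worked example (the caller's value is made up), a delay below the minimum is clamped, so with the health check enabled batches land roughly 2, 4, 6, ... minutes out:

delay_interval = 30.seconds

if delay_interval < BackgroundMigrationWorker.minimum_interval
  delay_interval = BackgroundMigrationWorker.minimum_interval # => 120 seconds
end

# Batch n is then scheduled n * delay_interval from now: 120, 240, 360, ...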
......@@ -27,11 +27,11 @@ describe NormalizeLdapExternUids, :migration, :sidekiq do
        migrate!

        expect(BackgroundMigrationWorker.jobs[0]['args']).to eq([described_class::MIGRATION, [1, 2]])
        expect(BackgroundMigrationWorker.jobs[0]['at']).to eq(5.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[0]['at']).to eq(2.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[1]['args']).to eq([described_class::MIGRATION, [3, 4]])
        expect(BackgroundMigrationWorker.jobs[1]['at']).to eq(10.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[1]['at']).to eq(4.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[2]['args']).to eq([described_class::MIGRATION, [5, 5]])
        expect(BackgroundMigrationWorker.jobs[2]['at']).to eq(15.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs[2]['at']).to eq(6.minutes.from_now.to_f)
        expect(BackgroundMigrationWorker.jobs.size).to eq 3
      end
    end
......
# frozen_string_literal: true

require 'spec_helper'

describe Postgresql::ReplicationSlot, :postgresql do
  describe '.lag_too_great?' do
    it 'returns true when replication lag is too great' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([125.megabytes])

      expect(described_class.lag_too_great?).to eq(true)
    end

    it 'returns false when more than one replica is up to date enough' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([125.megabytes, 0.megabytes, 0.megabytes])

      expect(described_class.lag_too_great?).to eq(false)
    end

    it 'returns false when replication lag is not too great' do
      expect(described_class)
        .to receive(:pluck)
        .and_return([0.megabytes])

      expect(described_class.lag_too_great?).to eq(false)
    end
  end
end
......@@ -3,6 +3,12 @@ require 'spec_helper'
describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
  let(:worker) { described_class.new }

  describe '.minimum_interval' do
    it 'returns 2 minutes' do
      expect(described_class.minimum_interval).to eq(2.minutes.to_i)
    end
  end

  describe '.perform' do
    it 'performs a background migration' do
      expect(Gitlab::BackgroundMigration)
......@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state d
      worker.perform('Foo', [10, 20])
    end

    it 'reschedules a migration if the database is not healthy' do
      allow(worker)
        .to receive(:always_perform?)
        .and_return(false)

      allow(worker)
        .to receive(:healthy_database?)
        .and_return(false)

      expect(described_class)
        .to receive(:perform_in)
        .with(a_kind_of(Numeric), 'Foo', [10, 20])

      worker.perform('Foo', [10, 20])
    end
  end

  describe '#healthy_database?' do
    context 'using MySQL', :mysql do
      it 'returns true' do
        expect(worker.healthy_database?).to eq(true)
      end
    end

    context 'using PostgreSQL', :postgresql do
      context 'when replication lag is too great' do
        it 'returns false' do
          allow(Postgresql::ReplicationSlot)
            .to receive(:lag_too_great?)
            .and_return(true)

          expect(worker.healthy_database?).to eq(false)
        end
      end

      context 'when replication lag is small enough' do
        it 'returns true' do
          allow(Postgresql::ReplicationSlot)
            .to receive(:lag_too_great?)
            .and_return(false)

          expect(worker.healthy_database?).to eq(true)
        end
      end
    end
  end
end