Commit 7c4f0c96 authored by Stan Hu

Merge branch '267828-remove-minimum_interval-from-backgroundmigrationworker' into 'master'

Remove possible infinite loop from BackgroundMigrationWorker

See merge request gitlab-org/gitlab!45298
parents c67a1159 ec76b503
@@ -24,10 +24,14 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
   # class_name - The class name of the background migration to run.
   # arguments - The arguments to pass to the migration class.
   # lease_attempts - The number of times we will try to obtain an exclusive
-  #     lease on the class before running anyway. Pass 0 to always run.
+  #     lease on the class before giving up. See MR for more discussion.
+  #     https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
   def perform(class_name, arguments = [], lease_attempts = 5)
     with_context(caller_id: class_name.to_s) do
-      should_perform, ttl = perform_and_ttl(class_name)
+      attempts_left = lease_attempts - 1
+      should_perform, ttl = perform_and_ttl(class_name, attempts_left)
+
+      break if should_perform.nil?

       if should_perform
         Gitlab::BackgroundMigration.perform(class_name, arguments)
@@ -37,33 +41,40 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
         # we'll reschedule the job in such a way that it is picked up again around
         # the time the lease expires.
         self.class
-          .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
+          .perform_in(ttl || self.class.minimum_interval, class_name, arguments, attempts_left)
       end
     end
   end

-  def perform_and_ttl(class_name)
-    if always_perform?
-      # In test environments `perform_in` will run right away. This can then
-      # lead to stack level errors in the above `#perform`. To work around this
-      # we'll just perform the migration right away in the test environment.
-      [true, nil]
-    else
-      lease = lease_for(class_name)
-      perform = !!lease.try_obtain
-
-      # If we managed to acquire the lease but the DB is not healthy, then we
-      # want to simply reschedule our job and try again _after_ the lease
-      # expires.
-      if perform && !healthy_database?
-        database_unhealthy_counter.increment
-
-        perform = false
-      end
-
-      [perform, lease.ttl]
-    end
-  end
+  def perform_and_ttl(class_name, attempts_left)
+    # In test environments `perform_in` will run right away. This can then
+    # lead to stack level errors in the above `#perform`. To work around this
+    # we'll just perform the migration right away in the test environment.
+    return [true, nil] if always_perform?
+
+    lease = lease_for(class_name)
+    lease_obtained = !!lease.try_obtain
+    healthy_db = healthy_database?
+    perform = lease_obtained && healthy_db
+
+    database_unhealthy_counter.increment if lease_obtained && !healthy_db
+
+    # If we've tried several times to get a lease with a healthy DB without success, just give up.
+    # Otherwise we could end up in an infinite rescheduling loop.
+    if !perform && attempts_left < 0
+      msg = if !lease_obtained
+              'Job could not get an exclusive lease after several tries. Giving up.'
+            else
+              'Database was unhealthy after several tries. Giving up.'
+            end
+
+      Sidekiq.logger.warn(class: class_name, message: msg, job_id: jid)
+
+      return [nil, nil]
+    end
+
+    [perform, lease.ttl]
+  end

   def lease_for(class_name)
     Gitlab::ExclusiveLease
       ...
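The substance of the change is easier to see outside the diff: every reschedule now carries a decremented attempts_left, and once that counter goes negative the job logs a warning and stops instead of re-enqueuing itself forever. The following standalone Ruby sketch (hypothetical class and helper names, not GitLab code) illustrates the same bounded-retry control flow:

# Minimal sketch of the bounded-retry pattern introduced above (hypothetical
# class and helper names). Each reschedule passes along attempts_left - 1;
# once the counter goes negative the job gives up.
require 'logger'

class BoundedRetryJob
  LOGGER = Logger.new($stdout)

  # `queue` stands in for Sidekiq's perform_in: rescheduled runs are recorded here.
  def initialize(queue)
    @queue = queue
  end

  def perform(name, lease_attempts = 5)
    attempts_left = lease_attempts - 1

    if work_available?(name)
      LOGGER.info("performing #{name}")
    elsif attempts_left < 0
      # Stop instead of rescheduling indefinitely.
      LOGGER.warn("#{name}: could not run after several tries. Giving up.")
    else
      # Reschedule with the decremented counter, mirroring
      # perform_in(ttl, class_name, arguments, attempts_left).
      @queue << [name, attempts_left]
    end
  end

  private

  # Stands in for "lease obtained and database healthy".
  def work_available?(_name)
    false
  end
end

queue = []
job = BoundedRetryJob.new(queue)
job.perform('Foo', 1)     # reschedules once, with attempts_left = 0
job.perform(*queue.shift) # attempts_left now goes to -1, so the job gives up

The spec changes below pin this behaviour down: rescheduling decrements the counter (5 becomes 4, 1 becomes 0), and a run with lease_attempts 0 that cannot proceed logs the warning and is not re-enqueued.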
---
title: Limit number of times a background migration is rescheduled
merge_request: 45298
author:
type: fixed
@@ -12,47 +12,93 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
   end

   describe '#perform' do
-    it 'performs a background migration' do
-      expect(Gitlab::BackgroundMigration)
-        .to receive(:perform)
-        .with('Foo', [10, 20])
+    before do
+      allow(worker).to receive(:jid).and_return(1)
+      expect(worker).to receive(:always_perform?).and_return(false)
+    end

-      worker.perform('Foo', [10, 20])
-    end
+    context 'when lease can be obtained' do
+      before do
+        expect(Gitlab::BackgroundMigration)
+          .to receive(:perform)
+          .with('Foo', [10, 20])
+      end

-    it 'reschedules a migration if it was performed recently' do
-      expect(worker)
-        .to receive(:always_perform?)
-        .and_return(false)
+      it 'performs a background migration' do
+        worker.perform('Foo', [10, 20])
+      end

-      worker.lease_for('Foo').try_obtain
+      context 'when lease_attempts is 1' do
+        it 'performs a background migration' do
+          worker.perform('Foo', [10, 20], 1)
+        end
+      end
+    end

-      expect(Gitlab::BackgroundMigration)
-        .not_to receive(:perform)
+    context 'when lease not obtained (migration of same class was performed recently)' do
+      before do
+        expect(Gitlab::BackgroundMigration).not_to receive(:perform)

-      expect(described_class)
-        .to receive(:perform_in)
-        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+        worker.lease_for('Foo').try_obtain
+      end

-      worker.perform('Foo', [10, 20])
-    end
+      it 'reschedules the migration and decrements the lease_attempts' do
+        expect(described_class)
+          .to receive(:perform_in)
+          .with(a_kind_of(Numeric), 'Foo', [10, 20], 4)

-    it 'reschedules a migration if the database is not healthy' do
-      allow(worker)
-        .to receive(:always_perform?)
-        .and_return(false)
+        worker.perform('Foo', [10, 20], 5)
+      end

-      allow(worker)
-        .to receive(:healthy_database?)
-        .and_return(false)
+      context 'when lease_attempts is 1' do
+        it 'reschedules the migration and decrements the lease_attempts' do
+          expect(described_class)
+            .to receive(:perform_in)
+            .with(a_kind_of(Numeric), 'Foo', [10, 20], 0)

-      expect(described_class)
-        .to receive(:perform_in)
-        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+          worker.perform('Foo', [10, 20], 1)
+        end
+      end

-      worker.perform('Foo', [10, 20])
-    end
+      context 'when lease_attempts is 0' do
+        it 'gives up performing the migration' do
+          expect(described_class).not_to receive(:perform_in)
+          expect(Sidekiq.logger).to receive(:warn).with(
+            class: 'Foo',
+            message: 'Job could not get an exclusive lease after several tries. Giving up.',
+            job_id: 1)
+
+          worker.perform('Foo', [10, 20], 0)
+        end
+      end
+    end
+
+    context 'when database is not healthy' do
+      before do
+        allow(worker).to receive(:healthy_database?).and_return(false)
+      end
+
+      it 'reschedules a migration if the database is not healthy' do
+        expect(described_class)
+          .to receive(:perform_in)
+          .with(a_kind_of(Numeric), 'Foo', [10, 20], 4)
+
+        worker.perform('Foo', [10, 20])
+      end
+
+      context 'when lease_attempts is 0' do
+        it 'gives up performing the migration' do
+          expect(described_class).not_to receive(:perform_in)
+          expect(Sidekiq.logger).to receive(:warn).with(
+            class: 'Foo',
+            message: 'Database was unhealthy after several tries. Giving up.',
+            job_id: 1)
+
+          worker.perform('Foo', [10, 20], 0)
+        end
+      end
+    end

     it 'sets the class that will be executed as the caller_id' do
       expect(Gitlab::BackgroundMigration).to receive(:perform) do
         expect(Labkit::Context.current.to_h).to include('meta.caller_id' => 'Foo')
         ...
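Usage note (an assumption based on the standard Sidekiq client API, not shown in this diff): callers enqueue the worker as before and may pass an explicit lease_attempts as the extra perform argument, for example:

# Hypothetical enqueue calls; the migration class name and arguments are made up.
# Default of 5 lease attempts before the job gives up:
BackgroundMigrationWorker.perform_async('CopySomeColumn', [1, 1000])

# Run in two minutes and allow at most one extra reschedule before giving up
# (2.minutes relies on ActiveSupport, which is loaded in the GitLab Rails app):
BackgroundMigrationWorker.perform_in(2.minutes, 'CopySomeColumn', [1, 1000], 1)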