Commit 1d3d0553 authored by Shinya Maeda's avatar Shinya Maeda

Fix BG migration causes unprocessed jobs due to the concurrency limit

This commit fixes the BG migration bug that its retry mechanizm is
not working as expected due to the concurrency limiting with
exclusive lease.

In order to fix the retry jobs, we have to allow to run multiple
jobs at some points.
parent 958aa425
......@@ -3,6 +3,8 @@
class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
include ApplicationWorker
MAX_LEASE_ATTEMPTS = 5
data_consistency :always
sidekiq_options retry: 3
......@@ -30,10 +32,11 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
# lease_attempts - The number of times we will try to obtain an exclusive
# lease on the class before giving up. See MR for more discussion.
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
def perform(class_name, arguments = [], lease_attempts = 5)
def perform(class_name, arguments = [], lease_attempts = MAX_LEASE_ATTEMPTS)
with_context(caller_id: class_name.to_s) do
retried = lease_attempts != MAX_LEASE_ATTEMPTS
attempts_left = lease_attempts - 1
should_perform, ttl = perform_and_ttl(class_name, attempts_left)
should_perform, ttl = perform_and_ttl(class_name, attempts_left, retried)
break if should_perform.nil?
......@@ -50,13 +53,13 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
end
end
def perform_and_ttl(class_name, attempts_left)
def perform_and_ttl(class_name, attempts_left, retried)
# In test environments `perform_in` will run right away. This can then
# lead to stack level errors in the above `#perform`. To work around this
# we'll just perform the migration right away in the test environment.
return [true, nil] if always_perform?
lease = lease_for(class_name)
lease = lease_for(class_name, retried)
lease_obtained = !!lease.try_obtain
healthy_db = healthy_database?
perform = lease_obtained && healthy_db
......@@ -82,13 +85,17 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
[perform, lease.ttl]
end
def lease_for(class_name)
def lease_for(class_name, retried)
Gitlab::ExclusiveLease
.new(lease_key_for(class_name), timeout: self.class.minimum_interval)
.new(lease_key_for(class_name, retried), timeout: self.class.minimum_interval)
end
def lease_key_for(class_name)
"#{self.class.name}:#{class_name}"
def lease_key_for(class_name, retried)
key = "#{self.class.name}:#{class_name}"
# We use a different exclusive lock key for retried jobs to allow them running concurrently with the scheduled jobs.
# See https://gitlab.com/gitlab-org/gitlab/-/merge_requests/68763 for more information.
key += ":retried" if retried
key
end
def always_perform?
......
......@@ -14,7 +14,17 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
describe '#perform' do
before do
allow(worker).to receive(:jid).and_return(1)
expect(worker).to receive(:always_perform?).and_return(false)
allow(worker).to receive(:always_perform?).and_return(false)
end
it 'can run scheduled job and retried job concurrently' do
expect(Gitlab::BackgroundMigration)
.to receive(:perform)
.with('Foo', [10, 20])
.exactly(2).time
worker.perform('Foo', [10, 20])
worker.perform('Foo', [10, 20], described_class::MAX_LEASE_ATTEMPTS - 1)
end
context 'when lease can be obtained' do
......@@ -39,7 +49,7 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
before do
expect(Gitlab::BackgroundMigration).not_to receive(:perform)
worker.lease_for('Foo').try_obtain
worker.lease_for('Foo', false).try_obtain
end
it 'reschedules the migration and decrements the lease_attempts' do
......@@ -51,6 +61,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end
context 'when lease_attempts is 1' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'reschedules the migration and decrements the lease_attempts' do
expect(described_class)
.to receive(:perform_in)
......@@ -61,6 +75,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end
context 'when lease_attempts is 0' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'gives up performing the migration' do
expect(described_class).not_to receive(:perform_in)
expect(Sidekiq.logger).to receive(:warn).with(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment