Commit 1d3d0553 authored by Shinya Maeda's avatar Shinya Maeda

Fix BG migration causes unprocessed jobs due to the concurrency limit

This commit fixes the BG migration bug that its retry mechanizm is
not working as expected due to the concurrency limiting with
exclusive lease.

In order to fix the retry jobs, we have to allow to run multiple
jobs at some points.
parent 958aa425
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
include ApplicationWorker include ApplicationWorker
MAX_LEASE_ATTEMPTS = 5
data_consistency :always data_consistency :always
sidekiq_options retry: 3 sidekiq_options retry: 3
...@@ -30,10 +32,11 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -30,10 +32,11 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
# lease_attempts - The number of times we will try to obtain an exclusive # lease_attempts - The number of times we will try to obtain an exclusive
# lease on the class before giving up. See MR for more discussion. # lease on the class before giving up. See MR for more discussion.
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956 # https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
def perform(class_name, arguments = [], lease_attempts = 5) def perform(class_name, arguments = [], lease_attempts = MAX_LEASE_ATTEMPTS)
with_context(caller_id: class_name.to_s) do with_context(caller_id: class_name.to_s) do
retried = lease_attempts != MAX_LEASE_ATTEMPTS
attempts_left = lease_attempts - 1 attempts_left = lease_attempts - 1
should_perform, ttl = perform_and_ttl(class_name, attempts_left) should_perform, ttl = perform_and_ttl(class_name, attempts_left, retried)
break if should_perform.nil? break if should_perform.nil?
...@@ -50,13 +53,13 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -50,13 +53,13 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
end end
end end
def perform_and_ttl(class_name, attempts_left) def perform_and_ttl(class_name, attempts_left, retried)
# In test environments `perform_in` will run right away. This can then # In test environments `perform_in` will run right away. This can then
# lead to stack level errors in the above `#perform`. To work around this # lead to stack level errors in the above `#perform`. To work around this
# we'll just perform the migration right away in the test environment. # we'll just perform the migration right away in the test environment.
return [true, nil] if always_perform? return [true, nil] if always_perform?
lease = lease_for(class_name) lease = lease_for(class_name, retried)
lease_obtained = !!lease.try_obtain lease_obtained = !!lease.try_obtain
healthy_db = healthy_database? healthy_db = healthy_database?
perform = lease_obtained && healthy_db perform = lease_obtained && healthy_db
...@@ -82,13 +85,17 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -82,13 +85,17 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
[perform, lease.ttl] [perform, lease.ttl]
end end
def lease_for(class_name) def lease_for(class_name, retried)
Gitlab::ExclusiveLease Gitlab::ExclusiveLease
.new(lease_key_for(class_name), timeout: self.class.minimum_interval) .new(lease_key_for(class_name, retried), timeout: self.class.minimum_interval)
end end
def lease_key_for(class_name) def lease_key_for(class_name, retried)
"#{self.class.name}:#{class_name}" key = "#{self.class.name}:#{class_name}"
# We use a different exclusive lock key for retried jobs to allow them running concurrently with the scheduled jobs.
# See https://gitlab.com/gitlab-org/gitlab/-/merge_requests/68763 for more information.
key += ":retried" if retried
key
end end
def always_perform? def always_perform?
......
...@@ -14,7 +14,17 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -14,7 +14,17 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
describe '#perform' do describe '#perform' do
before do before do
allow(worker).to receive(:jid).and_return(1) allow(worker).to receive(:jid).and_return(1)
expect(worker).to receive(:always_perform?).and_return(false) allow(worker).to receive(:always_perform?).and_return(false)
end
it 'can run scheduled job and retried job concurrently' do
expect(Gitlab::BackgroundMigration)
.to receive(:perform)
.with('Foo', [10, 20])
.exactly(2).time
worker.perform('Foo', [10, 20])
worker.perform('Foo', [10, 20], described_class::MAX_LEASE_ATTEMPTS - 1)
end end
context 'when lease can be obtained' do context 'when lease can be obtained' do
...@@ -39,7 +49,7 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -39,7 +49,7 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
before do before do
expect(Gitlab::BackgroundMigration).not_to receive(:perform) expect(Gitlab::BackgroundMigration).not_to receive(:perform)
worker.lease_for('Foo').try_obtain worker.lease_for('Foo', false).try_obtain
end end
it 'reschedules the migration and decrements the lease_attempts' do it 'reschedules the migration and decrements the lease_attempts' do
...@@ -51,6 +61,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -51,6 +61,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end end
context 'when lease_attempts is 1' do context 'when lease_attempts is 1' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'reschedules the migration and decrements the lease_attempts' do it 'reschedules the migration and decrements the lease_attempts' do
expect(described_class) expect(described_class)
.to receive(:perform_in) .to receive(:perform_in)
...@@ -61,6 +75,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -61,6 +75,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end end
context 'when lease_attempts is 0' do context 'when lease_attempts is 0' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'gives up performing the migration' do it 'gives up performing the migration' do
expect(described_class).not_to receive(:perform_in) expect(described_class).not_to receive(:perform_in)
expect(Sidekiq.logger).to receive(:warn).with( expect(Sidekiq.logger).to receive(:warn).with(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment