Commit 9361b2bf authored by Valery Sizov's avatar Valery Sizov

Fix: Container repository geo syncs stuck in started state

We add the container repository registry class to SyncTimeoutCronWorker,
as is already done for all the newer SSF registries.

Changelog: fixed
EE: true
parent 01d9cce7
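
For illustration only (not part of the commit): a minimal, self-contained Ruby sketch of the pattern the diff below applies, where records stuck in the started state past the sync lease are batch-marked as failed with a retry scheduled. The 8-hour timeout, the fixed backoff, and the in-memory Record struct are assumptions standing in for Geo::ContainerRepositorySyncService::LEASE_TIMEOUT, next_retry_time, and the ActiveRecord registry rows.

    # Sketch of the "fail stuck syncs" pattern; plain objects stand in for registry rows.
    LEASE_TIMEOUT = 8 * 60 * 60 # assumption: an 8-hour lease, in seconds

    Record = Struct.new(:state, :last_synced_at, :last_sync_failure, :retry_count, :retry_at)

    def fail_sync_timeouts(records, now: Time.now)
      cutoff = now - LEASE_TIMEOUT

      # Only records still marked :started whose last sync began before the cutoff are touched.
      records.select { |r| r.state == :started && r.last_synced_at && r.last_synced_at < cutoff }
             .each do |r|
        r.state = :failed
        r.last_sync_failure = "Sync timed out"
        r.retry_count = 1
        r.retry_at = now + 60 # assumption: fixed backoff instead of next_retry_time(1)
      end
    end

    records = [
      Record.new(:started, Time.now - 9 * 60 * 60), # expired, will be failed
      Record.new(:started, Time.now - 1 * 60 * 60)  # still within the lease, untouched
    ]

    fail_sync_timeouts(records)
    records.each { |r| puts "#{r.last_synced_at} -> #{r.state}" }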
@@ -13,6 +13,7 @@ class Geo::ContainerRepositoryRegistry < Geo::BaseRegistry
   scope :never_attempted_sync, -> { with_state(:pending).where(last_synced_at: nil) }
   scope :retry_due, -> { where(arel_table[:retry_at].eq(nil).or(arel_table[:retry_at].lt(Time.current))) }
   scope :synced, -> { with_state(:synced) }
+  scope :sync_timed_out, -> { with_state(:started).where("last_synced_at < ?", Geo::ContainerRepositorySyncService::LEASE_TIMEOUT.ago) }

   state_machine :state, initial: :pending do
     state :started
@@ -38,24 +39,42 @@ class Geo::ContainerRepositoryRegistry < Geo::BaseRegistry
     end
   end

-  def self.find_registries_needs_sync_again(batch_size:, except_ids: [])
+  class << self
+    include Delay
+
+    def find_registries_needs_sync_again(batch_size:, except_ids: [])
       super.order(Gitlab::Database.nulls_first_order(:last_synced_at))
     end

-  def self.delete_for_model_ids(container_repository_ids)
+    def delete_for_model_ids(container_repository_ids)
       where(container_repository_id: container_repository_ids).delete_all

       container_repository_ids
     end

-  def self.pluck_container_repository_key
+    def pluck_container_repository_key
       where(nil).pluck(:container_repository_id)
     end

-  def self.replication_enabled?
+    def replication_enabled?
       Gitlab.config.geo.registry_replication.enabled
     end
+
+    # Fail syncs for records which started syncing a long time ago
+    def fail_sync_timeouts
+      attrs = {
+        state: :failed,
+        last_sync_failure: "Sync timed out after #{Geo::ContainerRepositorySyncService::LEASE_TIMEOUT} hours",
+        retry_count: 1,
+        retry_at: next_retry_time(1)
+      }
+
+      sync_timed_out.all.each_batch do |relation|
+        relation.update_all(attrs)
+      end
+    end
+  end

   def fail_sync!(message, error)
     new_retry_count = retry_count + 1
...
@@ -20,6 +20,8 @@ module Geo
       Gitlab::Geo.enabled_replicator_classes.each do |replicator_class|
         replicator_class.fail_sync_timeouts
       end
+
+      ::Geo::ContainerRepositoryRegistry.fail_sync_timeouts
     end
   end
 end
@@ -223,4 +223,16 @@ RSpec.describe Geo::ContainerRepositoryRegistry, :geo do
       expect(Geo::ContainerRepositoryRegistry.replication_enabled?).to be_falsey
     end
   end
+
+  describe '.fail_sync_timeouts' do
+    it 'marks started records as failed if they are expired' do
+      record1 = create(:container_repository_registry, :sync_started, last_synced_at: 9.hours.ago)
+      record2 = create(:container_repository_registry, :sync_started, last_synced_at: 1.hour.ago) # not yet expired
+
+      described_class.fail_sync_timeouts
+
+      expect(record1.reload.state).to eq "failed"
+      expect(record2.reload.state).to eq "started"
+    end
+  end
 end
@@ -4,11 +4,12 @@ require 'spec_helper'

 RSpec.describe Geo::SyncTimeoutCronWorker, :geo do
   describe '#perform' do
-    it 'calls fail_sync_timeouts on enabled Replicators' do
+    it 'calls fail_sync_timeouts' do
       replicator = double('replicator')
       expect(replicator).to receive(:fail_sync_timeouts)
       expect(Gitlab::Geo).to receive(:enabled_replicator_classes).and_return([replicator])
+      expect(Geo::ContainerRepositoryRegistry).to receive(:fail_sync_timeouts)

       described_class.new.perform
     end
...