Commit 83e8876a authored by Michael Kozono

Merge branch '337752-geo-some-container-repositories-stuck-in-started-state' into 'master'

Fix: Container repository geo syncs stuck in started state

See merge request gitlab-org/gitlab!68080
parents b011b9db 9361b2bf
@@ -13,6 +13,7 @@ class Geo::ContainerRepositoryRegistry < Geo::BaseRegistry
   scope :never_attempted_sync, -> { with_state(:pending).where(last_synced_at: nil) }
   scope :retry_due, -> { where(arel_table[:retry_at].eq(nil).or(arel_table[:retry_at].lt(Time.current))) }
   scope :synced, -> { with_state(:synced) }
+  scope :sync_timed_out, -> { with_state(:started).where("last_synced_at < ?", Geo::ContainerRepositorySyncService::LEASE_TIMEOUT.ago) }
 
   state_machine :state, initial: :pending do
     state :started
@@ -38,22 +39,40 @@ class Geo::ContainerRepositoryRegistry < Geo::BaseRegistry
     end
   end
 
-  def self.find_registries_needs_sync_again(batch_size:, except_ids: [])
-    super.order(Gitlab::Database.nulls_first_order(:last_synced_at))
-  end
-
-  def self.delete_for_model_ids(container_repository_ids)
-    where(container_repository_id: container_repository_ids).delete_all
-
-    container_repository_ids
-  end
-
-  def self.pluck_container_repository_key
-    where(nil).pluck(:container_repository_id)
-  end
-
-  def self.replication_enabled?
-    Gitlab.config.geo.registry_replication.enabled
-  end
+  class << self
+    include Delay
+
+    def find_registries_needs_sync_again(batch_size:, except_ids: [])
+      super.order(Gitlab::Database.nulls_first_order(:last_synced_at))
+    end
+
+    def delete_for_model_ids(container_repository_ids)
+      where(container_repository_id: container_repository_ids).delete_all
+
+      container_repository_ids
+    end
+
+    def pluck_container_repository_key
+      where(nil).pluck(:container_repository_id)
+    end
+
+    def replication_enabled?
+      Gitlab.config.geo.registry_replication.enabled
+    end
+
+    # Fail syncs for records which started syncing a long time ago
+    def fail_sync_timeouts
+      attrs = {
+        state: :failed,
+        last_sync_failure: "Sync timed out after #{Geo::ContainerRepositorySyncService::LEASE_TIMEOUT} hours",
+        retry_count: 1,
+        retry_at: next_retry_time(1)
+      }
+
+      sync_timed_out.all.each_batch do |relation|
+        relation.update_all(attrs)
+      end
+    end
+  end
 
   def fail_sync!(message, error)
...
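Read together, the new sync_timed_out scope and the fail_sync_timeouts class method batch-fail any registry row that has sat in the started state for longer than the sync lease. A minimal sketch of the effective behaviour, written as if inside the class so next_retry_time resolves through the Delay concern (illustrative only; the concrete value of Geo::ContainerRepositorySyncService::LEASE_TIMEOUT is not shown in this diff):

# Illustrative sketch of what fail_sync_timeouts boils down to, not the literal implementation.
timeout = Geo::ContainerRepositorySyncService::LEASE_TIMEOUT

Geo::ContainerRepositoryRegistry
  .with_state(:started)                      # the sync_timed_out scope: stuck in "started"...
  .where("last_synced_at < ?", timeout.ago)  # ...for longer than the sync lease
  .each_batch do |relation|                  # batched to avoid one oversized UPDATE
    relation.update_all(
      state: :failed,
      last_sync_failure: "Sync timed out after #{timeout} hours",
      retry_count: 1,
      retry_at: next_retry_time(1)           # Delay concern picks the next retry timestamp
    )
  end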
@@ -20,6 +20,8 @@ module Geo
       Gitlab::Geo.enabled_replicator_classes.each do |replicator_class|
         replicator_class.fail_sync_timeouts
       end
+
+      ::Geo::ContainerRepositoryRegistry.fail_sync_timeouts
     end
   end
 end
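For context, the hunk above appears to land in Geo::SyncTimeoutCronWorker#perform (the worker spec further down exercises exactly this behaviour). A rough sketch of how the whole method reads after the change; the worker's boilerplate (ApplicationWorker include, cron queue settings, and so on) is assumed rather than taken from this diff:

# Sketch only; class boilerplate is assumed, not shown in the diff.
module Geo
  class SyncTimeoutCronWorker
    def perform
      # Registries managed by the replicator framework fail their own timed-out syncs.
      Gitlab::Geo.enabled_replicator_classes.each do |replicator_class|
        replicator_class.fail_sync_timeouts
      end

      # Container repository registries are not covered by enabled_replicator_classes,
      # so they are now failed explicitly to unstick records left in "started".
      ::Geo::ContainerRepositoryRegistry.fail_sync_timeouts
    end
  end
end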
@@ -223,4 +223,16 @@ RSpec.describe Geo::ContainerRepositoryRegistry, :geo do
       expect(Geo::ContainerRepositoryRegistry.replication_enabled?).to be_falsey
     end
   end
+
+  describe '.fail_sync_timeouts' do
+    it 'marks started records as failed if they are expired' do
+      record1 = create(:container_repository_registry, :sync_started, last_synced_at: 9.hours.ago)
+      record2 = create(:container_repository_registry, :sync_started, last_synced_at: 1.hour.ago) # not yet expired
+
+      described_class.fail_sync_timeouts
+
+      expect(record1.reload.state).to eq "failed"
+      expect(record2.reload.state).to eq "started"
+    end
+  end
 end
@@ -4,11 +4,12 @@ require 'spec_helper'
 
 RSpec.describe Geo::SyncTimeoutCronWorker, :geo do
   describe '#perform' do
-    it 'calls fail_sync_timeouts on enabled Replicators' do
+    it 'calls fail_sync_timeouts' do
       replicator = double('replicator')
       expect(replicator).to receive(:fail_sync_timeouts)
       expect(Gitlab::Geo).to receive(:enabled_replicator_classes).and_return([replicator])
+      expect(Geo::ContainerRepositoryRegistry).to receive(:fail_sync_timeouts)
 
       described_class.new.perform
     end
...