Commit 73618b6a authored by Stan Hu's avatar Stan Hu

Merge branch '3119-improve-error-recovery' into 'master'

Geo: Improve error handling

Closes #3119

See merge request gitlab-org/gitlab-ee!3290
parents f0c5977a 005ebf93
class Geo::FileRegistry < Geo::BaseRegistry
scope :failed, -> { where(success: false) }
scope :synced, -> { where(success: true) }
scope :retry_due, -> { where('retry_at is NULL OR retry_at < ?', Time.now) }
scope :lfs_objects, -> { where(file_type: :lfs) }
scope :attachments, -> { where(file_type: Geo::FileService::DEFAULT_OBJECT_TYPES) }
end
......@@ -15,6 +15,15 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
where(repository_sync_failed.or(wiki_sync_failed))
end
def self.retry_due
where(
arel_table[:repository_retry_at].lt(Time.now)
.or(arel_table[:wiki_retry_at].lt(Time.now))
.or(arel_table[:repository_retry_at].eq(nil))
.or(arel_table[:wiki_retry_at].eq(nil))
)
end
def self.synced
where.not(last_repository_synced_at: nil, last_repository_successful_sync_at: nil)
.where(resync_repository: false, resync_wiki: false)
......@@ -39,10 +48,16 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
end
def repository_sync_needed?(timestamp)
resync_repository? && (last_repository_synced_at.nil? || timestamp > last_repository_synced_at)
return false unless resync_repository?
return false if repository_retry_at && timestamp < repository_retry_at
last_repository_synced_at.nil? || timestamp > last_repository_synced_at
end
def wiki_sync_needed?(timestamp)
resync_wiki? && (last_wiki_synced_at.nil? || timestamp > last_wiki_synced_at)
return false unless resync_wiki?
return false if wiki_retry_at && timestamp < wiki_retry_at
last_wiki_synced_at.nil? || timestamp > last_wiki_synced_at
end
end
require 'securerandom'
module Geo
# The clone_url_prefix is used to build URLs for the Geo synchronization
# If this is missing from the primary node we raise this exception
......@@ -6,6 +8,7 @@ module Geo
class BaseSyncService
include ExclusiveLeaseGuard
include ::Gitlab::Geo::ProjectLogHelpers
include Delay
class << self
attr_accessor :type
......@@ -15,6 +18,8 @@ module Geo
LEASE_TIMEOUT = 8.hours.freeze
LEASE_KEY_PREFIX = 'geo_sync_service'.freeze
RETRY_BEFORE_REDOWNLOAD = 5
RETRY_LIMIT = 8
def initialize(project)
@project = project
......@@ -23,7 +28,18 @@ module Geo
def execute
try_obtain_lease do
log_info("Started #{type} sync")
sync_repository
if should_be_retried?
sync_repository
elsif should_be_redownloaded?
sync_repository(true)
else
# Clean up the state of sync to start a new cycle
registry.delete
log_info("Clean up #{type} sync status")
return
end
log_info("Finished #{type} sync")
end
end
......@@ -48,6 +64,22 @@ module Geo
private
def retry_count
registry.public_send("#{type}_retry_count") || 0 # rubocop:disable GitlabSecurity/PublicSend
end
def should_be_retried?
return false if registry.public_send("force_to_redownload_#{type}") # rubocop:disable GitlabSecurity/PublicSend
retry_count <= RETRY_BEFORE_REDOWNLOAD
end
def should_be_redownloaded?
return true if registry.public_send("force_to_redownload_#{type}") # rubocop:disable GitlabSecurity/PublicSend
(RETRY_BEFORE_REDOWNLOAD..RETRY_LIMIT) === retry_count
end
def sync_repository
raise NotImplementedError, 'This class should implement sync_repository method'
end
......@@ -101,11 +133,17 @@ module Geo
attrs = {}
attrs["last_#{type}_synced_at"] = started_at if started_at
if started_at
attrs["last_#{type}_synced_at"] = started_at
attrs["#{type}_retry_count"] = retry_count + 1
attrs["#{type}_retry_at"] = Time.now + delay(attrs["#{type}_retry_count"]).seconds
end
if finished_at
attrs["last_#{type}_successful_sync_at"] = finished_at
attrs["resync_#{type}"] = false
attrs["#{type}_retry_count"] = nil
attrs["#{type}_retry_at"] = nil
end
registry.update!(attrs)
......@@ -134,5 +172,43 @@ module Geo
def last_synced_at
registry.public_send("last_#{type}_synced_at") # rubocop:disable GitlabSecurity/PublicSend
end
def disk_path_temp
unless @disk_path_temp
random_string = SecureRandom.hex(7)
@disk_path_temp = "#{repository.disk_path}_#{random_string}"
end
@disk_path_temp
end
def build_temporary_repository
unless gitlab_shell.add_repository(project.repository_storage, disk_path_temp)
raise Gitlab::Shell::Error, 'Can not create a temporary repository'
end
repository.clone.tap { |repo| repo.disk_path = disk_path_temp }
end
def clean_up_temporary_repository
gitlab_shell.remove_repository(project.repository_storage_path, disk_path_temp)
end
def set_temp_repository_as_main
log_info(
"Setting newly downloaded repository as main",
storage_path: project.repository_storage_path,
temp_path: disk_path_temp,
disk_path: repository.disk_path
)
unless gitlab_shell.remove_repository(project.repository_storage_path, repository.disk_path)
raise Gitlab::Shell::Error, 'Can not remove outdated main repository to replace it'
end
unless gitlab_shell.mv_repository(project.repository_storage_path, disk_path_temp, repository.disk_path)
raise Gitlab::Shell::Error, 'Can not move temporary repository'
end
end
end
end
......@@ -2,6 +2,8 @@ module Geo
class FileDownloadService < FileService
LEASE_TIMEOUT = 8.hours.freeze
include Delay
def execute
try_obtain_lease do |lease|
start_time = Time.now
......@@ -45,6 +47,13 @@ module Geo
transfer.bytes = bytes_downloaded
transfer.success = success
unless success
# We don't limit the amount of retries
transfer.retry_count = (transfer.retry_count || 0) + 1
transfer.retry_at = Time.now + delay(transfer.retry_count).seconds
end
transfer.save
end
......
module Geo
class RepositorySyncService < BaseSyncService
include Gitlab::ShellAdapter
self.type = :repository
private
def sync_repository
fetch_project_repository
def sync_repository(redownload = false)
fetch_project_repository(redownload)
expire_repository_caches
end
def fetch_project_repository
log_info('Fetching project repository')
def fetch_project_repository(redownload)
log_info('Trying to fetch project repository')
update_registry(started_at: DateTime.now)
begin
if redownload
log_info('Redownloading repository')
fetch_geo_mirror(build_temporary_repository)
set_temp_repository_as_main
else
project.ensure_repository
fetch_geo_mirror(project.repository)
update_registry(finished_at: DateTime.now)
log_info("Finished repository sync",
update_delay_s: update_delay_in_seconds,
download_time_s: download_time_in_seconds)
rescue Gitlab::Shell::Error,
Gitlab::Git::RepositoryMirroring::RemoteError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing repository', e)
rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid repository', e)
log_info('Expiring caches')
project.repository.after_create
end
update_registry(finished_at: DateTime.now)
log_info('Finished repository sync',
update_delay_s: update_delay_in_seconds,
download_time_s: download_time_in_seconds)
rescue Gitlab::Shell::Error,
Gitlab::Git::RepositoryMirroring::RemoteError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing repository', e)
rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid repository', e)
log_info('Setting force_to_redownload flag')
registry.update(force_to_redownload_repository: true)
log_info('Expiring caches')
project.repository.after_create
ensure
clean_up_temporary_repository if redownload
end
def expire_repository_caches
......@@ -40,5 +50,9 @@ module Geo
def ssh_url_to_repo
"#{primary_ssh_path_prefix}#{project.full_path}.git"
end
def repository
project.repository
end
end
end
module Geo
class WikiSyncService < BaseSyncService
include Gitlab::ShellAdapter
self.type = :wiki
private
def sync_repository
fetch_wiki_repository
def sync_repository(redownload = false)
fetch_wiki_repository(redownload)
end
def fetch_wiki_repository
def fetch_wiki_repository(redownload)
log_info('Fetching wiki repository')
update_registry(started_at: DateTime.now)
begin
if redownload
log_info('Redownloading wiki')
fetch_geo_mirror(build_temporary_repository)
set_temp_repository_as_main
else
project.wiki.ensure_repository
fetch_geo_mirror(project.wiki.repository)
update_registry(finished_at: DateTime.now)
log_info("Finished wiki sync",
update_delay_s: update_delay_in_seconds,
download_time_s: download_time_in_seconds)
rescue Gitlab::Git::Repository::NoRepository,
Gitlab::Git::RepositoryMirroring::RemoteError,
Gitlab::Shell::Error,
ProjectWiki::CouldNotCreateWikiError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing wiki repository', e)
end
update_registry(finished_at: DateTime.now)
log_info('Finished wiki sync',
update_delay_s: update_delay_in_seconds,
download_time_s: download_time_in_seconds)
rescue Gitlab::Git::RepositoryMirroring::RemoteError,
Gitlab::Shell::Error,
ProjectWiki::CouldNotCreateWikiError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing wiki repository', e)
rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid wiki', e)
registry.update(force_to_redownload_wiki: true)
ensure
clean_up_temporary_repository if redownload
end
def ssh_url_to_wiki
"#{primary_ssh_path_prefix}#{project.full_path}.wiki.git"
end
def repository
project.wiki.repository
end
end
end
module Delay
# Progressive backoff. It's copied from Sidekiq as is
def delay(retry_count = 0)
(retry_count**4) + 15 + (rand(30) * (retry_count + 1))
end
end
......@@ -33,6 +33,7 @@ module Geo
def find_failed_objects(batch_size:)
Geo::FileRegistry
.failed
.retry_due
.limit(batch_size)
.pluck(:file_id, :file_type)
end
......
......@@ -33,6 +33,7 @@ module Geo
def find_project_ids_updated_recently(batch_size:)
current_node.project_registries
.dirty
.retry_due
.order(Gitlab::Database.nulls_first_order(:last_repository_synced_at, :desc))
.limit(batch_size)
.pluck(:project_id)
......
---
title: Improve error handling
merge_request:
author:
type: added
class AddRetryCountFieldsToRegistries < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
def up
add_column :file_registry, :retry_count, :integer
add_column :file_registry, :retry_at, :datetime
add_column :project_registry, :repository_retry_count, :integer
add_column :project_registry, :repository_retry_at, :datetime
add_column :project_registry, :force_to_redownload_repository, :boolean
add_column :project_registry, :wiki_retry_count, :integer
add_column :project_registry, :wiki_retry_at, :datetime
add_column :project_registry, :force_to_redownload_wiki, :boolean
# Indecies
add_concurrent_index :file_registry, :retry_at
add_concurrent_index :project_registry, :repository_retry_at
add_concurrent_index :project_registry, :wiki_retry_at
end
def down
remove_column :file_registry, :retry_count, :integer
remove_column :file_registry, :retry_at, :datetime
remove_column :project_registry, :repository_retry_count, :integer
remove_column :project_registry, :repository_retry_at, :datetime
remove_column :project_registry, :force_to_redownload_repository, :boolean
remove_column :project_registry, :wiki_retry_count, :integer
remove_column :project_registry, :wiki_retry_at, :datetime
remove_column :project_registry, :force_to_redownload_wiki, :boolean
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171009162209) do
ActiveRecord::Schema.define(version: 20171101105200) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -26,10 +26,13 @@ ActiveRecord::Schema.define(version: 20171009162209) do
t.string "sha256"
t.datetime "created_at", null: false
t.boolean "success", default: false, null: false
t.integer "retry_count"
t.datetime "retry_at"
end
add_index "file_registry", ["file_type", "file_id"], name: "index_file_registry_on_file_type_and_file_id", unique: true, using: :btree
add_index "file_registry", ["file_type"], name: "index_file_registry_on_file_type", using: :btree
add_index "file_registry", ["retry_at"], name: "index_file_registry_on_retry_at", using: :btree
add_index "file_registry", ["success"], name: "index_file_registry_on_success", using: :btree
create_table "project_registry", force: :cascade do |t|
......@@ -41,12 +44,20 @@ ActiveRecord::Schema.define(version: 20171009162209) do
t.boolean "resync_wiki", default: true, null: false
t.datetime "last_wiki_synced_at"
t.datetime "last_wiki_successful_sync_at"
t.integer "repository_retry_count"
t.datetime "repository_retry_at"
t.boolean "force_to_redownload_repository"
t.integer "wiki_retry_count"
t.datetime "wiki_retry_at"
t.boolean "force_to_redownload_wiki"
end
add_index "project_registry", ["last_repository_successful_sync_at"], name: "index_project_registry_on_last_repository_successful_sync_at", using: :btree
add_index "project_registry", ["last_repository_synced_at"], name: "index_project_registry_on_last_repository_synced_at", using: :btree
add_index "project_registry", ["project_id"], name: "index_project_registry_on_project_id", unique: true, using: :btree
add_index "project_registry", ["repository_retry_at"], name: "index_project_registry_on_repository_retry_at", using: :btree
add_index "project_registry", ["resync_repository"], name: "index_project_registry_on_resync_repository", using: :btree
add_index "project_registry", ["resync_wiki"], name: "index_project_registry_on_resync_wiki", using: :btree
add_index "project_registry", ["wiki_retry_at"], name: "index_project_registry_on_wiki_retry_at", using: :btree
end
......@@ -15,4 +15,13 @@ describe Geo::FileRegistry do
expect(described_class.synced).to contain_exactly(synced)
end
end
describe '.retry_due' do
set(:retry_yesterday) { create(:geo_file_registry, retry_at: Date.yesterday) }
set(:retry_tomorrow) { create(:geo_file_registry, retry_at: Date.tomorrow) }
it 'returns registries in the synced state' do
expect(described_class.retry_due).not_to contain_exactly([retry_tomorrow])
end
end
end
......@@ -38,6 +38,16 @@ describe Geo::ProjectRegistry do
end
end
describe '.retry_due' do
it 'returns projects that should be synced' do
create(:geo_project_registry, repository_retry_at: Date.yesterday, wiki_retry_at: Date.yesterday)
tomorrow = create(:geo_project_registry, repository_retry_at: Date.tomorrow, wiki_retry_at: Date.tomorrow)
create(:geo_project_registry)
expect(described_class.retry_due).not_to include(tomorrow)
end
end
describe '#repository_sync_due?' do
where(:resync_repository, :last_successful_sync, :last_sync, :expected) do
now = Time.now
......
......@@ -9,10 +9,10 @@ describe Geo::BaseSyncService do
describe '#lease_key' do
it 'returns a key in the correct pattern' do
described_class.type = :test
allow(described_class).to receive(:type) { :wiki }
allow(project).to receive(:id) { 999 }
expect(subject.lease_key).to eq('geo_sync_service:test:999')
expect(subject.lease_key).to eq('geo_sync_service:wiki:999')
end
end
......
......@@ -29,6 +29,8 @@ describe Geo::FileDownloadService do
stub_transfer(Gitlab::Geo::FileTransfer, -1)
expect { execute! }.to change { Geo::FileRegistry.failed.count }.by(1)
expect(Geo::FileRegistry.last.retry_count).to eq(1)
expect(Geo::FileRegistry.last.retry_at).to be_present
end
it 'registers when the download fails with some other error' do
......
require 'spec_helper'
RSpec.describe Geo::RepositorySyncService do
describe Geo::RepositorySyncService do
include ::EE::GeoHelpers
set(:primary) { create(:geo_node, :primary, host: 'primary-geo-node', relative_url_root: '/gitlab') }
......@@ -122,6 +122,15 @@ RSpec.describe Geo::RepositorySyncService do
subject.execute
end
it 'sets repository_retry_count and repository_retry_at to nil' do
registry = create(:geo_project_registry, project: project, repository_retry_count: 2, repository_retry_at: Date.yesterday)
subject.execute
expect(registry.reload.repository_retry_count).to be_nil
expect(registry.repository_retry_at).to be_nil
end
end
context 'when repository sync fail' do
......@@ -140,6 +149,45 @@ RSpec.describe Geo::RepositorySyncService do
it 'resets last_repository_successful_sync_at' do
expect(registry.last_repository_successful_sync_at).to be_nil
end
it 'resets repository_retry_count' do
expect(registry.repository_retry_count).to eq(1)
end
it 'resets repository_retry_at' do
expect(registry.repository_retry_at).to be_present
end
end
end
context 'retries' do
it 'tries to fetch repo' do
create(:geo_project_registry, project: project, repository_retry_count: Geo::BaseSyncService::RETRY_BEFORE_REDOWNLOAD - 1)
expect_any_instance_of(described_class).to receive(:fetch_project_repository).with(false)
subject.execute
end
it 'tries to redownload repo' do
create(:geo_project_registry, project: project, repository_retry_count: Geo::BaseSyncService::RETRY_BEFORE_REDOWNLOAD + 1)
expect_any_instance_of(described_class).to receive(:fetch_project_repository).with(true)
subject.execute
end
it 'tries to redownload repo when force_redownload flag is set' do
create(
:geo_project_registry,
project: project,
repository_retry_count: Geo::BaseSyncService::RETRY_BEFORE_REDOWNLOAD - 1,
force_to_redownload_repository: true
)
expect_any_instance_of(described_class).to receive(:fetch_project_repository).with(true)
subject.execute
end
end
......
......@@ -8,6 +8,7 @@ shared_examples 'geo base sync execution' do
end
it 'executes the synchronization' do
subject.class.type ||= :wiki
expect(subject).to receive(:sync_repository)
subject.execute
......
......@@ -96,7 +96,7 @@ describe Geo::FileDownloadDispatchWorker, :geo, :truncate do
end
context 'with a failed file' do
let!(:failed_registry) { create(:geo_file_registry, :lfs, file_id: 999, success: false) }
let(:failed_registry) { create(:geo_file_registry, :lfs, file_id: 999, success: false) }
it 'does not stall backfill' do
unsynced = create(:lfs_object, :with_file)
......@@ -114,6 +114,22 @@ describe Geo::FileDownloadDispatchWorker, :geo, :truncate do
subject.perform
end
it 'does not retries failed files when retry_at is tomorrow' do
failed_registry = create(:geo_file_registry, :lfs, file_id: 999, success: false, retry_at: Date.tomorrow)
expect(GeoFileDownloadWorker).not_to receive(:perform_async).with('lfs', failed_registry.file_id)
subject.perform
end
it 'does not retries failed files when retry_at is in the past' do
failed_registry = create(:geo_file_registry, :lfs, file_id: 999, success: false, retry_at: Date.yesterday)
expect(GeoFileDownloadWorker).to receive(:perform_async).with('lfs', failed_registry.file_id)
subject.perform
end
end
context 'when node has namespace restrictions' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment