Commit c1b4fc2c authored by Thong Kuah

Merge branch 'concurrent_repo_backup' into 'master'

Add concurrency support for Git repository backups

See merge request gitlab-org/gitlab!37158
parents cf995b42 c2306c34
......@@ -651,6 +651,8 @@ class Project < ApplicationRecord
scope :joins_import_state, -> { joins("INNER JOIN project_mirror_data import_state ON import_state.project_id = projects.id") }
scope :for_group, -> (group) { where(group: group) }
scope :for_group_and_its_subgroups, ->(group) { where(namespace_id: group.self_and_descendants.select(:id)) }
scope :for_repository_storage, -> (repository_storage) { where(repository_storage: repository_storage) }
scope :excluding_repository_storage, -> (repository_storage) { where.not(repository_storage: repository_storage) }
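
These two scopes are consumed by the backup code later in this merge request. As a rough sketch of the intended usage (the storage name is illustrative, not part of the change):

```ruby
# Sketch only: mirrors how Backup::Repository uses the new scopes below.
# 'default' is an assumed storage name from gitlab.yml.

# All projects whose repositories live on one Gitaly storage, in batches.
Project.for_repository_storage('default').find_each(batch_size: 100) do |project|
  # back up this project's repository ...
end

# Is any project's repository_storage missing from gitlab.yml?
if Project.excluding_repository_storage(Gitlab.config.repositories.storages.keys).exists?
  raise Backup::Error, 'repositories.storages in gitlab.yml is misconfigured'
end
```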
class << self
# Searches for a list of projects based on the query given in `query`.
......
---
title: Add concurrency support for Git repository backups
merge_request: 37158
author:
type: changed
......@@ -295,6 +295,30 @@ For installations from source:
sudo -u git -H bundle exec rake gitlab:backup:create SKIP=tar RAILS_ENV=production
```

#### Back up Git repositories concurrently

> [Introduced](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/37158) in GitLab 13.3.

Repositories can be backed up concurrently to help fully utilize CPU time. The following variables
are available to modify the default behavior of the Rake task:

- `GITLAB_BACKUP_MAX_CONCURRENCY` sets the maximum number of projects to back up at the same time
  across all storages. Defaults to `1`.
- `GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY` sets the maximum number of projects to back up at the same
  time on each storage. This allows the repository backups to be spread across storages.
  Defaults to `1`.

For example, for Omnibus GitLab installations:

```shell
sudo gitlab-backup create GITLAB_BACKUP_MAX_CONCURRENCY=4 GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY=1
```

For example, for installations from source:

```shell
sudo -u git -H bundle exec rake gitlab:backup:create GITLAB_BACKUP_MAX_CONCURRENCY=4 GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY=1
```

#### Uploading backups to a remote (cloud) storage

Starting with GitLab 7.4, you can let the backup script upload the `.tar` file it creates.
......
......@@ -10,34 +10,31 @@ module Backup
@progress = progress
end
def dump
def dump(max_concurrency:, max_storage_concurrency:)
prepare
Project.find_each(batch_size: 1000) do |project|
progress.print " * #{display_repo_path(project)} ... "
if project.hashed_storage?(:repository)
FileUtils.mkdir_p(File.dirname(File.join(backup_repos_path, project.disk_path)))
else
FileUtils.mkdir_p(File.join(backup_repos_path, project.namespace.full_path)) if project.namespace
end
if max_concurrency <= 1 && max_storage_concurrency <= 1
return dump_consecutive
end
if !empty_repo?(project)
backup_project(project)
progress.puts "[DONE]".color(:green)
else
progress.puts "[SKIPPED]".color(:cyan)
end
if Project.excluding_repository_storage(Gitlab.config.repositories.storages.keys).exists?
raise Error, 'repositories.storages in gitlab.yml is misconfigured'
end
wiki = ProjectWiki.new(project)
semaphore = Concurrent::Semaphore.new(max_concurrency)
errors = Queue.new
if !empty_repo?(wiki)
backup_project(wiki)
progress.puts "[DONE] Wiki".color(:green)
else
progress.puts "[SKIPPED] Wiki".color(:cyan)
threads = Gitlab.config.repositories.storages.keys.map do |storage|
Thread.new do
dump_storage(storage, semaphore, max_storage_concurrency: max_storage_concurrency)
rescue => e
errors << e
end
end
threads.each(&:join)
raise errors.pop unless errors.empty?
end
def backup_project(project)
......@@ -146,6 +143,71 @@ module Backup
private
def dump_consecutive
Project.find_each(batch_size: 1000) do |project|
dump_project(project)
end
end
def dump_storage(storage, semaphore, max_storage_concurrency:)
errors = Queue.new
queue = SizedQueue.new(1)
threads = Array.new(max_storage_concurrency) do
Thread.new do
while project = queue.pop
semaphore.acquire
begin
dump_project(project)
rescue => e
errors << e
break
ensure
semaphore.release
end
end
end
end
Project.for_repository_storage(storage).find_each(batch_size: 100) do |project|
break unless errors.empty?
queue.push(project)
end
queue.close
threads.each(&:join)
raise errors.pop unless errors.empty?
end
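
For readers unfamiliar with `Concurrent::Semaphore` and `SizedQueue`, here is a minimal, self-contained sketch of the same producer/consumer pattern `dump_storage` uses above. It is not part of the change; the project names and the `sleep` stand in for the real per-project backup work, and the counts are illustrative.

```ruby
# Requires the concurrent-ruby gem (already a GitLab dependency).
require 'concurrent'

max_concurrency = 2          # global cap on simultaneous backups (semaphore permits)
max_storage_concurrency = 3  # worker threads for a single storage

semaphore = Concurrent::Semaphore.new(max_concurrency)
queue = SizedQueue.new(1)    # producer blocks until a worker is free

workers = Array.new(max_storage_concurrency) do
  Thread.new do
    while (item = queue.pop)   # returns nil once the queue is closed and drained
      semaphore.acquire        # respect the global limit shared by all storages
      begin
        puts "backing up #{item}"
        sleep 0.1              # stand-in for the real backup work
      ensure
        semaphore.release
      end
    end
  end
end

10.times { |i| queue.push("project-#{i}") }  # producer: enqueue work
queue.close                                  # signal workers to finish
workers.each(&:join)
```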
def dump_project(project)
progress.puts " * #{display_repo_path(project)} ... "
if project.hashed_storage?(:repository)
FileUtils.mkdir_p(File.dirname(File.join(backup_repos_path, project.disk_path)))
else
FileUtils.mkdir_p(File.join(backup_repos_path, project.namespace.full_path)) if project.namespace
end
if !empty_repo?(project)
backup_project(project)
progress.puts " * #{display_repo_path(project)} ... " + "[DONE]".color(:green)
else
progress.puts " * #{display_repo_path(project)} ... " + "[SKIPPED]".color(:cyan)
end
wiki = ProjectWiki.new(project)
if !empty_repo?(wiki)
backup_project(wiki)
progress.puts " * #{display_repo_path(project)} ... " + "[DONE] Wiki".color(:green)
else
progress.puts " * #{display_repo_path(project)} ... " + "[SKIPPED] Wiki".color(:cyan)
end
end
def progress_warn(project, cmd, output)
progress.puts "[WARNING] Executing #{cmd}".color(:orange)
progress.puts "Ignoring error on #{display_repo_path(project)} - #{output}".color(:orange)
......
......@@ -93,10 +93,19 @@ namespace :gitlab do
task create: :gitlab_environment do
puts_time "Dumping repositories ...".color(:blue)
max_concurrency = ENV.fetch('GITLAB_BACKUP_MAX_CONCURRENCY', 1).to_i
max_storage_concurrency = ENV.fetch('GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY', 1).to_i
if ENV["SKIP"] && ENV["SKIP"].include?("repositories")
puts_time "[SKIPPED]".color(:cyan)
elsif max_concurrency < 1 || max_storage_concurrency < 1
puts "GITLAB_BACKUP_MAX_CONCURRENCY and GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY must have a value of at least 1".color(:red)
exit 1
else
Backup::Repository.new(progress).dump
Backup::Repository.new(progress).dump(
max_concurrency: max_concurrency,
max_storage_concurrency: max_storage_concurrency
)
puts_time "done".color(:green)
end
end
......
......@@ -3,8 +3,9 @@
require 'spec_helper'
RSpec.describe Backup::Repository do
let_it_be(:project) { create(:project, :wiki_repo) }
let(:progress) { StringIO.new }
let!(:project) { create(:project, :wiki_repo) }
subject { described_class.new(progress) }
......@@ -19,13 +20,88 @@ RSpec.describe Backup::Repository do
end
describe '#dump' do
describe 'repo failure' do
before do
allow(Gitlab::Popen).to receive(:popen).and_return(['normal output', 0])
before do
allow(Gitlab.config.repositories.storages).to receive(:keys).and_return(storage_keys)
end
let_it_be(:projects) { create_list(:project, 5, :wiki_repo) + [project] }
let(:storage_keys) { %w[default test_second_storage] }
context 'no concurrency' do
it 'creates the expected number of threads' do
expect(Thread).not_to receive(:new)
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 1, max_storage_concurrency: 1)
end
it 'does not raise error' do
expect { subject.dump }.not_to raise_error
describe 'command failure' do
it 'dump_project raises an error' do
allow(subject).to receive(:dump_project).and_raise(IOError)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: 1) }.to raise_error(IOError)
end
it 'project query raises an error' do
allow(Project).to receive(:find_each).and_raise(ActiveRecord::StatementTimeout)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: 1) }.to raise_error(ActiveRecord::StatementTimeout)
end
end
end
[4, 10].each do |max_storage_concurrency|
context "max_storage_concurrency #{max_storage_concurrency}" do
it 'creates the expected number of threads' do
expect(Thread).to receive(:new)
.exactly(storage_keys.length * (max_storage_concurrency + 1)).times
.and_call_original
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency)
end
it 'creates the expected number of threads with extra max concurrency' do
expect(Thread).to receive(:new)
.exactly(storage_keys.length * (max_storage_concurrency + 1)).times
.and_call_original
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 3, max_storage_concurrency: max_storage_concurrency)
end
describe 'command failure' do
it 'dump_project raises an error' do
allow(subject).to receive(:dump_project)
.and_raise(IOError)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(IOError)
end
it 'project query raises an error' do
allow(Project).to receive_message_chain('for_repository_storage.find_each').and_raise(ActiveRecord::StatementTimeout)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(ActiveRecord::StatementTimeout)
end
context 'misconfigured storages' do
let(:storage_keys) { %w[test_second_storage] }
it 'raises an error' do
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(Backup::Error, 'repositories.storages in gitlab.yml is misconfigured')
end
end
end
end
end
end
......
......@@ -5552,6 +5552,32 @@ RSpec.describe Project do
end
end
describe '.for_repository_storage' do
it 'returns the projects for a given repository storage' do
stub_storage_settings('test_second_storage' => {
'path' => TestEnv::SECOND_STORAGE_PATH,
'gitaly_address' => Gitlab.config.repositories.storages.default.gitaly_address
})
expected_project = create(:project, repository_storage: 'default')
create(:project, repository_storage: 'test_second_storage')
expect(described_class.for_repository_storage('default')).to eq([expected_project])
end
end
describe '.excluding_repository_storage' do
it 'returns the projects excluding the given repository storage' do
stub_storage_settings('test_second_storage' => {
'path' => TestEnv::SECOND_STORAGE_PATH,
'gitaly_address' => Gitlab.config.repositories.storages.default.gitaly_address
})
expected_project = create(:project, repository_storage: 'test_second_storage')
create(:project, repository_storage: 'default')
expect(described_class.excluding_repository_storage('default')).to eq([expected_project])
end
end
describe '.deployments' do
subject { project.deployments }
......
......@@ -283,20 +283,7 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
end
context 'multiple repository storages' do
let(:test_second_storage) do
Gitlab::GitalyClient::StorageSettings.new(@default_storage_hash.merge('path' => 'tmp/tests/custom_storage'))
end
let(:storages) do
{
'default' => Gitlab.config.repositories.storages.default,
'test_second_storage' => test_second_storage
}
end
before(:all) do
@default_storage_hash = Gitlab.config.repositories.storages.default.to_h
end
let_it_be(:default_storage_hash) { Gitlab.config.repositories.storages.default.to_h }
before do
# We only need a backup of the repositories for this test
......@@ -307,17 +294,6 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
# Avoid asking gitaly about the root ref (which will fail because of the
# mocked storages)
allow_any_instance_of(Repository).to receive(:empty?).and_return(false)
end
after do
FileUtils.rm_rf(Settings.absolute('tmp/tests/custom_storage'))
end
it 'includes repositories in all repository storages' do
project_a = create(:project, :repository)
project_b = create(:project, :repository, repository_storage: 'test_second_storage')
b_storage_dir = File.join(Settings.absolute('tmp/tests/custom_storage'), File.dirname(project_b.disk_path))
FileUtils.mkdir_p(b_storage_dir)
......@@ -328,16 +304,91 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
Rails.root.join(storages['test_second_storage'].legacy_disk_path, project_b.repository.disk_path + '.git')
)
end
end
after do
FileUtils.rm_rf(test_second_storage_dir)
end
let(:test_second_storage_dir) { Dir.mktmpdir }
let(:test_second_storage) do
Gitlab::GitalyClient::StorageSettings.new(default_storage_hash.merge('path' => test_second_storage_dir))
end
let(:storages) do
{
'default' => Gitlab.config.repositories.storages.default,
'test_second_storage' => test_second_storage
}
end
let!(:project_a) { create(:project, :repository) }
let!(:project_b) { create(:project, :repository, repository_storage: 'test_second_storage') }
let!(:b_storage_dir) { File.join(test_second_storage_dir, File.dirname(project_b.disk_path)) }
context 'no concurrency' do
it 'includes repositories in all repository storages' do
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
tar_contents, exit_status = Gitlab::Popen.popen(
%W{tar -tvf #{backup_tar} repositories}
)
expect(exit_status).to eq(0)
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle")
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle")
end
end
context 'with concurrency' do
before do
stub_env('GITLAB_BACKUP_MAX_CONCURRENCY', 4)
end
it 'includes repositories in all repository storages' do
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
tar_contents, exit_status = Gitlab::Popen.popen(
%W{tar -tvf #{backup_tar} repositories}
)
expect(exit_status).to eq(0)
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle")
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle")
end
end
end
context 'concurrency settings' do
before do
# We only need a backup of the repositories for this test
stub_env('SKIP', 'db,uploads,builds,artifacts,lfs,registry')
create(:project, :repository)
end
it 'has defaults' do
expect_next_instance_of(::Backup::Repository) do |instance|
expect(instance).to receive(:dump)
.with(max_concurrency: 1, max_storage_concurrency: 1)
.and_call_original
end
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
end
tar_contents, exit_status = Gitlab::Popen.popen(
%W{tar -tvf #{backup_tar} repositories}
)
it 'passes through concurrency environment variables' do
stub_env('GITLAB_BACKUP_MAX_CONCURRENCY', 5)
stub_env('GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY', 2)
expect(exit_status).to eq(0)
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle")
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle")
expect_next_instance_of(::Backup::Repository) do |instance|
expect(instance).to receive(:dump)
.with(max_concurrency: 5, max_storage_concurrency: 2)
.and_call_original
end
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
end
end
end # backup_create task
......