Commit 5bf65c93 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'bvl-nfs-circuitbreaker' into 'master'

Circuitbreaker for storage paths

Closes #32207, #33117, gitlab-com/infrastructure#1946, and gitlab-com/infrastructure#1775

See merge request !11449
parents fa716921 fda83a61
class Admin::HealthCheckController < Admin::ApplicationController class Admin::HealthCheckController < Admin::ApplicationController
def show def show
@errors = HealthCheck::Utils.process_checks(['standard']) @errors = HealthCheck::Utils.process_checks(['standard'])
@failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages
end
def reset_storage_health
Gitlab::Git::Storage::CircuitBreaker.reset_all!
redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset')
end end
end end
...@@ -52,6 +52,15 @@ class ApplicationController < ActionController::Base ...@@ -52,6 +52,15 @@ class ApplicationController < ActionController::Base
head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window
end end
rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
Raven.capture_exception(exception) if sentry_enabled?
log_exception(exception)
headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after)
render_503
end
def redirect_back_or_default(default: root_path, options: {}) def redirect_back_or_default(default: root_path, options: {})
redirect_to request.referer.present? ? :back : default, options redirect_to request.referer.present? ? :back : default, options
end end
...@@ -152,6 +161,19 @@ class ApplicationController < ActionController::Base ...@@ -152,6 +161,19 @@ class ApplicationController < ActionController::Base
head :unprocessable_entity head :unprocessable_entity
end end
def render_503
respond_to do |format|
format.html do
render(
file: Rails.root.join("public", "503"),
layout: false,
status: :service_unavailable
)
end
format.any { head :service_unavailable }
end
end
def no_cache_headers def no_cache_headers
response.headers["Cache-Control"] = "no-cache, no-store, max-age=0, must-revalidate" response.headers["Cache-Control"] = "no-cache, no-store, max-age=0, must-revalidate"
response.headers["Pragma"] = "no-cache" response.headers["Pragma"] = "no-cache"
......
module StorageHealthHelper
def failing_storage_health_message(storage_health)
storage_name = content_tag(:strong, h(storage_health.storage_name))
host_names = h(storage_health.failing_on_hosts.to_sentence)
translation_params = { storage_name: storage_name,
host_names: host_names,
failed_attempts: storage_health.total_failures }
translation = n_('%{storage_name}: failed storage access attempt on host:',
'%{storage_name}: %{failed_attempts} failed storage access attempts:',
storage_health.total_failures) % translation_params
translation.html_safe
end
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
if permanently_broken
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
elsif circuit_breaker.circuit_broken?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params
end
end
end
...@@ -133,12 +133,13 @@ class Repository ...@@ -133,12 +133,13 @@ class Repository
ref ||= root_ref ref ||= root_ref
args = %W( args = %W(
#{Gitlab.config.git.bin_path} log #{ref} --pretty=%H --skip #{offset} log #{ref} --pretty=%H --skip #{offset}
--max-count #{limit} --grep=#{query} --regexp-ignore-case --max-count #{limit} --grep=#{query} --regexp-ignore-case
) )
args = args.concat(%W(-- #{path})) if path.present? args = args.concat(%W(-- #{path})) if path.present?
git_log_results = Gitlab::Popen.popen(args, path_to_repo).first.lines git_log_results = run_git(args).first.lines
git_log_results.map { |c| commit(c.chomp) }.compact git_log_results.map { |c| commit(c.chomp) }.compact
end end
...@@ -687,8 +688,8 @@ class Repository ...@@ -687,8 +688,8 @@ class Repository
end end
def refs_contains_sha(ref_type, sha) def refs_contains_sha(ref_type, sha)
args = %W(#{Gitlab.config.git.bin_path} #{ref_type} --contains #{sha}) args = %W(#{ref_type} --contains #{sha})
names = Gitlab::Popen.popen(args, path_to_repo).first names = run_git(args).first
if names.respond_to?(:split) if names.respond_to?(:split)
names = names.split("\n").map(&:strip) names = names.split("\n").map(&:strip)
...@@ -966,15 +967,17 @@ class Repository ...@@ -966,15 +967,17 @@ class Repository
return [] if empty_repo? || query.blank? return [] if empty_repo? || query.blank?
offset = 2 offset = 2
args = %W(#{Gitlab.config.git.bin_path} grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref}) args = %W(grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref})
Gitlab::Popen.popen(args, path_to_repo).first.scrub.split(/^--$/)
run_git(args).first.scrub.split(/^--$/)
end end
def search_files_by_name(query, ref) def search_files_by_name(query, ref)
return [] if empty_repo? || query.blank? return [] if empty_repo? || query.blank?
args = %W(#{Gitlab.config.git.bin_path} ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)}) args = %W(ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)})
Gitlab::Popen.popen(args, path_to_repo).first.lines.map(&:strip)
run_git(args).first.lines.map(&:strip)
end end
def with_repo_branch_commit(start_repository, start_branch_name) def with_repo_branch_commit(start_repository, start_branch_name)
...@@ -1019,8 +1022,8 @@ class Repository ...@@ -1019,8 +1022,8 @@ class Repository
end end
def fetch_ref(source_path, source_ref, target_ref) def fetch_ref(source_path, source_ref, target_ref)
args = %W(#{Gitlab.config.git.bin_path} fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref}) args = %W(fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref})
Gitlab::Popen.popen(args, path_to_repo) run_git(args)
end end
def create_ref(ref, ref_path) def create_ref(ref, ref_path)
...@@ -1101,6 +1104,12 @@ class Repository ...@@ -1101,6 +1104,12 @@ class Repository
private private
def run_git(args)
circuit_breaker.perform do
Gitlab::Popen.popen([Gitlab.config.git.bin_path, *args], path_to_repo)
end
end
def blob_data_at(sha, path) def blob_data_at(sha, path)
blob = blob_at(sha, path) blob = blob_at(sha, path)
return unless blob return unless blob
...@@ -1110,7 +1119,9 @@ class Repository ...@@ -1110,7 +1119,9 @@ class Repository
end end
def refs_directory_exists? def refs_directory_exists?
File.exist?(File.join(path_to_repo, 'refs')) circuit_breaker.perform do
File.exist?(File.join(path_to_repo, 'refs'))
end
end end
def cache def cache
...@@ -1158,8 +1169,8 @@ class Repository ...@@ -1158,8 +1169,8 @@ class Repository
end end
def last_commit_id_for_path_by_shelling_out(sha, path) def last_commit_id_for_path_by_shelling_out(sha, path)
args = %W(#{Gitlab.config.git.bin_path} rev-list --max-count=1 #{sha} -- #{path}) args = %W(rev-list --max-count=1 #{sha} -- #{path})
Gitlab::Popen.popen(args, path_to_repo).first.strip run_git(args).first.strip
end end
def repository_storage_path def repository_storage_path
...@@ -1169,4 +1180,8 @@ class Repository ...@@ -1169,4 +1180,8 @@ class Repository
def initialize_raw_repository def initialize_raw_repository
Gitlab::Git::Repository.new(project.repository_storage, disk_path + '.git') Gitlab::Git::Repository.new(project.repository_storage, disk_path + '.git')
end end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(project.repository_storage)
end
end end
- if failing_storages.any?
= _('There are problems accessing Git storage: ')
%ul
- failing_storages.each do |storage_health|
%li
= failing_storage_health_message(storage_health)
%ul
- storage_health.failing_circuit_breakers.each do |circuit_breaker|
%li
#{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)}
= _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.")
.prepend-top-10
= button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path,
method: :post, class: 'btn btn-default'
- @no_container = true - @no_container = true
- page_title "Health Check" - page_title _('Health Check')
- no_errors = @errors.blank? && @failing_storage_statuses.blank?
= render 'admin/monitoring/head' = render 'admin/monitoring/head'
%div{ class: container_class } %div{ class: container_class }
%h3.page-title %h3.page-title= page_title
Health Check
.bs-callout.clearfix .bs-callout.clearfix
.pull-left .pull-left
%p %p
Access token is #{ s_('HealthCheck|Access token is') }
%code#health-check-token= current_application_settings.health_check_access_token %code#health-check-token= current_application_settings.health_check_access_token
.prepend-top-10 .prepend-top-10
= button_to "Reset health check access token", reset_health_check_token_admin_application_settings_path, = button_to _("Reset health check access token"), reset_health_check_token_admin_application_settings_path,
method: :put, class: 'btn btn-default', method: :put, class: 'btn btn-default',
data: { confirm: 'Are you sure you want to reset the health check token?' } data: { confirm: _('Are you sure you want to reset the health check token?') }
%p.light %p.light
Health information can be retrieved from the following endpoints. More information is available #{ _('Health information can be retrieved from the following endpoints. More information is available') }
= link_to 'here', help_page_path('user/admin_area/monitoring/health_check') = link_to s_('More information is available|here'), help_page_path('user/admin_area/monitoring/health_check')
%ul %ul
%li %li
%code= readiness_url(token: current_application_settings.health_check_access_token) %code= readiness_url(token: current_application_settings.health_check_access_token)
...@@ -29,14 +29,15 @@ ...@@ -29,14 +29,15 @@
.panel.panel-default .panel.panel-default
.panel-heading .panel-heading
Current Status: Current Status:
- if @errors.blank? - if no_errors
= icon('circle', class: 'cgreen') = icon('circle', class: 'cgreen')
Healthy #{ s_('HealthCheck|Healthy') }
- else - else
= icon('warning', class: 'cred') = icon('warning', class: 'cred')
Unhealthy #{ s_('HealthCheck|Unhealthy') }
.panel-body .panel-body
- if @errors.blank? - if no_errors
No Health Problems Detected #{ s_('HealthCheck|No Health Problems Detected') }
- else - else
= @errors = @errors
= render partial: 'failing_storages', object: @failing_storage_statuses
---
title: Block access to failing repository storage
merge_request: 11449
author:
...@@ -506,6 +506,11 @@ production: &base ...@@ -506,6 +506,11 @@ production: &base
path: /home/git/repositories/ path: /home/git/repositories/
gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port) gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port)
# gitaly_token: 'special token' # Optional: override global gitaly.token for this storage. # gitaly_token: 'special token' # Optional: override global gitaly.token for this storage.
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after an access failure before allowing access again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
## Backup settings ## Backup settings
backup: backup:
...@@ -638,6 +643,10 @@ test: ...@@ -638,6 +643,10 @@ test:
default: default:
path: tmp/tests/repositories/ path: tmp/tests/repositories/
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
broken:
path: tmp/tests/non-existent-repositories
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
gitaly: gitaly:
enabled: true enabled: true
token: secret token: secret
......
...@@ -433,6 +433,17 @@ end ...@@ -433,6 +433,17 @@ end
Settings.repositories.storages.values.each do |storage| Settings.repositories.storages.values.each do |storage|
# Expand relative paths # Expand relative paths
storage['path'] = Settings.absolute(storage['path']) storage['path'] = Settings.absolute(storage['path'])
# Set failure defaults
storage['failure_count_threshold'] ||= 10
storage['failure_wait_time'] ||= 30
storage['failure_reset_time'] ||= 1800
storage['storage_timeout'] ||= 5
# Set turn strings into numbers
storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i
storage['failure_wait_time'] = storage['failure_wait_time'].to_i
storage['failure_reset_time'] = storage['failure_reset_time'].to_i
# We might want to have a timeout shorter than 1 second.
storage['storage_timeout'] = storage['storage_timeout'].to_f
end end
# #
......
...@@ -7,6 +7,13 @@ def find_parent_path(name, path) ...@@ -7,6 +7,13 @@ def find_parent_path(name, path)
Gitlab.config.repositories.storages.detect do |n, rs| Gitlab.config.repositories.storages.detect do |n, rs|
name != n && Pathname.new(rs['path']).realpath == parent name != n && Pathname.new(rs['path']).realpath == parent
end end
rescue Errno::EIO, Errno::ENOENT => e
warning = "WARNING: couldn't verify #{path} (#{name}). "\
"If this is an external storage, it might be offline."
message = "#{warning}\n#{e.message}"
Rails.logger.error("#{message}\n\t" + e.backtrace.join("\n\t"))
nil
end end
def storage_validation_error(message) def storage_validation_error(message)
...@@ -29,6 +36,15 @@ def validate_storages_config ...@@ -29,6 +36,15 @@ def validate_storages_config
if !repository_storage.is_a?(Hash) || repository_storage['path'].nil? if !repository_storage.is_a?(Hash) || repository_storage['path'].nil?
storage_validation_error("#{name} is not a valid storage, because it has no `path` key. Refer to gitlab.yml.example for an updated example") storage_validation_error("#{name} is not a valid storage, because it has no `path` key. Refer to gitlab.yml.example for an updated example")
end end
%w(failure_count_threshold failure_wait_time failure_reset_time storage_timeout).each do |setting|
# Falling back to the defaults is fine!
next if repository_storage[setting].nil?
unless repository_storage[setting].to_f > 0
storage_validation_error("#{setting}, for storage `#{name}` needs to be greater than 0")
end
end
end end
end end
......
...@@ -67,7 +67,9 @@ namespace :admin do ...@@ -67,7 +67,9 @@ namespace :admin do
end end
resource :logs, only: [:show] resource :logs, only: [:show]
resource :health_check, controller: 'health_check', only: [:show] resource :health_check, controller: 'health_check', only: [:show] do
post :reset_storage_health
end
resource :background_jobs, controller: 'background_jobs', only: [:show] resource :background_jobs, controller: 'background_jobs', only: [:show]
resource :system_info, controller: 'system_info', only: [:show] resource :system_info, controller: 'system_info', only: [:show]
resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ } resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ }
......
...@@ -60,7 +60,7 @@ respectively. ...@@ -60,7 +60,7 @@ respectively.
path: /mnt/cephfs/repositories path: /mnt/cephfs/repositories
``` ```
1. [Restart GitLab] for the changes to take effect. 1. [Restart GitLab][restart-gitlab] for the changes to take effect.
>**Note:** >**Note:**
The [`gitlab_shell: repos_path` entry][repospath] in `gitlab.yml` will be The [`gitlab_shell: repos_path` entry][repospath] in `gitlab.yml` will be
...@@ -97,9 +97,80 @@ be stored via the **Application Settings** in the Admin area. ...@@ -97,9 +97,80 @@ be stored via the **Application Settings** in the Admin area.
Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be
randomly placed on one of the selected paths. randomly placed on one of the selected paths.
## Handling failing repository storage
> [Introduced][ce-11449] in GitLab 9.5.
When GitLab detects access to the repositories storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
The configuration could look as follows:
**For Omnibus installations**
1. Edit `/etc/gitlab/gitlab.rb`:
```ruby
git_data_dirs({
"default" => {
"path" => "/mnt/nfs-01/git-data",
"failure_count_threshold" => 10,
"failure_wait_time" => 30,
"failure_reset_time" => 1800,
"storage_timeout" => 5
}
})
```
1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect.
---
**For installations from source**
1. Edit `config/gitlab.yml`:
```yaml
repositories:
storages: # You must have at least a `default` storage path.
default:
path: /home/git/repositories/
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after last access failure before trying again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
```
1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect.
**`failure_count_threshold`:** The number of failures of after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
the admin interface: `https://gitlab.example.com/admin/health_check` or using the
[api](../api/repository_storage_health.md) to allow access to the storage again.
**`failure_wait_time`:** When access to a storage fails. GitLab will prevent
access to the storage for the time specified here. This allows the filesystem to
recover without.
**`failure_reset_time`:** The time in seconds GitLab will keep failure
information. When no failures occur during this time, information about the
mount is reset.
**`storage_timeout`:** The time in seconds GitLab will try to access storage.
After this time a timeout error will be raised.
When storage failures occur, this will be visible in the admin interface like this:
![failing storage](img/failing_storage.png)
To allow access to all storages, click the `Reset git storage health information` button.
[ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578 [ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578
[restart gitlab]: restart_gitlab.md#installations-from-source [restart-gitlab]: restart_gitlab.md#installations-from-source
[reconfigure gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure [reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
[backups]: ../raketasks/backup_restore.md [backups]: ../raketasks/backup_restore.md
[raketask]: https://gitlab.com/gitlab-org/gitlab-ce/blob/033e5423a2594e08a7ebcd2379bd2331f4c39032/lib/backup/repository.rb#L54-56 [raketask]: https://gitlab.com/gitlab-org/gitlab-ce/blob/033e5423a2594e08a7ebcd2379bd2331f4c39032/lib/backup/repository.rb#L54-56
[repospath]: https://gitlab.com/gitlab-org/gitlab-ce/blob/8-9-stable/config/gitlab.yml.example#L457 [repospath]: https://gitlab.com/gitlab-org/gitlab-ce/blob/8-9-stable/config/gitlab.yml.example#L457
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
# Circuitbreaker API
> [Introduced][ce-11449] in GitLab 9.5.
The Circuitbreaker API is only accessible to administrators. All requests by
guests will respond with `401 Unauthorized`, and all requests by normal users
will respond with `403 Forbidden`.
## Repository Storages
### Get all storage information
Returns of all currently configured storages and their health information.
```
GET /circuit_breakers/repository_storage
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
```json
[
{
"storage_name": "default",
"failing_on_hosts": [],
"total_failures": 0
},
{
"storage_name": "broken",
"failing_on_hosts": [
"web01", "worker01"
],
"total_failures": 1
}
]
```
### Get failing storages
This returns a list of all currently failing storages.
```
GET /circuit_breakers/repository_storage/failing
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing
```
```json
[
{
"storage_name":"broken",
"failing_on_hosts":["web01", "worker01"],
"total_failures":2
}
]
```
## Reset failing storage information
Use this remove all failing storage information and allow access to the storage again.
```
DELETE /circuit_breakers/repository_storage
```
```bash
curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
...@@ -95,6 +95,7 @@ module API ...@@ -95,6 +95,7 @@ module API
mount ::API::Boards mount ::API::Boards
mount ::API::Branches mount ::API::Branches
mount ::API::BroadcastMessages mount ::API::BroadcastMessages
mount ::API::CircuitBreakers
mount ::API::Commits mount ::API::Commits
mount ::API::CommitStatuses mount ::API::CommitStatuses
mount ::API::DeployKeys mount ::API::DeployKeys
......
module API
class CircuitBreakers < Grape::API
before { authenticated_as_admin! }
resource :circuit_breakers do
params do
requires :type,
type: String,
desc: "The type of circuitbreaker",
values: ['repository_storage']
end
resource ':type' do
namespace '', requirements: { type: 'repository_storage' } do
helpers do
def failing_storage_health
@failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
end
def storage_health
@failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
end
end
desc 'Get all failing git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get do
present storage_health, with: Entities::RepositoryStorageHealth
end
desc 'Get all failing git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get 'failing' do
present failing_storage_health, with: Entities::RepositoryStorageHealth
end
desc 'Reset all storage failures and open circuitbreaker' do
detail 'This feature was introduced in GitLab 9.5'
end
delete do
Gitlab::Git::Storage::CircuitBreaker.reset_all!
end
end
end
end
end
end
...@@ -951,5 +951,11 @@ module API ...@@ -951,5 +951,11 @@ module API
expose :ip_address expose :ip_address
expose :submitted, as: :akismet_submitted expose :submitted, as: :akismet_submitted
end end
class RepositoryStorageHealth < Grape::Entity
expose :storage_name
expose :failing_on_hosts
expose :total_failures
end
end end
end end
module Gitlab
module Environment
def self.hostname
@hostname ||= ENV['HOSTNAME'] || Socket.gethostname
end
end
end
...@@ -59,11 +59,17 @@ module Gitlab ...@@ -59,11 +59,17 @@ module Gitlab
end end
def rugged def rugged
@rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories) @rugged ||= circuit_breaker.perform do
Rugged::Repository.new(path, alternates: alternate_object_directories)
end
rescue Rugged::RepositoryError, Rugged::OSError rescue Rugged::RepositoryError, Rugged::OSError
raise NoRepository.new('no repository for such path') raise NoRepository.new('no repository for such path')
end end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
end
# Returns an Array of branch names # Returns an Array of branch names
# sorted by name ASC # sorted by name ASC
def branch_names def branch_names
......
module Gitlab
module Git
module Storage
class Inaccessible < StandardError
attr_reader :retry_after
def initialize(message = nil, retry_after = nil)
super(message)
@retry_after = retry_after
end
end
CircuitOpen = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
def self.redis
Gitlab::Redis::SharedState
end
end
end
end
module Gitlab
module Git
module Storage
class CircuitBreaker
FailureInfo = Struct.new(:last_failure, :failure_count)
attr_reader :storage,
:hostname,
:storage_path,
:failure_count_threshold,
:failure_wait_time,
:failure_reset_time,
:storage_timeout
delegate :last_failure, :failure_count, to: :failure_info
def self.reset_all!
pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*"
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.keys(pattern)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
RequestStore.delete(:circuitbreaker_cache)
end
def self.for_storage(storage)
cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
Hash.new do |hash, storage_name|
hash[storage_name] = new(storage_name)
end
end
cached_circuitbreakers[storage]
end
def initialize(storage, hostname = Gitlab::Environment.hostname)
@storage = storage
@hostname = hostname
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
@failure_count_threshold = config['failure_count_threshold']
@failure_wait_time = config['failure_wait_time']
@failure_reset_time = config['failure_reset_time']
@storage_timeout = config['storage_timeout']
end
def perform
return yield unless Feature.enabled?('git_storage_circuit_breaker')
check_storage_accessible!
yield
end
def circuit_broken?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > failure_count_threshold
recent_failure || too_many_failures
end
# Memoizing the `storage_available` call means we only do it once per
# request when the storage is available.
#
# When the storage appears not available, and the memoized value is `false`
# we might want to try again.
def storage_available?
return @storage_available if @storage_available
if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
.storage_available?(storage_path, storage_timeout)
track_storage_accessible
else
track_storage_inaccessible
end
@storage_available
end
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
end
unless storage_available?
raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
end
end
def no_failures?
last_failure.blank? && failure_count == 0
end
def track_storage_inaccessible
@failure_info = FailureInfo.new(Time.now, failure_count + 1)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
end
end
end
def track_storage_accessible
return if no_failures?
@failure_info = FailureInfo.new(nil, 0)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
end
end
end
def failure_info
@failure_info ||= get_failure_info
end
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end
end
end
end
module Gitlab
module Git
module Storage
module ForkedStorageCheck
extend self
def storage_available?(path, timeout_seconds = 5)
status = timeout_check(path, timeout_seconds)
status.success?
end
def timeout_check(path, timeout_seconds)
filesystem_check_pid = check_filesystem_in_process(path)
deadline = timeout_seconds.seconds.from_now.utc
wait_time = 0.01
status = nil
while status.nil?
if deadline > Time.now.utc
sleep(wait_time)
_pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG)
else
Process.kill('KILL', filesystem_check_pid)
# Blocking wait, so we are sure the process is gone before continuing
_pid, status = Process.wait2(filesystem_check_pid)
end
end
status
end
# This will spawn a new 2 processes to do the check:
# The outer child (waiter) will spawn another child process (stater).
#
# The stater is the process is performing the actual filesystem check
# the check might hang if the filesystem is acting up.
# In this case we will send a `KILL` to the waiter, which will still
# be responsive while the stater is hanging.
def check_filesystem_in_process(path)
spawn('ruby', '-e', ruby_check, path, [:out, :err] => '/dev/null')
end
def ruby_check
<<~RUBY_FILESYSTEM_CHECK
inner_pid = fork { File.stat(ARGV.first) }
Process.waitpid(inner_pid)
exit $?.exitstatus
RUBY_FILESYSTEM_CHECK
end
end
end
end
end
module Gitlab
module Git
module Storage
class Health
attr_reader :storage_name, :info
def self.pattern_for_storage(storage_name)
"#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*"
end
def self.for_all_storages
storage_names = Gitlab.config.repositories.storages.keys
results_per_storage = nil
Gitlab::Git::Storage.redis.with do |redis|
keys_per_storage = all_keys_for_storages(storage_names, redis)
results_per_storage = load_for_keys(keys_per_storage, redis)
end
results_per_storage.map do |name, info|
info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
new(name, info)
end
end
def self.all_keys_for_storages(storage_names, redis)
keys_per_storage = {}
redis.pipelined do
storage_names.each do |storage_name|
pattern = pattern_for_storage(storage_name)
keys_per_storage[storage_name] = redis.keys(pattern)
end
end
keys_per_storage
end
def self.load_for_keys(keys_per_storage, redis)
info_for_keys = {}
redis.pipelined do
keys_per_storage.each do |storage_name, keys_future|
info_for_storage = keys_future.value.map do |key|
{ name: key, failure_count: redis.hget(key, :failure_count) }
end
info_for_keys[storage_name] = info_for_storage
end
end
info_for_keys
end
def self.for_failing_storages
for_all_storages.select(&:failing?)
end
def initialize(storage_name, info)
@storage_name = storage_name
@info = info
end
def failing_info
@failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 }
end
def failing?
failing_info.any?
end
def failing_on_hosts
@failing_on_hosts ||= failing_info.map do |info_for_host|
info_for_host[:name].split(':').last
end
end
def failing_circuit_breakers
@failing_circuit_breakers ||= failing_on_hosts.map do |hostname|
CircuitBreaker.new(storage_name, hostname)
end
end
def total_failures
@total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] }
end
end
end
end
end
...@@ -10,7 +10,9 @@ module Gitlab ...@@ -10,7 +10,9 @@ module Gitlab
def readiness def readiness
repository_storages.map do |storage_name| repository_storages.map do |storage_name|
begin begin
if !storage_stat_test(storage_name) if !storage_circuitbreaker_test(storage_name)
HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
elsif !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
else else
with_temp_file(storage_name) do |tmp_file_path| with_temp_file(storage_name) do |tmp_file_path|
...@@ -36,7 +38,8 @@ module Gitlab ...@@ -36,7 +38,8 @@ module Gitlab
[ [
storage_stat_metrics(storage_name), storage_stat_metrics(storage_name),
storage_write_metrics(storage_name), storage_write_metrics(storage_name),
storage_read_metrics(storage_name) storage_read_metrics(storage_name),
storage_circuitbreaker_metrics(storage_name)
].flatten ].flatten
end end
end end
...@@ -121,6 +124,12 @@ module Gitlab ...@@ -121,6 +124,12 @@ module Gitlab
file_contents == RANDOM_STRING file_contents == RANDOM_STRING
end end
def storage_circuitbreaker_test(storage_name)
Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" }
rescue Gitlab::Git::Storage::Inaccessible
nil
end
def storage_stat_metrics(storage_name) def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) } with_timing { storage_stat_test(storage_name) }
...@@ -143,6 +152,14 @@ module Gitlab ...@@ -143,6 +152,14 @@ module Gitlab
end end
end end
end end
def storage_circuitbreaker_metrics(storage_name)
operation_metrics(:filesystem_circuitbreaker,
:filesystem_circuitbreaker_latency_seconds,
shard: storage_name) do
with_timing { storage_circuitbreaker_test(storage_name) }
end
end
end end
end end
end end
......
...@@ -5,7 +5,7 @@ module Gitlab ...@@ -5,7 +5,7 @@ module Gitlab
include QueryAdditionalMetrics include QueryAdditionalMetrics
def query(environment_id) def query(environment_id)
Environment.find_by(id: environment_id).try do |environment| ::Environment.find_by(id: environment_id).try do |environment|
query_metrics( query_metrics(
common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f) common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f)
) )
......
...@@ -3,7 +3,7 @@ module Gitlab ...@@ -3,7 +3,7 @@ module Gitlab
module Queries module Queries
class EnvironmentQuery < BaseQuery class EnvironmentQuery < BaseQuery
def query(environment_id) def query(environment_id)
Environment.find_by(id: environment_id).try do |environment| ::Environment.find_by(id: environment_id).try do |environment|
environment_slug = environment.slug environment_slug = environment.slug
timeframe_start = 8.hours.ago.to_f timeframe_start = 8.hours.ago.to_f
timeframe_end = Time.now.to_f timeframe_end = Time.now.to_f
......
...@@ -27,8 +27,8 @@ module Gitlab ...@@ -27,8 +27,8 @@ module Gitlab
ci_pipeline_schedules: ::Ci::PipelineSchedule.count, ci_pipeline_schedules: ::Ci::PipelineSchedule.count,
deploy_keys: DeployKey.count, deploy_keys: DeployKey.count,
deployments: Deployment.count, deployments: Deployment.count,
environments: Environment.count, environments: ::Environment.count,
in_review_folder: Environment.in_review_folder.count, in_review_folder: ::Environment.in_review_folder.count,
groups: Group.count, groups: Group.count,
issues: Issue.count, issues: Issue.count,
keys: Key.count, keys: Key.count,
......
require 'spec_helper'
describe Admin::HealthCheckController, broken_storage: true do
let(:admin) { create(:admin) }
before do
sign_in(admin)
end
describe 'GET show' do
it 'loads the git storage health information' do
get :show
expect(assigns[:failing_storage_statuses]).not_to be_nil
end
end
describe 'POST reset_storage_health' do
it 'resets all storage health information' do
expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
post :reset_storage_health
end
end
end
...@@ -108,6 +108,30 @@ describe ApplicationController do ...@@ -108,6 +108,30 @@ describe ApplicationController do
end end
end end
describe 'rescue from Gitlab::Git::Storage::Inaccessible' do
controller(described_class) do
def index
raise Gitlab::Git::Storage::Inaccessible.new('broken', 100)
end
end
it 'renders a 503 when storage is not available' do
sign_in(create(:user))
get :index
expect(response.status).to eq(503)
end
it 'renders includes a Retry-After header' do
sign_in(create(:user))
get :index
expect(response.headers['Retry-After']).to eq(100)
end
end
describe 'response format' do describe 'response format' do
controller(described_class) do controller(described_class) do
def index def index
......
...@@ -107,6 +107,20 @@ describe ProjectsController do ...@@ -107,6 +107,20 @@ describe ProjectsController do
end end
end end
context 'when the storage is not available', broken_storage: true do
let(:project) { create(:project, :broken_storage) }
before do
project.add_developer(user)
sign_in(user)
end
it 'renders a 503' do
get :show, namespace_id: project.namespace, id: project
expect(response).to have_http_status(503)
end
end
context "project with empty repo" do context "project with empty repo" do
let(:empty_project) { create(:project_empty_repo, :public) } let(:empty_project) { create(:project_empty_repo, :public) }
......
...@@ -54,6 +54,12 @@ FactoryGirl.define do ...@@ -54,6 +54,12 @@ FactoryGirl.define do
avatar { File.open(Rails.root.join('spec/fixtures/dk.png')) } avatar { File.open(Rails.root.join('spec/fixtures/dk.png')) }
end end
trait :broken_storage do
after(:create) do |project|
project.update_column(:repository_storage, 'broken')
end
end
# Test repository - https://gitlab.com/gitlab-org/gitlab-test # Test repository - https://gitlab.com/gitlab-org/gitlab-test
trait :repository do trait :repository do
path { 'gitlabhq' } path { 'gitlabhq' }
......
require 'spec_helper' require 'spec_helper'
feature "Admin Health Check" do feature "Admin Health Check", feature: true, broken_storage: true do
include StubENV include StubENV
before do before do
...@@ -55,4 +55,26 @@ feature "Admin Health Check" do ...@@ -55,4 +55,26 @@ feature "Admin Health Check" do
expect(page).to have_content('The server is on fire') expect(page).to have_content('The server is on fire')
end end
end end
context 'with repository storage failures' do
before do
# Track a failure
Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil
visit admin_health_check_path
end
it 'shows storage failure information' do
hostname = Gitlab::Environment.hostname
expect(page).to have_content('broken: failed storage access attempt on host:')
expect(page).to have_content("#{hostname}: 1 of 10 failures.")
end
it 'allows resetting storage failures' do
click_button 'Reset git storage health information'
expect(page).to have_content('Git storage health information has been reset')
expect(page).not_to have_content('failed storage access attempt')
end
end
end end
require 'spec_helper'
describe StorageHealthHelper do
describe '#failing_storage_health_message' do
let(:health) do
Gitlab::Git::Storage::Health.new(
"<script>alert('storage name');)</script>",
[]
)
end
it 'escapes storage names' do
escaped_storage_name = '&lt;script&gt;alert(&#39;storage name&#39;);)&lt;/script&gt;'
result = helper.failing_storage_health_message(health)
expect(result).to include(escaped_storage_name)
end
end
end
...@@ -23,6 +23,16 @@ describe '6_validations' do ...@@ -23,6 +23,16 @@ describe '6_validations' do
end end
end end
context 'when one of the settings is incorrect' do
before do
mock_storages('foo' => { 'path' => 'tmp/tests/paths/a/b/c', 'failure_count_threshold' => 'not a number' })
end
it 'throws an error' do
expect { validate_storages_config }.to raise_error(/failure_count_threshold/)
end
end
context 'with invalid storage names' do context 'with invalid storage names' do
before do before do
mock_storages('name with spaces' => { 'path' => 'tmp/tests/paths/a/b/c' }) mock_storages('name with spaces' => { 'path' => 'tmp/tests/paths/a/b/c' })
...@@ -84,6 +94,17 @@ describe '6_validations' do ...@@ -84,6 +94,17 @@ describe '6_validations' do
expect { validate_storages_paths }.not_to raise_error expect { validate_storages_paths }.not_to raise_error
end end
end end
describe 'inaccessible storage' do
before do
mock_storages('foo' => { 'path' => 'tmp/tests/a/path/that/does/not/exist' })
end
it 'passes through with a warning' do
expect(Rails.logger).to receive(:error)
expect { validate_storages_paths }.not_to raise_error
end
end
end end
def mock_storages(storages) def mock_storages(storages)
......
...@@ -2,6 +2,17 @@ require 'spec_helper' ...@@ -2,6 +2,17 @@ require 'spec_helper'
require_relative '../../config/initializers/1_settings' require_relative '../../config/initializers/1_settings'
describe Settings do describe Settings do
describe '#repositories' do
it 'assigns the default failure attributes' do
repository_settings = Gitlab.config.repositories.storages['broken']
expect(repository_settings['failure_count_threshold']).to eq(10)
expect(repository_settings['failure_wait_time']).to eq(30)
expect(repository_settings['failure_reset_time']).to eq(1800)
expect(repository_settings['storage_timeout']).to eq(5)
end
end
describe '#host_without_www' do describe '#host_without_www' do
context 'URL with protocol' do context 'URL with protocol' do
it 'returns the host' do it 'returns the host' do
......
...@@ -48,8 +48,9 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do ...@@ -48,8 +48,9 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
described_class.load_in_batch_for_projects([project_without_status]) described_class.load_in_batch_for_projects([project_without_status])
end end
it 'only connects to redis_cache twice' do it 'only connects to redis twice' do
# Once to load, once to store in the cache # Stub circuitbreaker so it doesn't count the redis connections in there
stub_circuit_breaker(project_without_status)
expect(Gitlab::Redis::Cache).to receive(:with).exactly(2).and_call_original expect(Gitlab::Redis::Cache).to receive(:with).exactly(2).and_call_original
described_class.load_in_batch_for_projects([project_without_status]) described_class.load_in_batch_for_projects([project_without_status])
...@@ -301,4 +302,13 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do ...@@ -301,4 +302,13 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
end end
end end
end end
def stub_circuit_breaker(project)
fake_circuitbreaker = double
allow(fake_circuitbreaker).to receive(:perform).and_yield
allow(project.repository.raw_repository)
.to receive(:circuit_breaker).and_return(fake_circuitbreaker)
allow(project.repository)
.to receive(:circuit_breaker).and_return(fake_circuitbreaker)
end
end end
...@@ -54,6 +54,20 @@ describe Gitlab::Git::Repository, seed_helper: true do ...@@ -54,6 +54,20 @@ describe Gitlab::Git::Repository, seed_helper: true do
end end
describe "#rugged" do describe "#rugged" do
describe 'when storage is broken', broken_storage: true do
it 'raises a storage exception when storage is not available' do
broken_repo = described_class.new('broken', 'a/path.git')
expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Storage::Inaccessible)
end
end
it 'raises a no repository exception when there is no repo' do
broken_repo = described_class.new('default', 'a/path.git')
expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Repository::NoRepository)
end
context 'with no Git env stored' do context 'with no Git env stored' do
before do before do
expect(Gitlab::Git::Env).to receive(:all).and_return({}) expect(Gitlab::Git::Env).to receive(:all).and_return({})
......
require 'spec_helper'
describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do
let(:circuit_breaker) { described_class.new('default') }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:default:#{hostname}" }
def value_from_redis(name)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmset(cache_key, name, value)
end.first
end
describe '.reset_all!' do
it 'clears all entries form redis' do
set_in_redis(:failure_count, 10)
described_class.reset_all!
key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
expect(key_exists).to be_falsey
end
end
describe '.for_storage' do
it 'only builds a single circuitbreaker per storage' do
expect(described_class).to receive(:new).once.and_call_original
breaker = described_class.for_storage('default')
expect(breaker).to be_a(described_class)
expect(described_class.for_storage('default')).to eq(breaker)
end
end
describe '#initialize' do
it 'assigns the settings' do
expect(circuit_breaker.hostname).to eq(hostname)
expect(circuit_breaker.storage).to eq('default')
expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
expect(circuit_breaker.failure_count_threshold).to eq(10)
expect(circuit_breaker.failure_wait_time).to eq(30)
expect(circuit_breaker.failure_reset_time).to eq(1800)
expect(circuit_breaker.storage_timeout).to eq(5)
end
end
describe '#perform' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error(Gitlab::Git::Storage::CircuitOpen)
end
it 'yields the block' do
expect { |b| circuit_breaker.perform(&b) }
.to yield_control
end
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
circuit_breaker.perform { 'hello world' }
end
it 'returns the value of the block' do
result = circuit_breaker.perform { 'return value' }
expect(result).to eq('return value')
end
it 'raises possible errors' do
expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } }
.to raise_error(Rugged::OSError)
end
context 'with the feature disabled' do
it 'returns the block without checking accessibility' do
stub_feature_flags(git_storage_circuit_breaker: false)
expect(circuit_breaker).not_to receive(:circuit_broken?)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
end
end
describe '#circuit_broken?' do
it 'is closed when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'is open when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
it 'is open when there are to many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
describe "storage_available?" do
context 'when the storage is available' do
it 'tracks that the storage was accessible an raises the error' do
expect(circuit_breaker).to receive(:track_storage_accessible)
circuit_breaker.storage_available?
end
it 'only performs the check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original
2.times { circuit_breaker.storage_available? }
end
end
context 'when storage is not available' do
let(:circuit_breaker) { described_class.new('broken') }
it 'tracks that the storage was inaccessible' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
circuit_breaker.storage_available?
end
end
end
describe '#check_storage_accessible!' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
expect { circuit_breaker.check_storage_accessible! }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(30)
end
end
context 'when the storage is not available' do
let(:circuit_breaker) { described_class.new('broken') }
it 'raises an error' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect { circuit_breaker.check_storage_accessible! }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30)
end
end
end
end
describe '#track_storage_inaccessible' do
around(:each) do |example|
Timecop.freeze
example.run
Timecop.return
end
it 'records the failure time in redis' do
circuit_breaker.track_storage_inaccessible
failure_time = value_from_redis(:last_failure)
expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
end
it 'sets the failure time on the breaker without reloading' do
circuit_breaker.track_storage_inaccessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to eq(Time.now)
end
it 'increments the failure count in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(value_from_redis(:failure_count).to_i).to be(11)
end
it 'increments the failure count on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(11)
end
end
describe '#track_storage_accessible' do
it 'sets the failure count to zero in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
expect(value_from_redis(:failure_count).to_i).to be(0)
end
it 'sets the failure count to zero on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(0)
end
it 'removes the last failure time from redis' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to be_nil
end
it 'removes the last failure time from the breaker without reloading' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
expect(value_from_redis(:last_failure)).to be_empty
end
it 'wont connect to redis when there are no failures' do
expect(Gitlab::Git::Storage.redis).to receive(:with).once
.and_call_original
expect(circuit_breaker).to receive(:track_storage_accessible)
.and_call_original
circuit_breaker.track_storage_accessible
end
end
describe '#no_failures?' do
it 'is false when a failure was tracked' do
set_in_redis(:last_failure, Time.now.to_i)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.no_failures?).to be_falsey
end
end
describe '#last_failure' do
it 'returns the last failure time' do
time = Time.parse("2017-05-26 17:52:30")
set_in_redis(:last_failure, time.to_i)
expect(circuit_breaker.last_failure).to eq(time)
end
end
describe '#failure_count' do
it 'returns the failure count' do
set_in_redis(:failure_count, 7)
expect(circuit_breaker.failure_count).to eq(7)
end
end
describe '#cache_key' do
it 'includes storage and host' do
expect(circuit_breaker.cache_key).to eq(cache_key)
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::ForkedStorageCheck, skip_database_cleaner: true do
let(:existing_path) do
existing_path = TestEnv.repos_path
FileUtils.mkdir_p(existing_path)
existing_path
end
describe '.storage_accessible?' do
it 'detects when a storage is not available' do
expect(described_class.storage_available?('/non/existant/path')).to be_falsey
end
it 'detects when a storage is available' do
expect(described_class.storage_available?(existing_path)).to be_truthy
end
it 'returns false when the check takes to long' do
# We're forking a process here that takes too long
# It will be killed it's parent process will be killed by it's parent
# and waited for inside `Gitlab::Git::Storage::ForkedStorageCheck.timeout_check`
allow(described_class).to receive(:check_filesystem_in_process) do
Process.spawn("sleep 10")
end
result = true
runtime = Benchmark.realtime do
result = described_class.storage_available?(existing_path, 0.5)
end
expect(result).to be_falsey
expect(runtime).to be < 1.0
end
describe 'when using paths with spaces' do
let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') }
let(:path_with_spaces) { File.join(test_dir, 'path with spaces') }
around do |example|
FileUtils.mkdir_p(path_with_spaces)
example.run
FileUtils.rm_r(test_dir)
end
it 'works for paths with spaces' do
expect(described_class.storage_available?(path_with_spaces)).to be_truthy
end
it 'works for a realpath with spaces' do
symlink_location = File.join(test_dir, 'a symlink')
FileUtils.ln_s(path_with_spaces, symlink_location)
expect(described_class.storage_available?(symlink_location)).to be_truthy
end
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do
let(:host1_key) { 'storage_accessible:broken:web01' }
let(:host2_key) { 'storage_accessible:default:kiq01' }
def set_in_redis(cache_key, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmset(cache_key, :failure_count, value)
end.first
end
describe '.for_failing_storages' do
it 'only includes health status for failures' do
set_in_redis(host1_key, 10)
set_in_redis(host2_key, 0)
expect(described_class.for_failing_storages.map(&:storage_name))
.to contain_exactly('broken')
end
end
describe '.load_for_keys' do
let(:subject) do
results = Gitlab::Git::Storage.redis.with do |redis|
fake_future = double
allow(fake_future).to receive(:value).and_return([host1_key])
described_class.load_for_keys({ 'broken' => fake_future }, redis)
end
# Make sure the `Redis#future is loaded
results.inject({}) do |result, (name, info)|
info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
result[name] = info
result
end
end
it 'loads when there is no info in redis' do
expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 0 }])
end
it 'reads the correct values for a storage from redis' do
set_in_redis(host1_key, 5)
set_in_redis(host2_key, 7)
expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 5 }])
end
end
describe '.for_all_storages' do
it 'loads health status for all configured storages' do
healths = described_class.for_all_storages
expect(healths.map(&:storage_name)).to contain_exactly('default', 'broken')
end
end
describe '#failing_info' do
it 'only contains storages that have failures' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 0 },
{ name: host2_key, failure_count: 3 }])
expect(health.failing_info).to contain_exactly({ name: host2_key, failure_count: 3 })
end
end
describe '#total_failures' do
it 'sums up all the failures' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
{ name: host2_key, failure_count: 3 }])
expect(health.total_failures).to eq(5)
end
end
describe '#failing_on_hosts' do
it 'collects only the failing hostnames' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
{ name: host2_key, failure_count: 0 }])
expect(health.failing_on_hosts).to contain_exactly('web01')
end
end
end
...@@ -44,6 +44,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do ...@@ -44,6 +44,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do
describe '#readiness' do describe '#readiness' do
subject { described_class.readiness } subject { described_class.readiness }
context 'storage has a tripped circuitbreaker', broken_storage: true do
let(:repository_storages) { ['broken'] }
let(:storages_paths) do
Gitlab.config.repositories.storages
end
it { is_expected.to include(result_class.new(false, 'circuitbreaker tripped', shard: 'broken')) }
end
context 'storage points to not existing folder' do context 'storage points to not existing folder' do
let(:storages_paths) do let(:storages_paths) do
{ {
...@@ -51,6 +60,10 @@ describe Gitlab::HealthChecks::FsShardsCheck do ...@@ -51,6 +60,10 @@ describe Gitlab::HealthChecks::FsShardsCheck do
}.with_indifferent_access }.with_indifferent_access
end end
before do
allow(described_class).to receive(:storage_circuitbreaker_test) { true }
end
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) } it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) }
end end
...@@ -109,6 +122,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do ...@@ -109,6 +122,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end end
end end
...@@ -127,6 +141,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do ...@@ -127,6 +141,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end end
it 'cleans up files used for metrics' do it 'cleans up files used for metrics' do
......
require 'spec_helper' require 'spec_helper'
describe Repository do describe Repository, models: true do
include RepoHelpers include RepoHelpers
TestBlob = Struct.new(:path) TestBlob = Struct.new(:path)
let(:project) { create(:project, :repository) } let(:project) { create(:project, :repository) }
let(:repository) { project.repository } let(:repository) { project.repository }
let(:broken_repository) { create(:project, :broken_storage).repository }
let(:user) { create(:user) } let(:user) { create(:user) }
let(:commit_options) do let(:commit_options) do
...@@ -27,12 +28,27 @@ describe Repository do ...@@ -27,12 +28,27 @@ describe Repository do
let(:author_email) { 'user@example.org' } let(:author_email) { 'user@example.org' }
let(:author_name) { 'John Doe' } let(:author_name) { 'John Doe' }
def expect_to_raise_storage_error
expect { yield }.to raise_error do |exception|
storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable]
expect(exception.class).to be_in(storage_exceptions)
end
end
describe '#branch_names_contains' do describe '#branch_names_contains' do
subject { repository.branch_names_contains(sample_commit.id) } subject { repository.branch_names_contains(sample_commit.id) }
it { is_expected.to include('master') } it { is_expected.to include('master') }
it { is_expected.not_to include('feature') } it { is_expected.not_to include('feature') }
it { is_expected.not_to include('fix') } it { is_expected.not_to include('fix') }
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error do
broken_repository.branch_names_contains(sample_commit.id)
end
end
end
end end
describe '#tag_names_contains' do describe '#tag_names_contains' do
...@@ -143,6 +159,14 @@ describe Repository do ...@@ -143,6 +159,14 @@ describe Repository do
subject { repository.last_commit_for_path(sample_commit.id, '.gitignore').id } subject { repository.last_commit_for_path(sample_commit.id, '.gitignore').id }
it { is_expected.to eq('c1acaa58bbcbc3eafe538cb8274ba387047b69f8') } it { is_expected.to eq('c1acaa58bbcbc3eafe538cb8274ba387047b69f8') }
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error do
broken_repository.last_commit_id_for_path(sample_commit.id, '.gitignore')
end
end
end
end end
context 'when Gitaly feature last_commit_for_path is enabled' do context 'when Gitaly feature last_commit_for_path is enabled' do
...@@ -169,6 +193,14 @@ describe Repository do ...@@ -169,6 +193,14 @@ describe Repository do
expect(cache).to receive(:fetch).with(key).and_return('c1acaa5') expect(cache).to receive(:fetch).with(key).and_return('c1acaa5')
is_expected.to eq('c1acaa5') is_expected.to eq('c1acaa5')
end end
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error do
broken_repository.last_commit_for_path(sample_commit.id, '.gitignore').id
end
end
end
end end
context 'when Gitaly feature last_commit_for_path is enabled' do context 'when Gitaly feature last_commit_for_path is enabled' do
...@@ -216,6 +248,12 @@ describe Repository do ...@@ -216,6 +248,12 @@ describe Repository do
expect(commit_ids).to include('5937ac0a7beb003549fc5fd26fc247adbce4a52e') expect(commit_ids).to include('5937ac0a7beb003549fc5fd26fc247adbce4a52e')
end end
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error { broken_repository.find_commits_by_message('s') }
end
end
end end
describe '#blob_at' do describe '#blob_at' do
...@@ -541,6 +579,14 @@ describe Repository do ...@@ -541,6 +579,14 @@ describe Repository do
expect(results).to match_array([]) expect(results).to match_array([])
end end
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error do
broken_repository.search_files_by_content('feature', 'master')
end
end
end
describe 'result' do describe 'result' do
subject { results.first } subject { results.first }
...@@ -569,6 +615,22 @@ describe Repository do ...@@ -569,6 +615,22 @@ describe Repository do
expect(results).to match_array([]) expect(results).to match_array([])
end end
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error { broken_repository.search_files_by_name('files', 'master') }
end
end
end
describe '#fetch_ref' do
describe 'when storage is broken', broken_storage: true do
it 'should raise a storage error' do
path = broken_repository.path_to_repo
expect_to_raise_storage_error { broken_repository.fetch_ref(path, '1', '2') }
end
end
end end
describe '#create_ref' do describe '#create_ref' do
...@@ -986,6 +1048,12 @@ describe Repository do ...@@ -986,6 +1048,12 @@ describe Repository do
expect(repository.exists?).to eq(false) expect(repository.exists?).to eq(false)
end end
context 'with broken storage', broken_storage: true do
it 'should raise a storage error' do
expect_to_raise_storage_error { broken_repository.exists? }
end
end
end end
describe '#exists?' do describe '#exists?' do
......
require 'spec_helper'
describe API::CircuitBreakers do
let(:user) { create(:user) }
let(:admin) { create(:admin) }
describe 'GET circuit_breakers/repository_storage' do
it 'returns a 401 for anonymous users' do
get api('/circuit_breakers/repository_storage')
expect(response).to have_http_status(401)
end
it 'returns a 403 for users' do
get api('/circuit_breakers/repository_storage', user)
expect(response).to have_http_status(403)
end
it 'returns an Array of storages' do
expect(Gitlab::Git::Storage::Health).to receive(:for_all_storages) do
[Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
end
get api('/circuit_breakers/repository_storage', admin)
expect(response).to have_http_status(200)
expect(json_response).to be_kind_of(Array)
expect(json_response.first['storage_name']).to eq('broken')
expect(json_response.first['failing_on_hosts']).to eq(['web01'])
expect(json_response.first['total_failures']).to eq(4)
end
describe 'GET circuit_breakers/repository_storage/failing' do
it 'returns an array of failing storages' do
expect(Gitlab::Git::Storage::Health).to receive(:for_failing_storages) do
[Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
end
get api('/circuit_breakers/repository_storage/failing', admin)
expect(response).to have_http_status(200)
expect(json_response).to be_kind_of(Array)
end
end
end
describe 'DELETE circuit_breakers/repository_storage' do
it 'clears all circuit_breakers' do
expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
delete api('/circuit_breakers/repository_storage', admin)
expect(response).to have_http_status(204)
end
end
end
...@@ -2,4 +2,16 @@ RSpec.configure do |config| ...@@ -2,4 +2,16 @@ RSpec.configure do |config|
config.before(:each, :repository) do config.before(:each, :repository) do
TestEnv.clean_test_path TestEnv.clean_test_path
end end
config.before(:all, :broken_storage) do
FileUtils.rm_rf Gitlab.config.repositories.storages.broken['path']
end
config.before(:each, :broken_storage) do
allow(Gitlab::GitalyClient).to receive(:call) do
raise GRPC::Unavailable.new('Gitaly broken in this spec')
end
Gitlab::Git::Storage::CircuitBreaker.reset_all!
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment