Commit 7fbefa3d authored by Sean McGivern

Merge branch 'bvl-circuitbreaker-backoff' into 'master'

Circuitbreaker backoff and retries

Closes #37383 and #38231

See merge request gitlab-org/gitlab-ce!14933
parents cc170670 705c15d7
@@ -120,6 +120,15 @@ module ApplicationSettingsHelper
    message.html_safe
  end

+  def circuitbreaker_access_retries_help_text
+    _('The number of attempts GitLab will make to access a storage.')
+  end
+
+  def circuitbreaker_backoff_threshold_help_text
+    _("The number of failures after which GitLab will start temporarily "\
+      "disabling access to a storage shard on a host")
+  end
+
  def circuitbreaker_failure_wait_time_help_text
    _("When access to a storage fails. GitLab will prevent access to the "\
      "storage for the time specified here. This allows the filesystem to "\
@@ -144,6 +153,8 @@ module ApplicationSettingsHelper
      :akismet_api_key,
      :akismet_enabled,
      :auto_devops_enabled,
+      :circuitbreaker_access_retries,
+      :circuitbreaker_backoff_threshold,
      :circuitbreaker_failure_count_threshold,
      :circuitbreaker_failure_reset_time,
      :circuitbreaker_failure_wait_time,
......
@@ -16,17 +16,16 @@ module StorageHealthHelper
  def message_for_circuit_breaker(circuit_breaker)
    maximum_failures = circuit_breaker.failure_count_threshold
    current_failures = circuit_breaker.failure_count
-    permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures

    translation_params = { number_of_failures: current_failures,
                           maximum_failures: maximum_failures,
                           number_of_seconds: circuit_breaker.failure_wait_time }

-    if permanently_broken
+    if circuit_breaker.circuit_broken?
      s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
        "retry automatically. Reset storage information when the problem is "\
        "resolved.") % translation_params
-    elsif circuit_breaker.circuit_broken?
+    elsif circuit_breaker.backing_off?
      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
        "block access for %{number_of_seconds} seconds.") % translation_params
    else
......
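The helper above now distinguishes three states: circuit broken (no automatic retry), backing off (temporary block), and healthy. A minimal, self-contained sketch of that selection logic, using a hypothetical `Breaker` struct in place of the real Redis-backed circuit breaker (the real `backing_off?` additionally requires the last failure to be recent; see the `CircuitBreaker` diff further down):

```
# Hypothetical stand-in for the real circuit breaker; all values invented.
Breaker = Struct.new(:failure_count, :failure_count_threshold,
                     :backoff_threshold, :failure_wait_time) do
  def circuit_broken?
    failure_count > failure_count_threshold
  end

  # Simplified: the real check also requires the last failure to be recent.
  def backing_off?
    failure_count > backoff_threshold
  end
end

breaker = Breaker.new(90, 200, 80, 30)

message =
  if breaker.circuit_broken?
    "#{breaker.failure_count} of #{breaker.failure_count_threshold} failures. " \
    "GitLab will not retry automatically."
  elsif breaker.backing_off?
    "#{breaker.failure_count} of #{breaker.failure_count_threshold} failures. " \
    "GitLab will block access for #{breaker.failure_wait_time} seconds."
  else
    "No known failures"
  end

puts message # => "90 of 200 failures. GitLab will block access for 30 seconds."
```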
@@ -153,13 +153,25 @@ class ApplicationSetting < ActiveRecord::Base
            presence: true,
            numericality: { greater_than_or_equal_to: 0 }

-  validates :circuitbreaker_failure_count_threshold,
+  validates :circuitbreaker_backoff_threshold,
+            :circuitbreaker_failure_count_threshold,
            :circuitbreaker_failure_wait_time,
            :circuitbreaker_failure_reset_time,
            :circuitbreaker_storage_timeout,
            presence: true,
            numericality: { only_integer: true, greater_than_or_equal_to: 0 }

+  validates :circuitbreaker_access_retries,
+            presence: true,
+            numericality: { only_integer: true, greater_than_or_equal_to: 1 }
+
+  validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
+    if value.to_i >= record.circuitbreaker_failure_count_threshold
+      record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
+                                "lower than the failure count threshold"))
+    end
+  end
+
  SUPPORTED_KEY_TYPES.each do |type|
    validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type }
  end
......
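The new cross-field validation above can be exercised from a Rails console; a hypothetical session (the attribute values are invented, the error message comes from the model):

```
setting = ApplicationSetting.new(
  circuitbreaker_failure_count_threshold: 10,
  circuitbreaker_backoff_threshold: 15 # not lower than the count threshold, so invalid
)

setting.valid? # => false
setting.errors[:circuitbreaker_backoff_threshold]
# => ["The circuitbreaker backoff threshold should be lower than the failure count threshold"]
```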
@@ -533,11 +533,23 @@
  %fieldset
    %legend Git Storage Circuitbreaker settings
    .form-group
-      = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
+      = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
      .col-sm-10
-        = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
+        = f.number_field :circuitbreaker_access_retries, class: 'form-control'
        .help-block
-          = circuitbreaker_failure_count_help_text
+          = circuitbreaker_access_retries_help_text
+    .form-group
+      = f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
+      .col-sm-10
+        = f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
+        .help-block
+          = circuitbreaker_storage_timeout_help_text
+    .form-group
+      = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
+      .col-sm-10
+        = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
+        .help-block
+          = circuitbreaker_backoff_threshold_help_text
    .form-group
      = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
      .col-sm-10
@@ -545,17 +557,17 @@
        .help-block
          = circuitbreaker_failure_wait_time_help_text
    .form-group
-      = f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2'
+      = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
      .col-sm-10
-        = f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
+        = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
        .help-block
-          = circuitbreaker_failure_reset_time_help_text
+          = circuitbreaker_failure_count_help_text
    .form-group
-      = f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
+      = f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2'
      .col-sm-10
-        = f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
+        = f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
        .help-block
-          = circuitbreaker_storage_timeout_help_text
+          = circuitbreaker_failure_reset_time_help_text
  %fieldset
    %legend Repository Checks
......
---
title: Make the circuitbreaker more robust by adding higher thresholds, and multiple
access attempts.
merge_request: 14933
author:
type: fixed
class AddNewCircuitbreakerSettingsToApplicationSettings < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  def change
    add_column :application_settings,
               :circuitbreaker_access_retries,
               :integer,
               default: 3
    add_column :application_settings,
               :circuitbreaker_backoff_threshold,
               :integer,
               default: 80
  end
end
@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.

-ActiveRecord::Schema.define(version: 20171012101043) do
+ActiveRecord::Schema.define(version: 20171017145932) do

  # These are extensions that must be enabled in order to support this database
  enable_extension "plpgsql"
@@ -138,6 +138,8 @@ ActiveRecord::Schema.define(version: 20171012101043) do
    t.integer "circuitbreaker_failure_wait_time", default: 30
    t.integer "circuitbreaker_failure_reset_time", default: 1800
    t.integer "circuitbreaker_storage_timeout", default: 30
+    t.integer "circuitbreaker_access_retries", default: 3
+    t.integer "circuitbreaker_backoff_threshold", default: 80
  end

  create_table "audit_events", force: :cascade do |t|
......
@@ -109,6 +109,11 @@ This can be configured from the admin interface:

![circuitbreaker configuration](img/circuitbreaker_config.png)

+**Number of access attempts**: The number of attempts GitLab will make to access a
+storage when probing a shard.
+
+**Number of failures before backing off**: The number of failures after which
+GitLab will start temporarily disabling access to a storage shard on a host.

**Maximum git storage failures:** The number of failures after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
@@ -126,6 +131,15 @@ mount is reset.

**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.

+To enable the circuitbreaker for repository storage you can flip the feature flag from a Rails console:
+
+```
+Feature.enable('git_storage_circuit_breaker')
+```
+
+Alternatively, it can be enabled by setting the `GIT_STORAGE_CIRCUIT_BREAKER` environment
+variable to `true`. This approach would be used when enabling the circuit breaker on a single host.
+
When storage failures occur, this will be visible in the admin interface like this:

![failing storage](img/failing_storage.png)
......
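Both enabling paths described in the documentation above, shown together as a sketch (the feature-flag call is verbatim from the docs; the environment variable would normally be exported in the service's environment before the process starts, rather than set from Ruby):

```
# Fleet-wide: flip the feature flag from a Rails console.
Feature.enable('git_storage_circuit_breaker')

# Per host: any non-empty value enables the breaker, since the code checks
# ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present?
ENV['GIT_STORAGE_CIRCUIT_BREAKER'] = 'true'
```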
@@ -69,6 +69,8 @@ PUT /application/settings
| `after_sign_up_text` | string | no | Text shown to the user after signing up |
| `akismet_api_key` | string | no | API key for akismet spam protection |
| `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
+| `circuitbreaker_access_retries` | integer | no | The number of attempts GitLab will make to access a storage. |
+| `circuitbreaker_backoff_threshold` | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
......
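Like the existing circuitbreaker settings, the two new fields can be updated through the settings API; a sketch using Ruby's standard library (the host and token are placeholders):

```
require 'net/http'
require 'uri'

uri = URI('https://gitlab.example.com/api/v4/application/settings')

request = Net::HTTP::Put.new(uri)
request['PRIVATE-TOKEN'] = '<your_access_token>' # placeholder
request.set_form_data(
  'circuitbreaker_access_retries' => 3,
  'circuitbreaker_backoff_threshold' => 80
)

response = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
  http.request(request)
end

puts response.code # => "200" when the settings were updated
```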
@@ -12,6 +12,7 @@ module Gitlab
      CircuitOpen = Class.new(Inaccessible)
      Misconfiguration = Class.new(Inaccessible)
+      Failing = Class.new(Inaccessible)

      REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
......
@@ -54,7 +54,7 @@ module Gitlab
      end

      def perform
-        return yield unless Feature.enabled?('git_storage_circuit_breaker')
+        return yield unless enabled?

        check_storage_accessible!
@@ -64,10 +64,27 @@ module Gitlab
      def circuit_broken?
        return false if no_failures?

+        failure_count > failure_count_threshold
+      end
+
+      def backing_off?
+        return false if no_failures?
+
        recent_failure = last_failure > failure_wait_time.seconds.ago
-        too_many_failures = failure_count > failure_count_threshold
+        too_many_failures = failure_count > backoff_threshold

-        recent_failure || too_many_failures
+        recent_failure && too_many_failures
      end

+      private
+
+      # The circuitbreaker can be enabled for the entire fleet using a Feature
+      # flag.
+      #
+      # Enabling it for a single host can be done setting the
+      # `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
+      def enabled?
+        ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker')
+      end
+
      def failure_info
@@ -83,7 +100,7 @@ module Gitlab
        return @storage_available if @storage_available

        if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
-             .storage_available?(storage_path, storage_timeout)
+             .storage_available?(storage_path, storage_timeout, access_retries)
          track_storage_accessible
        else
          track_storage_inaccessible
@@ -94,7 +111,11 @@ module Gitlab
      def check_storage_accessible!
        if circuit_broken?
-          raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
+          raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
+        end
+
+        if backing_off?
+          raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
        end

        unless storage_available?
@@ -131,12 +152,6 @@ module Gitlab
          end
        end

-      def cache_key
-        @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
-      end
-
-      private
-
      def get_failure_info
        last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
          redis.hmget(cache_key, :last_failure, :failure_count)
@@ -146,6 +161,10 @@ module Gitlab
        FailureInfo.new(last_failure, failure_count.to_i)
      end
+
+      def cache_key
+        @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
+      end
    end
  end
end
......
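Taken together, the changes above give the circuitbreaker a two-tier response: crossing `backoff_threshold` blocks access only while the last failure is recent (raising the new `Failing` error), while crossing `failure_count_threshold` opens the circuit until the failure information is reset (raising `CircuitOpen`). A condensed model of that decision; `backoff_threshold` and `failure_wait_time` use the defaults from this MR, while the `failure_count_threshold` value is invented for illustration:

```
# Condensed model of check_storage_accessible!; not the real implementation,
# which reads counts from Redis and thresholds from the application settings.
def storage_state(failure_count, last_failure,
                  backoff_threshold: 80,
                  failure_count_threshold: 200, # illustrative only
                  failure_wait_time: 30)
  return :no_failures if failure_count.zero?
  return :circuit_open if failure_count > failure_count_threshold

  recent_failure = last_failure > Time.now - failure_wait_time
  too_many_failures = failure_count > backoff_threshold

  recent_failure && too_many_failures ? :backing_off : :available
end

storage_state(250, Time.now)      # => :circuit_open (raises CircuitOpen)
storage_state(90, Time.now)       # => :backing_off  (raises Failing)
storage_state(90, Time.now - 60)  # => :available    (wait time elapsed)
storage_state(10, Time.now)       # => :available    (below backoff threshold)
```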
@@ -18,6 +18,14 @@ module Gitlab
        application_settings.circuitbreaker_storage_timeout
      end

+      def access_retries
+        application_settings.circuitbreaker_access_retries
+      end
+
+      def backoff_threshold
+        application_settings.circuitbreaker_backoff_threshold
+      end
+
      private

      def application_settings
......
@@ -4,8 +4,17 @@ module Gitlab
    module ForkedStorageCheck
      extend self

-      def storage_available?(path, timeout_seconds = 5)
-        status = timeout_check(path, timeout_seconds)
+      def storage_available?(path, timeout_seconds = 5, retries = 1)
+        partial_timeout = timeout_seconds / retries
+        status = timeout_check(path, partial_timeout)
+
+        # If the status check did not succeed the first time, we retry a few
+        # more times to avoid one-off failures
+        current_attempts = 1
+        while current_attempts < retries && !status.success?
+          status = timeout_check(path, partial_timeout)
+          current_attempts += 1
+        end

        status.success?
      end
......
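A consequence of the retry loop above is that the overall probe budget stays fixed: the configured timeout is divided evenly across attempts, so enabling retries never makes a probe slower. With the defaults added by this MR (a 30-second storage timeout, 3 access retries) each attempt gets 30 / 3 = 10 seconds; the storage path below is a placeholder:

```
# Up to three attempts of 10 seconds each; attempts two and three only run
# when the previous attempt failed or timed out.
Gitlab::Git::Storage::ForkedStorageCheck.storage_available?('/var/opt/gitlab/git-data', 30, 3)
# => true or false
```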
@@ -25,6 +25,10 @@ module Gitlab
        !!@error
      end

+      def backing_off?
+        false
+      end
+
      def last_failure
        circuit_broken? ? Time.now : nil
      end
......
@@ -33,6 +33,21 @@ describe Gitlab::Git::Storage::ForkedStorageCheck, broken_storage: true, skip_da
    expect(runtime).to be < 1.0
  end

+  it 'will try the specified amount of times before failing' do
+    allow(described_class).to receive(:check_filesystem_in_process) do
+      Process.spawn("sleep 10")
+    end
+
+    expect(Process).to receive(:spawn).with('sleep 10').twice
+      .and_call_original
+
+    runtime = Benchmark.realtime do
+      described_class.storage_available?(existing_path, 0.5, 2)
+    end
+
+    expect(runtime).to be < 1.0
+  end
+
  describe 'when using paths with spaces' do
    let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') }
    let(:path_with_spaces) { File.join(test_dir, 'path with spaces') }
......
@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
    ours = described_class.public_instance_methods
    theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods

-    # These methods are not part of the public API, but are public to allow the
-    # CircuitBreaker specs to operate. They should be made private over time.
-    exceptions = %i[
-      cache_key
-      check_storage_accessible!
-      no_failures?
-      storage_available?
-      track_storage_accessible
-      track_storage_inaccessible
-    ]
-
-    expect(theirs - ours).to contain_exactly(*exceptions)
+    expect(theirs - ours).to be_empty
  end
end
@@ -115,7 +115,8 @@ describe ApplicationSetting do
    end

    context 'circuitbreaker settings' do
-      [:circuitbreaker_failure_count_threshold,
+      [:circuitbreaker_backoff_threshold,
+       :circuitbreaker_failure_count_threshold,
       :circuitbreaker_failure_wait_time,
       :circuitbreaker_failure_reset_time,
       :circuitbreaker_storage_timeout].each do |field|
@@ -125,6 +126,16 @@ describe ApplicationSetting do
            .is_greater_than_or_equal_to(0)
      end
    end

+    it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do
+      setting.circuitbreaker_failure_count_threshold = 10
+      setting.circuitbreaker_backoff_threshold = 15
+      failure_message = "The circuitbreaker backoff threshold should be lower "\
+                        "than the failure count threshold"
+
+      expect(setting).not_to be_valid
+      expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message)
+    end
  end

  context 'repository storages' do
......