Commit 893b2da2 authored by Simon Tomlinson's avatar Simon Tomlinson

Run service discovery with up to 3 retries

When postgres service discovery fails, run it up to two more times. This
will help to recover from transient errors with the dns nameserver.
parent 4773f35a
......@@ -13,11 +13,17 @@ module Gitlab
# balancer with said hosts. Requests may continue to use the old hosts
# until they complete.
class ServiceDiscovery
EmptyDnsResponse = Class.new(StandardError)
attr_reader :interval, :record, :record_type, :disconnect_timeout,
:load_balancer
MAX_SLEEP_ADJUSTMENT = 10
MAX_DISCOVERY_RETRIES = 3
RETRY_DELAY_RANGE = (0.1..0.2).freeze
RECORD_TYPES = {
'A' => Net::DNS::A,
'SRV' => Net::DNS::SRV
......@@ -76,15 +82,21 @@ module Gitlab
end
def perform_service_discovery
refresh_if_necessary
rescue StandardError => error
# Any exceptions that might occur should be reported to
# Sentry, instead of silently terminating this thread.
Gitlab::ErrorTracking.track_exception(error)
Gitlab::AppLogger.error(
"Service discovery encountered an error: #{error.message}"
)
MAX_DISCOVERY_RETRIES.times do
return refresh_if_necessary
rescue StandardError => error
# Any exceptions that might occur should be reported to
# Sentry, instead of silently terminating this thread.
Gitlab::ErrorTracking.track_exception(error)
Gitlab::AppLogger.error(
"Service discovery encountered an error: #{error.message}"
)
# Slightly randomize the retry delay so that, in the case of a total
# dns outage, all starting services do not pressure the dns server at the same time.
sleep(rand(RETRY_DELAY_RANGE))
end
interval
end
......@@ -156,6 +168,8 @@ module Gitlab
addresses_from_srv_record(response)
end
raise EmptyDnsResponse if addresses.empty?
# Addresses are sorted so we can directly compare the old and new
# addresses, without having to use any additional data structures.
[new_wait_time_for(resources), addresses.sort]
......
......@@ -69,18 +69,69 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
end
describe '#perform_service_discovery' do
it 'reports exceptions to Sentry' do
error = StandardError.new
context 'without any failures' do
it 'runs once' do
expect(service)
.to receive(:refresh_if_necessary).once
expect(service)
.to receive(:refresh_if_necessary)
.and_raise(error)
expect(service).not_to receive(:sleep)
expect(Gitlab::ErrorTracking)
.to receive(:track_exception)
.with(error)
expect(Gitlab::ErrorTracking).not_to receive(:track_exception)
service.perform_service_discovery
service.perform_service_discovery
end
end
context 'with failures' do
before do
allow(Gitlab::ErrorTracking).to receive(:track_exception)
allow(service).to receive(:sleep)
end
let(:valid_retry_sleep_duration) { satisfy { |val| described_class::RETRY_DELAY_RANGE.include?(val) } }
it 'retries service discovery when under the retry limit' do
error = StandardError.new
expect(service)
.to receive(:refresh_if_necessary)
.and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times.ordered
expect(service)
.to receive(:sleep).with(valid_retry_sleep_duration)
.exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times
expect(service).to receive(:refresh_if_necessary).and_return(45).ordered
expect(service.perform_service_discovery).to eq(45)
end
it 'does not retry service discovery after exceeding the limit' do
error = StandardError.new
expect(service)
.to receive(:refresh_if_necessary)
.and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
expect(service)
.to receive(:sleep).with(valid_retry_sleep_duration)
.exactly(described_class::MAX_DISCOVERY_RETRIES).times
service.perform_service_discovery
end
it 'reports exceptions to Sentry' do
error = StandardError.new
expect(service)
.to receive(:refresh_if_necessary)
.and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
expect(Gitlab::ErrorTracking)
.to receive(:track_exception)
.with(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
service.perform_service_discovery
end
end
end
......@@ -224,6 +275,16 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
expect(service.addresses_from_dns).to eq([90, addresses])
end
end
context 'when the resolver returns an empty response' do
let(:packet) { double(:packet, answer: []) }
let(:record_type) { 'A' }
it 'raises EmptyDnsResponse' do
expect { service.addresses_from_dns }.to raise_error(Gitlab::Database::LoadBalancing::ServiceDiscovery::EmptyDnsResponse)
end
end
end
describe '#new_wait_time_for' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment