Commit 893b2da2 authored by Simon Tomlinson

Run service discovery with up to 3 retries

When Postgres service discovery fails, run it up to two more times. This
helps recover from transient errors with the DNS nameserver.
parent 4773f35a
@@ -13,11 +13,17 @@ module Gitlab
       # balancer with said hosts. Requests may continue to use the old hosts
       # until they complete.
       class ServiceDiscovery
+        EmptyDnsResponse = Class.new(StandardError)
+
         attr_reader :interval, :record, :record_type, :disconnect_timeout,
                     :load_balancer
 
         MAX_SLEEP_ADJUSTMENT = 10
 
+        MAX_DISCOVERY_RETRIES = 3
+
+        RETRY_DELAY_RANGE = (0.1..0.2).freeze
+
         RECORD_TYPES = {
           'A' => Net::DNS::A,
           'SRV' => Net::DNS::SRV
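The two new constants drive the retry loop introduced in the next hunk: MAX_DISCOVERY_RETRIES caps the number of attempts, and RETRY_DELAY_RANGE bounds the jittered pause between them. As a standalone sketch (not part of the change itself), Kernel#rand called with a Float range returns a Float within that range, so each retry waits roughly 100-200 ms:

    # Standalone illustration: rand with a Float range yields a Float inside
    # the range, giving each process slightly different back-off timing.
    RETRY_DELAY_RANGE = (0.1..0.2).freeze

    3.times do |attempt|
      delay = rand(RETRY_DELAY_RANGE)
      puts format('attempt %d would sleep for %.3fs', attempt + 1, delay)
    end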
@@ -76,15 +82,21 @@ module Gitlab
         end
 
         def perform_service_discovery
-          refresh_if_necessary
-        rescue StandardError => error
-          # Any exceptions that might occur should be reported to
-          # Sentry, instead of silently terminating this thread.
-          Gitlab::ErrorTracking.track_exception(error)
-
-          Gitlab::AppLogger.error(
-            "Service discovery encountered an error: #{error.message}"
-          )
+          MAX_DISCOVERY_RETRIES.times do
+            return refresh_if_necessary
+          rescue StandardError => error
+            # Any exceptions that might occur should be reported to
+            # Sentry, instead of silently terminating this thread.
+            Gitlab::ErrorTracking.track_exception(error)
+
+            Gitlab::AppLogger.error(
+              "Service discovery encountered an error: #{error.message}"
+            )
+
+            # Slightly randomize the retry delay so that, in the case of a total
+            # dns outage, all starting services do not pressure the dns server at the same time.
+            sleep(rand(RETRY_DELAY_RANGE))
+          end
 
           interval
         end
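The rewritten perform_service_discovery relies on two Ruby details: a block body may carry its own rescue clause without an explicit begin (Ruby 2.5+), and the return inside the times block exits the whole method as soon as one refresh succeeds. If every attempt raises, the loop simply completes and the method falls through to return interval. A minimal standalone sketch of the same control flow (method names here are illustrative, not the GitLab code):

    MAX_ATTEMPTS = 3

    # Illustrative stand-in for refresh_if_necessary: fails transiently.
    def do_work
      raise 'transient failure' if rand < 0.5

      :refreshed
    end

    def fetch_with_retries
      MAX_ATTEMPTS.times do
        return do_work # first success exits both the loop and the method
      rescue StandardError => error
        warn "attempt failed: #{error.message}"
        sleep(rand(0.1..0.2)) # jitter so restarting processes do not retry in lockstep
      end

      :fallback # reached only when every attempt raised
    end

    puts fetch_with_retries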
@@ -156,6 +168,8 @@ module Gitlab
               addresses_from_srv_record(response)
             end
 
+          raise EmptyDnsResponse if addresses.empty?
+
           # Addresses are sorted so we can directly compare the old and new
           # addresses, without having to use any additional data structures.
           [new_wait_time_for(resources), addresses.sort]
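Because EmptyDnsResponse is built with Class.new(StandardError), an empty DNS answer now surfaces as an exception that the blanket rescue StandardError in perform_service_discovery catches, so it becomes one more retryable failure instead of silently replacing the host list with an empty one. A tiny standalone illustration:

    # Standalone illustration: an anonymous StandardError subclass is rescued
    # by the same clause that handles ordinary resolver errors.
    EmptyDnsResponse = Class.new(StandardError)

    begin
      raise EmptyDnsResponse, 'no addresses in DNS answer'
    rescue StandardError => error
      puts "retryable: #{error.class} (#{error.message})"
    end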
...
@@ -69,18 +69,69 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
   end
 
   describe '#perform_service_discovery' do
-    it 'reports exceptions to Sentry' do
-      error = StandardError.new
-
-      expect(service)
-        .to receive(:refresh_if_necessary)
-        .and_raise(error)
-
-      expect(Gitlab::ErrorTracking)
-        .to receive(:track_exception)
-        .with(error)
-
-      service.perform_service_discovery
+    context 'without any failures' do
+      it 'runs once' do
+        expect(service)
+          .to receive(:refresh_if_necessary).once
+
+        expect(service).not_to receive(:sleep)
+
+        expect(Gitlab::ErrorTracking).not_to receive(:track_exception)
+
+        service.perform_service_discovery
+      end
+    end
+
+    context 'with failures' do
+      before do
+        allow(Gitlab::ErrorTracking).to receive(:track_exception)
+        allow(service).to receive(:sleep)
+      end
+
+      let(:valid_retry_sleep_duration) { satisfy { |val| described_class::RETRY_DELAY_RANGE.include?(val) } }
+
+      it 'retries service discovery when under the retry limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times.ordered
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times
+        expect(service).to receive(:refresh_if_necessary).and_return(45).ordered
+
+        expect(service.perform_service_discovery).to eq(45)
+      end
+
+      it 'does not retry service discovery after exceeding the limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
+
+      it 'reports exceptions to Sentry' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        expect(Gitlab::ErrorTracking)
+          .to receive(:track_exception)
+          .with(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
     end
   end
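The valid_retry_sleep_duration helper above works because any RSpec matcher can be passed to with as an argument constraint, so the specs accept whichever jittered duration rand produced as long as it falls inside RETRY_DELAY_RANGE. A small standalone example of the same pattern (the spy and the pause method name are illustrative only):

    # Standalone sketch: `satisfy` used as an argument matcher on a spy.
    RSpec.describe 'jittered delay matching' do
      let(:range) { (0.1..0.2) }
      let(:in_range) { satisfy { |val| range.include?(val) } }

      it 'accepts any argument inside the range' do
        worker = spy('worker')

        worker.pause(0.15)

        expect(worker).to have_received(:pause).with(in_range)
      end
    end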
@@ -224,6 +275,16 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
         expect(service.addresses_from_dns).to eq([90, addresses])
       end
     end
+
+    context 'when the resolver returns an empty response' do
+      let(:packet) { double(:packet, answer: []) }
+      let(:record_type) { 'A' }
+
+      it 'raises EmptyDnsResponse' do
+        expect { service.addresses_from_dns }.to raise_error(Gitlab::Database::LoadBalancing::ServiceDiscovery::EmptyDnsResponse)
+      end
+    end
   end
 
   describe '#new_wait_time_for' do
...