Commit 893b2da2 authored by Simon Tomlinson

Run service discovery with up to 3 retries

When Postgres service discovery fails, run it up to two more times. This
helps recover from transient errors with the DNS nameserver.
parent 4773f35a
@@ -13,11 +13,17 @@ module Gitlab
       # balancer with said hosts. Requests may continue to use the old hosts
       # until they complete.
       class ServiceDiscovery
+        EmptyDnsResponse = Class.new(StandardError)
+
         attr_reader :interval, :record, :record_type, :disconnect_timeout,
                     :load_balancer
 
         MAX_SLEEP_ADJUSTMENT = 10
 
+        MAX_DISCOVERY_RETRIES = 3
+
+        RETRY_DELAY_RANGE = (0.1..0.2).freeze
+
         RECORD_TYPES = {
           'A' => Net::DNS::A,
           'SRV' => Net::DNS::SRV
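The two new constants drive the retry loop introduced in the next hunk: MAX_DISCOVERY_RETRIES caps the number of attempts, and RETRY_DELAY_RANGE bounds the jittered pause between them. As a standalone sketch (not part of the change itself), Kernel#rand called with a Float range returns a Float within that range, so each retry waits roughly 100-200 ms:

    # Standalone illustration: rand with a Float range yields a Float inside
    # the range, giving each process slightly different back-off timing.
    RETRY_DELAY_RANGE = (0.1..0.2).freeze

    3.times do |attempt|
      delay = rand(RETRY_DELAY_RANGE)
      puts format('attempt %d would sleep for %.3fs', attempt + 1, delay)
    end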
@@ -76,15 +82,21 @@ module Gitlab
         end
 
         def perform_service_discovery
-          refresh_if_necessary
-        rescue StandardError => error
-          # Any exceptions that might occur should be reported to
-          # Sentry, instead of silently terminating this thread.
-          Gitlab::ErrorTracking.track_exception(error)
-
-          Gitlab::AppLogger.error(
-            "Service discovery encountered an error: #{error.message}"
-          )
+          MAX_DISCOVERY_RETRIES.times do
+            return refresh_if_necessary
+          rescue StandardError => error
+            # Any exceptions that might occur should be reported to
+            # Sentry, instead of silently terminating this thread.
+            Gitlab::ErrorTracking.track_exception(error)
+
+            Gitlab::AppLogger.error(
+              "Service discovery encountered an error: #{error.message}"
+            )
+
+            # Slightly randomize the retry delay so that, in the case of a total
+            # dns outage, all starting services do not pressure the dns server at the same time.
+            sleep(rand(RETRY_DELAY_RANGE))
+          end
 
           interval
         end
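The rewritten perform_service_discovery relies on two Ruby details: a block body may carry its own rescue clause without an explicit begin (Ruby 2.5+), and the return inside the times block exits the whole method as soon as one refresh succeeds. If every attempt raises, the loop simply completes and the method falls through to return interval. A minimal standalone sketch of the same control flow (method names here are illustrative, not the GitLab code):

    MAX_ATTEMPTS = 3

    # Illustrative stand-in for refresh_if_necessary: fails transiently.
    def do_work
      raise 'transient failure' if rand < 0.5

      :refreshed
    end

    def fetch_with_retries
      MAX_ATTEMPTS.times do
        return do_work # first success exits both the loop and the method
      rescue StandardError => error
        warn "attempt failed: #{error.message}"
        sleep(rand(0.1..0.2)) # jitter so restarting processes do not retry in lockstep
      end

      :fallback # reached only when every attempt raised
    end

    puts fetch_with_retries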
@@ -156,6 +168,8 @@ module Gitlab
               addresses_from_srv_record(response)
             end
 
+          raise EmptyDnsResponse if addresses.empty?
+
           # Addresses are sorted so we can directly compare the old and new
           # addresses, without having to use any additional data structures.
           [new_wait_time_for(resources), addresses.sort]
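Because EmptyDnsResponse is built with Class.new(StandardError), an empty DNS answer now surfaces as an exception that the blanket rescue StandardError in perform_service_discovery catches, so it becomes one more retryable failure instead of silently replacing the host list with an empty one. A tiny standalone illustration:

    # Standalone illustration: an anonymous StandardError subclass is rescued
    # by the same clause that handles ordinary resolver errors.
    EmptyDnsResponse = Class.new(StandardError)

    begin
      raise EmptyDnsResponse, 'no addresses in DNS answer'
    rescue StandardError => error
      puts "retryable: #{error.class} (#{error.message})"
    end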
...
@@ -69,18 +69,69 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
   end
 
   describe '#perform_service_discovery' do
-    it 'reports exceptions to Sentry' do
-      error = StandardError.new
-
-      expect(service)
-        .to receive(:refresh_if_necessary)
-        .and_raise(error)
-
-      expect(Gitlab::ErrorTracking)
-        .to receive(:track_exception)
-        .with(error)
-
-      service.perform_service_discovery
+    context 'without any failures' do
+      it 'runs once' do
+        expect(service)
+          .to receive(:refresh_if_necessary).once
+
+        expect(service).not_to receive(:sleep)
+
+        expect(Gitlab::ErrorTracking).not_to receive(:track_exception)
+
+        service.perform_service_discovery
+      end
+    end
+
+    context 'with failures' do
+      before do
+        allow(Gitlab::ErrorTracking).to receive(:track_exception)
+        allow(service).to receive(:sleep)
+      end
+
+      let(:valid_retry_sleep_duration) { satisfy { |val| described_class::RETRY_DELAY_RANGE.include?(val) } }
+
+      it 'retries service discovery when under the retry limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times.ordered
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times
+        expect(service).to receive(:refresh_if_necessary).and_return(45).ordered
+
+        expect(service.perform_service_discovery).to eq(45)
+      end
+
+      it 'does not retry service discovery after exceeding the limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
+
+      it 'reports exceptions to Sentry' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        expect(Gitlab::ErrorTracking)
+          .to receive(:track_exception)
+          .with(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
     end
   end
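The valid_retry_sleep_duration helper above works because any RSpec matcher can be passed to with as an argument constraint, so the specs accept whichever jittered duration rand produced as long as it falls inside RETRY_DELAY_RANGE. A small standalone example of the same pattern (the spy and the pause method name are illustrative only):

    # Standalone sketch: `satisfy` used as an argument matcher on a spy.
    RSpec.describe 'jittered delay matching' do
      let(:range) { (0.1..0.2) }
      let(:in_range) { satisfy { |val| range.include?(val) } }

      it 'accepts any argument inside the range' do
        worker = spy('worker')

        worker.pause(0.15)

        expect(worker).to have_received(:pause).with(in_range)
      end
    end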
@@ -224,6 +275,16 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
         expect(service.addresses_from_dns).to eq([90, addresses])
       end
     end
+
+    context 'when the resolver returns an empty response' do
+      let(:packet) { double(:packet, answer: []) }
+      let(:record_type) { 'A' }
+
+      it 'raises EmptyDnsResponse' do
+        expect { service.addresses_from_dns }.to raise_error(Gitlab::Database::LoadBalancing::ServiceDiscovery::EmptyDnsResponse)
+      end
+    end
   end
 
   describe '#new_wait_time_for' do
...