Commit da77dc56 authored by Michael Kozono's avatar Michael Kozono

Exit geo-logcursor if failing for too long

If the first error was over 30 minutes ago,
and there were no successes after that,
then the next error will cause the loop to exit.
parent f0913527
...@@ -7,12 +7,14 @@ module Gitlab ...@@ -7,12 +7,14 @@ module Gitlab
VERSION = '0.2.0'.freeze VERSION = '0.2.0'.freeze
BATCH_SIZE = 250 BATCH_SIZE = 250
SECONDARY_CHECK_INTERVAL = 60 SECONDARY_CHECK_INTERVAL = 60
MAX_ERROR_DURATION = 1800
attr_reader :options attr_reader :options
def initialize(options = {}) def initialize(options = {})
@options = options @options = options
@exit = false @exit = false
@failing_since = nil
end end
def run! def run!
...@@ -28,6 +30,9 @@ module Gitlab ...@@ -28,6 +30,9 @@ module Gitlab
end end
lease = Lease.try_obtain_with_ttl { run_once! } lease = Lease.try_obtain_with_ttl { run_once! }
handle_error(lease[:error])
return if exit? return if exit?
# When no new event is found sleep for a few moments # When no new event is found sleep for a few moments
...@@ -49,6 +54,29 @@ module Gitlab ...@@ -49,6 +54,29 @@ module Gitlab
private private
def handle_error(did_error)
track_failing_since(did_error)
if excessive_errors?
logger.error("#run!: Exiting due to consecutive errors for over #{MAX_ERROR_DURATION} seconds")
@exit = true
end
end
def track_failing_since(did_error)
if did_error
@failing_since ||= Time.now
else
@failing_since = nil
end
end
def excessive_errors?
return if @failing_since.nil?
MAX_ERROR_DURATION < (Time.now - @failing_since)
end
def sleep_break(seconds) def sleep_break(seconds)
while seconds > 0 while seconds > 0
sleep(1) sleep(1)
......
...@@ -43,7 +43,7 @@ module Gitlab ...@@ -43,7 +43,7 @@ module Gitlab
Gitlab::ExclusiveLease.cancel(LEASE_KEY, lease[:uuid]) Gitlab::ExclusiveLease.cancel(LEASE_KEY, lease[:uuid])
{ uuid: false, ttl: LEASE_TIMEOUT } { uuid: false, ttl: LEASE_TIMEOUT, error: true }
end end
end end
......
...@@ -22,16 +22,26 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do ...@@ -22,16 +22,26 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
allow(daemon).to receive(:arbitrary_sleep).and_return(0.1) allow(daemon).to receive(:arbitrary_sleep).and_return(0.1)
end end
# WARNINGS
#
# 1. Ensure an exit condition for the main run! loop, or RSpec will not stop
# without an interrupt.
#
# I recommend using `ensure_exit_on`.
#
# 2. run! occasionally spawns git processes that run forever at 100% CPU.
#
# I don't know why this happens.
describe '#run!' do describe '#run!' do
it 'traps signals' do it 'traps signals' do
is_expected.to receive(:exit?).and_return(true) ensure_exit_on(1)
is_expected.to receive(:trap_signals) is_expected.to receive(:trap_signals)
daemon.run! daemon.run!
end end
it 'delegates to #run_once! in a loop' do it 'delegates to #run_once! in a loop' do
is_expected.to receive(:exit?).and_return(false, false, false, true) ensure_exit_on(4)
is_expected.to receive(:run_once!).twice is_expected.to receive(:run_once!).twice
daemon.run! daemon.run!
...@@ -44,7 +54,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do ...@@ -44,7 +54,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
allow(lease).to receive(:same_uuid?).and_return(false) allow(lease).to receive(:same_uuid?).and_return(false)
allow(Gitlab::Geo::LogCursor::Lease).to receive(:exclusive_lease).and_return(lease) allow(Gitlab::Geo::LogCursor::Lease).to receive(:exclusive_lease).and_return(lease)
is_expected.to receive(:exit?).and_return(false, true) ensure_exit_on(2)
is_expected.not_to receive(:run_once!) is_expected.not_to receive(:run_once!)
daemon.run! daemon.run!
...@@ -53,7 +63,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do ...@@ -53,7 +63,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
it 'skips execution if not a Geo node' do it 'skips execution if not a Geo node' do
stub_current_geo_node(nil) stub_current_geo_node(nil)
is_expected.to receive(:exit?).and_return(false, true) ensure_exit_on(2)
is_expected.to receive(:sleep_break).with(1.minute) is_expected.to receive(:sleep_break).with(1.minute)
is_expected.not_to receive(:run_once!) is_expected.not_to receive(:run_once!)
...@@ -63,12 +73,61 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do ...@@ -63,12 +73,61 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
it 'skips execution if the current node is a primary' do it 'skips execution if the current node is a primary' do
stub_current_geo_node(primary) stub_current_geo_node(primary)
is_expected.to receive(:exit?).and_return(false, true) ensure_exit_on(2)
is_expected.to receive(:sleep_break).with(1.minute) is_expected.to receive(:sleep_break).with(1.minute)
is_expected.not_to receive(:run_once!) is_expected.not_to receive(:run_once!)
daemon.run! daemon.run!
end end
context 'when run! has handled an error every call for over the allowed duration' do
it 'exits' do
# Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior.
# If `exit?` is called a third time, then the "exit if failing for too long" behavior is broken.
expect(daemon).to receive(:exit?).and_call_original.twice
Timecop.freeze do
daemon.send(:handle_error, true)
Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
expect(daemon).to receive(:gap_tracking).and_raise('boom')
is_expected.to receive(:run_once!).and_call_original.once
daemon.run!
end
end
end
end
context 'when run_once! has returned one call without raising an error before the allowed duration' do
it 'does not exit' do
# Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior
expect(daemon).to receive(:exit?).and_call_original.exactly(5).times
# Force exit on 6th call
# If `exit?` is not called 6 times, then the daemon stopped too early.
expect(daemon).to receive(:exit?).and_return(true)
Timecop.freeze do
# As if an error occurred
daemon.send(:handle_error, true)
Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
# First, a successful run to reset the timer
expect(daemon).to receive(:gap_tracking).and_call_original
# Then more errors
expect(daemon).to receive(:gap_tracking).and_raise('boom').twice
# It should continue running until we force it to exit
is_expected.to receive(:run_once!).and_call_original.exactly(3).times
daemon.run!
end
end
end
end
end end
describe '#run_once!' do describe '#run_once!' do
...@@ -259,4 +318,20 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do ...@@ -259,4 +318,20 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
gaps gaps
end end
# It is extremely easy to get run! into an infinite loop.
#
# Regardless of `allow` or `expect`, this method ensures that the loop will
# exit at the specified number of exit? calls.
def ensure_exit_on(num_calls = 3, expect = true)
# E.g. If num_calls is `3`, returns is set to `[false, false, true]`.
returns = Array.new(num_calls) { false }
returns[-1] = true
if expect
expect(daemon).to receive(:exit?).and_return(*returns)
else
allow(daemon).to receive(:exit?).and_return(*returns)
end
end
end end
...@@ -33,7 +33,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do ...@@ -33,7 +33,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do
end end
end end
describe '.try_obtain_lease_with_ttl' do describe '.try_obtain_with_ttl' do
it 'returns zero when there is no lease' do it 'returns zero when there is no lease' do
result = described_class.try_obtain_with_ttl {} result = described_class.try_obtain_with_ttl {}
...@@ -78,6 +78,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do ...@@ -78,6 +78,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do
expect(result[:ttl]).to be > 0 expect(result[:ttl]).to be > 0
expect(result[:uuid]).to be false expect(result[:uuid]).to be false
expect(result[:error]).to be true
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment