Commit da77dc56 authored by Michael Kozono's avatar Michael Kozono

Exit geo-logcursor if failing for too long

If the first error was over 30 minutes ago,
and there were no successes after that,
then the next error will cause the loop to exit.
parent f0913527
......@@ -7,12 +7,14 @@ module Gitlab
VERSION = '0.2.0'.freeze
BATCH_SIZE = 250
SECONDARY_CHECK_INTERVAL = 60
MAX_ERROR_DURATION = 1800
attr_reader :options
def initialize(options = {})
@options = options
@exit = false
@failing_since = nil
end
def run!
......@@ -28,6 +30,9 @@ module Gitlab
end
lease = Lease.try_obtain_with_ttl { run_once! }
handle_error(lease[:error])
return if exit?
# When no new event is found sleep for a few moments
......@@ -49,6 +54,29 @@ module Gitlab
private
def handle_error(did_error)
track_failing_since(did_error)
if excessive_errors?
logger.error("#run!: Exiting due to consecutive errors for over #{MAX_ERROR_DURATION} seconds")
@exit = true
end
end
def track_failing_since(did_error)
if did_error
@failing_since ||= Time.now
else
@failing_since = nil
end
end
def excessive_errors?
return if @failing_since.nil?
MAX_ERROR_DURATION < (Time.now - @failing_since)
end
def sleep_break(seconds)
while seconds > 0
sleep(1)
......
......@@ -43,7 +43,7 @@ module Gitlab
Gitlab::ExclusiveLease.cancel(LEASE_KEY, lease[:uuid])
{ uuid: false, ttl: LEASE_TIMEOUT }
{ uuid: false, ttl: LEASE_TIMEOUT, error: true }
end
end
......
......@@ -22,16 +22,26 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
allow(daemon).to receive(:arbitrary_sleep).and_return(0.1)
end
# WARNINGS
#
# 1. Ensure an exit condition for the main run! loop, or RSpec will not stop
# without an interrupt.
#
# I recommend using `ensure_exit_on`.
#
# 2. run! occasionally spawns git processes that run forever at 100% CPU.
#
# I don't know why this happens.
describe '#run!' do
it 'traps signals' do
is_expected.to receive(:exit?).and_return(true)
ensure_exit_on(1)
is_expected.to receive(:trap_signals)
daemon.run!
end
it 'delegates to #run_once! in a loop' do
is_expected.to receive(:exit?).and_return(false, false, false, true)
ensure_exit_on(4)
is_expected.to receive(:run_once!).twice
daemon.run!
......@@ -44,7 +54,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
allow(lease).to receive(:same_uuid?).and_return(false)
allow(Gitlab::Geo::LogCursor::Lease).to receive(:exclusive_lease).and_return(lease)
is_expected.to receive(:exit?).and_return(false, true)
ensure_exit_on(2)
is_expected.not_to receive(:run_once!)
daemon.run!
......@@ -53,7 +63,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
it 'skips execution if not a Geo node' do
stub_current_geo_node(nil)
is_expected.to receive(:exit?).and_return(false, true)
ensure_exit_on(2)
is_expected.to receive(:sleep_break).with(1.minute)
is_expected.not_to receive(:run_once!)
......@@ -63,12 +73,61 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
it 'skips execution if the current node is a primary' do
stub_current_geo_node(primary)
is_expected.to receive(:exit?).and_return(false, true)
ensure_exit_on(2)
is_expected.to receive(:sleep_break).with(1.minute)
is_expected.not_to receive(:run_once!)
daemon.run!
end
context 'when run! has handled an error every call for over the allowed duration' do
it 'exits' do
# Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior.
# If `exit?` is called a third time, then the "exit if failing for too long" behavior is broken.
expect(daemon).to receive(:exit?).and_call_original.twice
Timecop.freeze do
daemon.send(:handle_error, true)
Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
expect(daemon).to receive(:gap_tracking).and_raise('boom')
is_expected.to receive(:run_once!).and_call_original.once
daemon.run!
end
end
end
end
context 'when run_once! has returned one call without raising an error before the allowed duration' do
it 'does not exit' do
# Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior
expect(daemon).to receive(:exit?).and_call_original.exactly(5).times
# Force exit on 6th call
# If `exit?` is not called 6 times, then the daemon stopped too early.
expect(daemon).to receive(:exit?).and_return(true)
Timecop.freeze do
# As if an error occurred
daemon.send(:handle_error, true)
Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
# First, a successful run to reset the timer
expect(daemon).to receive(:gap_tracking).and_call_original
# Then more errors
expect(daemon).to receive(:gap_tracking).and_raise('boom').twice
# It should continue running until we force it to exit
is_expected.to receive(:run_once!).and_call_original.exactly(3).times
daemon.run!
end
end
end
end
end
describe '#run_once!' do
......@@ -259,4 +318,20 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
gaps
end
# It is extremely easy to get run! into an infinite loop.
#
# Regardless of `allow` or `expect`, this method ensures that the loop will
# exit at the specified number of exit? calls.
def ensure_exit_on(num_calls = 3, expect = true)
# E.g. If num_calls is `3`, returns is set to `[false, false, true]`.
returns = Array.new(num_calls) { false }
returns[-1] = true
if expect
expect(daemon).to receive(:exit?).and_return(*returns)
else
allow(daemon).to receive(:exit?).and_return(*returns)
end
end
end
......@@ -33,7 +33,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do
end
end
describe '.try_obtain_lease_with_ttl' do
describe '.try_obtain_with_ttl' do
it 'returns zero when there is no lease' do
result = described_class.try_obtain_with_ttl {}
......@@ -78,6 +78,7 @@ describe Gitlab::Geo::LogCursor::Lease, :clean_gitlab_redis_shared_state do
expect(result[:ttl]).to be > 0
expect(result[:uuid]).to be false
expect(result[:error]).to be true
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment