Commit cb193cd6 authored by Kamil Trzciński's avatar Kamil Trzciński

Improve resillency of monitor

- Retry connection when it fails
- Properly shutdown daemon
- Stop monitor if the Exception is raised
- Properly guard exception handling
parent 3683d2d2
...@@ -46,7 +46,10 @@ module Gitlab ...@@ -46,7 +46,10 @@ module Gitlab
if thread if thread
thread.wakeup if thread.alive? thread.wakeup if thread.alive?
thread.join unless Thread.current == thread begin
thread.join unless Thread.current == thread
rescue Exception # rubocop:disable Lint/RescueException
end
@thread = nil @thread = nil
end end
end end
......
...@@ -6,6 +6,7 @@ module Gitlab ...@@ -6,6 +6,7 @@ module Gitlab
NOTIFICATION_CHANNEL = 'sidekiq:cancel:notifications'.freeze NOTIFICATION_CHANNEL = 'sidekiq:cancel:notifications'.freeze
CANCEL_DEADLINE = 24.hours.seconds CANCEL_DEADLINE = 24.hours.seconds
RECONNECT_TIME = 3.seconds
# We use exception derived from `Exception` # We use exception derived from `Exception`
# to consider this as an very low-level exception # to consider this as an very low-level exception
...@@ -33,7 +34,8 @@ module Gitlab ...@@ -33,7 +34,8 @@ module Gitlab
action: 'run', action: 'run',
queue: queue, queue: queue,
jid: jid, jid: jid,
canceled: true) canceled: true
)
raise CancelledError raise CancelledError
end end
...@@ -44,12 +46,45 @@ module Gitlab ...@@ -44,12 +46,45 @@ module Gitlab
end end
end end
def self.cancel_job(jid)
payload = {
action: 'cancel',
jid: jid
}.to_json
::Gitlab::Redis::SharedState.with do |redis|
redis.setex(cancel_job_key(jid), CANCEL_DEADLINE, 1)
redis.publish(NOTIFICATION_CHANNEL, payload)
end
end
private
def start_working def start_working
Sidekiq.logger.info( Sidekiq.logger.info(
class: self.class, class: self.class,
action: 'start', action: 'start',
message: 'Starting Monitor Daemon') message: 'Starting Monitor Daemon'
)
while enabled?
process_messages
sleep(RECONNECT_TIME)
end
ensure
Sidekiq.logger.warn(
class: self.class,
action: 'stop',
message: 'Stopping Monitor Daemon'
)
end
def stop_working
thread.raise(Interrupt) if thread.alive?
end
def process_messages
::Gitlab::Redis::SharedState.with do |redis| ::Gitlab::Redis::SharedState.with do |redis|
redis.subscribe(NOTIFICATION_CHANNEL) do |on| redis.subscribe(NOTIFICATION_CHANNEL) do |on|
on.message do |channel, message| on.message do |channel, message|
...@@ -57,39 +92,24 @@ module Gitlab ...@@ -57,39 +92,24 @@ module Gitlab
end end
end end
end end
Sidekiq.logger.warn(
class: self.class,
action: 'stop',
message: 'Stopping Monitor Daemon')
rescue Exception => e # rubocop:disable Lint/RescueException rescue Exception => e # rubocop:disable Lint/RescueException
Sidekiq.logger.warn( Sidekiq.logger.warn(
class: self.class, class: self.class,
action: 'exception', action: 'exception',
message: e.message) message: e.message
raise e )
end
def self.cancel_job(jid)
payload = {
action: 'cancel',
jid: jid
}.to_json
::Gitlab::Redis::SharedState.with do |redis| # we re-raise system exceptions
redis.setex(cancel_job_key(jid), CANCEL_DEADLINE, 1) raise e unless e.is_a?(StandardError)
redis.publish(NOTIFICATION_CHANNEL, payload)
end
end end
private
def process_message(message) def process_message(message)
Sidekiq.logger.info( Sidekiq.logger.info(
class: self.class, class: self.class,
channel: NOTIFICATION_CHANNEL, channel: NOTIFICATION_CHANNEL,
message: 'Received payload on channel', message: 'Received payload on channel',
payload: message) payload: message
)
message = safe_parse(message) message = safe_parse(message)
return unless message return unless message
...@@ -115,14 +135,16 @@ module Gitlab ...@@ -115,14 +135,16 @@ module Gitlab
Thread.new do Thread.new do
# try to find a thread, but with guaranteed # try to find a thread, but with guaranteed
# handle that this thread corresponds to actually running job # that handle for thread corresponds to actually
# running job
find_thread_with_lock(jid) do |thread| find_thread_with_lock(jid) do |thread|
Sidekiq.logger.warn( Sidekiq.logger.warn(
class: self.class, class: self.class,
action: 'cancel', action: 'cancel',
message: 'Canceling thread with CancelledError', message: 'Canceling thread with CancelledError',
jid: jid, jid: jid,
thread_id: thread.object_id) thread_id: thread.object_id
)
thread&.raise(CancelledError) thread&.raise(CancelledError)
end end
......
...@@ -34,12 +34,12 @@ describe Gitlab::Daemon do ...@@ -34,12 +34,12 @@ describe Gitlab::Daemon do
end end
end end
describe 'when Daemon is enabled' do context 'when Daemon is enabled' do
before do before do
allow(subject).to receive(:enabled?).and_return(true) allow(subject).to receive(:enabled?).and_return(true)
end end
describe 'when Daemon is stopped' do context 'when Daemon is stopped' do
describe '#start' do describe '#start' do
it 'starts the Daemon' do it 'starts the Daemon' do
expect { subject.start.join }.to change { subject.thread? }.from(false).to(true) expect { subject.start.join }.to change { subject.thread? }.from(false).to(true)
...@@ -57,14 +57,14 @@ describe Gitlab::Daemon do ...@@ -57,14 +57,14 @@ describe Gitlab::Daemon do
end end
end end
describe 'when Daemon is running' do context 'when Daemon is running' do
before do before do
subject.start.join subject.start
end end
describe '#start' do describe '#start' do
it "doesn't start running Daemon" do it "doesn't start running Daemon" do
expect { subject.start.join }.not_to change { subject.thread? } expect { subject.start.join }.not_to change { subject.thread }
expect(subject).to have_received(:start_working).once expect(subject).to have_received(:start_working).once
end end
...@@ -76,11 +76,29 @@ describe Gitlab::Daemon do ...@@ -76,11 +76,29 @@ describe Gitlab::Daemon do
expect(subject).to have_received(:stop_working) expect(subject).to have_received(:stop_working)
end end
context 'when stop_working raises exception' do
before do
allow(subject).to receive(:start_working) do
sleep(1000)
end
end
it 'shutdowns Daemon' do
expect(subject).to receive(:stop_working) do
subject.thread.raise(Interrupt)
end
expect(subject.thread).to be_alive
expect { subject.stop }.not_to raise_error
expect(subject.thread).to be_nil
end
end
end end
end end
end end
describe 'when Daemon is disabled' do context 'when Daemon is disabled' do
before do before do
allow(subject).to receive(:enabled?).and_return(false) allow(subject).to receive(:enabled?).and_return(false)
end end
......
...@@ -31,19 +31,26 @@ describe Gitlab::SidekiqMonitor do ...@@ -31,19 +31,26 @@ describe Gitlab::SidekiqMonitor do
end end
it 'raises exception' do it 'raises exception' do
expect { monitor.within_job(jid, 'queue') }.to raise_error(described_class::CancelledError) expect { monitor.within_job(jid, 'queue') }.to raise_error(
described_class::CancelledError)
end end
end end
end end
describe '#start_working' do describe '#start_working' do
subject { monitor.start_working } subject { monitor.send(:start_working) }
context 'when structured logging is used' do before do
before do # we want to run at most once cycle
allow_any_instance_of(::Redis).to receive(:subscribe) # we toggle `enabled?` flag after the first call
end stub_const('Gitlab::SidekiqMonitor::RECONNECT_TIME', 0)
allow(monitor).to receive(:enabled?).and_return(true, false)
allow(Sidekiq.logger).to receive(:info)
allow(Sidekiq.logger).to receive(:warn)
end
context 'when structured logging is used' do
it 'logs start message' do it 'logs start message' do
expect(Sidekiq.logger).to receive(:info) expect(Sidekiq.logger).to receive(:info)
.with( .with(
...@@ -51,6 +58,8 @@ describe Gitlab::SidekiqMonitor do ...@@ -51,6 +58,8 @@ describe Gitlab::SidekiqMonitor do
action: 'start', action: 'start',
message: 'Starting Monitor Daemon') message: 'Starting Monitor Daemon')
expect(::Gitlab::Redis::SharedState).to receive(:with)
subject subject
end end
...@@ -61,10 +70,25 @@ describe Gitlab::SidekiqMonitor do ...@@ -61,10 +70,25 @@ describe Gitlab::SidekiqMonitor do
action: 'stop', action: 'stop',
message: 'Stopping Monitor Daemon') message: 'Stopping Monitor Daemon')
expect(::Gitlab::Redis::SharedState).to receive(:with)
subject subject
end end
it 'logs exception message' do it 'logs StandardError message' do
expect(Sidekiq.logger).to receive(:warn)
.with(
class: described_class,
action: 'exception',
message: 'My Exception')
expect(::Gitlab::Redis::SharedState).to receive(:with)
.and_raise(StandardError, 'My Exception')
expect { subject }.not_to raise_error
end
it 'logs and raises Exception message' do
expect(Sidekiq.logger).to receive(:warn) expect(Sidekiq.logger).to receive(:warn)
.with( .with(
class: described_class, class: described_class,
...@@ -78,6 +102,20 @@ describe Gitlab::SidekiqMonitor do ...@@ -78,6 +102,20 @@ describe Gitlab::SidekiqMonitor do
end end
end end
context 'when StandardError is raised' do
it 'does retry connection' do
expect(::Gitlab::Redis::SharedState).to receive(:with)
.and_raise(StandardError, 'My Exception')
expect(::Gitlab::Redis::SharedState).to receive(:with)
# we expect to run `process_messages` twice
expect(monitor).to receive(:enabled?).and_return(true, true, false)
subject
end
end
context 'when message is published' do context 'when message is published' do
let(:subscribed) { double } let(:subscribed) { double }
...@@ -128,6 +166,19 @@ describe Gitlab::SidekiqMonitor do ...@@ -128,6 +166,19 @@ describe Gitlab::SidekiqMonitor do
end end
end end
describe '#stop' do
let!(:monitor_thread) { monitor.start }
it 'does stop the thread' do
expect(monitor_thread).to be_alive
expect { monitor.stop }.not_to raise_error
expect(monitor_thread).not_to be_alive
expect { monitor_thread.value }.to raise_error(Interrupt)
end
end
describe '#process_job_cancel' do describe '#process_job_cancel' do
subject { monitor.send(:process_job_cancel, jid) } subject { monitor.send(:process_job_cancel, jid) }
...@@ -156,6 +207,11 @@ describe Gitlab::SidekiqMonitor do ...@@ -156,6 +207,11 @@ describe Gitlab::SidekiqMonitor do
monitor.jobs_thread[jid] = thread monitor.jobs_thread[jid] = thread
end end
after do
thread.kill
rescue
end
it 'does log cancellation message' do it 'does log cancellation message' do
expect(Sidekiq.logger).to receive(:warn) expect(Sidekiq.logger).to receive(:warn)
.with( .with(
...@@ -175,8 +231,9 @@ describe Gitlab::SidekiqMonitor do ...@@ -175,8 +231,9 @@ describe Gitlab::SidekiqMonitor do
subject.join subject.join
expect(thread).not_to be_alive # we wait for the thread to be cancelled
expect { thread.value }.to raise_error(described_class::CancelledError) # by `process_job_cancel`
expect { thread.join(5) }.to raise_error(described_class::CancelledError)
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment