Commit 9dd56719 authored by Craig Miskell's avatar Craig Miskell Committed by Sean McGivern

Be more vigorous in killing workers when shutting down sidekiq-cluster

In https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/8511
and https://gitlab.com/gitlab-com/gl-infra/production/issues/1309
we have seen cases where sidekiq-cluster has been asked to restart,
but it has left one or more sidekiq worker processes in a stuck/hung
state burning CPU.

With this, sidekiq-cluster now waits for the worker processes to
terminate themselves cleanly (and push jobs back onto the queue) and
another few seconds beyond, then if any processes remain, kills them
hard (:KILL signal) before exiting as usual (allowing whater process
monitor is in place to restart sidekiq-cluster)
parent 16363f28
......@@ -143,6 +143,14 @@ module Gitlab
true
end
def self.any_alive?(pids)
pids_alive(pids).any?
end
def self.pids_alive(pids)
pids.select { |pid| signal(pid, 0) }
end
def self.write_pid(path)
File.open(path, 'w') do |handle|
handle.write(Process.pid.to_s)
......
......@@ -7,6 +7,12 @@ require 'time'
module Gitlab
module SidekiqCluster
class CLI
# How often to check for processes when terminating
CHECK_TERMINATE_INTERVAL_SECONDS = 1
# How long to wait in total when asking for a clean termination
# Sidekiq default to self-terminate is 25s
TERMINATE_TIMEOUT_SECONDS = 30
CommandError = Class.new(StandardError)
def initialize(log_output = STDERR)
......@@ -63,10 +69,19 @@ module Gitlab
SidekiqCluster.write_pid(@pid) if @pid
end
def wait_for_termination(check_interval = CHECK_TERMINATE_INTERVAL_SECONDS, terminate_timeout = TERMINATE_TIMEOUT_SECONDS)
deadline = Gitlab::Metrics::System.monotonic_time + terminate_timeout
sleep(check_interval) while SidekiqCluster.any_alive?(@processes) && Gitlab::Metrics::System.monotonic_time < deadline
# Hard-stop any that remain; they're likely stuck
SidekiqCluster.signal_processes(SidekiqCluster.pids_alive(@processes), :KILL)
end
def trap_signals
SidekiqCluster.trap_terminate do |signal|
@alive = false
SidekiqCluster.signal_processes(@processes, signal)
wait_for_termination
end
SidekiqCluster.trap_forward do |signal|
......
......@@ -84,6 +84,51 @@ describe Gitlab::SidekiqCluster::CLI do
end
end
describe '#wait_for_termination' do
it 'waits for termination of all sub-processes and succeeds after 3 checks' do
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(an_instance_of(Array)).and_return(true, true, true, false)
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with([]).and_return([])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([], :KILL)
# Check every 0.1s, for no more than 1 second total
cli.wait_for_termination(0.1, 1)
end
context 'with hanging workers' do
before do
expect(cli).to receive(:write_pid)
expect(cli).to receive(:trap_signals)
expect(cli).to receive(:start_loop)
end
it 'hard kills workers after timeout expires' do
worker_pids = [101, 102, 103]
expect(Gitlab::SidekiqCluster).to receive(:start)
.with([['foo']], default_options)
.and_return(worker_pids)
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(worker_pids).and_return(true).at_least(10).times
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with(worker_pids).and_return([102])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([102], :KILL)
cli.run(%w(foo))
# Check every 0.1s, for no more than 1 second total
cli.wait_for_termination(0.1, 1)
end
end
end
describe '#trap_signals' do
it 'traps the termination and forwarding signals' do
expect(Gitlab::SidekiqCluster).to receive(:trap_terminate)
......
......@@ -146,6 +146,26 @@ describe Gitlab::SidekiqCluster do
end
end
describe '.any_alive?' do
it 'returns true if at least one process is alive' do
processes = [1, 2]
allow(described_class).to receive(:signal).with(1, 0).and_return(true)
allow(described_class).to receive(:signal).with(2, 0).and_return(false)
expect(described_class.any_alive?(processes)).to eq(true)
end
it 'returns false when all threads are dead' do
processes = [1, 2]
allow(described_class).to receive(:signal).with(1, 0).and_return(false)
allow(described_class).to receive(:signal).with(2, 0).and_return(false)
expect(described_class.any_alive?(processes)).to eq(false)
end
end
describe '.write_pid' do
it 'writes the PID of the current process to the given file' do
handle = double(:handle)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment