Commit 5c80bbb3 authored by Stan Hu's avatar Stan Hu

Merge branch '40396-sidekiq-in-process-group' into 'master'

sidekiq: terminate child processes at shutdown

See merge request gitlab-org/gitlab-ce!25669
parents 088128e6 f0c52df5
---
title: 'sidekiq: terminate child processes at shutdown'
merge_request: 25669
author:
type: added
......@@ -77,6 +77,9 @@ Sidekiq.configure_server do |config|
# Avoid autoload issue such as 'Mail::Parsers::AddressStruct'
# https://github.com/mikel/mail/issues/912#issuecomment-214850355
Mail.eager_autoload!
# Ensure the whole process group is terminated if possible
Gitlab::SidekiqSignals.install!(Sidekiq::CLI::SIGNAL_HANDLERS)
end
Sidekiq.configure_client do |config|
......
......@@ -17,6 +17,11 @@ With the default settings, the MemoryKiller will cause a Sidekiq restart no
more often than once every 15 minutes, with the restart causing about one
minute of delay for incoming background jobs.
Some background jobs rely on long-running external processes. To ensure these
are cleanly terminated when Sidekiq is restarted, each Sidekiq process should be
run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the
`bin/background_jobs` script with `runit` installed, this is handled for you.
## Configuring the MemoryKiller
The MemoryKiller is controlled using environment variables.
......
......@@ -36,11 +36,13 @@ module Gitlab
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
# moments to finish, killing and requeuing them if they didn't, and
# then terminating itself.
# then terminating itself. Sidekiq will replicate the TERM to all its
# children if it can.
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
# Kill the whole pgroup, so we can be sure no children are left behind
wait_and_signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
end
end
......@@ -53,6 +55,17 @@ module Gitlab
output.to_i
end
# If this sidekiq process is pgroup leader, signal to the whole pgroup
def wait_and_signal_pgroup(time, signal, explanation)
return wait_and_signal(time, signal, explanation) unless Process.getpgrp == pid
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
sleep(time)
Sidekiq.logger.warn "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
Process.kill(signal, "-#{pid}")
end
def wait_and_signal(time, signal, explanation)
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
sleep(time)
......
# frozen_string_literal: true
module Gitlab
# As a process group leader, we can ensure that children of sidekiq are killed
# at the same time as sidekiq itself, to stop long-lived children from being
# reparented to init and "escaping". To do this, we override the default
# handlers used by sidekiq for INT and TERM signals
module SidekiqSignals
REPLACE_SIGNALS = %w[INT TERM].freeze
SIDEKIQ_CHANGED_MESSAGE =
"Intercepting signal handlers: #{REPLACE_SIGNALS.join(", ")} failed. " \
"Sidekiq should have registered them, but appears not to have done so."
def self.install!(sidekiq_handlers)
# This only works if we're process group leader
return unless Process.getpgrp == Process.pid
raise SIDEKIQ_CHANGED_MESSAGE unless
REPLACE_SIGNALS == sidekiq_handlers.keys & REPLACE_SIGNALS
REPLACE_SIGNALS.each do |signal|
old_handler = sidekiq_handlers[signal]
sidekiq_handlers[signal] = ->(cli) do
blindly_signal_pgroup!(signal)
old_handler.call(cli)
end
end
end
# The process group leader can forward INT and TERM signals to the whole
# group. However, the forwarded signal is *also* received by the leader,
# which could lead to an infinite loop. We can avoid this by temporarily
# ignoring the forwarded signal. This may cause us to miss some repeated
# signals from outside the process group, but that isn't fatal.
def self.blindly_signal_pgroup!(signal)
old_trap = trap(signal, 'IGNORE')
Process.kill(signal, "-#{Process.getpgrp}")
trap(signal, old_trap)
end
end
end
......@@ -35,7 +35,7 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
stub_const("#{described_class}::MAX_RSS", 5.kilobytes)
end
it 'sends the STP, TERM and KILL signals at expected times' do
it 'sends the TSTP, TERM and KILL signals at expected times' do
expect(subject).to receive(:sleep).with(15 * 60).ordered
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
......@@ -47,6 +47,17 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
run
end
it 'sends TSTP and TERM to the pid, but KILL to the pgroup, when running as process leader' do
allow(Process).to receive(:getpgrp) { pid }
allow(subject).to receive(:sleep)
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
expect(Process).to receive(:kill).with('SIGTERM', pid).ordered
expect(Process).to receive(:kill).with('SIGKILL', "-#{pid}").ordered
run
end
end
context 'when MAX_RSS is not exceeded' do
......
require 'spec_helper'
describe Gitlab::SidekiqSignals do
describe '.install' do
let(:result) { Hash.new { |h, k| h[k] = 0 } }
let(:int_handler) { -> (_) { result['INT'] += 1 } }
let(:term_handler) { -> (_) { result['TERM'] += 1 } }
let(:other_handler) { -> (_) { result['OTHER'] += 1 } }
let(:signals) { { 'INT' => int_handler, 'TERM' => term_handler, 'OTHER' => other_handler } }
context 'not a process group leader' do
before do
allow(Process).to receive(:getpgrp) { Process.pid * 2 }
end
it 'does nothing' do
expect { described_class.install!(signals) }
.not_to change { signals }
end
end
context 'as a process group leader' do
before do
allow(Process).to receive(:getpgrp) { Process.pid }
end
it 'installs its own signal handlers for TERM and INT only' do
described_class.install!(signals)
expect(signals['INT']).not_to eq(int_handler)
expect(signals['TERM']).not_to eq(term_handler)
expect(signals['OTHER']).to eq(other_handler)
end
%w[INT TERM].each do |signal|
it "installs a forwarding signal handler for #{signal}" do
described_class.install!(signals)
expect(described_class)
.to receive(:trap)
.with(signal, 'IGNORE')
.and_return(:original_trap)
.ordered
expect(Process)
.to receive(:kill)
.with(signal, "-#{Process.pid}")
.ordered
expect(described_class)
.to receive(:trap)
.with(signal, :original_trap)
.ordered
signals[signal].call(:cli)
expect(result[signal]).to eq(1)
end
it "raises if sidekiq no longer traps SIG#{signal}" do
signals.delete(signal)
expect { described_class.install!(signals) }
.to raise_error(/Sidekiq should have registered/)
end
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment