Commit 73f48c32 authored by Igor Wiedler's avatar Igor Wiedler

Defer stackprof signal trap when running in sidekiq

Sidekiq currently overrides the trap. This change
defers the definition of the signal handler until
after sidekiq's setup code.

This way, we get to keep our signal handler, and
stackprof can be invoked by sending a SIGUSR2 to
sidekiq processes.

Because sidekiq-cluster forwards SIGUSR2 signals,
we can also send the signal to the sidekiq-cluster
process.
parent f9ef6bc7
......@@ -8,94 +8,122 @@
# * timeout profile after 30 seconds
# * write to $TMPDIR/stackprof.$PID.$RAND.profile
if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s)
Gitlab::Cluster::LifecycleEvents.on_worker_start do
require 'stackprof'
require 'tmpdir'
module Gitlab
class StackProf
# this is a workaround for sidekiq, which defines its own SIGUSR2 handler.
# by deferring to the sidekiq startup event, we get to set up our own
# handler late enough.
# see also: https://github.com/mperham/sidekiq/pull/4653
def self.install
  # loaded lazily: only needed once profiling has actually been enabled.
  require 'stackprof'
  require 'tmpdir'

  # outside of sidekiq, hook into the regular worker start lifecycle event.
  unless Gitlab::Runtime.sidekiq?
    return Gitlab::Cluster::LifecycleEvents.on_worker_start { on_worker_start }
  end

  # under sidekiq, wait for its :startup event so that our SIGUSR2 handler
  # is registered after sidekiq has installed its own.
  Sidekiq.configure_server do |server_config|
    server_config.on(:startup) { on_worker_start }
  end
end
Gitlab::AppJsonLogger.info "stackprof: listening on SIGUSR2 signal"
def self.on_worker_start
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "listening on SIGUSR2 signal",
pid: Process.pid
)
# create a pipe in order to propagate signal out of the signal handler
# see also: https://cr.yp.to/docs/selfpipe.html
read, write = IO.pipe
# create a pipe in order to propagate signal out of the signal handler
# see also: https://cr.yp.to/docs/selfpipe.html
read, write = IO.pipe
# create a separate thread that polls for signals on the pipe.
#
# this way we do not execute in signal handler context, which
# lifts restrictions and also serializes the calls in a thread-safe
# manner.
#
# it's very similar to a goroutine and channel design.
#
# another nice benefit of this method is that we can timeout the
# IO.select call, allowing the profile to automatically stop after
# a given interval (by default 30 seconds), avoiding unbounded memory
# growth from a profile that was started and never stopped.
t = Thread.new do
timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30
current_timeout_s = nil
loop do
got_value = IO.select([read], nil, nil, current_timeout_s)
read.getbyte if got_value
# create a separate thread that polls for signals on the pipe.
#
# this way we do not execute in signal handler context, which
# lifts restrictions and also serializes the calls in a thread-safe
# manner.
#
# it's very similar to a goroutine and channel design.
#
# another nice benefit of this method is that we can timeout the
# IO.select call, allowing the profile to automatically stop after
# a given interval (by default 30 seconds), avoiding unbounded memory
# growth from a profile that was started and never stopped.
t = Thread.new do
timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30
current_timeout_s = nil
loop do
got_value = IO.select([read], nil, nil, current_timeout_s)
read.getbyte if got_value
if StackProf.running?
stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir
stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"
if ::StackProf.running?
stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir
stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "stopping profile",
output_filename: stackprof_out_file,
pid: Process.pid,
timeout_s: timeout_s,
timed_out: got_value.nil?
)
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "stopping profile",
output_filename: stackprof_out_file,
pid: Process.pid,
timeout_s: timeout_s,
timed_out: got_value.nil?
)
StackProf.stop
StackProf.results(stackprof_out_file)
current_timeout_s = nil
else
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "starting profile",
pid: Process.pid
)
::StackProf.stop
::StackProf.results(stackprof_out_file)
current_timeout_s = nil
else
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "starting profile",
pid: Process.pid
)
StackProf.start(
mode: :cpu,
raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000
)
current_timeout_s = timeout_s
::StackProf.start(
mode: :cpu,
raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000
)
current_timeout_s = timeout_s
end
end
end
end
t.abort_on_exception = true
t.abort_on_exception = true
# in the case of puma, this will override the existing SIGUSR2 signal handler
# that can be used to trigger a restart.
#
# puma cluster has two types of restarts:
# * SIGUSR1: phased restart
# * SIGUSR2: restart
#
# phased restart is not supported in our configuration, because we use
# preload_app. this means we will always perform a normal restart.
# additionally, phased restart is not supported when sending a SIGUSR2
# directly to a puma worker (as opposed to the master process).
#
# the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
# our configuration, and we can always use a SIGUSR1 to perform a restart.
#
# thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
# override the puma behaviour.
#
# see also:
# * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
# * https://github.com/phusion/unicorn/blob/master/SIGNALS
# * https://github.com/mperham/sidekiq/wiki/Signals
Signal.trap('SIGUSR2') do
write.write('.')
# in the case of puma, this will override the existing SIGUSR2 signal handler
# that can be used to trigger a restart.
#
# puma cluster has two types of restarts:
# * SIGUSR1: phased restart
# * SIGUSR2: restart
#
# phased restart is not supported in our configuration, because we use
# preload_app. this means we will always perform a normal restart.
# additionally, phased restart is not supported when sending a SIGUSR2
# directly to a puma worker (as opposed to the master process).
#
# the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
# our configuration, and we can always use a SIGUSR1 to perform a restart.
#
# thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
# override the puma behaviour.
#
# see also:
# * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
# * https://github.com/phusion/unicorn/blob/master/SIGNALS
# * https://github.com/mperham/sidekiq/wiki/Signals
Signal.trap('SIGUSR2') do
write.write('.')
end
end
end
end
# only install the SIGUSR2 profiling hook when STACKPROF_ENABLED is set to a
# value that Gitlab::Utils.to_boolean treats as true.
Gitlab::StackProf.install if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s)
......@@ -281,6 +281,10 @@ This can be done via `pkill -USR2 puma:`. The `:` disambiguates between `puma
4.3.3.gitlab.2 ...` (the master process) and `puma: cluster worker 0: ...` (the
worker processes), selecting the latter.
For Sidekiq, the signal can be sent to the `sidekiq-cluster` process via `pkill
-USR2 bin/sidekiq-cluster`, which forwards the signal to all Sidekiq
children. Alternatively, you can send the signal to a specific PID of interest.
Production profiles can be especially noisy. It can be helpful to visualize them
as a [flamegraph](https://github.com/brendangregg/FlameGraph). This can be done
via:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment