Commit 73f48c32 authored by Igor Wiedler

Defer stackprof signal trap when running in sidekiq

Sidekiq currently overrides the trap. This change
defers the definition of the signal handler until
after sidekiq's setup code.

This way, we get to keep our signal handler, and
stackprof can be invoked by sending a SIGUSR2 to
sidekiq processes.

Because sidekiq-cluster forwards SIGUSR2 signals,
we can also send the signal to the sidekiq-cluster
process.
parent f9ef6bc7
...@@ -8,94 +8,122 @@ ...@@ -8,94 +8,122 @@
# * timeout profile after 30 seconds # * timeout profile after 30 seconds
# * write to $TMPDIR/stackprof.$PID.$RAND.profile # * write to $TMPDIR/stackprof.$PID.$RAND.profile
if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s) module Gitlab
Gitlab::Cluster::LifecycleEvents.on_worker_start do class StackProf
require 'stackprof' # this is a workaround for sidekiq, which defines its own SIGUSR2 handler.
require 'tmpdir' # by deferring to the sidekiq startup event, we get to set up our own
# handler late enough.
# see also: https://github.com/mperham/sidekiq/pull/4653
# Registers the stackprof signal listener for the current runtime.
#
# Loads the `stackprof` and `tmpdir` libraries, then arranges for
# `on_worker_start` to run at the appropriate lifecycle point:
#
# * when `Gitlab::Runtime.sidekiq?` is true, on Sidekiq's `:startup` event,
#   so that our handler is set up after sidekiq's own setup code;
# * otherwise, via `Gitlab::Cluster::LifecycleEvents.on_worker_start`.
def self.install
  require 'stackprof'
  require 'tmpdir'

  # one callback shared by both registration paths; a plain proc keeps
  # block semantics and still resolves `on_worker_start` on this receiver.
  start_listener = proc { on_worker_start }

  if Gitlab::Runtime.sidekiq?
    Sidekiq.configure_server { |config| config.on(:startup, &start_listener) }
  else
    Gitlab::Cluster::LifecycleEvents.on_worker_start(&start_listener)
  end
end
Gitlab::AppJsonLogger.info "stackprof: listening on SIGUSR2 signal" def self.on_worker_start
Gitlab::AppJsonLogger.info(
event: "stackprof",
message: "listening on SIGUSR2 signal",
pid: Process.pid
)
# create a pipe in order to propagate signal out of the signal handler # create a pipe in order to propagate signal out of the signal handler
# see also: https://cr.yp.to/docs/selfpipe.html # see also: https://cr.yp.to/docs/selfpipe.html
read, write = IO.pipe read, write = IO.pipe
# create a separate thread that polls for signals on the pipe. # create a separate thread that polls for signals on the pipe.
# #
# this way we do not execute in signal handler context, which # this way we do not execute in signal handler context, which
# lifts restrictions and also serializes the calls in a thread-safe # lifts restrictions and also serializes the calls in a thread-safe
# manner. # manner.
# #
# it's very similar to a goroutine and channel design. # it's very similar to a goroutine and channel design.
# #
# another nice benefit of this method is that we can timeout the # another nice benefit of this method is that we can timeout the
# IO.select call, allowing the profile to automatically stop after # IO.select call, allowing the profile to automatically stop after
# a given interval (by default 30 seconds), avoiding unbounded memory # a given interval (by default 30 seconds), avoiding unbounded memory
# growth from a profile that was started and never stopped. # growth from a profile that was started and never stopped.
t = Thread.new do t = Thread.new do
timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30 timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30
current_timeout_s = nil current_timeout_s = nil
loop do loop do
got_value = IO.select([read], nil, nil, current_timeout_s) got_value = IO.select([read], nil, nil, current_timeout_s)
read.getbyte if got_value read.getbyte if got_value
if StackProf.running? if ::StackProf.running?
stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir
stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile" stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"
Gitlab::AppJsonLogger.info( Gitlab::AppJsonLogger.info(
event: "stackprof", event: "stackprof",
message: "stopping profile", message: "stopping profile",
output_filename: stackprof_out_file, output_filename: stackprof_out_file,
pid: Process.pid, pid: Process.pid,
timeout_s: timeout_s, timeout_s: timeout_s,
timed_out: got_value.nil? timed_out: got_value.nil?
) )
StackProf.stop ::StackProf.stop
StackProf.results(stackprof_out_file) ::StackProf.results(stackprof_out_file)
current_timeout_s = nil current_timeout_s = nil
else else
Gitlab::AppJsonLogger.info( Gitlab::AppJsonLogger.info(
event: "stackprof", event: "stackprof",
message: "starting profile", message: "starting profile",
pid: Process.pid pid: Process.pid
) )
StackProf.start( ::StackProf.start(
mode: :cpu, mode: :cpu,
raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'), raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000 interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000
) )
current_timeout_s = timeout_s current_timeout_s = timeout_s
end
end end
end end
end t.abort_on_exception = true
t.abort_on_exception = true
# in the case of puma, this will override the existing SIGUSR2 signal handler # in the case of puma, this will override the existing SIGUSR2 signal handler
# that can be used to trigger a restart. # that can be used to trigger a restart.
# #
# puma cluster has two types of restarts: # puma cluster has two types of restarts:
# * SIGUSR1: phased restart # * SIGUSR1: phased restart
# * SIGUSR2: restart # * SIGUSR2: restart
# #
# phased restart is not supported in our configuration, because we use # phased restart is not supported in our configuration, because we use
# preload_app. this means we will always perform a normal restart. # preload_app. this means we will always perform a normal restart.
# additionally, phased restart is not supported when sending a SIGUSR2 # additionally, phased restart is not supported when sending a SIGUSR2
# directly to a puma worker (as opposed to the master process). # directly to a puma worker (as opposed to the master process).
# #
# the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in # the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
# our configuration, and we can always use a SIGUSR1 to perform a restart. # our configuration, and we can always use a SIGUSR1 to perform a restart.
# #
# thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and # thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
# override the puma behaviour. # override the puma behaviour.
# #
# see also: # see also:
# * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals # * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
# * https://github.com/phusion/unicorn/blob/master/SIGNALS # * https://github.com/phusion/unicorn/blob/master/SIGNALS
# * https://github.com/mperham/sidekiq/wiki/Signals # * https://github.com/mperham/sidekiq/wiki/Signals
Signal.trap('SIGUSR2') do Signal.trap('SIGUSR2') do
write.write('.') write.write('.')
end
end end
end end
end end
# Install the stackprof signal listener only when STACKPROF_ENABLED is truthy.
Gitlab::StackProf.install if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s)
...@@ -281,6 +281,10 @@ This can be done via `pkill -USR2 puma:`. The `:` distinguishes `puma ...@@ -281,6 +281,10 @@ This can be done via `pkill -USR2 puma:`. The `:` distinguishes `puma
4.3.3.gitlab.2 ...` (the master process) from `puma: cluster worker 0: ...` (the 4.3.3.gitlab.2 ...` (the master process) from `puma: cluster worker 0: ...` (the
worker processes), selecting the latter. worker processes), selecting the latter.
For Sidekiq, the signal can be sent to the `sidekiq-cluster` process via `pkill
-USR2 bin/sidekiq-cluster` -- this will forward the signal to all Sidekiq
children. Alternatively, you can also select a specific pid of interest.
Production profiles can be especially noisy. It can be helpful to visualize them Production profiles can be especially noisy. It can be helpful to visualize them
as a [flamegraph](https://github.com/brendangregg/FlameGraph). This can be done as a [flamegraph](https://github.com/brendangregg/FlameGraph). This can be done
via: via:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment