Commit 0c2bda9a authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch '346256-run-separate-metrics-server' into 'master'

Start a separate metrics server that can serve Sidekiq metrics

See merge request gitlab-org/gitlab!75247
parents f26cc4cc 5e2f02c3
...@@ -8,8 +8,9 @@ begin ...@@ -8,8 +8,9 @@ begin
raise "Required: METRICS_SERVER_TARGET=[sidekiq]" unless target == 'sidekiq' raise "Required: METRICS_SERVER_TARGET=[sidekiq]" unless target == 'sidekiq'
metrics_dir = ENV["prometheus_multiproc_dir"] || File.absolute_path("tmp/prometheus_multiproc_dir/#{target}") metrics_dir = ENV["prometheus_multiproc_dir"] || File.absolute_path("tmp/prometheus_multiproc_dir/#{target}")
wipe_metrics_dir = Gitlab::Utils.to_boolean(ENV['WIPE_METRICS_DIR']) || false
# Re-raise exceptions in threads on the main thread. # Re-raise exceptions in threads on the main thread.
Thread.abort_on_exception = true Thread.abort_on_exception = true
MetricsServer.new(target, metrics_dir).start MetricsServer.new(target, metrics_dir, wipe_metrics_dir).start
end end
...@@ -70,11 +70,17 @@ module Gitlab ...@@ -70,11 +70,17 @@ module Gitlab
end end
def self.process_alive?(pid) def self.process_alive?(pid)
return false if pid.nil?
# Signal 0 tests whether the process exists and we have access to send signals # Signal 0 tests whether the process exists and we have access to send signals
# but is otherwise a noop (doesn't actually send a signal to the process) # but is otherwise a noop (doesn't actually send a signal to the process)
signal(pid, 0) signal(pid, 0)
end end
def self.process_died?(pid)
!process_alive?(pid)
end
def self.write_pid(path) def self.write_pid(path)
File.open(path, 'w') do |handle| File.open(path, 'w') do |handle|
handle.write(Process.pid.to_s) handle.write(Process.pid.to_s)
......
...@@ -13,6 +13,7 @@ require 'rack' ...@@ -13,6 +13,7 @@ require 'rack'
require_relative 'settings_overrides' require_relative 'settings_overrides'
require_relative '../lib/gitlab/daemon' require_relative '../lib/gitlab/daemon'
require_relative '../lib/gitlab/utils'
require_relative '../lib/gitlab/utils/strong_memoize' require_relative '../lib/gitlab/utils/strong_memoize'
require_relative '../lib/prometheus/cleanup_multiproc_dir_service' require_relative '../lib/prometheus/cleanup_multiproc_dir_service'
require_relative '../lib/gitlab/metrics/prometheus' require_relative '../lib/gitlab/metrics/prometheus'
......
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../config/bundler_setup' require_relative '../config/boot'
require_relative 'dependencies' require_relative 'dependencies'
class MetricsServer # rubocop:disable Gitlab/NamespacedClass class MetricsServer # rubocop:disable Gitlab/NamespacedClass
class << self class << self
def spawn(target, gitlab_config: nil) def spawn(target, gitlab_config: nil, wipe_metrics_dir: false)
cmd = "#{Rails.root}/bin/metrics-server" cmd = "#{Rails.root}/bin/metrics-server"
env = { env = {
'METRICS_SERVER_TARGET' => target, 'METRICS_SERVER_TARGET' => target,
'GITLAB_CONFIG' => gitlab_config 'GITLAB_CONFIG' => gitlab_config,
'WIPE_METRICS_DIR' => wipe_metrics_dir.to_s
} }
Process.spawn(env, cmd, err: $stderr, out: $stdout).tap do |pid| Process.spawn(env, cmd, err: $stderr, out: $stdout).tap do |pid|
...@@ -19,9 +20,10 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass ...@@ -19,9 +20,10 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end end
end end
def initialize(target, metrics_dir) def initialize(target, metrics_dir, wipe_metrics_dir)
@target = target @target = target
@metrics_dir = metrics_dir @metrics_dir = metrics_dir
@wipe_metrics_dir = wipe_metrics_dir
end end
def start def start
...@@ -30,7 +32,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass ...@@ -30,7 +32,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end end
FileUtils.mkdir_p(@metrics_dir, mode: 0700) FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute ::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
settings = Settings.monitoring.sidekiq_exporter settings = Settings.monitoring.sidekiq_exporter
exporter_class = "Gitlab::Metrics::Exporter::#{@target.camelize}Exporter".constantize exporter_class = "Gitlab::Metrics::Exporter::#{@target.camelize}Exporter".constantize
......
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../config/bundler_setup'
require 'optparse' require 'optparse'
require 'logger' require 'logger'
require 'time' require 'time'
...@@ -12,6 +14,7 @@ require_relative '../lib/gitlab/sidekiq_config/cli_methods' ...@@ -12,6 +14,7 @@ require_relative '../lib/gitlab/sidekiq_config/cli_methods'
require_relative '../lib/gitlab/sidekiq_config/worker_matcher' require_relative '../lib/gitlab/sidekiq_config/worker_matcher'
require_relative '../lib/gitlab/sidekiq_logging/json_formatter' require_relative '../lib/gitlab/sidekiq_logging/json_formatter'
require_relative '../lib/gitlab/process_management' require_relative '../lib/gitlab/process_management'
require_relative '../metrics_server/metrics_server'
require_relative 'sidekiq_cluster' require_relative 'sidekiq_cluster'
module Gitlab module Gitlab
...@@ -89,6 +92,8 @@ module Gitlab ...@@ -89,6 +92,8 @@ module Gitlab
@logger.info("Starting cluster with #{queue_groups.length} processes") @logger.info("Starting cluster with #{queue_groups.length} processes")
end end
start_metrics_server(wipe_metrics_dir: true)
@processes = SidekiqCluster.start( @processes = SidekiqCluster.start(
queue_groups, queue_groups,
env: @environment, env: @environment,
...@@ -154,17 +159,69 @@ module Gitlab ...@@ -154,17 +159,69 @@ module Gitlab
while @alive while @alive
sleep(@interval) sleep(@interval)
if metrics_server_enabled? && ProcessManagement.process_died?(@metrics_server_pid)
@logger.warn('Metrics server went away')
start_metrics_server(wipe_metrics_dir: false)
end
unless ProcessManagement.all_alive?(@processes) unless ProcessManagement.all_alive?(@processes)
# If a child process died we'll just terminate the whole cluster. It's up to # If a child process died we'll just terminate the whole cluster. It's up to
# runit and such to then restart the cluster. # runit and such to then restart the cluster.
@logger.info('A worker terminated, shutting down the cluster') @logger.info('A worker terminated, shutting down the cluster')
stop_metrics_server
ProcessManagement.signal_processes(@processes, :TERM) ProcessManagement.signal_processes(@processes, :TERM)
break break
end end
end end
end end
def start_metrics_server(wipe_metrics_dir: false)
return unless metrics_server_enabled?
@logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
@metrics_server_pid = MetricsServer.spawn('sidekiq', wipe_metrics_dir: wipe_metrics_dir)
end
def sidekiq_exporter_enabled?
::Settings.monitoring.sidekiq_exporter.enabled
rescue Settingslogic::MissingSetting
nil
end
def exporter_has_a_unique_port?
# In https://gitlab.com/gitlab-org/gitlab/-/issues/345802 we added settings for sidekiq_health_checks.
# These settings default to the same values as sidekiq_exporter for backwards compatibility.
# If a different port for sidekiq_health_checks has been set up, we know that the
# user wants to serve health checks and metrics from different servers.
return false if sidekiq_health_check_port.nil? || sidekiq_exporter_port.nil?
sidekiq_exporter_port != sidekiq_health_check_port
end
def sidekiq_exporter_port
::Settings.monitoring.sidekiq_exporter.port
rescue Settingslogic::MissingSetting
nil
end
def sidekiq_health_check_port
::Settings.monitoring.sidekiq_health_checks.port
rescue Settingslogic::MissingSetting
nil
end
def metrics_server_enabled?
!@dryrun && sidekiq_exporter_enabled? && exporter_has_a_unique_port?
end
def stop_metrics_server
return unless @metrics_server_pid
@logger.info("Stopping metrics server (PID #{@metrics_server_pid})")
ProcessManagement.signal(@metrics_server_pid, :TERM)
end
def option_parser def option_parser
OptionParser.new do |opt| OptionParser.new do |opt|
opt.banner = "#{File.basename(__FILE__)} [QUEUE,QUEUE] [QUEUE] ... [OPTIONS]" opt.banner = "#{File.basename(__FILE__)} [QUEUE,QUEUE] [QUEUE] ... [OPTIONS]"
......
# rubocop:disable Naming/FileName
# frozen_string_literal: true
require 'shellwords'
# rubocop:enable Naming/FileName
# frozen_string_literal: true # frozen_string_literal: true
require_relative 'dependencies'
require_relative '../lib/gitlab/process_management' require_relative '../lib/gitlab/process_management'
module Gitlab module Gitlab
...@@ -67,7 +66,11 @@ module Gitlab ...@@ -67,7 +66,11 @@ module Gitlab
return return
end end
pid = Process.spawn( # We need to remove Bundler specific env vars, since otherwise the
# child process will think we are passing an alternative Gemfile
# and will clear and reset LOAD_PATH.
pid = Bundler.with_original_env do
Process.spawn(
{ 'ENABLE_SIDEKIQ_CLUSTER' => '1', { 'ENABLE_SIDEKIQ_CLUSTER' => '1',
'SIDEKIQ_WORKER_ID' => worker_id.to_s }, 'SIDEKIQ_WORKER_ID' => worker_id.to_s },
*cmd, *cmd,
...@@ -75,6 +78,7 @@ module Gitlab ...@@ -75,6 +78,7 @@ module Gitlab
err: $stderr, err: $stderr,
out: $stdout out: $stdout
) )
end
ProcessManagement.wait_async(pid) ProcessManagement.wait_async(pid)
......
...@@ -29,7 +29,7 @@ RSpec.describe 'bin/metrics-server', :aggregate_failures do ...@@ -29,7 +29,7 @@ RSpec.describe 'bin/metrics-server', :aggregate_failures do
config_file.write(YAML.dump(config)) config_file.write(YAML.dump(config))
config_file.close config_file.close
@pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path) @pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path, wipe_metrics_dir: true)
end end
after do after do
......
...@@ -12,8 +12,23 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath ...@@ -12,8 +12,23 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath
{ env: 'test', directory: Dir.pwd, max_concurrency: 50, min_concurrency: 0, dryrun: false, timeout: timeout } { env: 'test', directory: Dir.pwd, max_concurrency: 50, min_concurrency: 0, dryrun: false, timeout: timeout }
end end
let(:sidekiq_exporter_enabled) { false }
let(:sidekiq_exporter_port) { '3807' }
let(:sidekiq_health_checks_port) { '3807' }
before do before do
stub_env('RAILS_ENV', 'test') stub_env('RAILS_ENV', 'test')
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: sidekiq_exporter_enabled,
port: sidekiq_exporter_port
},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
end end
describe '#run' do describe '#run' do
...@@ -241,6 +256,163 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath ...@@ -241,6 +256,163 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath
end end
end end
end end
context 'metrics server' do
context 'starting the server' do
context 'without --dryrun' do
context 'when there are no sidekiq_health_checks settings set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: true,
port: sidekiq_exporter_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
it 'rescues Settingslogic::MissingSetting' do
expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting)
end
end
context 'when the sidekiq_exporter.port setting is not set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: true
},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
it 'rescues Settingslogic::MissingSetting' do
expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting)
end
end
context 'when sidekiq_exporter.enabled setting is not set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
end
using RSpec::Parameterized::TableSyntax
where(:sidekiq_exporter_enabled, :sidekiq_exporter_port, :sidekiq_health_checks_port, :start_metrics_server) do
true | '3807' | '3907' | true
true | '3807' | '3807' | false
false | '3807' | '3907' | false
false | '3807' | '3907' | false
end
with_them do
before do
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
specify do
if start_metrics_server
expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: true)
else
expect(MetricsServer).not_to receive(:spawn)
end
cli.run(%w(foo))
end
end
end
context 'with --dryrun set' do
let(:sidekiq_exporter_enabled) { true }
it 'does not start the server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo --dryrun))
end
end
end
context 'supervising the server' do
let(:sidekiq_exporter_enabled) { true }
let(:sidekiq_health_checks_port) { '3907' }
before do
allow(cli).to receive(:sleep).with(a_kind_of(Numeric))
allow(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false).and_return(99)
cli.start_metrics_server
end
it 'stops the metrics server when one of the processes has been terminated' do
allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(false)
allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false)
allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(an_instance_of(Array), :TERM)
expect(Process).to receive(:kill).with(:TERM, 99)
cli.start_loop
end
it 'starts the metrics server when it is down' do
allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(true)
allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false)
allow(cli).to receive(:stop_metrics_server)
expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false)
cli.start_loop
end
end
end
end end
describe '#write_pid' do describe '#write_pid' do
......
...@@ -78,7 +78,7 @@ RSpec.describe Gitlab::ProcessManagement do ...@@ -78,7 +78,7 @@ RSpec.describe Gitlab::ProcessManagement do
end end
describe '.process_alive?' do describe '.process_alive?' do
it 'returns true if the proces is alive' do it 'returns true if the process is alive' do
process = Process.pid process = Process.pid
expect(described_class.process_alive?(process)).to eq(true) expect(described_class.process_alive?(process)).to eq(true)
...@@ -89,6 +89,32 @@ RSpec.describe Gitlab::ProcessManagement do ...@@ -89,6 +89,32 @@ RSpec.describe Gitlab::ProcessManagement do
expect(described_class.process_alive?(process)).to eq(false) expect(described_class.process_alive?(process)).to eq(false)
end end
it 'returns false when no pid is given' do
process = nil
expect(described_class.process_alive?(process)).to eq(false)
end
end
describe '.process_died?' do
it 'returns false if the process is alive' do
process = Process.pid
expect(described_class.process_died?(process)).to eq(false)
end
it 'returns true when a thread was not alive' do
process = -2
expect(described_class.process_died?(process)).to eq(true)
end
it 'returns true when no pid is given' do
process = nil
expect(described_class.process_died?(process)).to eq(true)
end
end end
describe '.pids_alive' do describe '.pids_alive' do
......
...@@ -12,7 +12,8 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath ...@@ -12,7 +12,8 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
let(:env) do let(:env) do
{ {
'METRICS_SERVER_TARGET' => 'sidekiq', 'METRICS_SERVER_TARGET' => 'sidekiq',
'GITLAB_CONFIG' => nil 'GITLAB_CONFIG' => nil,
'WIPE_METRICS_DIR' => 'false'
} }
end end
...@@ -32,7 +33,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath ...@@ -32,7 +33,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
let(:metrics_dir) { Dir.mktmpdir } let(:metrics_dir) { Dir.mktmpdir }
let(:settings_double) { double(:settings, sidekiq_exporter: {}) } let(:settings_double) { double(:settings, sidekiq_exporter: {}) }
subject(:metrics_server) { described_class.new('fake', metrics_dir)} subject(:metrics_server) { described_class.new('fake', metrics_dir, true)}
before do before do
stub_env('prometheus_multiproc_dir', metrics_dir) stub_env('prometheus_multiproc_dir', metrics_dir)
...@@ -42,6 +43,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath ...@@ -42,6 +43,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
end end
after do after do
::Prometheus::CleanupMultiprocDirService.new.execute
Dir.rmdir(metrics_dir) Dir.rmdir(metrics_dir)
end end
...@@ -59,11 +61,25 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath ...@@ -59,11 +61,25 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
metrics_server.start metrics_server.start
end end
context 'when wipe_metrics_dir is true' do
subject(:metrics_server) { described_class.new('fake', metrics_dir, true)}
it 'removes any old metrics files' do it 'removes any old metrics files' do
FileUtils.touch("#{metrics_dir}/remove_this.db") FileUtils.touch("#{metrics_dir}/remove_this.db")
expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true) expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true)
end end
end
context 'when wipe_metrics_dir is false' do
subject(:metrics_server) { described_class.new('fake', metrics_dir, false)}
it 'does not remove any old metrics files' do
FileUtils.touch("#{metrics_dir}/remove_this.db")
expect { metrics_server.start }.not_to change { Dir.empty?(metrics_dir) }.from(false)
end
end
it 'starts a metrics server' do it 'starts a metrics server' do
expect(exporter_double).to receive(:start) expect(exporter_double).to receive(:start)
......
...@@ -13,6 +13,8 @@ RSpec.describe Gitlab::SidekiqCluster do # rubocop:disable RSpec/FilePath ...@@ -13,6 +13,8 @@ RSpec.describe Gitlab::SidekiqCluster do # rubocop:disable RSpec/FilePath
out: $stdout out: $stdout
} }
expect(Bundler).to receive(:with_original_env).and_call_original.twice
expect(Process).to receive(:spawn).ordered.with({ expect(Process).to receive(:spawn).ordered.with({
"ENABLE_SIDEKIQ_CLUSTER" => "1", "ENABLE_SIDEKIQ_CLUSTER" => "1",
"SIDEKIQ_WORKER_ID" => "0" "SIDEKIQ_WORKER_ID" => "0"
......
...@@ -28,7 +28,7 @@ RSpec.describe Quality::TestLevel do ...@@ -28,7 +28,7 @@ RSpec.describe Quality::TestLevel do
context 'when level is unit' do context 'when level is unit' do
it 'returns a pattern' do it 'returns a pattern' do
expect(subject.pattern(:unit)) expect(subject.pattern(:unit))
.to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb") .to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,sidekiq_cluster,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb")
end end
end end
...@@ -110,7 +110,7 @@ RSpec.describe Quality::TestLevel do ...@@ -110,7 +110,7 @@ RSpec.describe Quality::TestLevel do
context 'when level is unit' do context 'when level is unit' do
it 'returns a regexp' do it 'returns a regexp' do
expect(subject.regexp(:unit)) expect(subject.regexp(:unit))
.to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)}) .to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|sidekiq_cluster|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)})
end end
end end
......
...@@ -45,6 +45,7 @@ module Quality ...@@ -45,6 +45,7 @@ module Quality
serializers serializers
services services
sidekiq sidekiq
sidekiq_cluster
spam spam
support_specs support_specs
tasks tasks
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment