Commit 0c2bda9a authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch '346256-run-separate-metrics-server' into 'master'

Start a separate metrics server that can serve Sidekiq metrics

See merge request gitlab-org/gitlab!75247
parents f26cc4cc 5e2f02c3
......@@ -8,8 +8,9 @@ begin
raise "Required: METRICS_SERVER_TARGET=[sidekiq]" unless target == 'sidekiq'
metrics_dir = ENV["prometheus_multiproc_dir"] || File.absolute_path("tmp/prometheus_multiproc_dir/#{target}")
wipe_metrics_dir = Gitlab::Utils.to_boolean(ENV['WIPE_METRICS_DIR']) || false
# Re-raise exceptions in threads on the main thread.
Thread.abort_on_exception = true
MetricsServer.new(target, metrics_dir).start
MetricsServer.new(target, metrics_dir, wipe_metrics_dir).start
end
......@@ -70,11 +70,17 @@ module Gitlab
end
def self.process_alive?(pid)
return false if pid.nil?
# Signal 0 tests whether the process exists and we have access to send signals
# but is otherwise a noop (doesn't actually send a signal to the process)
signal(pid, 0)
end
def self.process_died?(pid)
!process_alive?(pid)
end
def self.write_pid(path)
File.open(path, 'w') do |handle|
handle.write(Process.pid.to_s)
......
......@@ -13,6 +13,7 @@ require 'rack'
require_relative 'settings_overrides'
require_relative '../lib/gitlab/daemon'
require_relative '../lib/gitlab/utils'
require_relative '../lib/gitlab/utils/strong_memoize'
require_relative '../lib/prometheus/cleanup_multiproc_dir_service'
require_relative '../lib/gitlab/metrics/prometheus'
......
# frozen_string_literal: true
require_relative '../config/bundler_setup'
require_relative '../config/boot'
require_relative 'dependencies'
class MetricsServer # rubocop:disable Gitlab/NamespacedClass
class << self
def spawn(target, gitlab_config: nil)
def spawn(target, gitlab_config: nil, wipe_metrics_dir: false)
cmd = "#{Rails.root}/bin/metrics-server"
env = {
'METRICS_SERVER_TARGET' => target,
'GITLAB_CONFIG' => gitlab_config
'GITLAB_CONFIG' => gitlab_config,
'WIPE_METRICS_DIR' => wipe_metrics_dir.to_s
}
Process.spawn(env, cmd, err: $stderr, out: $stdout).tap do |pid|
......@@ -19,9 +20,10 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end
end
def initialize(target, metrics_dir)
def initialize(target, metrics_dir, wipe_metrics_dir)
@target = target
@metrics_dir = metrics_dir
@wipe_metrics_dir = wipe_metrics_dir
end
def start
......@@ -30,7 +32,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end
FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
settings = Settings.monitoring.sidekiq_exporter
exporter_class = "Gitlab::Metrics::Exporter::#{@target.camelize}Exporter".constantize
......
# frozen_string_literal: true
require_relative '../config/bundler_setup'
require 'optparse'
require 'logger'
require 'time'
......@@ -12,6 +14,7 @@ require_relative '../lib/gitlab/sidekiq_config/cli_methods'
require_relative '../lib/gitlab/sidekiq_config/worker_matcher'
require_relative '../lib/gitlab/sidekiq_logging/json_formatter'
require_relative '../lib/gitlab/process_management'
require_relative '../metrics_server/metrics_server'
require_relative 'sidekiq_cluster'
module Gitlab
......@@ -89,6 +92,8 @@ module Gitlab
@logger.info("Starting cluster with #{queue_groups.length} processes")
end
start_metrics_server(wipe_metrics_dir: true)
@processes = SidekiqCluster.start(
queue_groups,
env: @environment,
......@@ -154,17 +159,69 @@ module Gitlab
while @alive
sleep(@interval)
if metrics_server_enabled? && ProcessManagement.process_died?(@metrics_server_pid)
@logger.warn('Metrics server went away')
start_metrics_server(wipe_metrics_dir: false)
end
unless ProcessManagement.all_alive?(@processes)
# If a child process died we'll just terminate the whole cluster. It's up to
# runit and such to then restart the cluster.
@logger.info('A worker terminated, shutting down the cluster')
stop_metrics_server
ProcessManagement.signal_processes(@processes, :TERM)
break
end
end
end
def start_metrics_server(wipe_metrics_dir: false)
return unless metrics_server_enabled?
@logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
@metrics_server_pid = MetricsServer.spawn('sidekiq', wipe_metrics_dir: wipe_metrics_dir)
end
def sidekiq_exporter_enabled?
::Settings.monitoring.sidekiq_exporter.enabled
rescue Settingslogic::MissingSetting
nil
end
def exporter_has_a_unique_port?
# In https://gitlab.com/gitlab-org/gitlab/-/issues/345802 we added settings for sidekiq_health_checks.
# These settings default to the same values as sidekiq_exporter for backwards compatibility.
# If a different port for sidekiq_health_checks has been set up, we know that the
# user wants to serve health checks and metrics from different servers.
return false if sidekiq_health_check_port.nil? || sidekiq_exporter_port.nil?
sidekiq_exporter_port != sidekiq_health_check_port
end
def sidekiq_exporter_port
::Settings.monitoring.sidekiq_exporter.port
rescue Settingslogic::MissingSetting
nil
end
def sidekiq_health_check_port
::Settings.monitoring.sidekiq_health_checks.port
rescue Settingslogic::MissingSetting
nil
end
def metrics_server_enabled?
!@dryrun && sidekiq_exporter_enabled? && exporter_has_a_unique_port?
end
def stop_metrics_server
return unless @metrics_server_pid
@logger.info("Stopping metrics server (PID #{@metrics_server_pid})")
ProcessManagement.signal(@metrics_server_pid, :TERM)
end
def option_parser
OptionParser.new do |opt|
opt.banner = "#{File.basename(__FILE__)} [QUEUE,QUEUE] [QUEUE] ... [OPTIONS]"
......
# rubocop:disable Naming/FileName
# frozen_string_literal: true
require 'shellwords'
# rubocop:enable Naming/FileName
# frozen_string_literal: true
require_relative 'dependencies'
require_relative '../lib/gitlab/process_management'
module Gitlab
......@@ -67,14 +66,19 @@ module Gitlab
return
end
pid = Process.spawn(
{ 'ENABLE_SIDEKIQ_CLUSTER' => '1',
'SIDEKIQ_WORKER_ID' => worker_id.to_s },
*cmd,
pgroup: true,
err: $stderr,
out: $stdout
)
# We need to remove Bundler specific env vars, since otherwise the
# child process will think we are passing an alternative Gemfile
# and will clear and reset LOAD_PATH.
pid = Bundler.with_original_env do
Process.spawn(
{ 'ENABLE_SIDEKIQ_CLUSTER' => '1',
'SIDEKIQ_WORKER_ID' => worker_id.to_s },
*cmd,
pgroup: true,
err: $stderr,
out: $stdout
)
end
ProcessManagement.wait_async(pid)
......
......@@ -29,7 +29,7 @@ RSpec.describe 'bin/metrics-server', :aggregate_failures do
config_file.write(YAML.dump(config))
config_file.close
@pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path)
@pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path, wipe_metrics_dir: true)
end
after do
......
......@@ -12,8 +12,23 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath
{ env: 'test', directory: Dir.pwd, max_concurrency: 50, min_concurrency: 0, dryrun: false, timeout: timeout }
end
let(:sidekiq_exporter_enabled) { false }
let(:sidekiq_exporter_port) { '3807' }
let(:sidekiq_health_checks_port) { '3807' }
before do
stub_env('RAILS_ENV', 'test')
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: sidekiq_exporter_enabled,
port: sidekiq_exporter_port
},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
end
describe '#run' do
......@@ -241,6 +256,163 @@ RSpec.describe Gitlab::SidekiqCluster::CLI do # rubocop:disable RSpec/FilePath
end
end
end
context 'metrics server' do
context 'starting the server' do
context 'without --dryrun' do
context 'when there are no sidekiq_health_checks settings set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: true,
port: sidekiq_exporter_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
it 'rescues Settingslogic::MissingSetting' do
expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting)
end
end
context 'when the sidekiq_exporter.port setting is not set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {
enabled: true
},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
it 'rescues Settingslogic::MissingSetting' do
expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting)
end
end
context 'when sidekiq_exporter.enabled setting is not set' do
before do
stub_config(
monitoring: {
sidekiq_exporter: {},
sidekiq_health_checks: {
port: sidekiq_health_checks_port
}
}
)
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
it 'does not start a sidekiq metrics server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo))
end
end
using RSpec::Parameterized::TableSyntax
where(:sidekiq_exporter_enabled, :sidekiq_exporter_port, :sidekiq_health_checks_port, :start_metrics_server) do
true | '3807' | '3907' | true
true | '3807' | '3807' | false
false | '3807' | '3907' | false
false | '3807' | '3907' | false
end
with_them do
before do
allow(Gitlab::SidekiqCluster).to receive(:start)
allow(cli).to receive(:write_pid)
allow(cli).to receive(:trap_signals)
allow(cli).to receive(:start_loop)
end
specify do
if start_metrics_server
expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: true)
else
expect(MetricsServer).not_to receive(:spawn)
end
cli.run(%w(foo))
end
end
end
context 'with --dryrun set' do
let(:sidekiq_exporter_enabled) { true }
it 'does not start the server' do
expect(MetricsServer).not_to receive(:spawn)
cli.run(%w(foo --dryrun))
end
end
end
context 'supervising the server' do
let(:sidekiq_exporter_enabled) { true }
let(:sidekiq_health_checks_port) { '3907' }
before do
allow(cli).to receive(:sleep).with(a_kind_of(Numeric))
allow(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false).and_return(99)
cli.start_metrics_server
end
it 'stops the metrics server when one of the processes has been terminated' do
allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(false)
allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false)
allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(an_instance_of(Array), :TERM)
expect(Process).to receive(:kill).with(:TERM, 99)
cli.start_loop
end
it 'starts the metrics server when it is down' do
allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(true)
allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false)
allow(cli).to receive(:stop_metrics_server)
expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false)
cli.start_loop
end
end
end
end
describe '#write_pid' do
......
......@@ -78,7 +78,7 @@ RSpec.describe Gitlab::ProcessManagement do
end
describe '.process_alive?' do
it 'returns true if the proces is alive' do
it 'returns true if the process is alive' do
process = Process.pid
expect(described_class.process_alive?(process)).to eq(true)
......@@ -89,6 +89,32 @@ RSpec.describe Gitlab::ProcessManagement do
expect(described_class.process_alive?(process)).to eq(false)
end
it 'returns false when no pid is given' do
process = nil
expect(described_class.process_alive?(process)).to eq(false)
end
end
describe '.process_died?' do
it 'returns false if the process is alive' do
process = Process.pid
expect(described_class.process_died?(process)).to eq(false)
end
it 'returns true when a thread was not alive' do
process = -2
expect(described_class.process_died?(process)).to eq(true)
end
it 'returns true when no pid is given' do
process = nil
expect(described_class.process_died?(process)).to eq(true)
end
end
describe '.pids_alive' do
......
......@@ -12,7 +12,8 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
let(:env) do
{
'METRICS_SERVER_TARGET' => 'sidekiq',
'GITLAB_CONFIG' => nil
'GITLAB_CONFIG' => nil,
'WIPE_METRICS_DIR' => 'false'
}
end
......@@ -32,7 +33,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
let(:metrics_dir) { Dir.mktmpdir }
let(:settings_double) { double(:settings, sidekiq_exporter: {}) }
subject(:metrics_server) { described_class.new('fake', metrics_dir)}
subject(:metrics_server) { described_class.new('fake', metrics_dir, true)}
before do
stub_env('prometheus_multiproc_dir', metrics_dir)
......@@ -42,6 +43,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
end
after do
::Prometheus::CleanupMultiprocDirService.new.execute
Dir.rmdir(metrics_dir)
end
......@@ -59,10 +61,24 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
metrics_server.start
end
it 'removes any old metrics files' do
FileUtils.touch("#{metrics_dir}/remove_this.db")
context 'when wipe_metrics_dir is true' do
subject(:metrics_server) { described_class.new('fake', metrics_dir, true)}
expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true)
it 'removes any old metrics files' do
FileUtils.touch("#{metrics_dir}/remove_this.db")
expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true)
end
end
context 'when wipe_metrics_dir is false' do
subject(:metrics_server) { described_class.new('fake', metrics_dir, false)}
it 'does not remove any old metrics files' do
FileUtils.touch("#{metrics_dir}/remove_this.db")
expect { metrics_server.start }.not_to change { Dir.empty?(metrics_dir) }.from(false)
end
end
it 'starts a metrics server' do
......
......@@ -13,6 +13,8 @@ RSpec.describe Gitlab::SidekiqCluster do # rubocop:disable RSpec/FilePath
out: $stdout
}
expect(Bundler).to receive(:with_original_env).and_call_original.twice
expect(Process).to receive(:spawn).ordered.with({
"ENABLE_SIDEKIQ_CLUSTER" => "1",
"SIDEKIQ_WORKER_ID" => "0"
......
......@@ -28,7 +28,7 @@ RSpec.describe Quality::TestLevel do
context 'when level is unit' do
it 'returns a pattern' do
expect(subject.pattern(:unit))
.to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb")
.to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,sidekiq_cluster,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb")
end
end
......@@ -110,7 +110,7 @@ RSpec.describe Quality::TestLevel do
context 'when level is unit' do
it 'returns a regexp' do
expect(subject.regexp(:unit))
.to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)})
.to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|sidekiq_cluster|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)})
end
end
......
......@@ -45,6 +45,7 @@ module Quality
serializers
services
sidekiq
sidekiq_cluster
spam
support_specs
tasks
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment