Commit 4d048f64 authored by Mark Chao's avatar Mark Chao

Merge branch '356505-fix-missing-sidekiq-exporter-metrics' into 'master'

Fix missing metrics for Sidekiq exporter server

See merge request gitlab-org/gitlab!83407
parents d4654fc3 0e77c181
...@@ -4,7 +4,7 @@ PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRI ...@@ -4,7 +4,7 @@ PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRI
require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER
# Keep separate directories for separate processes # Keep separate directories for separate processes
def prometheus_default_multiproc_dir def metrics_temp_dir
return unless Rails.env.development? || Rails.env.test? return unless Rails.env.development? || Rails.env.test?
if Gitlab::Runtime.sidekiq? if Gitlab::Runtime.sidekiq?
...@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir ...@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir
end end
end end
def puma_metrics_server_process? def prometheus_metrics_dir
Prometheus::PidProvider.worker_id == 'puma_master' ENV['prometheus_multiproc_dir'] || metrics_temp_dir
end end
def sidekiq_metrics_server_process? def puma_metrics_server_process?
Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0') Prometheus::PidProvider.worker_id == 'puma_master'
end end
if puma_metrics_server_process? || sidekiq_metrics_server_process? if puma_metrics_server_process?
# The following is necessary to ensure stale Prometheus metrics don't accumulate over time. # The following is necessary to ensure stale Prometheus metrics don't accumulate over time.
# It needs to be done as early as here to ensure metrics files aren't deleted. # It needs to be done as early as possible to ensure new metrics aren't being deleted.
# After we hit our app in `warmup`, first metrics and corresponding files already being created, #
# for example in `lib/gitlab/metrics/requests_rack_middleware.rb`. # Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the
Prometheus::CleanupMultiprocDirService.new.execute # sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead,
# since it must happen prior to any worker processes or the metrics server starting up.
Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute
::Prometheus::Client.reinitialize_on_pid_change(force: true) ::Prometheus::Client.reinitialize_on_pid_change(force: true)
end end
...@@ -37,7 +39,7 @@ end ...@@ -37,7 +39,7 @@ end
::Prometheus::Client.configure do |config| ::Prometheus::Client.configure do |config|
config.logger = Gitlab::AppLogger config.logger = Gitlab::AppLogger
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir config.multiprocess_files_dir = prometheus_metrics_dir
config.pid_provider = ::Prometheus::PidProvider.method(:worker_id) config.pid_provider = ::Prometheus::PidProvider.method(:worker_id)
end end
......
...@@ -2,22 +2,17 @@ ...@@ -2,22 +2,17 @@
module Prometheus module Prometheus
class CleanupMultiprocDirService class CleanupMultiprocDirService
include Gitlab::Utils::StrongMemoize def initialize(metrics_dir)
@metrics_dir = metrics_dir
def execute
FileUtils.rm_rf(old_metrics) if old_metrics
end end
private def execute
return if @metrics_dir.blank?
def old_metrics files_to_delete = Dir[File.join(@metrics_dir, '*.db')]
strong_memoize(:old_metrics) do return if files_to_delete.blank?
Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir
end
end
def multiprocess_files_dir FileUtils.rm_rf(files_to_delete)
::Prometheus::Client.configuration.multiprocess_files_dir
end end
end end
end end
...@@ -84,7 +84,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass ...@@ -84,7 +84,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end end
FileUtils.mkdir_p(@metrics_dir, mode: 0700) FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir
# We need to `warmup: true` since otherwise the sampler and exporter threads enter # We need to `warmup: true` since otherwise the sampler and exporter threads enter
# a race where not all Prometheus db files will be visible to the exporter, resulting # a race where not all Prometheus db files will be visible to the exporter, resulting
......
...@@ -118,6 +118,11 @@ module Gitlab ...@@ -118,6 +118,11 @@ module Gitlab
return if @dryrun return if @dryrun
# Make sure we reset the metrics directory prior to:
# - starting a metrics server process
# - starting new workers
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute
ProcessManagement.write_pid(@pid) if @pid ProcessManagement.write_pid(@pid) if @pid
supervisor = SidekiqProcessSupervisor.instance( supervisor = SidekiqProcessSupervisor.instance(
...@@ -137,7 +142,7 @@ module Gitlab ...@@ -137,7 +142,7 @@ module Gitlab
# and the metrics server died, restart it. # and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid) if supervisor.alive && dead_pids.include?(metrics_server_pid)
@logger.info('Sidekiq metrics server terminated, restarting...') @logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false) metrics_server_pid = restart_metrics_server
all_pids = worker_pids + Array(metrics_server_pid) all_pids = worker_pids + Array(metrics_server_pid)
else else
# If a worker process died we'll just terminate the whole cluster. # If a worker process died we'll just terminate the whole cluster.
...@@ -154,15 +159,14 @@ module Gitlab ...@@ -154,15 +159,14 @@ module Gitlab
def start_metrics_server def start_metrics_server
return unless metrics_server_enabled? return unless metrics_server_enabled?
restart_metrics_server(wipe_metrics_dir: true) restart_metrics_server
end end
def restart_metrics_server(wipe_metrics_dir: false) def restart_metrics_server
@logger.info("Starting metrics server on port #{sidekiq_exporter_port}") @logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
MetricsServer.fork( MetricsServer.fork(
'sidekiq', 'sidekiq',
metrics_dir: @metrics_dir, metrics_dir: @metrics_dir,
wipe_metrics_dir: wipe_metrics_dir,
reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS
) )
end end
......
...@@ -41,6 +41,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo ...@@ -41,6 +41,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
end end
let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) } let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) }
before do before do
stub_env('RAILS_ENV', 'test') stub_env('RAILS_ENV', 'test')
...@@ -54,6 +55,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo ...@@ -54,6 +55,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::ProcessManagement).to receive(:write_pid) allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor) allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
allow(supervisor).to receive(:supervise) allow(supervisor).to receive(:supervise)
allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service)
end end
after do after do
...@@ -300,6 +303,12 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo ...@@ -300,6 +303,12 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::SidekiqCluster).to receive(:start).and_return([]) allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
end end
it 'wipes the metrics directory' do
expect(metrics_cleanup_service).to receive(:execute)
cli.run(%w(foo))
end
context 'when there are no sidekiq_health_checks settings set' do context 'when there are no sidekiq_health_checks settings set' do
let(:sidekiq_exporter_enabled) { true } let(:sidekiq_exporter_enabled) { true }
...@@ -379,7 +388,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo ...@@ -379,7 +388,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
with_them do with_them do
specify do specify do
if start_metrics_server if start_metrics_server
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals) expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals)
else else
expect(MetricsServer).not_to receive(:fork) expect(MetricsServer).not_to receive(:fork)
end end
......
# frozen_string_literal: true # frozen_string_literal: true
require 'spec_helper' require 'fast_spec_helper'
RSpec.describe Prometheus::CleanupMultiprocDirService do RSpec.describe Prometheus::CleanupMultiprocDirService do
describe '.call' do describe '#execute' do
subject { described_class.new.execute }
let(:metrics_multiproc_dir) { Dir.mktmpdir } let(:metrics_multiproc_dir) { Dir.mktmpdir }
let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') } let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') }
subject(:service) { described_class.new(metrics_dir_arg).execute }
before do before do
FileUtils.touch(metrics_file_path) FileUtils.touch(metrics_file_path)
end end
after do after do
FileUtils.rm_r(metrics_multiproc_dir) FileUtils.rm_rf(metrics_multiproc_dir)
end end
context 'when `multiprocess_files_dir` is defined' do context 'when `multiprocess_files_dir` is defined' do
before do let(:metrics_dir_arg) { metrics_multiproc_dir }
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(metrics_multiproc_dir)
.at_least(:once)
end
it 'removes old metrics' do it 'removes old metrics' do
expect { subject } expect { service }
.to change { File.exist?(metrics_file_path) } .to change { File.exist?(metrics_file_path) }
.from(true) .from(true)
.to(false) .to(false)
...@@ -34,15 +29,10 @@ RSpec.describe Prometheus::CleanupMultiprocDirService do ...@@ -34,15 +29,10 @@ RSpec.describe Prometheus::CleanupMultiprocDirService do
end end
context 'when `multiprocess_files_dir` is not defined' do context 'when `multiprocess_files_dir` is not defined' do
before do let(:metrics_dir_arg) { nil }
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(nil)
.at_least(:once)
end
it 'does not remove any files' do it 'does not remove any files' do
expect { subject } expect { service }
.not_to change { File.exist?(metrics_file_path) } .not_to change { File.exist?(metrics_file_path) }
.from(true) .from(true)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment