Use external metrics server process for Puma

This moves us away from a process-internal server thread and toward an external process that we supervise instead.

Use external metrics server process for Puma
This moves us away from a process-internal server thread and toward an external process that we supervise instead.
31a3c142 · Matthias Käppler · Bob Van Landuyt · ddddfe2b · 31a3c142 · 31a3c142
Commit 31a3c142 authored Mar 15, 2022 by Matthias Käppler Committed by Bob Van Landuyt Mar 15, 2022
9 changed files
--- a/config/initializers/7_prometheus_metrics.rb
+++ b/config/initializers/7_prometheus_metrics.rb
 # frozen_string_literal: true
+PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRICS_SERVER'])
 # Keep separate directories for separate processes
 def prometheus_default_multiproc_dir
  return unless Rails.env.development? || Rails.env.test?
@@ -72,8 +74,12 @@ Gitlab::Cluster::LifecycleEvents.on_master_start do
  if Gitlab::Runtime.puma?
    Gitlab::Metrics::Samplers::PumaSampler.instance.start
-    # Starts a metrics server to export metrics from the Puma primary.
+    if Settings.monitoring.web_exporter.enabled && PUMA_EXTERNAL_METRICS_SERVER
-    Gitlab::Metrics::Exporter::WebExporter.instance.start
+      require_relative '../../metrics_server/metrics_server'
+      MetricsServer.start_for_puma
+    else
+      Gitlab::Metrics::Exporter::WebExporter.instance.start
+    end
  end
  Gitlab::Ci::Parsers.instrument!
@@ -90,11 +96,13 @@ Gitlab::Cluster::LifecycleEvents.on_worker_start do
  Gitlab::Metrics::Samplers::ThreadsSampler.initialize_instance(logger: logger).start
  if Gitlab::Runtime.puma?
-    # Since we are running a metrics server on the Puma primary, we would inherit
+    # Since we are observing a metrics server from the Puma primary, we would inherit
-    # this thread after forking into workers, so we need to explicitly stop it here.
+    # this supervision thread after forking into workers, so we need to explicitly stop it here.
-    # NOTE: This will not be necessary anymore after moving to an external server
+    if PUMA_EXTERNAL_METRICS_SERVER
-    # process via https://gitlab.com/gitlab-org/gitlab/-/issues/350548
+      ::MetricsServer::PumaProcessSupervisor.instance.stop
-    Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    else
+      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    end
    Gitlab::Metrics::Samplers::ActionCableSampler.instance(logger: logger).start
  end
@@ -112,16 +120,24 @@ end
 if Gitlab::Runtime.puma?
  Gitlab::Cluster::LifecycleEvents.on_before_graceful_shutdown do
    # We need to ensure that before we re-exec or shutdown server
-    # we do stop the exporter
+    # we also stop the metrics server
-    Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    if PUMA_EXTERNAL_METRICS_SERVER
+      ::MetricsServer::PumaProcessSupervisor.instance.shutdown
+    else
+      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    end
  end
  Gitlab::Cluster::LifecycleEvents.on_before_master_restart do
    # We need to ensure that before we re-exec server
-    # we do stop the exporter
+    # we also stop the metrics server
    #
    # We do it again, for being extra safe,
    # but it should not be needed
-    Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    if PUMA_EXTERNAL_METRICS_SERVER
+      ::MetricsServer::PumaProcessSupervisor.instance.shutdown
+    else
+      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+    end
  end
 end
--- a/lib/gitlab/process_supervisor.rb
+++ b/lib/gitlab/process_supervisor.rb
@@ -9,7 +9,7 @@ module Gitlab
  # The supervisor will also trap termination signals if provided and
  # propagate those to the supervised processes. Any supervised processes
  # that do not terminate within a specified grace period will be killed.
-  class ProcessSupervisor
+  class ProcessSupervisor < Gitlab::Daemon
    DEFAULT_HEALTH_CHECK_INTERVAL_SECONDS = 5
    DEFAULT_TERMINATE_INTERVAL_SECONDS = 1
    DEFAULT_TERMINATE_TIMEOUT_SECONDS = 10
@@ -21,13 +21,18 @@ module Gitlab
      check_terminate_interval_seconds: DEFAULT_TERMINATE_INTERVAL_SECONDS,
      terminate_timeout_seconds: DEFAULT_TERMINATE_TIMEOUT_SECONDS,
      term_signals: %i(INT TERM),
-      forwarded_signals: [])
+      forwarded_signals: [],
+      **options)
+      super(**options)
      @term_signals = term_signals
      @forwarded_signals = forwarded_signals
      @health_check_interval_seconds = health_check_interval_seconds
      @check_terminate_interval_seconds = check_terminate_interval_seconds
      @terminate_timeout_seconds = terminate_timeout_seconds
+      @pids = []
+      @alive = false
    end
    # Starts a supervision loop for the given process ID(s).
@@ -39,32 +44,66 @@ module Gitlab
    # start observing those processes instead. Otherwise it will shut down.
    def supervise(pid_or_pids, &on_process_death)
      @pids = Array(pid_or_pids)
+      @on_process_death = on_process_death
      trap_signals!
+      start
+    end
+    # Shuts down the supervisor and all supervised processes with the given signal.
+    def shutdown(signal = :TERM)
+      return unless @alive
+      stop_processes(signal)
+      stop
+    end
+    def supervised_pids
+      @pids
+    end
+    private
+    def start_working
      @alive = true
+    end
+    def stop_working
+      @alive = false
+    end
+    def run_thread
      while @alive
        sleep(@health_check_interval_seconds)
-        check_process_health(&on_process_death)
+        check_process_health
      end
    end
-    private
+    def check_process_health
-    def check_process_health(&on_process_death)
      unless all_alive?
-        dead_pids = @pids - live_pids
+        existing_pids = live_pids # Capture this value for the duration of the block.
-        @pids = Array(yield(dead_pids))
+        dead_pids = @pids - existing_pids
+        new_pids = Array(@on_process_death.call(dead_pids))
+        @pids = existing_pids + new_pids
        @alive = @pids.any?
      end
    end
+    def stop_processes(signal)
+      # Set this prior to shutting down so that shutdown hooks which read `alive`
+      # know the supervisor is about to shut down.
+      @alive = false
+      # Shut down supervised processes.
+      signal_all(signal)
+      wait_for_termination
+    end
    def trap_signals!
      ProcessManagement.trap_signals(@term_signals) do |signal|
-        @alive = false
+        stop_processes(signal)
-        signal_all(signal)
-        wait_for_termination
      end
      ProcessManagement.trap_signals(@forwarded_signals) do |signal|

--- a/metrics_server/dependencies.rb
+++ b/metrics_server/dependencies.rb
@@ -31,5 +31,6 @@ require_relative '../lib/gitlab/metrics/exporter/gc_request_middleware'
 require_relative '../lib/gitlab/health_checks/probes/collection'
 require_relative '../lib/gitlab/health_checks/probes/status'
 require_relative '../lib/gitlab/process_management'
+require_relative '../lib/gitlab/process_supervisor'
 # rubocop:enable Naming/FileName
--- a/metrics_server/metrics_server.rb
+++ b/metrics_server/metrics_server.rb
@@ -5,7 +5,28 @@ require_relative '../config/boot'
 require_relative 'dependencies'
 class MetricsServer # rubocop:disable Gitlab/NamespacedClass
+  # The singleton instance used to supervise the Puma metrics server.
+  PumaProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
  class << self
+    def start_for_puma
+      metrics_dir = ::Prometheus::Client.configuration.multiprocess_files_dir
+      start_server = proc do
+        MetricsServer.spawn('puma', metrics_dir: metrics_dir).tap do |pid|
+          Gitlab::AppLogger.info("Starting Puma metrics server with pid #{pid}")
+        end
+      end
+      supervisor = PumaProcessSupervisor.instance
+      supervisor.supervise(start_server.call) do
+        next unless supervisor.alive
+        Gitlab::AppLogger.info('Puma metrics server terminated, restarting...')
+        start_server.call
+      end
+    end
    def spawn(target, metrics_dir:, gitlab_config: nil, wipe_metrics_dir: false)
      ensure_valid_target!(target)

--- a/sidekiq_cluster/cli.rb
+++ b/sidekiq_cluster/cli.rb
@@ -10,6 +10,7 @@ require 'time'
 # we may run into "already initialized" warnings, hence the check.
 require_relative '../lib/gitlab' unless Object.const_defined?('Gitlab')
 require_relative '../lib/gitlab/utils'
+require_relative '../lib/gitlab/daemon'
 require_relative '../lib/gitlab/sidekiq_config/cli_methods'
 require_relative '../lib/gitlab/sidekiq_config/worker_matcher'
 require_relative '../lib/gitlab/sidekiq_logging/json_formatter'
@@ -121,11 +122,12 @@ module Gitlab
        ProcessManagement.write_pid(@pid) if @pid
-        supervisor = Gitlab::ProcessSupervisor.new(
+        supervisor = SidekiqProcessSupervisor.instance(
          health_check_interval_seconds: @interval,
          terminate_timeout_seconds: @soft_timeout_seconds + TIMEOUT_GRACE_PERIOD_SECONDS,
          term_signals: TERMINATE_SIGNALS,
-          forwarded_signals: FORWARD_SIGNALS
+          forwarded_signals: FORWARD_SIGNALS,
+          synchronous: true
        )
        metrics_server_pid = start_metrics_server
@@ -136,7 +138,7 @@ module Gitlab
          # If we're not in the process of shutting down the cluster,
          # and the metrics server died, restart it.
          if supervisor.alive && dead_pids.include?(metrics_server_pid)
-            @logger.info('Metrics server terminated, restarting...')
+            @logger.info('Sidekiq metrics server terminated, restarting...')
            metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
            all_pids = worker_pids + Array(metrics_server_pid)
          else

--- a/sidekiq_cluster/sidekiq_cluster.rb
+++ b/sidekiq_cluster/sidekiq_cluster.rb
@@ -16,6 +16,9 @@ module Gitlab
    # before we kill the process.
    TIMEOUT_GRACE_PERIOD_SECONDS = 5
+    # The singleton instance used to supervise cluster processes.
+    SidekiqProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
    # Starts Sidekiq workers for the pairs of processes.
    #
    # Example:

--- a/spec/commands/sidekiq_cluster/cli_spec.rb
+++ b/spec/commands/sidekiq_cluster/cli_spec.rb
@@ -40,6 +40,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
    }
  end
+  let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
  before do
    stub_env('RAILS_ENV', 'test')
@@ -47,8 +49,11 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
    config_file.close
    allow(::Settings).to receive(:source).and_return(config_file.path)
    ::Settings.reload!
+    allow(Gitlab::ProcessManagement).to receive(:write_pid)
+    allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
+    allow(supervisor).to receive(:supervise)
  end
  after do
@@ -63,11 +68,6 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
    end
    context 'with arguments' do
-      before do
-        allow(Gitlab::ProcessManagement).to receive(:write_pid)
-        allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
-      end
      it 'starts the Sidekiq workers' do
        expect(Gitlab::SidekiqCluster).to receive(:start)
                                            .with([['foo']], default_options)
@@ -298,8 +298,6 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
        context 'without --dryrun' do
          before do
            allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
-            allow(Gitlab::ProcessManagement).to receive(:write_pid)
-            allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
          end
          context 'when there are no sidekiq_health_checks settings set' do
@@ -429,14 +427,11 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
      before do
        allow(Gitlab::SidekiqCluster).to receive(:start).and_return(sidekiq_worker_pids)
-        allow(Gitlab::ProcessManagement).to receive(:write_pid)
      end
      it 'stops the entire process cluster if one of the workers has been terminated' do
-        allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
+        expect(supervisor).to receive(:alive).and_return(true)
-          allow(it).to receive(:supervise).and_yield([2])
+        expect(supervisor).to receive(:supervise).and_yield([2])
-        end
        expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
        expect(Gitlab::ProcessManagement).to receive(:signal_processes).with([42, 99], :TERM)
@@ -445,11 +440,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
      context 'when the supervisor is alive' do
        it 'restarts the metrics server when it is down' do
-          allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
+          expect(supervisor).to receive(:alive).and_return(true)
-            allow(it).to receive(:alive).and_return(true)
+          expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
-            allow(it).to receive(:supervise).and_yield([metrics_server_pid])
-          end
          expect(MetricsServer).to receive(:fork).twice.and_return(metrics_server_pid)
          cli.run(%w(foo))
@@ -458,11 +450,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
      context 'when the supervisor is shutting down' do
        it 'does not restart the metrics server' do
-          allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
+          expect(supervisor).to receive(:alive).and_return(false)
-            allow(it).to receive(:alive).and_return(false)
+          expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
-            allow(it).to receive(:supervise).and_yield([metrics_server_pid])
-          end
          expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
          cli.run(%w(foo))

--- a/spec/lib/gitlab/process_supervisor_spec.rb
+++ b/spec/lib/gitlab/process_supervisor_spec.rb
@@ -6,7 +6,9 @@ RSpec.describe Gitlab::ProcessSupervisor do
  let(:health_check_interval_seconds) { 0.1 }
  let(:check_terminate_interval_seconds) { 1 }
  let(:forwarded_signals) { [] }
-  let(:process_id) do
+  let(:process_ids) { [spawn_process, spawn_process] }
+  def spawn_process
    Process.spawn('while true; do sleep 1; done').tap do |pid|
      Process.detach(pid)
    end
@@ -22,54 +24,52 @@ RSpec.describe Gitlab::ProcessSupervisor do
  end
  after do
-    if Gitlab::ProcessManagement.process_alive?(process_id)
+    process_ids.each do |pid|
-      Process.kill('KILL', process_id)
+      Process.kill('KILL', pid)
+    rescue Errno::ESRCH
+      # Ignore if a process wasn't actually alive.
    end
  end
  describe '#supervise' do
-    context 'while supervised process is alive' do
+    context 'while supervised processes are alive' do
      it 'does not invoke callback' do
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
        pids_killed = []
-        thread = Thread.new do
+        supervisor.supervise(process_ids) do |dead_pids|
-          supervisor.supervise(process_id) do |dead_pids|
+          pids_killed = dead_pids
-            pids_killed = dead_pids
+          []
-            []
-          end
        end
        # Wait several times the poll frequency of the supervisor.
        sleep health_check_interval_seconds * 10
-        thread.terminate
        expect(pids_killed).to be_empty
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
      end
    end
-    context 'when supervised process dies' do
+    context 'when a supervised process dies' do
-      it 'triggers callback with the dead PIDs' do
+      it 'triggers callback with the dead PIDs and adds new PIDs to supervised PIDs' do
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
        pids_killed = []
-        thread = Thread.new do
+        supervisor.supervise(process_ids) do |dead_pids|
-          supervisor.supervise(process_id) do |dead_pids|
+          pids_killed = dead_pids
-            pids_killed = dead_pids
+          [42] # Fake starting a new process in place of the terminated one.
-            []
-          end
        end
        # Terminate the supervised process.
-        Process.kill('TERM', process_id)
+        Process.kill('TERM', process_ids.first)
        await_condition(sleep_sec: health_check_interval_seconds) do
-          pids_killed == [process_id]
+          pids_killed == [process_ids.first]
        end
-        thread.terminate
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(false)
+        expect(Gitlab::ProcessManagement.process_alive?(process_ids.first)).to be(false)
+        expect(Gitlab::ProcessManagement.process_alive?(process_ids.last)).to be(true)
+        expect(supervisor.supervised_pids).to match_array([process_ids.last, 42])
      end
    end
@@ -78,7 +78,7 @@ RSpec.describe Gitlab::ProcessSupervisor do
        allow(supervisor).to receive(:sleep)
        allow(Gitlab::ProcessManagement).to receive(:trap_signals)
        allow(Gitlab::ProcessManagement).to receive(:all_alive?).and_return(false)
-        allow(Gitlab::ProcessManagement).to receive(:signal_processes).with([process_id], anything)
+        allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(process_ids, anything)
      end
      context 'termination signals' do
@@ -87,21 +87,21 @@ RSpec.describe Gitlab::ProcessSupervisor do
            allow(Gitlab::ProcessManagement).to receive(:any_alive?).and_return(false)
            expect(Gitlab::ProcessManagement).to receive(:trap_signals).ordered.with(%i(INT TERM)).and_yield(:TERM)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
            expect(supervisor).not_to receive(:sleep).with(check_terminate_interval_seconds)
-            supervisor.supervise(process_id) { [] }
+            supervisor.supervise(process_ids) { [] }
          end
        end
        context 'when TERM does not result in timely shutdown of processes' do
          it 'issues a KILL signal after the grace period expires' do
            expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(INT TERM)).and_yield(:TERM)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
            expect(supervisor).to receive(:sleep).ordered.with(check_terminate_interval_seconds).at_least(:once)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], '-KILL')
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
-            supervisor.supervise(process_id) { [] }
+            supervisor.supervise(process_ids) { [] }
          end
        end
      end
@@ -111,10 +111,53 @@ RSpec.describe Gitlab::ProcessSupervisor do
        it 'forwards given signals to the observed processes' do
          expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(USR1)).and_yield(:USR1)
-          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :USR1)
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :USR1)
+          supervisor.supervise(process_ids) { [] }
+        end
+      end
+    end
+  end
+  describe '#shutdown' do
+    context 'when supervisor is supervising processes' do
+      before do
+        supervisor.supervise(process_ids)
+      end
+      context 'when supervisor is alive' do
+        it 'signals TERM then KILL to all supervised processes' do
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
-          supervisor.supervise(process_id) { [] }
+          supervisor.shutdown
        end
+        it 'stops the supervisor' do
+          expect { supervisor.shutdown }.to change { supervisor.alive }.from(true).to(false)
+        end
+      end
+      context 'when supervisor has already shut down' do
+        before do
+          supervisor.shutdown
+        end
+        it 'does nothing' do
+          expect(supervisor.alive).to be(false)
+          expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
+          supervisor.shutdown
+        end
+      end
+    end
+    context 'when supervisor never started' do
+      it 'does nothing' do
+        expect(supervisor.alive).to be(false)
+        expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
+        supervisor.shutdown
      end
    end
  end

--- a/spec/metrics_server/metrics_server_spec.rb
+++ b/spec/metrics_server/metrics_server_spec.rb
 # frozen_string_literal: true
-require 'fast_spec_helper'
+require 'spec_helper'
 require_relative '../../metrics_server/metrics_server'
-require_relative '../support/helpers/next_instance_of'
 RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
-  include NextInstanceOf
  let(:prometheus_config) { ::Prometheus::Client.configuration }
  let(:metrics_dir) { Dir.mktmpdir }
@@ -205,4 +202,47 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
    it_behaves_like 'a metrics exporter', 'sidekiq', 'sidekiq_exporter'
  end
+  describe '.start_for_puma' do
+    let(:supervisor) { instance_double(Gitlab::ProcessSupervisor) }
+    before do
+      allow(Gitlab::ProcessSupervisor).to receive(:instance).and_return(supervisor)
+    end
+    it 'spawns a server process and supervises it' do
+      expect(Process).to receive(:spawn).with(
+        include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+      ).once.and_return(42)
+      expect(supervisor).to receive(:supervise).with(42)
+      described_class.start_for_puma
+    end
+    context 'when the supervisor callback is invoked' do
+      context 'and the supervisor is alive' do
+        it 'restarts the metrics server' do
+          expect(supervisor).to receive(:alive).and_return(true)
+          expect(supervisor).to receive(:supervise).and_yield
+          expect(Process).to receive(:spawn).with(
+            include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+          ).twice.and_return(42)
+          described_class.start_for_puma
+        end
+      end
+      context 'and the supervisor is not alive' do
+        it 'does not restart the server' do
+          expect(supervisor).to receive(:alive).and_return(false)
+          expect(supervisor).to receive(:supervise).and_yield
+          expect(Process).to receive(:spawn).with(
+            include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+          ).once.and_return(42)
+          described_class.start_for_puma
+        end
+      end
+    end
+  end
 end