# frozen_string_literal: true

module Gitlab
  module SidekiqDaemon
    class MemoryKiller < Daemon
      include ::Gitlab::Utils::StrongMemoize

      # Today's 64-bit CPUs support at most 256 TB of memory, which is more than big enough.
      MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024
      # RSS below `soft_limit_rss` is considered safe
      SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i
      # A worker with RSS above `hard_limit_rss` is stopped immediately
      HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i
      # RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS
      GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i
      # Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds
      CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max
      # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit
      SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
      # Developers/admins should always set `memory_killer_max_memory_growth_kb` explicitly.
      # When it is not set, default to 300 MB as an extra safety net.
      DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000

      # Phases of the memory killer, exported via the `sidekiq_memory_killer_phase` gauge
      PHASE = {
        running: 1,
        above_soft_limit: 2,
        stop_fetching_new_jobs: 3,
        shutting_down: 4,
        killing_sidekiq: 5
      }.freeze

      def initialize
        super

        @enabled = true
        @metrics = init_metrics
      end

      private

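      # Prometheus gauges used by `refresh_state` to expose the daemon's view
      # of the worker: current RSS, both limits, and the current phase.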
      def init_metrics
        {
          sidekiq_current_rss:                  ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'),
          sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'),
          sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'),
          sidekiq_memory_killer_phase:          ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker')
        }
      end

      def refresh_state(phase)
        @phase = PHASE.fetch(phase)
        @current_rss = get_rss
        @soft_limit_rss = get_soft_limit_rss
        @hard_limit_rss = get_hard_limit_rss

        # track the current state as prometheus gauges
        @metrics[:sidekiq_memory_killer_phase].set({}, @phase)
        @metrics[:sidekiq_current_rss].set({}, @current_rss)
        @metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss)
        @metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss)
      end

      def run_thread
        Sidekiq.logger.info(
          class: self.class.to_s,
          action: 'start',
          pid: pid,
          message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon'
        )

        while enabled?
          begin
            sleep(CHECK_INTERVAL_SECONDS)
            restart_sidekiq unless rss_within_range?
          rescue => e
            log_exception(e, __method__)
          rescue Exception => e # rubocop:disable Lint/RescueException
            log_exception(e, __method__)
            raise e
          end
        end
      ensure
        Sidekiq.logger.warn(
          class: self.class.to_s,
          action: 'stop',
          pid: pid,
          message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon'
        )
      end

      def log_exception(exception, method)
        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          message: "Exception from #{method}: #{exception.message}"
        )
      end

      def stop_working
        @enabled = false
      end

      def enabled?
        @enabled
      end

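      # Escalating three-stage shutdown: SIGTSTP to stop fetching new jobs,
      # SIGTERM for a graceful shutdown, and finally SIGKILL to the whole
      # process group as a last resort.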
      def restart_sidekiq
        # Tell Sidekiq to stop fetching new jobs.
        # We first send the signal, then wait the given time.
        # We also monitor the number of running jobs and allow an early restart.
        refresh_state(:stop_fetching_new_jobs)
        signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
        return unless enabled?

        # Tell Sidekiq to restart itself.
        # To be extra safe, wait `Sidekiq.options[:timeout] + 2` seconds before sending SIGKILL.
        refresh_state(:shutting_down)
        signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
        return unless enabled?

        # Ideally we should never reach this point.
        # Sidekiq was given time to shut down gracefully; if it has not, kill it.
        # Kill the whole process group so we can be sure no children are left behind.
        refresh_state(:killing_sidekiq)
        signal_pgroup('SIGKILL', 'die')
      end

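      # Returns true while RSS is acceptable. RSS may float between the soft
      # and hard limits for up to GRACE_BALLOON_SECONDS (the "grace balloon")
      # before we give up and restart; crossing the hard limit fails fast.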
      def rss_within_range?
        refresh_state(:running)

        deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
        loop do
          return true unless enabled?

          # RSS above the hard limit triggers a forcible shutdown right away
          break if @current_rss > @hard_limit_rss

          # RSS dropped below the soft limit; the worker is safe again
          return true if @current_rss < @soft_limit_rss

          # RSS did not drop below the soft limit within the deadline; restart
          break if Gitlab::Metrics::System.monotonic_time > deadline

          sleep(CHECK_INTERVAL_SECONDS)

          refresh_state(:above_soft_limit)
        end

        # There are two ways to break out of the loop:
        #   - above the hard limit, or
        #   - still above the soft limit past the deadline.
        # When above the hard limit we move straight on to `stop_fetching_new_jobs`,
        # so ignore the hard-limit case and always set `above_soft_limit` here.
        refresh_state(:above_soft_limit)
        log_rss_out_of_range(@current_rss, @hard_limit_rss, @soft_limit_rss)

        false
      end

      def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          message: 'Sidekiq worker RSS out of range',
          current_rss: current_rss,
          hard_limit_rss: hard_limit_rss,
          soft_limit_rss: soft_limit_rss,
          reason: out_of_range_description(current_rss, hard_limit_rss, soft_limit_rss)
        )
      end

      def out_of_range_description(rss, hard_limit, soft_limit)
        if rss > hard_limit
          "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})"
        else
          "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})"
        end
      end

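      # `ps -o rss=` reports the resident set size in kilobytes, matching the
      # `_KB` units used by the limits above.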
      def get_rss
        output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}), Rails.root.to_s)
        return 0 unless status&.zero?

        output.to_i
      end

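      # The effective soft limit grows with the memory the currently running
      # jobs are expected to consume (see `rss_increase_by_jobs`).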
      def get_soft_limit_rss
        SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
      end

      def get_hard_limit_rss
        HARD_LIMIT_RSS_KB
      end

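      # Send `signal` to the Sidekiq process, then wait up to `time` seconds,
      # returning early once all running jobs have finished.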
      def signal_and_wait(time, signal, explanation)
        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          signal: signal,
          explanation: explanation,
          wait_time: time,
          message: "Sending signal and waiting"
        )
        Process.kill(signal, pid)

        deadline = Gitlab::Metrics::System.monotonic_time + time

        # We try to finish as soon as all jobs have finished,
        # so we re-check that condition in a loop.
        sleep(CHECK_INTERVAL_SECONDS) while enabled? && any_jobs? && Gitlab::Metrics::System.monotonic_time < deadline
      end

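      # Per `kill(2)`, signalling PID 0 delivers the signal to every process in
      # the caller's process group, so when this process is the group leader we
      # can take down Sidekiq and all of its children in one shot.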
      def signal_pgroup(signal, explanation)
        if Process.getpgrp == pid
          pid_or_pgrp_str = 'PGRP'
          pid_to_signal = 0
        else
          pid_or_pgrp_str = 'PID'
          pid_to_signal = pid
        end

        Sidekiq.logger.warn(
          class: self.class.to_s,
          signal: signal,
          pid: pid,
          message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
        )
        Process.kill(signal, pid_to_signal)
      end

      def rss_increase_by_jobs
        Gitlab::SidekiqDaemon::Monitor.instance.jobs_mutex.synchronize do
          Gitlab::SidekiqDaemon::Monitor.instance.jobs.sum do |job| # rubocop:disable CodeReuse/ActiveRecord
            rss_increase_by_job(job)
          end
        end
      end

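      # Model a job's footprint as linear growth: the per-second growth rate
      # configured in its `sidekiq_options`, multiplied by how long the job has
      # been running, capped at a per-job maximum.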
      def rss_increase_by_job(job)
        memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i
        max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', DEFAULT_MAX_MEMORY_GROWTH_KB).to_i

        return 0 if memory_growth_kb == 0

        time_elapsed = [Gitlab::Metrics::System.monotonic_time - job[:started_at], 0].max
        [memory_growth_kb * time_elapsed, max_memory_growth_kb].min
      end

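      # Read a per-worker option from its `sidekiq_options`, falling back to
      # the default when the worker class is unknown or raises.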
      def get_job_options(job, key, default)
        job[:worker_class].sidekiq_options.fetch(key, default)
      rescue
        default
      end

      def pid
        Process.pid
      end

      def any_jobs?
        Gitlab::SidekiqDaemon::Monitor.instance.jobs.any?
      end
    end
  end
end