Commit e1cbaf47 authored by Andrew Newdigate's avatar Andrew Newdigate Committed by Dmitriy Zaporozhets

Adds a Sidekiq queue duration metric

Adds a Prometheus histogram, `sidekiq_jobs_queue_duration_seconds` for
recording the duration that a Sidekiq job is queued for before being
executed.

This matches the scheduling_latency_s field emitted from structured
logging for the same purpose.
parent c47991bc
---
title: Adds a Sidekiq queue duration metric
merge_request: 19005
author:
type: other
...@@ -21,5 +21,49 @@ module Gitlab ...@@ -21,5 +21,49 @@ module Gitlab
payload[:rugged_duration_ms] = Gitlab::RuggedInstrumentation.query_time_ms payload[:rugged_duration_ms] = Gitlab::RuggedInstrumentation.query_time_ms
end end
end end
# Returns the queuing duration for a Sidekiq job in seconds, as a float, if the
# `enqueued_at` field or `created_at` field is available.
#
# * If the job doesn't contain sufficient information, returns nil
# * If the job has a start time in the future, returns 0
# * If the job contains an invalid start time value, returns nil
# @param [Hash] job a Sidekiq job, represented as a hash
def self.queue_duration_for_job(job)
# Old gitlab-shell messages don't provide enqueued_at/created_at attributes
enqueued_at = job['enqueued_at'] || job['created_at']
return unless enqueued_at
enqueued_at_time = convert_to_time(enqueued_at)
return unless enqueued_at_time
# Its possible that if theres clock-skew between two nodes
# this value may be less than zero. In that event, we record the value
# as zero.
[elapsed_by_absolute_time(enqueued_at_time), 0].max
end
# Calculates the time in seconds, as a float, from
# the provided start time until now
#
# @param [Time] start
def self.elapsed_by_absolute_time(start)
(Time.now - start).to_f.round(6)
end
private_class_method :elapsed_by_absolute_time
# Convert a representation of a time into a `Time` value
#
# @param time_value String, Float time representation, or nil
def self.convert_to_time(time_value)
return time_value if time_value.is_a?(Time)
return Time.iso8601(time_value) if time_value.is_a?(String)
return Time.at(time_value) if time_value.is_a?(Numeric) && time_value > 0
rescue ArgumentError
# Swallow invalid dates. Better to loose some observability
# than bring all background processing down because of a date
# formatting bug in a client
end
private_class_method :convert_to_time
end end
end end
...@@ -36,11 +36,8 @@ module Gitlab ...@@ -36,11 +36,8 @@ module Gitlab
payload['message'] = "#{base_message(payload)}: start" payload['message'] = "#{base_message(payload)}: start"
payload['job_status'] = 'start' payload['job_status'] = 'start'
# Old gitlab-shell messages don't provide enqueued_at/created_at attributes scheduling_latency_s = ::Gitlab::InstrumentationHelper.queue_duration_for_job(payload)
enqueued_at = payload['enqueued_at'] || payload['created_at'] payload['scheduling_latency_s'] = scheduling_latency_s if scheduling_latency_s
if enqueued_at
payload['scheduling_latency_s'] = elapsed_by_absolute_time(Time.iso8601(enqueued_at))
end
payload payload
end end
...@@ -98,10 +95,6 @@ module Gitlab ...@@ -98,10 +95,6 @@ module Gitlab
end end
end end
def elapsed_by_absolute_time(start)
(Time.now.utc - start).to_f.round(6)
end
def elapsed(t0) def elapsed(t0)
t1 = get_time t1 = get_time
{ {
......
...@@ -15,6 +15,9 @@ module Gitlab ...@@ -15,6 +15,9 @@ module Gitlab
def call(_worker, job, queue) def call(_worker, job, queue)
labels = create_labels(queue) labels = create_labels(queue)
queue_duration = ::Gitlab::InstrumentationHelper.queue_duration_for_job(job)
@metrics[:sidekiq_jobs_queue_duration_seconds].observe(labels, queue_duration) if queue_duration
@metrics[:sidekiq_running_jobs].increment(labels, 1) @metrics[:sidekiq_running_jobs].increment(labels, 1)
if job['retry_count'].present? if job['retry_count'].present?
...@@ -49,12 +52,13 @@ module Gitlab ...@@ -49,12 +52,13 @@ module Gitlab
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all) sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all)
} }
end end
......
# frozen_string_literal: true
require 'fast_spec_helper'
require 'rspec-parameterized'
describe Gitlab::InstrumentationHelper do
using RSpec::Parameterized::TableSyntax
describe '.queue_duration_for_job' do
where(:enqueued_at, :created_at, :time_now, :expected_duration) do
"2019-06-01T00:00:00.000+0000" | nil | "2019-06-01T02:00:00.000+0000" | 2.hours.to_f
"2019-06-01T02:00:00.000+0000" | nil | "2019-06-01T02:00:00.001+0000" | 0.001
"2019-06-01T02:00:00.000+0000" | "2019-05-01T02:00:00.000+0000" | "2019-06-01T02:00:01.000+0000" | 1
nil | "2019-06-01T02:00:00.000+0000" | "2019-06-01T02:00:00.001+0000" | 0.001
nil | nil | "2019-06-01T02:00:00.001+0000" | nil
"2019-06-01T02:00:00.000+0200" | nil | "2019-06-01T02:00:00.000-0200" | 4.hours.to_f
1571825569.998168 | nil | "2019-10-23T12:13:16.000+0200" | 26.001832
1571825569 | nil | "2019-10-23T12:13:16.000+0200" | 27
"invalid_date" | nil | "2019-10-23T12:13:16.000+0200" | nil
"" | nil | "2019-10-23T12:13:16.000+0200" | nil
0 | nil | "2019-10-23T12:13:16.000+0200" | nil
-1 | nil | "2019-10-23T12:13:16.000+0200" | nil
"2019-06-01T02:00:00.000+0000" | nil | "2019-06-01T00:00:00.000+0000" | 0
Time.at(1571999233) | nil | "2019-10-25T12:29:16.000+0200" | 123
end
with_them do
let(:job) { { 'enqueued_at' => enqueued_at, 'created_at' => created_at } }
it "returns the correct duration" do
Timecop.freeze(Time.iso8601(time_now)) do
expect(described_class.queue_duration_for_job(job)).to eq(expected_duration)
end
end
end
end
end
# frozen_string_literal: true # frozen_string_literal: true
require 'spec_helper' require 'fast_spec_helper'
describe Gitlab::SidekiqLogging::StructuredLogger do describe Gitlab::SidekiqLogging::StructuredLogger do
describe '#call' do describe '#call' do
......
# frozen_string_literal: true # frozen_string_literal: true
require 'spec_helper' require 'fast_spec_helper'
describe Gitlab::SidekiqMiddleware::Metrics do describe Gitlab::SidekiqMiddleware::Metrics do
let(:middleware) { described_class.new } let(:middleware) { described_class.new }
let(:concurrency_metric) { double('concurrency metric') } let(:concurrency_metric) { double('concurrency metric') }
let(:queue_duration_seconds) { double('queue duration seconds metric') }
let(:completion_seconds_metric) { double('completion seconds metric') } let(:completion_seconds_metric) { double('completion seconds metric') }
let(:user_execution_seconds_metric) { double('user execution seconds metric') } let(:user_execution_seconds_metric) { double('user execution seconds metric') }
let(:failed_total_metric) { double('failed total metric') } let(:failed_total_metric) { double('failed total metric') }
...@@ -13,6 +14,7 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -13,6 +14,7 @@ describe Gitlab::SidekiqMiddleware::Metrics do
let(:running_jobs_metric) { double('running jobs metric') } let(:running_jobs_metric) { double('running jobs metric') }
before do before do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
...@@ -20,7 +22,6 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -20,7 +22,6 @@ describe Gitlab::SidekiqMiddleware::Metrics do
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric)
allow(running_jobs_metric).to receive(:increment)
allow(concurrency_metric).to receive(:set) allow(concurrency_metric).to receive(:set)
end end
...@@ -32,62 +33,76 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -32,62 +33,76 @@ describe Gitlab::SidekiqMiddleware::Metrics do
end end
end end
it 'ignore user execution when measured 0' do
allow(completion_seconds_metric).to receive(:observe)
expect(user_execution_seconds_metric).not_to receive(:observe)
end
describe '#call' do describe '#call' do
let(:worker) { double(:worker) } let(:worker) { double(:worker) }
it 'yields block' do let(:job) { {} }
allow(completion_seconds_metric).to receive(:observe) let(:job_status) { :done }
allow(user_execution_seconds_metric).to receive(:observe) let(:labels) { { queue: :test } }
let(:labels_with_job_status) { { queue: :test, job_status: job_status } }
expect { |b| middleware.call(worker, {}, :test, &b) }.to yield_control.once let(:thread_cputime_before) { 1 }
end let(:thread_cputime_after) { 2 }
let(:thread_cputime_duration) { thread_cputime_after - thread_cputime_before }
it 'sets queue specific metrics' do let(:monotonic_time_before) { 11 }
labels = { queue: :test } let(:monotonic_time_after) { 20 }
labels_with_job_status = { queue: :test, job_status: :done } let(:monotonic_time_duration) { monotonic_time_after - monotonic_time_before }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 3)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 3) let(:queue_duration_for_job) { 0.01 }
before do
allow(middleware).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1) expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 2)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 1)
middleware.call(worker, {}, :test) { nil } expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration)
end end
it 'ignore user execution when measured 0' do it 'yields block' do
allow(completion_seconds_metric).to receive(:observe) expect { |b| middleware.call(worker, job, :test, &b) }.to yield_control.once
allow(middleware).to receive(:get_thread_cputime).and_return(0, 0) end
it 'sets queue specific metrics' do
middleware.call(worker, job, :test) { nil }
end
context 'when job_duration is not available' do
let(:queue_duration_for_job) { nil }
expect(user_execution_seconds_metric).not_to receive(:observe) it 'does not set the queue_duration_seconds histogram' do
middleware.call(worker, job, :test) { nil }
end
end end
context 'when job is retried' do context 'when job is retried' do
it 'sets sidekiq_jobs_retried_total metric' do let(:job) { { 'retry_count' => 1 } }
allow(completion_seconds_metric).to receive(:observe)
expect(user_execution_seconds_metric).to receive(:observe)
it 'sets sidekiq_jobs_retried_total metric' do
expect(retried_total_metric).to receive(:increment) expect(retried_total_metric).to receive(:increment)
middleware.call(worker, { 'retry_count' => 1 }, :test) { nil } middleware.call(worker, job, :test) { nil }
end end
end end
context 'when error is raised' do context 'when error is raised' do
it 'sets sidekiq_jobs_failed_total and reraises' do let(:job_status) { :fail }
labels = { queue: :test }
labels_with_job_status = { queue: :test, job_status: :fail }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 4)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 6)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) it 'sets sidekiq_jobs_failed_total and reraises' do
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(failed_total_metric).to receive(:increment).with(labels, 1) expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 3)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 4)
expect { middleware.call(worker, {}, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed") expect { middleware.call(worker, job, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment