Commit c17922ea authored by Andrew Newdigate's avatar Andrew Newdigate

Record latencies for Sidekiq failures

Previously, the sidekiq metrics middleware would not record latency
and cpu duration metrics for failed sidekiq jobs.

This change does two things:
1. Ensures that Sidekiq jobs that fail are also included in latency
   histograms.
1. Adds a `job_status` label to Sidekiq latency metrics so that failed
   job latencies can be differentiated from successful jobs.
parent 8a6d7e6b
---
title: Record latencies for Sidekiq failures
merge_request: 18909
author:
type: performance
...@@ -21,29 +21,35 @@ module Gitlab ...@@ -21,29 +21,35 @@ module Gitlab
@metrics[:sidekiq_jobs_retried_total].increment(labels, 1) @metrics[:sidekiq_jobs_retried_total].increment(labels, 1)
end end
job_succeeded = false
monotonic_time_start = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_start = get_thread_cputime job_thread_cputime_start = get_thread_cputime
begin
realtime = Benchmark.realtime do
yield yield
end job_succeeded = true
ensure
monotonic_time_end = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_end = get_thread_cputime
job_thread_cputime_end = get_thread_cputime monotonic_time = monotonic_time_end - monotonic_time_start
job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, realtime) # sidekiq_running_jobs, sidekiq_jobs_failed_total should not include the job_status label
rescue Exception # rubocop: disable Lint/RescueException @metrics[:sidekiq_running_jobs].increment(labels, -1)
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1) @metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded
raise
ensure # job_status: done, fail match the job_status attribute in structured logging
@metrics[:sidekiq_running_jobs].increment(labels, -1) labels[:job_status] = job_succeeded ? :done : :fail
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
end
end end
private private
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
......
...@@ -44,12 +44,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -44,12 +44,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do
it 'sets queue specific metrics' do it 'sets queue specific metrics' do
labels = { queue: :test } labels = { queue: :test }
labels_with_job_status = { queue: :test, job_status: :done }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 3) allow(middleware).to receive(:get_thread_cputime).and_return(1, 3)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 3)
expect(user_execution_seconds_metric).to receive(:observe).with(labels, 2)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1) expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(completion_seconds_metric).to receive(:observe).with(labels, kind_of(Numeric)) expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 2)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 1)
middleware.call(worker, {}, :test) { nil } middleware.call(worker, {}, :test) { nil }
end end
...@@ -74,8 +76,18 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -74,8 +76,18 @@ describe Gitlab::SidekiqMiddleware::Metrics do
context 'when error is raised' do context 'when error is raised' do
it 'sets sidekiq_jobs_failed_total and reraises' do it 'sets sidekiq_jobs_failed_total and reraises' do
expect(failed_total_metric).to receive(:increment) labels = { queue: :test }
expect { middleware.call(worker, {}, :test) { raise } }.to raise_error labels_with_job_status = { queue: :test, job_status: :fail }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 4)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 6)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 3)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 4)
expect { middleware.call(worker, {}, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment