Commit c17922ea authored by Andrew Newdigate's avatar Andrew Newdigate

Record latencies for Sidekiq failures

Previously, the sidekiq metrics middleware would not record latency
and cpu duration metrics for failed sidekiq jobs.

This change does two things:
1. Ensures that Sidekiq jobs that fail are also included in latency
   histograms.
1. Adds a `job_status` label to Sidekiq latency metrics so that failed
   job latencies can be differentiated from successful jobs.
parent 8a6d7e6b
---
title: Record latencies for Sidekiq failures
merge_request: 18909
author:
type: performance
......@@ -21,29 +21,35 @@ module Gitlab
@metrics[:sidekiq_jobs_retried_total].increment(labels, 1)
end
job_succeeded = false
monotonic_time_start = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_start = get_thread_cputime
realtime = Benchmark.realtime do
begin
yield
end
job_succeeded = true
ensure
monotonic_time_end = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_end = get_thread_cputime
job_thread_cputime_end = get_thread_cputime
job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
monotonic_time = monotonic_time_end - monotonic_time_start
job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, realtime)
rescue Exception # rubocop: disable Lint/RescueException
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1)
raise
ensure
@metrics[:sidekiq_running_jobs].increment(labels, -1)
# sidekiq_running_jobs, sidekiq_jobs_failed_total should not include the job_status label
@metrics[:sidekiq_running_jobs].increment(labels, -1)
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded
# job_status: done, fail match the job_status attribute in structured logging
labels[:job_status] = job_succeeded ? :done : :fail
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
end
end
private
def init_metrics
{
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
......
......@@ -44,12 +44,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do
it 'sets queue specific metrics' do
labels = { queue: :test }
labels_with_job_status = { queue: :test, job_status: :done }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 3)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 3)
expect(user_execution_seconds_metric).to receive(:observe).with(labels, 2)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(completion_seconds_metric).to receive(:observe).with(labels, kind_of(Numeric))
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 2)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 1)
middleware.call(worker, {}, :test) { nil }
end
......@@ -74,8 +76,18 @@ describe Gitlab::SidekiqMiddleware::Metrics do
context 'when error is raised' do
it 'sets sidekiq_jobs_failed_total and reraises' do
expect(failed_total_metric).to receive(:increment)
expect { middleware.call(worker, {}, :test) { raise } }.to raise_error
labels = { queue: :test }
labels_with_job_status = { queue: :test, job_status: :fail }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 4)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 6)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 3)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 4)
expect { middleware.call(worker, {}, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment