Commit a97c30c9 authored by Sean McGivern's avatar Sean McGivern

Add Sidekiq metrics for Gitaly and DB time

These metrics map to details already in our logs, but as they're in
Prometheus, make it easier to spot trends.
parent 09f08640
---
title: Add Prometheus metrics for Gitaly and database time in background jobs
merge_request: 26384
author:
type: changed
......@@ -86,12 +86,14 @@ The following metrics are available:
| `failed_login_captcha_total` | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | |
| `successful_login_captcha_total` | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | |
| `auto_devops_pipelines_completed_total` | Counter | 12.7 | Counter of completed Auto DevOps pipelines, labeled by status | |
| `sidekiq_jobs_cpu_seconds` | Histogram | 12.4 | Seconds of cpu time to run Sidekiq job | |
| `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | |
| `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | |
| `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | |
| `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | |
| `sidekiq_jobs_cpu_seconds` | Histogram | 12.4 | Seconds of cpu time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | |
## Metrics controlled by a feature flag
......
......@@ -45,6 +45,8 @@ module Gitlab
labels[:job_status] = job_succeeded ? "done" : "fail"
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
@metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000)
@metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, Gitlab::GitalyClient.query_time)
end
end
......@@ -54,6 +56,8 @@ module Gitlab
{
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
......
......@@ -20,6 +20,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:queue_duration_seconds) { double('queue duration seconds metric') }
let(:completion_seconds_metric) { double('completion seconds metric') }
let(:user_execution_seconds_metric) { double('user execution seconds metric') }
let(:db_seconds_metric) { double('db seconds metric') }
let(:gitaly_seconds_metric) { double('gitaly seconds metric') }
let(:failed_total_metric) { double('failed total metric') }
let(:retried_total_metric) { double('retried total metric') }
let(:running_jobs_metric) { double('running jobs metric') }
......@@ -28,6 +30,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric)
......@@ -55,16 +59,23 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:queue_duration_for_job) { 0.01 }
let(:db_duration) { 3 }
let(:gitaly_duration) { 4 }
before do
allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000)
allow(Gitlab::GitalyClient).to receive(:query_time).and_return(gitaly_duration)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration)
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment