Commit 5c62dee0 authored by Andrew Newdigate's avatar Andrew Newdigate Committed by Sean McGivern

Add worker attributes to Sidekiq metrics

Sidekiq workers have attributes, including latency_sensitivity,
whether or not they have external dependencies, their feature category
and resource boundaries.

This change adds these attributes to Sidekiq metrics, to allow error
budgeting and attribution to take place.
parent 553dd2ee
---
title: Add worker attributes to Sidekiq metrics
merge_request: 20292
author:
type: other
......@@ -7,14 +7,17 @@ module Gitlab
# timeframes than the DEFAULT_BUCKET definition. Defined in seconds.
SIDEKIQ_LATENCY_BUCKETS = [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 60, 300, 600].freeze
TRUE_LABEL = "yes"
FALSE_LABEL = "no"
def initialize
@metrics = init_metrics
@metrics[:sidekiq_concurrency].set({}, Sidekiq.options[:concurrency].to_i)
end
def call(_worker, job, queue)
labels = create_labels(queue)
def call(worker, job, queue)
labels = create_labels(worker.class, queue)
queue_duration = ::Gitlab::InstrumentationHelper.queue_duration_for_job(job)
@metrics[:sidekiq_jobs_queue_duration_seconds].observe(labels, queue_duration) if queue_duration
......@@ -42,7 +45,7 @@ module Gitlab
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded
# job_status: done, fail match the job_status attribute in structured logging
labels[:job_status] = job_succeeded ? :done : :fail
labels[:job_status] = job_succeeded ? "done" : "fail"
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
end
......@@ -62,10 +65,24 @@ module Gitlab
}
end
def create_labels(queue)
{
queue: queue
}
def create_labels(worker_class, queue)
labels = { queue: queue.to_s, latency_sensitive: FALSE_LABEL, external_dependencies: FALSE_LABEL, feature_category: "", boundary: "" }
return labels unless worker_class.include? WorkerAttributes
labels[:latency_sensitive] = bool_as_label(worker_class.latency_sensitive_worker?)
labels[:external_dependencies] = bool_as_label(worker_class.worker_has_external_dependencies?)
feature_category = worker_class.get_feature_category
labels[:feature_category] = feature_category.to_s
resource_boundary = worker_class.get_worker_resource_boundary
labels[:boundary] = resource_boundary == :unknown ? "" : resource_boundary.to_s
labels
end
def bool_as_label(value)
value ? TRUE_LABEL : FALSE_LABEL
end
def get_thread_cputime
......
......@@ -3,7 +3,18 @@
require 'fast_spec_helper'
describe Gitlab::SidekiqMiddleware::Metrics do
let(:middleware) { described_class.new }
context "with worker attribution" do
subject { described_class.new }
let(:queue) { :test }
let(:worker_class) { worker.class }
let(:job) { {} }
let(:job_status) { :done }
let(:labels_with_job_status) { labels.merge(job_status: job_status.to_s) }
let(:default_labels) { { queue: queue.to_s, boundary: "", external_dependencies: "no", feature_category: "", latency_sensitive: "no" } }
shared_examples "a metrics middleware" do
context "with mocked prometheus" do
let(:concurrency_metric) { double('concurrency metric') }
let(:queue_duration_seconds) { double('queue duration seconds metric') }
......@@ -26,27 +37,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do
end
describe '#initialize' do
it 'sets general metrics' do
it 'sets concurrency metrics' do
expect(concurrency_metric).to receive(:set).with({}, Sidekiq.options[:concurrency].to_i)
middleware
subject
end
end
it 'ignore user execution when measured 0' do
allow(completion_seconds_metric).to receive(:observe)
expect(user_execution_seconds_metric).not_to receive(:observe)
end
describe '#call' do
let(:worker) { double(:worker) }
let(:job) { {} }
let(:job_status) { :done }
let(:labels) { { queue: :test } }
let(:labels_with_job_status) { { queue: :test, job_status: job_status } }
let(:thread_cputime_before) { 1 }
let(:thread_cputime_after) { 2 }
let(:thread_cputime_duration) { thread_cputime_after - thread_cputime_before }
......@@ -58,7 +56,7 @@ describe Gitlab::SidekiqMiddleware::Metrics do
let(:queue_duration_for_job) { 0.01 }
before do
allow(middleware).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
......@@ -71,18 +69,30 @@ describe Gitlab::SidekiqMiddleware::Metrics do
end
it 'yields block' do
expect { |b| middleware.call(worker, job, :test, &b) }.to yield_control.once
expect { |b| subject.call(worker, job, :test, &b) }.to yield_control.once
end
it 'sets queue specific metrics' do
middleware.call(worker, job, :test) { nil }
subject.call(worker, job, :test) { nil }
end
context 'when job_duration is not available' do
let(:queue_duration_for_job) { nil }
it 'does not set the queue_duration_seconds histogram' do
middleware.call(worker, job, :test) { nil }
expect(queue_duration_seconds).not_to receive(:observe)
subject.call(worker, job, :test) { nil }
end
end
context 'when error is raised' do
let(:job_status) { :fail }
it 'sets sidekiq_jobs_failed_total and reraises' do
expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect { subject.call(worker, job, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end
end
......@@ -92,17 +102,102 @@ describe Gitlab::SidekiqMiddleware::Metrics do
it 'sets sidekiq_jobs_retried_total metric' do
expect(retried_total_metric).to receive(:increment)
middleware.call(worker, job, :test) { nil }
subject.call(worker, job, :test) { nil }
end
end
end
end
context "with prometheus integrated" do
describe '#call' do
it 'yields block' do
expect { |b| subject.call(worker, job, :test, &b) }.to yield_control.once
end
context 'when error is raised' do
let(:job_status) { :fail }
it 'sets sidekiq_jobs_failed_total and reraises' do
expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect { subject.call(worker, job, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end
end
end
end
end
context "when workers are not attributed" do
class TestNonAttributedWorker
include Sidekiq::Worker
end
let(:worker) { TestNonAttributedWorker.new }
let(:labels) { default_labels }
it_behaves_like "a metrics middleware"
end
context "when workers are attributed" do
def create_attributed_worker_class(latency_sensitive, external_dependencies, resource_boundary, category)
Class.new do
include Sidekiq::Worker
include WorkerAttributes
latency_sensitive_worker! if latency_sensitive
worker_has_external_dependencies! if external_dependencies
worker_resource_boundary resource_boundary unless resource_boundary == :unknown
feature_category category unless category.nil?
end
end
let(:latency_sensitive) { false }
let(:external_dependencies) { false }
let(:resource_boundary) { :unknown }
let(:feature_category) { nil }
let(:worker_class) { create_attributed_worker_class(latency_sensitive, external_dependencies, resource_boundary, feature_category) }
let(:worker) { worker_class.new }
context "latency sensitive" do
let(:latency_sensitive) { true }
let(:labels) { default_labels.merge(latency_sensitive: "yes") }
it_behaves_like "a metrics middleware"
end
context "external dependencies" do
let(:external_dependencies) { true }
let(:labels) { default_labels.merge(external_dependencies: "yes") }
it_behaves_like "a metrics middleware"
end
context "cpu boundary" do
let(:resource_boundary) { :cpu }
let(:labels) { default_labels.merge(boundary: "cpu") }
it_behaves_like "a metrics middleware"
end
context "memory boundary" do
let(:resource_boundary) { :memory }
let(:labels) { default_labels.merge(boundary: "memory") }
it_behaves_like "a metrics middleware"
end
context "feature category" do
let(:feature_category) { :authentication }
let(:labels) { default_labels.merge(feature_category: "authentication") }
it_behaves_like "a metrics middleware"
end
context "combined" do
let(:latency_sensitive) { true }
let(:external_dependencies) { true }
let(:resource_boundary) { :cpu }
let(:feature_category) { :authentication }
let(:labels) { default_labels.merge(latency_sensitive: "yes", external_dependencies: "yes", boundary: "cpu", feature_category: "authentication") }
expect { middleware.call(worker, job, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
it_behaves_like "a metrics middleware"
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment