Commit c4c42d29 authored by Stan Hu's avatar Stan Hu

Measure Sidekiq enqueue latency for scheduled jobs

When Sidekiq schedules a job to be run later, it sets the `at` field
with the desired time and inserts the payload into the `scheduled`
queue. Sidekiq runs a separate thread that polls this queue via a Redis
`zrangebyscore` command and inserts jobs that have passed the scheduled
time. Sidekiq uses a random delay that is based on the number of
processes running at a given time.

We observed that this enqueuing of scheduled jobs can add an unexpected,
significant delay and increase CPU utilization on Redis as more Sidekiq
processes connect. Before we can make any tweaks to this configuration,
we should first measure how much delay we have.

This commit adds a `enqueue_latency_s` for scheduled jobs to record how
long this delay was. In the Sidekiq client middleware, we have access to
the `at` field before it is deleted by Sidekiq, so we save it as the
`scheduled_at` field. We subtract the difference between `enqueued_at`
and `scheduled_at` and log `enqueue_latency_s`.

Relates to
https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1179

Changelog: added
parent 5cdebc47
......@@ -131,18 +131,43 @@ module Gitlab
enqueued_at_time = convert_to_time(enqueued_at)
return unless enqueued_at_time
# Its possible that if theres clock-skew between two nodes
# this value may be less than zero. In that event, we record the value
round_elapsed_time(enqueued_at_time)
end
# Returns the time it took for a scheduled job to be enqueued in seconds, as a float,
# if the `scheduled_at` and `enqueued_at` fields are available.
#
# * If the job doesn't contain sufficient information, returns nil
# * If the job has a start time in the future, returns 0
# * If the job contains an invalid start time value, returns nil
# @param [Hash] job a Sidekiq job, represented as a hash
def self.enqueue_latency_for_scheduled_job(job)
scheduled_at = job['scheduled_at']
enqueued_at = job['enqueued_at']
return unless scheduled_at && enqueued_at
scheduled_at_time = convert_to_time(scheduled_at)
enqueued_at_time = convert_to_time(enqueued_at)
return unless scheduled_at_time && enqueued_at_time
round_elapsed_time(scheduled_at_time, enqueued_at_time)
end
def self.round_elapsed_time(start, end_time = Time.now)
# It's possible that if there is clock-skew between two nodes this
# value may be less than zero. In that event, we record the value
# as zero.
[elapsed_by_absolute_time(enqueued_at_time), 0].max.round(DURATION_PRECISION)
[elapsed_by_absolute_time(start, end_time), 0].max.round(DURATION_PRECISION)
end
# Calculates the time in seconds, as a float, from
# the provided start time until now
#
# @param [Time] start
def self.elapsed_by_absolute_time(start)
(Time.now - start).to_f.round(DURATION_PRECISION)
def self.elapsed_by_absolute_time(start, end_time)
(end_time - start).to_f.round(DURATION_PRECISION)
end
private_class_method :elapsed_by_absolute_time
......
......@@ -55,6 +55,9 @@ module Gitlab
scheduling_latency_s = ::Gitlab::InstrumentationHelper.queue_duration_for_job(payload)
payload['scheduling_latency_s'] = scheduling_latency_s if scheduling_latency_s
enqueue_latency_s = ::Gitlab::InstrumentationHelper.enqueue_latency_for_scheduled_job(payload)
payload['enqueue_latency_s'] = enqueue_latency_s if enqueue_latency_s
payload
end
......
......@@ -16,7 +16,12 @@ module Gitlab
worker_class = worker_class.safe_constantize if worker_class.respond_to?(:safe_constantize)
labels = create_labels(worker_class, queue, job)
labels[:scheduling] = job.key?('at') ? 'delayed' : 'immediate'
if job.key?('at')
labels[:scheduling] = 'delayed'
job[:scheduled_at] = job['at']
else
labels[:scheduling] = 'immediate'
end
@metrics.fetch(ENQUEUED).increment(labels, 1)
......
......@@ -149,8 +149,8 @@ RSpec.describe Gitlab::InstrumentationHelper do
end
end
describe '.queue_duration_for_job' do
where(:enqueued_at, :created_at, :time_now, :expected_duration) do
describe 'duration calculations' do
where(:end_time, :start_time, :time_now, :expected_duration) do
"2019-06-01T00:00:00.000+0000" | nil | "2019-06-01T02:00:00.000+0000" | 2.hours.to_f
"2019-06-01T02:00:00.000+0000" | nil | "2019-06-01T02:00:00.001+0000" | 0.001
"2019-06-01T02:00:00.000+0000" | "2019-05-01T02:00:00.000+0000" | "2019-06-01T02:00:01.000+0000" | 1
......@@ -167,12 +167,26 @@ RSpec.describe Gitlab::InstrumentationHelper do
Time.at(1571999233) | nil | "2019-10-25T12:29:16.000+0200" | 123
end
with_them do
let(:job) { { 'enqueued_at' => enqueued_at, 'created_at' => created_at } }
describe '.queue_duration_for_job' do
with_them do
let(:job) { { 'enqueued_at' => end_time, 'created_at' => start_time } }
it "returns the correct duration" do
Timecop.freeze(Time.iso8601(time_now)) do
expect(described_class.queue_duration_for_job(job)).to eq(expected_duration)
it "returns the correct duration" do
Timecop.freeze(Time.iso8601(time_now)) do
expect(described_class.queue_duration_for_job(job)).to eq(expected_duration)
end
end
end
end
describe '.enqueue_latency_for_scheduled_job' do
with_them do
let(:job) { { 'enqueued_at' => end_time, 'scheduled_at' => start_time } }
it "returns the correct duration" do
Timecop.freeze(Time.iso8601(time_now)) do
expect(described_class.enqueue_latency_for_scheduled_job(job)).to eq(expected_duration)
end
end
end
end
......
......@@ -194,6 +194,35 @@ RSpec.describe Gitlab::SidekiqLogging::StructuredLogger do
end
end
context 'with enqueue latency' do
let(:expected_start_payload) do
start_payload.merge(
'scheduled_at' => job['scheduled_at'],
'enqueue_latency_s' => 1.hour.to_f
)
end
let(:expected_end_payload) do
end_payload.merge('enqueue_latency_s' => 1.hour.to_f)
end
before do
# enqueued_at is set to created_at
job['scheduled_at'] = created_at - 1.hour
end
it 'logs with scheduling latency' do
Timecop.freeze(timestamp) do
expect(logger).to receive(:info).with(expected_start_payload).ordered
expect(logger).to receive(:info).with(expected_end_payload).ordered
expect(subject).to receive(:log_job_start).and_call_original
expect(subject).to receive(:log_job_done).and_call_original
call_subject(job, 'test_queue') { }
end
end
end
context 'with Gitaly, Rugged, and Redis calls' do
let(:timing_data) do
{
......
......@@ -62,6 +62,14 @@ RSpec.describe Gitlab::SidekiqMiddleware::ClientMetrics do
Sidekiq::Testing.inline! { TestWorker.perform_in(1.second) }
end
it 'sets the scheduled_at field' do
job = { 'at' => Time.current }
subject.call('TestWorker', job, 'queue', nil) do
expect(job[:scheduled_at]).to eq(job['at'])
end
end
end
context 'when the worker class cannot be found' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment