Commit c7bfe9fc authored by Sean McGivern's avatar Sean McGivern

Record metrics for JobWaiter timeouts

When a JobWaiter times out, the caller doesn't know that all the jobs
have completed. This is particularly important for the
AuthorizedProjectsWorker, which is called with a timeout of 10 seconds
from various controllers.

In order to improve that worker, we first want to track how often these
waiters time out, as compared to how often we attempt to wait. Based on
the results, we can consider different approaches to making this meet
its timeout more frequently.
parent 7dd5296d
...@@ -9,7 +9,7 @@ module WaitableWorker ...@@ -9,7 +9,7 @@ module WaitableWorker
# Short-circuit: it's more efficient to do small numbers of jobs inline # Short-circuit: it's more efficient to do small numbers of jobs inline
return bulk_perform_inline(args_list) if args_list.size <= 3 return bulk_perform_inline(args_list) if args_list.size <= 3
waiter = Gitlab::JobWaiter.new(args_list.size) waiter = Gitlab::JobWaiter.new(args_list.size, worker_label: self.to_s)
# Point all the bulk jobs at the same JobWaiter. Converts, [[1], [2], [3]] # Point all the bulk jobs at the same JobWaiter. Converts, [[1], [2], [3]]
# into [[1, "key"], [2, "key"], [3, "key"]] # into [[1, "key"], [2, "key"], [3, "key"]]
......
...@@ -19,6 +19,9 @@ module Gitlab ...@@ -19,6 +19,9 @@ module Gitlab
class JobWaiter class JobWaiter
KEY_PREFIX = "gitlab:job_waiter" KEY_PREFIX = "gitlab:job_waiter"
STARTED_METRIC = :gitlab_job_waiter_started_total
TIMEOUTS_METRIC = :gitlab_job_waiter_timeouts_total
def self.notify(key, jid) def self.notify(key, jid)
Gitlab::Redis::SharedState.with { |redis| redis.lpush(key, jid) } Gitlab::Redis::SharedState.with { |redis| redis.lpush(key, jid) }
end end
...@@ -27,15 +30,16 @@ module Gitlab ...@@ -27,15 +30,16 @@ module Gitlab
key.is_a?(String) && key =~ /\A#{KEY_PREFIX}:\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/ key.is_a?(String) && key =~ /\A#{KEY_PREFIX}:\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/
end end
attr_reader :key, :finished attr_reader :key, :finished, :worker_label
attr_accessor :jobs_remaining attr_accessor :jobs_remaining
# jobs_remaining - the number of jobs left to wait for # jobs_remaining - the number of jobs left to wait for
# key - The key of this waiter. # key - The key of this waiter.
def initialize(jobs_remaining = 0, key = "#{KEY_PREFIX}:#{SecureRandom.uuid}") def initialize(jobs_remaining = 0, key = "#{KEY_PREFIX}:#{SecureRandom.uuid}", worker_label: nil)
@key = key @key = key
@jobs_remaining = jobs_remaining @jobs_remaining = jobs_remaining
@finished = [] @finished = []
@worker_label = worker_label
end end
# Waits for all the jobs to be completed. # Waits for all the jobs to be completed.
...@@ -45,6 +49,7 @@ module Gitlab ...@@ -45,6 +49,7 @@ module Gitlab
# long to process, or is never processed. # long to process, or is never processed.
def wait(timeout = 10) def wait(timeout = 10)
deadline = Time.now.utc + timeout deadline = Time.now.utc + timeout
increment_counter(STARTED_METRIC)
Gitlab::Redis::SharedState.with do |redis| Gitlab::Redis::SharedState.with do |redis|
# Fallback key expiry: allow a long grace period to reduce the chance of # Fallback key expiry: allow a long grace period to reduce the chance of
...@@ -60,7 +65,12 @@ module Gitlab ...@@ -60,7 +65,12 @@ module Gitlab
break if seconds_left <= 0 break if seconds_left <= 0
list, jid = redis.blpop(key, timeout: seconds_left) list, jid = redis.blpop(key, timeout: seconds_left)
break unless list && jid # timed out
# timed out
unless list && jid
increment_counter(TIMEOUTS_METRIC)
break
end
@finished << jid @finished << jid
@jobs_remaining -= 1 @jobs_remaining -= 1
...@@ -72,5 +82,20 @@ module Gitlab ...@@ -72,5 +82,20 @@ module Gitlab
finished finished
end end
private
def increment_counter(metric)
return unless worker_label
metrics[metric].increment(worker: worker_label)
end
def metrics
@metrics ||= {
STARTED_METRIC => Gitlab::Metrics.counter(STARTED_METRIC, 'JobWaiter attempts started'),
TIMEOUTS_METRIC => Gitlab::Metrics.counter(TIMEOUTS_METRIC, 'JobWaiter attempts timed out')
}
end
end end
end end
...@@ -37,5 +37,40 @@ describe Gitlab::JobWaiter do ...@@ -37,5 +37,40 @@ describe Gitlab::JobWaiter do
expect(result).to contain_exactly('a') expect(result).to contain_exactly('a')
end end
context 'when a label is provided' do
let(:waiter) { described_class.new(2, worker_label: 'Foo') }
let(:started_total) { double(:started_total) }
let(:timeouts_total) { double(:timeouts_total) }
before do
allow(Gitlab::Metrics).to receive(:counter)
.with(described_class::STARTED_METRIC, anything)
.and_return(started_total)
allow(Gitlab::Metrics).to receive(:counter)
.with(described_class::TIMEOUTS_METRIC, anything)
.and_return(timeouts_total)
end
it 'increments just job_waiter_started_total when all jobs complete' do
expect(started_total).to receive(:increment).with(worker: 'Foo')
expect(timeouts_total).not_to receive(:increment)
described_class.notify(waiter.key, 'a')
described_class.notify(waiter.key, 'b')
result = nil
expect { Timeout.timeout(1) { result = waiter.wait(2) } }.not_to raise_error
end
it 'increments job_waiter_started_total and job_waiter_timeouts_total when it times out' do
expect(started_total).to receive(:increment).with(worker: 'Foo')
expect(timeouts_total).to receive(:increment).with(worker: 'Foo')
result = nil
expect { Timeout.timeout(2) { result = waiter.wait(1) } }.not_to raise_error
end
end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment