Commit abdc0a7e authored by Sean McGivern

Merge branch 'make-register-job-service-to-be-resillient' into 'master'

Make register job service resilient

See merge request gitlab-org/gitlab!19150
parents b26b4e10 fd24f258
......@@ -35,6 +35,10 @@ module Ci
refspecs: -> (build) { build.merge_request_ref? }
}.freeze
DEFAULT_RETRIES = {
scheduler_failure: 2
}.freeze
has_one :deployment, as: :deployable, class_name: 'Deployment'
has_many :trace_sections, class_name: 'Ci::BuildTraceSection'
has_many :trace_chunks, class_name: 'Ci::BuildTraceChunk', foreign_key: :build_id
......@@ -372,18 +376,25 @@ module Ci
pipeline.builds.retried.where(name: self.name).count
end
def retries_max
normalized_retry.fetch(:max, 0)
def retry_failure?
max_allowed_retries = nil
max_allowed_retries ||= options_retry_max if retry_on_reason_or_always?
max_allowed_retries ||= DEFAULT_RETRIES.fetch(failure_reason.to_sym, 0)
max_allowed_retries > 0 && retries_count < max_allowed_retries
end
def retry_when
normalized_retry.fetch(:when, ['always'])
def options_retry_max
options_retry[:max]
end
def retry_failure?
return false if retries_max.zero? || retries_count >= retries_max
def options_retry_when
options_retry.fetch(:when, ['always'])
end
retry_when.include?('always') || retry_when.include?(failure_reason.to_s)
def retry_on_reason_or_always?
options_retry_when.include?(failure_reason.to_s) ||
options_retry_when.include?('always')
end
def latest?
......@@ -831,6 +842,13 @@ module Ci
:creating
end
# Consider this object to have structural integrity problems
def doom!
update_columns(
status: :failed,
failure_reason: :data_integrity_failure)
end
private
def successful_deployment_status
......@@ -875,8 +893,8 @@ module Ci
# format, but builds created before GitLab 11.5 and saved in database still
# have the old integer only format. This method returns the retry option
# normalized as a hash in 11.5+ format.
def normalized_retry
strong_memoize(:normalized_retry) do
def options_retry
strong_memoize(:options_retry) do
value = options&.dig(:retry)
value = value.is_a?(Integer) ? { max: value } : value.to_h
value.with_indifferent_access
......
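For context, a minimal self-contained Ruby sketch of the retry decision introduced above (a simplification, not the GitLab implementation; method and constant names are illustrative): the job's own retry: options only count when their when: list matches the failure reason or contains 'always'; otherwise the new per-reason DEFAULT_RETRIES fallback applies, which is what gives scheduler_failure two automatic retries even with no retry: configuration.

# sketch.rb — standalone illustration of the decision flow
DEFAULT_RETRIES = { scheduler_failure: 2 }.freeze

def retry_failure?(retry_options, failure_reason, retries_count)
  retry_when = retry_options.fetch(:when, ['always'])
  reason_matches = retry_when.include?('always') ||
    retry_when.include?(failure_reason.to_s)

  max = retry_options[:max] if reason_matches
  max ||= DEFAULT_RETRIES.fetch(failure_reason, 0)

  max > 0 && retries_count < max
end

puts retry_failure?({}, :scheduler_failure, 1)                             # => true (default of 2)
puts retry_failure?({}, :scheduler_failure, 2)                             # => false (default exhausted)
puts retry_failure?({ when: %w[script_failure], max: 2 }, :api_failure, 0) # => false (reason not listed)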
......@@ -15,7 +15,9 @@ module CommitStatusEnums
stale_schedule: 7,
job_execution_timeout: 8,
archived_failure: 9,
unmet_prerequisites: 10
unmet_prerequisites: 10,
scheduler_failure: 11,
data_integrity_failure: 12
}
end
end
......
......@@ -11,7 +11,9 @@ class CommitStatusPresenter < Gitlab::View::Presenter::Delegated
stale_schedule: 'Delayed job could not be executed for some reason, please try again',
job_execution_timeout: 'The script exceeded the maximum execution time set for the job',
archived_failure: 'The job is archived and cannot be run',
unmet_prerequisites: 'The job failed to complete prerequisite tasks'
unmet_prerequisites: 'The job failed to complete prerequisite tasks',
scheduler_failure: 'The scheduler failed to assign the job to a runner, please try again or contact the system administrator',
data_integrity_failure: 'A structural integrity problem has been detected, please contact the system administrator'
}.freeze
private_constant :CALLOUT_FAILURE_MESSAGES
......@@ -33,6 +35,6 @@ class CommitStatusPresenter < Gitlab::View::Presenter::Delegated
end
def unrecoverable?
script_failure? || missing_dependency_failure? || archived_failure?
script_failure? || missing_dependency_failure? || archived_failure? || scheduler_failure? || data_integrity_failure?
end
end
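A small standalone sketch of what the unrecoverable? change above amounts to (illustrative only, not the presenter itself): the two new failure reasons join the set of reasons the presenter treats as not recoverable.

UNRECOVERABLE_REASONS = %i[
  script_failure missing_dependency_failure archived_failure
  scheduler_failure data_integrity_failure
].freeze

def unrecoverable?(failure_reason)
  UNRECOVERABLE_REASONS.include?(failure_reason.to_sym)
end

puts unrecoverable?(:data_integrity_failure) # => true
puts unrecoverable?(:api_failure)            # => false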
......@@ -42,15 +42,34 @@ module Ci
end
builds.each do |build|
next unless runner.can_pick?(build)
result = process_build(build, params)
next unless result
if result.valid?
register_success(result.build)
return result
else
# The usage of `valid` is described in the
# handling of ActiveRecord::StaleObjectError
valid = false
end
end
register_failure
Result.new(nil, valid)
end
# rubocop: enable CodeReuse/ActiveRecord
private
def process_build(build, params)
return unless runner.can_pick?(build)
begin
# In case 2 runners try to assign the same build, the second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when calling run! or save.
if assign_runner!(build, params)
register_success(build)
return Result.new(build, true)
Result.new(build, true)
end
rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting
......@@ -62,16 +81,16 @@ module Ci
# In case we hit the concurrency-access lock,
# we still have to return 409 in the end,
# to make sure that this is properly handled by the runner.
valid = false
end
end
Result.new(nil, false)
rescue => ex
raise ex unless Feature.enabled?(:ci_doom_build, default_enabled: true)
register_failure
Result.new(nil, valid)
end
# rubocop: enable CodeReuse/ActiveRecord
scheduler_failure!(build)
track_exception_for_build(ex, build)
private
# skip, and move to the next one
nil
end
def assign_runner!(build, params)
build.runner_id = runner.id
......@@ -96,6 +115,28 @@ module Ci
true
end
def scheduler_failure!(build)
Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
subject.drop!(:scheduler_failure)
end
rescue => ex
build.doom!
# This requires an extra exception report, otherwise we would lose the information
# about why we could not perform `scheduler_failure`
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::Sentry.track_acceptable_exception(ex, extra: {
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
})
end
# rubocop: disable CodeReuse/ActiveRecord
def builds_for_shared_runner
new_builds.
......
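To make the flow of the new process_build error handling easier to follow, here is a self-contained simplification (stub objects instead of ActiveRecord models; not the actual service code): an unexpected error while scheduling a build first drops it with :scheduler_failure; if that drop itself raises, the build is doomed (failed with :data_integrity_failure) so it cannot be picked again, and both exceptions are reported rather than failing the whole register request.

# sketch.rb — standalone illustration of the doom/scheduler_failure fallback
Build = Struct.new(:id, :status, :failure_reason, :broken) do
  def drop!(reason)
    raise 'data integrity problem' if broken # simulates drop! itself failing
    self.status = :failed
    self.failure_reason = reason
  end

  def doom!
    self.status = :failed
    self.failure_reason = :data_integrity_failure
  end
end

def report(ex, build)
  puts "tracked #{ex.class}: #{ex.message} for build #{build.id}"
end

def run!(_build)
  raise 'scheduler error' # stands in for run!/save raising unexpectedly
end

def process_build(build)
  run!(build)
  build
rescue StandardError => ex
  begin
    build.drop!(:scheduler_failure)
  rescue StandardError => drop_error
    build.doom!
    report(drop_error, build)
  end
  report(ex, build)
  nil # skip this build; the caller keeps looking for another one
end

healthy = Build.new(1, :pending, nil, false)
broken  = Build.new(2, :pending, nil, true)
process_build(healthy)
process_build(broken)
p healthy.failure_reason # => :scheduler_failure
p broken.failure_reason  # => :data_integrity_failure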
---
title: Make `jobs/request` resilient
merge_request: 19150
author:
type: fixed
......@@ -16,7 +16,9 @@ module Gitlab
stale_schedule: 'stale schedule',
job_execution_timeout: 'job execution timeout',
archived_failure: 'archived failure',
unmet_prerequisites: 'unmet prerequisites'
unmet_prerequisites: 'unmet prerequisites',
scheduler_failure: 'scheduler failure',
data_integrity_failure: 'data integrity failure'
}.freeze
private_constant :REASONS
......
......@@ -1587,7 +1587,7 @@ describe Ci::Build do
end
end
describe '#retries_max' do
describe '#options_retry_max' do
context 'with retries max config option' do
subject { create(:ci_build, options: { retry: { max: 1 } }) }
......@@ -1597,7 +1597,7 @@ describe Ci::Build do
end
it 'returns the number of configured max retries' do
expect(subject.retries_max).to eq 1
expect(subject.options_retry_max).to eq 1
end
end
......@@ -1607,7 +1607,7 @@ describe Ci::Build do
end
it 'returns the number of configured max retries' do
expect(subject.retries_max).to eq 1
expect(subject.options_retry_max).to eq 1
end
end
end
......@@ -1615,16 +1615,16 @@ describe Ci::Build do
context 'without retries max config option' do
subject { create(:ci_build) }
it 'returns zero' do
expect(subject.retries_max).to eq 0
it 'returns nil' do
expect(subject.options_retry_max).to be_nil
end
end
context 'when build is degenerated' do
subject { create(:ci_build, :degenerated) }
it 'returns zero' do
expect(subject.retries_max).to eq 0
it 'returns nil' do
expect(subject.options_retry_max).to be_nil
end
end
......@@ -1632,17 +1632,17 @@ describe Ci::Build do
subject { create(:ci_build, options: { retry: 1 }) }
it 'returns the number of configured max retries' do
expect(subject.retries_max).to eq 1
expect(subject.options_retry_max).to eq 1
end
end
end
describe '#retry_when' do
describe '#options_retry_when' do
context 'with retries when config option' do
subject { create(:ci_build, options: { retry: { when: ['some_reason'] } }) }
it 'returns the configured when' do
expect(subject.retry_when).to eq ['some_reason']
expect(subject.options_retry_when).to eq ['some_reason']
end
end
......@@ -1650,7 +1650,7 @@ describe Ci::Build do
subject { create(:ci_build) }
it 'returns always array' do
expect(subject.retry_when).to eq ['always']
expect(subject.options_retry_when).to eq ['always']
end
end
......@@ -1658,72 +1658,38 @@ describe Ci::Build do
subject { create(:ci_build, options: { retry: 1 }) }
it 'returns always array' do
expect(subject.retry_when).to eq ['always']
expect(subject.options_retry_when).to eq ['always']
end
end
end
describe '#retry_failure?' do
subject { create(:ci_build) }
context 'when retries max is zero' do
before do
expect(subject).to receive(:retries_max).at_least(:once).and_return(0)
end
it 'returns false' do
expect(subject.retry_failure?).to eq false
end
end
context 'when retries max equals retries count' do
before do
expect(subject).to receive(:retries_max).at_least(:once).and_return(1)
expect(subject).to receive(:retries_count).at_least(:once).and_return(1)
end
it 'returns false' do
expect(subject.retry_failure?).to eq false
end
end
using RSpec::Parameterized::TableSyntax
context 'when retries max is higher than retries count' do
before do
expect(subject).to receive(:retries_max).at_least(:once).and_return(2)
expect(subject).to receive(:retries_count).at_least(:once).and_return(1)
end
let(:build) { create(:ci_build) }
context 'and retry when is always' do
before do
expect(subject).to receive(:retry_when).at_least(:once).and_return(['always'])
end
subject { build.retry_failure? }
it 'returns true' do
expect(subject.retry_failure?).to eq true
end
where(:description, :retry_count, :options, :failure_reason, :result) do
"retries are disabled" | 0 | { max: 0 } | nil | false
"max equals count" | 2 | { max: 2 } | nil | false
"max is higher than count" | 1 | { max: 2 } | nil | true
"matching failure reason" | 0 | { when: %w[api_failure], max: 2 } | :api_failure | true
"not matching with always" | 0 | { when: %w[always], max: 2 } | :api_failure | true
"not matching reason" | 0 | { when: %w[script_error], max: 2 } | :api_failure | false
"scheduler failure override" | 1 | { when: %w[scheduler_failure], max: 1 } | :scheduler_failure | false
"default for scheduler failure" | 1 | {} | :scheduler_failure | true
end
context 'and retry when includes the failure_reason' do
with_them do
before do
expect(subject).to receive(:failure_reason).at_least(:once).and_return('some_reason')
expect(subject).to receive(:retry_when).at_least(:once).and_return(['some_reason'])
end
allow(build).to receive(:retries_count) { retry_count }
it 'returns true' do
expect(subject.retry_failure?).to eq true
end
build.options[:retry] = options
build.failure_reason = failure_reason
end
context 'and retry when does not include failure_reason' do
before do
expect(subject).to receive(:failure_reason).at_least(:once).and_return('some_reason')
expect(subject).to receive(:retry_when).at_least(:once).and_return(['some', 'other failure'])
end
it 'returns false' do
expect(subject.retry_failure?).to eq false
end
end
it { is_expected.to eq(result) }
end
end
end
......
......@@ -269,7 +269,7 @@ describe Ci::BuildPresenter do
let(:build) { create(:ci_build, :failed, :script_failure) }
context 'when is a script or missing dependency failure' do
let(:failure_reasons) { %w(script_failure missing_dependency_failure archived_failure) }
let(:failure_reasons) { %w(script_failure missing_dependency_failure archived_failure scheduler_failure data_integrity_failure) }
it 'returns false' do
failure_reasons.each do |failure_reason|
......
......@@ -773,8 +773,8 @@ describe Ci::CreatePipelineService do
it 'correctly creates builds with auto-retry value configured' do
expect(pipeline).to be_persisted
expect(rspec_job.retries_max).to eq 2
expect(rspec_job.retry_when).to eq ['always']
expect(rspec_job.options_retry_max).to eq 2
expect(rspec_job.options_retry_when).to eq ['always']
end
end
......@@ -783,8 +783,8 @@ describe Ci::CreatePipelineService do
it 'correctly creates builds with auto-retry value configured' do
expect(pipeline).to be_persisted
expect(rspec_job.retries_max).to eq 2
expect(rspec_job.retry_when).to eq ['runner_system_failure']
expect(rspec_job.options_retry_max).to eq 2
expect(rspec_job.options_retry_when).to eq ['runner_system_failure']
end
end
end
......
......@@ -502,6 +502,57 @@ module Ci
end
end
context 'when build has data integrity problem' do
let!(:pending_job) do
create(:ci_build, :pending, pipeline: pipeline)
end
before do
pending_job.update_columns(options: "string")
end
subject { execute(specific_runner, {}) }
it 'does drop the build and logs both failures' do
expect(Gitlab::Sentry).to receive(:track_acceptable_exception)
.with(anything, a_hash_including(extra: a_hash_including(build_id: pending_job.id)))
.twice
.and_call_original
expect(subject).to be_nil
pending_job.reload
expect(pending_job).to be_failed
expect(pending_job).to be_data_integrity_failure
end
end
context 'when build fails to be run!' do
let!(:pending_job) do
create(:ci_build, :pending, pipeline: pipeline)
end
before do
expect_any_instance_of(Ci::Build).to receive(:run!)
.and_raise(RuntimeError, 'scheduler error')
end
subject { execute(specific_runner, {}) }
it 'does drop the build and logs failure' do
expect(Gitlab::Sentry).to receive(:track_acceptable_exception)
.with(anything, a_hash_including(extra: a_hash_including(build_id: pending_job.id)))
.once
.and_call_original
expect(subject).to be_nil
pending_job.reload
expect(pending_job).to be_failed
expect(pending_job).to be_scheduler_failure
end
end
context 'when an exception is raised during a persistent ref creation' do
before do
allow_any_instance_of(Ci::PersistentRef).to receive(:exist?) { false }
......