Commit 2a35e6da authored by Nick Thomas's avatar Nick Thomas

Merge branch 'rescue-stuck-resource-group' into 'master'

Rescue stuck resource groups

See merge request gitlab-org/gitlab!66729
parents 85ce50bd 5468378c
...@@ -8,6 +8,21 @@ module Ci ...@@ -8,6 +8,21 @@ module Ci
belongs_to :processable, class_name: 'Ci::Processable', foreign_key: 'build_id', inverse_of: :resource belongs_to :processable, class_name: 'Ci::Processable', foreign_key: 'build_id', inverse_of: :resource
scope :free, -> { where(processable: nil) } scope :free, -> { where(processable: nil) }
scope :retained, -> { where.not(processable: nil) }
scope :retained_by, -> (processable) { where(processable: processable) } scope :retained_by, -> (processable) { where(processable: processable) }
class << self
# In some cases, state machine hooks in `Ci::Build` are skipped
# even if the job status transitions to a complete state.
# For example, `Ci::Build#doom!` (a.k.a `data_integrity_failure`) doesn't execute state machine hooks.
# To handle these edge cases, we check the staleness of the jobs that currently
# assigned to the resources, and release if it's stale.
# See https://gitlab.com/gitlab-org/gitlab/-/issues/335537#note_632925914 for more information.
def stale_processables
Ci::Processable.where(id: retained.select(:build_id))
.complete
.updated_at_before(5.minutes.ago)
end
end
end end
end end
...@@ -58,6 +58,7 @@ class CommitStatus < ApplicationRecord ...@@ -58,6 +58,7 @@ class CommitStatus < ApplicationRecord
scope :in_pipelines, ->(pipelines) { where(pipeline: pipelines) } scope :in_pipelines, ->(pipelines) { where(pipeline: pipelines) }
scope :eager_load_pipeline, -> { eager_load(:pipeline, project: { namespace: :route }) } scope :eager_load_pipeline, -> { eager_load(:pipeline, project: { namespace: :route }) }
scope :with_pipeline, -> { joins(:pipeline) } scope :with_pipeline, -> { joins(:pipeline) }
scope :updated_at_before, ->(date) { where('updated_at < ?', date) }
scope :updated_before, ->(lookback:, timeout:) { scope :updated_before, ->(lookback:, timeout:) {
where('(ci_builds.created_at BETWEEN ? AND ?) AND (ci_builds.updated_at BETWEEN ? AND ?)', lookback, timeout, lookback, timeout) where('(ci_builds.created_at BETWEEN ? AND ?) AND (ci_builds.updated_at BETWEEN ? AND ?)', lookback, timeout, lookback, timeout)
} }
......
...@@ -93,6 +93,7 @@ module Ci ...@@ -93,6 +93,7 @@ module Ci
scope :running_or_pending, -> { with_status(:running, :pending) } scope :running_or_pending, -> { with_status(:running, :pending) }
scope :finished, -> { with_status(:success, :failed, :canceled) } scope :finished, -> { with_status(:success, :failed, :canceled) }
scope :failed_or_canceled, -> { with_status(:failed, :canceled) } scope :failed_or_canceled, -> { with_status(:failed, :canceled) }
scope :complete, -> { with_status(completed_statuses) }
scope :incomplete, -> { without_statuses(completed_statuses) } scope :incomplete, -> { without_statuses(completed_statuses) }
scope :cancelable, -> do scope :cancelable, -> do
......
...@@ -5,6 +5,8 @@ module Ci ...@@ -5,6 +5,8 @@ module Ci
class AssignResourceFromResourceGroupService < ::BaseService class AssignResourceFromResourceGroupService < ::BaseService
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def execute(resource_group) def execute(resource_group)
release_resource_from_stale_jobs(resource_group)
free_resources = resource_group.resources.free.count free_resources = resource_group.resources.free.count
resource_group.processables.waiting_for_resource.take(free_resources).each do |processable| resource_group.processables.waiting_for_resource.take(free_resources).each do |processable|
...@@ -12,6 +14,14 @@ module Ci ...@@ -12,6 +14,14 @@ module Ci
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
private
def release_resource_from_stale_jobs(resource_group)
resource_group.resources.stale_processables.find_each do |processable|
resource_group.release_resource_from(processable)
end
end
end end
end end
end end
...@@ -15,6 +15,22 @@ RSpec.describe Ci::Resource do ...@@ -15,6 +15,22 @@ RSpec.describe Ci::Resource do
end end
end end
describe '.retained' do
subject { described_class.retained }
it "returns the resource if it's retained" do
resource = create(:ci_resource, processable: create(:ci_build))
is_expected.to eq([resource])
end
it "returns empty if it's not retained" do
create(:ci_resource, processable: nil)
is_expected.to be_empty
end
end
describe '.retained_by' do describe '.retained_by' do
subject { described_class.retained_by(build) } subject { described_class.retained_by(build) }
...@@ -25,4 +41,40 @@ RSpec.describe Ci::Resource do ...@@ -25,4 +41,40 @@ RSpec.describe Ci::Resource do
is_expected.to eq([resource]) is_expected.to eq([resource])
end end
end end
describe '.stale_processables' do
subject { resource_group.resources.stale_processables }
let!(:resource_group) { create(:ci_resource_group) }
let!(:resource) { create(:ci_resource, processable: build, resource_group: resource_group) }
context 'when the processable is running' do
let!(:build) { create(:ci_build, :running, resource_group: resource_group) }
before do
# Creating unrelated builds to make sure the `retained` scope is working
create(:ci_build, :running, resource_group: resource_group)
end
it 'returns empty' do
is_expected.to be_empty
end
context 'and doomed' do
before do
build.doom!
end
it 'returns empty' do
is_expected.to be_empty
end
it 'returns the stale prosessable a few minutes later' do
travel_to(10.minutes.since) do
is_expected.to eq([build])
end
end
end
end
end
end end
...@@ -79,6 +79,15 @@ RSpec.describe CommitStatus do ...@@ -79,6 +79,15 @@ RSpec.describe CommitStatus do
end end
end end
describe '.updated_at_before' do
it 'finds the relevant records' do
status = create(:commit_status, updated_at: 1.day.ago, project: project)
create(:commit_status, updated_at: 1.day.since, project: project)
expect(described_class.updated_at_before(Time.current)).to eq([status])
end
end
describe '.updated_before' do describe '.updated_before' do
let!(:lookback) { 5.days.ago } let!(:lookback) { 5.days.ago }
let!(:timeout) { 1.day.ago } let!(:timeout) { 1.day.ago }
......
...@@ -351,6 +351,18 @@ RSpec.describe Ci::HasStatus do ...@@ -351,6 +351,18 @@ RSpec.describe Ci::HasStatus do
it_behaves_like 'not containing the job', status it_behaves_like 'not containing the job', status
end end
end end
describe '.complete' do
subject { CommitStatus.complete }
described_class::COMPLETED_STATUSES.each do |status|
it_behaves_like 'containing the job', status
end
described_class::ACTIVE_STATUSES.each do |status|
it_behaves_like 'not containing the job', status
end
end
end end
describe '::DEFAULT_STATUS' do describe '::DEFAULT_STATUS' do
......
...@@ -51,14 +51,32 @@ RSpec.describe Ci::ResourceGroups::AssignResourceFromResourceGroupService do ...@@ -51,14 +51,32 @@ RSpec.describe Ci::ResourceGroups::AssignResourceFromResourceGroupService do
end end
context 'when there are no available resources' do context 'when there are no available resources' do
let!(:other_build) { create(:ci_build) }
before do before do
resource_group.assign_resource_to(create(:ci_build)) resource_group.assign_resource_to(other_build)
end end
it 'does not request resource' do it 'does not request resource' do
expect_any_instance_of(Ci::Build).not_to receive(:enqueue_waiting_for_resource) expect_any_instance_of(Ci::Build).not_to receive(:enqueue_waiting_for_resource)
subject subject
expect(build.reload).to be_waiting_for_resource
end
context 'when there is a stale build assigned to a resource' do
before do
other_build.doom!
other_build.update_column(:updated_at, 10.minutes.ago)
end
it 'releases the resource from the stale build and assignes to the waiting build' do
subject
expect(build.reload).to be_pending
expect(build.resource).to be_present
end
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment