Commit 8f0b7a30 authored by drew cimino's avatar drew cimino Committed by Bob Van Landuyt

Create Ci::StuckBuilds::DropRunningService for querying and dropping running builds

parent ca3aa6e0
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropRunningService
include DropHelpers
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
def execute
Gitlab::AppLogger.info "#{self.class}: Cleaning running, timed-out builds"
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
end
private
def running_timed_out_builds
Ci::Build.running.updated_at_before(BUILD_RUNNING_OUTDATED_TIMEOUT.ago)
end
end
end
end
......@@ -5,7 +5,6 @@ module Ci
class DropService
include DropHelpers
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
......@@ -14,8 +13,6 @@ module Ci
def execute
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
pending_builds(BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
......@@ -50,13 +47,6 @@ module Ci
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
end
end
end
......@@ -228,6 +228,15 @@
:weight: 1
:idempotent: true
:tags: []
- :name: cronjob:ci_stuck_builds_drop_running
:worker_name: Ci::StuckBuilds::DropRunningWorker
:feature_category: :continuous_integration
:has_external_dependencies:
:urgency: :low
:resource_boundary: :unknown
:weight: 1
:idempotent: true
:tags: []
- :name: cronjob:container_expiration_policy
:worker_name: ContainerExpirationPolicyWorker
:feature_category: :container_registry
......
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropRunningWorker
include ApplicationWorker
idempotent!
# rubocop:disable Scalability/CronWorkerContext
# This is an instance-wide cleanup query, so there's no meaningful
# scope to consider this in the context of.
include CronjobQueue
# rubocop:enable Scalability/CronWorkerContext
data_consistency :always
feature_category :continuous_integration
EXCLUSIVE_LEASE_KEY = 'ci_stuck_builds_drop_running_worker_lease'
def perform
return unless try_obtain_lease
begin
Ci::StuckBuilds::DropRunningService.new.execute
ensure
remove_lease
end
end
private
def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end
def remove_lease
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end
end
end
end
......@@ -17,6 +17,8 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'
def perform
Ci::StuckBuilds::DropRunningWorker.perform_in(20.minutes)
return unless try_obtain_lease
Ci::StuckBuilds::DropService.new.execute
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropRunningService do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:created_at) { }
let(:updated_at) { }
subject(:service) { described_class.new }
before do
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled scheduled pending).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped'
end
end
......@@ -17,48 +17,6 @@ RSpec.describe Ci::StuckBuilds::DropService do
job.update!(job_attributes)
end
shared_examples 'job is dropped' do
it 'changes status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking).to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end
end
end
shared_examples 'job is unchanged' do
it 'does not change status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job.status).to eq(status)
end
end
context 'when job is pending' do
let(:status) { 'pending' }
......@@ -195,7 +153,7 @@ RSpec.describe Ci::StuckBuilds::DropService do
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
it_behaves_like 'job is unchanged'
end
context 'when job was updated in less than 1 hour ago' do
......@@ -238,7 +196,7 @@ RSpec.describe Ci::StuckBuilds::DropService do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped'
it_behaves_like 'job is unchanged'
end
describe 'drop stale scheduled builds' do
......
# frozen_string_literal: true
RSpec.shared_examples 'job is dropped' do
it 'changes status' do
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job has data integrity problem' do
it 'drops the job and logs the reason' do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking)
.to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end
end
end
RSpec.shared_examples 'job is unchanged' do
it 'does not change status' do
service.execute
job.reload
expect(job.status).to eq(status)
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropRunningWorker do
include ExclusiveLeaseHelpers
let(:worker_lease_key) { Ci::StuckBuilds::DropRunningWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
let(:worker2) { described_class.new }
subject(:worker) { described_class.new }
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
end
describe '#perform' do
it_behaves_like 'an idempotent worker'
it 'executes an instance of Ci::StuckBuilds::DropRunningService' do
expect_next_instance_of(Ci::StuckBuilds::DropRunningService) do |service|
expect(service).to receive(:execute).exactly(:once)
end
worker.perform
end
context 'with an exclusive lease' do
it 'does not execute concurrently' do
expect(worker).to receive(:remove_lease).exactly(:once)
expect(worker2).not_to receive(:remove_lease)
worker.perform
stub_exclusive_lease_taken(worker_lease_key)
worker2.perform
end
it 'can execute in sequence' do
expect(worker).to receive(:remove_lease).at_least(:once)
expect(worker2).to receive(:remove_lease).at_least(:once)
worker.perform
worker2.perform
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
end
context 'when the DropRunningService fails' do
it 'ensures cancellation of the exclusive lease' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
allow_next_instance_of(Ci::StuckBuilds::DropRunningService) do |service|
allow(service).to receive(:execute) do
raise 'The query timed out'
end
end
expect { worker.perform }.to raise_error(/The query timed out/)
end
end
end
end
end
......@@ -16,7 +16,13 @@ RSpec.describe StuckCiJobsWorker do
end
describe '#perform' do
it 'executes an instance of Ci::StuckBuildsDropService' do
it 'enqueues a Ci::StuckBuilds::DropRunningWorker job' do
expect(Ci::StuckBuilds::DropRunningWorker).to receive(:perform_in).with(20.minutes).exactly(:once)
worker.perform
end
it 'executes an instance of Ci::StuckBuilds::DropService' do
expect_next_instance_of(Ci::StuckBuilds::DropService) do |service|
expect(service).to receive(:execute).exactly(:once)
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment