Commit 50027303 authored by drew cimino's avatar drew cimino

Introduce DropScheduledService and Worker

Add new Worker and Service classes for dropping scheduled builds.
This will take more work away from StuckCiBuilds worker in an
effort to reduce DB load and query timeouts
parent 6b946923
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropScheduledService
include DropHelpers
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
def execute
Gitlab::AppLogger.info "#{self.class}: Cleaning scheduled, timed-out builds"
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
end
private
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
end
end
end
...@@ -6,7 +6,6 @@ module Ci ...@@ -6,7 +6,6 @@ module Ci
include DropHelpers include DropHelpers
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days BUILD_LOOKBACK = 5.days
...@@ -18,8 +17,6 @@ module Ci ...@@ -18,8 +17,6 @@ module Ci
failure_reason: :stuck_or_timeout_failure failure_reason: :stuck_or_timeout_failure
) )
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck( drop_stuck(
pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago), pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure failure_reason: :stuck_or_timeout_failure
...@@ -40,13 +37,6 @@ module Ci ...@@ -40,13 +37,6 @@ module Ci
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
end end
end end
end end
...@@ -237,6 +237,15 @@ ...@@ -237,6 +237,15 @@
:weight: 1 :weight: 1
:idempotent: true :idempotent: true
:tags: [] :tags: []
- :name: cronjob:ci_stuck_builds_drop_scheduled
:worker_name: Ci::StuckBuilds::DropScheduledWorker
:feature_category: :continuous_integration
:has_external_dependencies:
:urgency: :low
:resource_boundary: :unknown
:weight: 1
:idempotent: true
:tags: []
- :name: cronjob:container_expiration_policy - :name: cronjob:container_expiration_policy
:worker_name: ContainerExpirationPolicyWorker :worker_name: ContainerExpirationPolicyWorker
:feature_category: :container_registry :feature_category: :container_registry
......
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropScheduledWorker
include ApplicationWorker
idempotent!
# rubocop:disable Scalability/CronWorkerContext
# This is an instance-wide cleanup query, so there's no meaningful
# scope to consider this in the context of.
include CronjobQueue
# rubocop:enable Scalability/CronWorkerContext
data_consistency :always
feature_category :continuous_integration
EXCLUSIVE_LEASE_KEY = 'ci_stuck_builds_drop_scheduled_worker_lease'
def perform
return unless try_obtain_lease
begin
Ci::StuckBuilds::DropScheduledService.new.execute
ensure
remove_lease
end
end
private
def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end
def remove_lease
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end
end
end
end
...@@ -18,6 +18,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -18,6 +18,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
def perform def perform
Ci::StuckBuilds::DropRunningWorker.perform_in(20.minutes) Ci::StuckBuilds::DropRunningWorker.perform_in(20.minutes)
Ci::StuckBuilds::DropScheduledWorker.perform_in(40.minutes)
return unless try_obtain_lease return unless try_obtain_lease
......
...@@ -22,7 +22,7 @@ RSpec.describe Ci::StuckBuilds::DropRunningService do ...@@ -22,7 +22,7 @@ RSpec.describe Ci::StuckBuilds::DropRunningService do
let(:created_at) { outdated_time } let(:created_at) { outdated_time }
let(:updated_at) { outdated_time } let(:updated_at) { outdated_time }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end end
context 'when job is fresh' do context 'when job is fresh' do
...@@ -69,4 +69,15 @@ RSpec.describe Ci::StuckBuilds::DropRunningService do ...@@ -69,4 +69,15 @@ RSpec.describe Ci::StuckBuilds::DropRunningService do
end end
end end
end end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end
end end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropScheduledService do
let_it_be(:runner) { create :ci_runner }
let!(:job) { create :ci_build, :scheduled, scheduled_at: scheduled_at, runner: runner }
subject(:service) { described_class.new }
context 'when job is scheduled' do
context 'for more than an hour ago' do
let(:scheduled_at) { 2.hours.ago }
it_behaves_like 'job is dropped with failure reason', 'stale_schedule'
end
context 'for less than 1 hour ago' do
let(:scheduled_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled running pending).each do |status|
context "when job is #{status}" do
before do
job.update!(status: status)
end
context 'and scheduled for more than an hour ago' do
let(:scheduled_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'and scheduled for less than 1 hour ago' do
let(:scheduled_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
end
context 'for deleted project' do
let(:scheduled_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped with failure reason', 'stale_schedule'
end
context 'when there are no stale scheduled builds' do
let(:job) { }
it 'does not drop the stale scheduled build yet' do
expect { service.execute }.not_to raise_error
end
end
end
...@@ -33,13 +33,13 @@ RSpec.describe Ci::StuckBuilds::DropService do ...@@ -33,13 +33,13 @@ RSpec.describe Ci::StuckBuilds::DropService do
context 'when created_at is the same as updated_at' do context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago } let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end end
context 'when created_at is before updated_at' do context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago } let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end end
context 'when created_at is outside lookback window' do context 'when created_at is outside lookback window' do
...@@ -107,13 +107,13 @@ RSpec.describe Ci::StuckBuilds::DropService do ...@@ -107,13 +107,13 @@ RSpec.describe Ci::StuckBuilds::DropService do
context 'when created_at is the same as updated_at' do context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago } let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end end
context 'when created_at is before updated_at' do context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago } let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped with failure reason', 'stuck_or_timeout_failure'
end end
context 'when created_at is outside lookback window' do context 'when created_at is outside lookback window' do
...@@ -198,45 +198,4 @@ RSpec.describe Ci::StuckBuilds::DropService do ...@@ -198,45 +198,4 @@ RSpec.describe Ci::StuckBuilds::DropService do
it_behaves_like 'job is unchanged' it_behaves_like 'job is unchanged'
end end
describe 'drop stale scheduled builds' do
let(:status) { 'scheduled' }
let(:updated_at) { }
context 'when scheduled at 2 hours ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 2.hours.ago) }
it 'drops the stale scheduled build' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
job.reload
expect(Ci::Build.scheduled.count).to eq(0)
expect(job).to be_failed
expect(job).to be_stale_schedule
end
end
context 'when scheduled at 30 minutes ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 30.minutes.ago) }
it 'does not drop the stale scheduled build yet' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
end
end
context 'when there are no stale scheduled builds' do
it 'does not drop the stale scheduled build yet' do
expect { service.execute }.not_to raise_error
end
end
end
end end
# frozen_string_literal: true # frozen_string_literal: true
RSpec.shared_examples 'job is dropped' do RSpec.shared_examples 'job is dropped with failure reason' do |failure_reason|
it 'changes status' do it 'changes status' do
service.execute service.execute
job.reload job.reload
expect(job).to be_failed expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure expect(job.failure_reason).to eq(failure_reason)
end end
context 'when job has data integrity problem' do context 'when job has data integrity problem' do
...@@ -23,16 +23,13 @@ RSpec.shared_examples 'job is dropped' do ...@@ -23,16 +23,13 @@ RSpec.shared_examples 'job is dropped' do
job.reload job.reload
expect(job).to be_failed expect(job).to be_failed
expect(job).to be_data_integrity_failure expect(job.failure_reason).to eq('data_integrity_failure')
end end
end end
end end
RSpec.shared_examples 'job is unchanged' do RSpec.shared_examples 'job is unchanged' do
it 'does not change status' do it 'does not change status' do
service.execute expect { service.execute }.not_to change(job, :status)
job.reload
expect(job.status).to eq(status)
end end
end end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropScheduledWorker do
include ExclusiveLeaseHelpers
let(:worker_lease_key) { Ci::StuckBuilds::DropScheduledWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
let(:worker2) { described_class.new }
subject(:worker) { described_class.new }
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
end
describe '#perform' do
it_behaves_like 'an idempotent worker'
it 'executes an instance of Ci::StuckBuilds::DropScheduledService' do
expect_next_instance_of(Ci::StuckBuilds::DropScheduledService) do |service|
expect(service).to receive(:execute).exactly(:once)
end
worker.perform
end
context 'with an exclusive lease' do
it 'does not execute concurrently' do
expect(worker).to receive(:remove_lease).exactly(:once)
expect(worker2).not_to receive(:remove_lease)
worker.perform
stub_exclusive_lease_taken(worker_lease_key)
worker2.perform
end
it 'can execute in sequence' do
expect(worker).to receive(:remove_lease).at_least(:once)
expect(worker2).to receive(:remove_lease).at_least(:once)
worker.perform
worker2.perform
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
end
context 'when the DropScheduledService fails' do
it 'ensures cancellation of the exclusive lease' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
allow_next_instance_of(Ci::StuckBuilds::DropScheduledService) do |service|
expect(service).to receive(:execute) do
raise 'The query timed out'
end
end
expect { worker.perform }.to raise_error('The query timed out')
end
end
end
end
end
...@@ -22,6 +22,12 @@ RSpec.describe StuckCiJobsWorker do ...@@ -22,6 +22,12 @@ RSpec.describe StuckCiJobsWorker do
worker.perform worker.perform
end end
it 'enqueues a Ci::StuckBuilds::DropScheduledWorker job' do
expect(Ci::StuckBuilds::DropScheduledWorker).to receive(:perform_in).with(40.minutes).exactly(:once)
worker.perform
end
it 'executes an instance of Ci::StuckBuilds::DropService' do it 'executes an instance of Ci::StuckBuilds::DropService' do
expect_next_instance_of(Ci::StuckBuilds::DropService) do |service| expect_next_instance_of(Ci::StuckBuilds::DropService) do |service|
expect(service).to receive(:execute).exactly(:once) expect(service).to receive(:execute).exactly(:once)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment