Commit 6d6e8e42 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch '291010-stuckcijobsworker-always-times-out' into 'master'

Optimize StuckCiJobsWorker Queries because they timeout in production

See merge request gitlab-org/gitlab!62239
parents 712f99a7 8ab6cefe
......@@ -57,6 +57,9 @@ class CommitStatus < ApplicationRecord
scope :in_pipelines, ->(pipelines) { where(pipeline: pipelines) }
scope :eager_load_pipeline, -> { eager_load(:pipeline, project: { namespace: :route }) }
scope :with_pipeline, -> { joins(:pipeline) }
scope :updated_before, ->(lookback:, timeout:) {
where('(ci_builds.created_at BETWEEN ? AND ?) AND (ci_builds.updated_at BETWEEN ? AND ?)', lookback, timeout, lookback, timeout)
}
scope :for_project_paths, -> (paths) do
where(project: Project.where_full_path_in(Array(paths)))
......
......@@ -15,22 +15,46 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def perform
return unless try_obtain_lease
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure
drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure
drop :scheduled, BUILD_SCHEDULED_OUTDATED_TIMEOUT, 'scheduled_at IS NOT NULL AND scheduled_at < ?', :stale_schedule
drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
remove_lease
end
private
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end
......@@ -39,28 +63,27 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end
def drop(status, timeout, condition, reason)
search(status, timeout, condition) do |build|
drop_build :outdated, build, status, timeout, reason
def drop(builds, failure_reason:)
fetch(builds) do |build|
drop_build :outdated, build, failure_reason
end
end
def drop_stuck(status, timeout, condition, reason)
search(status, timeout, condition) do |build|
def drop_stuck(builds, failure_reason:)
fetch(builds) do |build|
break unless build.stuck?
drop_build :stuck, build, status, timeout, reason
drop_build :stuck, build, failure_reason
end
end
# rubocop: disable CodeReuse/ActiveRecord
def search(status, timeout, condition)
def fetch(builds)
loop do
jobs = Ci::Build.where(status: status)
.where(condition, timeout.ago)
.includes(:tags, :runner, project: [:namespace, :route])
jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.limit(100)
.to_a
break if jobs.empty?
jobs.each do |job|
......@@ -70,8 +93,8 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
end
# rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, status, timeout, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"
def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
......
......@@ -79,6 +79,32 @@ RSpec.describe CommitStatus do
end
end
describe '.updated_before' do
let!(:lookback) { 5.days.ago }
let!(:timeout) { 1.day.ago }
let!(:before_lookback) { lookback - 1.hour }
let!(:after_lookback) { lookback + 1.hour }
let!(:before_timeout) { timeout - 1.hour }
let!(:after_timeout) { timeout + 1.hour }
subject { described_class.updated_before(lookback: lookback, timeout: timeout) }
def create_build_with_set_timestamps(created_at:, updated_at:)
travel_to(created_at) { create(:ci_build, created_at: Time.current) }.tap do |build|
travel_to(updated_at) { build.update!(status: :failed) }
end
end
it 'finds builds updated and created in the window between lookback and timeout' do
build_in_lookback_timeout_window = create_build_with_set_timestamps(created_at: after_lookback, updated_at: before_timeout)
build_outside_lookback_window = create_build_with_set_timestamps(created_at: before_lookback, updated_at: before_timeout)
build_outside_timeout_window = create_build_with_set_timestamps(created_at: after_lookback, updated_at: after_timeout)
expect(subject).to contain_exactly(build_in_lookback_timeout_window)
expect(subject).not_to include(build_outside_lookback_window, build_outside_timeout_window)
end
end
describe '#processed' do
subject { commit_status.processed }
......
......@@ -9,12 +9,17 @@ RSpec.describe StuckCiJobsWorker do
let!(:job) { create :ci_build, runner: runner }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
let(:created_at) { }
let(:updated_at) { }
subject(:worker) { described_class.new }
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
job.update!(status: status, updated_at: updated_at)
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end
shared_examples 'job is dropped' do
......@@ -63,49 +68,128 @@ RSpec.describe StuckCiJobsWorker do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false)
end
context 'when job was not updated for more than 1 day ago' do
let(:updated_at) { 2.days.ago }
context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 1.5.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 day ago' do
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was not updated for more than 1 hour ago' do
context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is stuck' do
before do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true)
end
context 'when job was not updated for more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1
hour ago' do
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was not updated for more than 1 hour ago' do
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
......@@ -123,9 +207,25 @@ RSpec.describe StuckCiJobsWorker do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'for deleted project' do
let(:status) { 'running' }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment