Commit 5bdca904 authored by Furkan Ayhan

Add metrics to track failure reasons of pipelines and jobs

This will help us spot dramatic changes in failure reasons
when we deploy new features or bug fixes.
parent c821e3a3
......@@ -286,9 +286,11 @@ module Ci
end
after_transition any => [:failed] do |pipeline|
next unless pipeline.auto_devops_source?
pipeline.run_after_commit do
::Gitlab::Ci::Pipeline::Metrics.pipeline_failure_reason_counter.increment(reason: pipeline.failure_reason)
pipeline.run_after_commit { AutoDevops::DisableWorker.perform_async(pipeline.id) }
AutoDevops::DisableWorker.perform_async(pipeline.id) if pipeline.auto_devops_source?
end
end
end
......
......@@ -179,6 +179,12 @@ class CommitStatus < ApplicationRecord
ExpireJobCacheWorker.perform_async(id)
end
end
# After any transition into :failed, count the status' failure reason.
# The increment is scheduled through run_after_commit (presumably so it only
# happens once the surrounding transaction has committed -- confirm against
# the run_after_commit implementation).
after_transition any => :failed do |commit_status|
  commit_status.run_after_commit do
    failure_counter = ::Gitlab::Ci::Pipeline::Metrics.job_failure_reason_counter
    failure_counter.increment(reason: commit_status.failure_reason)
  end
end
end
def self.names
......
......@@ -19,7 +19,7 @@ module Ci
end
def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end
private
......
......@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
project: project,
current_user: user,
save_incompleted: false,
pipeline_seed: double(:seed, size: 2))
pipeline_seed: double(:seed, size: 2),
increment_pipeline_failure_reason_counter: true)
end
it 'does not drop the pipeline' do
......@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
expect(step.break?).to be true
end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:size_limit_exceeded)
subject
end
end
end
......
......@@ -84,7 +84,7 @@ module Gitlab
end
def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end
def observe_creation_duration(duration)
......@@ -97,6 +97,11 @@ module Gitlab
.observe({ source: pipeline.source.to_s }, pipeline.total_size)
end
# Increments the pipeline failure reason counter for +reason+.
#
# reason - the failure reason; a nil (or false) value is counted under
#          "unknown_failure". The label is always passed as a String.
def increment_pipeline_failure_reason_counter(reason)
  failure_counter = metrics.pipeline_failure_reason_counter
  reason_label = (reason || :unknown_failure).to_s

  failure_counter.increment(reason: reason_label)
end
# Returns true when +source+ is :ondemand_dast_scan or :webide,
# false for any other value.
def dangling_build?
  dangling_sources = %i[ondemand_dast_scan webide]
  dangling_sources.include?(source)
end
......
......@@ -13,16 +13,7 @@ module Gitlab
pipeline.add_error_message(message)
if drop_reason && persist_pipeline?
if Feature.enabled?(:ci_pipeline_ensure_iid_on_drop, pipeline.project, default_enabled: :yaml)
# Project iid must be called outside a transaction, so we ensure it is set here
# otherwise it may be set within the state transition transaction of the drop! call
# which it will lock the InternalId row for the whole transaction
pipeline.ensure_project_iid!
end
pipeline.drop!(drop_reason)
end
drop_pipeline!(drop_reason)
# TODO: consider not to rely on AR errors directly as they can be
# polluted with other unrelated errors (e.g. state machine)
......@@ -34,8 +25,23 @@ module Gitlab
pipeline.add_warning_message(message)
end
def persist_pipeline?
command.save_incompleted && !pipeline.readonly?
private
# Fails the pipeline with +drop_reason+, or only records the failure reason
# when the pipeline is not going to be dropped.
#
# - Does nothing for a read-only pipeline.
# - With a reason and command.save_incompleted set, calls
#   pipeline.drop!(drop_reason).
# - Otherwise only increments the failure reason counter via the command
#   (drop_reason may be nil here).
def drop_pipeline!(drop_reason)
  return if pipeline.readonly?

  unless drop_reason && command.save_incompleted
    return command.increment_pipeline_failure_reason_counter(drop_reason)
  end

  # Project iid must be called outside a transaction, so we ensure it is set here
  # otherwise it may be set within the state transition transaction of the drop! call
  # which it will lock the InternalId row for the whole transaction
  ensure_iid = Feature.enabled?(:ci_pipeline_ensure_iid_on_drop, pipeline.project, default_enabled: :yaml)
  pipeline.ensure_project_iid! if ensure_iid

  pipeline.drop!(drop_reason)
end
end
end
......
......@@ -14,7 +14,7 @@ module Gitlab
end
def counter
::Gitlab::Ci::Pipeline::Metrics.new.pipelines_created_counter
::Gitlab::Ci::Pipeline::Metrics.pipelines_created_counter
end
end
end
......
......@@ -4,55 +4,57 @@ module Gitlab
module Ci
module Pipeline
class Metrics
include Gitlab::Utils::StrongMemoize
def self.pipeline_creation_duration_histogram
name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration'
labels = {}
buckets = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 20.0, 50.0, 240.0]
def pipeline_creation_duration_histogram
strong_memoize(:pipeline_creation_duration_histogram) do
name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration'
labels = {}
buckets = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 20.0, 50.0, 240.0]
::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
# Returns the histogram that ::Gitlab::Metrics registers under
# :gitlab_ci_pipeline_size_builds, declared with a +source+ label and
# bucket boundaries ranging from 0 to 1000.
def self.pipeline_size_histogram
  ::Gitlab::Metrics.histogram(
    :gitlab_ci_pipeline_size_builds,
    'Pipeline size',
    { source: nil },
    [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
  )
end
def self.pipeline_processing_events_counter
name = :gitlab_ci_pipeline_processing_events_total
comment = 'Total amount of pipeline processing events'
::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
Gitlab::Metrics.counter(name, comment)
end
def pipeline_size_histogram
strong_memoize(:pipeline_size_histogram) do
name = :gitlab_ci_pipeline_size_builds
comment = 'Pipeline size'
labels = { source: nil }
buckets = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
def self.pipelines_created_counter
name = :pipelines_created_total
comment = 'Counter of pipelines created'
::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
Gitlab::Metrics.counter(name, comment)
end
def pipeline_processing_events_counter
strong_memoize(:pipeline_processing_events_counter) do
name = :gitlab_ci_pipeline_processing_events_total
comment = 'Total amount of pipeline processing events'
# Returns the counter that Gitlab::Metrics registers under
# :ci_legacy_update_jobs_as_retried_total.
def self.legacy_update_jobs_counter
  Gitlab::Metrics.counter(
    :ci_legacy_update_jobs_as_retried_total,
    'Counter of occurrences when jobs were not being set as retried before update_retried'
  )
end
Gitlab::Metrics.counter(name, comment)
end
def pipelines_created_counter
strong_memoize(:pipelines_created_count) do
name = :pipelines_created_total
comment = 'Counter of pipelines created'
# Returns the counter that Gitlab::Metrics registers under
# :gitlab_ci_pipeline_failure_reasons.
def self.pipeline_failure_reason_counter
  Gitlab::Metrics.counter(
    :gitlab_ci_pipeline_failure_reasons,
    'Counter of pipeline failure reasons'
  )
end
Gitlab::Metrics.counter(name, comment)
end
def legacy_update_jobs_counter
strong_memoize(:legacy_update_jobs_counter) do
name = :ci_legacy_update_jobs_as_retried_total
comment = 'Counter of occurrences when jobs were not being set as retried before update_retried'
# Returns the counter that Gitlab::Metrics registers under
# :gitlab_ci_job_failure_reasons.
def self.job_failure_reason_counter
  Gitlab::Metrics.counter(
    :gitlab_ci_job_failure_reasons,
    'Counter of job failure reasons'
  )
end
Gitlab::Metrics.counter(name, comment)
end
end
end
......
......@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do
it { is_expected.to be_falsey }
end
end
# Checks that the command increments the pipeline failure reason counter under
# the given reason, and under 'unknown_failure' when the reason is nil.
describe '#increment_pipeline_failure_reason_counter' do
  let(:command) { described_class.new }
  let(:reason) { :size_limit_exceeded }
  # Looks the counter up by name; the 'desc' text is just a placeholder description.
  let(:failure_counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }

  subject(:increment_counter) { command.increment_pipeline_failure_reason_counter(reason) }

  it 'increments the error metric' do
    expect { increment_counter }.to change { failure_counter.get(reason: reason.to_s) }.by(1)
  end

  context 'when the reason is nil' do
    let(:reason) { nil }

    it 'increments the error metric with unknown_failure' do
      expect { increment_counter }.to change { failure_counter.get(reason: 'unknown_failure') }.by(1)
    end
  end
end
end
......@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
let(:save_incompleted) { false }
let(:command) do
double(:command,
Gitlab::Ci::Pipeline::Chain::Command.new(
project: project,
pipeline_seed: pipeline_seed,
save_incompleted: save_incompleted
......@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.deployments_limit_exceeded?).to be true
end
it 'calls increment_pipeline_failure_reason_counter' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { perform }.to change { counter.get(reason: 'deployments_limit_exceeded') }.by(1)
end
end
context 'when not saving incomplete pipelines' do
......@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.'])
end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:deployments_limit_exceeded)
perform
end
end
it 'logs the error' do
......
......@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do
it 'wastes pipeline iid' do
expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0
end
it 'increments the error metric' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { run_chain }.to change { counter.get(reason: 'unknown_failure') }.by(1)
end
end
describe 'pipeline protect' do
......
......@@ -3902,6 +3902,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do
pipeline.drop
end
end
context 'with failure_reason' do
let(:pipeline) { create(:ci_pipeline, :running) }
let(:failure_reason) { 'config_error' }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }
it 'increments the counter with the failure_reason' do
expect { pipeline.drop!(failure_reason) }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
end
end
......
......@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do
end
end
describe 'set failure_reason when drop' do
describe '#drop' do
let(:commit_status) { create(:commit_status, :created) }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_job_failure_reasons, 'desc') }
let(:failure_reason) { reason.to_s }
subject do
commit_status.drop!(reason)
commit_status
end
shared_examples 'incrementing failure reason counter' do
it 'increments the counter with the failure_reason' do
expect { subject }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
context 'when failure_reason is nil' do
let(:reason) { }
let(:failure_reason) { 'unknown_failure' }
it { is_expected.to be_unknown_failure }
it_behaves_like 'incrementing failure reason counter'
end
context 'when failure_reason is script_failure' do
let(:reason) { :script_failure }
it { is_expected.to be_script_failure }
it_behaves_like 'incrementing failure reason counter'
end
context 'when failure_reason is unmet_prerequisites' do
let(:reason) { :unmet_prerequisites }
it { is_expected.to be_unmet_prerequisites }
it_behaves_like 'incrementing failure reason counter'
end
end
......
......@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do
end
it 'increments the prometheus counter' do
expect(Gitlab::Metrics).to receive(:counter)
.with(:pipelines_created_total, "Counter of pipelines created")
.and_call_original
allow(Gitlab::Metrics).to receive(:counter).and_call_original # allow other counters
counter = spy('pipeline created counter')
allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:pipelines_created_counter).and_return(counter)
pipeline
expect(counter).to have_received(:increment)
end
it 'records pipeline size in a prometheus histogram' do
histogram = spy('pipeline size histogram')
allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:new).and_return(histogram)
.to receive(:pipeline_size_histogram).and_return(histogram)
execute_service
......@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do
it_behaves_like 'a failed pipeline'
it 'increments the error metric' do
stub_ci_pipeline_yaml_file(ci_yaml)
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { execute_service }.to change { counter.get(reason: 'config_error') }.by(1)
end
context 'when receive git commit' do
before do
allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message }
......
......@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do
create(:ci_empty_pipeline, ref: 'master', project: project)
end
let(:pipeline_processing_events_counter) { double(increment: true) }
let(:legacy_update_jobs_counter) { double(increment: true) }
let(:metrics) do
double(pipeline_processing_events_counter: pipeline_processing_events_counter,
legacy_update_jobs_counter: legacy_update_jobs_counter)
end
subject { described_class.new(pipeline) }
before do
......@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do
stub_not_protect_default_branch
project.add_developer(user)
allow(subject).to receive(:metrics).and_return(metrics)
end
describe 'processing events counter' do
let(:metrics) { double('pipeline metrics') }
let(:counter) { double('events counter') }
before do
allow(subject)
.to receive(:metrics).and_return(metrics)
allow(metrics)
.to receive(:pipeline_processing_events_counter)
.and_return(counter)
end
it 'increments processing events counter' do
expect(counter).to receive(:increment)
expect(pipeline_processing_events_counter).to receive(:increment)
subject.execute
end
......@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do
expect(all_builds.retried).to contain_exactly(build_retried)
end
context 'counter ci_legacy_update_jobs_as_retried_total' do
let(:counter) { double(increment: true) }
it 'increments the counter' do
expect(legacy_update_jobs_counter).to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do
allow(Gitlab::Metrics).to receive(:counter).and_call_original
allow(Gitlab::Metrics).to receive(:counter)
.with(:ci_legacy_update_jobs_as_retried_total, anything)
.and_return(counter)
build_retried.update_columns(retried: true)
end
it 'increments the counter' do
expect(counter).to receive(:increment)
it 'does not increment the counter' do
expect(legacy_update_jobs_counter).not_to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do
build_retried.update_columns(retried: true)
end
it 'does not increment the counter' do
expect(counter).not_to receive(:increment)
subject.execute
end
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment