Commit 102f676a authored by Etienne Baqué

Merge branch '330281-escalate-incidents' into 'master'

Process pending Incident Escalations

See merge request gitlab-org/gitlab!74337
parents 785da5e7 84f6a32d
......@@ -59,6 +59,7 @@ module Emails
def prometheus_alert_fired_email(project, user, alert)
@project = project
@alert = alert.present
@incident = alert.issue
add_project_headers
add_alert_headers
......@@ -80,11 +81,10 @@ module Emails
end
def add_incident_headers
incident = @alert.issue
return unless incident
return unless @incident
headers['X-GitLab-Incident-ID'] = incident.id
headers['X-GitLab-Incident-IID'] = incident.iid
headers['X-GitLab-Incident-ID'] = @incident.id
headers['X-GitLab-Incident-IID'] = @incident.iid
end
end
end
......
---
name: incident_escalations
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/74337
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/345769
milestone: '14.6'
type: development
group: group::monitor
default_enabled: false
......@@ -205,6 +205,8 @@
- 1
- - incident_management_pending_escalations_alert_create
- 1
- - incident_management_pending_escalations_issue_check
- 1
- - integrations_create_external_cross_reference
- 1
- - invalid_gpg_signature_update
......
......@@ -41,6 +41,19 @@ module EE
format.text { render layout: 'mailer' }
end
end
# Builds the email sent to a user when an incident escalation fires.
#
# project - project the incident belongs to (stored in @project)
# user    - recipient; the address comes from
#           user.notification_email_for(@project.group)
# issue   - the incident; its presenter is stored in @incident and its
#           escalation status in @escalation_status
#
# Sets the X-GitLab-NotificationReason header to "incident_<status_name>"
# between the project headers and the incident headers.
def incident_escalation_fired_email(project, user, issue)
  @project = project
  @incident = issue.present
  @escalation_status = issue.incident_management_issuable_escalation_status

  add_project_headers
  headers['X-GitLab-NotificationReason'] = "incident_#{@escalation_status.status_name}"
  add_incident_headers

  recipient = user.notification_email_for(@project.group)
  mail(to: recipient, subject: subject("Incident: #{@incident.title}"))
end
end
end
end
......@@ -27,6 +27,13 @@ module EE
supports_sla?
end
# Whether escalation policies can be used with this issuable.
#
# Returns false unless the :incident_escalations feature flag is enabled for
# the project AND ::Gitlab::IncidentManagement reports escalation policies as
# available for it; otherwise returns the result of supports_escalation?.
def escalation_policies_available?
  flag_enabled = ::Feature.enabled?(:incident_escalations, project)
  # Short-circuits: the availability check is only consulted when the flag is on.
  feature_usable = flag_enabled && ::Gitlab::IncidentManagement.escalation_policies_available?(project)
  return false unless feature_usable

  supports_escalation?
end
def metric_images_available?
return false unless IssuableMetricImage.available_for?(project)
......@@ -41,6 +48,10 @@ module EE
incident?
end
# Escalation is supported only for incidents: returns the result of incident?.
def supports_escalation?
incident?
end
def supports_iterations?
false
end
......
......@@ -28,6 +28,14 @@ module IncidentManagement
scope :processable, -> { where(process_at: ESCALATION_BUFFER.ago..Time.current) }
delegate :project, to: :target
# The object this pending escalation is evaluated against (the process
# service reads `resolved?` and `status` from it).
# Not implemented here: always raises NotImplementedError, so each concrete
# pending-escalation model has to provide its own version.
def escalatable
raise NotImplementedError
end
# Identifier for the kind of escalation (the visible implementations return
# :alert or :incident).
# Not implemented here: always raises NotImplementedError, so each concrete
# pending-escalation model has to provide its own version.
def type
raise NotImplementedError
end
end
end
end
......@@ -12,6 +12,14 @@ module IncidentManagement
belongs_to :alert, class_name: 'AlertManagement::Alert', foreign_key: 'alert_id', inverse_of: :pending_escalations
validates :rule_id, uniqueness: { scope: [:alert_id] }
# For alert escalations the escalatable object is the associated alert itself.
def escalatable
alert
end
# Escalation kind for this model; always the symbol :alert.
def type
:alert
end
end
end
end
......@@ -12,6 +12,14 @@ module IncidentManagement
belongs_to :issue, class_name: '::Issue', foreign_key: 'issue_id', inverse_of: :pending_escalations
validates :rule_id, uniqueness: { scope: [:issue_id] }
# For issue escalations the escalatable object is not the issue itself but
# its incident_management_issuable_escalation_status record.
def escalatable
issue.incident_management_issuable_escalation_status
end
# Escalation kind for this model; always the symbol :incident.
def type
:incident
end
end
end
end
......@@ -33,7 +33,7 @@ class IssuableMetricImage < ApplicationRecord
return file&.url unless file&.upload
# If we're using a CDN, we need to use the full URL
asset_host = ActionController::Base.asset_host
asset_host = ActionController::Base.asset_host || Gitlab.config.gitlab.base_url
local_path = Gitlab::Routing.url_helpers.issuable_metric_image_upload_path(
filename: file.filename,
id: file.upload.model_id,
......
......@@ -9,12 +9,12 @@ module EE
def execute(issue)
super
add_issue_sla(issue)
create_issue_sla(issue)
end
private
def add_issue_sla(issue)
def create_issue_sla(issue)
return unless issue.sla_available?
::IncidentManagement::Incidents::CreateSlaService.new(issue, current_user).execute
......
......@@ -77,6 +77,14 @@ module EE
end
end
# Notifies each of the given users about an incident by email.
#
# users - collection of users to notify
# issue - the incident; its project is passed to the mailer
#
# Tracks the :i_incident_management_oncall_notification_sent usage event with
# the users' ids first, then queues one incident_escalation_fired_email per
# user via deliver_later. Returns the result of users.each.
def notify_oncall_users_of_incident(users, issue)
  recipient_ids = users.map(&:id)
  track_usage_event(:i_incident_management_oncall_notification_sent, recipient_ids)

  users.each { |recipient| mailer.incident_escalation_fired_email(issue.project, recipient, issue).deliver_later }
end
def oncall_user_removed(rotation, user, async_notification = true)
oncall_user_removed_recipients(rotation, user).each do |recipient|
email = mailer.user_removed_from_rotation_email(user, rotation, [recipient])
......
......@@ -112,8 +112,8 @@ module EE
issuables_service(noteable, project, author).publish_issue_to_status_page
end
def notify_via_escalation(noteable, project, recipients, escalation_policy)
escalations_service(noteable, project).notify_via_escalation(recipients, escalation_policy: escalation_policy)
def notify_via_escalation(noteable, project, recipients, escalation_policy, type)
escalations_service(noteable, project).notify_via_escalation(recipients, escalation_policy: escalation_policy, type: type)
end
private
......
......@@ -9,14 +9,15 @@ module IncidentManagement
@escalation = escalation
@project = escalation.project
@rule = escalation.rule
@escalatable = escalation.escalatable
@target = escalation.target
end
def execute
return unless ::Gitlab::IncidentManagement.escalation_policies_available?(project)
return if too_early_to_process?
return if target_already_resolved?
return if target_status_exceeded_rule?
return if escalatable_already_resolved?
return if escalatable_status_exceeded_rule?
notify_recipients
create_system_notes
......@@ -25,16 +26,16 @@ module IncidentManagement
private
attr_reader :escalation, :project, :target, :rule
attr_reader :escalation, :project, :target, :rule, :escalatable
def target_already_resolved?
return false unless target.resolved?
def escalatable_already_resolved?
return false unless escalatable.resolved?
destroy_escalation!
end
def target_status_exceeded_rule?
target.status >= rule.status_before_type_cast
def escalatable_status_exceeded_rule?
escalatable.status >= rule.status_before_type_cast
end
def too_early_to_process?
......@@ -45,11 +46,11 @@ module IncidentManagement
NotificationService
.new
.async
.notify_oncall_users_of_alert(oncall_notification_recipients, target)
.send("notify_oncall_users_of_#{escalation.type}", oncall_notification_recipients, target) # rubocop: disable GitlabSecurity/PublicSend
end
def create_system_notes
SystemNoteService.notify_via_escalation(target, project, oncall_notification_recipients, rule.policy)
SystemNoteService.notify_via_escalation(target, project, oncall_notification_recipients, rule.policy, escalation.type)
end
def oncall_notification_recipients
......
......@@ -8,8 +8,8 @@ module SystemNotes
@author = User.alert_bot
end
def notify_via_escalation(recipients, escalation_policy:)
body = "notified #{recipients.map(&:to_reference).to_sentence} of this alert via escalation policy **#{escalation_policy.name}**"
def notify_via_escalation(recipients, escalation_policy:, type:)
body = "notified #{recipients.map(&:to_reference).to_sentence} of this #{type} via escalation policy **#{escalation_policy.name}**"
create_note(NoteSummary.new(noteable, project, author, body, action: 'new_alert_added'))
end
......
- body = @escalation_status.resolved? ? _('An incident has been resolved in %{project_path}.') : _('An incident has been triggered in %{project_path}.')
%p
= body % { project_path: @incident.project.full_path }
%p
= link_to(_('View incident details.'), @incident.web_url)
%p
= _('Title:')
= @incident.title
- if @incident.description
%p
= _('Description:')
= markdown(@incident.description, pipeline: :email, author: @incident.author)
- if @escalation_status.policy
%p
= _('Escalation policy:')
= @escalation_status.policy.name
- if @incident.metric_images.any?
%p
= _('Metrics:')
- @incident.metric_images.each do |image|
= link_to image.filename, image.file_path
<% body = @escalation_status.resolved? ? _('An incident has been resolved in %{project_path}.') : _('An incident has been triggered in %{project_path}.') %>
<%= body % { project_path: @incident.project.full_path } %>
<%= _('View incident details at') %> <%= @incident.web_url %>
<%= _('Title:') %> <%= @incident.title %>
<% if @incident.description %>
<%= _('Description:') %> <%= @incident.description %>
<% end %>
<% if @escalation_status.policy %>
<%= _('Escalation policy:') %> <%= @escalation_status.policy.name %>
<% end %>
<% if @incident.metric_images.any? %>
<%= _('Metrics:') %>
<% @incident.metric_images.each do |image| %>
<%= image.file_path %>
<% end %>
<% end %>
......@@ -1110,6 +1110,15 @@
:weight: 1
:idempotent: true
:tags: []
- :name: incident_management_pending_escalations_issue_check
:worker_name: IncidentManagement::PendingEscalations::IssueCheckWorker
:feature_category: :incident_management
:has_external_dependencies:
:urgency: :high
:resource_boundary: :cpu
:weight: 1
:idempotent: true
:tags: []
- :name: ldap_group_sync
:worker_name: LdapGroupSyncWorker
:feature_category: :authentication_and_authorization
......
# frozen_string_literal: true
module IncidentManagement
  module PendingEscalations
    # Background worker that processes a single pending issue (incident)
    # escalation by handing it to PendingEscalations::ProcessService.
    class IssueCheckWorker
      include ApplicationWorker

      data_consistency :always
      worker_resource_boundary :cpu

      urgency :high

      idempotent!
      feature_category :incident_management

      # escalation_id - id of an IncidentManagement::PendingEscalations::Issue
      #
      # Does nothing (returns nil) when no record with that id exists.
      def perform(escalation_id)
        pending_escalation = IncidentManagement::PendingEscalations::Issue.find_by_id(escalation_id)

        IncidentManagement::PendingEscalations::ProcessService.new(pending_escalation).execute if pending_escalation
      end
    end
  end
end
......@@ -15,7 +15,12 @@ module IncidentManagement
def perform
::IncidentManagement::PendingEscalations::Alert.processable.each_batch do |relation|
args = relation.pluck(:id).map { |id| [id] } # rubocop:disable CodeReuse/ActiveRecord
::IncidentManagement::PendingEscalations::AlertCheckWorker.bulk_perform_async(args) # rubocop:disable Scalability/BulkPerformWithContext
::IncidentManagement::PendingEscalations::AlertCheckWorker.bulk_perform_async(args) # rubocop:disable Scalability/BulkPerformWithContext
end
::IncidentManagement::PendingEscalations::Issue.processable.each_batch do |relation|
args = relation.pluck(:id).map { |id| [id] } # rubocop:disable CodeReuse/ActiveRecord
::IncidentManagement::PendingEscalations::IssueCheckWorker.bulk_perform_async(args) # rubocop:disable Scalability/BulkPerformWithContext
end
end
end
......
......@@ -28,4 +28,58 @@ RSpec.describe Emails::Projects do
is_expected.to have_body_text("It is recommended that you reach out to the current on-call responder to ensure continuity of on-call coverage")
end
end
describe '#incident_escalation_fired_email' do
let_it_be(:project) { create(:project) }
let_it_be(:user) { create(:user) }
let!(:incident) { create(:issue, :incident, project: project) }
let!(:escalation_status) { create(:incident_management_issuable_escalation_status, issue: incident) }
subject do
Notify.incident_escalation_fired_email(project, user, incident)
end
include_context 'gitlab email notification'
it_behaves_like 'an email with X-GitLab headers containing project details'
# This example covers the incident escalation email, so the headers under
# test are the notification reason and the incident id/iid headers.
it 'has expected X-GitLab incident headers', :aggregate_failures do
  is_expected.to have_header('X-GitLab-NotificationReason', "incident_#{escalation_status.status_name}")
  is_expected.to have_header('X-GitLab-Incident-ID', /#{incident.id}/)
  is_expected.to have_header('X-GitLab-Incident-IID', /#{incident.iid}/)
end
it_behaves_like 'an email sent from GitLab'
it_behaves_like 'it should not have Gmail Actions links'
it_behaves_like 'a user cannot unsubscribe through footer link'
it 'has expected subject' do
is_expected.to have_subject("#{project.name} | Incident: #{incident.title}")
end
it 'has expected content' do
is_expected.to have_body_text('Title:')
is_expected.to have_body_text(incident.title)
end
context 'with description' do
let!(:incident) { create(:issue, :incident, project: project, description: 'some descripition') }
it 'has expected content' do
is_expected.to have_body_text('Description:')
is_expected.to have_body_text('some descripition')
end
end
context 'with escalation status policy' do
let!(:policy) { create(:incident_management_escalation_policy, project: project) }
let!(:escalation_status) { create(:incident_management_issuable_escalation_status, issue: incident, policy: policy, escalations_started_at: Time.current) }
it 'has expected content' do
is_expected.to have_body_text('Escalation policy:')
is_expected.to have_body_text(policy.name)
end
end
end
end
......@@ -108,4 +108,31 @@ RSpec.describe EE::Issuable do
it { is_expected.to eq(supports_iterations) }
end
end
describe '#escalation_policies_available?' do
where(:issuable_type, :incident_escalations_enabled, :oncall_schedules_enabled, :escalation_policies_enabled, :available) do
[
[:issue, true, true, true, false],
[:incident, false, false, false, false],
[:incident, false, true, true, false],
[:incident, true, false, false, false],
[:incident, true, true, false, false],
[:incident, true, false, true, false],
[:incident, true, true, true, true]
]
end
with_them do
let(:issuable) { build_stubbed(issuable_type) }
before do
stub_feature_flags(incident_escalations: incident_escalations_enabled)
stub_licensed_features(oncall_schedules: oncall_schedules_enabled, escalation_policies: escalation_policies_enabled)
end
subject { issuable.escalation_policies_available? }
it { is_expected.to eq(available) }
end
end
end
......@@ -60,4 +60,27 @@ RSpec.describe IssuableMetricImage do
it { is_expected.to eq(false) }
end
end
describe '#file_path' do
let(:issuable_metric_image) { create(:issuable_metric_image) }
let(:expected_path) { issuable_metric_image.file.url }
subject(:file_path) { issuable_metric_image.file_path }
context 'with asset host configured' do
it 'returns a full URL with the asset host and system path' do
asset_host = 'https://gitlab-assets.example.com'
allow(ActionController::Base).to receive(:asset_host) { asset_host }
expect(file_path).to eq("#{asset_host}#{expected_path}")
end
end
context 'no asset path configured' do
  it 'returns a full URL with the base url and system path' do
    # Pin asset_host to nil so the base_url fallback is exercised no matter
    # how the environment running the spec is configured.
    allow(ActionController::Base).to receive(:asset_host).and_return(nil)

    base_url = Gitlab.config.gitlab.base_url

    expect(file_path).to eq("#{base_url}#{expected_path}")
  end
end
end
end
......@@ -10,12 +10,7 @@ RSpec.describe IncidentManagement::PendingEscalations::ProcessService do
let(:escalation_rule) { build(:incident_management_escalation_rule, oncall_schedule: schedule_1) }
let!(:escalation_policy) { create(:incident_management_escalation_policy, project: project, rules: [escalation_rule]) }
let(:alert) { create(:alert_management_alert, project: project, **alert_params) }
let(:alert_params) { { status: ::IncidentManagement::Escalatable::STATUSES[:triggered] } }
let(:target) { alert }
let(:process_at) { 5.minutes.ago }
let(:escalation) { create(:incident_management_pending_alert_escalation, rule: escalation_rule, alert: target, process_at: process_at) }
let(:service) { described_class.new(escalation) }
......@@ -42,47 +37,102 @@ RSpec.describe IncidentManagement::PendingEscalations::ProcessService do
end
end
context 'all conditions are met' do
let(:users) { schedule_1_users }
it_behaves_like 'sends on-call notification'
it_behaves_like 'deletes the escalation'
it 'creates a system note' do
shared_examples 'creates a system note' do
specify do
expect(SystemNoteService)
.to receive(:notify_via_escalation).with(alert, project, [a_kind_of(User)], escalation_policy)
.to receive(:notify_via_escalation).with(target, project, [a_kind_of(User)], escalation_policy, escalation.type)
.and_call_original
expect { execute }.to change(Note, :count).by(1)
end
end
shared_examples 'sends an on-call notification email' do
let(:notification_async) { double(NotificationService::Async) }
specify do
allow(NotificationService).to receive_message_chain(:new, :async).and_return(notification_async)
expect(notification_async).to receive(notification_action).with(
users,
target
)
subject
end
end
shared_examples 'escalates correctly when all conditions are met' do
let(:users) { schedule_1_users }
it_behaves_like 'sends an on-call notification email'
it_behaves_like 'deletes the escalation'
it_behaves_like 'creates a system note'
context 'when escalation rule is for a user' do
let(:escalation_rule) { build(:incident_management_escalation_rule, :with_user) }
let(:users) { [escalation_rule.user] }
it_behaves_like 'sends on-call notification'
it_behaves_like 'sends an on-call notification email'
it_behaves_like 'deletes the escalation'
end
end
context 'target is already resolved' do
let(:target) { create(:alert_management_alert, :resolved, project: project) }
shared_examples 'does not escalate if escalation is not ready to be processed' do
context 'does not escalate if escalation is not ready to be processed' do
let(:process_at) { 5.minutes.from_now }
it_behaves_like 'does not send on-call notification'
it_behaves_like 'deletes the escalation'
it_behaves_like 'it does not escalate'
end
end
context 'target status is not above threshold' do
let(:target) { create(:alert_management_alert, :acknowledged, project: project) }
context 'alert escalation' do
let(:alert) { create(:alert_management_alert, project: project, **alert_params) }
let(:alert_params) { { status: ::IncidentManagement::Escalatable::STATUSES[:triggered] } }
let(:target) { alert }
let(:escalation) { create(:incident_management_pending_alert_escalation, rule: escalation_rule, alert: target, process_at: process_at) }
let(:notification_action) { :notify_oncall_users_of_alert }
include_examples 'escalates correctly when all conditions are met'
include_examples 'does not escalate if escalation is not ready to be processed'
context 'target is already resolved' do
let(:target) { create(:alert_management_alert, :resolved, project: project) }
it_behaves_like 'does not send on-call notification'
it_behaves_like 'deletes the escalation'
end
context 'target status is not above threshold' do
let(:target) { create(:alert_management_alert, :acknowledged, project: project) }
it_behaves_like 'it does not escalate'
it_behaves_like 'it does not escalate'
end
end
context 'escalation is not ready to be processed' do
let(:process_at) { 5.minutes.from_now }
context 'issue escalation' do
let(:issue) { create(:issue, :incident, project: project) }
let!(:issue_escalation_status) { create(:incident_management_issuable_escalation_status, issue: target) }
let(:target) { issue }
let(:escalation) { create(:incident_management_pending_issue_escalation, rule: escalation_rule, issue: target, process_at: process_at) }
let(:notification_action) { :notify_oncall_users_of_incident }
include_examples 'escalates correctly when all conditions are met'
include_examples 'does not escalate if escalation is not ready to be processed'
it_behaves_like 'it does not escalate'
context 'target escalation status is resolved' do
before do
target.incident_management_issuable_escalation_status.resolve!
end
it_behaves_like 'does not send on-call notification'
it_behaves_like 'deletes the escalation'
end
context 'target status is not above threshold' do
let!(:issue_escalation_status) { create(:incident_management_issuable_escalation_status, :acknowledged, issue: issue) }
it_behaves_like 'it does not escalate'
end
end
end
end
......@@ -9,17 +9,18 @@ RSpec.describe SystemNotes::EscalationsService do
let_it_be(:author) { User.alert_bot }
describe '#notify_via_escalation' do
subject { described_class.new(noteable: noteable, project: project).notify_via_escalation([user, user_2], escalation_policy: escalation_policy) }
subject { described_class.new(noteable: noteable, project: project).notify_via_escalation([user, user_2], escalation_policy: escalation_policy, type: type) }
let_it_be(:escalation_policy) { create(:incident_management_escalation_policy, project: project) }
let_it_be(:noteable) { create(:alert_management_alert, project: project) }
let_it_be(:type) { :alert }
it_behaves_like 'a system note' do
let(:action) { 'new_alert_added' }
end
it 'posts the correct text to the system note' do
expect(subject.note).to match("notified #{user.to_reference} and #{user_2.to_reference} of this alert via escalation policy **#{escalation_policy.name}**")
expect(subject.note).to match("notified #{user.to_reference} and #{user_2.to_reference} of this #{type} via escalation policy **#{escalation_policy.name}**")
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Covers IssueCheckWorker#perform: an existing pending issue escalation is
# handed to ProcessService, and an unknown id is ignored without error.
RSpec.describe IncidentManagement::PendingEscalations::IssueCheckWorker do
  let(:worker) { described_class.new }

  let_it_be(:escalation) { create(:incident_management_pending_issue_escalation) }

  describe '#perform' do
    subject { worker.perform(*args) }

    context 'with valid escalation' do
      # The id is passed as a string to show the worker accepts that form.
      let(:args) { [escalation.id.to_s] }

      it 'processes the escalation' do
        expect_next_instance_of(IncidentManagement::PendingEscalations::ProcessService, escalation) do |service|
          expect(service).to receive(:execute)
        end

        subject
      end
    end

    context 'without valid escalation' do
      let(:args) { [non_existing_record_id] }

      it 'does nothing' do
        # The worker only ever instantiates ProcessService, so that is the
        # collaborator that must stay untouched when the record is missing.
        expect(IncidentManagement::PendingEscalations::ProcessService).not_to receive(:new)
        expect { subject }.not_to raise_error
      end
    end
  end
end
......@@ -5,8 +5,10 @@ require 'spec_helper'
RSpec.describe IncidentManagement::PendingEscalations::ScheduleCheckCronWorker do
let(:worker) { described_class.new }
let_it_be(:escalation_1) { create(:incident_management_pending_alert_escalation, process_at: 5.minutes.ago) }
let_it_be(:escalation_2) { create(:incident_management_pending_alert_escalation, process_at: 2.days.ago) }
let_it_be(:alert_escalation_1) { create(:incident_management_pending_alert_escalation, process_at: 5.minutes.ago) }
let_it_be(:alert_escalation_2) { create(:incident_management_pending_alert_escalation, process_at: 2.days.ago) }
let_it_be(:issue_escalation_1) { create(:incident_management_pending_issue_escalation, process_at: 2.days.ago) }
let_it_be(:issue_escalation_2) { create(:incident_management_pending_issue_escalation, process_at: 2.days.ago) }
let_it_be(:escalation_not_ready_to_process) { create(:incident_management_pending_alert_escalation) }
describe '#perform' do
......@@ -14,7 +16,10 @@ RSpec.describe IncidentManagement::PendingEscalations::ScheduleCheckCronWorker d
it 'schedules a job for each processable escalation' do
expect(IncidentManagement::PendingEscalations::AlertCheckWorker).to receive(:bulk_perform_async)
.with(array_including([escalation_2.id], [escalation_1.id]))
.with(array_including([alert_escalation_2.id], [alert_escalation_1.id]))
expect(IncidentManagement::PendingEscalations::IssueCheckWorker).to receive(:bulk_perform_async)
.with(array_including([issue_escalation_2.id], [issue_escalation_1.id]))
subject
end
......
......@@ -3991,6 +3991,12 @@ msgstr ""
msgid "An example showing how to use Jsonnet with GitLab dynamic child pipelines"
msgstr ""
msgid "An incident has been resolved in %{project_path}."
msgstr ""
msgid "An incident has been triggered in %{project_path}."
msgstr ""
msgid "An integer value is required for seconds"
msgstr ""
......@@ -13930,6 +13936,9 @@ msgstr ""
msgid "Escalation policies must have at least one rule"
msgstr ""
msgid "Escalation policy:"
msgstr ""
msgid "EscalationPolicies|%{clockIcon} IF alert is not %{alertStatus} in %{minutes}"
msgstr ""
......@@ -22246,6 +22255,9 @@ msgstr ""
msgid "Metrics and profiling"
msgstr ""
msgid "Metrics:"
msgstr ""
msgid "MetricsDashboardAnnotation|Annotation can't belong to both a cluster and an environment at the same time"
msgstr ""
......@@ -38616,6 +38628,12 @@ msgstr ""
msgid "View group labels"
msgstr ""
msgid "View incident details at"
msgstr ""
msgid "View incident details."
msgstr ""
msgid "View incident issues."
msgstr ""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment