Commit 60ecd173 authored by Stan Hu

Merge branch '13401-close-gitlab-issue-on-recovery-alerts-from-prometheus' into 'master'

Resolve "Close GitLab issue on Recovery alerts from Prometheus"

Closes #13401

See merge request gitlab-org/gitlab!18431
parents dbff1452 4dfa692e
---
title: Close issues on Prometheus alert recovery
merge_request: 18431
author:
type: added
# frozen_string_literal: true
class AddSelfManagedPrometheusAlerts < ActiveRecord::Migration[5.2]
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
def change
......
......@@ -8,6 +8,7 @@ module EE
prepended do
has_many :prometheus_alerts, inverse_of: :environment
has_many :self_managed_prometheus_alert_events, inverse_of: :environment
has_one :last_deployable, through: :last_deployment, source: 'deployable', source_type: 'CommitStatus'
has_one :last_pipeline, through: :last_deployable, source: 'pipeline'
......
......@@ -16,6 +16,7 @@ module EE
scope :order_weight_desc, -> { reorder ::Gitlab::Database.nulls_last_order('weight', 'DESC') }
scope :order_weight_asc, -> { reorder ::Gitlab::Database.nulls_last_order('weight') }
scope :order_created_at_desc, -> { reorder(created_at: :desc) }
scope :service_desk, -> { where(author: ::User.support_bot) }
has_one :epic_issue
......
......@@ -8,6 +8,7 @@ class PrometheusAlertEvent < ApplicationRecord
has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_prometheus_alert_events
validates :payload_key, uniqueness: { scope: :prometheus_alert_id }
validates :started_at, presence: true
delegate :title, :prometheus_metric_id, to: :prometheus_alert
......
......@@ -4,9 +4,10 @@ class SelfManagedPrometheusAlertEvent < ApplicationRecord
include AlertEventLifecycle
belongs_to :project, validate: true, inverse_of: :self_managed_prometheus_alert_events
belongs_to :environment, validate: true
belongs_to :environment, validate: true, inverse_of: :self_managed_prometheus_alert_events
has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_self_managed_prometheus_alert_events
validates :started_at, presence: true
validates :payload_key, uniqueness: { scope: :project_id }
def self.find_or_initialize_by_payload_key(project, payload_key)
......
......@@ -242,6 +242,12 @@ module EE
create_note(NoteSummary.new(noteable, project, author, body, action: 'merge'))
end
# Records a system note stating that the issue was closed automatically
# because the alert resolved.
#
# Builds a NoteSummary with the 'closed' action from the given noteable,
# project and author, and hands it to +create_note+.
#
# @param noteable the record the note is attached to
# @param project the project passed through to NoteSummary
# @param author the user recorded as the note's author
# @return whatever +create_note+ returns
def auto_resolve_prometheus_alert(noteable, project, author)
  summary = NoteSummary.new(
    noteable,
    project,
    author,
    'automatically closed this issue because the alert resolved.',
    action: 'closed'
  )

  create_note(summary)
end
private
# We do not have a named route for DesignManagement::Version, instead
......
......@@ -20,22 +20,11 @@ module Projects
return unless parsed_alert.valid?
event = if parsed_alert.gitlab_managed?
build_managed_prometheus_alert_event(parsed_alert)
else
build_self_managed_prometheus_alert_event(parsed_alert)
end
if event
result = case parsed_alert.status
when 'firing'
event.fire(parsed_alert.starts_at)
when 'resolved'
event.resolve(parsed_alert.ends_at)
end
if parsed_alert.gitlab_managed?
create_managed_prometheus_alert_event(parsed_alert)
else
create_self_managed_prometheus_alert_event(parsed_alert)
end
event if result
end
def alerts
......@@ -49,24 +38,36 @@ module Projects
.first
end
def build_managed_prometheus_alert_event(parsed_alert)
def create_managed_prometheus_alert_event(parsed_alert)
alert = find_alert(parsed_alert.metric_id)
return if alert.blank?
payload_key = PrometheusAlertEvent.payload_key_for(parsed_alert.metric_id, parsed_alert.starts_at_raw)
PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, payload_key)
event = PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, payload_key)
set_status(parsed_alert, event)
end
def build_self_managed_prometheus_alert_event(parsed_alert)
def create_self_managed_prometheus_alert_event(parsed_alert)
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(parsed_alert.starts_at_raw, parsed_alert.title, parsed_alert.full_query)
SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, payload_key) do |event|
event = SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, payload_key) do |event|
event.environment = parsed_alert.environment
event.title = parsed_alert.title
event.query_expression = parsed_alert.full_query
end
set_status(parsed_alert, event)
end
# Applies the status reported by the parsed alert to the given event.
#
# A 'firing' alert calls +event.fire+ with the alert's start time; a
# 'resolved' alert calls +event.resolve+ with its end time. Any other status
# leaves the event untouched.
#
# @param parsed_alert object responding to +status+, +starts_at+ and +ends_at+
# @param event object responding to +fire+ and +resolve+
# @return the event when the transition call returned a truthy value,
#   otherwise nil (including for an unrecognised status)
def set_status(parsed_alert, event)
  transitioned =
    case parsed_alert.status
    when 'firing' then event.fire(parsed_alert.starts_at)
    when 'resolved' then event.resolve(parsed_alert.ends_at)
    end

  return unless transitioned

  event
end
end
end
......
......@@ -12,7 +12,7 @@ module Projects
persist_events
send_alert_email if send_email?
process_incident_issues if create_issue?
process_incident_issues if process_issues?
true
end
......@@ -41,8 +41,7 @@ module Projects
incident_management_setting.send_email && firings.any?
end
def create_issue?
return unless firings.any?
def process_issues?
return unless incident_management_available?
incident_management_setting.create_issue?
......@@ -128,7 +127,7 @@ module Projects
end
def process_incident_issues
firings.each do |alert|
alerts.each do |alert|
IncidentManagement::ProcessPrometheusAlertWorker
.perform_async(project.id, alert.to_h)
end
......
......@@ -13,9 +13,16 @@ module IncidentManagement
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: alert_hash)
event = find_prometheus_alert_event(parsed_alert)
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue)
if event&.resolved?
issue = event.related_issues.order_created_at_desc.detect(&:opened?)
close_issue(project, issue)
else
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue)
end
end
private
......@@ -33,17 +40,25 @@ module IncidentManagement
end
def find_gitlab_managed_event(alert)
payload_key = PrometheusAlertEvent.payload_key_for(alert.metric_id, alert.starts_at_raw)
payload_key = payload_key_for_alert(alert)
PrometheusAlertEvent.find_by_payload_key(payload_key)
end
def find_self_managed_event(alert)
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(alert.starts_at_raw, alert.title, alert.full_query)
payload_key = payload_key_for_alert(alert)
SelfManagedPrometheusAlertEvent.find_by_payload_key(payload_key)
end
# Computes the payload key used to look up the event for an alert.
#
# GitLab-managed alerts derive the key from the metric id and raw start time
# via PrometheusAlertEvent; all other alerts derive it from the raw start
# time, title and full query via SelfManagedPrometheusAlertEvent.
#
# @param alert object responding to +gitlab_managed?+, +metric_id+,
#   +starts_at_raw+, +title+ and +full_query+
# @return the value returned by the matching +payload_key_for+ class method
def payload_key_for_alert(alert)
  if alert.gitlab_managed?
    return PrometheusAlertEvent.payload_key_for(alert.metric_id, alert.starts_at_raw)
  end

  SelfManagedPrometheusAlertEvent.payload_key_for(
    alert.starts_at_raw,
    alert.title,
    alert.full_query
  )
end
def create_issue(project, alert)
IncidentManagement::CreateIssueService
.new(project, alert)
......@@ -51,6 +66,16 @@ module IncidentManagement
.dig(:issue)
end
# Closes the given issue on behalf of the alert bot and, if the issue ends up
# closed, records the auto-resolve system note.
#
# Nothing happens when the issue is blank or already closed. The close
# service is invoked with +system_note: false+; the note is added here only
# after the returned issue, once reset, reports itself as closed.
#
# @param project the project the issue belongs to
# @param issue the issue to close; may be blank
# @return the result of SystemNoteService.auto_resolve_prometheus_alert, or
#   nil when the issue was skipped or did not end up closed
def close_issue(project, issue)
  return if issue.blank?
  return if issue.closed?

  close_service = Issues::CloseService.new(project, User.alert_bot)
  processed_issue = close_service.execute(issue, system_note: false)

  return unless processed_issue.reset.closed?

  SystemNoteService.auto_resolve_prometheus_alert(issue, project, User.alert_bot)
end
def relate_issue_to_event(event, issue)
return unless event && issue
......
......@@ -81,6 +81,14 @@ module Gitlab
end
end
# Whether this alert reports the 'firing' status.
#
# @return [Boolean] true when +status+ equals the string 'firing'
def firing?
status == 'firing'
end
# Whether this alert reports the 'resolved' status.
#
# @return [Boolean] true when +status+ equals the string 'resolved'
def resolved?
status == 'resolved'
end
# Whether this alert is treated as GitLab-managed, decided solely by the
# presence of a metric id.
# NOTE(review): presumably only alerts configured through GitLab carry a
# metric id in their payload — confirm against the payload parsing code.
#
# @return [Boolean] true when +metric_id+ is present
def gitlab_managed?
metric_id.present?
end
......
......@@ -235,9 +235,8 @@ describe Projects::Prometheus::Alerts::NotifyService do
it 'does not send notification email', :sidekiq_might_not_need_inline do
expect(project.feature_available?(:incident_management)).to eq(true)
expect_next_instance_of(NotificationService) do |service|
expect(service).not_to receive(:async)
end
expect_any_instance_of(NotificationService)
.not_to receive(:async)
expect(subject).to eq(true)
end
......@@ -291,14 +290,14 @@ describe Projects::Prometheus::Alerts::NotifyService do
setting.update!(create_issue: true)
end
it_behaves_like 'processes incident issues', 1
it_behaves_like 'processes incident issues', 2
context 'without firing alerts' do
let(:payload_raw) do
payload_for(firing: [], resolved: [alert_resolved])
end
it_behaves_like 'does not process incident issues'
it_behaves_like 'processes incident issues', 1
end
end
......
......@@ -6,11 +6,8 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
describe '#perform' do
let_it_be(:project) { create(:project) }
let_it_be(:prometheus_alert) { create(:prometheus_alert, project: project) }
before_all do
payload_key = PrometheusAlertEvent.payload_key_for(prometheus_alert.prometheus_metric_id, prometheus_alert.created_at.rfc3339)
create(:prometheus_alert_event, prometheus_alert: prometheus_alert, payload_key: payload_key)
end
let_it_be(:payload_key) { PrometheusAlertEvent.payload_key_for(prometheus_alert.prometheus_metric_id, prometheus_alert.created_at.rfc3339) }
let!(:prometheus_alert_event) { create(:prometheus_alert_event, prometheus_alert: prometheus_alert, payload_key: payload_key) }
let(:alert_params) do
{
......@@ -34,6 +31,34 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
.to(1)
end
# Covers the case where the alert event is already resolved and has a related
# issue: the worker is expected to close that issue instead of opening a new
# one, and to leave a system note.
context 'resolved event' do
let(:issue) { create(:issue, project: project) }
before do
# Relate the issue to the event, then mark the event as resolved, before
# each example runs.
prometheus_alert_event.related_issues << issue
prometheus_alert_event.resolve
end
it 'does not create an issue' do
expect { subject.perform(project.id, alert_params) }
.not_to change(Issue, :count)
end
it 'closes the existing issue' do
# Reload so the state is read from the database rather than the cached record.
expect { subject.perform(project.id, alert_params) }
.to change { issue.reload.state }
.from('opened')
.to('closed')
end
it 'leaves a system note on the issue' do
# The expectation must be registered before the worker runs.
expect(SystemNoteService)
.to receive(:auto_resolve_prometheus_alert)
subject.perform(project.id, alert_params)
end
end
context 'when project could not be found' do
it 'does not create an issue' do
expect { subject.perform('1234', alert_params) }
......@@ -79,7 +104,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
let(:alert_name) { 'alert' }
let(:starts_at) { Time.now.rfc3339 }
let!(:prometheus_alert) do
let!(:prometheus_alert_event) do
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(starts_at, alert_name, 'vector(1)')
create(:self_managed_prometheus_alert_event, project: project, payload_key: payload_key)
end
......@@ -102,7 +127,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert.related_issues, :count)
.to change(prometheus_alert_event.related_issues, :count)
.from(0)
.to(1)
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment