Commit 60ecd173 authored by Stan Hu

Merge branch '13401-close-gitlab-issue-on-recovery-alerts-from-prometheus' into 'master'

Resolve "Close GitLab issue on Recovery alerts from Prometheus"

Closes #13401

See merge request gitlab-org/gitlab!18431
parents dbff1452 4dfa692e
---
title: Close issues on Prometheus alert recovery
merge_request: 18431
author:
type: added
# frozen_string_literal: true # frozen_string_literal: true
class AddSelfManagedPrometheusAlerts < ActiveRecord::Migration[5.2] class AddSelfManagedPrometheusAlerts < ActiveRecord::Migration[5.2]
# Set this constant to true if this migration requires downtime.
DOWNTIME = false DOWNTIME = false
def change def change
......
...@@ -8,6 +8,7 @@ module EE ...@@ -8,6 +8,7 @@ module EE
prepended do prepended do
has_many :prometheus_alerts, inverse_of: :environment has_many :prometheus_alerts, inverse_of: :environment
has_many :self_managed_prometheus_alert_events, inverse_of: :environment
has_one :last_deployable, through: :last_deployment, source: 'deployable', source_type: 'CommitStatus' has_one :last_deployable, through: :last_deployment, source: 'deployable', source_type: 'CommitStatus'
has_one :last_pipeline, through: :last_deployable, source: 'pipeline' has_one :last_pipeline, through: :last_deployable, source: 'pipeline'
......
...@@ -16,6 +16,7 @@ module EE ...@@ -16,6 +16,7 @@ module EE
scope :order_weight_desc, -> { reorder ::Gitlab::Database.nulls_last_order('weight', 'DESC') } scope :order_weight_desc, -> { reorder ::Gitlab::Database.nulls_last_order('weight', 'DESC') }
scope :order_weight_asc, -> { reorder ::Gitlab::Database.nulls_last_order('weight') } scope :order_weight_asc, -> { reorder ::Gitlab::Database.nulls_last_order('weight') }
scope :order_created_at_desc, -> { reorder(created_at: :desc) }
scope :service_desk, -> { where(author: ::User.support_bot) } scope :service_desk, -> { where(author: ::User.support_bot) }
has_one :epic_issue has_one :epic_issue
......
...@@ -8,6 +8,7 @@ class PrometheusAlertEvent < ApplicationRecord ...@@ -8,6 +8,7 @@ class PrometheusAlertEvent < ApplicationRecord
has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_prometheus_alert_events has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_prometheus_alert_events
validates :payload_key, uniqueness: { scope: :prometheus_alert_id } validates :payload_key, uniqueness: { scope: :prometheus_alert_id }
validates :started_at, presence: true
delegate :title, :prometheus_metric_id, to: :prometheus_alert delegate :title, :prometheus_metric_id, to: :prometheus_alert
......
...@@ -4,9 +4,10 @@ class SelfManagedPrometheusAlertEvent < ApplicationRecord ...@@ -4,9 +4,10 @@ class SelfManagedPrometheusAlertEvent < ApplicationRecord
include AlertEventLifecycle include AlertEventLifecycle
belongs_to :project, validate: true, inverse_of: :self_managed_prometheus_alert_events belongs_to :project, validate: true, inverse_of: :self_managed_prometheus_alert_events
belongs_to :environment, validate: true belongs_to :environment, validate: true, inverse_of: :self_managed_prometheus_alert_events
has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_self_managed_prometheus_alert_events has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_self_managed_prometheus_alert_events
validates :started_at, presence: true
validates :payload_key, uniqueness: { scope: :project_id } validates :payload_key, uniqueness: { scope: :project_id }
def self.find_or_initialize_by_payload_key(project, payload_key) def self.find_or_initialize_by_payload_key(project, payload_key)
......
...@@ -242,6 +242,12 @@ module EE ...@@ -242,6 +242,12 @@ module EE
create_note(NoteSummary.new(noteable, project, author, body, action: 'merge')) create_note(NoteSummary.new(noteable, project, author, body, action: 'merge'))
end end
# Adds a system note stating that the issue was closed automatically
# because its alert resolved.
#
# noteable - record the note is attached to (passed to NoteSummary)
# project  - passed to NoteSummary unchanged
# author   - passed to NoteSummary unchanged
#
# Example Note text:
#
#   "automatically closed this issue because the alert resolved."
#
# Returns whatever create_note returns.
def auto_resolve_prometheus_alert(noteable, project, author)
  summary = NoteSummary.new(
    noteable,
    project,
    author,
    'automatically closed this issue because the alert resolved.',
    action: 'closed'
  )

  create_note(summary)
end
private private
# We do not have a named route for DesignManagement::Version, instead # We do not have a named route for DesignManagement::Version, instead
......
...@@ -20,22 +20,11 @@ module Projects ...@@ -20,22 +20,11 @@ module Projects
return unless parsed_alert.valid? return unless parsed_alert.valid?
event = if parsed_alert.gitlab_managed? if parsed_alert.gitlab_managed?
build_managed_prometheus_alert_event(parsed_alert) create_managed_prometheus_alert_event(parsed_alert)
else else
build_self_managed_prometheus_alert_event(parsed_alert) create_self_managed_prometheus_alert_event(parsed_alert)
end
if event
result = case parsed_alert.status
when 'firing'
event.fire(parsed_alert.starts_at)
when 'resolved'
event.resolve(parsed_alert.ends_at)
end
end end
event if result
end end
def alerts def alerts
...@@ -49,24 +38,36 @@ module Projects ...@@ -49,24 +38,36 @@ module Projects
.first .first
end end
def build_managed_prometheus_alert_event(parsed_alert) def create_managed_prometheus_alert_event(parsed_alert)
alert = find_alert(parsed_alert.metric_id) alert = find_alert(parsed_alert.metric_id)
return if alert.blank?
payload_key = PrometheusAlertEvent.payload_key_for(parsed_alert.metric_id, parsed_alert.starts_at_raw) payload_key = PrometheusAlertEvent.payload_key_for(parsed_alert.metric_id, parsed_alert.starts_at_raw)
PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, payload_key) event = PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, payload_key)
set_status(parsed_alert, event)
end end
def build_self_managed_prometheus_alert_event(parsed_alert) def create_self_managed_prometheus_alert_event(parsed_alert)
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(parsed_alert.starts_at_raw, parsed_alert.title, parsed_alert.full_query) payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(parsed_alert.starts_at_raw, parsed_alert.title, parsed_alert.full_query)
SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, payload_key) do |event| event = SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, payload_key) do |event|
event.environment = parsed_alert.environment event.environment = parsed_alert.environment
event.title = parsed_alert.title event.title = parsed_alert.title
event.query_expression = parsed_alert.full_query event.query_expression = parsed_alert.full_query
end end
set_status(parsed_alert, event)
end
def set_status(parsed_alert, event)
persisted = case parsed_alert.status
when 'firing'
event.fire(parsed_alert.starts_at)
when 'resolved'
event.resolve(parsed_alert.ends_at)
end
event if persisted
end end
end end
end end
......
...@@ -12,7 +12,7 @@ module Projects ...@@ -12,7 +12,7 @@ module Projects
persist_events persist_events
send_alert_email if send_email? send_alert_email if send_email?
process_incident_issues if create_issue? process_incident_issues if process_issues?
true true
end end
...@@ -41,8 +41,7 @@ module Projects ...@@ -41,8 +41,7 @@ module Projects
incident_management_setting.send_email && firings.any? incident_management_setting.send_email && firings.any?
end end
def create_issue? def process_issues?
return unless firings.any?
return unless incident_management_available? return unless incident_management_available?
incident_management_setting.create_issue? incident_management_setting.create_issue?
...@@ -128,7 +127,7 @@ module Projects ...@@ -128,7 +127,7 @@ module Projects
end end
def process_incident_issues def process_incident_issues
firings.each do |alert| alerts.each do |alert|
IncidentManagement::ProcessPrometheusAlertWorker IncidentManagement::ProcessPrometheusAlertWorker
.perform_async(project.id, alert.to_h) .perform_async(project.id, alert.to_h)
end end
......
...@@ -13,9 +13,16 @@ module IncidentManagement ...@@ -13,9 +13,16 @@ module IncidentManagement
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: alert_hash) parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: alert_hash)
event = find_prometheus_alert_event(parsed_alert) event = find_prometheus_alert_event(parsed_alert)
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue) if event&.resolved?
issue = event.related_issues.order_created_at_desc.detect(&:opened?)
close_issue(project, issue)
else
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue)
end
end end
private private
...@@ -33,17 +40,25 @@ module IncidentManagement ...@@ -33,17 +40,25 @@ module IncidentManagement
end end
def find_gitlab_managed_event(alert) def find_gitlab_managed_event(alert)
payload_key = PrometheusAlertEvent.payload_key_for(alert.metric_id, alert.starts_at_raw) payload_key = payload_key_for_alert(alert)
PrometheusAlertEvent.find_by_payload_key(payload_key) PrometheusAlertEvent.find_by_payload_key(payload_key)
end end
def find_self_managed_event(alert) def find_self_managed_event(alert)
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(alert.starts_at_raw, alert.title, alert.full_query) payload_key = payload_key_for_alert(alert)
SelfManagedPrometheusAlertEvent.find_by_payload_key(payload_key) SelfManagedPrometheusAlertEvent.find_by_payload_key(payload_key)
end end
# Computes the payload key used to look up the stored event for +alert+.
#
# GitLab-managed alerts (alert.gitlab_managed? is truthy) are keyed by
# metric id and raw start time via PrometheusAlertEvent; every other alert
# is keyed by raw start time, title and full query via
# SelfManagedPrometheusAlertEvent.
#
# alert - object responding to gitlab_managed?, metric_id, starts_at_raw,
#         title and full_query
#
# Returns the value produced by the matching payload_key_for class method.
def payload_key_for_alert(alert)
  unless alert.gitlab_managed?
    return SelfManagedPrometheusAlertEvent.payload_key_for(
      alert.starts_at_raw,
      alert.title,
      alert.full_query
    )
  end

  PrometheusAlertEvent.payload_key_for(alert.metric_id, alert.starts_at_raw)
end
def create_issue(project, alert) def create_issue(project, alert)
IncidentManagement::CreateIssueService IncidentManagement::CreateIssueService
.new(project, alert) .new(project, alert)
...@@ -51,6 +66,16 @@ module IncidentManagement ...@@ -51,6 +66,16 @@ module IncidentManagement
.dig(:issue) .dig(:issue)
end end
# Closes +issue+ on behalf of the alert bot and, when the close took effect,
# records the auto-resolve system note.
#
# project - passed to Issues::CloseService and to the system note
# issue   - issue to close; nothing happens when it is blank or already closed
#
# Returns nil when nothing was closed, otherwise the result of
# SystemNoteService.auto_resolve_prometheus_alert.
def close_issue(project, issue)
  return if issue.blank?
  return if issue.closed?

  # system_note: false is passed to the close service; the dedicated
  # auto-resolve note below is written here instead.
  close_service = Issues::CloseService.new(project, User.alert_bot)
  processed_issue = close_service.execute(issue, system_note: false)

  # NOTE(review): `reset` presumably refreshes the record before the state
  # check - confirm against the model.
  return unless processed_issue.reset.closed?

  SystemNoteService.auto_resolve_prometheus_alert(issue, project, User.alert_bot)
end
def relate_issue_to_event(event, issue) def relate_issue_to_event(event, issue)
return unless event && issue return unless event && issue
......
...@@ -81,6 +81,14 @@ module Gitlab ...@@ -81,6 +81,14 @@ module Gitlab
end end
end end
# Returns true when this alert's status equals 'firing', false otherwise.
def firing?
status == 'firing'
end
# Returns true when this alert's status equals 'resolved', false otherwise.
def resolved?
status == 'resolved'
end
def gitlab_managed? def gitlab_managed?
metric_id.present? metric_id.present?
end end
......
...@@ -235,9 +235,8 @@ describe Projects::Prometheus::Alerts::NotifyService do ...@@ -235,9 +235,8 @@ describe Projects::Prometheus::Alerts::NotifyService do
it 'does not send notification email', :sidekiq_might_not_need_inline do it 'does not send notification email', :sidekiq_might_not_need_inline do
expect(project.feature_available?(:incident_management)).to eq(true) expect(project.feature_available?(:incident_management)).to eq(true)
expect_next_instance_of(NotificationService) do |service| expect_any_instance_of(NotificationService)
expect(service).not_to receive(:async) .not_to receive(:async)
end
expect(subject).to eq(true) expect(subject).to eq(true)
end end
...@@ -291,14 +290,14 @@ describe Projects::Prometheus::Alerts::NotifyService do ...@@ -291,14 +290,14 @@ describe Projects::Prometheus::Alerts::NotifyService do
setting.update!(create_issue: true) setting.update!(create_issue: true)
end end
it_behaves_like 'processes incident issues', 1 it_behaves_like 'processes incident issues', 2
context 'without firing alerts' do context 'without firing alerts' do
let(:payload_raw) do let(:payload_raw) do
payload_for(firing: [], resolved: [alert_resolved]) payload_for(firing: [], resolved: [alert_resolved])
end end
it_behaves_like 'does not process incident issues' it_behaves_like 'processes incident issues', 1
end end
end end
......
...@@ -6,11 +6,8 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do ...@@ -6,11 +6,8 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
describe '#perform' do describe '#perform' do
let_it_be(:project) { create(:project) } let_it_be(:project) { create(:project) }
let_it_be(:prometheus_alert) { create(:prometheus_alert, project: project) } let_it_be(:prometheus_alert) { create(:prometheus_alert, project: project) }
let_it_be(:payload_key) { PrometheusAlertEvent.payload_key_for(prometheus_alert.prometheus_metric_id, prometheus_alert.created_at.rfc3339) }
before_all do let!(:prometheus_alert_event) { create(:prometheus_alert_event, prometheus_alert: prometheus_alert, payload_key: payload_key) }
payload_key = PrometheusAlertEvent.payload_key_for(prometheus_alert.prometheus_metric_id, prometheus_alert.created_at.rfc3339)
create(:prometheus_alert_event, prometheus_alert: prometheus_alert, payload_key: payload_key)
end
let(:alert_params) do let(:alert_params) do
{ {
...@@ -34,6 +31,34 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do ...@@ -34,6 +31,34 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
.to(1) .to(1)
end end
# Covers the worker when the stored alert event has been resolved and
# already has a related issue.
context 'resolved event' do
let(:issue) { create(:issue, project: project) }
# Link the issue to the event, then resolve the event before each example.
before do
prometheus_alert_event.related_issues << issue
prometheus_alert_event.resolve
end
# The total number of issues must stay the same after perform.
it 'does not create an issue' do
expect { subject.perform(project.id, alert_params) }
.not_to change(Issue, :count)
end
# The related issue's state must go from 'opened' to 'closed'.
it 'closes the existing issue' do
expect { subject.perform(project.id, alert_params) }
.to change { issue.reload.state }
.from('opened')
.to('closed')
end
# Only asserts that the system note hook is invoked; its arguments are not checked.
it 'leaves a system note on the issue' do
expect(SystemNoteService)
.to receive(:auto_resolve_prometheus_alert)
subject.perform(project.id, alert_params)
end
end
context 'when project could not be found' do context 'when project could not be found' do
it 'does not create an issue' do it 'does not create an issue' do
expect { subject.perform('1234', alert_params) } expect { subject.perform('1234', alert_params) }
...@@ -79,7 +104,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do ...@@ -79,7 +104,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
let(:alert_name) { 'alert' } let(:alert_name) { 'alert' }
let(:starts_at) { Time.now.rfc3339 } let(:starts_at) { Time.now.rfc3339 }
let!(:prometheus_alert) do let!(:prometheus_alert_event) do
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(starts_at, alert_name, 'vector(1)') payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(starts_at, alert_name, 'vector(1)')
create(:self_managed_prometheus_alert_event, project: project, payload_key: payload_key) create(:self_managed_prometheus_alert_event, project: project, payload_key: payload_key)
end end
...@@ -102,7 +127,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do ...@@ -102,7 +127,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
it 'relates issue to an event' do it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) } expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert.related_issues, :count) .to change(prometheus_alert_event.related_issues, :count)
.from(0) .from(0)
.to(1) .to(1)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment