Commit 26c00c8e authored by Sean McGivern's avatar Sean McGivern

Merge branch '222253-stop-saving-and-processing-prometheus-events' into 'master'

Resolve "Merge or add relation between Alert models"

See merge request gitlab-org/gitlab!35943
parents 955accb3 2ff8ba9a
# frozen_string_literal: true
module Projects
module Prometheus
module Alerts
# Persists a series of Prometheus alert events as list of PrometheusAlertEvent.
class CreateEventsService < BaseService
def execute
create_events_from(alerts)
end
private
def create_events_from(alerts)
Array.wrap(alerts).map { |alert| create_event(alert) }.compact
end
def create_event(payload)
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: payload)
return unless parsed_alert.valid?
if parsed_alert.gitlab_managed?
create_managed_prometheus_alert_event(parsed_alert)
else
create_self_managed_prometheus_alert_event(parsed_alert)
end
end
def alerts
params['alerts']
end
def find_alert(metric)
Projects::Prometheus::AlertsFinder
.new(project: project, metric: metric)
.execute
.first
end
def create_managed_prometheus_alert_event(parsed_alert)
alert = find_alert(parsed_alert.metric_id)
event = PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, parsed_alert.gitlab_fingerprint)
set_status(parsed_alert, event)
end
def create_self_managed_prometheus_alert_event(parsed_alert)
event = SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, parsed_alert.gitlab_fingerprint) do |event|
event.environment = parsed_alert.environment
event.title = parsed_alert.title
event.query_expression = parsed_alert.full_query
end
set_status(parsed_alert, event)
end
def set_status(parsed_alert, event)
persisted = case parsed_alert.status
when 'firing'
event.fire(parsed_alert.starts_at)
when 'resolved'
event.resolve(parsed_alert.ends_at)
end
event if persisted
end
end
end
end
end
...@@ -23,9 +23,7 @@ module Projects ...@@ -23,9 +23,7 @@ module Projects
return unauthorized unless valid_alert_manager_token?(token) return unauthorized unless valid_alert_manager_token?(token)
process_prometheus_alerts process_prometheus_alerts
persist_events
send_alert_email if send_email? send_alert_email if send_email?
process_incident_issues if process_issues?
ServiceResponse.success ServiceResponse.success
end end
...@@ -132,13 +130,6 @@ module Projects ...@@ -132,13 +130,6 @@ module Projects
.prometheus_alerts_fired(project, firings) .prometheus_alerts_fired(project, firings)
end end
def process_incident_issues
alerts.each do |alert|
IncidentManagement::ProcessPrometheusAlertWorker
.perform_async(project.id, alert.to_h)
end
end
def process_prometheus_alerts def process_prometheus_alerts
alerts.each do |alert| alerts.each do |alert|
AlertManagement::ProcessPrometheusAlertService AlertManagement::ProcessPrometheusAlertService
...@@ -147,10 +138,6 @@ module Projects ...@@ -147,10 +138,6 @@ module Projects
end end
end end
def persist_events
CreateEventsService.new(project, nil, params).execute
end
def bad_request def bad_request
ServiceResponse.error(message: 'Bad Request', http_status: :bad_request) ServiceResponse.error(message: 'Bad Request', http_status: :bad_request)
end end
......
...@@ -9,68 +9,13 @@ module IncidentManagement ...@@ -9,68 +9,13 @@ module IncidentManagement
worker_resource_boundary :cpu worker_resource_boundary :cpu
def perform(project_id, alert_hash) def perform(project_id, alert_hash)
project = find_project(project_id) # no-op
return unless project #
# This worker is not scheduled anymore since
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: alert_hash) # https://gitlab.com/gitlab-org/gitlab/-/merge_requests/35943
event = find_prometheus_alert_event(parsed_alert) # and will be removed completely via
# https://gitlab.com/gitlab-org/gitlab/-/issues/227146
if event&.resolved? # in 14.0.
issue = event.related_issues.order_created_at_desc.detect(&:opened?)
close_issue(project, issue)
else
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue)
end
end
private
def find_project(project_id)
Project.find_by_id(project_id)
end
def find_prometheus_alert_event(alert)
if alert.gitlab_managed?
find_gitlab_managed_event(alert)
else
find_self_managed_event(alert)
end
end
def find_gitlab_managed_event(alert)
PrometheusAlertEvent.find_by_payload_key(alert.gitlab_fingerprint)
end
def find_self_managed_event(alert)
SelfManagedPrometheusAlertEvent.find_by_payload_key(alert.gitlab_fingerprint)
end
def create_issue(project, alert)
IncidentManagement::CreateIssueService
.new(project, alert)
.execute
.dig(:issue)
end
def close_issue(project, issue)
return if issue.blank? || issue.closed?
processed_issue = Issues::CloseService
.new(project, User.alert_bot)
.execute(issue, system_note: false)
SystemNoteService.auto_resolve_prometheus_alert(issue, project, User.alert_bot) if processed_issue.reset.closed?
end
def relate_issue_to_event(event, issue)
return unless event && issue
if event.related_issues.exclude?(issue)
event.related_issues << issue
end
end end
end end
end end
...@@ -36,48 +36,8 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -36,48 +36,8 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
end end
end end
shared_examples 'processes incident issues' do |amount|
let(:create_incident_service) { spy }
it 'processes issues' do
expect(IncidentManagement::ProcessPrometheusAlertWorker)
.to receive(:perform_async)
.with(project.id, kind_of(Hash))
.exactly(amount).times
Sidekiq::Testing.inline! do
expect(subject).to be_success
end
end
end
shared_examples 'does not process incident issues' do
it 'does not process issues' do
expect(IncidentManagement::ProcessPrometheusAlertWorker)
.not_to receive(:perform_async)
expect(subject).to be_success
end
end
shared_examples 'persists events' do
let(:create_events_service) { spy }
it 'persists events' do
expect(Projects::Prometheus::Alerts::CreateEventsService)
.to receive(:new)
.and_return(create_events_service)
expect(create_events_service)
.to receive(:execute)
expect(subject).to be_success
end
end
shared_examples 'notifies alerts' do shared_examples 'notifies alerts' do
it_behaves_like 'sends notification email' it_behaves_like 'sends notification email'
it_behaves_like 'persists events'
end end
shared_examples 'no notifications' do |http_status:| shared_examples 'no notifications' do |http_status:|
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Projects::Prometheus::Alerts::CreateEventsService do
let(:user) { create(:user) }
let_it_be(:project) { create(:project) }
let(:metric) { create(:prometheus_metric, project: project) }
let(:service) { described_class.new(project, user, alerts_payload) }
shared_examples 'events persisted' do |expected_count|
subject { service.execute }
it 'returns proper amount of created events' do
expect(subject.size).to eq(expected_count)
end
it 'increments event count' do
expect { subject }.to change { PrometheusAlertEvent.count }.to(expected_count)
end
end
shared_examples 'no events persisted' do
subject { service.execute }
it 'returns no created events' do
expect(subject).to be_empty
end
it 'does not change event count' do
expect { subject }.not_to change { PrometheusAlertEvent.count }
end
end
shared_examples 'self managed events persisted' do
subject { service.execute }
it 'returns created events' do
expect(subject).not_to be_empty
end
it 'does change self managed event count' do
expect { subject }.to change { SelfManagedPrometheusAlertEvent.count }
end
end
context 'with valid alerts_payload' do
let!(:alert) { create(:prometheus_alert, prometheus_metric: metric, project: project) }
let(:events) { service.execute }
context 'with a firing payload' do
let(:started_at) { truncate_to_second(Time.current) }
let(:firing_event) { alert_payload(status: 'firing', started_at: started_at) }
let(:alerts_payload) { { 'alerts' => [firing_event] } }
it_behaves_like 'events persisted', 1
it 'returns created event' do
event = events.first
expect(event).to be_firing
expect(event.started_at).to eq(started_at)
expect(event.ended_at).to be_nil
end
context 'with 2 different firing events' do
let(:another_firing_event) { alert_payload(status: 'firing', started_at: started_at + 1) }
let(:alerts_payload) { { 'alerts' => [firing_event, another_firing_event] } }
it_behaves_like 'events persisted', 2
end
context 'with already persisted firing event' do
before do
service.execute
end
it_behaves_like 'no events persisted'
end
context 'with duplicate payload' do
let(:alerts_payload) { { 'alerts' => [firing_event, firing_event] } }
it_behaves_like 'events persisted', 1
end
end
context 'with a resolved payload' do
let(:started_at) { truncate_to_second(Time.current) }
let(:ended_at) { started_at + 1 }
let(:resolved_event) { alert_payload(status: 'resolved', started_at: started_at, ended_at: ended_at) }
let(:alerts_payload) { { 'alerts' => [resolved_event] } }
let(:payload_key) { Gitlab::Alerting::Alert.new(project: project, payload: resolved_event).gitlab_fingerprint }
context 'with a matching firing event' do
before do
create(:prometheus_alert_event,
prometheus_alert: alert,
payload_key: payload_key,
started_at: started_at)
end
it 'does not create an additional event' do
expect { service.execute }.not_to change { PrometheusAlertEvent.count }
end
it 'marks firing event as `resolved`' do
expect(events.size).to eq(1)
event = events.first
expect(event).to be_resolved
expect(event.started_at).to eq(started_at)
expect(event.ended_at).to eq(ended_at)
end
context 'with duplicate payload' do
let(:alerts_payload) { { 'alerts' => [resolved_event, resolved_event] } }
it 'does not create an additional event' do
expect { service.execute }.not_to change { PrometheusAlertEvent.count }
end
it 'marks firing event as `resolved` only once' do
expect(events.size).to eq(1)
end
end
end
context 'without a matching firing event' do
context 'due to payload_key' do
let(:payload_key) { 'some other payload_key' }
before do
create(:prometheus_alert_event,
prometheus_alert: alert,
payload_key: payload_key,
started_at: started_at)
end
it_behaves_like 'no events persisted'
end
context 'due to status' do
before do
create(:prometheus_alert_event, :resolved,
prometheus_alert: alert,
started_at: started_at)
end
it_behaves_like 'no events persisted'
end
end
context 'with already resolved event' do
before do
service.execute
end
it_behaves_like 'no events persisted'
end
end
context 'with a metric from another project' do
let(:another_project) { create(:project) }
let(:metric) { create(:prometheus_metric, project: another_project) }
let(:alerts_payload) { { 'alerts' => [alert_payload] } }
let!(:alert) do
create(:prometheus_alert,
prometheus_metric: metric,
project: another_project)
end
it_behaves_like 'no events persisted'
end
end
context 'with invalid payload' do
let(:alert) { create(:prometheus_alert, prometheus_metric: metric, project: project) }
describe '`alerts` key' do
context 'is missing' do
let(:alerts_payload) { {} }
it_behaves_like 'no events persisted'
end
context 'is nil' do
let(:alerts_payload) { { 'alerts' => nil } }
it_behaves_like 'no events persisted'
end
context 'is empty' do
let(:alerts_payload) { { 'alerts' => [] } }
it_behaves_like 'no events persisted'
end
context 'is not a Hash' do
let(:alerts_payload) { { 'alerts' => [:not_a_hash] } }
it_behaves_like 'no events persisted'
end
describe '`status`' do
context 'is missing' do
let(:alerts_payload) { { 'alerts' => [alert_payload(status: nil)] } }
it_behaves_like 'no events persisted'
end
context 'is invalid' do
let(:alerts_payload) { { 'alerts' => [alert_payload(status: 'invalid')] } }
it_behaves_like 'no events persisted'
end
end
describe '`started_at`' do
context 'is missing' do
let(:alerts_payload) { { 'alerts' => [alert_payload(started_at: nil)] } }
it_behaves_like 'no events persisted'
end
context 'is invalid' do
let(:alerts_payload) { { 'alerts' => [alert_payload(started_at: 'invalid date')] } }
it_behaves_like 'no events persisted'
end
end
describe '`ended_at`' do
context 'is missing and status is resolved' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: nil, status: 'resolved')] } }
it_behaves_like 'no events persisted'
end
context 'is invalid and status is resolved' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: 'invalid date', status: 'resolved')] } }
it_behaves_like 'no events persisted'
end
end
describe '`labels`' do
describe '`gitlab_alert_id`' do
context 'is missing' do
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil)] } }
it_behaves_like 'no events persisted'
end
context 'is missing but title is given' do
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil, title: 'alert')] } }
it_behaves_like 'self managed events persisted'
end
context 'is missing and environment name is given' do
let(:environment) { create(:environment, project: project) }
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil, title: 'alert', environment: environment.name)] } }
it_behaves_like 'self managed events persisted'
it 'associates the environment to the alert event' do
service.execute
expect(SelfManagedPrometheusAlertEvent.last.environment).to eq environment
end
end
context 'is invalid' do
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: '-1')] } }
it_behaves_like 'no events persisted'
end
end
end
end
end
private
def alert_payload(status: 'firing', started_at: Time.current, ended_at: Time.current, gitlab_alert_id: alert.prometheus_metric_id, title: nil, environment: nil)
payload = {}
payload['status'] = status if status
payload['startsAt'] = utc_rfc3339(started_at) if started_at
payload['endsAt'] = utc_rfc3339(ended_at) if ended_at
payload['labels'] = {}
payload['labels']['gitlab_alert_id'] = gitlab_alert_id.to_s if gitlab_alert_id
payload['labels']['alertname'] = title if title
payload['labels']['gitlab_environment_name'] = environment if environment
payload
end
# Example: 2018-09-27T18:25:31.079079416Z
def utc_rfc3339(date)
date.utc.rfc3339
rescue
date
end
def truncate_to_second(date)
date.change(usec: 0)
end
end
...@@ -36,48 +36,8 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -36,48 +36,8 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
end end
end end
shared_examples 'processes incident issues' do |amount|
let(:create_incident_service) { spy }
it 'processes issues' do
expect(IncidentManagement::ProcessPrometheusAlertWorker)
.to receive(:perform_async)
.with(project.id, kind_of(Hash))
.exactly(amount).times
Sidekiq::Testing.inline! do
expect(subject).to be_success
end
end
end
shared_examples 'does not process incident issues' do
it 'does not process issues' do
expect(IncidentManagement::ProcessPrometheusAlertWorker)
.not_to receive(:perform_async)
expect(subject).to be_success
end
end
shared_examples 'persists events' do
let(:create_events_service) { spy }
it 'persists events' do
expect(Projects::Prometheus::Alerts::CreateEventsService)
.to receive(:new)
.and_return(create_events_service)
expect(create_events_service)
.to receive(:execute)
expect(subject).to be_success
end
end
shared_examples 'notifies alerts' do shared_examples 'notifies alerts' do
it_behaves_like 'sends notification email' it_behaves_like 'sends notification email'
it_behaves_like 'persists events'
end end
shared_examples 'no notifications' do |http_status:| shared_examples 'no notifications' do |http_status:|
...@@ -257,8 +217,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -257,8 +217,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
context 'when incident_management_setting does not exist' do context 'when incident_management_setting does not exist' do
let!(:setting) { nil } let!(:setting) { nil }
it_behaves_like 'persists events'
it 'does not send notification email', :sidekiq_might_not_need_inline do it 'does not send notification email', :sidekiq_might_not_need_inline do
expect_any_instance_of(NotificationService) expect_any_instance_of(NotificationService)
.not_to receive(:async) .not_to receive(:async)
...@@ -276,8 +234,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -276,8 +234,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
create(:project_incident_management_setting, send_email: false, project: project) create(:project_incident_management_setting, send_email: false, project: project)
end end
it_behaves_like 'persists events'
it 'does not send notification' do it 'does not send notification' do
expect(NotificationService).not_to receive(:new) expect(NotificationService).not_to receive(:new)
...@@ -311,45 +267,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -311,45 +267,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
end end
end end
end end
context 'process incident issues' do
before do
create(:prometheus_service, project: project)
create(:project_alerting_setting, project: project, token: token)
end
context 'with create_issue setting enabled' do
before do
setting.update!(create_issue: true)
end
it_behaves_like 'processes incident issues', 2
context 'multiple firing alerts' do
let(:payload_raw) do
prometheus_alert_payload(firing: [alert_firing, alert_firing], resolved: [])
end
it_behaves_like 'processes incident issues', 2
end
context 'without firing alerts' do
let(:payload_raw) do
prometheus_alert_payload(firing: [], resolved: [alert_resolved])
end
it_behaves_like 'processes incident issues', 1
end
end
context 'with create_issue setting disabled' do
before do
setting.update!(create_issue: false)
end
it_behaves_like 'does not process incident issues'
end
end
end end
context 'with invalid payload' do context 'with invalid payload' do
...@@ -380,13 +297,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -380,13 +297,6 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
subject subject
end end
it 'does not process issues' do
expect(IncidentManagement::ProcessPrometheusAlertWorker)
.not_to receive(:perform_async)
subject
end
end end
end end
......
...@@ -19,137 +19,9 @@ RSpec.describe IncidentManagement::ProcessPrometheusAlertWorker do ...@@ -19,137 +19,9 @@ RSpec.describe IncidentManagement::ProcessPrometheusAlertWorker do
}.with_indifferent_access }.with_indifferent_access
end end
it 'creates an issue' do it 'does nothing' do
expect { subject.perform(project.id, alert_params) } expect { subject.perform(project.id, alert_params) }
.to change(Issue, :count) .not_to change(Issue, :count)
.by(1)
end
it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert.related_issues, :count)
.from(0)
.to(1)
end
context 'resolved event' do
let(:issue) { create(:issue, project: project) }
before do
prometheus_alert_event.related_issues << issue
prometheus_alert_event.resolve
end
it 'does not create an issue' do
expect { subject.perform(project.id, alert_params) }
.not_to change(Issue, :count)
end
it 'closes the existing issue' do
expect { subject.perform(project.id, alert_params) }
.to change { issue.reload.state }
.from('opened')
.to('closed')
end
it 'leaves a system note on the issue' do
expect(SystemNoteService)
.to receive(:auto_resolve_prometheus_alert)
subject.perform(project.id, alert_params)
end
end
context 'when project could not be found' do
let(:non_existing_project_id) { non_existing_record_id }
it 'does not create an issue' do
expect { subject.perform(non_existing_project_id, alert_params) }
.not_to change(Issue, :count)
end
it 'does not relate issue to an event' do
expect { subject.perform(non_existing_project_id, alert_params) }
.not_to change(prometheus_alert.related_issues, :count)
end
end
context 'when event could not be found' do
before do
alert_params[:labels][:gitlab_alert_id] = non_existing_record_id
end
it 'does not create an issue' do
expect { subject.perform(project.id, alert_params) }
.not_to change(Issue, :count)
end
it 'does not relate issue to an event' do
expect { subject.perform(project.id, alert_params) }
.not_to change(prometheus_alert.related_issues, :count)
end
end
context 'when issue could not be created' do
before do
allow_next_instance_of(IncidentManagement::CreateIssueService) do |instance|
allow(instance).to receive(:execute).and_return( { error: true } )
end
end
it 'does not relate issue to an event' do
expect { subject.perform(project.id, alert_params) }
.not_to change(prometheus_alert.related_issues, :count)
end
end
context 'self-managed alert' do
let(:alert_name) { 'alert' }
let(:starts_at) { Time.now.rfc3339 }
let!(:prometheus_alert_event) do
create(:self_managed_prometheus_alert_event, project: project, payload_key: payload_key)
end
let(:alert_params) do
{
startsAt: starts_at,
generatorURL: 'http://localhost:9090/graph?g0.expr=vector%281%29&g0.tab=1',
labels: {
alertname: alert_name
}
}.with_indifferent_access
end
it 'creates an issue' do
expect { subject.perform(project.id, alert_params) }
.to change(Issue, :count)
.by(1)
end
it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert_event.related_issues, :count)
.from(0)
.to(1)
end
context 'when event could not be found' do
before do
alert_params[:generatorURL] = 'http://somethingelse.com'
end
it 'creates an issue' do
expect { subject.perform(project.id, alert_params) }
.to change(Issue, :count)
.by(1)
end
it 'does not relate issue to an event' do
expect { subject.perform(project.id, alert_params) }
.not_to change(prometheus_alert.related_issues, :count)
end
end
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment