Commit 1caffa13 authored by Tiago Botelho

Adds the backend logic required to add Prometheus Alerts to Custom Metrics

parent d784bad9
module EnvironmentsHelper module EnvironmentsHelper
prepend ::EE::EnvironmentsHelper
def environments_list_data def environments_list_data
{ {
endpoint: project_environments_path(@project, format: :json) endpoint: project_environments_path(@project, format: :json)
} }
end end
# Builds the data attributes handed to the environment metrics dashboard.
#
# project     - project the environment belongs to
# environment - environment whose metrics are shown
#
# Returns a Hash of attribute names to paths, plus "has-metrics" holding the
# stringified result of environment.has_metrics?.
def metrics_data(project, environment)
  illustration = ->(name) { image_path("illustrations/monitoring/#{name}.svg") }
  has_metrics = environment.has_metrics?.to_s

  {
    "settings-path" => edit_project_service_path(project, 'prometheus'),
    "clusters-path" => project_clusters_path(project),
    "documentation-path" => help_page_path('administration/monitoring/prometheus/index.md'),
    "empty-getting-started-svg-path" => illustration.call('getting_started'),
    "empty-loading-svg-path" => illustration.call('loading'),
    "empty-no-data-svg-path" => illustration.call('no_data'),
    "empty-unable-to-connect-svg-path" => illustration.call('unable_to_connect'),
    "metrics-endpoint" => additional_metrics_project_environment_path(project, environment, format: :json),
    "deployment-endpoint" => project_environment_deployments_path(project, environment, format: :json),
    # NOTE(review): the `"...":` syntax makes this the only Symbol key in the
    # hash; every other key is a String. Kept as-is to preserve behavior.
    "environments-endpoint": project_environments_path(project, format: :json),
    "project-path" => project_path(project),
    "tags-path" => project_tags_path(project),
    "has-metrics" => has_metrics
  }
end
end end
...@@ -11,6 +11,8 @@ module Clusters ...@@ -11,6 +11,8 @@ module Clusters
include ::Clusters::Concerns::ApplicationStatus include ::Clusters::Concerns::ApplicationStatus
include ::Clusters::Concerns::ApplicationData include ::Clusters::Concerns::ApplicationData
prepend EE::Clusters::Applications::Prometheus
default_value_for :version, VERSION default_value_for :version, VERSION
state_machine :status do state_machine :status do
...@@ -21,6 +23,14 @@ module Clusters ...@@ -21,6 +23,14 @@ module Clusters
end end
end end
# Status names for which the application is treated as ready.
def ready_status
  %i[installed]
end
# True when the current status_name is listed in ready_status.
def ready?
  ready_status.member?(status_name)
end
def chart def chart
'stable/prometheus' 'stable/prometheus'
end end
......
...@@ -4,6 +4,8 @@ module Clusters ...@@ -4,6 +4,8 @@ module Clusters
extend ActiveSupport::Concern extend ActiveSupport::Concern
included do included do
prepend ::EE::Clusters::ApplicationStatus
scope :installed, -> { where(status: self.state_machines[:status].states[:installed].value) } scope :installed, -> { where(status: self.state_machines[:status].states[:installed].value) }
state_machine :status, initial: :not_installable do state_machine :status, initial: :not_installable do
......
...@@ -3,6 +3,7 @@ module PrometheusAdapter ...@@ -3,6 +3,7 @@ module PrometheusAdapter
included do included do
include ReactiveCaching include ReactiveCaching
prepend EE::PrometheusAdapter
self.reactive_cache_key = ->(adapter) { [adapter.class.model_name.singular, adapter.id] } self.reactive_cache_key = ->(adapter) { [adapter.class.model_name.singular, adapter.id] }
self.reactive_cache_lease_timeout = 30.seconds self.reactive_cache_lease_timeout = 30.seconds
...@@ -24,17 +25,10 @@ module PrometheusAdapter ...@@ -24,17 +25,10 @@ module PrometheusAdapter
def query(query_name, *args) def query(query_name, *args)
return unless can_query? return unless can_query?
query_class = Gitlab::Prometheus::Queries.const_get("#{query_name.to_s.classify}Query") query_class = query_klass_for(query_name)
query_args = build_query_args(*args)
args.map! do |arg| with_reactive_cache(query_class.name, *query_args, &query_class.method(:transform_reactive_result))
if arg.respond_to?(:id)
arg.id
else
arg
end
end
with_reactive_cache(query_class.name, *args, &query_class.method(:transform_reactive_result))
end end
# Cache metrics for specific environment # Cache metrics for specific environment
...@@ -50,5 +44,13 @@ module PrometheusAdapter ...@@ -50,5 +44,13 @@ module PrometheusAdapter
rescue Gitlab::PrometheusClient::Error => err rescue Gitlab::PrometheusClient::Error => err
{ success: false, result: err.message } { success: false, result: err.message }
end end
# Resolves a query name to the "<ClassifiedName>Query" constant under
# Gitlab::Prometheus::Queries.
def query_klass_for(query_name)
  class_name = query_name.to_s.classify + "Query"
  Gitlab::Prometheus::Queries.const_get(class_name)
end
# Converts query arguments into cache-friendly values.
#
# Arguments that respond to `id` are replaced by their id; any other
# argument (a plain string or number, for example) is passed through
# unchanged instead of raising NoMethodError.
#
# Returns a new Array; `args` is not mutated.
def build_query_args(*args)
  args.map { |arg| arg.respond_to?(:id) ? arg.id : arg }
end
end end
end end
...@@ -3,6 +3,8 @@ module Clusters ...@@ -3,6 +3,8 @@ module Clusters
class BaseHelmService class BaseHelmService
attr_accessor :app attr_accessor :app
prepend EE::Clusters::Applications::BaseHelmService
def initialize(app) def initialize(app)
@app = app @app = app
end end
......
...@@ -48,17 +48,17 @@ module Clusters ...@@ -48,17 +48,17 @@ module Clusters
end end
def remove_installation_pod def remove_installation_pod
helm_api.delete_installation_pod!(install_command.pod_name) helm_api.delete_pod!(install_command.pod_name)
rescue rescue
# no-op # no-op
end end
def installation_phase def installation_phase
helm_api.installation_status(install_command.pod_name) helm_api.status(install_command.pod_name)
end end
def installation_errors def installation_errors
helm_api.installation_log(install_command.pod_name) helm_api.log(install_command.pod_name)
end end
end end
end end
......
...@@ -28,7 +28,7 @@ module Prometheus ...@@ -28,7 +28,7 @@ module Prometheus
return unless deployment_platform.respond_to?(:cluster) return unless deployment_platform.respond_to?(:cluster)
cluster = deployment_platform.cluster cluster = deployment_platform.cluster
return unless cluster.application_prometheus&.installed? return unless cluster.application_prometheus&.ready?
cluster.application_prometheus cluster.application_prometheus
end end
......
...@@ -23,8 +23,10 @@ ...@@ -23,8 +23,10 @@
- cronjob:prune_web_hook_logs - cronjob:prune_web_hook_logs
- gcp_cluster:cluster_install_app - gcp_cluster:cluster_install_app
- gcp_cluster:cluster_update_app
- gcp_cluster:cluster_provision - gcp_cluster:cluster_provision
- gcp_cluster:cluster_wait_for_app_installation - gcp_cluster:cluster_wait_for_app_installation
- gcp_cluster:cluster_wait_for_app_update
- gcp_cluster:wait_for_cluster_creation - gcp_cluster:wait_for_cluster_creation
- gcp_cluster:cluster_wait_for_ingress_ip_address - gcp_cluster:cluster_wait_for_ingress_ip_address
......
...@@ -80,6 +80,12 @@ constraints(::Constraints::ProjectUrlConstrainer.new) do ...@@ -80,6 +80,12 @@ constraints(::Constraints::ProjectUrlConstrainer.new) do
post :validate_query, on: :collection post :validate_query, on: :collection
get :active_common, on: :collection get :active_common, on: :collection
end end
# EE-specific
resources :alerts, constraints: { id: /\d+/ }, only: [:index, :create, :show, :update, :destroy] do
post :notify, on: :collection
end
# EE-specific
end end
resources :deploy_keys, constraints: { id: /\d+/ }, only: [:index, :new, :create, :edit, :update] do resources :deploy_keys, constraints: { id: /\d+/ }, only: [:index, :new, :create, :edit, :update] do
......
...@@ -778,6 +778,7 @@ ActiveRecord::Schema.define(version: 20180704204006) do ...@@ -778,6 +778,7 @@ ActiveRecord::Schema.define(version: 20180704204006) do
t.text "status_reason" t.text "status_reason"
t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false t.datetime_with_timezone "updated_at", null: false
t.datetime_with_timezone "last_update_started_at"
end end
create_table "clusters_applications_runners", force: :cascade do |t| create_table "clusters_applications_runners", force: :cascade do |t|
...@@ -2179,6 +2180,21 @@ ActiveRecord::Schema.define(version: 20180704204006) do ...@@ -2179,6 +2180,21 @@ ActiveRecord::Schema.define(version: 20180704204006) do
add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree
add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree
create_table "prometheus_alerts", force: :cascade do |t|
t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false
t.float "threshold", null: false
t.integer "operator", null: false
t.integer "environment_id", null: false
t.integer "project_id", null: false
t.integer "prometheus_metric_id", null: false
t.text "name", null: false
t.string "query", null: false
end
add_index "prometheus_alerts", ["environment_id"], name: "index_prometheus_alerts_on_environment_id", using: :btree
add_index "prometheus_alerts", ["prometheus_metric_id"], name: "index_prometheus_alerts_on_prometheus_metric_id", unique: true, using: :btree
create_table "prometheus_metrics", force: :cascade do |t| create_table "prometheus_metrics", force: :cascade do |t|
t.integer "project_id" t.integer "project_id"
t.string "title", null: false t.string "title", null: false
...@@ -2973,6 +2989,9 @@ ActiveRecord::Schema.define(version: 20180704204006) do ...@@ -2973,6 +2989,9 @@ ActiveRecord::Schema.define(version: 20180704204006) do
add_foreign_key "project_mirror_data", "projects", name: "fk_d1aad367d7", on_delete: :cascade add_foreign_key "project_mirror_data", "projects", name: "fk_d1aad367d7", on_delete: :cascade
add_foreign_key "project_repository_states", "projects", on_delete: :cascade add_foreign_key "project_repository_states", "projects", on_delete: :cascade
add_foreign_key "project_statistics", "projects", on_delete: :cascade add_foreign_key "project_statistics", "projects", on_delete: :cascade
add_foreign_key "prometheus_alerts", "environments", on_delete: :cascade
add_foreign_key "prometheus_alerts", "projects", on_delete: :cascade
add_foreign_key "prometheus_alerts", "prometheus_metrics", on_delete: :cascade
add_foreign_key "prometheus_metrics", "projects", on_delete: :cascade add_foreign_key "prometheus_metrics", "projects", on_delete: :cascade
add_foreign_key "protected_branch_merge_access_levels", "namespaces", column: "group_id", name: "fk_98f3d044fe", on_delete: :cascade add_foreign_key "protected_branch_merge_access_levels", "namespaces", column: "group_id", name: "fk_98f3d044fe", on_delete: :cascade
add_foreign_key "protected_branch_merge_access_levels", "protected_branches", name: "fk_8a3072ccb3", on_delete: :cascade add_foreign_key "protected_branch_merge_access_levels", "protected_branches", name: "fk_8a3072ccb3", on_delete: :cascade
......
module Projects
  module Prometheus
    # JSON endpoints for managing a project's Prometheus alerts, plus the
    # `notify` endpoint that receives fired alerts.
    #
    # `notify` is excluded from CSRF protection and from both authorization
    # filters, because it is called by an external service.
    class AlertsController < Projects::ApplicationController
      respond_to :json

      protect_from_forgery except: [:notify]

      before_action :authorize_read_prometheus_alerts!, except: [:notify]
      before_action :authorize_admin_project!, except: [:notify]
      before_action :alert, only: [:update, :show, :destroy]

      # Lists the project's alerts ordered by id.
      def index
        alerts = project.prometheus_alerts.reorder(id: :asc)

        render json: serialize_as_json(alerts)
      end

      def show
        render json: serialize_as_json(alert)
      end

      # Hands the received "alerts" payload to NotificationService and
      # responds with 200 regardless of its content.
      def notify
        NotificationService.new.async.prometheus_alerts_fired(project, params["alerts"])

        head :ok
      end

      def create
        @alert = project.prometheus_alerts.create(alerts_params)

        # `create` returns a record even when validation fails, so success
        # has to be detected with `persisted?` rather than truthiness.
        if @alert.persisted?
          schedule_prometheus_update!

          render json: serialize_as_json(@alert)
        else
          head :no_content
        end
      end

      def update
        if alert.update(alerts_params)
          schedule_prometheus_update!

          render json: serialize_as_json(alert)
        else
          head :no_content
        end
      end

      def destroy
        if alert.destroy
          schedule_prometheus_update!

          head :ok
        else
          head :no_content
        end
      end

      private

      # Permitted alert attributes. A present :operator is translated from
      # its symbol form to the enum name via PrometheusAlert.operator_to_enum.
      def alerts_params
        permitted = params.permit(:query, :operator, :threshold, :name, :environment_id, :prometheus_metric_id)

        if permitted[:operator].present?
          permitted[:operator] = PrometheusAlert.operator_to_enum(permitted[:operator])
        end

        permitted
      end

      def schedule_prometheus_update!
        ::Clusters::Applications::ScheduleUpdateService.new(application, project).execute
      end

      def serialize_as_json(alert_obj)
        serializer.represent(alert_obj)
      end

      def serializer
        PrometheusAlertSerializer.new(project: project, current_user: current_user)
      end

      # Looks the alert up by the metric id given in params[:id] (not by the
      # alert's own id); renders a 404 when there is none.
      def alert
        @alert ||= project.prometheus_alerts.find_by(prometheus_metric: params[:id]) || render_404
      end

      def application
        @application ||= alert.environment.cluster_prometheus_adapter
      end
    end
  end
end
module EE
  # EE additions to the environment metrics dashboard data.
  module EnvironmentsHelper
    # Extends the base metrics data with the alerts endpoint and a
    # stringified flag telling whether the current user can read alerts.
    def metrics_data(project, environment)
      alerts_endpoint = project_prometheus_alerts_path(project, environment_id: environment.id, format: :json)
      alerts_available = can?(current_user, :read_prometheus_alerts, project).to_s

      extra = {
        "alerts-endpoint" => alerts_endpoint,
        "prometheus-alerts-available" => alerts_available
      }

      super.merge(extra)
    end
  end
end
...@@ -17,6 +17,25 @@ module Emails ...@@ -17,6 +17,25 @@ module Emails
mail(to: new_mirror_user.notification_email, mail(to: new_mirror_user.notification_email,
subject: subject('Mirror user changed')) subject: subject('Mirror user changed'))
end end
# Builds the e-mail sent to a user when a Prometheus alert fires.
#
# project_id   - id of the project the alert belongs to
# user_id      - id of the recipient
# alert_params - alert payload; the metric id is read from
#                alert_params["labels"]["gitlab_alert_id"]
#
# Returns without building a mail when the payload carries no alert id, or
# when the project, the alert or the user cannot be found.
def prometheus_alert_fired_email(project_id, user_id, alert_params)
  # The payload is externally supplied, so "labels" may be missing; treat
  # that like an unknown alert instead of raising NoMethodError on nil.
  labels = alert_params && alert_params["labels"]
  alert_metric_id = labels && labels["gitlab_alert_id"]
  return unless alert_metric_id

  @project = Project.find_by(id: project_id)
  return unless @project

  @alert = @project.prometheus_alerts.find_by(prometheus_metric: alert_metric_id)
  return unless @alert

  @environment = @alert.environment

  user = User.find_by(id: user_id)
  return unless user

  subject_text = "Alert: #{@environment.name} - #{@alert.name} #{@alert.computed_operator} #{@alert.threshold} for 5 minutes"

  mail(to: user.notification_email, subject: subject(subject_text))
end
end end
end end
end end
# Adds update-related states, events and callbacks to the :status state
# machine of the class this concern is prepended to.
module EE
module Clusters
module ApplicationStatus
extend ActiveSupport::Concern
prepended do
state_machine :status, initial: :not_installable do
# Stored status values for the additional states.
state :updating, value: 4
state :updated, value: 5
state :update_errored, value: 6
# An update may start from a finished install or from a previous update,
# whether that update succeeded or failed.
event :make_updating do
transition [:installed, :updated, :update_errored] => :updating
end
event :make_updated do
transition [:updating] => :updated
end
# Any state may move to :update_errored.
event :make_update_errored do
transition any => :update_errored
end
# Entering :updating clears the previously recorded status reason.
before_transition any => [:updating] do |app_status, _|
app_status.status_reason = nil
end
# The first argument given to the event, when present, is stored as the
# status reason.
before_transition any => [:update_errored] do |app_status, transition|
status_reason = transition.args.first
app_status.status_reason = status_reason if status_reason
end
end
end
end
end
end
module EE
  # EE additions to the Prometheus adapter: clearing a cached query result
  # and an override of how query arguments are normalized.
  module PrometheusAdapter
    extend ::Gitlab::Utils::Override

    # Clears the reactive cache entry stored for the given query and args.
    def clear_prometheus_reactive_cache!(query_name, *args)
      query_class_name = query_klass_for(query_name).name

      clear_reactive_cache!(query_class_name, *build_query_args(*args))
    end

    private

    # Replaces arguments that respond to `id` with their id and leaves all
    # other arguments untouched.
    override :build_query_args
    def build_query_args(*args)
      args.map do |arg|
        next arg unless arg.respond_to?(:id)

        arg.id
      end
    end
  end
end
module EE
  module Clusters
    module Applications
      # EE additions to the Prometheus cluster application: update tracking
      # and the Helm commands used to read and upgrade its configuration.
      module Prometheus
        extend ActiveSupport::Concern

        prepended do
          state_machine :status do
            # Record when the most recent update started.
            after_transition any => :updating do |application|
              application.update(last_update_started_at: Time.now)
            end
          end
        end

        # The application stays ready while it is updating and after an
        # update finished, whether it succeeded or failed.
        def ready_status
          super + %i[updating updated update_errored]
        end

        # Truthy when an update started after `timestamp` and the
        # application is not in the :update_errored state.
        def updated_since?(timestamp)
          started_at = last_update_started_at

          started_at && started_at > timestamp && !update_errored?
        end

        def update_in_progress?
          :updating == status_name
        end

        def update_errored?
          :update_errored == status_name
        end

        def get_command
          ::Gitlab::Kubernetes::Helm::GetCommand.new(name)
        end

        # values - String passed through as the command's `values`.
        def upgrade_command(values)
          options = { chart: chart, version: version, values: values }

          ::Gitlab::Kubernetes::Helm::UpgradeCommand.new(name, **options)
        end
      end
    end
  end
end
module EE module EE
module Environment module Environment
extend ActiveSupport::Concern
prepended do
has_many :prometheus_alerts, inverse_of: :environment
end
def pod_names def pod_names
return [] unless rollout_status return [] unless rollout_status
...@@ -7,5 +13,13 @@ module EE ...@@ -7,5 +13,13 @@ module EE
instance[:pod_name] instance[:pod_name]
end end
end end
# Clears the cached result of `query_name` for this environment on the
# cluster Prometheus adapter. Returns nil when there is no adapter.
def clear_prometheus_reactive_cache!(query_name)
  adapter = cluster_prometheus_adapter
  return if adapter.nil?

  adapter.clear_prometheus_reactive_cache!(query_name, self)
end
# Memoized adapter obtained from Prometheus::AdapterService for this
# environment's project and deployment platform. A nil/false result is not
# memoized, so the lookup is repeated on the next call.
def cluster_prometheus_adapter
  return @cluster_prometheus_adapter if @cluster_prometheus_adapter

  service = Prometheus::AdapterService.new(project, deployment_platform)
  @cluster_prometheus_adapter = service.cluster_prometheus_adapter
end
end end
end end
...@@ -39,6 +39,8 @@ module EE ...@@ -39,6 +39,8 @@ module EE
has_many :source_pipelines, class_name: 'Ci::Sources::Pipeline', foreign_key: :project_id has_many :source_pipelines, class_name: 'Ci::Sources::Pipeline', foreign_key: :project_id
has_many :prometheus_alerts, inverse_of: :project
scope :with_shared_runners_limit_enabled, -> { with_shared_runners.non_public_only } scope :with_shared_runners_limit_enabled, -> { with_shared_runners.non_public_only }
scope :mirror, -> { where(mirror: true) } scope :mirror, -> { where(mirror: true) }
...@@ -99,6 +101,12 @@ module EE ...@@ -99,6 +101,12 @@ module EE
pipelines.newest_first(default_branch).with_security_reports.first pipelines.newest_first(default_branch).with_security_reports.first
end end
# Returns the project's environments whose name matches the given scope.
#
# The scope is passed through Gitlab::SQL::Glob.q and the result through
# Gitlab::SQL::Glob.to_like before being interpolated into the LIKE clause.
# NOTE(review): presumably these helpers quote/escape the value, which is why
# the SqlInjection cop is disabled here - confirm against Gitlab::SQL::Glob.
def environments_for_scope(scope)
quoted_scope = ::Gitlab::SQL::Glob.q(scope)
environments.where("name LIKE (#{::Gitlab::SQL::Glob.to_like(quoted_scope)})") # rubocop:disable GitlabSecurity/SqlInjection
end
def ensure_external_webhook_token def ensure_external_webhook_token
return if external_webhook_token.present? return if external_webhook_token.present?
......
...@@ -74,6 +74,7 @@ class License < ActiveRecord::Base ...@@ -74,6 +74,7 @@ class License < ActiveRecord::Base
chatops chatops
pod_logs pod_logs
pseudonymizer pseudonymizer
prometheus_alerts
].freeze ].freeze
# List all features available for early adopters, # List all features available for early adopters,
......
# An alert rule attached to a custom Prometheus metric of an environment.
class PrometheusAlert < ActiveRecord::Base
  include AtomicInternalId

  # Operator symbols as exchanged with clients and shown in e-mails, keyed
  # by the `operator` enum name.
  OPERATORS_MAP = {
    lt: "<",
    eq: "=",
    gt: ">"
  }.freeze

  # PromQL spelling of each operator. PromQL compares with "==", a single
  # "=" is not a comparison operator, so an `eq` alert rendered with
  # OPERATORS_MAP would produce an expression Prometheus cannot parse.
  PROMQL_OPERATORS = OPERATORS_MAP.merge(eq: "==").freeze

  belongs_to :environment, required: true, validate: true, inverse_of: :prometheus_alerts
  belongs_to :project, required: true, validate: true, inverse_of: :prometheus_alerts
  belongs_to :prometheus_metric, required: true, validate: true

  validates :name, presence: true

  after_save :clear_prometheus_adapter_cache!
  after_destroy :clear_prometheus_adapter_cache!

  enum operator: [:lt, :eq, :gt]

  # Translates an operator symbol ("<", "=", ">") to its enum name.
  #
  # Raises KeyError for any other value.
  def self.operator_to_enum(op)
    OPERATORS_MAP.invert.fetch(op)
  end

  # The alert expression in PromQL: "<query> <operator> <threshold>".
  def full_query
    "#{query} #{promql_operator} #{threshold}"
  end

  # The operator symbol as exposed to clients ("<", "=" or ">").
  def computed_operator
    OPERATORS_MAP.fetch(operator.to_sym)
  end

  # Hash describing this alert as a rule entry ("alert", "expr", "for",
  # "labels"). Note that this overrides ActiveRecord's `to_param`.
  def to_param
    {
      "alert" => name,
      "expr" => full_query,
      "for" => "5m",
      "labels" => {
        "gitlab" => "hook",
        "gitlab_alert_id" => prometheus_metric_id
      }
    }
  end

  private

  def promql_operator
    PROMQL_OPERATORS.fetch(operator.to_sym)
  end

  def clear_prometheus_adapter_cache!
    environment.clear_prometheus_reactive_cache!(:additional_metrics_environment)
  end
end
class PrometheusMetric < ActiveRecord::Base class PrometheusMetric < ActiveRecord::Base
belongs_to :project, required: true, validate: true, inverse_of: :prometheus_metrics belongs_to :project, required: true, validate: true, inverse_of: :prometheus_metrics
has_one :prometheus_alert
enum group: [:business, :response, :system] enum group: [:business, :response, :system]
validates :title, presence: true validates :title, presence: true
...@@ -19,7 +22,7 @@ class PrometheusMetric < ActiveRecord::Base ...@@ -19,7 +22,7 @@ class PrometheusMetric < ActiveRecord::Base
end end
def to_query_metric def to_query_metric
Gitlab::Prometheus::Metric.new(title: title, required_metrics: [], weight: 0, y_label: y_label, queries: build_queries) Gitlab::Prometheus::Metric.new(id: id, title: title, required_metrics: [], weight: 0, y_label: y_label, queries: build_queries)
end end
private private
......
...@@ -49,6 +49,11 @@ module EE ...@@ -49,6 +49,11 @@ module EE
@subject.feature_available?(:pod_logs, @user) @subject.feature_available?(:pod_logs, @user)
end end
with_scope :subject
condition(:prometheus_alerts_enabled) do
@subject.feature_available?(:prometheus_alerts, @user)
end
rule { admin }.enable :change_repository_storage rule { admin }.enable :change_repository_storage
rule { support_bot }.enable :guest_access rule { support_bot }.enable :guest_access
...@@ -97,6 +102,7 @@ module EE ...@@ -97,6 +102,7 @@ module EE
end end
rule { pod_logs_enabled & can?(:maintainer_access) }.enable :read_pod_logs rule { pod_logs_enabled & can?(:maintainer_access) }.enable :read_pod_logs
rule { prometheus_alerts_enabled & can?(:maintainer_access) }.enable :read_prometheus_alerts
rule { auditor }.policy do rule { auditor }.policy do
enable :public_user_access enable :public_user_access
......
# JSON representation of a PrometheusAlert.
class PrometheusAlertEntity < Grape::Entity
  include RequestAwareEntity

  expose :id, :name, :query, :threshold

  # The operator is exposed in its symbol form rather than as the enum name.
  expose(:operator) { |alert| alert.computed_operator }

  # Alerts are addressed by their metric id, scoped to their environment.
  expose(:alert_path) do |alert|
    project_prometheus_alert_path(alert.project, alert.prometheus_metric_id, environment_id: alert.environment.id, format: :json)
  end

  private

  alias_method :prometheus_alert, :object

  def can_read_prometheus_alerts?
    can?(request.current_user, :read_prometheus_alerts, prometheus_alert.project)
  end
end
# Serializer configured to represent objects with PrometheusAlertEntity.
class PrometheusAlertSerializer < BaseSerializer
entity PrometheusAlertEntity
end
module Clusters
  module Applications
    # Checks the phase of the upgrade pod of an application in the
    # :updating state and moves the application to :updated or
    # :update_errored, or schedules another check.
    class CheckUpgradeProgressService < BaseHelmService
      def execute
        return unless app.updating?

        case phase
        when ::Gitlab::Kubernetes::Pod::SUCCEEDED then on_success
        when ::Gitlab::Kubernetes::Pod::FAILED then on_failed
        else check_timeout
        end
      rescue ::Kubeclient::HttpError => error
        return if app.update_errored?

        app.make_update_errored!("Kubernetes error: #{error.message}")
      end

      private

      def on_success
        app.make_updated!
      ensure
        remove_pod
      end

      def on_failed
        reason = errors || 'Update silently failed'
        app.make_update_errored!(reason)
      ensure
        remove_pod
      end

      # Re-schedules the check while the timeout has not elapsed; otherwise
      # marks the update as errored and removes the pod.
      def check_timeout
        unless timeouted?
          return ::ClusterWaitForAppUpdateWorker.perform_in(
            ::ClusterWaitForAppUpdateWorker::INTERVAL, app.name, app.id)
        end

        begin
          app.make_update_errored!('Update timed out')
        ensure
          remove_pod
        end
      end

      # Measured from the application's updated_at.
      def timeouted?
        elapsed = Time.now.utc - app.updated_at.to_time.utc

        elapsed > ::ClusterWaitForAppUpdateWorker::TIMEOUT
      end

      # Best effort: any error raised while deleting the pod is ignored.
      def remove_pod
        helm_api.delete_pod!(upgrade_command.pod_name)
      rescue
        # no-op
      end

      def phase
        helm_api.status(upgrade_command.pod_name)
      end

      def errors
        helm_api.log(upgrade_command.pod_name)
      end
    end
  end
end
module Clusters
  module Applications
    # Rewrites the Prometheus application's configuration so that its alert
    # manager settings match the project's alerts, then starts an upgrade and
    # schedules the worker that waits for it.
    class PrometheusUpdateService < BaseHelmService
      attr_accessor :project

      def initialize(app, project)
        super(app)
        @project = project
      end

      # Any error is recorded on the application by moving it to
      # :update_errored with the error message as reason.
      def execute
        app.make_updating!

        current_config = extract_config(helm_api.get_config_map(app.get_command))
        new_values = has_alerts? ? generate_alert_manager(current_config) : reset_alert_manager(current_config)

        helm_api.update(upgrade_command(new_values.to_yaml))

        ::ClusterWaitForAppUpdateWorker.perform_in(::ClusterWaitForAppUpdateWorker::INTERVAL, app.name, app.id)
      rescue ::Kubeclient::HttpError => ke
        app.make_update_errored!("Kubernetes error: #{ke.message}")
      rescue StandardError => e
        app.make_update_errored!(e.message)
      end

      private

      # Disables the alert manager and drops its files and all alert rules.
      def reset_alert_manager(config)
        set_alert_manager_enabled(config, false).tap do |updated|
          updated.delete("alertmanagerFiles")
          updated["serverFiles"]["alerts"] = {}
        end
      end

      # Enables the alert manager, writes its files and the rule groups.
      def generate_alert_manager(config)
        enabled = set_alert_manager_enabled(config, true)

        set_alert_manager_groups(set_alert_manager_files(enabled))
      end

      def set_alert_manager_enabled(config, enabled)
        config["alertmanager"]["enabled"] = enabled

        config
      end

      def set_alert_manager_files(config)
        alertmanager_yml = {
          "receivers" => alert_manager_receivers_params,
          "route" => alert_manager_route_params
        }
        config["alertmanagerFiles"] = { "alertmanager.yml" => alertmanager_yml }

        config
      end

      # Replaces the rules of a group that already exists under the same
      # name, or appends a new group.
      def set_alert_manager_groups(config)
        groups = (config["serverFiles"]["alerts"]["groups"] ||= [])

        environments_with_alerts.each do |group_name, rules|
          existing = groups.find { |group| group["name"] == group_name }

          if existing
            existing["rules"] = rules
          else
            groups << { "name" => group_name, "rules" => rules }
          end
        end

        config
      end

      def alert_manager_receivers_params
        webhook = { "url" => notify_url, "send_resolved" => false }

        [{ "name" => "gitlab", "webhook_configs" => [webhook] }]
      end

      def alert_manager_route_params
        {
          "receiver" => "gitlab",
          "group_wait" => "30s",
          "group_interval" => "5m",
          "repeat_interval" => "4h"
        }
      end

      def notify_url
        ::Gitlab::Routing.url_helpers.notify_namespace_project_prometheus_alerts_url(
          namespace_id: project.namespace.path,
          project_id: project.path,
          format: :json
        )
      end

      def extract_config(response)
        YAML.safe_load(response.data.values)
      end

      def has_alerts?
        environments_with_alerts.values.any?(&:any?)
      end

      # Hash of rule group name => list of alert rule hashes, memoized.
      def environments_with_alerts
        @environments_with_alerts ||=
          environments.map do |environment|
            [rule_name(environment), environment.prometheus_alerts.map(&:to_param)]
          end.to_h
      end

      def rule_name(environment)
        "#{environment.name}.rules"
      end

      def environments
        project.environments_for_scope(cluster.environment_scope)
      end
    end
  end
end
module Clusters
  module Applications
    # Enqueues ClusterUpdateAppWorker for an application, delaying the job by
    # BACKOFF_DELAY when an update was started within that window.
    class ScheduleUpdateService
      BACKOFF_DELAY = 2.minutes

      attr_accessor :application, :project

      def initialize(application, project)
        @application = application
        @project = project
      end

      # Does nothing when there is no application.
      def execute
        return unless application

        if recently_scheduled?
          enqueue(:perform_in, BACKOFF_DELAY)
        else
          enqueue(:perform_async)
        end
      end

      private

      # Calls the worker's `scheduler` method with any leading arguments
      # followed by the job arguments, the last being the current time.
      def enqueue(scheduler, *leading_args)
        worker_class.public_send(scheduler, *leading_args, application.name, application.id, project.id, Time.now)
      end

      def worker_class
        ::ClusterUpdateAppWorker
      end

      def recently_scheduled?
        last_started_at = application.last_update_started_at
        return false unless last_started_at

        last_started_at >= Time.now - BACKOFF_DELAY
      end
    end
  end
end
module EE
  module Clusters
    module Applications
      module BaseHelmService
        protected

        # Memoized upgrade command of the application. Only the first call
        # builds the command; `new_values` passed on later calls is ignored.
        def upgrade_command(new_values = "")
          return @upgrade_command if @upgrade_command

          @upgrade_command = app.upgrade_command(new_values)
        end
      end
    end
  end
end
...@@ -42,6 +42,18 @@ module EE ...@@ -42,6 +42,18 @@ module EE
mailer.project_mirror_user_changed_email(new_mirror_user.id, deleted_user_name, project.id).deliver_later mailer.project_mirror_user_changed_email(new_mirror_user.id, deleted_user_name, project.id).deliver_later
end end
# Queues one alert e-mail per (recipient, alert) pair.
#
# project - project the alerts belong to
# alerts  - collection of alert payloads; nil or empty means nothing to send
#
# Recipients are the project's active owners and masters; when there are
# none and the project has a group, the group's owners and masters are used.
def prometheus_alerts_fired(project, alerts)
  # The payload is forwarded from an incoming request and may be missing;
  # `product(nil)` would raise TypeError.
  return if alerts.nil? || alerts.empty?

  recipients = project.members.active_without_invites_and_requests.owners_and_masters

  if recipients.empty? && project.group
    recipients = project.group.members.active_without_invites_and_requests.owners_and_masters
  end

  recipients.product(alerts).each do |recipient, alert|
    mailer.prometheus_alert_fired_email(project.id, recipient.user.id, alert).deliver_later
  end
end
private private
def add_mr_approvers_email(merge_request, approvers, current_user) def add_mr_approvers_email(merge_request, approvers, current_user)
......
%p
An alert has been triggered in #{@project.full_path}.
%p
Environment: #{@environment.name}
%p
Metric:
%pre
= @alert.full_query
%p
= link_to("View #{@environment.name} performance dashboard.", metrics_project_environment_url(@environment.project, @environment))
An alert has been triggered in <%= @project.full_path %>.
Environment: <%= @environment.name %>
Metric: <%= @alert.full_query %>
You can view the <%= @environment.name %> performance dashboard at <%= metrics_project_environment_url(@environment.project, @environment) %>.
# Runs the Prometheus update service for a cluster application unless an
# update already started after the job was scheduled.
class ClusterUpdateAppWorker
  UpdateAlreadyInProgressError = Class.new(StandardError)

  include ApplicationWorker
  include ClusterQueue
  include ClusterApplications

  sidekiq_options retry: 3, dead: false

  # app_name       - name used to look up the application
  # app_id         - id of the application
  # project_id     - id of the project whose alerts are synchronised
  # scheduled_time - time the job was scheduled; a Time, or the String a Time
  #                  turns into when job arguments are serialized
  #
  # Raises UpdateAlreadyInProgressError while another update is running.
  def perform(app_name, app_id, project_id, scheduled_time)
    project = Project.find_by(id: project_id)
    return unless project

    scheduled_at = parse_scheduled_time(scheduled_time)

    find_application(app_name, app_id) do |app|
      break if app.updated_since?(scheduled_at)
      raise UpdateAlreadyInProgressError if app.update_in_progress?

      Clusters::Applications::PrometheusUpdateService.new(app, project).execute
    end
  end

  private

  # Job arguments are serialized, so a Time passed when enqueueing arrives
  # here as a String. Comparing that String with a timestamp would raise
  # ArgumentError, so it is parsed back into a time first.
  def parse_scheduled_time(value)
    value.is_a?(String) ? Time.zone.parse(value) : value
  end
end
# Checks the progress of an application upgrade.
class ClusterWaitForAppUpdateWorker
  include ApplicationWorker
  include ClusterQueue
  include ClusterApplications

  # Delay between two progress checks.
  INTERVAL = 10.seconds
  # Time after which a running update is considered timed out.
  TIMEOUT = 20.minutes

  def perform(app_name, app_id)
    find_application(app_name, app_id) do |application|
      progress_check = ::Clusters::Applications::CheckUpgradeProgressService.new(application)
      progress_check.execute
    end
  end
end
# Creates the prometheus_alerts table.
class CreatePrometheusAlerts < ActiveRecord::Migration
DOWNTIME = false
def up
create_table :prometheus_alerts do |t|
t.datetime_with_timezone :created_at, null: false
t.datetime_with_timezone :updated_at, null: false
t.float :threshold, null: false
t.integer :operator, null: false
# All three references cascade deletes from the referenced table.
t.references :environment, index: true, null: false, foreign_key: { on_delete: :cascade }
# NOTE(review): unlike the other two references, no index option is given
# for project - confirm whether project_id should be indexed.
t.references :project, null: false, foreign_key: { on_delete: :cascade }
# The unique index allows at most one alert per metric.
t.references :prometheus_metric, null: false, index: { unique: true }, foreign_key: { on_delete: :cascade }
t.text :name, null: false
t.string :query, null: false
end
end
# Removes the project foreign key, then drops the table.
def down
remove_foreign_key :prometheus_alerts, column: :project_id
drop_table :prometheus_alerts
end
end
# Adds the nullable last_update_started_at timestamp column to
# clusters_applications_prometheus.
class AddLastUpdateStartedAtToApplicationsPrometheus < ActiveRecord::Migration
DOWNTIME = false
def change
add_column :clusters_applications_prometheus, :last_update_started_at, :datetime_with_timezone
end
end
module EE
  module Gitlab
    module Kubernetes
      module Helm
        # EE additions to the Helm API: reading a command's ConfigMap and
        # running an update command.
        module Api
          # Returns the ConfigMap named by the command, or nil when the
          # command has no ConfigMap. The namespace is ensured first.
          def get_config_map(command)
            namespace.ensure_exists!
            return unless command.config_map?

            kubeclient.get_config_map(command.config_map_name, namespace.name)
          end

          # Ensures the namespace, updates the command's ConfigMap when it
          # has one, then creates the command's pod.
          def update(command)
            namespace.ensure_exists!
            update_config_map(command) if command.config_map?

            kubeclient.create_pod(command.pod_resource)
          end

          private

          # Returns the ConfigMap resource that was sent to the cluster.
          def update_config_map(command)
            resource = command.config_map_resource
            kubeclient.update_config_map(resource)
            resource
          end
        end
      end
    end
  end
end
module EE
  module Gitlab
    module Prometheus
      module Queries
        # Decorates the metric groups returned by the base query with the
        # path of the alert attached to each metric, when there is one.
        module QueryAdditionalMetrics
          def query_metrics(project, environment, query_context)
            super.map(&query_with_alert(project, environment))
          end

          protected

          # Returns a proc that takes a metric group and adds :alert_path to
          # every query of each metric that has an alert in the project.
          # The group is modified in place and returned.
          def query_with_alert(project, environment)
            metric_ids_with_alert =
              project.prometheus_alerts.each_with_object({}) do |alert, ids|
                metric_id = alert.prometheus_metric_id
                ids[metric_id] = metric_id
              end

            proc do |group|
              (group[:metrics] || []).each do |metric|
                metric_id = metric[:id]
                next unless metric_id && metric_ids_with_alert[metric_id]

                (metric[:queries] || []).each do |query|
                  query[:alert_path] = alert_path(metric_ids_with_alert, metric_id, project, environment)
                end
              end

              group
            end
          end

          private

          def alert_path(alerts_map, key, project, environment)
            ::Gitlab::Routing.url_helpers.project_prometheus_alert_path(project, alerts_map[key], environment_id: environment.id, format: :json)
          end
        end
      end
    end
  end
end
require_dependency 'lib/gitlab/kubernetes/helm.rb'
module Gitlab
  module Kubernetes
    module Helm
      # Command describing which ConfigMap to read for a named release.
      class GetCommand < BaseCommand
        def config_map?
          true
        end

        # Name of the ConfigMap, as computed by Gitlab::Kubernetes::ConfigMap
        # for this command's name.
        def config_map_name
          config_map = ::Gitlab::Kubernetes::ConfigMap.new(name)

          config_map.config_map_name
        end
      end
    end
  end
end
require_dependency 'lib/gitlab/kubernetes/helm.rb'
module Gitlab
  module Kubernetes
    module Helm
      # Command that runs `helm upgrade` for a named release using the values
      # stored in its ConfigMap.
      class UpgradeCommand < BaseCommand
        attr_reader :chart, :version, :repository, :values

        # name       - release name
        # chart      - chart to upgrade to
        # values     - String used as the ConfigMap values
        # version    - optional chart version
        # repository - optional repository added before upgrading
        def initialize(name, chart:, values:, version: nil, repository: nil)
          super(name)

          @chart = chart
          @version = version
          @values = values
          @repository = repository
        end

        # Appends the init, optional repository and upgrade commands, joined
        # by newlines, to the script produced by the parent class.
        def generate_script
          base_script = super
          upgrade_steps = [init_command, repository_command, script_command].compact

          base_script + upgrade_steps.join("\n")
        end

        def config_map?
          true
        end

        def config_map_resource
          config_map = ::Gitlab::Kubernetes::ConfigMap.new(name, values)

          config_map.generate
        end

        def pod_name
          "upgrade-#{name}"
        end

        private

        def init_command
          'helm init --client-only >/dev/null'
        end

        # nil when no repository was given.
        def repository_command
          return unless repository

          "helm repo add #{name} #{repository}"
        end

        def script_command
          <<~HEREDOC
            helm upgrade #{name}#{optional_version_flag} #{chart} --reset-values --install --namespace #{::Gitlab::Kubernetes::Helm::NAMESPACE} -f /data/helm/#{name}/config/values.yaml >/dev/null
          HEREDOC
        end

        # nil when no version was given.
        def optional_version_flag
          return unless version

          " --version #{version}"
        end
      end
    end
  end
end
require 'spec_helper'
# Controller spec for the Prometheus alerts endpoints. Every example runs as a
# project master with the :prometheus_alerts licensed feature enabled, unless
# the example disables the feature itself.
describe Projects::Prometheus::AlertsController do
  let(:user) { create(:user) }
  let(:project) { create(:project) }
  let(:environment) { create(:environment, project: project) }
  let(:metric) { create(:prometheus_metric, project: project) }

  before do
    stub_licensed_features(prometheus_alerts: true)
    project.add_master(user)
    sign_in(user)
  end

  describe 'GET #index' do
    context 'when project has no prometheus alert' do
      it 'returns an empty response' do
        get :index, project_params

        expect(response).to have_gitlab_http_status(200)
        expect(JSON.parse(response.body)).to be_empty
      end
    end

    context 'when project has prometheus alerts' do
      before do
        create_list(:prometheus_alert, 3, project: project, environment: environment)
      end

      # The unlicensed responses asserted in this file are 404s, so the
      # examples are named after that status.
      it 'renders not found when unlicensed' do
        stub_licensed_features(prometheus_alerts: false)

        get :index, project_params

        expect(response).to have_gitlab_http_status(:not_found)
      end

      it 'contains prometheus alerts' do
        get :index, project_params

        expect(response).to have_gitlab_http_status(200)
        expect(JSON.parse(response.body).count).to eq(3)
      end
    end
  end

  describe 'GET #show' do
    context 'when alert does not exist' do
      it 'renders 404' do
        # One past the highest known id, so the lookup cannot hit a real alert.
        missing_id = PrometheusAlert.all.maximum(:prometheus_metric_id).to_i + 1

        get :show, project_params(id: missing_id)

        expect(response).to have_gitlab_http_status(404)
      end
    end

    context 'when alert exists' do
      let(:alert) { create(:prometheus_alert, project: project, environment: environment, prometheus_metric: metric) }

      it 'renders not found when unlicensed' do
        stub_licensed_features(prometheus_alerts: false)

        get :show, project_params(id: alert.prometheus_metric_id)

        expect(response).to have_gitlab_http_status(:not_found)
      end

      it 'renders the alert' do
        alert_params = {
          "id" => alert.id,
          "name" => alert.name,
          "query" => alert.query,
          "operator" => alert.computed_operator,
          "threshold" => alert.threshold,
          "alert_path" => Gitlab::Routing.url_helpers.project_prometheus_alert_path(project, alert.prometheus_metric_id, environment_id: alert.environment.id, format: :json)
        }

        get :show, project_params(id: alert.prometheus_metric_id)

        expect(response).to have_gitlab_http_status(200)
        expect(JSON.parse(response.body)).to include(alert_params)
      end
    end
  end

  describe 'POST #notify' do
    it 'sends a notification' do
      alert = create(:prometheus_alert, project: project, environment: environment, prometheus_metric: metric)
      notification_service = spy
      alert_params = {
        "alert" => alert.name,
        "expr" => "#{alert.query} #{alert.computed_operator} #{alert.threshold}",
        "for" => "5m",
        "labels" => {
          "gitlab" => "hook",
          "gitlab_alert_id" => alert.prometheus_metric_id
        }
      }

      allow(NotificationService).to receive(:new).and_return(notification_service)

      post :notify, project_params(alerts: [alert])

      expect(notification_service).to have_received(:prometheus_alerts_fired).with(project, [alert_params])
      expect(response).to have_gitlab_http_status(200)
    end
  end

  describe 'POST #create' do
    it 'renders not found when unlicensed' do
      stub_licensed_features(prometheus_alerts: false)

      post :create, project_params(
        query: "foo",
        operator: ">",
        threshold: "1",
        name: "bar",
        environment_id: environment.id,
        prometheus_metric_id: metric.id
      )

      expect(response).to have_gitlab_http_status(:not_found)
    end

    it 'creates a new prometheus alert' do
      schedule_update_service = spy
      alert_params = {
        "name" => "bar",
        "query" => "foo",
        "operator" => ">",
        "threshold" => 1.0
      }

      allow(::Clusters::Applications::ScheduleUpdateService).to receive(:new).and_return(schedule_update_service)

      post :create, project_params(
        query: "foo",
        operator: ">",
        threshold: "1",
        name: "bar",
        environment_id: environment.id,
        prometheus_metric_id: metric.id
      )

      expect(schedule_update_service).to have_received(:execute)
      expect(response).to have_gitlab_http_status(200)
      expect(JSON.parse(response.body)).to include(alert_params)
    end
  end

  # The examples below issue PUT requests, so the group is named accordingly.
  describe 'PUT #update' do
    let(:schedule_update_service) { spy }
    let(:alert) { create(:prometheus_alert, project: project, environment: environment, prometheus_metric: metric) }

    before do
      allow(::Clusters::Applications::ScheduleUpdateService).to receive(:new).and_return(schedule_update_service)
    end

    it 'renders not found when unlicensed' do
      stub_licensed_features(prometheus_alerts: false)

      put :update, project_params(id: alert.prometheus_metric_id, name: "bar")

      expect(response).to have_gitlab_http_status(:not_found)
    end

    it 'updates an already existing prometheus alert' do
      alert_params = {
        "id" => alert.id,
        "name" => "bar",
        "query" => alert.query,
        "operator" => alert.computed_operator,
        "threshold" => alert.threshold,
        "alert_path" => Gitlab::Routing.url_helpers.project_prometheus_alert_path(project, alert.prometheus_metric_id, environment_id: alert.environment.id, format: :json)
      }

      expect do
        put :update, project_params(id: alert.prometheus_metric_id, name: "bar")
      end.to change { alert.reload.name }.to("bar")

      expect(schedule_update_service).to have_received(:execute)
      expect(response).to have_gitlab_http_status(200)
      expect(JSON.parse(response.body)).to include(alert_params)
    end
  end

  describe 'DELETE #destroy' do
    let(:schedule_update_service) { spy }
    # Created eagerly so the record count starts at 1 in the destroy example.
    let!(:alert) { create(:prometheus_alert, project: project, prometheus_metric: metric) }

    before do
      allow(::Clusters::Applications::ScheduleUpdateService).to receive(:new).and_return(schedule_update_service)
    end

    it 'renders not found when unlicensed' do
      stub_licensed_features(prometheus_alerts: false)

      delete :destroy, project_params(id: alert.prometheus_metric_id)

      expect(response).to have_gitlab_http_status(:not_found)
    end

    it 'destroys the specified prometheus alert' do
      expect do
        delete :destroy, project_params(id: alert.prometheus_metric_id)
      end.to change { PrometheusAlert.count }.from(1).to(0)

      expect(schedule_update_service).to have_received(:execute)
    end
  end

  # Request params with the namespace and project filled in; explicit +opts+
  # take precedence over those defaults.
  def project_params(opts = {})
    opts.reverse_merge(namespace_id: project.namespace, project_id: project)
  end
end
# Factory for PrometheusAlert: associated project, environment and metric,
# a generated name and the fixed expression parts `foo` / :gt / 1.
FactoryBot.define do
  factory :prometheus_alert do
    project
    environment
    prometheus_metric

    name { generate(:title) }
    query { "foo" }
    operator { :gt }
    threshold { 1 }
  end
end
require 'spec_helper'
# Covers the ConfigMap lookup and update calls of the Helm API wrapper, with
# the kube client and the namespace both replaced by spies.
describe Gitlab::Kubernetes::Helm::Api do
  let(:kubeclient) { spy }
  let(:namespace) { spy }
  let(:application) { build(:clusters_applications_prometheus) }

  subject { described_class.new(kubeclient) }

  before do
    # Hand the API our namespace spy instead of a real namespace object.
    allow(Gitlab::Kubernetes::Namespace)
      .to receive(:new)
      .with(Gitlab::Kubernetes::Helm::NAMESPACE, kubeclient)
      .and_return(namespace)
  end

  describe '#get_config_map' do
    let(:command) { Gitlab::Kubernetes::Helm::GetCommand.new(application.name) }

    it 'ensures the namespace exists before retrieving the config map' do
      expect(namespace).to receive(:ensure_exists!).once

      subject.get_config_map(command)
    end

    it 'gets the config map on kubeclient' do
      expect(kubeclient).to receive(:get_config_map)
        .with(command.config_map_name, namespace.name)
        .once

      subject.get_config_map(command)
    end
  end

  describe '#update' do
    let(:command) do
      Gitlab::Kubernetes::Helm::UpgradeCommand.new(
        application.name,
        chart: application.chart,
        values: application.values
      )
    end

    it 'ensures the namespace exists before creating the pod' do
      expect(namespace).to receive(:ensure_exists!).once.ordered
      expect(kubeclient).to receive(:create_pod).once.ordered

      subject.update(command)
    end

    it 'updates the config map on kubeclient when one exists' do
      config_map = Gitlab::Kubernetes::ConfigMap.new(application.name, application.values)
      expected_resource = config_map.generate

      expect(kubeclient).to receive(:update_config_map).with(expected_resource).once

      subject.update(command)
    end
  end
end
require 'rails_helper'
# GetCommand only exposes ConfigMap information; both readers are checked here.
describe Gitlab::Kubernetes::Helm::GetCommand do
  let(:application) { build(:clusters_applications_prometheus) }

  subject(:get_command) { described_class.new(application.name) }

  describe '#config_map?' do
    it 'returns true' do
      expect(get_command.config_map?).to be true
    end
  end

  describe '#config_map_name' do
    it 'returns the ConfigMap name' do
      expected_name = "values-content-configuration-#{application.name}"

      expect(get_command.config_map_name).to eq(expected_name)
    end
  end
end
require 'rails_helper'
# Checks the script generated by UpgradeCommand (with and without a chart
# repository) plus its ConfigMap and pod-name helpers.
describe Gitlab::Kubernetes::Helm::UpgradeCommand do
  let(:application) { build(:clusters_applications_prometheus) }
  let(:namespace) { ::Gitlab::Kubernetes::Helm::NAMESPACE }

  subject do
    described_class.new(
      application.name,
      chart: application.chart,
      values: application.values
    )
  end

  it_behaves_like 'helm commands' do
    let(:commands) do
      <<~EOS
        helm init --client-only >/dev/null
        helm upgrade #{application.name} #{application.chart} --reset-values --install --namespace #{namespace} -f /data/helm/#{application.name}/config/values.yaml >/dev/null
      EOS
    end
  end

  context 'with an application with a repository' do
    let(:ci_runner) { create(:ci_runner) }
    let(:application) { build(:clusters_applications_runner, runner: ci_runner) }

    subject do
      described_class.new(
        application.name,
        chart: application.chart,
        values: application.values,
        repository: application.repository
      )
    end

    # A repository adds the `helm repo add` line between init and upgrade.
    it_behaves_like 'helm commands' do
      let(:commands) do
        <<~EOS
          helm init --client-only >/dev/null
          helm repo add #{application.name} #{application.repository}
          helm upgrade #{application.name} #{application.chart} --reset-values --install --namespace #{namespace} -f /data/helm/#{application.name}/config/values.yaml >/dev/null
        EOS
      end
    end
  end

  describe '#config_map?' do
    it 'returns true' do
      expect(subject.config_map?).to be_truthy
    end
  end

  describe '#config_map_resource' do
    it 'returns a KubeClient resource with config map content for the application' do
      config_map_name = "values-content-configuration-#{application.name}"
      expected_metadata = {
        name: config_map_name,
        namespace: namespace,
        labels: { name: config_map_name }
      }
      expected_resource = ::Kubeclient::Resource.new(
        metadata: expected_metadata,
        data: { values: application.values }
      )

      expect(subject.config_map_resource).to eq(expected_resource)
    end
  end

  describe '#pod_name' do
    it 'returns the pod name' do
      expect(subject.pod_name).to eq("upgrade-#{application.name}")
    end
  end
end
require 'rails_helper'
# Spec for the update-related additions to the Prometheus cluster application:
# update state transitions, readiness, update bookkeeping and the helm
# commands it builds.
describe Clusters::Applications::Prometheus do
  describe 'transition to updating' do
    let(:project) { create(:project) }
    let(:cluster) { create(:cluster, projects: [project]) }

    subject { create(:clusters_applications_prometheus, :installed, cluster: cluster) }

    it 'sets last_update_started_at to now' do
      Timecop.freeze do
        expect { subject.make_updating }.to change { subject.reload.last_update_started_at }.to be_within(1.second).of(Time.now)
      end
    end
  end

  # `be_ready` calls the `ready?` predicate, hence the group name.
  describe '#ready?' do
    let(:project) { create(:project) }
    let(:cluster) { create(:cluster, projects: [project]) }

    it 'returns true when updating' do
      application = build(:clusters_applications_prometheus, :updating, cluster: cluster)

      expect(application).to be_ready
    end

    it 'returns true when updated' do
      application = build(:clusters_applications_prometheus, :updated, cluster: cluster)

      expect(application).to be_ready
    end

    it 'returns true when errored' do
      application = build(:clusters_applications_prometheus, :update_errored, cluster: cluster)

      expect(application).to be_ready
    end
  end

  describe '#updated_since?' do
    let(:cluster) { create(:cluster) }
    let(:prometheus_app) { build(:clusters_applications_prometheus, cluster: cluster) }
    let(:timestamp) { Time.now - 5.minutes }

    # Time is frozen so `Time.now` is identical everywhere in an example.
    around do |example|
      Timecop.freeze { example.run }
    end

    before do
      prometheus_app.last_update_started_at = Time.now
    end

    context 'when app does not have status failed' do
      it 'returns true when last update started after the timestamp' do
        expect(prometheus_app.updated_since?(timestamp)).to be true
      end

      it 'returns false when last update started before the timestamp' do
        expect(prometheus_app.updated_since?(Time.now + 5.minutes)).to be false
      end
    end

    context 'when app has status failed' do
      it 'returns false when last update started after the timestamp' do
        # 6 is the raw status that the :update_errored factory trait assigns.
        prometheus_app.status = 6

        expect(prometheus_app.updated_since?(timestamp)).to be false
      end
    end
  end

  describe '#update_in_progress?' do
    context 'when app is updating' do
      it 'returns true' do
        cluster = create(:cluster)
        prometheus_app = build(:clusters_applications_prometheus, :updating, cluster: cluster)

        expect(prometheus_app.update_in_progress?).to be true
      end
    end
  end

  describe '#update_errored?' do
    context 'when app errored' do
      it 'returns true' do
        cluster = create(:cluster)
        prometheus_app = build(:clusters_applications_prometheus, :update_errored, cluster: cluster)

        expect(prometheus_app.update_errored?).to be true
      end
    end
  end

  describe '#get_command' do
    let(:prometheus) { build(:clusters_applications_prometheus) }

    it 'returns an instance of Gitlab::Kubernetes::Helm::GetCommand' do
      expect(prometheus.get_command).to be_an_instance_of(::Gitlab::Kubernetes::Helm::GetCommand)
    end

    it 'should be initialized with 1 argument' do
      command = prometheus.get_command

      expect(command.name).to eq('prometheus')
    end
  end

  describe '#upgrade_command' do
    let(:prometheus) { build(:clusters_applications_prometheus) }
    let(:values) { { foo: 'bar' } }

    # The description names the class that is actually asserted below.
    it 'returns an instance of Gitlab::Kubernetes::Helm::UpgradeCommand' do
      expect(prometheus.upgrade_command(values)).to be_an_instance_of(::Gitlab::Kubernetes::Helm::UpgradeCommand)
    end

    it 'should be initialized with 3 arguments' do
      command = prometheus.upgrade_command(values)

      expect(command.name).to eq('prometheus')
      expect(command.chart).to eq('stable/prometheus')
      expect(command.values).to eq(values)
    end
  end
end
...@@ -152,6 +152,24 @@ describe Project do ...@@ -152,6 +152,24 @@ describe Project do
end end
end end
# Project#environments_for_scope: "*" selects every environment of the
# project, an environment name selects just that one.
describe '#environments_for_scope' do
  set(:project) { create(:project) }

  before do
    create_list(:environment, 2, project: project)
  end

  it 'retrieves all project environments when using the * wildcard' do
    scoped = project.environments_for_scope("*")

    expect(scoped).to eq(project.environments)
  end

  it 'retrieves a specific project environment when using the name of that environment' do
    first_environment = project.environments.first
    scoped = project.environments_for_scope(first_environment.name)

    expect(scoped).to eq([first_environment])
  end
end
describe '#ensure_external_webhook_token' do describe '#ensure_external_webhook_token' do
let(:project) { create(:project, :repository) } let(:project) { create(:project, :repository) }
......
require 'spec_helper'
# Model spec for PrometheusAlert: associations, validations and the two
# methods that render the alert as a Prometheus rule.
describe PrometheusAlert do
  describe 'associations' do
    it { is_expected.to belong_to(:project) }
    it { is_expected.to belong_to(:environment) }
  end

  describe 'validation' do
    it { is_expected.to validate_presence_of(:name) }
  end

  describe '#full_query' do
    before do
      subject.name = "bar"
      subject.query = "foo"
      subject.operator = "gt"
      subject.threshold = 1
      subject.prometheus_metric_id = 1
    end

    it 'returns the concatenated query' do
      expect(subject.full_query).to eq("foo > 1.0")
    end
  end

  describe '#to_param' do
    before do
      subject.name = "bar"
      subject.query = "foo"
      subject.operator = "gt"
      subject.threshold = 1
      subject.prometheus_metric_id = 1
    end

    it 'returns the params of the prometheus alert' do
      expected_labels = {
        "gitlab" => "hook",
        "gitlab_alert_id" => 1
      }
      expected_params = {
        "alert" => "bar",
        "expr" => "foo > 1.0",
        "for" => "5m",
        "labels" => expected_labels
      }

      expect(subject.to_param).to eq(expected_params)
    end
  end
end
require 'spec_helper'
# Serialisation spec: checks which keys the entity exposes to a project
# master when the :prometheus_alerts feature is licensed.
describe PrometheusAlertEntity do
  let(:user) { create(:user) }
  let(:prometheus_alert) { create(:prometheus_alert) }
  let(:request) { double('prometheus_alert', current_user: user) }
  let(:entity) { described_class.new(prometheus_alert, request: request) }

  subject { entity.as_json }

  context 'when user can read prometheus alerts' do
    before do
      prometheus_alert.project.add_master(user)
      stub_licensed_features(prometheus_alerts: true)
    end

    it 'exposes prometheus_alert attributes' do
      exposed_keys = [:id, :name, :query, :operator, :threshold]

      expect(subject).to include(*exposed_keys)
    end

    it 'exposes alert_path' do
      expect(subject).to include(:alert_path)
    end
  end
end
require 'spec_helper'
# Spec for the service that polls the upgrade pod: terminal phases remove the
# pod and settle the application status, every other phase reschedules the
# check until the update times out.
describe Clusters::Applications::CheckUpgradeProgressService do
  # Every pod phase that is neither success nor failure. The parentheses make
  # `.freeze` apply to the resulting array rather than to the literal being
  # subtracted.
  RESCHEDULE_PHASES = (
    ::Gitlab::Kubernetes::Pod::PHASES -
      [::Gitlab::Kubernetes::Pod::SUCCEEDED, ::Gitlab::Kubernetes::Pod::FAILED]
  ).freeze

  let(:application) { create(:clusters_applications_prometheus, :updating) }
  let(:service) { described_class.new(application) }
  let(:phase) { ::Gitlab::Kubernetes::Pod::UNKNOWN }
  let(:errors) { nil }

  shared_examples 'a terminated upgrade' do
    it 'removes the POD' do
      expect(service).to receive(:remove_pod).once

      service.execute
    end
  end

  shared_examples 'a not yet terminated upgrade' do |a_phase|
    let(:phase) { a_phase }

    context "when phase is #{a_phase}" do
      context 'when not timed out' do
        it 'reschedule a new check' do
          expect(::ClusterWaitForAppUpdateWorker).to receive(:perform_in).once
          expect(service).not_to receive(:remove_pod)

          service.execute

          expect(application).to be_updating
          expect(application.status_reason).to be_nil
        end
      end

      context 'when timed out' do
        let(:application) { create(:clusters_applications_prometheus, :timeouted, :updating) }

        it_behaves_like 'a terminated upgrade'

        it 'make the application update errored' do
          expect(::ClusterWaitForAppUpdateWorker).not_to receive(:perform_in)

          service.execute

          expect(application).to be_update_errored
          expect(application.status_reason).to eq("Update timed out")
        end
      end
    end
  end

  before do
    # The service's cluster-facing calls are stubbed; only its decision logic
    # is exercised here.
    allow(service).to receive(:phase).once.and_return(phase)
    allow(service).to receive(:errors).and_return(errors)
    allow(service).to receive(:remove_pod).and_return(nil)
  end

  describe '#execute' do
    context 'when upgrade pod succeeded' do
      let(:phase) { ::Gitlab::Kubernetes::Pod::SUCCEEDED }

      it_behaves_like 'a terminated upgrade'

      it 'make the application upgraded' do
        expect(::ClusterWaitForAppUpdateWorker).not_to receive(:perform_in)

        service.execute

        expect(application).to be_updated
        expect(application.status_reason).to be_nil
      end
    end

    context 'when upgrade pod failed' do
      let(:phase) { ::Gitlab::Kubernetes::Pod::FAILED }
      let(:errors) { 'test installation failed' }

      it_behaves_like 'a terminated upgrade'

      it 'make the application update errored' do
        service.execute

        expect(application).to be_update_errored
        expect(application.status_reason).to eq(errors)
      end
    end

    RESCHEDULE_PHASES.each { |phase| it_behaves_like 'a not yet terminated upgrade', phase }
  end
end
require 'spec_helper'
# Spec for the service that rewrites the Prometheus values and triggers a helm
# upgrade. The Helm API and the upgrade command are both replaced by doubles.
describe Clusters::Applications::PrometheusUpdateService do
  describe '#execute' do
    let(:project) { create(:project) }
    let(:environment) { create(:environment, project: project) }
    let(:cluster) { create(:cluster, projects: [project]) }
    let(:application) { create(:clusters_applications_prometheus, :installed, cluster: cluster) }
    # Mimics the object returned by get_config_map: it answers `.data.values`.
    let!(:get_command_values) { OpenStruct.new(data: OpenStruct.new(values: application.values)) }
    let!(:upgrade_command) { application.upgrade_command("") }
    let(:helm_client) { instance_double(::Gitlab::Kubernetes::Helm::Api) }

    subject(:service) { described_class.new(application, project) }

    before do
      allow(service).to receive(:upgrade_command).and_return(upgrade_command)
      allow(service).to receive(:helm_api).and_return(helm_client)
    end

    context 'when there are no errors' do
      before do
        expect(helm_client).to receive(:get_config_map).and_return(get_command_values)
        expect(helm_client).to receive(:update).with(upgrade_command)
        allow(::ClusterWaitForAppUpdateWorker).to receive(:perform_in).and_return(nil)
      end

      context 'when prometheus alerts exist' do
        it 'generates the alert manager values' do
          create(:prometheus_alert, project: project, environment: environment)

          expect(service).to receive(:generate_alert_manager).once

          service.execute
        end
      end

      context 'when prometheus alerts do not exist' do
        it 'resets the alert manager values' do
          expect(service).to receive(:reset_alert_manager).once

          service.execute
        end
      end

      it 'make the application updating' do
        expect(application.cluster).not_to be_nil

        service.execute

        expect(application).to be_updating
      end

      it 'schedules async update status check' do
        expect(::ClusterWaitForAppUpdateWorker).to receive(:perform_in).once

        service.execute
      end
    end

    context 'when k8s cluster communication fails' do
      it 'make the application update errored' do
        kube_error = ::Kubeclient::HttpError.new(500, 'system failure', nil)
        allow(helm_client).to receive(:get_config_map).and_raise(kube_error)

        service.execute

        expect(application).to be_update_errored
        expect(application.status_reason).to match(/kubernetes error:/i)
      end
    end

    context 'when application cannot be persisted' do
      let(:application) { build(:clusters_applications_prometheus, :installed) }

      it 'make the application update errored' do
        allow(application).to receive(:make_updating!).once.and_raise(ActiveRecord::RecordInvalid)

        # Nothing may be sent to the cluster once the transition has failed.
        expect(helm_client).not_to receive(:get_config_map)
        expect(helm_client).not_to receive(:update)

        service.execute

        expect(application).to be_update_errored
      end
    end
  end
end
require 'spec_helper'
# Spec for the scheduler that enqueues ClusterUpdateAppWorker, either
# immediately or after BACKOFF_DELAY.
describe Clusters::Applications::ScheduleUpdateService do
  describe '#execute' do
    let(:project) { create(:project) }

    # Frozen time lets the examples match the exact timestamp argument.
    around do |example|
      Timecop.freeze { example.run }
    end

    context 'when application is able to be updated' do
      context 'when the application was recently scheduled' do
        it 'schedules worker with a backoff delay' do
          application = create(:clusters_applications_prometheus, :installed, last_update_started_at: Time.now + 5.minutes)
          service = described_class.new(application, project)

          expect(::ClusterUpdateAppWorker)
            .to receive(:perform_in)
            .with(described_class::BACKOFF_DELAY, application.name, application.id, project.id, Time.now)
            .once

          service.execute
        end
      end

      context 'when the application has not been recently updated' do
        it 'schedules worker' do
          application = create(:clusters_applications_prometheus, :installed)
          service = described_class.new(application, project)

          expect(::ClusterUpdateAppWorker)
            .to receive(:perform_async)
            .with(application.name, application.id, project.id, Time.now)
            .once

          service.execute
        end
      end
    end
  end
end
...@@ -238,6 +238,23 @@ describe EE::NotificationService, :mailer do ...@@ -238,6 +238,23 @@ describe EE::NotificationService, :mailer do
end end
end end
# Only owners and masters of the project are emailed when alerts fire.
describe '#prometheus_alerts_fired' do
  it 'sends the email to owners and masters' do
    project = create(:project)
    prometheus_alert = create(:prometheus_alert, project: project)
    master = create(:user)
    developer = create(:user)

    project.add_master(master)
    # The developer has to be a project member; otherwise the negative
    # expectation below would pass no matter how recipients are selected.
    project.add_developer(developer)

    expect(Notify).to receive(:prometheus_alert_fired_email).with(project.id, master.id, prometheus_alert).and_call_original
    expect(Notify).to receive(:prometheus_alert_fired_email).with(project.id, project.owner.id, prometheus_alert).and_call_original
    expect(Notify).not_to receive(:prometheus_alert_fired_email).with(project.id, developer.id, prometheus_alert)

    subject.prometheus_alerts_fired(prometheus_alert.project, [prometheus_alert])
  end
end
describe 'Notes' do describe 'Notes' do
around do |example| around do |example|
perform_enqueued_jobs do perform_enqueued_jobs do
......
require 'spec_helper'
# Worker spec: PrometheusUpdateService is replaced by a spy so only the
# worker's own guards (stale job, update already running) are exercised.
describe ClusterUpdateAppWorker do
  let(:project) { create(:project) }
  let(:prometheus_update_service) { spy }

  subject { described_class.new }

  around do |example|
    Timecop.freeze(Time.now) { example.run }
  end

  before do
    allow(::Clusters::Applications::PrometheusUpdateService).to receive(:new).and_return(prometheus_update_service)
  end

  describe '#perform' do
    context 'when the application last_update_started_at is higher than the time the job was scheduled in' do
      it 'does nothing' do
        application = create(:clusters_applications_prometheus, :updated, last_update_started_at: Time.now)
        scheduled_at = Time.now - 5.minutes

        expect(prometheus_update_service).not_to receive(:execute)

        expect(subject.perform(application.name, application.id, project.id, scheduled_at)).to be_nil
      end
    end

    context 'when another worker is already running' do
      it 'raises UpdateAlreadyInProgressError' do
        application = create(:clusters_applications_prometheus, :updating)

        expect do
          subject.perform(application.name, application.id, project.id, Time.now)
        end.to raise_error(described_class::UpdateAlreadyInProgressError)
      end
    end

    it 'executes PrometheusUpdateService' do
      application = create(:clusters_applications_prometheus, :installed)

      expect(prometheus_update_service).to receive(:execute)

      subject.perform(application.name, application.id, project.id, Time.now)
    end
  end
end
require 'spec_helper'
# Worker spec: CheckUpgradeProgressService is replaced by a spy; the worker
# must run it for an existing application and raise for an unknown id.
describe ClusterWaitForAppUpdateWorker do
  let(:check_upgrade_progress_service) { spy }

  before do
    allow(::Clusters::Applications::CheckUpgradeProgressService).to receive(:new).and_return(check_upgrade_progress_service)
  end

  it 'runs CheckUpgradeProgressService when application is found' do
    application = create(:clusters_applications_prometheus)

    expect(check_upgrade_progress_service).to receive(:execute)

    subject.perform(application.name, application.id)
  end

  it 'does not run CheckUpgradeProgressService when application is not found' do
    expect(check_upgrade_progress_service).not_to receive(:execute)

    expect { subject.perform("prometheus", -1) }.to raise_error(ActiveRecord::RecordNotFound)
  end
end
module Gitlab module Gitlab
module Kubernetes module Kubernetes
class ConfigMap class ConfigMap
def initialize(name, values) def initialize(name, values = "")
@name = name @name = name
@values = values @values = values
end end
...@@ -13,6 +13,10 @@ module Gitlab ...@@ -13,6 +13,10 @@ module Gitlab
resource resource
end end
# Name of the ConfigMap: the fixed "values-content-configuration-" prefix
# followed by this object's name.
def config_map_name
"values-content-configuration-#{name}"
end
private private
attr_reader :name, :values attr_reader :name, :values
...@@ -25,10 +29,6 @@ module Gitlab ...@@ -25,10 +29,6 @@ module Gitlab
} }
end end
def config_map_name
"values-content-configuration-#{name}"
end
def namespace def namespace
Gitlab::Kubernetes::Helm::NAMESPACE Gitlab::Kubernetes::Helm::NAMESPACE
end end
......
...@@ -2,15 +2,17 @@ module Gitlab ...@@ -2,15 +2,17 @@ module Gitlab
module Kubernetes module Kubernetes
module Helm module Helm
class Api class Api
prepend EE::Gitlab::Kubernetes::Helm::Api
def initialize(kubeclient) def initialize(kubeclient)
@kubeclient = kubeclient @kubeclient = kubeclient
@namespace = Gitlab::Kubernetes::Namespace.new(Gitlab::Kubernetes::Helm::NAMESPACE, kubeclient) @namespace = Gitlab::Kubernetes::Namespace.new(Gitlab::Kubernetes::Helm::NAMESPACE, kubeclient)
end end
def install(command) def install(command)
@namespace.ensure_exists! namespace.ensure_exists!
create_config_map(command) if command.config_map? create_config_map(command) if command.config_map?
@kubeclient.create_pod(command.pod_resource) kubeclient.create_pod(command.pod_resource)
end end
## ##
...@@ -20,23 +22,25 @@ module Gitlab ...@@ -20,23 +22,25 @@ module Gitlab
# #
# values: "Pending", "Running", "Succeeded", "Failed", "Unknown" # values: "Pending", "Running", "Succeeded", "Failed", "Unknown"
# #
def installation_status(pod_name) def status(pod_name)
@kubeclient.get_pod(pod_name, @namespace.name).status.phase kubeclient.get_pod(pod_name, namespace.name).status.phase
end end
def installation_log(pod_name) def log(pod_name)
@kubeclient.get_pod_log(pod_name, @namespace.name).body kubeclient.get_pod_log(pod_name, namespace.name).body
end end
def delete_installation_pod!(pod_name) def delete_pod!(pod_name)
@kubeclient.delete_pod(pod_name, @namespace.name) kubeclient.delete_pod(pod_name, namespace.name)
end end
private private
attr_reader :kubeclient, :namespace
def create_config_map(command) def create_config_map(command)
command.config_map_resource.tap do |config_map_resource| command.config_map_resource.tap do |config_map_resource|
@kubeclient.create_config_map(config_map_resource) kubeclient.create_config_map(config_map_resource)
end end
end end
end end
......
...@@ -3,7 +3,7 @@ module Gitlab ...@@ -3,7 +3,7 @@ module Gitlab
class Metric class Metric
include ActiveModel::Model include ActiveModel::Model
attr_accessor :title, :required_metrics, :weight, :y_label, :queries attr_accessor :id, :title, :required_metrics, :weight, :y_label, :queries
validates :title, :required_metrics, :weight, :y_label, :queries, presence: true validates :title, :required_metrics, :weight, :y_label, :queries, presence: true
......
...@@ -8,6 +8,7 @@ module Gitlab ...@@ -8,6 +8,7 @@ module Gitlab
Deployment.find_by(id: deployment_id).try do |deployment| Deployment.find_by(id: deployment_id).try do |deployment|
query_metrics( query_metrics(
deployment.project, deployment.project,
deployment.environment,
common_query_context( common_query_context(
deployment.environment, deployment.environment,
timeframe_start: (deployment.created_at - 30.minutes).to_f, timeframe_start: (deployment.created_at - 30.minutes).to_f,
......
...@@ -8,6 +8,7 @@ module Gitlab ...@@ -8,6 +8,7 @@ module Gitlab
::Environment.find_by(id: environment_id).try do |environment| ::Environment.find_by(id: environment_id).try do |environment|
query_metrics( query_metrics(
environment.project, environment.project,
environment,
common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f) common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f)
) )
end end
......
...@@ -2,7 +2,9 @@ module Gitlab ...@@ -2,7 +2,9 @@ module Gitlab
module Prometheus module Prometheus
module Queries module Queries
module QueryAdditionalMetrics module QueryAdditionalMetrics
def query_metrics(project, query_context) prepend EE::Gitlab::Prometheus::Queries::QueryAdditionalMetrics
def query_metrics(project, environment, query_context)
matched_metrics(project).map(&query_group(query_context)) matched_metrics(project).map(&query_group(query_context))
.select(&method(:group_with_any_metrics)) .select(&method(:group_with_any_metrics))
end end
...@@ -14,12 +16,16 @@ module Gitlab ...@@ -14,12 +16,16 @@ module Gitlab
lambda do |group| lambda do |group|
metrics = group.metrics.map do |metric| metrics = group.metrics.map do |metric|
{ metric_hsh = {
title: metric.title, title: metric.title,
weight: metric.weight, weight: metric.weight,
y_label: metric.y_label, y_label: metric.y_label,
queries: metric.queries.map(&query_processor).select(&method(:query_with_result)) queries: metric.queries.map(&query_processor).select(&method(:query_with_result))
} }
metric_hsh[:id] = metric.id if metric.id
metric_hsh
end end
{ {
......
...@@ -22,11 +22,24 @@ FactoryBot.define do ...@@ -22,11 +22,24 @@ FactoryBot.define do
status 3 status 3
end end
# Application with raw status 4.
# NOTE(review): presumably the "updating" state of the status state machine — confirm against the model.
trait :updating do
status 4
end
# Application with raw status 5.
# NOTE(review): presumably the "updated" state of the status state machine — confirm against the model.
trait :updated do
status 5
end
trait :errored do trait :errored do
status(-1) status(-1)
status_reason 'something went wrong' status_reason 'something went wrong'
end end
# Application with raw status 6 and a generic failure reason.
# NOTE(review): presumably the "update_errored" state of the status state machine — confirm against the model.
trait :update_errored do
status(6)
status_reason 'something went wrong'
end
trait :timeouted do trait :timeouted do
installing installing
updated_at ClusterWaitForAppInstallationWorker::TIMEOUT.ago updated_at ClusterWaitForAppInstallationWorker::TIMEOUT.ago
......
...@@ -332,6 +332,7 @@ project: ...@@ -332,6 +332,7 @@ project:
- ci_cd_settings - ci_cd_settings
- import_export_upload - import_export_upload
- vulnerability_feedback - vulnerability_feedback
- prometheus_alerts
award_emoji: award_emoji:
- awardable - awardable
- user - user
...@@ -339,6 +340,7 @@ priorities: ...@@ -339,6 +340,7 @@ priorities:
- label - label
prometheus_metrics: prometheus_metrics:
- project - project
- prometheus_alert
timelogs: timelogs:
- issue - issue
- merge_request - merge_request
......
...@@ -22,4 +22,10 @@ describe Gitlab::Kubernetes::ConfigMap do ...@@ -22,4 +22,10 @@ describe Gitlab::Kubernetes::ConfigMap do
is_expected.to eq(resource) is_expected.to eq(resource)
end end
end end
# The config map name is the fixed "values-content-configuration-" prefix
# followed by the application's name.
describe '#config_map_name' do
  it 'returns the config_map name' do
    expected_name = "values-content-configuration-#{application.name}"

    expect(config_map.config_map_name).to eq(expected_name)
  end
end
end end
...@@ -49,33 +49,33 @@ describe Gitlab::Kubernetes::Helm::Api do ...@@ -49,33 +49,33 @@ describe Gitlab::Kubernetes::Helm::Api do
end end
end end
describe '#installation_status' do describe '#status' do
let(:phase) { Gitlab::Kubernetes::Pod::RUNNING } let(:phase) { Gitlab::Kubernetes::Pod::RUNNING }
let(:pod) { Kubeclient::Resource.new(status: { phase: phase }) } # partial representation let(:pod) { Kubeclient::Resource.new(status: { phase: phase }) } # partial representation
it 'fetches POD phase from kubernetes cluster' do it 'fetches POD phase from kubernetes cluster' do
expect(client).to receive(:get_pod).with(command.pod_name, gitlab_namespace).once.and_return(pod) expect(client).to receive(:get_pod).with(command.pod_name, gitlab_namespace).once.and_return(pod)
expect(subject.installation_status(command.pod_name)).to eq(phase) expect(subject.status(command.pod_name)).to eq(phase)
end end
end end
describe '#installation_log' do describe '#log' do
let(:log) { 'some output' } let(:log) { 'some output' }
let(:response) { RestClient::Response.new(log) } let(:response) { RestClient::Response.new(log) }
it 'fetches POD phase from kubernetes cluster' do it 'fetches POD phase from kubernetes cluster' do
expect(client).to receive(:get_pod_log).with(command.pod_name, gitlab_namespace).once.and_return(response) expect(client).to receive(:get_pod_log).with(command.pod_name, gitlab_namespace).once.and_return(response)
expect(subject.installation_log(command.pod_name)).to eq(log) expect(subject.log(command.pod_name)).to eq(log)
end end
end end
describe '#delete_installation_pod!' do describe '#delete_pod!' do
it 'deletes the POD from kubernetes cluster' do it 'deletes the POD from kubernetes cluster' do
expect(client).to receive(:delete_pod).with(command.pod_name, gitlab_namespace).once expect(client).to receive(:delete_pod).with(command.pod_name, gitlab_namespace).once
subject.delete_installation_pod!(command.pod_name) subject.delete_pod!(command.pod_name)
end end
end end
end end
...@@ -34,6 +34,47 @@ describe Clusters::Applications::Prometheus do ...@@ -34,6 +34,47 @@ describe Clusters::Applications::Prometheus do
end end
end end
# Checks the `ready?` predicate (via the `be_ready` matcher) for each
# factory status trait: only :installed is expected to be ready.
describe '#ready' do
  let(:project) { create(:project) }
  let(:cluster) { create(:cluster, projects: [project]) }

  it 'returns true when installed' do
    application = build(:clusters_applications_prometheus, :installed, cluster: cluster)

    expect(application).to be_ready
  end

  # Every other status trait must report not-ready; the generated example
  # names are "returns false when <trait>", in the order listed here.
  %i[not_installable installable scheduled installing errored].each do |state|
    it "returns false when #{state}" do
      application = build(:clusters_applications_prometheus, state, cluster: cluster)

      expect(application).not_to be_ready
    end
  end
end
describe '#prometheus_client' do describe '#prometheus_client' do
context 'cluster is nil' do context 'cluster is nil' do
it 'returns nil' do it 'returns nil' do
...@@ -102,15 +143,17 @@ describe Clusters::Applications::Prometheus do ...@@ -102,15 +143,17 @@ describe Clusters::Applications::Prometheus do
let(:kubeclient) { double('kubernetes client') } let(:kubeclient) { double('kubernetes client') }
let(:prometheus) { create(:clusters_applications_prometheus) } let(:prometheus) { create(:clusters_applications_prometheus) }
subject { prometheus.install_command } it 'returns an instance of Gitlab::Kubernetes::Helm::InstallCommand' do
expect(prometheus.install_command).to be_an_instance_of(Gitlab::Kubernetes::Helm::InstallCommand)
it { is_expected.to be_an_instance_of(Gitlab::Kubernetes::Helm::InstallCommand) } end
it 'should be initialized with 3 arguments' do it 'should be initialized with 3 arguments' do
expect(subject.name).to eq('prometheus') command = prometheus.install_command
expect(subject.chart).to eq('stable/prometheus')
expect(subject.version).to eq('6.7.3') expect(command.name).to eq('prometheus')
expect(subject.values).to eq(prometheus.values) expect(command.chart).to eq('stable/prometheus')
expect(command.version).to eq('6.7.3')
expect(command.values).to eq(prometheus.values)
end end
end end
......
...@@ -25,7 +25,7 @@ RSpec.shared_examples 'additional metrics query' do ...@@ -25,7 +25,7 @@ RSpec.shared_examples 'additional metrics query' do
shared_examples 'query context containing environment slug and filter' do shared_examples 'query context containing environment slug and filter' do
it 'contains ci_environment_slug' do it 'contains ci_environment_slug' do
expect(subject).to receive(:query_metrics).with(project, hash_including(ci_environment_slug: environment.slug)) expect(subject).to receive(:query_metrics).with(project, environment, hash_including(ci_environment_slug: environment.slug))
subject.query(*query_params) subject.query(*query_params)
end end
...@@ -33,6 +33,7 @@ RSpec.shared_examples 'additional metrics query' do ...@@ -33,6 +33,7 @@ RSpec.shared_examples 'additional metrics query' do
it 'contains environment filter' do it 'contains environment filter' do
expect(subject).to receive(:query_metrics).with( expect(subject).to receive(:query_metrics).with(
project, project,
environment,
hash_including( hash_including(
environment_filter: "container_name!=\"POD\",environment=\"#{environment.slug}\"" environment_filter: "container_name!=\"POD\",environment=\"#{environment.slug}\""
) )
...@@ -50,7 +51,7 @@ RSpec.shared_examples 'additional metrics query' do ...@@ -50,7 +51,7 @@ RSpec.shared_examples 'additional metrics query' do
it_behaves_like 'query context containing environment slug and filter' it_behaves_like 'query context containing environment slug and filter'
it 'query context contains kube_namespace' do it 'query context contains kube_namespace' do
expect(subject).to receive(:query_metrics).with(project, hash_including(kube_namespace: kube_namespace)) expect(subject).to receive(:query_metrics).with(project, environment, hash_including(kube_namespace: kube_namespace))
subject.query(*query_params) subject.query(*query_params)
end end
...@@ -74,7 +75,7 @@ RSpec.shared_examples 'additional metrics query' do ...@@ -74,7 +75,7 @@ RSpec.shared_examples 'additional metrics query' do
it_behaves_like 'query context containing environment slug and filter' it_behaves_like 'query context containing environment slug and filter'
it 'query context contains empty kube_namespace' do it 'query context contains empty kube_namespace' do
expect(subject).to receive(:query_metrics).with(project, hash_including(kube_namespace: '')) expect(subject).to receive(:query_metrics).with(project, environment, hash_including(kube_namespace: ''))
subject.query(*query_params) subject.query(*query_params)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment