Commit bec9ec9a authored by Douwe Maan

Merge branch '27439-performance-deltas' into 'master'

Expose memory deltas between app deployments and refactor prometheus queries to support more custom queries

See merge request !10981
parents 40242003 f38779c6
@@ -11,13 +11,15 @@ class Projects::DeploymentsController < Projects::ApplicationController
   end
 
   def metrics
-    @metrics = deployment.metrics(1.hour)
+    return render_404 unless deployment.has_metrics?
+
+    @metrics = deployment.metrics
     if @metrics&.any?
       render json: @metrics, status: :ok
     else
       head :no_content
     end
+  rescue NotImplementedError
+    render_404
   end
 
   private
...
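For orientation, the action now distinguishes three outcomes: 404 when the deployment has no metrics configured (or the monitoring service does not implement deployment metrics), 204 when the query returns nothing, and 200 with JSON otherwise. A minimal standalone sketch of that decision flow in plain Ruby (FakeDeployment and metrics_response are illustrative names, not part of this merge request):

FakeDeployment = Struct.new(:metrics_available, :metrics_proc) do
  def has_metrics?
    metrics_available
  end

  def metrics
    metrics_proc.call
  end
end

# Mirrors the controller's branching: guard clause, empty-result check, NotImplementedError rescue.
def metrics_response(deployment)
  return :not_found unless deployment.has_metrics?

  metrics = deployment.metrics
  metrics && metrics.any? ? [:ok, metrics] : :no_content
rescue NotImplementedError
  :not_found
end

metrics_response(FakeDeployment.new(false, -> { {} }))                        # => :not_found
metrics_response(FakeDeployment.new(true,  -> { {} }))                        # => :no_content
metrics_response(FakeDeployment.new(true,  -> { { cpu_current: 1 } }))        # => [:ok, { cpu_current: 1 }]
metrics_response(FakeDeployment.new(true,  -> { raise NotImplementedError })) # => :not_found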
@@ -103,15 +103,10 @@ class Deployment < ActiveRecord::Base
     project.monitoring_service.present?
   end
 
-  def metrics(timeframe)
+  def metrics
     return {} unless has_metrics?
 
-    half_timeframe = timeframe / 2
-    timeframe_start = created_at - half_timeframe
-    timeframe_end = created_at + half_timeframe
-
-    metrics = project.monitoring_service.metrics(environment, timeframe_start: timeframe_start, timeframe_end: timeframe_end)
-    metrics&.merge(deployment_time: created_at.to_i) || {}
+    project.monitoring_service.deployment_metrics(self)
   end
 
   private
...
@@ -150,7 +150,7 @@ class Environment < ActiveRecord::Base
   end
 
   def metrics
-    project.monitoring_service.metrics(self) if has_metrics?
+    project.monitoring_service.environment_metrics(self) if has_metrics?
   end
 
   # An environment name is not necessarily suitable for use in URLs, DNS
...
@@ -9,8 +9,11 @@ class MonitoringService < Service
     %w()
   end
 
-  # Environments have a number of metrics
-  def metrics(environment, timeframe_start: nil, timeframe_end: nil)
+  def environment_metrics(environment)
+    raise NotImplementedError
+  end
+
+  def deployment_metrics(deployment)
     raise NotImplementedError
   end
 end
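The point of splitting the old #metrics hook into #environment_metrics and #deployment_metrics is that a monitoring integration can now answer each question separately. A hypothetical subclass (not part of GitLab; FakeGraphiteService and fetch_series are made-up names) would look roughly like:

class FakeGraphiteService < MonitoringService
  def environment_metrics(environment)
    { cpu_current: fetch_series('cpu', environment.slug) }
  end

  def deployment_metrics(deployment)
    {
      memory_before: fetch_series('memory', deployment.environment.slug),
      memory_after: fetch_series('memory', deployment.environment.slug),
      deployment_time: deployment.created_at.to_i
    }
  end

  private

  # Placeholder for a call to the backing monitoring system.
  def fetch_series(metric, environment_slug)
    []
  end
end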
@@ -63,45 +63,31 @@ class PrometheusService < MonitoringService
     { success: false, result: err }
   end
 
-  def metrics(environment, timeframe_start: nil, timeframe_end: nil)
-    with_reactive_cache(environment.slug, timeframe_start, timeframe_end) do |data|
-      data
-    end
+  def environment_metrics(environment)
+    with_reactive_cache(Gitlab::Prometheus::Queries::EnvironmentQuery.name, environment.id, &:itself)
+  end
+
+  def deployment_metrics(deployment)
+    metrics = with_reactive_cache(Gitlab::Prometheus::Queries::DeploymentQuery.name, deployment.id, &:itself)
+    metrics&.merge(deployment_time: deployment.created_at.to_i) || {}
   end
 
   # Cache metrics for specific environment
-  def calculate_reactive_cache(environment_slug, timeframe_start, timeframe_end)
+  def calculate_reactive_cache(query_class_name, *args)
     return unless active? && project && !project.pending_delete?
 
-    timeframe_start = Time.parse(timeframe_start) if timeframe_start
-    timeframe_end = Time.parse(timeframe_end) if timeframe_end
-
-    timeframe_start ||= 8.hours.ago
-    timeframe_end ||= Time.now
-
-    memory_query = %{(sum(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"}) / count(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"})) /1024/1024}
-    cpu_query = %{sum(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}[2m])) / count(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}) * 100}
+    metrics = Kernel.const_get(query_class_name).new(client).query(*args)
 
     {
      success: true,
-      metrics: {
-        # Average Memory used in MB
-        memory_values: client.query_range(memory_query, start: timeframe_start, stop: timeframe_end),
-        memory_current: client.query(memory_query, time: timeframe_end),
-        memory_previous: client.query(memory_query, time: timeframe_start),
-        # Average CPU Utilization
-        cpu_values: client.query_range(cpu_query, start: timeframe_start, stop: timeframe_end),
-        cpu_current: client.query(cpu_query, time: timeframe_end),
-        cpu_previous: client.query(cpu_query, time: timeframe_start)
-      },
+      metrics: metrics,
       last_update: Time.now.utc
     }
   rescue Gitlab::PrometheusError => err
     { success: false, result: err.message }
   end
 
   def client
-    @prometheus ||= Gitlab::Prometheus.new(api_url: api_url)
+    @prometheus ||= Gitlab::PrometheusClient.new(api_url: api_url)
   end
 end
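The reactive-cache key can only carry simple serializable values, which is why calculate_reactive_cache now receives a query class name (a string) plus plain ids and rebuilds the query object itself via Kernel.const_get. A self-contained toy of that dispatch pattern (Toy::Queries::EchoQuery and calculate are illustrative names, not GitLab code):

module Toy
  module Queries
    class EchoQuery
      def initialize(client)
        @client = client
      end

      # The real query classes issue PromQL through the client; this one just echoes.
      def query(id)
        { id: id, answer: @client.call }
      end
    end
  end
end

def calculate(query_class_name, *args, client:)
  # Resolve the class from its string name, exactly like calculate_reactive_cache above.
  Kernel.const_get(query_class_name).new(client).query(*args)
end

p calculate('Toy::Queries::EchoQuery', 7, client: -> { 42 })
# prints the hash built by EchoQuery#query, e.g. {:id=>7, :answer=>42}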
module Gitlab
  module Prometheus
    module Queries
      class BaseQuery
        attr_accessor :client
        delegate :query_range, :query, to: :client, prefix: true

        def raw_memory_usage_query(environment_slug)
          %{avg(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"}) / 2^20}
        end

        def raw_cpu_usage_query(environment_slug)
          %{avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}[2m])) * 100}
        end

        def initialize(client)
          @client = client
        end

        def query(*args)
          raise NotImplementedError
        end
      end
    end
  end
end
module Gitlab::Prometheus::Queries
  class DeploymentQuery < BaseQuery
    def query(deployment_id)
      deployment = Deployment.find_by(id: deployment_id)
      environment_slug = deployment.environment.slug

      memory_query = raw_memory_usage_query(environment_slug)
      memory_avg_query = %{avg(avg_over_time(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"}[30m]))}
      cpu_query = raw_cpu_usage_query(environment_slug)
      cpu_avg_query = %{avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}[30m])) * 100}

      timeframe_start = (deployment.created_at - 30.minutes).to_f
      timeframe_end = (deployment.created_at + 30.minutes).to_f

      {
        memory_values: client_query_range(memory_query, start: timeframe_start, stop: timeframe_end),
        memory_before: client_query(memory_avg_query, time: deployment.created_at.to_f),
        memory_after: client_query(memory_avg_query, time: timeframe_end),

        cpu_values: client_query_range(cpu_query, start: timeframe_start, stop: timeframe_end),
        cpu_before: client_query(cpu_avg_query, time: deployment.created_at.to_f),
        cpu_after: client_query(cpu_avg_query, time: timeframe_end)
      }
    end
  end
end
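The memory delta advertised in the merge request title falls out of the memory_before / memory_after entries above: each is a Prometheus instant vector, and the delta is simply after minus before. A small standalone illustration, assuming the usual Prometheus API shape of one sample per series with the value as [timestamp, "value-string"]:

# Extract the single numeric sample from an instant-vector result (empty vector => 0.0).
def single_value(instant_vector)
  (instant_vector.first&.dig('value', 1)).to_f
end

memory_before = [{ 'metric' => {}, 'value' => [1_494_408_956, '640.0'] }]
memory_after  = [{ 'metric' => {}, 'value' => [1_494_410_756, '512.0'] }]

delta_mb = single_value(memory_after) - single_value(memory_before)
puts delta_mb # => -128.0, i.e. average memory dropped by 128 MB across the deployment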
module Gitlab::Prometheus::Queries
  class EnvironmentQuery < BaseQuery
    def query(environment_id)
      environment = Environment.find_by(id: environment_id)
      environment_slug = environment.slug
      timeframe_start = 8.hours.ago.to_f
      timeframe_end = Time.now.to_f

      memory_query = raw_memory_usage_query(environment_slug)
      cpu_query = raw_cpu_usage_query(environment_slug)

      {
        memory_values: client_query_range(memory_query, start: timeframe_start, stop: timeframe_end),
        memory_current: client_query(memory_query, time: timeframe_end),

        cpu_values: client_query_range(cpu_query, start: timeframe_start, stop: timeframe_end),
        cpu_current: client_query(cpu_query, time: timeframe_end)
      }
    end
  end
end
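The refactor's stated goal is to support more custom queries: a new subclass of BaseQuery only has to implement #query, and PrometheusService can cache its result by class name. A hypothetical example (not part of this merge request; EnvironmentCpuQuery is a made-up name):

module Gitlab::Prometheus::Queries
  class EnvironmentCpuQuery < BaseQuery
    def query(environment_id)
      environment = Environment.find_by(id: environment_id)
      cpu_query = raw_cpu_usage_query(environment.slug)

      { cpu_current: client_query(cpu_query, time: Time.now.to_f) }
    end
  end
end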
@@ -2,7 +2,7 @@ module Gitlab
   PrometheusError = Class.new(StandardError)
 
   # Helper methods to interact with Prometheus network services & resources
-  class Prometheus
+  class PrometheusClient
     attr_reader :api_url
 
     def initialize(api_url:)
@@ -15,7 +15,7 @@ module Gitlab
     def query(query, time: Time.now)
       get_result('vector') do
-        json_api_get('query', query: query, time: time.utc.to_f)
+        json_api_get('query', query: query, time: time.to_f)
       end
     end
...
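The time.utc.to_f to time.to_f change is safe because Time#to_f is the Unix epoch and does not depend on the object's time zone; it also avoids mutating the caller's Time, since Time#utc converts the receiver in place. A quick check in plain Ruby:

t = Time.now
puts t.to_f == t.getutc.to_f             # => true
puts t.to_f == t.getlocal('+09:00').to_f # => true, same epoch regardless of zone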
@@ -42,6 +42,22 @@ describe Projects::DeploymentsController do
     before do
       allow(controller).to receive(:deployment).and_return(deployment)
     end
 
+    context 'when metrics are disabled' do
+      before do
+        allow(deployment).to receive(:has_metrics?).and_return false
+      end
+
+      it 'responds with not found' do
+        get :metrics, deployment_params(id: deployment.id)
+
+        expect(response).to be_not_found
+      end
+    end
+
+    context 'when metrics are enabled' do
+      before do
+        allow(deployment).to receive(:has_metrics?).and_return true
+      end
+
     context 'when environment has no metrics' do
       before do
@@ -77,6 +93,19 @@ describe Projects::DeploymentsController do
         expect(json_response['last_update']).to eq(42)
       end
     end
+
+      context 'when metrics service does not implement deployment metrics' do
+        before do
+          allow(deployment).to receive(:metrics).and_raise(NotImplementedError)
+        end
+
+        it 'responds with not found' do
+          get :metrics, deployment_params(id: deployment.id)
+
+          expect(response).to be_not_found
+        end
+      end
+    end
   end
 
   def deployment_params(opts = {})
...
require 'spec_helper'

describe Gitlab::Prometheus::Queries::DeploymentQuery, lib: true do
  let(:environment) { create(:environment, slug: 'environment-slug') }
  let(:deployment) { create(:deployment, environment: environment) }

  let(:client) { double('prometheus_client') }

  subject { described_class.new(client) }

  around do |example|
    Timecop.freeze { example.run }
  end

  it 'sends appropriate queries to prometheus' do
    start_time_matcher = be_within(0.5).of((deployment.created_at - 30.minutes).to_f)
    stop_time_matcher = be_within(0.5).of((deployment.created_at + 30.minutes).to_f)
    created_at_matcher = be_within(0.5).of(deployment.created_at.to_f)

    expect(client).to receive(:query_range).with('avg(container_memory_usage_bytes{container_name!="POD",environment="environment-slug"}) / 2^20',
                                                 start: start_time_matcher, stop: stop_time_matcher)
    expect(client).to receive(:query).with('avg(avg_over_time(container_memory_usage_bytes{container_name!="POD",environment="environment-slug"}[30m]))',
                                           time: created_at_matcher)
    expect(client).to receive(:query).with('avg(avg_over_time(container_memory_usage_bytes{container_name!="POD",environment="environment-slug"}[30m]))',
                                           time: stop_time_matcher)
    expect(client).to receive(:query_range).with('avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="environment-slug"}[2m])) * 100',
                                                 start: start_time_matcher, stop: stop_time_matcher)
    expect(client).to receive(:query).with('avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="environment-slug"}[30m])) * 100',
                                           time: created_at_matcher)
    expect(client).to receive(:query).with('avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="environment-slug"}[30m])) * 100',
                                           time: stop_time_matcher)

    expect(subject.query(deployment.id)).to eq(memory_values: nil, memory_before: nil, memory_after: nil,
                                               cpu_values: nil, cpu_before: nil, cpu_after: nil)
  end
end
 require 'spec_helper'
 
-describe Gitlab::Prometheus, lib: true do
+describe Gitlab::PrometheusClient, lib: true do
   include PrometheusHelpers
 
   subject { described_class.new(api_url: 'https://prometheus.example.com') }
...
@@ -52,7 +52,7 @@ describe Deployment, models: true do
   describe '#metrics' do
     let(:deployment) { create(:deployment) }
 
-    subject { deployment.metrics(1.hour) }
+    subject { deployment.metrics }
 
     context 'metrics are disabled' do
       it { is_expected.to eq({}) }
@@ -63,16 +63,17 @@ describe Deployment, models: true do
         {
           success: true,
           metrics: {},
-          last_update: 42
+          last_update: 42,
+          deployment_time: 1494408956
         }
       end
 
       before do
-        allow(deployment.project).to receive_message_chain(:monitoring_service, :metrics)
+        allow(deployment.project).to receive_message_chain(:monitoring_service, :deployment_metrics)
           .with(any_args).and_return(simple_metrics)
       end
 
-      it { is_expected.to eq(simple_metrics.merge(deployment_time: deployment.created_at.utc.to_i)) }
+      it { is_expected.to eq(simple_metrics) }
     end
   end
...
@@ -393,7 +393,7 @@ describe Environment, models: true do
       it 'returns the metrics from the deployment service' do
         expect(project.monitoring_service)
-          .to receive(:metrics).with(environment)
+          .to receive(:environment_metrics).with(environment)
           .and_return(:fake_metrics)
 
         is_expected.to eq(:fake_metrics)
...
@@ -6,6 +6,7 @@ describe PrometheusService, models: true, caching: true do
   let(:project) { create(:prometheus_project) }
   let(:service) { project.prometheus_service }
+  let(:environment_query) { Gitlab::Prometheus::Queries::EnvironmentQuery }
 
   describe "Associations" do
     it { is_expected.to belong_to :project }
@@ -45,49 +46,56 @@ describe PrometheusService, models: true, caching: true do
     end
   end
 
-  describe '#metrics' do
+  describe '#environment_metrics' do
     let(:environment) { build_stubbed(:environment, slug: 'env-slug') }
 
     around do |example|
       Timecop.freeze { example.run }
     end
 
-    context 'with valid data without time range' do
-      subject { service.metrics(environment) }
+    context 'with valid data' do
+      subject { service.environment_metrics(environment) }
 
       before do
-        stub_reactive_cache(service, prometheus_data, 'env-slug', nil, nil)
+        stub_reactive_cache(service, prometheus_data, environment_query, environment.id)
      end
 
       it 'returns reactive data' do
         is_expected.to eq(prometheus_data)
       end
     end
+  end
 
-    context 'with valid data with time range' do
-      let(:t_start) { 1.hour.ago.utc }
-      let(:t_end) { Time.now.utc }
+  describe '#deployment_metrics' do
+    let(:deployment) { build_stubbed(:deployment) }
+    let(:deployment_query) { Gitlab::Prometheus::Queries::DeploymentQuery }
 
-      subject { service.metrics(environment, timeframe_start: t_start, timeframe_end: t_end) }
+    around do |example|
+      Timecop.freeze { example.run }
+    end
+
+    context 'with valid data' do
+      subject { service.deployment_metrics(deployment) }
 
       before do
-        stub_reactive_cache(service, prometheus_data, 'env-slug', t_start, t_end)
+        stub_reactive_cache(service, prometheus_data, deployment_query, deployment.id)
       end
 
       it 'returns reactive data' do
-        is_expected.to eq(prometheus_data)
+        is_expected.to eq(prometheus_data.merge(deployment_time: deployment.created_at.to_i))
       end
     end
   end
 
   describe '#calculate_reactive_cache' do
-    let(:environment) { build_stubbed(:environment, slug: 'env-slug') }
+    let(:environment) { create(:environment, slug: 'env-slug') }
 
     around do |example|
      Timecop.freeze { example.run }
    end
 
     subject do
-      service.calculate_reactive_cache(environment.slug, nil, nil)
+      service.calculate_reactive_cache(environment_query.to_s, environment.id)
     end
 
     context 'when service is inactive' do
...
 module PrometheusHelpers
   def prometheus_memory_query(environment_slug)
-    %{(sum(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"}) / count(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"})) /1024/1024}
+    %{avg(container_memory_usage_bytes{container_name!="POD",environment="#{environment_slug}"}) / 2^20}
   end
 
   def prometheus_cpu_query(environment_slug)
-    %{sum(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}[2m])) / count(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}) * 100}
+    %{avg(rate(container_cpu_usage_seconds_total{container_name!="POD",environment="#{environment_slug}"}[2m])) * 100}
   end
 
   def prometheus_ping_url(prometheus_query)
@@ -88,10 +88,8 @@ module PrometheusHelpers
       metrics: {
         memory_values: prometheus_values_body('matrix').dig(:data, :result),
         memory_current: prometheus_value_body('vector').dig(:data, :result),
-        memory_previous: prometheus_value_body('vector').dig(:data, :result),
         cpu_values: prometheus_values_body('matrix').dig(:data, :result),
-        cpu_current: prometheus_value_body('vector').dig(:data, :result),
-        cpu_previous: prometheus_value_body('vector').dig(:data, :result)
+        cpu_current: prometheus_value_body('vector').dig(:data, :result)
       },
       last_update: last_update
     }
...