Commit 789dc771 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Don't record duration for errors

This makes the request duration histograms consistent across both
metrics and only records it when the request was successful.

It also makes sure that the way the error was received in the
middleware does not matter anymore. Before this change, an error that
bubbled up towards the middleware would count towards a groups error
budget, but not towards the service availability. With this, both metrics
will not include durations of requests that resulted in responses with
a 5xx status code.

If a request failed, it's not very important for users how fast it
failed.

For our metrics it could also skew results: A very fast 500 would have
less impact on a service's availability or a groups budget spend than
a slow one. While they should weigh the same. Similarly, a slow
failure should not count double towards availability.

This results in the following scoring for requests:

|         | Fast | Slow |
|---------|------|------|
| Success | 2/2  | 1/2  |
| Error   | 0/1  | 0/1  |
parent 9850151f
......@@ -58,7 +58,7 @@ The following metrics are available:
| `gitlab_transaction_cache_duration_total` | Counter | 10.2 | Counter for total time (seconds) spent in Rails cache calls (aggregate) | |
| `gitlab_transaction_cache_read_hit_count_total` | Counter | 10.2 | Counter for cache hits for Rails cache calls | `controller`, `action` |
| `gitlab_transaction_cache_read_miss_count_total` | Counter | 10.2 | Counter for cache misses for Rails cache calls | `controller`, `action` |
| `gitlab_transaction_duration_seconds` | Histogram | 10.2 | Duration for all transactions (`gitlab_transaction_*` metrics) | `controller`, `action` |
| `gitlab_transaction_duration_seconds` | Histogram | 10.2 | Duration for successful requests (`gitlab_transaction_*` metrics) | `controller`, `action` |
| `gitlab_transaction_event_build_found_total` | Counter | 9.4 | Counter for build found for API /jobs/request | |
| `gitlab_transaction_event_build_invalid_total` | Counter | 9.4 | Counter for build invalid due to concurrency conflict for API /jobs/request | |
| `gitlab_transaction_event_build_not_found_cached_total` | Counter | 9.4 | Counter for cached response of build not found for API /jobs/request | |
......@@ -91,7 +91,7 @@ The following metrics are available:
| `gitlab_transaction_view_duration_total` | Counter | 9.4 | Duration for views | `controller`, `action`, `view` |
| `gitlab_view_rendering_duration_seconds` | Histogram | 10.2 | Duration for views (histogram) | `controller`, `action`, `view` |
| `http_requests_total` | Counter | 9.4 | Rack request count | `method`, `status` |
| `http_request_duration_seconds` | Histogram | 9.4 | HTTP response time from rack middleware | `method` |
| `http_request_duration_seconds` | Histogram | 9.4 | HTTP response time from rack middleware for successful requests | `method` |
| `gitlab_transaction_db_count_total` | Counter | 13.1 | Counter for total number of SQL calls | `controller`, `action` |
| `gitlab_transaction_db_<role>_count_total` | Counter | 13.10 | Counter for total number of SQL calls, grouped by database roles (primary/replica) | `controller`, `action` |
| `gitlab_transaction_db_write_count_total` | Counter | 13.1 | Counter for total number of write SQL calls | `controller`, `action` |
......
......@@ -16,6 +16,10 @@ module Gitlab
@error
end
def self.record_duration_for_status?(status)
status.to_i.between?(200, 499)
end
# Tracks an event.
#
# See `Gitlab::Metrics::Transaction#add_event` for more details.
......
......@@ -75,7 +75,7 @@ module Gitlab
elapsed = Time.now.to_f - started
unless health_endpoint
if !health_endpoint && Gitlab::Metrics.record_duration_for_status?(status)
RequestsRackMiddleware.http_request_duration_seconds.observe({ method: method }, elapsed)
end
......
......@@ -13,8 +13,6 @@ module Gitlab
THREAD_KEY = :_gitlab_metrics_transaction
SMALL_BUCKETS = [0.1, 0.25, 0.5, 1.0, 2.5, 5.0].freeze
# The series to store events (e.g. Git pushes) in.
EVENT_SERIES = 'events'
......@@ -39,29 +37,10 @@ module Gitlab
def initialize
@methods = {}
@started_at = nil
@finished_at = nil
end
def duration
@finished_at ? (@finished_at - @started_at) : 0.0
end
def run
Thread.current[THREAD_KEY] = self
@started_at = System.monotonic_time
yield
ensure
@finished_at = System.monotonic_time
observe(:gitlab_transaction_duration_seconds, duration) do
buckets SMALL_BUCKETS
end
Thread.current[THREAD_KEY] = nil
raise NotImplementedError
end
# Tracks a business level event
......
......@@ -6,12 +6,29 @@ module Gitlab
CONTROLLER_KEY = 'action_controller.instance'
ENDPOINT_KEY = 'api.endpoint'
ALLOWED_SUFFIXES = Set.new(%w[json js atom rss xml zip])
SMALL_BUCKETS = [0.1, 0.25, 0.5, 1.0, 2.5, 5.0].freeze
def initialize(env)
super()
@env = env
end
def run
Thread.current[THREAD_KEY] = self
started_at = System.monotonic_time
status, _, _ = retval = yield
finished_at = System.monotonic_time
duration = finished_at - started_at
record_duration_if_needed(status, duration)
retval
ensure
Thread.current[THREAD_KEY] = nil
end
def labels
return @labels if @labels
......@@ -27,6 +44,14 @@ module Gitlab
private
def record_duration_if_needed(status, duration)
return unless Gitlab::Metrics.record_duration_for_status?(status)
observe(:gitlab_transaction_duration_seconds, duration) do
buckets SMALL_BUCKETS
end
end
def labels_from_controller
controller = @env[CONTROLLER_KEY]
......
......@@ -79,6 +79,17 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
end
end
context '@app.call returns an error code' do
let(:status) { '500' }
it 'tracks count but not duration' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: '500', feature_category: 'unknown')
expect(described_class).not_to receive(:http_request_duration_seconds)
subject.call(env)
end
end
context '@app.call throws exception' do
let(:http_request_duration_seconds) { double('http_request_duration_seconds') }
let(:http_requests_total) { double('http_requests_total') }
......
......@@ -12,32 +12,6 @@ RSpec.describe Gitlab::Metrics::Transaction do
}
end
describe '#duration' do
it 'returns the duration of a transaction in seconds' do
transaction.run { }
expect(transaction.duration).to be > 0
end
end
describe '#run' do
it 'yields the supplied block' do
expect { |b| transaction.run(&b) }.to yield_control
end
it 'stores the transaction in the current thread' do
transaction.run do
expect(described_class.current).to eq(transaction)
end
end
it 'removes the transaction from the current thread upon completion' do
transaction.run { }
expect(described_class.current).to be_nil
end
end
describe '#method_call_for' do
it 'returns a MethodCall' do
method = transaction.method_call_for('Foo#bar', :Foo, '#bar')
......@@ -46,6 +20,10 @@ RSpec.describe Gitlab::Metrics::Transaction do
end
end
describe '#run' do
specify { expect { transaction.run }.to raise_error(NotImplementedError) }
end
describe '#add_event' do
let(:prometheus_metric) { instance_double(Prometheus::Client::Counter, increment: nil, base_labels: {}) }
......
......@@ -38,16 +38,6 @@ RSpec.describe Gitlab::Metrics::WebTransaction do
end
end
describe '#duration' do
include_context 'transaction observe metrics'
it 'returns the duration of a transaction in seconds' do
transaction.run { sleep(0.5) }
expect(transaction.duration).to be >= 0.5
end
end
describe '#run' do
include_context 'transaction observe metrics'
......@@ -58,6 +48,9 @@ RSpec.describe Gitlab::Metrics::WebTransaction do
it 'stores the transaction in the current thread' do
transaction.run do
expect(Thread.current[described_class::THREAD_KEY]).to eq(transaction)
expect(described_class.current).to eq(transaction)
['200', {}, '']
end
end
......@@ -65,6 +58,33 @@ RSpec.describe Gitlab::Metrics::WebTransaction do
transaction.run { }
expect(Thread.current[described_class::THREAD_KEY]).to be_nil
expect(described_class.current).to be_nil
end
it 'records the duration of the transaction if the request was successful' do
expect(transaction).to receive(:observe).with(:gitlab_transaction_duration_seconds, instance_of(Float))
transaction.run { ['200', {}, ''] }
end
it 'does not record the duration of the transaction if the request failed' do
expect(transaction).not_to receive(:observe).with(:gitlab_transaction_duration_seconds, instance_of(Float))
transaction.run { ['500', {}, ''] }
end
it 'does not record the duration of the transaction if it raised' do
expect(transaction).not_to receive(:observe).with(:gitlab_transaction_duration_seconds, instance_of(Float))
expect do
transaction.run { raise 'broken' }
end.to raise_error('broken')
end
it 'returns the rack response' do
response = ['500', {}, '']
expect(transaction.run { response }).to eq(response)
end
end
......
......@@ -92,6 +92,26 @@ RSpec.describe Gitlab::Metrics do
end
end
describe '.record_status_for_duration?' do
using RSpec::Parameterized::TableSyntax
where(:status, :should_record) do
100 | false
200 | true
401 | true
nil | false
500 | false
503 | false
'100' | false
'201' | true
'nothing' | false
end
with_them do
specify { expect(described_class.record_duration_for_status?(status)).to be(should_record) }
end
end
describe '.add_event' do
context 'without a transaction' do
it 'does nothing' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment