Commit 32958f14 authored by Qingyu Zhao's avatar Qingyu Zhao

Skip subsequent topology Prometheus queries if a timeout occurs

- Once a timeout error occurs, all subsequent queries will likely fail for
  the same reason. That leads to a long duration_s since we have multiple
  queries. To avoid this, this MR cancels subsequent queries in this case.
parent ccaa07a1
---
title: Skip subsequent topology Prometheus queries if timeout occur
merge_request: 38293
author:
type: performance
......@@ -17,6 +17,9 @@ module Gitlab
'registry' => 'registry'
}.freeze
# If one of these errors occurs, all subsequent queries are likely to fail with the same error
TIMEOUT_ERRORS = [Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout].freeze
CollectionFailure = Struct.new(:query, :error) do
def to_h
{ query => error }
......@@ -158,6 +161,11 @@ module Gitlab
end
def query_safely(query, query_name, fallback:)
if timeout_error_exists?
@failures << CollectionFailure.new(query_name, 'timeout_cancellation')
return fallback
end
result = yield query
return result if result.present?
......@@ -169,6 +177,14 @@ module Gitlab
fallback
end
# Tells whether a timeout has already been recorded among the collected failures.
#
# Each entry of @failures is compared, by its `error` value, against the
# stringified class names listed in TIMEOUT_ERRORS.
#
# @return [Boolean] true if at least one recorded failure names a timeout error
def timeout_error_exists?
  known_timeouts = Set.new(TIMEOUT_ERRORS) { |error_class| error_class.to_s }
  @failures.any? { |recorded| known_timeouts.include?(recorded.error) }
end
def topology_node_services(instance, all_process_counts, all_process_memory, all_server_types)
# returns all node service data grouped by service name as the key
instance_service_data =
......
......@@ -402,28 +402,61 @@ RSpec.describe Gitlab::UsageData::Topology do
end
context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query)
.at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
context 'without timeout failures' do
it 'returns empty result and executes subsequent queries as usual' do
expect_prometheus_api_to receive(:query)
.at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
],
nodes: []
})
end
end
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
],
nodes: []
})
context 'with timeout failures' do
where(:exception) do
described_class::TIMEOUT_ERRORS
end
with_them do
it 'returns empty result and cancelled subsequent queries' do
expect_prometheus_api_to receive(:query)
.and_raise(exception)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => exception.to_s },
{ 'node_memory' => 'timeout_cancellation' },
{ 'node_memory_utilization' => 'timeout_cancellation' },
{ 'node_cpus' => 'timeout_cancellation' },
{ 'node_cpu_utilization' => 'timeout_cancellation' },
{ 'node_uname_info' => 'timeout_cancellation' },
{ 'service_rss' => 'timeout_cancellation' },
{ 'service_uss' => 'timeout_cancellation' },
{ 'service_pss' => 'timeout_cancellation' },
{ 'service_process_count' => 'timeout_cancellation' },
{ 'service_workers' => 'timeout_cancellation' }
],
nodes: []
})
end
end
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment