Commit fbc32fad authored by Heinrich Lee Yu's avatar Heinrich Lee Yu

Merge branch '222530-improve-instance-label-handling' into 'master'

Increase reliability of node to service mappings

Closes #222530

See merge request gitlab-org/gitlab!35756
parents 3a2073f9 27a9888e
...@@ -24,6 +24,7 @@ module Gitlab ...@@ -24,6 +24,7 @@ module Gitlab
def topology_usage_data def topology_usage_data
@failures = [] @failures = []
@instances = Set[]
topology_data, duration = measure_duration { topology_fetch_all_data } topology_data, duration = measure_duration { topology_fetch_all_data }
{ {
topology: topology_data topology: topology_data
...@@ -68,8 +69,7 @@ module Gitlab ...@@ -68,8 +69,7 @@ module Gitlab
by_instance_by_job_process_count = topology_all_service_process_count(client) by_instance_by_job_process_count = topology_all_service_process_count(client)
by_instance_by_job_server_types = topology_all_service_server_types(client) by_instance_by_job_server_types = topology_all_service_server_types(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys) @instances.map do |instance|
instances.map do |instance|
{ {
node_memory_total_bytes: by_instance_mem[instance], node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance], node_cpus: by_instance_cpus[instance],
...@@ -203,8 +203,28 @@ module Gitlab ...@@ -203,8 +203,28 @@ module Gitlab
all_instance_data.filter { |metric, _value| metric['instance'] == instance } all_instance_data.filter { |metric, _value| metric['instance'] == instance }
end end
def drop_port(instance) def normalize_instance_label(instance)
instance.gsub(/:.+$/, '') normalize_localhost_address(drop_port_number(instance))
end
def normalize_localhost_address(instance)
ip_addr = IPAddr.new(instance)
is_local_ip = ip_addr.loopback? || ip_addr.to_i.zero?
is_local_ip ? 'localhost' : instance
rescue IPAddr::InvalidAddressError
# This most likely means it was a host name, not an IP address
instance
end
def drop_port_number(instance)
instance.gsub(/:\d+$/, '')
end
def normalize_and_track_instance(instance)
normalize_instance_label(instance).tap do |normalized_instance|
@instances << normalized_instance
end
end end
def one_week_average(query) def one_week_average(query)
...@@ -212,13 +232,13 @@ module Gitlab ...@@ -212,13 +232,13 @@ module Gitlab
end end
def aggregate_by_instance(client, query) def aggregate_by_instance(client, query)
client.aggregate(query) { |metric| drop_port(metric['instance']) } client.aggregate(query) { |metric| normalize_and_track_instance(metric['instance']) }
end end
# Will retain a composite key that values are mapped to # Will retain a composite key that values are mapped to
def aggregate_by_labels(client, query) def aggregate_by_labels(client, query)
client.aggregate(query) do |metric| client.aggregate(query) do |metric|
metric['instance'] = drop_port(metric['instance']) metric['instance'] = normalize_and_track_instance(metric['instance'])
metric metric
end end
end end
...@@ -227,7 +247,7 @@ module Gitlab ...@@ -227,7 +247,7 @@ module Gitlab
# @return [Hash] mapping instance to a hash of target labels key/value, or the empty hash if input empty vector # @return [Hash] mapping instance to a hash of target labels key/value, or the empty hash if input empty vector
def map_instance_labels(query_result_vector, target_labels) def map_instance_labels(query_result_vector, target_labels)
query_result_vector.to_h do |result| query_result_vector.to_h do |result|
key = drop_port(result['metric']['instance']) key = normalize_and_track_instance(result['metric']['instance'])
value = result['metric'].slice(*target_labels).symbolize_keys value = result['metric'].slice(*target_labels).symbolize_keys
[key, value] [key, value]
end end
......
...@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do ...@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do
end end
end end
context 'and services run on the same node but report different instance values' do
let(:node_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:9100' },
'value' => [1000, '512']
}
]
end
let(:node_uname_info_response) do
[
{
"metric" => {
"__name__" => "node_uname_info",
"domainname" => "(none)",
"instance" => "127.0.0.1:9100",
"job" => "node_exporter",
"machine" => "x86_64",
"nodename" => "127.0.0.1",
"release" => "4.19.76-linuxkit",
"sysname" => "Linux"
},
"value" => [1592463033.359, "1"]
}
]
end
# The services in this response should all be mapped to localhost i.e. the same node
let(:service_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '10']
},
{
'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '11']
},
{
'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
'value' => [1000, '12']
},
{
'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
'value' => [1000, '13']
},
{
'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
'value' => [1000, '14']
}
]
end
it 'normalizes equivalent instance values and maps them to the same node' do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: node_memory_response),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: node_uname_info_response),
receive_node_service_memory_rss_query(result: service_memory_response),
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_memory_total_bytes: 512,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
release: '4.19.76-linuxkit'
},
node_services: [
{
name: 'web',
process_memory_rss: 10
},
{
name: 'sidekiq',
process_memory_rss: 11
},
{
name: 'prometheus',
process_memory_rss: 12
},
{
name: 'redis',
process_memory_rss: 13
},
{
name: 'postgres',
process_memory_rss: 14
}
]
}
]
})
end
end
context 'and node metrics are missing but service metrics exist' do
it 'still reports service metrics' do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: []),
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'node_uname_info' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_services: [
{
name: 'web',
process_memory_rss: 300
},
{
name: 'sidekiq',
process_memory_rss: 303
}
]
},
{
node_services: [
{
name: 'sidekiq',
process_memory_rss: 400
},
{
name: 'redis',
process_memory_rss: 402
}
]
}
]
})
end
end
context 'and an error is raised when querying Prometheus' do context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query) expect_prometheus_api_to receive(:query)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment