Commit 885fb839 authored by Valery Sizov's avatar Valery Sizov

[Geo] Post geo node status feature

parent 11d47876
...@@ -321,6 +321,8 @@ Example response: ...@@ -321,6 +321,8 @@ Example response:
} }
``` ```
Please note that the `health_status` parameter can only be in a "Healthy" or "Unhealthy" state, while the `health` parameter can be empty, "Healthy", or contain the actual error message.
## Retrieve project sync failures that occurred on the current node ## Retrieve project sync failures that occurred on the current node
......
...@@ -5,9 +5,6 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -5,9 +5,6 @@ class GeoNodeStatus < ActiveRecord::Base
after_initialize :initialize_feature_flags after_initialize :initialize_feature_flags
# Whether we were successful in reaching this node
attr_accessor :success
attr_writer :health_status
attr_accessor :storage_shards attr_accessor :storage_shards
attr_accessor :repository_verification_enabled attr_accessor :repository_verification_enabled
...@@ -76,14 +73,16 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -76,14 +73,16 @@ class GeoNodeStatus < ActiveRecord::Base
hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage' hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage'
}.freeze }.freeze
EXPIRATION_IN_MINUTES = 5
HEALTHY_STATUS = 'Healthy'.freeze
UNHEALTHY_STATUS = 'Unhealthy'.freeze
def self.current_node_status def self.current_node_status
current_node = Gitlab::Geo.current_node current_node = Gitlab::Geo.current_node
return unless current_node return unless current_node
status = current_node.find_or_build_status status = current_node.find_or_build_status
# Since we're retrieving our own data, we mark this as a successful load
status.success = true
status.load_data_from_current_node status.load_data_from_current_node
status.save if Gitlab::Geo.primary? status.save if Gitlab::Geo.primary?
...@@ -92,19 +91,10 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -92,19 +91,10 @@ class GeoNodeStatus < ActiveRecord::Base
end end
def self.fast_current_node_status def self.fast_current_node_status
# Primary's status is easy to calculate so we can calculate it on the fly
return current_node_status if Gitlab::Geo.primary?
spawn_worker
attrs = Rails.cache.read(cache_key) || {} attrs = Rails.cache.read(cache_key) || {}
new(attrs) new(attrs)
end end
def self.spawn_worker
::Geo::MetricsUpdateWorker.perform_async
end
def self.cache_key def self.cache_key
"geo-node:#{Gitlab::Geo.current_node.id}:status" "geo-node:#{Gitlab::Geo.current_node.id}:status"
end end
...@@ -117,9 +107,6 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -117,9 +107,6 @@ class GeoNodeStatus < ActiveRecord::Base
EXCLUDED_PARAMS = %w[id created_at].freeze EXCLUDED_PARAMS = %w[id created_at].freeze
EXTRA_PARAMS = %w[ EXTRA_PARAMS = %w[
success
health
health_status
last_event_timestamp last_event_timestamp
cursor_last_event_timestamp cursor_last_event_timestamp
storage_shards storage_shards
...@@ -230,14 +217,30 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -230,14 +217,30 @@ class GeoNodeStatus < ActiveRecord::Base
end end
end end
alias_attribute :health, :status_message
def healthy? def healthy?
status_message.blank? || status_message == 'Healthy'.freeze !outdated? && status_message_healthy?
end
def health
if outdated?
return "Status has not been updated in the past #{EXPIRATION_IN_MINUTES} minutes"
end
status_message
end end
def health_status def health_status
@health_status || (healthy? ? 'Healthy' : 'Unhealthy') healthy? ? HEALTHY_STATUS : UNHEALTHY_STATUS
end
def outdated?
return false unless updated_at
updated_at < EXPIRATION_IN_MINUTES.minutes.ago
end
def status_message_healthy?
status_message.blank? || status_message == HEALTHY_STATUS
end end
def last_successful_status_check_timestamp def last_successful_status_check_timestamp
......
...@@ -5,45 +5,38 @@ module Geo ...@@ -5,45 +5,38 @@ module Geo
def execute def execute
return unless Gitlab::Geo.enabled? return unless Gitlab::Geo.enabled?
if Gitlab::Geo.primary? current_node_status&.update_cache!
fetch_secondary_geo_nodes_metrics
end send_status_to_primary(current_node, current_node_status) if Gitlab::Geo.secondary?
fetch_current_geo_node_metrics update_prometheus_metrics(current_node, current_node_status) if prometheus_enabled?
if Gitlab::Geo.primary? && prometheus_enabled?
Gitlab::Geo.secondary_nodes.find_each { |node| update_prometheus_metrics(node, node.status) }
end
end end
private private
def fetch_secondary_geo_nodes_metrics def current_node_status
Gitlab::Geo.secondary_nodes.find_each { |node| fetch_geo_node_metrics(node) } @current_node_status ||= GeoNodeStatus.current_node_status
end end
def fetch_current_geo_node_metrics def current_node
fetch_geo_node_metrics(Gitlab::Geo.current_node) @current_node ||= Gitlab::Geo.current_node
end end
def fetch_geo_node_metrics(node) def send_status_to_primary(node, status)
return unless node&.enabled? if !NodeStatusPostService.new.execute(status) && prometheus_enabled?
status = node_status(node)
unless status.success
increment_failed_status_counter(node) increment_failed_status_counter(node)
return
end end
update_db_metrics(node, status) if Gitlab::Geo.primary?
status.update_cache! if node.current?
update_prometheus_metrics(node, status) if Gitlab::Metrics.prometheus_metrics_enabled?
end end
def update_db_metrics(node, status) def update_prometheus_metrics(node, status)
db_status = node.find_or_build_status return unless node&.enabled?
db_status.update_attributes(status.attributes.compact.merge(last_successful_status_check_at: Time.now.utc)) return unless status
end
def update_prometheus_metrics(node, status)
GeoNodeStatus::PROMETHEUS_METRICS.each do |column, docstring| GeoNodeStatus::PROMETHEUS_METRICS.each do |column, docstring|
value = status[column] value = status[column]
...@@ -54,10 +47,6 @@ module Geo ...@@ -54,10 +47,6 @@ module Geo
end end
end end
def node_status(node)
NodeStatusFetchService.new.call(node)
end
def increment_failed_status_counter(node) def increment_failed_status_counter(node)
failed_status_counter(node).increment failed_status_counter(node).increment
end end
...@@ -65,7 +54,7 @@ module Geo ...@@ -65,7 +54,7 @@ module Geo
def failed_status_counter(node) def failed_status_counter(node)
Gitlab::Metrics.counter( Gitlab::Metrics.counter(
:geo_status_failed_total, :geo_status_failed_total,
'Total number of times status for Geo node failed to retrieve', 'Total number of times status for Geo node failed to be sent to the primary',
metric_labels(node)) metric_labels(node))
end end
...@@ -81,5 +70,9 @@ module Geo ...@@ -81,5 +70,9 @@ module Geo
def metric_labels(node) def metric_labels(node)
{ url: node.url } { url: node.url }
end end
def prometheus_enabled?
Gitlab::Metrics.prometheus_metrics_enabled?
end
end end
end end
module Geo
  # Obtains the status of a Geo node by requesting the node's status URL over
  # HTTP and merging the answer on top of the attributes of the node's
  # GeoNodeStatus record (found or newly initialized).
  class NodeStatusFetchService
    # Builds a status for +geo_node+.
    #
    # For the current node no HTTP request is made; the result of
    # GeoNodeStatus.current_node_status is returned directly.
    #
    # For any other node the data starts as "not successful / Offline" and is
    # only upgraded when the node answers. Connection and configuration errors
    # are not raised to the caller; they are reported through the `health`
    # (and, for some errors, `health_status`) entries of the data instead.
    #
    # @param geo_node [#current?, #status_url] node whose status is wanted
    # @return whatever GeoNodeStatus.from_json builds from the collected data
    def call(geo_node)
      return GeoNodeStatus.current_node_status if geo_node.current?

      data = GeoNodeStatus
        .find_or_initialize_by(geo_node: geo_node)
        .attributes
        .merge(success: false, health_status: 'Offline')

      begin
        record_response(data, request_status(geo_node))
      rescue Gitlab::Geo::GeoNodeNotFoundError
        data[:health] = 'This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.'
        data[:health_status] = 'Unhealthy'
      rescue OpenSSL::Cipher::CipherError
        data[:health] = 'Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.'
        data[:health_status] = 'Unhealthy'
      rescue Gitlab::HTTP::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => error
        data[:health] = error.message
      end

      GeoNodeStatus.from_json(data.as_json)
    end

    private

    # Performs the GET request against the node's status URL.
    def request_status(geo_node)
      Gitlab::HTTP.get(geo_node.status_url, allow_local_requests: true, headers: headers, timeout: timeout)
    end

    # Copies the outcome of +response+ into +data+ (mutating it in place).
    def record_response(data, response)
      reachable = response.success?
      data[:success] = reachable

      if !reachable
        data[:health] = failure_description(response)
      elsif response.parsed_response.is_a?(Hash)
        data.merge!(response.parsed_response)
      else
        data[:health] = 'A JSON response was not received'
      end
    end

    # Describes a non-successful HTTP response: a summary line followed by the
    # payload's 'message' entry when the payload is a Hash.
    def failure_description(response)
      summary = "Could not connect to Geo node - HTTP Status Code: #{response.code} #{response.message}"
      body = response.parsed_response
      # The return value can be a giant blob of HTML; ignore it
      extra = body.is_a?(Hash) ? body['message'] : ''

      [summary, extra].compact.join("\n")
    end

    # Request headers taken from a fresh Gitlab::Geo::BaseRequest.
    def headers
      Gitlab::Geo::BaseRequest.new.headers
    end

    # Request timeout read from the geo_status_timeout application setting.
    def timeout
      Gitlab::CurrentSettings.geo_status_timeout
    end
  end
end
module Geo
  # Sends a node status to the Geo primary node by POSTing the status
  # attributes to the primary's status URL.
  class NodeStatusPostService
    include Gitlab::Geo::LogHelpers

    # Posts +status+ to the primary node.
    #
    # Failures are never raised to the caller: a non-successful HTTP response,
    # a missing primary node, a secret decryption error and network-level
    # errors are all logged through `log_error` and reported as `false`.
    #
    # @param status [#attributes] status whose attributes form the request body
    # @return [Boolean] true when the primary answered with a successful
    #   response, false otherwise
    def execute(status)
      response = Gitlab::HTTP.post(primary_status_url, body: status.attributes, allow_local_requests: true, headers: headers, timeout: timeout)

      unless response.success?
        handle_failure_for(response)
        return false
      end

      true
    rescue Gitlab::Geo::GeoNodeNotFoundError => e
      log_error(e.to_s)
      false
    rescue OpenSSL::Cipher::CipherError => e
      log_error('Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.', e)
      false
    rescue Gitlab::HTTP::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => e
      log_error('Failed to post status data to primary', e)
      false
    end

    private

    # Logs a description of a non-successful HTTP response: a summary line,
    # followed by the payload's 'message' entry when the payload is a Hash.
    def handle_failure_for(response)
      message = "Could not connect to Geo primary node - HTTP Status Code: #{response.code} #{response.message}"
      payload = response.parsed_response

      # A non-Hash payload can be a giant blob of HTML; ignore it. Leaving
      # `details` nil (rather than '') lets `compact` drop it, so the logged
      # message does not end with a dangling newline.
      details = payload['message'] if payload.is_a?(Hash)

      log_error([message, details].compact.join("\n"))
    end

    # Status URL of the primary node.
    #
    # @raise [Gitlab::Geo::GeoNodeNotFoundError] when no primary node is found
    def primary_status_url
      primary_node = Gitlab::Geo.primary_node
      raise Gitlab::Geo::GeoNodeNotFoundError, 'Failed to look up Geo primary node in the database' unless primary_node

      primary_node.status_url
    end

    # Request headers taken from a fresh Gitlab::Geo::BaseRequest.
    def headers
      Gitlab::Geo::BaseRequest.new.headers
    end

    # Request timeout read from the geo_status_timeout application setting.
    def timeout
      Gitlab::CurrentSettings.geo_status_timeout
    end
  end
end
---
title: "[Geo] Invert the direction of Geo metrics acquisition"
merge_request: 5934
author:
type: changed
...@@ -27,15 +27,18 @@ module API ...@@ -27,15 +27,18 @@ module API
end end
end end
# Get node information (e.g. health, repos synced, repos failed, etc.) # Post current node information to primary (e.g. health, repos synced, repos failed, etc.)
# #
# Example request: # Example request:
# GET /geo/status # POST /geo/status
get 'status' do post 'status' do
authenticate_by_gitlab_geo_node_token! authenticate_by_gitlab_geo_node_token!
status = ::GeoNodeStatus.fast_current_node_status db_status = GeoNode.find(params[:geo_node_id]).find_or_build_status
present status, with: EE::API::Entities::GeoNodeStatus
unless db_status.update(params.merge(last_successful_status_check_at: Time.now.utc))
render_validation_error!(db_status)
end
end end
end end
end end
......
...@@ -67,7 +67,7 @@ module API ...@@ -67,7 +67,7 @@ module API
if geo_node.current? if geo_node.current?
GeoNodeStatus.fast_current_node_status GeoNodeStatus.fast_current_node_status
else else
::Geo::NodeStatusFetchService.new.call(geo_node) geo_node.find_or_build_status
end end
end end
end end
......
...@@ -275,7 +275,7 @@ namespace :geo do ...@@ -275,7 +275,7 @@ namespace :geo do
puts 'N/A' puts 'N/A'
end end
print 'Last status was pulled by primary node: '.rjust(COLUMN_WIDTH) print 'Last status report was: '.rjust(COLUMN_WIDTH)
if current_node_status.updated_at if current_node_status.updated_at
puts "#{time_ago_in_words(current_node_status.updated_at)} ago" puts "#{time_ago_in_words(current_node_status.updated_at)} ago"
......
...@@ -4,7 +4,7 @@ FactoryBot.define do ...@@ -4,7 +4,7 @@ FactoryBot.define do
storage_shards { StorageShard.all } storage_shards { StorageShard.all }
trait :healthy do trait :healthy do
health nil status_message nil
attachments_count 329 attachments_count 329
attachments_failed_count 13 attachments_failed_count 13
attachments_synced_count 141 attachments_synced_count 141
...@@ -43,7 +43,7 @@ FactoryBot.define do ...@@ -43,7 +43,7 @@ FactoryBot.define do
end end
trait :unhealthy do trait :unhealthy do
health "Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\nTest" status_message "Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\nTest"
end end
end end
end end
...@@ -22,7 +22,7 @@ describe EE::API::Entities::GeoNodeStatus, :postgresql do ...@@ -22,7 +22,7 @@ describe EE::API::Entities::GeoNodeStatus, :postgresql do
context 'when node is unhealthy' do context 'when node is unhealthy' do
before do before do
geo_node_status.health = error geo_node_status.status_message = error
end end
subject { entity.as_json } subject { entity.as_json }
...@@ -36,13 +36,13 @@ describe EE::API::Entities::GeoNodeStatus, :postgresql do ...@@ -36,13 +36,13 @@ describe EE::API::Entities::GeoNodeStatus, :postgresql do
describe '#health' do describe '#health' do
context 'when node is healthy' do context 'when node is healthy' do
it 'exposes the health message' do it 'exposes the health message' do
expect(subject[:health]).to eq 'Healthy' expect(subject[:health]).to eq GeoNodeStatus::HEALTHY_STATUS
end end
end end
context 'when node is unhealthy' do context 'when node is unhealthy' do
before do before do
geo_node_status.health = error geo_node_status.status_message = error
end end
subject { entity.as_json } subject { entity.as_json }
......
...@@ -20,20 +20,12 @@ describe GeoNodeStatus, :geo do ...@@ -20,20 +20,12 @@ describe GeoNodeStatus, :geo do
describe '#fast_current_node_status' do describe '#fast_current_node_status' do
it 'reads the cache and spawns the worker' do it 'reads the cache and spawns the worker' do
expect(described_class).to receive(:spawn_worker).once
rails_cache = double rails_cache = double
expect(rails_cache).to receive(:read).with(described_class.cache_key) expect(rails_cache).to receive(:read).with(described_class.cache_key)
expect(Rails).to receive(:cache).and_return(rails_cache) expect(Rails).to receive(:cache).and_return(rails_cache)
described_class.fast_current_node_status described_class.fast_current_node_status
end end
it 'returns status for primary with no cache' do
stub_current_geo_node(primary)
expect(described_class.fast_current_node_status).to eq described_class.current_node_status
end
end end
describe '#update_cache!' do describe '#update_cache!' do
...@@ -57,7 +49,7 @@ describe GeoNodeStatus, :geo do ...@@ -57,7 +49,7 @@ describe GeoNodeStatus, :geo do
context 'when health is present' do context 'when health is present' do
it 'returns true' do it 'returns true' do
subject.status_message = 'Healthy' subject.status_message = GeoNodeStatus::HEALTHY_STATUS
expect(subject.healthy?).to be true expect(subject.healthy?).to be true
end end
...@@ -68,6 +60,36 @@ describe GeoNodeStatus, :geo do ...@@ -68,6 +60,36 @@ describe GeoNodeStatus, :geo do
expect(subject.healthy?).to be false expect(subject.healthy?).to be false
end end
end end
context 'takes outdated? into consideration' do
it 'return false' do
subject.status_message = GeoNodeStatus::HEALTHY_STATUS
subject.updated_at = 10.minutes.ago
expect(subject.healthy?).to be false
end
it 'return false' do
subject.status_message = 'something went wrong'
subject.updated_at = 1.minute.ago
expect(subject.healthy?).to be false
end
end
end
describe '#outdated?' do
it 'return true' do
subject.updated_at = 10.minutes.ago
expect(subject.outdated?).to be true
end
it 'return false' do
subject.updated_at = 1.minute.ago
expect(subject.outdated?).to be false
end
end end
describe '#status_message' do describe '#status_message' do
...@@ -78,6 +100,24 @@ describe GeoNodeStatus, :geo do ...@@ -78,6 +100,24 @@ describe GeoNodeStatus, :geo do
end end
end end
describe '#health' do
context 'takes outdated? into consideration' do
it 'returns expiration error' do
subject.status_message = GeoNodeStatus::HEALTHY_STATUS
subject.updated_at = 10.minutes.ago
expect(subject.health).to eq "Status has not been updated in the past #{described_class::EXPIRATION_IN_MINUTES} minutes"
end
it 'returns original message' do
subject.status_message = 'something went wrong'
subject.updated_at = 1.minute.ago
expect(subject.health).to eq 'something went wrong'
end
end
end
# Disable transactions via :delete method because a foreign table # Disable transactions via :delete method because a foreign table
# can't see changes inside a transaction of a different connection. # can't see changes inside a transaction of a different connection.
describe '#attachments_synced_count', :delete do describe '#attachments_synced_count', :delete do
......
...@@ -93,7 +93,6 @@ describe API::GeoNodes, :geo, :prometheus, api: true do ...@@ -93,7 +93,6 @@ describe API::GeoNodes, :geo, :prometheus, api: true do
stub_current_geo_node(secondary) stub_current_geo_node(secondary)
expect(GeoNode).to receive(:find).and_return(secondary) expect(GeoNode).to receive(:find).and_return(secondary)
expect(GeoNodeStatus).to receive(:current_node_status).and_call_original
get api("/geo_nodes/#{secondary.id}/status", admin) get api("/geo_nodes/#{secondary.id}/status", admin)
......
...@@ -13,20 +13,21 @@ describe API::Geo do ...@@ -13,20 +13,21 @@ describe API::Geo do
{ 'X-Gitlab-Token' => secondary_node.system_hook.token } { 'X-Gitlab-Token' => secondary_node.system_hook.token }
end end
before do
stub_current_geo_node(secondary_node)
end
shared_examples 'with terms enforced' do shared_examples 'with terms enforced' do
before do before do
enforce_terms enforce_terms
end end
it 'responds with 200' do it 'responds with 2xx HTTP response code' do
request request
expect(response).to have_gitlab_http_status(200) expect(response).to have_gitlab_http_status(:success)
end
end end
describe '/geo/transfers' do
before do
stub_current_geo_node(secondary_node)
end end
describe 'GET /geo/transfers/attachment/1' do describe 'GET /geo/transfers/attachment/1' do
...@@ -212,14 +213,52 @@ describe API::Geo do ...@@ -212,14 +213,52 @@ describe API::Geo do
end end
end end
end end
end
describe 'GET /geo/status', :postgresql do describe 'POST /geo/status', :postgresql do
let(:geo_base_request) { Gitlab::Geo::BaseRequest.new } let(:geo_base_request) { Gitlab::Geo::BaseRequest.new }
subject(:request) { get api('/geo/status'), nil, geo_base_request.headers } let(:data) do
{
geo_node_id: secondary_node.id,
status_message: nil,
db_replication_lag_seconds: 0,
repositories_count: 10,
repositories_synced_count: 1,
repositories_failed_count: 2,
wikis_count: 10,
wikis_synced_count: 2,
wikis_failed_count: 3,
lfs_objects_count: 100,
lfs_objects_synced_count: 50,
lfs_objects_failed_count: 12,
lfs_objects_synced_missing_on_primary_count: 4,
job_artifacts_count: 100,
job_artifacts_synced_count: 50,
job_artifacts_failed_count: 12,
job_artifacts_synced_missing_on_primary_count: 5,
attachments_count: 30,
attachments_synced_count: 30,
attachments_failed_count: 25,
attachments_synced_missing_on_primary_count: 6,
last_event_id: 2,
last_event_date: Time.now.utc,
cursor_last_event_id: 1,
cursor_last_event_date: Time.now.utc,
event_log_count: 55,
event_log_max_id: 555,
repository_created_max_id: 43,
repository_updated_max_id: 132,
repository_deleted_max_id: 23,
repository_renamed_max_id: 11,
repositories_changed_max_id: 109
}
end
subject(:request) { post api('/geo/status'), data, geo_base_request.headers }
it 'responds with 401 with invalid auth header' do it 'responds with 401 with invalid auth header' do
get api('/geo/status'), nil, Authorization: 'Test' post api('/geo/status'), nil, Authorization: 'Test'
expect(response).to have_gitlab_http_status(401) expect(response).to have_gitlab_http_status(401)
end end
...@@ -232,34 +271,17 @@ describe API::Geo do ...@@ -232,34 +271,17 @@ describe API::Geo do
expect(response).to have_gitlab_http_status(401) expect(response).to have_gitlab_http_status(401)
end end
context 'when requesting secondary node with valid auth header' do
before do
stub_current_geo_node(secondary_node)
allow(geo_base_request).to receive(:requesting_node) { primary_node }
allow(::GeoNodeStatus).to receive(:fast_current_node_status).and_return(::GeoNodeStatus.current_node_status)
end
it 'responds with 200' do
request
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('public_api/v4/geo_node_status', dir: 'ee')
end
it_behaves_like 'with terms enforced'
end
context 'when requesting primary node with valid auth header' do context 'when requesting primary node with valid auth header' do
before do before do
stub_current_geo_node(primary_node) stub_current_geo_node(primary_node)
allow(geo_base_request).to receive(:requesting_node) { secondary_node } allow(geo_base_request).to receive(:requesting_node) { secondary_node }
end end
it 'responds with 200' do it 'updates the status and responds with 201' do
request expect { request }.to change { GeoNodeStatus.count }.by(1)
expect(response).to have_gitlab_http_status(200) expect(response).to have_gitlab_http_status(201)
expect(response).to match_response_schema('public_api/v4/geo_node_status', dir: 'ee') expect(secondary_node.reload.status.repositories_count).to eq(10)
end end
it_behaves_like 'with terms enforced' it_behaves_like 'with terms enforced'
......
...@@ -13,7 +13,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -13,7 +13,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
let(:data) do let(:data) do
{ {
success: true,
status_message: nil, status_message: nil,
db_replication_lag_seconds: 0, db_replication_lag_seconds: 0,
repositories_count: 10, repositories_count: 10,
...@@ -54,7 +53,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -54,7 +53,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
let(:primary_data) do let(:primary_data) do
{ {
success: true,
status_message: nil, status_message: nil,
repositories_count: 10, repositories_count: 10,
wikis_count: 10, wikis_count: 10,
...@@ -79,8 +77,8 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -79,8 +77,8 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
describe '#execute' do describe '#execute' do
before do before do
request = double(success?: true, parsed_response: data.stringify_keys, code: 200) response = double(success?: true, parsed_response: data.stringify_keys, code: 200)
allow(Gitlab::HTTP).to receive(:get).and_return(request) allow(Gitlab::HTTP).to receive(:post).and_return(response)
end end
context 'when current node is nil' do context 'when current node is nil' do
...@@ -88,8 +86,8 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -88,8 +86,8 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
stub_current_geo_node(nil) stub_current_geo_node(nil)
end end
it 'skips fetching the status' do it 'skips posting the status' do
expect(Gitlab::HTTP).to receive(:get).never expect(Gitlab::HTTP).to receive(:post).never
subject.execute subject.execute
end end
...@@ -100,9 +98,21 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -100,9 +98,21 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
stub_current_geo_node(primary) stub_current_geo_node(primary)
end end
it 'attempts to retrieve metrics from all nodes' do it 'updates the cache' do
status = GeoNodeStatus.from_json(primary_data.as_json)
allow(GeoNodeStatus).to receive(:current_node_status).and_return(status)
expect(status).to receive(:update_cache!)
subject.execute
end
it 'updates metrics for all nodes' do
allow(GeoNodeStatus).to receive(:current_node_status).and_return(GeoNodeStatus.from_json(primary_data.as_json)) allow(GeoNodeStatus).to receive(:current_node_status).and_return(GeoNodeStatus.from_json(primary_data.as_json))
secondary.update(status: GeoNodeStatus.from_json(data.as_json))
another_secondary.update(status: GeoNodeStatus.from_json(data.as_json))
subject.execute subject.execute
expect(Gitlab::Metrics.registry.get(:geo_db_replication_lag_seconds).values.count).to eq(2) expect(Gitlab::Metrics.registry.get(:geo_db_replication_lag_seconds).values.count).to eq(2)
...@@ -113,29 +123,21 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -113,29 +123,21 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
end end
it 'updates the GeoNodeStatus entry' do it 'updates the GeoNodeStatus entry' do
expect { subject.execute }.to change { GeoNodeStatus.count }.by(3) expect { subject.execute }.to change { GeoNodeStatus.count }.by(1)
status = secondary.status.load_data_from_current_node
expect(status.geo_node_id).to eq(secondary.id)
expect(status.last_successful_status_check_at).not_to be_nil
end
it 'updates only the active node' do
secondary.update_attributes(enabled: false)
expect { subject.execute }.to change { GeoNodeStatus.count }.by(2)
expect(another_secondary.status).not_to be_nil
end end
end end
context 'when node is a secondary' do context 'when node is a secondary' do
subject { described_class.new }
before do before do
stub_current_geo_node(secondary) stub_current_geo_node(secondary)
allow(subject).to receive(:node_status).and_return(GeoNodeStatus.new(data)) @status = GeoNodeStatus.new(data.as_json)
allow(GeoNodeStatus).to receive(:current_node_status).and_return(@status)
end
it 'updates the cache' do
expect(@status).to receive(:update_cache!)
subject.execute
end end
it 'adds gauges for various metrics' do it 'adds gauges for various metrics' do
...@@ -179,7 +181,7 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -179,7 +181,7 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
end end
it 'increments a counter when metrics fail to retrieve' do it 'increments a counter when metrics fail to retrieve' do
allow(subject).to receive(:node_status).and_return(GeoNodeStatus.new(success: false)) allow_any_instance_of(Geo::NodeStatusPostService).to receive(:execute).and_return(false)
# Run once to get the gauge set # Run once to get the gauge set
subject.execute subject.execute
...@@ -187,16 +189,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do ...@@ -187,16 +189,6 @@ describe Geo::MetricsUpdateService, :geo, :prometheus do
expect { subject.execute }.to change { metric_value(:geo_status_failed_total) }.by(1) expect { subject.execute }.to change { metric_value(:geo_status_failed_total) }.by(1)
end end
it 'updates cache' do
status = GeoNodeStatus.new(success: true)
expect(status).to receive(:update_cache!)
allow(subject).to receive(:node_status).and_return(status)
subject.execute
end
it 'does not create GeoNodeStatus entries' do it 'does not create GeoNodeStatus entries' do
expect { subject.execute }.to change { GeoNodeStatus.count }.by(0) expect { subject.execute }.to change { GeoNodeStatus.count }.by(0)
end end
......
require 'spec_helper'
# Exercises Geo::NodeStatusFetchService#call against stubbed Gitlab::HTTP.get
# responses and errors, checking how each outcome is reflected in the
# returned status.
describe Geo::NodeStatusFetchService, :geo do
  include ::EE::GeoHelpers

  set(:primary) { create(:geo_node, :primary) }
  set(:secondary) { create(:geo_node) }

  subject { described_class.new }

  describe '#call' do
    it 'parses a 401 response' do
      request = double(success?: false,
                       code: 401,
                       message: 'Unauthorized',
                       parsed_response: { 'message' => 'Test' } )
      allow(Gitlab::HTTP).to receive(:get).and_return(request)

      status = subject.call(secondary)

      expect(status.status_message).to eq("Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\nTest")
    end

    it 'always reload GeoNodeStatus if current node' do
      stub_current_geo_node(secondary)

      expect(GeoNodeStatus).to receive(:current_node_status).and_call_original

      status = subject.call(secondary)

      expect(status).to be_a(GeoNodeStatus)
    end

    it 'ignores certain parameters' do
      yesterday = Date.yesterday
      request = double(success?: true,
                       code: 200,
                       message: 'Unauthorized',
                       parsed_response: {
                         'id' => 5000,
                         'last_successful_status_check_at' => yesterday,
                         'created_at' => yesterday,
                         'updated_at' => yesterday
                       })
      # The request is issued through Gitlab::HTTP.get, so that is the call
      # to stub; a stub on described_class.get is never reached by the service.
      allow(Gitlab::HTTP).to receive(:get).and_return(request)

      status = subject.call(secondary)

      expect(status.id).not_to be(5000)
      expect(status.last_successful_status_check_at).not_to be(yesterday)
      expect(status.created_at).not_to be(yesterday)
      expect(status.updated_at).not_to be(yesterday)
    end

    it 'parses a 200 legacy response' do
      data = { health: 'OK',
               db_replication_lag_seconds: 0,
               repositories_count: 10,
               repositories_synced_count: 1,
               repositories_failed_count: 2,
               lfs_objects_count: 100,
               lfs_objects_synced_count: 50,
               lfs_objects_failed_count: 12,
               job_artifacts_count: 100,
               job_artifacts_synced_count: 50,
               job_artifacts_failed_count: 12,
               attachments_count: 30,
               attachments_synced_count: 30,
               attachments_failed_count: 25,
               last_event_id: 2,
               last_event_timestamp: Time.now.to_i,
               cursor_last_event_id: 1,
               cursor_last_event_timestamp: Time.now.to_i }
      request = double(success?: true, parsed_response: data.stringify_keys, code: 200)
      allow(Gitlab::HTTP).to receive(:get).and_return(request)

      status = subject.call(secondary)

      expect(status).to have_attributes(data)
      expect(status.success).to be true
    end

    it 'handles invalid JSON response' do
      request = double(success?: true,
                       code: 200,
                       message: 'Something here',
                       parsed_response: 'Something here')
      allow(Gitlab::HTTP).to receive(:get).and_return(request)

      status = subject.call(secondary)

      expect(status.status_message).to eq("A JSON response was not received")
    end

    it 'omits full response text in status' do
      request = double(success?: false,
                       code: 401,
                       message: 'Unauthorized',
                       parsed_response: '<html><h1>You are not allowed</h1></html>')
      allow(Gitlab::HTTP).to receive(:get).and_return(request)

      status = subject.call(secondary)

      expect(status.status_message).to eq("Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\n")
      expect(status.success).to be false
    end

    it 'alerts on bad SSL certificate' do
      message = 'bad certificate'
      allow(Gitlab::HTTP).to receive(:get).and_raise(OpenSSL::SSL::SSLError.new(message))

      status = subject.call(secondary)

      expect(status.status_message).to eq(message)
    end

    it 'handles connection refused' do
      allow(Gitlab::HTTP).to receive(:get).and_raise(Errno::ECONNREFUSED.new('bad connection'))

      status = subject.call(secondary)

      expect(status.status_message).to eq('Connection refused - bad connection')
    end

    it 'returns meaningful error message when primary uses incorrect db key' do
      allow_any_instance_of(GeoNode).to receive(:secret_access_key).and_raise(OpenSSL::Cipher::CipherError)

      status = subject.call(secondary)

      expect(status.status_message).to eq('Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.')
    end

    it 'gracefully handles case when primary is deleted' do
      primary.destroy!

      status = subject.call(secondary)

      expect(status.status_message).to eq('This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.')
    end

    it 'returns the status from database if it could not fetch it' do
      allow(Gitlab::HTTP).to receive(:get).and_raise(Errno::ECONNREFUSED.new('bad connection'))
      db_status = create(:geo_node_status, :healthy, geo_node: secondary)

      status = subject.call(secondary)

      expect(status.status_message).to eq('Connection refused - bad connection')
      expect(status).not_to be_healthy
      expect(status.attachments_count).to eq(db_status.attachments_count)
      expect(status.attachments_failed_count).to eq(db_status.attachments_failed_count)
      expect(status.attachments_synced_count).to eq(db_status.attachments_synced_count)
      expect(status.lfs_objects_count).to eq(db_status.lfs_objects_count)
      expect(status.lfs_objects_failed_count).to eq(db_status.lfs_objects_failed_count)
      expect(status.lfs_objects_synced_count).to eq(db_status.lfs_objects_synced_count)
      expect(status.job_artifacts_count).to eq(db_status.job_artifacts_count)
      expect(status.job_artifacts_failed_count).to eq(db_status.job_artifacts_failed_count)
      expect(status.job_artifacts_synced_count).to eq(db_status.job_artifacts_synced_count)
      expect(status.repositories_count).to eq(db_status.repositories_count)
      expect(status.repositories_synced_count).to eq(db_status.repositories_synced_count)
      expect(status.repositories_failed_count).to eq(db_status.repositories_failed_count)
      expect(status.last_event_id).to eq(db_status.last_event_id)
      expect(status.last_event_timestamp).to eq(db_status.last_event_timestamp)
      expect(status.cursor_last_event_id).to eq(db_status.cursor_last_event_id)
      expect(status.cursor_last_event_timestamp).to eq(db_status.cursor_last_event_timestamp)
      expect(status.last_successful_status_check_timestamp).to eq(db_status.last_successful_status_check_timestamp)
    end
  end
end
require 'spec_helper'
# Exercises Geo::NodeStatusPostService#execute against stubbed
# Gitlab::HTTP.post responses and errors, checking the logged error and the
# boolean result for each outcome.
describe Geo::NodeStatusPostService, :geo do
  include ::EE::GeoHelpers
  include ApiHelpers

  set(:primary) { create(:geo_node, :primary) }
  set(:secondary) { create(:geo_node) }

  # `execute` takes a node status (it reads `status.attributes`), so every
  # example hands it the secondary's status rather than the GeoNode itself.
  let(:status) { secondary.find_or_build_status }

  subject { described_class.new }

  describe '#execute' do
    it 'parses a 401 response' do
      response = double(success?: false,
                        code: 401,
                        message: 'Unauthorized',
                        parsed_response: { 'message' => 'Test' } )
      allow(Gitlab::HTTP).to receive(:post).and_return(response)
      expect(subject).to receive(:log_error).with("Could not connect to Geo primary node - HTTP Status Code: 401 Unauthorized\nTest")

      expect(subject.execute(status)).to be_falsey
    end

    it 'alerts on bad SSL certificate' do
      message = 'bad certificate'
      allow(Gitlab::HTTP).to receive(:post).and_raise(OpenSSL::SSL::SSLError.new(message))
      expect(subject).to receive(:log_error).with('Failed to post status data to primary', kind_of(OpenSSL::SSL::SSLError))

      expect(subject.execute(status)).to be_falsey
    end

    it 'handles connection refused' do
      allow(Gitlab::HTTP).to receive(:post).and_raise(Errno::ECONNREFUSED.new('bad connection'))
      expect(subject).to receive(:log_error).with('Failed to post status data to primary', kind_of(Errno::ECONNREFUSED))

      expect(subject.execute(status)).to be_falsey
    end

    it 'returns meaningful error message when primary uses incorrect db key' do
      allow_any_instance_of(GeoNode).to receive(:secret_access_key).and_raise(OpenSSL::Cipher::CipherError)
      expect(subject).to receive(:log_error).with(
        "Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.",
        kind_of(OpenSSL::Cipher::CipherError)
      )

      expect(subject.execute(status)).to be_falsey
    end

    it 'gracefully handles case when primary is deleted' do
      primary.destroy!
      expect(subject).to receive(:log_error).with(
        'Failed to look up Geo primary node in the database'
      )

      expect(subject.execute(status)).to be_falsey
    end

    it 'sends geo_node_id in the request' do
      stub_current_geo_node(primary)

      expect(Gitlab::HTTP).to receive(:post)
        .with(
          primary.status_url,
          hash_including(body: hash_including('geo_node_id' => secondary.id)))
        .and_return(double(success?: true))

      subject.execute(
        GeoNodeStatus.new(
          geo_node_id: secondary.id,
          status_message: nil,
          db_replication_lag_seconds: 0,
          repositories_count: 10
        )
      )
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment