Commit 917f9c4c authored by Stan Hu

Add a health check for the state of Geo replication lag on the secondary node

Closes #1929
parent a240dbab
...@@ -13,6 +13,7 @@ class GeoNodeStatus { ...@@ -13,6 +13,7 @@ class GeoNodeStatus {
this.$el = $(el); this.$el = $(el);
this.$icon = $('.js-geo-node-icon', this.$el); this.$icon = $('.js-geo-node-icon', this.$el);
this.$loadingIcon = $('.js-geo-node-loading', this.$el); this.$loadingIcon = $('.js-geo-node-loading', this.$el);
// Scope the lookup to this node's root element. `this.$status` is assigned
// further down in the constructor, so using it as the jQuery context here
// would pass `undefined` and search the whole document instead of this node.
this.$dbReplicationLag = $('.js-db-replication-lag', this.$el);
this.$healthStatus = $('.js-health-status', this.$el); this.$healthStatus = $('.js-health-status', this.$el);
this.$status = $('.js-geo-node-status', this.$el); this.$status = $('.js-geo-node-status', this.$el);
this.$repositoriesSynced = $('.js-repositories-synced', this.$status); this.$repositoriesSynced = $('.js-repositories-synced', this.$status);
...@@ -36,6 +37,15 @@ class GeoNodeStatus { ...@@ -36,6 +37,15 @@ class GeoNodeStatus {
$.getJSON(this.endpoint, (status) => { $.getJSON(this.endpoint, (status) => {
this.setStatusIcon(status.healthy); this.setStatusIcon(status.healthy);
this.setHealthStatus(status.healthy); this.setHealthStatus(status.healthy);
// Replication lag is null when the secondary isn't actually streaming.
// Check for null/undefined explicitly rather than truthiness: a lag of 0
// seconds (secondary fully caught up) is falsy but is a valid value and
// must not be displayed as UNKNOWN.
if (status.db_replication_lag === null || status.db_replication_lag === undefined) {
  this.$dbReplicationLag.html('UNKNOWN');
} else {
  const parsedTime = gl.utils.prettyTime.parseSeconds(status.db_replication_lag);
  this.$dbReplicationLag.html(gl.utils.prettyTime.stringifyTime(parsedTime));
}
this.$repositoriesSynced.html(`${status.repositories_synced_count}/${status.repositories_count} (${status.repositories_synced_in_percentage})`); this.$repositoriesSynced.html(`${status.repositories_synced_count}/${status.repositories_count} (${status.repositories_synced_in_percentage})`);
this.$repositoriesFailed.html(status.repositories_failed_count); this.$repositoriesFailed.html(status.repositories_failed_count);
this.$lfsObjectsSynced.html(`${status.lfs_objects_synced_count}/${status.lfs_objects_count} (${status.lfs_objects_synced_in_percentage})`); this.$lfsObjectsSynced.html(`${status.lfs_objects_synced_count}/${status.lfs_objects_count} (${status.lfs_objects_synced_in_percentage})`);
......
...@@ -14,6 +14,14 @@ class GeoNodeStatus ...@@ -14,6 +14,14 @@ class GeoNodeStatus
health.blank? health.blank?
end end
# Returns the database replication lag for this node.
#
# The value is taken from Gitlab::Geo::HealthCheck.db_replication_lag the
# first time it is needed, unless one was already assigned through
# #db_replication_lag=.
#
# `nil` is a legitimate value here (no lag could be determined), so the
# memoization checks whether the instance variable has been assigned rather
# than using `||=`, which would re-run the health check on every call and
# would override an explicitly assigned `nil`.
def db_replication_lag
  return @db_replication_lag if defined?(@db_replication_lag)

  @db_replication_lag = Gitlab::Geo::HealthCheck.db_replication_lag
end
# Stores a replication lag value on this status object so that
# it does not have to be looked up from the health check.
def db_replication_lag=(lag)
  @db_replication_lag = lag
end
def repositories_count def repositories_count
@repositories_count ||= repositories.count @repositories_count ||= repositories.count
end end
......
...@@ -14,6 +14,8 @@ class GeoNodeStatusEntity < Grape::Entity ...@@ -14,6 +14,8 @@ class GeoNodeStatusEntity < Grape::Entity
number_to_percentage(node.attachments_synced_in_percentage, precision: 2) number_to_percentage(node.attachments_synced_in_percentage, precision: 2)
end end
expose :db_replication_lag
expose :lfs_objects_count expose :lfs_objects_count
expose :lfs_objects_synced_count expose :lfs_objects_synced_count
expose :lfs_objects_synced_in_percentage do |node| expose :lfs_objects_synced_in_percentage do |node|
......
...@@ -5,6 +5,7 @@ module Geo ...@@ -5,6 +5,7 @@ module Geo
KEYS = %w( KEYS = %w(
health health
db_replication_lag
repositories_count repositories_count
repositories_synced_count repositories_synced_count
repositories_failed_count repositories_failed_count
......
...@@ -45,6 +45,10 @@ ...@@ -45,6 +45,10 @@
%span.help-block %span.help-block
Health Status: Health Status:
%span.js-health-status %span.js-health-status
%p
%span.help-block
Database replication lag:
%strong.node-info.js-db-replication-lag
%p %p
%span.help-block %span.help-block
Repositories synced: Repositories synced:
......
...@@ -947,6 +947,7 @@ module API ...@@ -947,6 +947,7 @@ module API
class GeoNodeStatus < Grape::Entity class GeoNodeStatus < Grape::Entity
expose :id expose :id
expose :db_replication_lag
expose :health expose :health
expose :healthy?, as: :healthy expose :healthy?, as: :healthy
expose :repositories_count expose :repositories_count
......
...@@ -7,6 +7,7 @@ module Gitlab ...@@ -7,6 +7,7 @@ module Gitlab
return '' unless Gitlab::Geo.secondary? return '' unless Gitlab::Geo.secondary?
return 'The Geo database configuration file is missing.' unless Gitlab::Geo.geo_database_configured? return 'The Geo database configuration file is missing.' unless Gitlab::Geo.geo_database_configured?
return 'The Geo node has a database that is not configured for streaming replication with the primary node.' unless self.database_secondary? return 'The Geo node has a database that is not configured for streaming replication with the primary node.' unless self.database_secondary?
return 'The Geo node does not appear to be replicating data from the primary node.' unless self.db_replication_lag.present?
database_version = self.get_database_version.to_i database_version = self.get_database_version.to_i
migration_version = self.get_migration_version.to_i migration_version = self.get_migration_version.to_i
...@@ -60,6 +61,22 @@ module Gitlab ...@@ -60,6 +61,22 @@ module Gitlab
.first .first
.fetch('pg_is_in_recovery') == 't' .fetch('pg_is_in_recovery') == 't'
end end
# Obtains the replication lag in seconds.
#
# The query yields 0 when the last received and last replayed xlog
# locations are equal; otherwise it yields the number of seconds elapsed
# since the last replayed transaction timestamp.
#
# Returns an Integer, or nil when the query produces no value.
# Raises NotImplementedError when the database is not PostgreSQL.
def self.db_replication_lag
  raise NotImplementedError unless Gitlab::Database.postgresql?

  lag = ActiveRecord::Base.connection.execute('
    SELECT CASE
    WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
    THEN 0
    ELSE
    EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER
    END
    AS replication_lag')
    .first
    .fetch('replication_lag')

  # NOTE(review): a raw `execute` may hand back column values as strings
  # depending on the adapter/Rails version - confirm. Coerce so callers and
  # the JSON API always see an Integer, while keeping nil as nil.
  lag.present? ? lag.to_i : nil
end
end end
end end
end end
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
"attachments_synced_count", "attachments_synced_count",
"lfs_objects_count", "lfs_objects_count",
"lfs_objects_synced_count", "lfs_objects_synced_count",
"db_replication_lag",
"repositories_count", "repositories_count",
"repositories_failed_count", "repositories_failed_count",
"repositories_synced_count" "repositories_synced_count"
...@@ -19,6 +20,7 @@ ...@@ -19,6 +20,7 @@
"attachments_count": { "type": "integer" }, "attachments_count": { "type": "integer" },
"attachments_synced_count": { "type": "integer" }, "attachments_synced_count": { "type": "integer" },
"attachments_synced_in_percentage": { "type": "string" }, "attachments_synced_in_percentage": { "type": "string" },
"db_replication_lag": { "type": ["integer", "null"] },
"lfs_objects_count": { "type": "integer" }, "lfs_objects_count": { "type": "integer" },
"lfs_objects_synced_count": { "type": "integer" }, "lfs_objects_synced_count": { "type": "integer" },
"lfs_objects_synced_in_percentage": { "type": "string" }, "lfs_objects_synced_in_percentage": { "type": "string" },
......
...@@ -12,6 +12,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -12,6 +12,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(described_class).to receive(:get_database_version).and_return('20170101') allow(described_class).to receive(:get_database_version).and_return('20170101')
allow(described_class).to receive(:get_migration_version).and_return('20170201') allow(described_class).to receive(:get_migration_version).and_return('20170201')
allow(described_class).to receive(:db_replication_lag).and_return(0)
message = subject.perform_checks message = subject.perform_checks
...@@ -27,6 +28,16 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -27,6 +28,16 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when database is not configured for streaming replication' do it 'returns an error when database is not configured for streaming replication' do
allow(Gitlab::Geo).to receive(:secondary?) { true } allow(Gitlab::Geo).to receive(:secondary?) { true }
allow(Gitlab::Geo).to receive(:configured?) { true }
allow(Gitlab::Database).to receive(:postgresql?) { true }
allow(ActiveRecord::Base).to receive_message_chain(:connection, :execute, :first, :fetch) { 'f' }
expect(subject.perform_checks).not_to be_blank
end
it 'returns an error when streaming replication is not working' do
allow(Gitlab::Geo).to receive(:secondary?) { true }
allow(Gitlab::Geo).to receive(:configured?) { true }
allow(Gitlab::Database).to receive(:postgresql?) { true } allow(Gitlab::Database).to receive(:postgresql?) { true }
allow(ActiveRecord::Base).to receive_message_chain(:connection, :execute, :first, :fetch) { 'f' } allow(ActiveRecord::Base).to receive_message_chain(:connection, :execute, :first, :fetch) { 'f' }
...@@ -42,6 +53,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -42,6 +53,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when Geo database version does not match the latest migration version' do it 'returns an error when Geo database version does not match the latest migration version' do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_database_version) { 1 } allow(subject).to receive(:get_database_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/) expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end end
...@@ -49,9 +61,17 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -49,9 +61,17 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when latest migration version does not match the Geo database version' do it 'returns an error when latest migration version does not match the Geo database version' do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_migration_version) { 1 } allow(subject).to receive(:get_migration_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/) expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end end
# With the node reporting itself as a database secondary but no replication
# lag available, the checks must report that replication is not happening.
it 'returns an error when replication lag is not present' do
  allow(described_class).to receive(:database_secondary?) { true }
  allow(described_class).to receive(:db_replication_lag) { nil }

  message = subject.perform_checks

  expect(message).to match(/The Geo node does not appear to be replicating data from the primary node/)
end
end end
describe 'MySQL checks' do describe 'MySQL checks' do
......
...@@ -98,6 +98,14 @@ describe GeoNodeStatus do ...@@ -98,6 +98,14 @@ describe GeoNodeStatus do
end end
end end
describe '#db_replication_lag' do
  # The status object should surface whatever the health check reports.
  it 'returns the set replication lag' do
    allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag) { 1000 }

    lag = subject.db_replication_lag

    expect(lag).to eq(1000)
  end
end
describe '#lfs_objects_synced_in_percentage' do describe '#lfs_objects_synced_in_percentage' do
let(:lfs_object_project) { create(:lfs_objects_project, project: project_1) } let(:lfs_object_project) { create(:lfs_objects_project, project: project_1) }
...@@ -164,6 +172,7 @@ describe GeoNodeStatus do ...@@ -164,6 +172,7 @@ describe GeoNodeStatus do
context 'when no values are available' do context 'when no values are available' do
it 'returns 0 for each attribute' do it 'returns 0 for each attribute' do
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag).and_return(nil)
subject.attachments_count = nil subject.attachments_count = nil
subject.attachments_synced_count = nil subject.attachments_synced_count = nil
subject.lfs_objects_count = nil subject.lfs_objects_count = nil
...@@ -172,6 +181,7 @@ describe GeoNodeStatus do ...@@ -172,6 +181,7 @@ describe GeoNodeStatus do
subject.repositories_synced_count = nil subject.repositories_synced_count = nil
subject.repositories_failed_count = nil subject.repositories_failed_count = nil
expect(subject.db_replication_lag).to be_nil
expect(subject.repositories_count).to be_zero expect(subject.repositories_count).to be_zero
expect(subject.repositories_synced_count).to be_zero expect(subject.repositories_synced_count).to be_zero
expect(subject.repositories_synced_in_percentage).to be_zero expect(subject.repositories_synced_in_percentage).to be_zero
......
...@@ -25,6 +25,7 @@ describe Geo::NodeStatusService do ...@@ -25,6 +25,7 @@ describe Geo::NodeStatusService do
it 'parses a 200 response' do it 'parses a 200 response' do
data = { health: 'OK', data = { health: 'OK',
db_replication_lag: 0,
repositories_count: 10, repositories_count: 10,
repositories_synced_count: 1, repositories_synced_count: 1,
repositories_failed_count: 2, repositories_failed_count: 2,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment