Commit e794bf68 authored by Mark Lapierre, committed by Ramya Authappan

Combine automatic failover and recovery specs

The two specs were very similar; the failover spec only lacked the
reconciliation step.
parent cf995b42
@@ -17,6 +17,7 @@ gem 'knapsack', '~> 1.17'
 gem 'parallel_tests', '~> 2.29'
 gem 'rotp', '~> 3.1.0'
 gem 'timecop', '~> 0.9.1'
+gem "parallel", "~> 1.19"
 
 group :development do
   gem 'pry-byebug', '~> 3.5.1', platform: :mri
...
@@ -126,6 +126,7 @@ DEPENDENCIES
   gitlab-qa
   knapsack (~> 1.17)
   nokogiri (~> 1.10.9)
+  parallel (~> 1.19)
   parallel_tests (~> 2.29)
   pry-byebug (~> 3.5.1)
   rake (~> 12.3.3)
...
@@ -151,7 +151,6 @@ module QA
         autoload :Mattermost, 'qa/scenario/test/integration/mattermost'
         autoload :ObjectStorage, 'qa/scenario/test/integration/object_storage'
         autoload :SMTP, 'qa/scenario/test/integration/smtp'
-        autoload :GitalyHA, 'qa/scenario/test/integration/gitaly_ha'
       end
 
       module Sanity
...
-# frozen_string_literal: true
-
-module QA
-  module Scenario
-    module Test
-      module Integration
-        class GitalyHA < Test::Instance::All
-          tags :gitaly_ha
-        end
-      end
-    end
-  end
-end
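
The GitalyHA scenario class above is deleted along with its autoload, while the specs below switch to a :gitaly_cluster tag. The matching scenario class is not shown in this diff; assuming it follows the same pattern as the removed one, it would look something like this (hypothetical sketch, not part of this commit):

# Hypothetical replacement scenario, not shown in this diff
module QA
  module Scenario
    module Test
      module Integration
        class GitalyCluster < Test::Instance::All
          tags :gitaly_cluster
        end
      end
    end
  end
end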
@@ -10,7 +10,7 @@ module QA
       PrometheusQueryError = Class.new(StandardError)
 
       def initialize
-        @gitlab = 'gitlab-gitaly-ha'
+        @gitlab = 'gitlab-gitaly-cluster'
         @praefect = 'praefect'
         @postgres = 'postgres'
         @primary_node = 'gitaly1'
@@ -28,7 +28,7 @@ module QA
       def replicated?(project_id)
         Support::Retrier.retry_until(raise_on_failure: false) do
-          replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
+          replicas = wait_until_shell_command(%(docker exec #{@gitlab} bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
             QA::Runtime::Logger.debug(line.chomp)
 
             # The output of the rake task looks something like this:
             #
@@ -77,6 +77,7 @@ module QA
       def trigger_failover_by_stopping_primary_node
         QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
         stop_node(@primary_node)
+        wait_for_new_primary
       end
 
       def clear_replication_queue
@@ -121,7 +122,7 @@ module QA
       end
 
       def query_read_distribution
-        output = shell "docker exec gitlab-gitaly-ha bash -c 'curl -s http://localhost:9090/api/v1/query?query=gitaly_praefect_read_distribution'" do |line|
+        output = shell "docker exec #{@gitlab} bash -c 'curl -s http://localhost:9090/api/v1/query?query=gitaly_praefect_read_distribution'" do |line|
           QA::Runtime::Logger.debug(line)
           break line
         end
@@ -179,15 +180,6 @@ module QA
         wait_for_reliable_connection
       end
 
-      def reset_cluster
-        QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
-        start_node(@praefect)
-        start_node(@primary_node)
-        start_node(@secondary_node)
-        start_node(@tertiary_node)
-        wait_for_health_check_all_nodes
-      end
-
       def verify_storage_move(source_storage, destination_storage)
         return if QA::Runtime::Env.dot_com?
@@ -346,7 +338,7 @@ module QA
       end
 
       def value_for_node(data, node)
-        data.find(-> {0}) { |item| item[:node] == node }[:value]
+        data.find(-> {{ value: 0 }}) { |item| item[:node] == node }[:value]
       end
 
       def wait_for_reliable_connection
...
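
The value_for_node change above fixes a latent crash: Enumerable#find calls its ifnone argument when no element matches, and the old default lambda returned the Integer 0, so the subsequent [:value] lookup raised a TypeError. A minimal standalone illustration (the data is made up):

# Standalone illustration of the two defaults
data = [{ node: 'gitaly1', value: 3 }]

# Old: -> { 0 } returns Integer 0 when the node is missing, and
# 0[:value] raises TypeError because Integer#[] expects a bit index.
# data.find(-> { 0 }) { |item| item[:node] == 'missing' }[:value]

# New: -> {{ value: 0 }} returns a hash, so [:value] safely yields 0.
data.find(-> { { value: 0 } }) { |item| item[:node] == 'missing' }[:value] # => 0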
+# frozen_string_literal: true
+
+module QA
+  RSpec.describe 'Create' do
+    context 'Gitaly automatic failover and manual recovery', :orchestrated, :gitaly_cluster do
+      # Variables shared between contexts. They persist across examples and
+      # contexts, so they can't be `let` variables.
+      praefect_manager = Service::PraefectManager.new
+      project = nil
+
+      let(:initial_commit_message) { 'Initial commit' }
+      let(:first_added_commit_message) { 'pushed to primary gitaly node' }
+      let(:second_added_commit_message) { 'commit to failover node' }
+
+      before(:context) do
+        # Reset the cluster in case previous tests left it in a bad state
+        praefect_manager.reset_primary_to_original
+
+        project = Resource::Project.fabricate! do |project|
+          project.name = "gitaly_cluster"
+          project.initialize_with_readme = true
+        end
+      end
+
+      after(:context) do
+        # Leave the cluster in a suitable state for subsequent tests,
+        # if there was a problem during the tests here
+        praefect_manager.reset_primary_to_original
+      end
+
+      it 'automatically fails over' do
+        # Create a new project with a commit and wait for it to replicate
+        Resource::Repository::ProjectPush.fabricate! do |push|
+          push.project = project
+          push.commit_message = first_added_commit_message
+          push.new_branch = false
+          push.file_content = "This should exist on both nodes"
+        end
+        praefect_manager.wait_for_replication(project.id)
+
+        # Stop the primary node to trigger failover, and then wait
+        # for Gitaly to be ready for writes again
+        praefect_manager.trigger_failover_by_stopping_primary_node
+        praefect_manager.wait_for_new_primary
+        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.wait_for_gitaly_check
+
+        Resource::Repository::Commit.fabricate_via_api! do |commit|
+          commit.project = project
+          commit.commit_message = second_added_commit_message
+          commit.add_files([
+            {
+              file_path: "file-#{SecureRandom.hex(8)}",
+              content: 'This should exist on one node before reconciliation'
+            }
+          ])
+        end
+
+        # Confirm that we have access to the repo after failover,
+        # including the commit we just added
+        expect(project.commits.map { |commit| commit[:message].chomp })
+          .to include(initial_commit_message)
+          .and include(first_added_commit_message)
+          .and include(second_added_commit_message)
+      end
+
+      context 'when recovering from dataloss after failover' do
+        it 'allows reconciliation' do
+          # Start the old primary node again
+          praefect_manager.start_primary_node
+          praefect_manager.wait_for_health_check_current_primary_node
+
+          # Confirm dataloss (i.e., inconsistent nodes)
+          expect(praefect_manager.replicated?(project.id)).to be false
+
+          # Reconcile nodes to recover from dataloss
+          praefect_manager.reconcile_nodes
+          praefect_manager.wait_for_replication(project.id)
+
+          # Confirm that all commits are available after reconciliation
+          expect(project.commits.map { |commit| commit[:message].chomp })
+            .to include(initial_commit_message)
+            .and include(first_added_commit_message)
+            .and include(second_added_commit_message)
+
+          # Restore the original primary node
+          praefect_manager.reset_primary_to_original
+
+          # Check that all commits are still available even though the primary
+          # node was offline when one was made
+          expect(project.commits.map { |commit| commit[:message].chomp })
+            .to include(initial_commit_message)
+            .and include(first_added_commit_message)
+            .and include(second_added_commit_message)
+        end
+      end
+    end
+  end
+end
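
The combined spec relies on reset_primary_to_original, which is not defined anywhere in this diff. Judging from the removed reset_cluster helper above, a plausible shape would be the following (hypothetical sketch only, not the actual implementation):

# Hypothetical sketch; the real method is not shown in this diff
def reset_primary_to_original
  QA::Runtime::Logger.info('Restoring the original primary node')
  start_node(@praefect)
  start_node(@primary_node)
  start_node(@secondary_node)
  start_node(@tertiary_node)
  wait_for_new_primary
end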
@@ -3,7 +3,7 @@
 module QA
   RSpec.describe 'Create' do
     context 'Gitaly' do
-      describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
+      describe 'Backend node recovery', :orchestrated, :gitaly_cluster, :skip_live_env do
         let(:praefect_manager) { Service::PraefectManager.new }
         let(:project) do
           Resource::Project.fabricate! do |project|
...
@@ -6,7 +6,7 @@ module QA
   RSpec.describe 'Create' do
     context 'Gitaly' do
       # Issue to track removal of feature flag: https://gitlab.com/gitlab-org/quality/team-tasks/-/issues/602
-      describe 'Distributed reads', :orchestrated, :gitaly_ha, :skip_live_env, :requires_admin do
+      describe 'Distributed reads', :orchestrated, :gitaly_cluster, :skip_live_env, :requires_admin do
         let(:number_of_reads) { 100 }
         let(:praefect_manager) { Service::PraefectManager.new }
         let(:project) do
...
@@ -4,7 +4,7 @@ require 'parallel'
 
 module QA
   RSpec.describe 'Create' do
-    context 'Gitaly Cluster replication queue', :orchestrated, :gitaly_ha, :skip_live_env, quarantine: { issue: 'https://gitlab.com/gitlab-org/quality/pipeline-triage/-/issues/39#note_388590227', type: :stale } do
+    context 'Gitaly Cluster replication queue', :orchestrated, :gitaly_cluster, :skip_live_env do
       let(:praefect_manager) { Service::PraefectManager.new }
       let(:project) do
         Resource::Project.fabricate! do |project|
@@ -14,7 +14,8 @@ module QA
       end
 
       after do
-        praefect_manager.reset_cluster
+        praefect_manager.start_praefect
+        praefect_manager.wait_for_reliable_connection
         praefect_manager.clear_replication_queue
       end
...
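
The replication-queue spec above requires the 'parallel' gem added in the Gemfile earlier in this commit, presumably to generate concurrent load against Praefect's replication queue. A minimal sketch of that pattern, with illustrative names only (not code from this commit):

require 'parallel'

# Run simulated pushes concurrently to build up the replication queue
Parallel.each((1..10).to_a, in_threads: 5) do |i|
  puts "simulated push #{i}" # stand-in for a real ProjectPush fabrication
end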
-# frozen_string_literal: true
-
-module QA
-  RSpec.describe 'Create' do
-    context 'Gitaly' do
-      describe 'High Availability', :orchestrated, :gitaly_ha, quarantine: { issue: 'https://gitlab.com/gitlab-org/quality/pipeline-triage/-/issues/39#note_388590227', type: :stale } do
-        let(:project) do
-          Resource::Project.fabricate! do |project|
-            project.name = 'gitaly_high_availability'
-          end
-        end
-        let(:initial_file) { 'pushed_to_primary.txt' }
-        let(:final_file) { 'committed_to_primary.txt' }
-        let(:praefect_manager) { Service::PraefectManager.new }
-
-        before do
-          Flow::Login.sign_in
-        end
-
-        after do
-          praefect_manager.reset_cluster
-        end
-
-        it 'makes sure that automatic failover is happening' do
-          Resource::Repository::ProjectPush.fabricate! do |push|
-            push.project = project
-            push.commit_message = 'pushed to primary gitaly node'
-            push.new_branch = true
-            push.file_name = initial_file
-            push.file_content = "This should exist on both nodes"
-          end
-
-          praefect_manager.trigger_failover_by_stopping_primary_node
-
-          project.visit!
-          Page::Project::Show.perform do |show|
-            show.wait_until do
-              show.has_name?(project.name)
-            end
-            expect(show).to have_file(initial_file)
-          end
-
-          Resource::Repository::Commit.fabricate_via_api! do |commit|
-            commit.project = project
-            commit.add_files([
-              {
-                file_path: final_file,
-                content: 'This should exist on both nodes too'
-              }
-            ])
-          end
-
-          project.visit!
-          Page::Project::Show.perform do |show|
-            expect(show).to have_file(final_file)
-          end
-        end
-      end
-    end
-  end
-end