Commit 1db40d61 authored by Mark Lapierre's avatar Mark Lapierre

Merge branch 'ml-add-e2e-test-of-gitaly-cluster-backend-node-recovery' into 'master'

Add e2e test of gitaly cluster backend node recovery

See merge request gitlab-org/gitlab!37040
parents 966a6aa7 7b8e5b7d
......@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners"
end
def api_commits_path
"#{api_get_path}/repository/commits"
end
def api_repository_branches_path
"#{api_get_path}/repository/branches"
end
......@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end
def commits
parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
end
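A quick usage sketch of the new `commits` helper (illustrative only; it assumes a fabricated `Resource::Project` as in the spec below, and relies on `parse_body` returning symbol-keyed hashes, as the spec's use of `commit[:message]` suggests):
# Illustrative usage, not part of this change:
project.commits.map { |commit| commit[:message] } # e.g. ["Initial commit", "pushed after failover"]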
def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
......
......@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default'
end
def enable_writes
shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
# Executes the praefect `dataloss` command.
#
# @return [Boolean] whether dataloss has occurred
def dataloss?
wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end
def replicated?(project_id)
shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this:
#
# Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
#
# We want to confirm that the checksums are identical
break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
break line if line.start_with?("gitaly_cluster")
end
# We want to know if the checksums are identical
replicas.split('|').map(&:strip)[1..3].uniq.one?
end
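A minimal sketch of the checksum comparison above, run against the sample row from the comment (the checksums are copied verbatim from that example):
row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'
# Columns 1..3 hold the per-node checksums; the repository is fully replicated when they are all identical
row.split('|').map(&:strip)[1..3].uniq.one? # => true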
def start_primary_node
start_node(@primary_node)
end
def start_praefect
......@@ -51,40 +61,54 @@ module QA
end
def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
end
def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
CMD
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
<<~SQL
delete from replication_queue_job_lock;
delete from replication_queue_lock;
delete from replication_queue;
SQL
)
end
def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\""
CMD
QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell sql_to_docker_exec_cmd(
<<~SQL
update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';
SQL
)
end
# Reconciles the previous primary node with the current one
# I.e., it brings the previous primary node up to date
def reconcile_nodes
reconcile_node_with_node(@primary_node, current_primary_node)
end
def reconcile_node_with_node(target, reference)
QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
/FINISHED: \d+ repos were checked for consistency/,
sleep_interval: 5,
retry_on_exception: true
)
end
def replication_queue_lock_count
result = []
cmd = <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
CMD
shell cmd do |line|
shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
result << line
end
# The result looks like:
......@@ -94,12 +118,36 @@ module QA
result[2].to_i
end
# Makes the original primary (gitaly1) the primary again by
# stopping the other nodes, waiting for gitaly1 to be made the
# primary again, and then starting the other nodes and enabling
# writes
def reset_primary_to_original
QA::Runtime::Logger.info("Checking primary node...")
return if @primary_node == current_primary_node
QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
start_node(@primary_node)
stop_node(@secondary_node)
stop_node(@tertiary_node)
wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
wait_for_health_check_all_nodes
wait_for_reliable_connection
end
def reset_cluster
QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
enable_writes
wait_for_health_check_all_nodes
end
def verify_storage_move(source_storage, destination_storage)
......@@ -111,10 +159,33 @@ module QA
end
def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
)
# Praefect can fail to start if unable to dial one of the gitaly nodes
# See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
# We tail the logs so that, if tests fail, we can confirm whether that was the problem
shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed'
end
end
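For illustration, the kind of Praefect log entry these helpers match could look like the sketch below; only the 'msg' and 'newPrimary' keys and the 'primary node changed' message come from the code above, and the node name is hypothetical:
require 'json'

entry = JSON.parse('{"msg":"primary node changed","newPrimary":"gitaly2"}') # hypothetical log line
entry['msg'] == 'primary node changed' && entry['newPrimary'] == 'gitaly2' # => true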
def wait_for_sql_ping
......@@ -124,35 +195,86 @@ module QA
)
end
def wait_for_no_praefect_storage_error
# If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
# That is, Praefect logs no message when the healthcheck succeeds
# To work around that, we run the gitaly check rake task, wait a few seconds, and then confirm that no healthcheck errors appear
QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
Support::Waiter.wait_until(max_duration: 120) do
wait_for_gitaly_check
sleep 5
shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
break true if log['msg'] != 'error when pinging healthcheck'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
end
def wait_for_storage_nodes
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
wait_for_no_praefect_storage_error
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.info(line.chomp)
Support::Waiter.repeat_until(max_attempts: 3) do
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.debug(line.chomp)
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
end
nodes_confirmed.values.all?
end
end
end
nodes_confirmed.values.all?
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
wait_for_health_check(@tertiary_node)
end
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
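Similarly, a hedged sketch of the gRPC health-check entry this helper waits for; only the two fields compared in the code are shown, and any other fields present in real Gitaly logs are omitted:
require 'json'

entry = JSON.parse('{"grpc.request.fullMethod":"/grpc.health.v1.Health/Check","grpc.code":"OK"}') # hypothetical log line
entry['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && entry['grpc.code'] == 'OK' # => true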
def wait_for_gitaly_check
storage_ok = false
check_finished = false
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false
check_finished = false
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.info(line.chomp)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.debug(line.chomp)
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok && check_finished
storage_ok && check_finished
end
end
end
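The two regexes above would be satisfied by rake output along these lines; the sample strings are hypothetical and constructed only to show what the patterns accept, as real output may include timings and other text:
'Gitaly: ... default ... OK'.match?(/Gitaly: ... default ... OK/)     # => true
'Checking Gitaly ... Finished'.match?(/Checking Gitaly ... Finished/) # => true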
......@@ -164,15 +286,36 @@ module QA
end
def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
wait_for_gitaly_check
wait_for_gitlab_shell_check
end
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
end
private
def current_primary_node
shell dataloss_command do |line|
QA::Runtime::Logger.debug(line.chomp)
match = line.match(/Primary: (.*)/)
break match[1] if match
end
end
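As a sketch of the parsing above: the helper returns whatever follows "Primary: " in the dataloss output; the node name here is only an example:
'Primary: gitaly2'.match(/Primary: (.*)/)[1] # => "gitaly2"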
def dataloss_command
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
end
def sql_to_docker_exec_cmd(sql)
Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
end
def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line)
......@@ -203,21 +346,10 @@ module QA
end
end
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.info(line.chomp)
line =~ regex
def with_praefect_log
wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
yield JSON.parse(line)
end
end
end
......
......@@ -33,6 +33,31 @@ module QA
end
end
end
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
<<~CMD
docker exec --env PGPASSWORD=#{password} #{container} \
bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
CMD
end
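For instance, mirroring the arguments PraefectManager passes to this helper (the container name 'postgres' is a placeholder, since the actual @postgres value is not shown in this diff), the generated command would be:
sql_to_docker_exec_cmd("select 1;", 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', 'postgres')
# => docker exec --env PGPASSWORD=SQL_PASSWORD postgres \
#      bash -c "psql -U postgres -d praefect_production -h postgres.test -c \"select 1;\""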
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, **kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
end
end
end
end
end
# frozen_string_literal: true
module QA
RSpec.describe 'Create' do
context 'Gitaly' do
describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
let(:praefect_manager) { Service::PraefectManager.new }
let(:project) do
Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
project.initialize_with_readme = true
end
end
before do
# Reset the cluster in case previous tests left it in a bad state
praefect_manager.reset_primary_to_original
end
after do
# Leave the cluster in a suitable state for subsequent tests
praefect_manager.reset_primary_to_original
end
it 'recovers from dataloss' do
# Create a new project with a commit and wait for it to replicate
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.trigger_failover_by_stopping_primary_node
praefect_manager.wait_for_new_primary
praefect_manager.wait_for_health_check_current_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
commits.sha = 'master'
end
end
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.new_branch = false
push.commit_message = 'pushed after failover'
push.file_name = 'new_file'
push.file_content = 'new file'
end
# Start the old primary node again
praefect_manager.start_primary_node
praefect_manager.wait_for_health_check_current_primary_node
# Confirm dataloss (i.e., inconsistent nodes)
expect(praefect_manager.dataloss?).to be true
expect(praefect_manager.replicated?(project.id)).to be false
# Reconcile nodes to recover from dataloss
praefect_manager.reconcile_nodes
praefect_manager.wait_for_replication(project.id)
# Confirm that both commits are available after reconciliation
expect(project.commits.map { |commit| commit[:message].chomp })
.to include("Initial commit").and include("pushed after failover")
end
end
end
end
end
......@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file)
end
praefect_manager.enable_writes
Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project
commit.add_files([
......