Commit 28e47ff8 authored by Mark Lapierre

Merge branch 'jmd-fixing-gitaly-cluster-e2e-tests' into 'master'

Updating Gitaly Cluster E2E test logic

See merge request gitlab-org/gitlab!71789
parents f21f76b8 b9839a17
@@ -46,6 +46,10 @@ module QA
       end
     end
 
+    def stop_primary_node
+      stop_node(@primary_node)
+    end
+
     def start_primary_node
       start_node(@primary_node)
     end
@@ -66,20 +70,29 @@ module QA
       start_node(@secondary_node)
     end
 
+    def stop_tertiary_node
+      stop_node(@tertiary_node)
+    end
+
+    def start_tertiary_node
+      start_node(@tertiary_node)
+    end
+
     def start_node(name)
       shell "docker start #{name}"
+      wait_until_shell_command_matches(
+        "docker inspect -f {{.State.Running}} #{name}",
+        /true/,
+        sleep_interval: 3,
+        max_duration: 180,
+        retry_on_exception: true
+      )
     end
 
     def stop_node(name)
       shell "docker stop #{name}"
     end
 
-    def trigger_failover_by_stopping_primary_node
-      QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
-      stop_node(@primary_node)
-      wait_for_new_primary
-    end
-
     def clear_replication_queue
       QA::Runtime::Logger.info("Clearing the replication queue")
       shell sql_to_docker_exec_cmd(
@@ -157,22 +170,8 @@ module QA
       result[2].to_i
     end
 
-    # Makes the original primary (gitaly1) the primary again by
-    # stopping the other nodes, waiting for gitaly1 to be made the
-    # primary again, and then it starts the other nodes and enables
-    # writes
-    def reset_primary_to_original
-      QA::Runtime::Logger.info("Checking primary node...")
-
-      return if @primary_node == current_primary_node
-
-      QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
-
+    def start_all_nodes
       start_node(@primary_node)
-      stop_node(@secondary_node)
-      stop_node(@tertiary_node)
-
-      wait_for_new_primary_node(@primary_node)
-
       start_node(@secondary_node)
       start_node(@tertiary_node)
@@ -189,10 +188,12 @@ module QA
     end
 
     def wait_for_praefect
+      QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
       wait_until_shell_command_matches(
-        "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
-        /listening at tcp address/
+        "docker inspect -f {{.State.Running}} #{@praefect}",
+        /true/,
+        sleep_interval: 3,
+        max_duration: 180,
+        retry_on_exception: true
       )
 
       # Praefect can fail to start if unable to dial one of the gitaly nodes
@@ -204,20 +205,6 @@ module QA
       end
     end
 
-    def wait_for_new_primary_node(node)
-      QA::Runtime::Logger.info("Wait until #{node} is the primary node")
-      with_praefect_log(max_duration: 120) do |log|
-        break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
-      end
-    end
-
-    def wait_for_new_primary
-      QA::Runtime::Logger.info("Wait until a new primary node is selected")
-      with_praefect_log(max_duration: 120) do |log|
-        break true if log['msg'] == 'primary node changed'
-      end
-    end
-
     def wait_for_sql_ping
       wait_until_shell_command_matches(
         "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'",
@@ -274,10 +261,6 @@ module QA
       end
     end
 
-    def wait_for_health_check_current_primary_node
-      wait_for_health_check(current_primary_node)
-    end
-
     def wait_for_health_check_all_nodes
       wait_for_health_check(@primary_node)
       wait_for_health_check(@secondary_node)
@@ -286,29 +269,58 @@ module QA
     def wait_for_health_check(node)
       QA::Runtime::Logger.info("Waiting for health check on #{node}")
-      wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
-        QA::Runtime::Logger.debug(line.chomp)
-
-        log = JSON.parse(line)
-
-        log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
-      rescue JSON::ParserError
-        # Ignore lines that can't be parsed as JSON
-      end
+      wait_until_node_is_marked_as_healthy_storage(node)
     end
 
+    def wait_for_primary_node_health_check
+      wait_for_health_check(@primary_node)
+    end
+
+    def wait_for_secondary_node_health_check
+      wait_for_health_check(@secondary_node)
+    end
+
+    def wait_for_tertiary_node_health_check
+      wait_for_health_check(@tertiary_node)
+    end
+
+    def wait_for_health_check_failure(node)
+      QA::Runtime::Logger.info("Waiting for health check failure on #{node}")
+      wait_until_node_is_removed_from_healthy_storages(node)
+    end
+
+    def wait_for_primary_node_health_check_failure
+      wait_for_health_check_failure(@primary_node)
+    end
+
     def wait_for_secondary_node_health_check_failure
       wait_for_health_check_failure(@secondary_node)
     end
 
-    def wait_for_health_check_failure(node)
-      QA::Runtime::Logger.info("Waiting for Praefect to record a health check failure on #{node}")
-      wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
-        QA::Runtime::Logger.debug(line.chomp)
-
-        log = JSON.parse(line)
-
-        health_check_failure_message?(log['msg']) && log['storage'] == node
-      rescue JSON::ParserError
-        # Ignore lines that can't be parsed as JSON
-      end
-    end
+    def wait_for_tertiary_node_health_check_failure
+      wait_for_health_check_failure(@tertiary_node)
+    end
+
+    def wait_until_node_is_removed_from_healthy_storages(node)
+      Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+        result = []
+        shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+          result << line
+        end
+        QA::Runtime::Logger.debug("result is ---#{result}")
+        result[2].to_i == 0
+      end
+    end
+
+    def wait_until_node_is_marked_as_healthy_storage(node)
+      Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+        result = []
+        shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+          result << line
+        end
+        QA::Runtime::Logger.debug("result is ---#{result}")
+        result[2].to_i == 1
+      end
+    end
   end
......
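
Note: the new waiters above decide node health by querying Praefect's healthy_storages relation instead of tailing log files. They rely on sql_to_docker_exec_cmd, which is defined elsewhere in PraefectManager and is not part of this diff. A minimal sketch of what such a helper could look like, where the container handle (@postgres), database name, user, and password are placeholders rather than values taken from this merge request:

# Hypothetical sketch; the real sql_to_docker_exec_cmd lives elsewhere in
# PraefectManager. Container handle, database, user, and password are placeholders.
def sql_to_docker_exec_cmd(sql)
  "docker exec #{@postgres} bash -c \"PGPASSWORD=#{@postgres_password} psql -U praefect -d praefect_production -h localhost -c \\\"#{sql}\\\"\""
end

The waiters then count the rows returned for a given storage, for example SELECT count(*) FROM healthy_storages WHERE storage = 'gitaly1';, treating 1 as healthy and 0 as removed from the healthy set.
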
@@ -52,7 +52,7 @@ module QA
     end
 
     def wait_until_shell_command_matches(cmd, regex, **kwargs)
-      wait_until_shell_command(cmd, kwargs) do |line|
+      wait_until_shell_command(cmd, **kwargs) do |line|
         QA::Runtime::Logger.debug(line.chomp)
 
         line =~ regex
......
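
Note: the one-character change above (kwargs to **kwargs) fixes how options such as sleep_interval, max_duration, and retry_on_exception are forwarded. Without the double splat, the captured hash is passed as a positional argument, which Ruby 2.7 only tolerates with a deprecation warning and Ruby 3 rejects outright. A standalone sketch of the difference, using illustrative method names rather than the QA framework's:

# Illustrative only; not the QA framework's implementation.
def wait_until(max_duration: 60, sleep_interval: 1)
  puts "max_duration=#{max_duration}, sleep_interval=#{sleep_interval}"
end

def forward_broken(**kwargs)
  wait_until(kwargs) # hash passed positionally; raises ArgumentError on Ruby 3
end

def forward_fixed(**kwargs)
  wait_until(**kwargs) # double splat forwards the options as keyword arguments
end

forward_fixed(sleep_interval: 3, max_duration: 180)
# => max_duration=180, sleep_interval=3
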
@@ -14,7 +14,7 @@ module QA
       before(:context) do
         # Reset the cluster in case previous tests left it in a bad state
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
 
         project = Resource::Project.fabricate! do |project|
           project.name = "gitaly_cluster"
@@ -25,25 +25,35 @@ module QA
       after(:context, quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }) do
         # Leave the cluster in a suitable state for subsequent tests,
         # if there was a problem during the tests here
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end
 
       it 'automatically fails over', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1267' do
         # Create a new project with a commit and wait for it to replicate
+        # make sure that our project is published to the 'primary' node
+        praefect_manager.stop_secondary_node
+        praefect_manager.stop_tertiary_node
+        praefect_manager.wait_for_secondary_node_health_check_failure
+        praefect_manager.wait_for_tertiary_node_health_check_failure
+
         Resource::Repository::ProjectPush.fabricate! do |push|
           push.project = project
           push.commit_message = first_added_commit_message
           push.new_branch = false
-          push.file_content = "This should exist on both nodes"
+          push.file_content = "This should exist on all nodes"
         end
+
+        praefect_manager.start_secondary_node
+        praefect_manager.start_tertiary_node
+        praefect_manager.wait_for_health_check_all_nodes
         praefect_manager.wait_for_replication(project.id)
 
         # Stop the primary node to trigger failover, and then wait
         # for Gitaly to be ready for writes again
-        praefect_manager.trigger_failover_by_stopping_primary_node
-        praefect_manager.wait_for_new_primary
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.stop_primary_node
+        praefect_manager.wait_for_primary_node_health_check_failure
        praefect_manager.wait_for_gitaly_check
 
         Resource::Repository::Commit.fabricate_via_api! do |commit|
@@ -69,7 +79,7 @@ module QA
       it 'automatically reconciles', quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }, testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1266' do
         # Start the old primary node again
         praefect_manager.start_primary_node
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.wait_for_primary_node_health_check
 
         # Confirm automatic reconciliation
         expect(praefect_manager.replicated?(project.id)).to be true
@@ -81,7 +91,7 @@ module QA
           .and include(second_added_commit_message)
 
         # Restore the original primary node
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
 
         # Check that all commits are still available even though the primary
         # node was offline when one was made
......
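
Note: the updated failover spec pins the initial push to the primary by stopping both replicas and waiting for Praefect to drop them from the healthy storages before fabricating the push; it then restarts the replicas and waits for replication before triggering failover. A hypothetical convenience wrapper, not part of this merge request, that bundles those inline steps using only the helpers introduced above:

# Hypothetical wrapper, not part of this merge request; it packages the
# "publish only to the primary" steps the spec performs inline.
def publish_to_primary_only(praefect_manager)
  praefect_manager.stop_secondary_node
  praefect_manager.stop_tertiary_node
  praefect_manager.wait_for_secondary_node_health_check_failure
  praefect_manager.wait_for_tertiary_node_health_check_failure

  yield # perform the push while only the primary node is healthy

  praefect_manager.start_secondary_node
  praefect_manager.start_tertiary_node
  praefect_manager.wait_for_health_check_all_nodes
end

A spec could wrap the ProjectPush fabrication in this block and then call wait_for_replication(project.id) as before.
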
@@ -14,12 +14,12 @@ module QA
       before do
         # Reset the cluster in case previous tests left it in a bad state
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end
 
       after do
         # Leave the cluster in a suitable state for subsequent tests
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end
 
       it 'recovers from dataloss', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1265' do
@@ -28,9 +28,7 @@ module QA
         # Stop the primary node to trigger failover, and then wait
         # for Gitaly to be ready for writes again
-        praefect_manager.trigger_failover_by_stopping_primary_node
-        praefect_manager.wait_for_new_primary
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.stop_primary_node
         praefect_manager.wait_for_gitaly_check
 
         # Confirm that we have access to the repo after failover
......