Commit 955b8664 authored by Dan Davison

Merge branch 'jmd-improve-stability-of-gitaly-e2e-tests' into 'master'

Ensure containers are healthy for gitaly e2e tests

See merge request gitlab-org/gitlab!73802
parents 88987766 2992265a
@@ -58,6 +58,7 @@ module QA
       def start_praefect
         start_node(@praefect)
+        wait_for_praefect
       end
 
       def stop_praefect
@@ -176,6 +177,7 @@ module QA
         start_node(@primary_node)
         start_node(@secondary_node)
         start_node(@tertiary_node)
+        start_node(@praefect)
         wait_for_health_check_all_nodes
         wait_for_reliable_connection
@@ -197,14 +199,7 @@ module QA
           max_duration: 180,
           retry_on_exception: true
         )
-        wait_for_gitaly_check
-        # Praefect can fail to start if unable to dial one of the gitaly nodes
-        # See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
-        # We tail the logs to allow us to confirm if that is the problem if tests fail
-        shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
-          QA::Runtime::Logger.debug(line.chomp)
-        end
       end
 
       def wait_for_sql_ping
@@ -244,7 +239,7 @@ module QA
       def wait_for_storage_nodes
         wait_for_no_praefect_storage_error
 
-        Support::Waiter.repeat_until(max_attempts: 3) do
+        Support::Waiter.repeat_until(max_attempts: 3, max_duration: 120, sleep_interval: 1) do
           nodes_confirmed = {
             @primary_node => false,
             @secondary_node => false,
@@ -304,7 +299,7 @@ module QA
       end
 
       def wait_until_node_is_removed_from_healthy_storages(node)
-        Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+        Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
           result = []
           shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
             result << line
@@ -315,7 +310,7 @@ module QA
       end
 
       def wait_until_node_is_marked_as_healthy_storage(node)
-        Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+        Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
           result = []
           shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
             result << line
@@ -327,17 +322,10 @@ module QA
       end
 
       def wait_for_gitaly_check
-        Support::Waiter.repeat_until(max_attempts: 3) do
-          storage_ok = false
-          check_finished = false
-          wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
+        Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
+          wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:git:fsck'") do |line|
             QA::Runtime::Logger.debug(line.chomp)
-            storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
-            check_finished = true if line =~ /Checking Gitaly ... Finished/
-            storage_ok && check_finished
+            line.include?('Done')
           end
         end
       end
@@ -347,7 +335,7 @@ module QA
       # has no pre-read data, consider it to have had zero reads.
       def wait_for_read_count_change(pre_read_data)
         diff_found = false
-        Support::Waiter.wait_until(sleep_interval: 5) do
+        Support::Waiter.wait_until(sleep_interval: 1, max_duration: 60) do
           query_read_distribution.each_with_index do |data, index|
             diff_found = true if data[:value] > value_for_node(pre_read_data, data[:node])
           end
@@ -361,10 +349,8 @@ module QA
       def wait_for_reliable_connection
         QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
-        wait_for_praefect
         wait_for_sql_ping
         wait_for_storage_nodes
-        wait_for_gitaly_check
       end
 
       def wait_for_replication(project_id)
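The waiter changes above replace attempt-counted retries (and the silent raise_on_failure: false waits) with time-bounded polling that fails loudly. A minimal sketch of the behaviour these calls rely on, with parameter names taken from the calls themselves; the body is an assumption, not the framework's actual Support::Waiter implementation:

# Illustrative stand-in for Support::Waiter.wait_until; this is not the real code.
def wait_until(max_duration: 60, sleep_interval: 1, raise_on_failure: true)
  deadline = Time.now + max_duration

  loop do
    result = yield
    return result if result
    break if Time.now >= deadline

    sleep sleep_interval
  end

  raise "condition not met within #{max_duration} seconds" if raise_on_failure

  false
end

# Hypothetical usage mirroring the health-check waits above:
# wait_until(max_duration: 120, sleep_interval: 1) { healthy_storage_count('gitaly1').positive? }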
...
@@ -22,9 +22,7 @@ module QA
         end
       end
 
-      after(:context, quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }) do
-        # Leave the cluster in a suitable state for subsequent tests,
-        # if there was a problem during the tests here
+      after do
         praefect_manager.start_all_nodes
       end
@@ -44,10 +42,7 @@ module QA
           push.file_content = "This should exist on all nodes"
         end
 
-        praefect_manager.start_secondary_node
-        praefect_manager.start_tertiary_node
-        praefect_manager.wait_for_health_check_all_nodes
+        praefect_manager.start_all_nodes
         praefect_manager.wait_for_replication(project.id)
 
         # Stop the primary node to trigger failover, and then wait
...
@@ -29,7 +29,7 @@ module QA
         # Stop the primary node to trigger failover, and then wait
         # for Gitaly to be ready for writes again
         praefect_manager.stop_primary_node
-        praefect_manager.wait_for_gitaly_check
+        praefect_manager.wait_for_primary_node_health_check_failure
 
         # Push a commit to the new primary
         Resource::Repository::ProjectPush.fabricate! do |push|
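The failover spec now waits for Praefect to report the stopped primary as unhealthy instead of re-running the Gitaly check. Assuming the helper reuses the healthy_storages query from the manager changes above, its shape is presumably along these lines (hypothetical sketch, not the actual method body):

# Hypothetical sketch; the real helper lives alongside the waiters shown earlier.
def wait_for_primary_node_health_check_failure
  wait_until_node_is_removed_from_healthy_storages(@primary_node)
end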
...
@@ -42,6 +42,7 @@ module QA
       context 'when a node is unhealthy' do
         before do
+          praefect_manager.start_all_nodes
           praefect_manager.stop_secondary_node
           praefect_manager.wait_for_secondary_node_health_check_failure
         end
...
@@ -13,9 +13,14 @@ module QA
         end
       end
 
+      before do
+        praefect_manager.start_all_nodes
+        praefect_manager.start_praefect
+      end
+
       after do
+        praefect_manager.start_all_nodes
         praefect_manager.start_praefect
-        praefect_manager.wait_for_reliable_connection
         praefect_manager.clear_replication_queue
       end
@@ -51,7 +56,6 @@ module QA
         praefect_manager.create_stalled_replication_queue
 
         praefect_manager.start_praefect
-        praefect_manager.wait_for_reliable_connection
 
         # Create a new project, push to it, and check that replication occurs
         project_push = Resource::Repository::ProjectPush.fabricate! do |push|
...
@@ -9,11 +9,13 @@ module QA
       let(:repo2) { { "relative_path" => "@hashed/path/to/repo2.git", "storage" => "gitaly3", "virtual_storage" => "default" } }
 
       before do
+        praefect_manager.start_all_nodes
         praefect_manager.add_repo_to_disk(praefect_manager.primary_node, repo1["relative_path"])
         praefect_manager.add_repo_to_disk(praefect_manager.tertiary_node, repo2["relative_path"])
       end
 
       after do
+        praefect_manager.start_all_nodes
         praefect_manager.remove_repo_from_disk(repo1["relative_path"])
         praefect_manager.remove_repo_from_disk(repo2["relative_path"])
         praefect_manager.remove_repository_from_praefect_database(repo1["relative_path"])
...
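Taken together, the spec changes apply one pattern: each example group restarts the whole cluster both before and after running, so a failed or aborted test can no longer leave later tests talking to stopped containers. A rough outline of that pattern, with a placeholder describe block and an assumed manager constructor:

# Placeholder outline only; the describe text and Service::PraefectManager.new are assumptions.
RSpec.describe 'Gitaly Cluster example group' do
  let(:praefect_manager) { Service::PraefectManager.new }

  before do
    # Restore every Gitaly node and Praefect (waiting for their health checks),
    # regardless of what the previous example left running.
    praefect_manager.start_all_nodes
  end

  after do
    # Leave the cluster healthy for whichever example runs next.
    praefect_manager.start_all_nodes
  end
end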