Commit 2992265a authored by John McDonnell, committed by Dan Davison

Ensure containers are healthy for gitaly e2e tests

parent 1a6e8d39
@@ -58,6 +58,7 @@ module QA
def start_praefect
  start_node(@praefect)
  wait_for_praefect
end

def stop_praefect
@@ -176,6 +177,7 @@ module QA
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
start_node(@praefect)
wait_for_health_check_all_nodes
wait_for_reliable_connection
@@ -197,14 +199,7 @@ module QA
    max_duration: 180,
    retry_on_exception: true
  )

  # Praefect can fail to start if unable to dial one of the gitaly nodes
  # See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
  # We tail the logs to allow us to confirm if that is the problem if tests fail
  shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
    QA::Runtime::Logger.debug(line.chomp)
  end

  wait_for_gitaly_check
end
def wait_for_sql_ping
@@ -244,7 +239,7 @@ module QA
def wait_for_storage_nodes
  wait_for_no_praefect_storage_error

  Support::Waiter.repeat_until(max_attempts: 3) do
  Support::Waiter.repeat_until(max_attempts: 3, max_duration: 120, sleep_interval: 1) do
    nodes_confirmed = {
      @primary_node => false,
      @secondary_node => false,
@@ -304,7 +299,7 @@ module QA
end

def wait_until_node_is_removed_from_healthy_storages(node)
  Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
  Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
    result = []
    shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
      result << line
@@ -315,7 +310,7 @@ module QA
end

def wait_until_node_is_marked_as_healthy_storage(node)
  Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
  Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
    result = []
    shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
      result << line
@@ -327,17 +322,10 @@ module QA
end

def wait_for_gitaly_check
  Support::Waiter.repeat_until(max_attempts: 3) do
    storage_ok = false
    check_finished = false
    wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
  Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
    wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:git:fsck'") do |line|
      QA::Runtime::Logger.debug(line.chomp)
      storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
      check_finished = true if line =~ /Checking Gitaly ... Finished/
      storage_ok && check_finished
      line.include?('Done')
    end
  end
end
@@ -347,7 +335,7 @@ module QA
# has no pre-read data, consider it to have had zero reads.
def wait_for_read_count_change(pre_read_data)
  diff_found = false
  Support::Waiter.wait_until(sleep_interval: 5) do
  Support::Waiter.wait_until(sleep_interval: 1, max_duration: 60) do
    query_read_distribution.each_with_index do |data, index|
      diff_found = true if data[:value] > value_for_node(pre_read_data, data[:node])
    end
@@ -361,10 +349,8 @@ module QA
def wait_for_reliable_connection
  QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
  wait_for_praefect
  wait_for_sql_ping
  wait_for_storage_nodes
  wait_for_gitaly_check
end

def wait_for_replication(project_id)
......
@@ -22,9 +22,7 @@ module QA
    end
  end

  after(:context, quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }) do
    # Leave the cluster in a suitable state for subsequent tests,
    # if there was a problem during the tests here
  after do
    praefect_manager.start_all_nodes
  end
@@ -44,10 +42,7 @@ module QA
  push.file_content = "This should exist on all nodes"
end

praefect_manager.start_secondary_node
praefect_manager.start_tertiary_node
praefect_manager.wait_for_health_check_all_nodes
praefect_manager.start_all_nodes
praefect_manager.wait_for_replication(project.id)

# Stop the primary node to trigger failover, and then wait
......
@@ -29,7 +29,7 @@ module QA
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.stop_primary_node
praefect_manager.wait_for_gitaly_check
praefect_manager.wait_for_primary_node_health_check_failure
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
......
@@ -42,6 +42,7 @@ module QA
context 'when a node is unhealthy' do
  before do
    praefect_manager.start_all_nodes
    praefect_manager.stop_secondary_node
    praefect_manager.wait_for_secondary_node_health_check_failure
  end
......
@@ -13,9 +13,14 @@ module QA
    end
  end

  before do
    praefect_manager.start_all_nodes
    praefect_manager.start_praefect
  end

  after do
    praefect_manager.start_all_nodes
    praefect_manager.start_praefect
    praefect_manager.wait_for_reliable_connection
    praefect_manager.clear_replication_queue
  end
@@ -51,7 +56,6 @@ module QA
praefect_manager.create_stalled_replication_queue
praefect_manager.start_praefect
praefect_manager.wait_for_reliable_connection
# Create a new project, push to it, and check that replication occurs
project_push = Resource::Repository::ProjectPush.fabricate! do |push|
......
@@ -9,11 +9,13 @@ module QA
let(:repo2) { { "relative_path" => "@hashed/path/to/repo2.git", "storage" => "gitaly3", "virtual_storage" => "default" } }

before do
  praefect_manager.start_all_nodes
  praefect_manager.add_repo_to_disk(praefect_manager.primary_node, repo1["relative_path"])
  praefect_manager.add_repo_to_disk(praefect_manager.tertiary_node, repo2["relative_path"])
end

after do
  praefect_manager.start_all_nodes
  praefect_manager.remove_repo_from_disk(repo1["relative_path"])
  praefect_manager.remove_repo_from_disk(repo2["relative_path"])
  praefect_manager.remove_repository_from_praefect_database(repo1["relative_path"])
......