Commit 932b4392 authored by Douglas Barbosa Alexandre's avatar Douglas Barbosa Alexandre

Merge branch 'mk/add-verification-state-machine' into 'master'

Geo: Add verification state machine

See merge request gitlab-org/gitlab!47260
parents b7c276c0 5c9b54ad
---
title: 'Geo: Add verification state machine fields to package files table'
merge_request: 47260
author:
type: added
# frozen_string_literal: true
class AddVerificationStateToPackageFiles < ActiveRecord::Migration[6.0]
DOWNTIME = false
def change
add_column :packages_package_files, :verification_state, :integer, default: 0, limit: 2, null: false
add_column :packages_package_files, :verification_started_at, :datetime_with_timezone
end
end
# frozen_string_literal: true
class AddIndexOnVerificationStateOnPackageFiles < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
INDEX_NAME = 'index_packages_package_files_on_verification_state'
disable_ddl_transaction!
def up
add_concurrent_index :packages_package_files, :verification_state, name: INDEX_NAME
end
def down
remove_concurrent_index_by_name :packages_package_files, INDEX_NAME
end
end
d88a47333a4cc2b6c4aafa817c766822728d14b947a195c7c40b39e0c8b41610
\ No newline at end of file
dde78a32d53a695e82b44574458b3670dce4803ffc6f34a1216f3671cca470ed
\ No newline at end of file
......@@ -14485,6 +14485,8 @@ CREATE TABLE packages_package_files (
verification_failure character varying(255),
verification_retry_count integer,
verification_checksum bytea,
verification_state smallint DEFAULT 0 NOT NULL,
verification_started_at timestamp with time zone,
CONSTRAINT check_4c5e6bb0b3 CHECK ((file_store IS NOT NULL))
);
......@@ -21453,6 +21455,8 @@ CREATE INDEX index_packages_package_files_on_file_store ON packages_package_file
CREATE INDEX index_packages_package_files_on_package_id_and_file_name ON packages_package_files USING btree (package_id, file_name);
CREATE INDEX index_packages_package_files_on_verification_state ON packages_package_files USING btree (verification_state);
CREATE INDEX index_packages_packages_on_creator_id ON packages_packages USING btree (creator_id);
CREATE INDEX index_packages_packages_on_id_and_created_at ON packages_packages USING btree (id, created_at);
......
......@@ -393,6 +393,8 @@ can track verification state.
def change
change_table(:widgets) do |t|
t.integer :verification_state, default: 0, limit: 2, null: false
t.column :verification_started_at, :datetime_with_timezone
t.integer :verification_retry_count, limit: 2
t.column :verification_retry_at, :datetime_with_timezone
t.column :verified_at, :datetime_with_timezone
......@@ -431,13 +433,12 @@ can track verification state.
end
```
1. Add a partial index on `verification_failure` and `verification_checksum` to ensure
re-verification can be performed efficiently:
1. Add an index on `verification_state` to ensure verification can be performed efficiently:
```ruby
# frozen_string_literal: true
class AddVerificationFailureIndexToWidgets < ActiveRecord::Migration[6.0]
class AddVerificationStateIndexToWidgets < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
......@@ -445,22 +446,28 @@ can track verification state.
disable_ddl_transaction!
def up
add_concurrent_index :widgets, :verification_failure, where: "(verification_failure IS NOT NULL)", name: "widgets_verification_failure_partial"
add_concurrent_index :widgets, :verification_checksum, where: "(verification_checksum IS NOT NULL)", name: "widgets_verification_checksum_partial"
add_concurrent_index :widgets, :verification_state, name: "index_widgets_on_verification_state"
end
def down
remove_concurrent_index :widgets, :verification_failure
remove_concurrent_index :widgets, :verification_checksum
remove_concurrent_index :widgets, :verification_state
end
end
```
1. Add the `Gitlab::Geo::VerificationState` concern to the `widget` model if it is not already included in `Gitlab::Geo::ReplicableModel`:
```ruby
class Widget < ApplicationRecord
...
include ::Gitlab::Geo::VerificationState
...
end
```
##### Option 2: Create a separate `widget_states` table with verification state fields
1. Create a `widget_states` table and add a partial index on `verification_failure` and
`verification_checksum` to ensure re-verification can be performed efficiently. Order
the columns according to [the guidelines](../ordering_table_columns.md):
1. Create a `widget_states` table and add an index on `verification_state` to ensure verification can be performed efficiently. Order the columns according to [the guidelines](../ordering_table_columns.md):
```ruby
# frozen_string_literal: true
......@@ -477,14 +484,15 @@ can track verification state.
with_lock_retries do
create_table :widget_states, id: false do |t|
t.references :widget, primary_key: true, null: false, foreign_key: { on_delete: :cascade }
t.integer :verification_state, default: 0, limit: 2, null: false
t.column :verification_started_at, :datetime_with_timezone
t.datetime_with_timezone :verification_retry_at
t.datetime_with_timezone :verified_at
t.integer :verification_retry_count, limit: 2
t.binary :verification_checksum, using: 'verification_checksum::bytea'
t.text :verification_failure
t.index :verification_failure, where: "(verification_failure IS NOT NULL)", name: "widgets_verification_failure_partial"
t.index :verification_checksum, where: "(verification_checksum IS NOT NULL)", name: "widgets_verification_checksum_partial"
t.index :verification_state, name: "index_widget_states_on_verification_state"
end
end
end
......@@ -498,6 +506,20 @@ can track verification state.
end
```
1. Add the following lines to the `widget_state.rb` model:
```ruby
class WidgetState < ApplicationRecord
...
self.primary_key = :widget_id
include ::Gitlab::Geo::VerificationState
belongs_to :widget, inverse_of: :widget_state
...
end
```
1. Add the following lines to the `widget` model:
```ruby
......@@ -547,14 +569,16 @@ Metrics are gathered by `Geo::MetricsUpdateWorker`, persisted in
1. Add the following to `spec/factories/widgets.rb`:
```ruby
trait(:checksummed) do
trait(:verification_succeeded) do
with_file
verification_checksum { 'abc' }
verification_state { Widget.verification_state_value(:verification_succeeded) }
end
trait(:checksum_failure) do
trait(:verification_failed) do
with_file
verification_failure { 'Could not calculate the checksum' }
verification_state { Widget.verification_state_value(:verification_failed) }
end
```
......
......@@ -30,7 +30,7 @@ module Geo
# Bonus: This causes the progress bar to be hidden.
return unless verification_enabled?
model.available_replicables.checksummed.count
model.available_replicables.verification_succeeded.count
end
def checksum_failed_count
......@@ -38,7 +38,7 @@ module Geo
# Bonus: This causes the progress bar to be hidden.
return unless verification_enabled?
model.available_replicables.checksum_failed.count
model.available_replicables.verification_failed.count
end
end
......@@ -47,15 +47,28 @@ module Geo
end
def verify_async
# Marking started prevents backfill (VerificationBatchWorker) from picking
# this up too.
# Also, if another verification job is running, this will make that job
# set state to pending after it finishes, since the calculated checksum
# is already invalidated.
model_record.verification_started!
Geo::VerificationWorker.perform_async(replicable_name, model_record.id)
end
# Calculates checksum and asks the model/registry to update verification
# state.
def verify
# Deduplicate verification job
return unless model_record.verification_started?
calculation_started_at = Time.current
checksum = model_record.calculate_checksum
update_verification_state!(checksum: checksum)
model_record.verification_succeeded_with_checksum!(checksum, calculation_started_at)
rescue => e
log_error('Error calculating the checksum', e)
update_verification_state!(failure: e.message)
model_record.verification_failed_with_message!('Error calculating the checksum', e)
end
# Check if given checksum matches known one
......@@ -83,28 +96,5 @@ module Geo
def secondary_checksum
registry.verification_checksum
end
private
# Update checksum on Geo primary database
#
# @param [String] checksum value generated by the checksum routine
# @param [String] failure (optional) stacktrace from failed execution
def update_verification_state!(checksum: nil, failure: nil)
retry_at, retry_count = calculate_next_retry_attempt if failure.present?
model_record.update!(
verification_checksum: checksum,
verified_at: Time.current,
verification_failure: failure,
verification_retry_at: retry_at,
verification_retry_count: retry_count
)
end
def calculate_next_retry_attempt
retry_count = model_record.verification_retry_count.to_i + 1
[next_retry_time(retry_count), retry_count]
end
end
end
......@@ -24,8 +24,8 @@ module EE
scope :has_external_diffs, -> { with_files.where(stored_externally: true) }
scope :project_id_in, ->(ids) { where(merge_request_id: ::MergeRequest.where(target_project_id: ids)) }
scope :checksummed, -> { where(merge_request_diff_detail: ::MergeRequestDiffDetail.checksummed) }
scope :checksum_failed, -> { where(merge_request_diff_detail: ::MergeRequestDiffDetail.checksum_failed) }
scope :verification_succeeded, -> { where(merge_request_diff_detail: ::MergeRequestDiffDetail.verification_succeeded) }
scope :verification_failed, -> { where(merge_request_diff_detail: ::MergeRequestDiffDetail.verification_failed) }
scope :available_replicables, -> { has_external_diffs }
end
......
......@@ -7,6 +7,8 @@ module EE
prepended do
include ::Gitlab::Geo::ReplicableModel
include ::Gitlab::Geo::VerificationState
with_replicator Geo::PackageFileReplicator
end
......
......@@ -5,6 +5,11 @@ class MergeRequestDiffDetail < ApplicationRecord
belongs_to :merge_request_diff, inverse_of: :merge_request_diff_detail
scope :checksummed, -> { where.not(verification_checksum: nil) }
scope :checksum_failed, -> { where.not(verification_failure: nil) }
# Temporarily defining `verification_succeeded` and
# `verification_failed` for unverified models while verification is
# under development to avoid breaking GeoNodeStatusCheck code.
# TODO: Remove these after including `Gitlab::Geo::VerificationState` on
# all models. https://gitlab.com/gitlab-org/gitlab/-/issues/280768
scope :verification_succeeded, -> { none }
scope :verification_failed, -> { none }
end
......@@ -15,5 +15,9 @@ module Geo
def carrierwave_uploader
model_record.external_diff
end
def needs_checksum?
false
end
end
end
......@@ -11,5 +11,9 @@ module Geo
def self.model
::Terraform::StateVersion
end
def needs_checksum?
false
end
end
end
......@@ -5,18 +5,20 @@ module Gitlab
module ReplicableModel
extend ActiveSupport::Concern
include Checksummable
include ::ShaAttribute
included do
# If this hook turns out not to apply to all Models, perhaps we should extract a `ReplicableBlobModel`
after_create_commit -> { replicator.handle_after_create_commit if replicator.respond_to?(:handle_after_create_commit) }
after_destroy -> { replicator.handle_after_destroy if replicator.respond_to?(:handle_after_destroy) }
scope :checksummed, -> { where.not(verification_checksum: nil) }
scope :checksum_failed, -> { where.not(verification_failure: nil) }
# Temporarily defining `verification_succeeded` and
# `verification_failed` for unverified models while verification is
# under development to avoid breaking GeoNodeStatusCheck code.
# TODO: Remove these after including `Gitlab::Geo::VerificationState` on
# all models. https://gitlab.com/gitlab-org/gitlab/-/issues/280768
scope :verification_succeeded, -> { none }
scope :verification_failed, -> { none }
scope :available_replicables, -> { all }
sha_attribute :verification_checksum
end
class_methods do
......
# frozen_string_literal: true
module Gitlab
module Geo
# This concern is included on ActiveRecord classes to manage their
# verification fields. This concern does not handle how verification is
# performed.
#
# This is a separate concern from Gitlab::Geo::ReplicableModel because e.g.
# MergeRequestDiff stores its verification state in a separate table with
# the association to MergeRequestDiffDetail.
module VerificationState
extend ActiveSupport::Concern
include ::ShaAttribute
include Delay
include Gitlab::Geo::LogHelpers
VERIFICATION_STATE_VALUES = {
verification_pending: 0,
verification_started: 1,
verification_succeeded: 2,
verification_failed: 3
}.freeze
VERIFICATION_TIMEOUT = 8.hours
included do
sha_attribute :verification_checksum
# rubocop:disable CodeReuse/ActiveRecord
scope :verification_pending, -> { with_verification_state(:verification_pending) }
scope :verification_started, -> { with_verification_state(:verification_started) }
scope :verification_succeeded, -> { with_verification_state(:verification_succeeded) }
scope :verification_failed, -> { with_verification_state(:verification_failed) }
scope :checksummed, -> { where.not(verification_checksum: nil) }
scope :not_checksummed, -> { where(verification_checksum: nil) }
scope :never_attempted_verification, -> { verification_pending.where(verification_started_at: nil) }
scope :needs_verification_again, -> { verification_pending.where.not(verification_started_at: nil).or(verification_failed) }
scope :verification_timed_out, -> { verification_started.where("verification_started_at < ?", VERIFICATION_TIMEOUT.ago) }
scope :needs_verification, -> { verification_pending.or(verification_failed) }
# rubocop:enable CodeReuse/ActiveRecord
state_machine :verification_state, initial: :verification_pending do
state :verification_pending, value: VERIFICATION_STATE_VALUES[:verification_pending]
state :verification_started, value: VERIFICATION_STATE_VALUES[:verification_started]
state :verification_succeeded, value: VERIFICATION_STATE_VALUES[:verification_succeeded] do
validates :verification_checksum, presence: true
end
state :verification_failed, value: VERIFICATION_STATE_VALUES[:verification_failed] do
validates :verification_failure, presence: true
end
before_transition any => :verification_started do |instance, _|
instance.verification_started_at = Time.current
end
before_transition any => :verification_pending do |instance, _|
instance.verification_retry_count = 0
instance.verification_retry_at = nil
instance.verification_failure = nil
end
before_transition any => :verification_failed do |instance, _|
instance.verification_checksum = nil
instance.verification_retry_count ||= 0
instance.verification_retry_count += 1
instance.verification_retry_at = instance.next_retry_time(instance.verification_retry_count)
instance.verified_at = Time.current
end
before_transition any => :verification_succeeded do |instance, _|
instance.verification_retry_count = 0
instance.verification_retry_at = nil
instance.verification_failure = nil
instance.verified_at = Time.current
end
event :verification_started do
transition [:verification_pending, :verification_started, :verification_succeeded, :verification_failed] => :verification_started
end
event :verification_succeeded do
transition verification_started: :verification_succeeded
end
event :verification_failed do
transition verification_started: :verification_failed
end
event :verification_pending do
transition [:verification_started, :verification_succeeded, :verification_failed] => :verification_pending
end
end
end
class_methods do
def verification_state_value(state_string)
VERIFICATION_STATE_VALUES[state_string]
end
end
# Convenience method to update checksum and transition to success state.
#
# @param [String] checksum value generated by the checksum routine
# @param [DateTime] calculation_started_at the moment just before the
# checksum routine was called
def verification_succeeded_with_checksum!(checksum, calculation_started_at)
self.verification_checksum = checksum
self.verification_succeeded!
if resource_updated_during_checksum?(calculation_started_at)
# just let backfill pick it up
self.verification_pending!
end
end
def resource_updated_during_checksum?(calculation_started_at)
self.reset.verification_started_at > calculation_started_at
end
# Convenience method to update failure message and transition to failed
# state.
#
# @param [String] message error information
# @param [StandardError] error exception
def verification_failed_with_message!(message, error = nil)
log_error('Error calculating the checksum', error)
self.verification_failure = message
self.verification_failure += ": #{error.message}" if error.respond_to?(:message)
self.verification_failed!
end
end
end
end
# frozen_string_literal: true
FactoryBot.modify do
factory :package_file do
trait(:verification_succeeded) do
verification_checksum { 'abc' }
verification_state { Packages::PackageFile.verification_state_value[:verification_succeeded] }
end
trait(:verification_failed) do
verification_failure { 'Could not calculate the checksum' }
verification_state { Packages::PackageFile.verification_state_value[:verification_failed] }
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Geo::VerificationState do
include ::EE::GeoHelpers
let_it_be(:primary_node) { create(:geo_node, :primary) }
let_it_be(:secondary_node) { create(:geo_node) }
before(:all) do
create_dummy_model_table
end
after(:all) do
drop_dummy_model_table
end
before do
stub_dummy_replicator_class
stub_dummy_model_class
subject.verification_started
subject.save!
end
subject { DummyModel.new }
describe '#verification_succeeded_with_checksum!' do
context 'when the resource was updated during checksum calculation' do
let(:calculation_started_at) { subject.verification_started_at - 1.second }
it 'sets state to pending' do
subject.verification_succeeded_with_checksum!('abc123', calculation_started_at)
expect(subject.reload.verification_pending?).to be_truthy
end
end
context 'when the resource was not updated during checksum calculation' do
let(:calculation_started_at) { subject.verification_started_at + 1.second }
it 'saves the checksum' do
subject.verification_succeeded_with_checksum!('abc123', calculation_started_at)
expect(subject.reload.verification_succeeded?).to be_truthy
expect(subject.reload.verification_checksum).to eq('abc123')
expect(subject.verified_at).not_to be_nil
end
end
end
describe '#verification_failed_with_message!' do
it 'saves the error message and increments retry counter' do
error = double('error', message: 'An error message')
subject.verification_failed_with_message!('Failure to calculate checksum', error)
expect(subject.reload.verification_failed?).to be_truthy
expect(subject.reload.verification_failure).to eq 'Failure to calculate checksum: An error message'
expect(subject.verification_retry_count).to be 1
end
end
end
......@@ -1206,15 +1206,17 @@ RSpec.describe GeoNodeStatus, :geo do
context 'when verification is enabled' do
before do
skip "#{replicator.model} does not include the VerificationState concern yet" unless replicator.model.respond_to?(:verification_state)
stub_current_geo_node(primary)
allow(replicator).to receive(:verification_enabled?).and_return(true)
end
context 'when there are replicables' do
before do
create(model_factory, :checksummed)
create(model_factory, :checksummed)
create(model_factory, :checksum_failure)
create(model_factory, :verification_succeeded)
create(model_factory, :verification_succeeded)
create(model_factory, :verification_failed)
end
describe '#<replicable_name>_checksummed_count' do
......
......@@ -80,6 +80,7 @@ module EE
DummyModel.class_eval do
include ::Gitlab::Geo::ReplicableModel
include ::Gitlab::Geo::VerificationState
with_replicator Geo::DummyReplicator
......@@ -104,6 +105,12 @@ module EE
ActiveRecord::Schema.define do
create_table :dummy_models, force: true do |t|
t.binary :verification_checksum
t.integer :verification_state
t.datetime_with_timezone :verification_started_at
t.datetime_with_timezone :verified_at
t.datetime_with_timezone :verification_retry_at
t.integer :verification_retry_count
t.text :verification_failure
end
end
end
......
......@@ -39,6 +39,80 @@ RSpec.shared_examples 'a verifiable replicator' do
end
end
describe '.checksummed_count' do
context 'when verification is enabled' do
before do
allow(described_class).to receive(:verification_enabled?).and_return(true)
end
it 'returns the number of available replicables where verification succeeded' do
model_record.verification_started!
model_record.verification_succeeded_with_checksum!('some checksum', Time.current)
expect(described_class.checksummed_count).to eq(1)
end
it 'excludes other verification states' do
model_record.verification_started!
expect(described_class.checksummed_count).to eq(0)
model_record.verification_failed_with_message!('some error message')
expect(described_class.checksummed_count).to eq(0)
model_record.verification_pending!
expect(described_class.checksummed_count).to eq(0)
end
end
context 'when verification is disabled' do
it 'returns nil' do
allow(described_class).to receive(:verification_enabled?).and_return(false)
expect(described_class.checksummed_count).to be_nil
end
end
end
describe '.checksum_failed_count' do
context 'when verification is enabled' do
before do
allow(described_class).to receive(:verification_enabled?).and_return(true)
end
it 'returns the number of available replicables where verification failed' do
model_record.verification_started!
model_record.verification_failed_with_message!('some error message')
expect(described_class.checksum_failed_count).to eq(1)
end
it 'excludes other verification states' do
model_record.verification_started!
expect(described_class.checksum_failed_count).to eq(0)
model_record.verification_succeeded_with_checksum!('foo', Time.current)
expect(described_class.checksum_failed_count).to eq(0)
model_record.verification_pending!
expect(described_class.checksum_failed_count).to eq(0)
end
end
context 'when verification is disabled' do
it 'returns nil' do
allow(described_class).to receive(:verification_enabled?).and_return(false)
expect(described_class.checksum_failed_count).to be_nil
end
end
end
describe '#after_verifiable_update' do
it 'calls verify_async if needed' do
expect(replicator).to receive(:verify_async)
......@@ -48,29 +122,66 @@ RSpec.shared_examples 'a verifiable replicator' do
end
end
describe '#verify' do
describe '#verify_async' do
before do
model_record.save!
end
it 'calculates the checksum' do
expect(model_record).to receive(:calculate_checksum).and_return('abc123')
context 'on a Geo primary' do
before do
stub_primary_node
end
replicator.verify
it 'calls verification_started! and enqueues VerificationWorker' do
expect(model_record).to receive(:verification_started!)
expect(Geo::VerificationWorker).to receive(:perform_async).with(replicator.replicable_name, model_record.id)
expect(model_record.reload.verification_checksum).to eq('abc123')
expect(model_record.verified_at).not_to be_nil
replicator.verify_async
end
end
end
it 'saves the error message and increments retry counter' do
allow(model_record).to receive(:calculate_checksum) do
raise StandardError.new('Failure to calculate checksum')
describe '#verify' do
context 'on a Geo primary' do
before do
stub_primary_node
end
context 'when verification was started' do
before do
model_record.verification_started!
end
context 'when the checksum succeeds' do
it 'delegates checksum calculation and the state change to model_record' do
expect(model_record).to receive(:calculate_checksum).and_return('abc123')
expect(model_record).to receive(:verification_succeeded_with_checksum!).with('abc123', kind_of(Time))
replicator.verify
end
end
context 'when an error is raised during calculate_checksum' do
it 'passes the error message' do
error = StandardError.new('Some exception')
allow(model_record).to receive(:calculate_checksum) do
raise error
end
expect(model_record).to receive(:verification_failed_with_message!).with('Error calculating the checksum', error)
replicator.verify
end
end
end
replicator.verify
context 'when verification was not started' do
it 'does not call calculate_checksum!' do
expect(model_record).not_to receive(:calculate_checksum)
expect(model_record.reload.verification_failure).to eq 'Failure to calculate checksum'
expect(model_record.verification_retry_count).to be 1
replicator.verify
end
end
end
end
end
......@@ -21,7 +21,7 @@ RSpec.describe Geo::VerificationWorker, :geo do
context 'when on a primary node' do
before do
stub_primary_node
package_file.update!(verification_checksum: nil)
package_file.verification_started!
end
it_behaves_like 'an idempotent worker' do
......
......@@ -152,14 +152,6 @@ FactoryBot.define do
file_store { Packages::PackageFileUploader::Store::REMOTE }
end
trait(:checksummed) do
verification_checksum { 'abc' }
end
trait(:checksum_failure) do
verification_failure { 'Could not calculate the checksum' }
end
factory :package_file_with_file, traits: [:jar]
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment