Commit c6955deb authored by pbair's avatar pbair

Move batched BG helpers to separate module

Move the migration helpers for batched background migrations into a
separate module, to make it clear that they are a separate approach to
background migrations and incompatible with the existing and familiar
background migration helpers.
parent ab8d113a
......@@ -4,6 +4,7 @@ module Gitlab
module Database
module MigrationHelpers
include Migrations::BackgroundMigrationHelpers
include Migrations::BatchedBackgroundMigrationHelpers
include DynamicModelHelpers
include RenameTableHelpers
include AsyncIndexes::MigrationHelpers
......
......@@ -5,11 +5,7 @@ module Gitlab
module Migrations
module BackgroundMigrationHelpers
BATCH_SIZE = 1_000 # Number of rows to process per job
SUB_BATCH_SIZE = 100 # Number of rows to process per sub-batch
JOB_BUFFER_SIZE = 1_000 # Number of jobs to bulk queue at a time
BATCH_CLASS_NAME = 'PrimaryKeyBatchingStrategy' # Default batch class for batched migrations
BATCH_MIN_VALUE = 1 # Default minimum value for batched migrations
BATCH_MIN_DELAY = 2.minutes.freeze # Minimum delay between batched migrations
# Bulk queues background migration jobs for an entire table, batched by ID range.
# "Bulk" meaning many jobs will be pushed at a time for efficiency.
......@@ -170,102 +166,6 @@ module Gitlab
duration
end
# Creates a batched background migration for the given table. A batched migration runs one job
# at a time, computing the bounds of the next batch based on the current migration settings and the previous
# batch bounds. Each job's execution status is tracked in the database as the migration runs. The given job
# class must be present in the Gitlab::BackgroundMigration module, and the batch class (if specified) must be
# present in the Gitlab::BackgroundMigration::BatchingStrategies module.
#
# If a migration with the same job_class_name, table_name, column_name, and job_arguments already exists, this
# helper will log a warning and not create a new one.
#
# job_class_name - The background migration job class as a string
# batch_table_name - The name of the table the migration will batch over
# batch_column_name - The name of the column the migration will batch over
# job_arguments - Extra arguments to pass to the job instance when the migration runs
# job_interval - The pause interval between each job's execution, minimum of 2 minutes
# batch_min_value - The value in the column the batching will begin at
# batch_max_value - The value in the column the batching will end at, defaults to `SELECT MAX(batch_column)`
# batch_class_name - The name of the class that will be called to find the range of each next batch
# batch_size - The maximum number of rows per job
# sub_batch_size - The maximum number of rows processed per "iteration" within the job
#
#
# *Returns the created BatchedMigration record*
#
# Example:
#
# queue_batched_background_migration(
# 'CopyColumnUsingBackgroundMigrationJob',
# :events,
# :id,
# 'column1',
# 'column2',
# job_interval: 2.minutes)
#
# Where the background migration exists:
#
# class Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob
# def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, *other_args)
# # do something
# end
# end
def queue_batched_background_migration( # rubocop:disable Metrics/ParameterLists
  job_class_name,
  batch_table_name,
  batch_column_name,
  *job_arguments,
  job_interval:,
  batch_min_value: BATCH_MIN_VALUE,
  batch_max_value: nil,
  batch_class_name: BATCH_CLASS_NAME,
  batch_size: BATCH_SIZE,
  sub_batch_size: SUB_BATCH_SIZE
)
  batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration

  # A migration with the same configuration is never queued twice: log a warning and return nil.
  duplicates = batched_migrations.for_configuration(
    job_class_name, batch_table_name, batch_column_name, job_arguments)

  if duplicates.exists?
    Gitlab::AppLogger.warn "Batched background migration not enqueued because it already exists: " \
      "job_class_name: #{job_class_name}, table_name: #{batch_table_name}, column_name: #{batch_column_name}, " \
      "job_arguments: #{job_arguments.inspect}"

    return
  end

  # Intervals shorter than BATCH_MIN_DELAY are raised to BATCH_MIN_DELAY.
  effective_interval = job_interval < BATCH_MIN_DELAY ? BATCH_MIN_DELAY : job_interval

  # Without an explicit upper bound, fall back to the current maximum of the batching column.
  upper_bound = batch_max_value || connection.select_value(<<~SQL)
    SELECT MAX(#{connection.quote_column_name(batch_column_name)})
    FROM #{connection.quote_table_name(batch_table_name)}
  SQL

  # A nil upper bound (no maximum found) means there is nothing to process: the migration is
  # recorded as finished, and max_value falls back to batch_min_value.
  initial_status = upper_bound.nil? ? :finished : :active

  attributes = {
    job_class_name: job_class_name,
    table_name: batch_table_name,
    column_name: batch_column_name,
    job_arguments: job_arguments,
    interval: effective_interval,
    min_value: batch_min_value,
    max_value: upper_bound || batch_min_value,
    batch_class_name: batch_class_name,
    batch_size: batch_size,
    sub_batch_size: sub_batch_size,
    status: initial_status
  }
  record = batched_migrations.create!(attributes)

  # #total_tuple_count was only introduced schema-wise after this helper had been used for the
  # first time, so it is only populated when the record exposes it.
  if record.respond_to?(:total_tuple_count)
    # The estimated number of tuples is kept to reason later about the overall progress of the migration.
    record.total_tuple_count = Gitlab::Database::SharedModel.using_connection(connection) do
      Gitlab::Database::PgClass.for_table(batch_table_name)&.cardinality_estimate
    end

    record.save!
  end

  record
end
# Force a background migration to complete.
#
# WARNING: This method will block the caller and move the background migration from an
......
# frozen_string_literal: true
module Gitlab
  module Database
    module Migrations
      # Helpers for batched background migrations, a new approach to scheduling and executing background
      # migrations that tracks each migration through persistent state in the database. This avoids having to
      # batch over an entire table and schedule a large number of sidekiq jobs upfront. It is also more flexible
      # while the migration runs: a migration can be paused and restarted, and configuration values such as the
      # batch size can be updated dynamically.
      #
      # For now, these migrations are not considered ready for general use, for more information see the
      # tracking epic: https://gitlab.com/groups/gitlab-org/-/epics/6751
      module BatchedBackgroundMigrationHelpers
        BATCH_SIZE = 1_000 # Default number of rows to process per job
        SUB_BATCH_SIZE = 100 # Default number of rows to process per sub-batch
        BATCH_CLASS_NAME = 'PrimaryKeyBatchingStrategy' # Default batch class for batched migrations
        BATCH_MIN_VALUE = 1 # Default minimum value for batched migrations
        BATCH_MIN_DELAY = 2.minutes.freeze # Minimum delay between batched migrations

        # Creates a batched background migration for the given table. A batched migration runs one job at a
        # time, computing the bounds of the next batch based on the current migration settings and the previous
        # batch bounds. Each job's execution status is tracked in the database as the migration runs. The given
        # job class must be present in the Gitlab::BackgroundMigration module, and the batch class (if specified)
        # must be present in the Gitlab::BackgroundMigration::BatchingStrategies module.
        #
        # If a migration with the same job_class_name, table_name, column_name, and job_arguments already
        # exists, this helper logs a warning, does not create a new one, and returns nil.
        #
        # job_class_name - The background migration job class as a string
        # batch_table_name - The name of the table the migration will batch over
        # batch_column_name - The name of the column the migration will batch over
        # job_arguments - Extra positional arguments to pass to the job instance when the migration runs
        # job_interval - The pause interval between each job's execution, minimum of 2 minutes
        # batch_min_value - The value in the column the batching will begin at
        # batch_max_value - The value in the column the batching will end at, defaults to `SELECT MAX(batch_column)`
        # batch_class_name - The name of the class that will be called to find the range of each next batch
        # batch_size - The maximum number of rows per job
        # sub_batch_size - The maximum number of rows processed per "iteration" within the job
        #
        # *Returns the created BatchedMigration record*
        #
        # Example:
        #
        #     queue_batched_background_migration(
        #       'CopyColumnUsingBackgroundMigrationJob',
        #       :events,
        #       :id,
        #       'column1',
        #       'column2',
        #       job_interval: 2.minutes)
        #
        # Where the background migration exists:
        #
        #     class Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob
        #       def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, *other_args)
        #         # do something
        #       end
        #     end
        def queue_batched_background_migration( # rubocop:disable Metrics/ParameterLists
          job_class_name,
          batch_table_name,
          batch_column_name,
          *job_arguments,
          job_interval:,
          batch_min_value: BATCH_MIN_VALUE,
          batch_max_value: nil,
          batch_class_name: BATCH_CLASS_NAME,
          batch_size: BATCH_SIZE,
          sub_batch_size: SUB_BATCH_SIZE
        )
          batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration

          # A migration with the same configuration is never queued twice: log a warning and return nil.
          duplicates = batched_migrations.for_configuration(
            job_class_name, batch_table_name, batch_column_name, job_arguments)

          if duplicates.exists?
            Gitlab::AppLogger.warn "Batched background migration not enqueued because it already exists: " \
              "job_class_name: #{job_class_name}, table_name: #{batch_table_name}, column_name: #{batch_column_name}, " \
              "job_arguments: #{job_arguments.inspect}"

            return
          end

          # Intervals shorter than BATCH_MIN_DELAY are raised to BATCH_MIN_DELAY.
          effective_interval = job_interval < BATCH_MIN_DELAY ? BATCH_MIN_DELAY : job_interval

          # Without an explicit upper bound, fall back to the current maximum of the batching column.
          upper_bound = batch_max_value || connection.select_value(<<~SQL)
            SELECT MAX(#{connection.quote_column_name(batch_column_name)})
            FROM #{connection.quote_table_name(batch_table_name)}
          SQL

          # A nil upper bound (no maximum found) means there is nothing to process: the migration is
          # recorded as finished, and max_value falls back to batch_min_value.
          initial_status = upper_bound.nil? ? :finished : :active

          attributes = {
            job_class_name: job_class_name,
            table_name: batch_table_name,
            column_name: batch_column_name,
            job_arguments: job_arguments,
            interval: effective_interval,
            min_value: batch_min_value,
            max_value: upper_bound || batch_min_value,
            batch_class_name: batch_class_name,
            batch_size: batch_size,
            sub_batch_size: sub_batch_size,
            status: initial_status
          }
          record = batched_migrations.create!(attributes)

          # #total_tuple_count was only introduced schema-wise after this helper had been used for the
          # first time, so it is only populated when the record exposes it.
          if record.respond_to?(:total_tuple_count)
            # The estimated number of tuples is kept to reason later about the overall progress of the migration.
            record.total_tuple_count = Gitlab::Database::SharedModel.using_connection(connection) do
              Gitlab::Database::PgClass.for_table(batch_table_name)&.cardinality_estimate
            end

            record.save!
          end

          record
        end
      end
    end
  end
end
......@@ -354,161 +354,6 @@ RSpec.describe Gitlab::Database::Migrations::BackgroundMigrationHelpers do
end
end
describe '#queue_batched_background_migration' do
  let(:batched_migrations) { Gitlab::Database::BackgroundMigration::BatchedMigration }
  let(:pgclass_info) { instance_double('Gitlab::Database::PgClass', cardinality_estimate: 42) }

  # A non-default value for every option, shared by the examples that override them all.
  let(:custom_settings) do
    {
      job_interval: 5.minutes,
      batch_min_value: 5,
      batch_max_value: 1000,
      batch_class_name: 'MyBatchClass',
      batch_size: 100,
      sub_batch_size: 10
    }
  end

  before do
    # The real lookup runs unless an example sets its own expectation on it.
    allow(Gitlab::Database::PgClass).to receive(:for_table).and_call_original
  end

  context 'when such migration already exists' do
    it 'does not create duplicate migration' do
      # Same job class, table, column and job arguments as the call below; the other settings differ.
      create(
        :batched_background_migration,
        job_class_name: 'MyJobClass',
        table_name: :projects,
        column_name: :id,
        interval: 10.minutes,
        min_value: 5,
        max_value: 1005,
        batch_class_name: 'MyBatchClass',
        batch_size: 200,
        sub_batch_size: 20,
        job_arguments: [[:id], [:id_convert_to_bigint]]
      )

      expect do
        model.queue_batched_background_migration(
          'MyJobClass', :projects, :id, [:id], [:id_convert_to_bigint], **custom_settings)
      end.not_to change(batched_migrations, :count)
    end
  end

  it 'creates the database record for the migration' do
    expect(Gitlab::Database::PgClass).to receive(:for_table).with(:projects).and_return(pgclass_info)

    expect { model.queue_batched_background_migration('MyJobClass', :projects, :id, **custom_settings) }
      .to change(batched_migrations, :count).by(1)

    expect(batched_migrations.last).to have_attributes(
      job_class_name: 'MyJobClass',
      table_name: 'projects',
      column_name: 'id',
      interval: 300,
      min_value: 5,
      max_value: 1000,
      batch_class_name: 'MyBatchClass',
      batch_size: 100,
      sub_batch_size: 10,
      job_arguments: [],
      status: 'active',
      total_tuple_count: pgclass_info.cardinality_estimate)
  end

  context 'when the job interval is lower than the minimum' do
    let(:minimum_delay) { described_class::BATCH_MIN_DELAY }

    it 'sets the job interval to the minimum value' do
      too_short = minimum_delay - 1.minute

      expect { model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: too_short) }
        .to change(batched_migrations, :count).by(1)

      expect(batched_migrations.last.interval).to eq(minimum_delay)
    end
  end

  context 'when additional arguments are passed to the method' do
    it 'saves the arguments on the database record' do
      expect do
        model.queue_batched_background_migration(
          'MyJobClass', :projects, :id, 'my', 'arguments', job_interval: 5.minutes, batch_max_value: 1000)
      end.to change(batched_migrations, :count).by(1)

      expect(batched_migrations.last).to have_attributes(
        job_class_name: 'MyJobClass',
        table_name: 'projects',
        column_name: 'id',
        interval: 300,
        min_value: 1,
        max_value: 1000,
        job_arguments: %w[my arguments])
    end
  end

  context 'when the max_value is not given' do
    context 'when records exist in the database' do
      # Three events created in order; the last one is the one referenced for the expected max value.
      let!(:events) { Array.new(3) { create(:event) } }

      it 'creates the record with the current max value' do
        expect { model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
          .to change(batched_migrations, :count).by(1)

        expect(batched_migrations.last.max_value).to eq(events.last.id)
      end

      it 'creates the record with an active status' do
        expect { model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
          .to change(batched_migrations, :count).by(1)

        expect(batched_migrations.last).to be_active
      end
    end

    context 'when the database is empty' do
      it 'sets the max value to the min value' do
        expect { model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
          .to change(batched_migrations, :count).by(1)

        queued = batched_migrations.last

        expect(queued.max_value).to eq(queued.min_value)
      end

      it 'creates the record with a finished status' do
        expect { model.queue_batched_background_migration('MyJobClass', :projects, :id, job_interval: 5.minutes) }
          .to change(batched_migrations, :count).by(1)

        expect(batched_migrations.last).to be_finished
      end
    end
  end
end
describe '#migrate_async' do
it 'calls BackgroundMigrationWorker.perform_async' do
expect(BackgroundMigrationWorker).to receive(:perform_async).with("Class", "hello", "world")
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::Migrations::BatchedBackgroundMigrationHelpers do
  # A bare migration instance extended with the helper module under test.
  let(:migration) { ActiveRecord::Migration.new.extend(described_class) }

  describe '#queue_batched_background_migration' do
    let(:batched_migrations) { Gitlab::Database::BackgroundMigration::BatchedMigration }
    let(:pgclass_info) { instance_double('Gitlab::Database::PgClass', cardinality_estimate: 42) }

    # A non-default value for every option, shared by the examples that override them all.
    let(:custom_settings) do
      {
        job_interval: 5.minutes,
        batch_min_value: 5,
        batch_max_value: 1000,
        batch_class_name: 'MyBatchClass',
        batch_size: 100,
        sub_batch_size: 10
      }
    end

    before do
      # The real lookup runs unless an example sets its own expectation on it.
      allow(Gitlab::Database::PgClass).to receive(:for_table).and_call_original
    end

    context 'when such migration already exists' do
      it 'does not create duplicate migration' do
        # Same job class, table, column and job arguments as the call below; the other settings differ.
        create(
          :batched_background_migration,
          job_class_name: 'MyJobClass',
          table_name: :projects,
          column_name: :id,
          interval: 10.minutes,
          min_value: 5,
          max_value: 1005,
          batch_class_name: 'MyBatchClass',
          batch_size: 200,
          sub_batch_size: 20,
          job_arguments: [[:id], [:id_convert_to_bigint]]
        )

        expect do
          migration.queue_batched_background_migration(
            'MyJobClass', :projects, :id, [:id], [:id_convert_to_bigint], **custom_settings)
        end.not_to change(batched_migrations, :count)
      end
    end

    it 'creates the database record for the migration' do
      expect(Gitlab::Database::PgClass).to receive(:for_table).with(:projects).and_return(pgclass_info)

      expect { migration.queue_batched_background_migration('MyJobClass', :projects, :id, **custom_settings) }
        .to change(batched_migrations, :count).by(1)

      expect(batched_migrations.last).to have_attributes(
        job_class_name: 'MyJobClass',
        table_name: 'projects',
        column_name: 'id',
        interval: 300,
        min_value: 5,
        max_value: 1000,
        batch_class_name: 'MyBatchClass',
        batch_size: 100,
        sub_batch_size: 10,
        job_arguments: [],
        status: 'active',
        total_tuple_count: pgclass_info.cardinality_estimate)
    end

    context 'when the job interval is lower than the minimum' do
      let(:minimum_delay) { described_class::BATCH_MIN_DELAY }

      it 'sets the job interval to the minimum value' do
        too_short = minimum_delay - 1.minute

        expect { migration.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: too_short) }
          .to change(batched_migrations, :count).by(1)

        expect(batched_migrations.last.interval).to eq(minimum_delay)
      end
    end

    context 'when additional arguments are passed to the method' do
      it 'saves the arguments on the database record' do
        expect do
          migration.queue_batched_background_migration(
            'MyJobClass', :projects, :id, 'my', 'arguments', job_interval: 5.minutes, batch_max_value: 1000)
        end.to change(batched_migrations, :count).by(1)

        expect(batched_migrations.last).to have_attributes(
          job_class_name: 'MyJobClass',
          table_name: 'projects',
          column_name: 'id',
          interval: 300,
          min_value: 1,
          max_value: 1000,
          job_arguments: %w[my arguments])
      end
    end

    context 'when the max_value is not given' do
      context 'when records exist in the database' do
        # Three events created in order; the last one is the one referenced for the expected max value.
        let!(:events) { Array.new(3) { create(:event) } }

        it 'creates the record with the current max value' do
          expect { migration.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
            .to change(batched_migrations, :count).by(1)

          expect(batched_migrations.last.max_value).to eq(events.last.id)
        end

        it 'creates the record with an active status' do
          expect { migration.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
            .to change(batched_migrations, :count).by(1)

          expect(batched_migrations.last).to be_active
        end
      end

      context 'when the database is empty' do
        it 'sets the max value to the min value' do
          expect { migration.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes) }
            .to change(batched_migrations, :count).by(1)

          queued = batched_migrations.last

          expect(queued.max_value).to eq(queued.min_value)
        end

        it 'creates the record with a finished status' do
          expect { migration.queue_batched_background_migration('MyJobClass', :projects, :id, job_interval: 5.minutes) }
            .to change(batched_migrations, :count).by(1)

          expect(batched_migrations.last).to be_finished
        end
      end
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment