Commit 0d0d6cf1 authored by Andreas Brandl's avatar Andreas Brandl

Merge branch 'backfill-project-namespaces-for-specific-group' into 'master'

Backfill project namespaces for specific group

See merge request gitlab-org/gitlab!73640
parents 58161d1f a5dd3b7e
# frozen_string_literal: true
class BackfillProjectNamespacesForGroup < Gitlab::Database::Migration[1.0]
MIGRATION = 'ProjectNamespaces::BackfillProjectNamespaces'
DELAY_INTERVAL = 2.minutes
GROUP_ID = 9970 # picking gitlab-org group.
disable_ddl_transaction!
def up
return unless Gitlab.com? || Gitlab.staging?
projects_table = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models::Project.arel_table
hierarchy_cte_sql = Arel.sql(::Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNamespaces.hierarchy_cte(GROUP_ID))
group_projects = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models::Project.where(projects_table[:namespace_id].in(hierarchy_cte_sql))
min_id = group_projects.minimum(:id)
max_id = group_projects.maximum(:id)
return if min_id.blank? || max_id.blank?
queue_batched_background_migration(
MIGRATION,
:projects,
:id,
GROUP_ID,
'up',
job_interval: DELAY_INTERVAL,
batch_min_value: min_id,
batch_max_value: max_id,
sub_batch_size: 25,
batch_class_name: 'BackfillProjectNamespacePerGroupBatchingStrategy'
)
end
def down
return unless Gitlab.com? || Gitlab.staging?
Gitlab::Database::BackgroundMigration::BatchedMigration
.for_configuration(MIGRATION, :projects, :id, [GROUP_ID, 'up']).delete_all
end
end
d2d270a335b3a2441a20673bf19d47553f607533d4503e3a01bc3d6d108bcdb3
\ No newline at end of file
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module BatchingStrategies
# Batching class to use for back-filling project namespaces for a single group.
# Batches over the projects table and id column combination, scoped to a given group returning the MIN() and MAX()
# values for the next batch as an array.
#
# If no more batches exist in the table, returns nil.
class BackfillProjectNamespacePerGroupBatchingStrategy < PrimaryKeyBatchingStrategy
# Finds and returns the next batch in the table.
#
# table_name - The table to batch over
# column_name - The column to batch over
# batch_min_value - The minimum value which the next batch will start at
# batch_size - The size of the next batch
# job_arguments - The migration job arguments
def next_batch(table_name, column_name, batch_min_value:, batch_size:, job_arguments:)
next_batch_bounds = nil
model_class = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models::Project
quoted_column_name = model_class.connection.quote_column_name(column_name)
projects_table = model_class.arel_table
hierarchy_cte_sql = Arel::Nodes::SqlLiteral.new(::Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNamespaces.hierarchy_cte(job_arguments.first))
relation = model_class.where(projects_table[:namespace_id].in(hierarchy_cte_sql)).where("#{quoted_column_name} >= ?", batch_min_value)
relation.each_batch(of: batch_size, column: column_name) do |batch| # rubocop:disable Lint/UnreachableLoop
next_batch_bounds = batch.pluck(Arel.sql("MIN(#{quoted_column_name}), MAX(#{quoted_column_name})")).first
break
end
next_batch_bounds
end
end
end
end
end
......@@ -17,7 +17,8 @@ module Gitlab
# column_name - The column to batch over
# batch_min_value - The minimum value which the next batch will start at
# batch_size - The size of the next batch
def next_batch(table_name, column_name, batch_min_value:, batch_size:)
# job_arguments - The migration job arguments
def next_batch(table_name, column_name, batch_min_value:, batch_size:, job_arguments:)
model_class = define_batchable_model(table_name)
quoted_column_name = model_class.connection.quote_column_name(column_name)
......
......@@ -5,19 +5,15 @@ module Gitlab
module ProjectNamespaces
# Back-fill project namespaces for projects that do not yet have a namespace.
#
# TODO: remove this comment when an actuall backfill migration is added.
#
# This is first being added without an actual migration as we need to initially test
# if backfilling project namespaces affects performance in any significant way.
# rubocop: disable Metrics/ClassLength
class BackfillProjectNamespaces
BATCH_SIZE = 100
DELETE_BATCH_SIZE = 10
SUB_BATCH_SIZE = 25
PROJECT_NAMESPACE_STI_NAME = 'Project'
IsolatedModels = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models
def perform(start_id, end_id, namespace_id, migration_type = 'up')
def perform(start_id, end_id, migration_table_name, migration_column_name, sub_batch_size, pause_ms, namespace_id, migration_type = 'up')
@sub_batch_size = sub_batch_size || SUB_BATCH_SIZE
load_project_ids(start_id, end_id, namespace_id)
case migration_type
......@@ -34,10 +30,13 @@ module Gitlab
private
attr_accessor :project_ids
attr_accessor :project_ids, :sub_batch_size
def backfill_project_namespaces(namespace_id)
project_ids.each_slice(BATCH_SIZE) do |project_ids|
project_ids.each_slice(sub_batch_size) do |project_ids|
ActiveRecord::Base.connection.execute("select gin_clean_pending_list('index_namespaces_on_name_trigram')")
ActiveRecord::Base.connection.execute("select gin_clean_pending_list('index_namespaces_on_path_trigram')")
# We need to lock these project records for the period when we create project namespaces
# and link them to projects so that if a project is modified in the time between creating
# project namespaces `batch_insert_namespaces` and linking them to projects `batch_update_projects`
......@@ -45,18 +44,17 @@ module Gitlab
#
# see https://gitlab.com/gitlab-org/gitlab/-/merge_requests/72527#note_730679469
Project.transaction do
Project.where(id: project_ids).select(:id).lock!('FOR UPDATE')
Project.where(id: project_ids).select(:id).lock!('FOR UPDATE').load
batch_insert_namespaces(project_ids)
batch_update_projects(project_ids)
batch_update_project_namespaces_traversal_ids(project_ids)
end
batch_update_project_namespaces_traversal_ids(project_ids)
end
end
def cleanup_backfilled_project_namespaces(namespace_id)
project_ids.each_slice(BATCH_SIZE) do |project_ids|
project_ids.each_slice(sub_batch_size) do |project_ids|
# IMPORTANT: first nullify project_namespace_id in projects table to avoid removing projects when records
# from namespaces are deleted due to FK/triggers
nullify_project_namespaces_in_projects(project_ids)
......@@ -109,7 +107,10 @@ module Gitlab
end
def delete_project_namespace_records(project_ids)
project_ids.each_slice(DELETE_BATCH_SIZE) do |p_ids|
# keep the deletes a 10x smaller batch as deletes seem to be much more expensive
delete_batch_size = (sub_batch_size / 10).to_i + 1
project_ids.each_slice(delete_batch_size) do |p_ids|
IsolatedModels::Namespace.where(type: PROJECT_NAMESPACE_STI_NAME).where(tmp_project_id: p_ids).delete_all
end
end
......@@ -117,7 +118,7 @@ module Gitlab
def load_project_ids(start_id, end_id, namespace_id)
projects = IsolatedModels::Project.arel_table
relation = IsolatedModels::Project.where(projects[:id].between(start_id..end_id))
relation = relation.where(projects[:namespace_id].in(Arel::Nodes::SqlLiteral.new(hierarchy_cte(namespace_id)))) if namespace_id
relation = relation.where(projects[:namespace_id].in(Arel::Nodes::SqlLiteral.new(self.class.hierarchy_cte(namespace_id)))) if namespace_id
@project_ids = relation.pluck(:id)
end
......@@ -126,7 +127,7 @@ module Gitlab
::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('BackfillProjectNamespaces', arguments)
end
def hierarchy_cte(root_namespace_id)
def self.hierarchy_cte(root_namespace_id)
<<-SQL
WITH RECURSIVE "base_and_descendants" AS (
(
......
......@@ -95,7 +95,8 @@ module Gitlab
active_migration.table_name,
active_migration.column_name,
batch_min_value: batch_min_value,
batch_size: active_migration.batch_size)
batch_size: active_migration.batch_size,
job_arguments: active_migration.job_arguments)
return if next_batch_bounds.nil?
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::BatchingStrategies::BackfillProjectNamespacePerGroupBatchingStrategy, '#next_batch' do
let!(:namespaces) { table(:namespaces) }
let!(:projects) { table(:projects) }
let!(:background_migrations) { table(:batched_background_migrations) }
let!(:namespace1) { namespaces.create!(name: 'batchtest1', type: 'Group', path: 'batch-test1') }
let!(:namespace2) { namespaces.create!(name: 'batchtest2', type: 'Group', parent_id: namespace1.id, path: 'batch-test2') }
let!(:namespace3) { namespaces.create!(name: 'batchtest3', type: 'Group', parent_id: namespace2.id, path: 'batch-test3') }
let!(:project1) { projects.create!(name: 'project1', path: 'project1', namespace_id: namespace1.id, visibility_level: 20) }
let!(:project2) { projects.create!(name: 'project2', path: 'project2', namespace_id: namespace2.id, visibility_level: 20) }
let!(:project3) { projects.create!(name: 'project3', path: 'project3', namespace_id: namespace3.id, visibility_level: 20) }
let!(:project4) { projects.create!(name: 'project4', path: 'project4', namespace_id: namespace3.id, visibility_level: 20) }
let!(:batching_strategy) { described_class.new }
let(:job_arguments) { [namespace1.id, 'up'] }
context 'when starting on the first batch' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:projects, :id, batch_min_value: project1.id, batch_size: 3, job_arguments: job_arguments)
expect(batch_bounds).to match_array([project1.id, project3.id])
end
end
context 'when additional batches remain' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:projects, :id, batch_min_value: project2.id, batch_size: 3, job_arguments: job_arguments)
expect(batch_bounds).to match_array([project2.id, project4.id])
end
end
context 'when on the final batch' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:projects, :id, batch_min_value: project4.id, batch_size: 3, job_arguments: job_arguments)
expect(batch_bounds).to match_array([project4.id, project4.id])
end
end
context 'when no additional batches remain' do
it 'returns nil' do
batch_bounds = batching_strategy.next_batch(:projects, :id, batch_min_value: project4.id + 1, batch_size: 1, job_arguments: job_arguments)
expect(batch_bounds).to be_nil
end
end
end
......@@ -13,7 +13,7 @@ RSpec.describe Gitlab::BackgroundMigration::BatchingStrategies::PrimaryKeyBatchi
context 'when starting on the first batch' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace1.id, batch_size: 3)
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace1.id, batch_size: 3, job_arguments: nil)
expect(batch_bounds).to eq([namespace1.id, namespace3.id])
end
......@@ -21,7 +21,7 @@ RSpec.describe Gitlab::BackgroundMigration::BatchingStrategies::PrimaryKeyBatchi
context 'when additional batches remain' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace2.id, batch_size: 3)
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace2.id, batch_size: 3, job_arguments: nil)
expect(batch_bounds).to eq([namespace2.id, namespace4.id])
end
......@@ -29,7 +29,7 @@ RSpec.describe Gitlab::BackgroundMigration::BatchingStrategies::PrimaryKeyBatchi
context 'when on the final batch' do
it 'returns the bounds of the next batch' do
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace4.id, batch_size: 3)
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace4.id, batch_size: 3, job_arguments: nil)
expect(batch_bounds).to eq([namespace4.id, namespace4.id])
end
......@@ -37,7 +37,7 @@ RSpec.describe Gitlab::BackgroundMigration::BatchingStrategies::PrimaryKeyBatchi
context 'when no additional batches remain' do
it 'returns nil' do
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace4.id + 1, batch_size: 1)
batch_bounds = batching_strategy.next_batch(:namespaces, :id, batch_min_value: namespace4.id + 1, batch_size: 1, job_arguments: nil)
expect(batch_bounds).to be_nil
end
......
......@@ -30,7 +30,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
projects_count = ::Project.count
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
batches_count = (projects_count / described_class::SUB_BATCH_SIZE.to_f).ceil
project_namespaces_count = ::Namespace.where(type: 'Project').count
migration = described_class.new
......@@ -39,7 +39,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(Namespace.where(type: 'Project'), :count)
expect { migration.perform(start_id, end_id, nil, nil, nil, nil, nil, 'up') }.to change(Namespace.where(type: 'Project'), :count)
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
check_projects_in_sync_with(Namespace.where(type: 'Project'))
......@@ -53,7 +53,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
batches_count = (group_projects_count / described_class::SUB_BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
......@@ -66,7 +66,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
expect(group_projects_count).to eq(14)
expect(project_namespaces_in_hierarchy.count).to eq(0)
migration.perform(start_id, end_id, backfilled_namespace.id, 'up')
migration.perform(start_id, end_id, nil, nil, nil, nil, backfilled_namespace.id, 'up')
expect(project_namespaces_in_hierarchy.count).to eq(14)
check_projects_in_sync_with(project_namespaces_in_hierarchy)
......@@ -79,7 +79,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = hierarchy1_projects.minimum(:id)
end_id = hierarchy1_projects.maximum(:id)
described_class.new.perform(start_id, end_id, parent_group1.id, 'up')
described_class.new.perform(start_id, end_id, nil, nil, nil, nil, parent_group1.id, 'up')
end
it 'does not duplicate project namespaces' do
......@@ -87,7 +87,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
projects_count = ::Project.count
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
batches_count = (projects_count / described_class::SUB_BATCH_SIZE.to_f).ceil
project_namespaces = ::Namespace.where(type: 'Project')
migration = described_class.new
......@@ -100,7 +100,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(project_namespaces, :count).by(14)
expect { migration.perform(start_id, end_id, nil, nil, nil, nil, nil, 'up') }.to change(project_namespaces, :count).by(14)
expect(projects_count).to eq(project_namespaces.count)
end
......@@ -125,7 +125,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
context 'back-fill project namespaces in batches' do
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
stub_const("#{described_class.name}::SUB_BATCH_SIZE", 2)
end
it_behaves_like 'back-fill project namespaces'
......@@ -137,7 +137,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
# back-fill first
described_class.new.perform(start_id, end_id, nil, 'up')
described_class.new.perform(start_id, end_id, nil, nil, nil, nil, nil, 'up')
end
shared_examples 'cleanup project namespaces' do
......@@ -146,7 +146,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
migration = described_class.new
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
batches_count = (projects_count / described_class::SUB_BATCH_SIZE.to_f).ceil
expect(projects_count).to be > 0
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
......@@ -154,7 +154,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, nil, 'down')
migration.perform(start_id, end_id, nil, nil, nil, nil, nil, 'down')
expect(::Project.count).to be > 0
expect(::Namespace.where(type: 'Project').count).to eq(0)
......@@ -168,7 +168,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
batches_count = (group_projects_count / described_class::SUB_BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
......@@ -176,7 +176,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, backfilled_namespace.id, 'down')
migration.perform(start_id, end_id, nil, nil, nil, nil, backfilled_namespace.id, 'down')
expect(::Namespace.where(type: 'Project').count).to be > 0
expect(project_namespaces_in_hierarchy.count).to eq(0)
......@@ -190,7 +190,7 @@ RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNa
context 'cleanup project namespaces in batches' do
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
stub_const("#{described_class.name}::SUB_BATCH_SIZE", 2)
end
it_behaves_like 'cleanup project namespaces'
......
# frozen_string_literal: true
require 'spec_helper'
require_migration!
RSpec.describe BackfillProjectNamespacesForGroup do
let_it_be(:migration) { described_class::MIGRATION }
let(:projects) { table(:projects) }
let(:namespaces) { table(:namespaces) }
let(:parent_group1) { namespaces.create!(name: 'parent_group1', path: 'parent_group1', visibility_level: 20, type: 'Group') }
let!(:parent_group1_project) { projects.create!(name: 'parent_group1_project', path: 'parent_group1_project', namespace_id: parent_group1.id, visibility_level: 20) }
before do
allow(Gitlab).to receive(:com?).and_return(true)
end
describe '#up' do
before do
stub_const("BackfillProjectNamespacesForGroup::GROUP_ID", parent_group1.id)
end
it 'schedules background jobs for each batch of namespaces' do
migrate!
expect(migration).to have_scheduled_batched_migration(
table_name: :projects,
column_name: :id,
job_arguments: [described_class::GROUP_ID, 'up'],
interval: described_class::DELAY_INTERVAL
)
end
end
describe '#down' do
it 'deletes all batched migration records' do
migrate!
schema_migrate_down!
expect(migration).not_to have_scheduled_batched_migration
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment