Commit 6af41560 authored by Alexandru Croitor

Add migration for backfilling project namespaces

Add the ability to backfill a project namespace for each
project, starting with backfilling project namespaces
for a single root namespace.

Changelog: added
parent eb04b06a
......@@ -17,6 +17,9 @@ class Namespace < ApplicationRecord
include EachBatch
ignore_column :delayed_project_removal, remove_with: '14.1', remove_after: '2021-05-22'
# Temporary column used for back-filling project namespaces.
# Remove it once the back-filling of all project namespaces is done.
ignore_column :tmp_project_id, remove_with: '14.7', remove_after: '2022-01-22'
# Tells ActiveRecord not to store the full class name, in order to save some space
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/69794
......
# frozen_string_literal: true
# Adds the `tmp_project_id` integer column to the `namespaces` table.
# The column is added without a NOT NULL constraint or default.
class AddTmpProjectIdColumnToNamespaces < Gitlab::Database::Migration[1.0]
enable_lock_retries!
# Reversible migration: adds the column on the way up.
def change
# this is a temporary column to be able to batch insert records into namespaces table and then be able to link these
# to projects table.
add_column :namespaces, :tmp_project_id, :integer # rubocop: disable Migration/AddColumnsToWideTables
end
end
# frozen_string_literal: true
# Adds a unique index on `namespaces.tmp_project_id`.
# Runs outside a DDL transaction because the index is added concurrently.
class AddIndexToTmpProjectIdColumnOnNamespacesTable < Gitlab::Database::Migration[1.0]
disable_ddl_transaction!
# Name shared by `up` and `down` so the exact same index is created and removed.
INDEX_NAME = 'tmp_index_on_tmp_project_id_on_namespaces'
# Creates the unique index on `tmp_project_id`.
def up
add_concurrent_index :namespaces, :tmp_project_id, name: INDEX_NAME, unique: true
end
# Drops the index created by `up`, looked up by name.
def down
remove_concurrent_index_by_name :namespaces, INDEX_NAME
end
end
# frozen_string_literal: true
# Adds a foreign key from `namespaces.tmp_project_id` to `projects`.
# Runs outside a DDL transaction because the key is added concurrently.
class AddFkToTmpProjectIdColumnOnNamespacesTable < Gitlab::Database::Migration[1.0]
  disable_ddl_transaction!

  # Adds the foreign key on `tmp_project_id` referencing the `projects` table.
  def up
    add_concurrent_foreign_key :namespaces, :projects, column: :tmp_project_id
  end

  # Removes the foreign key added by `up`.
  def down
    # Dropping a foreign key takes locks on both tables; because this migration
    # runs without a DDL transaction, retry lock acquisition instead of
    # blocking behind (or failing on) concurrent traffic.
    with_lock_retries do
      remove_foreign_key :namespaces, column: :tmp_project_id
    end
  end
end
# frozen_string_literal: true
# Adds a (non-unique) index on `web_hooks.group_id`.
# Runs outside a DDL transaction because the index is added concurrently.
class AddIndexToGroupIdColumnOnWebhooksTable < Gitlab::Database::Migration[1.0]
disable_ddl_transaction!
# Name shared by `up` and `down` so the exact same index is created and removed.
INDEX_NAME = 'index_on_group_id_on_webhooks'
# Creates the index on `group_id`.
def up
add_concurrent_index :web_hooks, :group_id, name: INDEX_NAME
end
# Drops the index created by `up`, looked up by name.
def down
remove_concurrent_index_by_name :web_hooks, INDEX_NAME
end
end
1cadc3a932d5b62cfeafcd4090eddc37b44997dbbd0b34da1c7c87a5774bb683
\ No newline at end of file
9a62f0ec43ab295619d82494090c38539cb16408c8971bdde86bb8d02546f558
\ No newline at end of file
30e9632877d3ad33528be0f56962c0ab57f5eee3889183d9638cbaea903a3d82
\ No newline at end of file
14bb815cbdad2db56dafb7eaaff893de96116a1a9e8d6c5ed95f4bef9b9717fc
\ No newline at end of file
......@@ -16375,7 +16375,8 @@ CREATE TABLE namespaces (
push_rule_id bigint,
shared_runners_enabled boolean DEFAULT true NOT NULL,
allow_descendants_override_disabled_shared_runners boolean DEFAULT false NOT NULL,
traversal_ids integer[] DEFAULT '{}'::integer[] NOT NULL
traversal_ids integer[] DEFAULT '{}'::integer[] NOT NULL,
tmp_project_id integer
);
CREATE SEQUENCE namespaces_id_seq
......@@ -26591,6 +26592,8 @@ CREATE INDEX index_oauth_openid_requests_on_access_grant_id ON oauth_openid_requ
CREATE UNIQUE INDEX index_on_deploy_keys_id_and_type_and_public ON keys USING btree (id, type) WHERE (public = true);
CREATE INDEX index_on_group_id_on_webhooks ON web_hooks USING btree (group_id);
CREATE INDEX index_on_identities_lower_extern_uid_and_provider ON identities USING btree (lower((extern_uid)::text), provider);
CREATE UNIQUE INDEX index_on_instance_statistics_recorded_at_and_identifier ON analytics_usage_trends_measurements USING btree (identifier, recorded_at);
......@@ -27769,6 +27772,8 @@ CREATE INDEX tmp_index_namespaces_empty_traversal_ids_with_child_namespaces ON n
CREATE INDEX tmp_index_namespaces_empty_traversal_ids_with_root_namespaces ON namespaces USING btree (id) WHERE ((parent_id IS NULL) AND (traversal_ids = '{}'::integer[]));
CREATE UNIQUE INDEX tmp_index_on_tmp_project_id_on_namespaces ON namespaces USING btree (tmp_project_id);
CREATE INDEX tmp_index_on_vulnerabilities_non_dismissed ON vulnerabilities USING btree (id) WHERE (state <> 2);
CREATE UNIQUE INDEX uniq_pkgs_deb_grp_architectures_on_distribution_id_and_name ON packages_debian_group_architectures USING btree (distribution_id, name);
......@@ -29012,6 +29017,9 @@ ALTER TABLE ONLY application_settings
ALTER TABLE ONLY merge_requests
ADD CONSTRAINT fk_6a5165a692 FOREIGN KEY (milestone_id) REFERENCES milestones(id) ON DELETE SET NULL;
ALTER TABLE ONLY namespaces
ADD CONSTRAINT fk_6a77f66919 FOREIGN KEY (tmp_project_id) REFERENCES projects(id) ON DELETE CASCADE;
ALTER TABLE ONLY geo_event_log
ADD CONSTRAINT fk_6ada82d42a FOREIGN KEY (container_repository_updated_event_id) REFERENCES geo_container_repository_updated_events(id) ON DELETE CASCADE;
# frozen_string_literal: true
module Gitlab
  module BackgroundMigration
    module ProjectNamespaces
      # Back-fill project namespaces for projects that do not yet have a namespace.
      #
      # TODO: remove this comment when an actual backfill migration is added.
      #
      # This is first being added without an actual migration as we need to initially test
      # if backfilling project namespaces affects performance in any significant way.
      # rubocop: disable Metrics/ClassLength
      class BackfillProjectNamespaces
        # Number of projects processed per insert/update round.
        BATCH_SIZE = 100
        # Number of projects whose namespace rows are removed per DELETE statement.
        DELETE_BATCH_SIZE = 10
        # Value written to `namespaces.type` for back-filled rows.
        PROJECT_NAMESPACE_STI_NAME = 'Project'

        IsolatedModels = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models

        # Back-fills ('up') or removes ('down') project namespaces for the
        # projects whose id lies in `start_id..end_id` (inclusive).
        #
        # @param start_id [Integer] lowest project id to process
        # @param end_id [Integer] highest project id to process
        # @param namespace_id [Integer, nil] when given, only projects directly
        #   inside this group or one of its descendant groups are processed
        # @param migration_type [String] 'up' or 'down'
        # @raise [RuntimeError] when `migration_type` is neither 'up' nor 'down'
        def perform(start_id, end_id, namespace_id, migration_type = 'up')
          load_project_ids(start_id, end_id, namespace_id)

          case migration_type
          when 'up'
            backfill_project_namespaces(namespace_id)
            mark_job_as_succeeded(start_id, end_id, namespace_id, 'up')
          when 'down'
            cleanup_backfilled_project_namespaces(namespace_id)
            mark_job_as_succeeded(start_id, end_id, namespace_id, 'down')
          else
            raise "Unknown migration type"
          end
        end

        private

        attr_accessor :project_ids

        # Creates namespace rows for the loaded projects and links them, in
        # slices of BATCH_SIZE. `namespace_id` is accepted but not used here.
        def backfill_project_namespaces(namespace_id)
          project_ids.each_slice(BATCH_SIZE) do |batch_ids|
            # We need to lock these project records for the period when we create project namespaces
            # and link them to projects so that if a project is modified in the time between creating
            # project namespaces `batch_insert_namespaces` and linking them to projects `batch_update_projects`
            # we do not get them out of sync.
            #
            # see https://gitlab.com/gitlab-org/gitlab/-/merge_requests/72527#note_730679469
            #
            # NOTE(review): `Project` is resolved by constant lookup here rather than
            # through IsolatedModels - confirm that is intended.
            Project.transaction do
              # A relation is lazy: without `load` the SELECT ... FOR UPDATE is never
              # sent to the database and no row locks are taken.
              Project.where(id: batch_ids).select(:id).lock('FOR UPDATE').load

              batch_insert_namespaces(batch_ids)
              batch_update_projects(batch_ids)
            end

            batch_update_project_namespaces_traversal_ids(batch_ids)
          end
        end

        # Unlinks and deletes the back-filled namespace rows for the loaded
        # projects, in slices of BATCH_SIZE. `namespace_id` is accepted but not used here.
        def cleanup_backfilled_project_namespaces(namespace_id)
          project_ids.each_slice(BATCH_SIZE) do |batch_ids|
            # IMPORTANT: first nullify project_namespace_id in projects table to avoid removing projects when records
            # from namespaces are deleted due to FK/triggers
            nullify_project_namespaces_in_projects(batch_ids)
            delete_project_namespace_records(batch_ids)
          end
        end

        # Inserts one namespace row per project with a single INSERT ... SELECT.
        # The project id is stored in `tmp_project_id`; rows that conflict with
        # an existing one are skipped (ON CONFLICT DO NOTHING).
        def batch_insert_namespaces(project_ids)
          projects = IsolatedModels::Project.where(id: project_ids)
            .select("projects.id, projects.name, projects.path, projects.namespace_id, projects.visibility_level, shared_runners_enabled, '#{PROJECT_NAMESPACE_STI_NAME}', now(), now()")

          ActiveRecord::Base.connection.execute <<~SQL
            INSERT INTO namespaces (tmp_project_id, name, path, parent_id, visibility_level, shared_runners_enabled, type, created_at, updated_at)
            #{projects.to_sql}
            ON CONFLICT DO NOTHING;
          SQL
        end

        # Sets `projects.project_namespace_id` to the id of the namespace row whose
        # `tmp_project_id` points at the project; rows already in sync are not rewritten.
        def batch_update_projects(project_ids)
          projects = IsolatedModels::Project.where(id: project_ids)
            .joins("INNER JOIN namespaces ON projects.id = namespaces.tmp_project_id")
            .select("namespaces.id, namespaces.tmp_project_id")

          ActiveRecord::Base.connection.execute <<~SQL
            WITH cte(project_namespace_id, project_id) AS #{::Gitlab::Database::AsWithMaterialized.materialized_if_supported} (
              #{projects.to_sql}
            )
            UPDATE projects
            SET project_namespace_id = cte.project_namespace_id
            FROM cte
            WHERE id = cte.project_id AND projects.project_namespace_id IS DISTINCT FROM cte.project_namespace_id
          SQL
        end

        # Sets `traversal_ids` of each back-filled namespace to its parent's
        # `traversal_ids` with the namespace's own id appended.
        def batch_update_project_namespaces_traversal_ids(project_ids)
          # NOTE(review): `Namespace` is resolved by constant lookup here rather than
          # through IsolatedModels - confirm that is intended.
          namespaces = Namespace.where(tmp_project_id: project_ids)
            .joins("INNER JOIN namespaces n2 ON namespaces.parent_id = n2.id")
            .select("namespaces.id as project_namespace_id, n2.traversal_ids")

          ActiveRecord::Base.connection.execute <<~SQL
            UPDATE namespaces
            SET traversal_ids = array_append(project_namespaces.traversal_ids, project_namespaces.project_namespace_id)
            FROM (#{namespaces.to_sql}) as project_namespaces(project_namespace_id, traversal_ids)
            WHERE id = project_namespaces.project_namespace_id
          SQL
        end

        # Clears `project_namespace_id` on the given projects.
        def nullify_project_namespaces_in_projects(project_ids)
          IsolatedModels::Project.where(id: project_ids).update_all(project_namespace_id: nil)
        end

        # Deletes the 'Project'-typed namespace rows of the given projects,
        # DELETE_BATCH_SIZE projects per statement.
        def delete_project_namespace_records(project_ids)
          project_ids.each_slice(DELETE_BATCH_SIZE) do |p_ids|
            IsolatedModels::Namespace.where(type: PROJECT_NAMESPACE_STI_NAME).where(tmp_project_id: p_ids).delete_all
          end
        end

        # Loads into @project_ids the ids of projects within `start_id..end_id`,
        # restricted to the group hierarchy rooted at `namespace_id` when one is given.
        def load_project_ids(start_id, end_id, namespace_id)
          projects = IsolatedModels::Project.arel_table
          relation = IsolatedModels::Project.where(projects[:id].between(start_id..end_id))
          relation = relation.where(projects[:namespace_id].in(Arel::Nodes::SqlLiteral.new(hierarchy_cte(namespace_id)))) if namespace_id

          @project_ids = relation.pluck(:id)
        end

        # Records the job identified by `arguments` as succeeded.
        def mark_job_as_succeeded(*arguments)
          ::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('BackfillProjectNamespaces', arguments)
        end

        # Returns SQL selecting the ids of the group `root_namespace_id` and all of
        # its descendant groups. The id is coerced with `to_i` before interpolation.
        def hierarchy_cte(root_namespace_id)
          <<~SQL
            WITH RECURSIVE "base_and_descendants" AS (
              (
                SELECT "namespaces"."id"
                FROM "namespaces"
                WHERE "namespaces"."type" = 'Group' AND "namespaces"."id" = #{root_namespace_id.to_i}
              )
              UNION
              (
                SELECT "namespaces"."id"
                FROM "namespaces", "base_and_descendants"
                WHERE "namespaces"."type" = 'Group' AND "namespaces"."parent_id" = "base_and_descendants"."id"
              )
            )
            SELECT "id" FROM "base_and_descendants" AS "namespaces"
          SQL
        end
      end
      # rubocop: enable Metrics/ClassLength
    end
  end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module ProjectNamespaces
module Models
# isolated Namespace model
#
# Maps directly onto the `namespaces` table.
class Namespace < ActiveRecord::Base
include EachBatch
self.table_name = 'namespaces'
# Point the inheritance column at a name other than `type`, so the `type`
# column is treated as a regular attribute (the backfill filters on it directly).
self.inheritance_column = :_type_disabled
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module ProjectNamespaces
module Models
# isolated Project model
#
# Maps directly onto the `projects` table.
class Project < ActiveRecord::Base
include EachBatch
self.table_name = 'projects'
end
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Exercises both directions ('up' and 'down') of the BackfillProjectNamespaces
# background migration against a generated tree of groups and projects.
RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNamespaces, :migration do
include MigrationsHelpers
context 'when migrating data', :aggregate_failures do
let(:projects) { table(:projects) }
let(:namespaces) { table(:namespaces) }
# Two root groups; each becomes the root of its own generated hierarchy.
let(:parent_group1) { namespaces.create!(name: 'parent_group1', path: 'parent_group1', visibility_level: 20, type: 'Group') }
let(:parent_group2) { namespaces.create!(name: 'test1', path: 'test1', runners_token: 'my-token1', project_creation_level: 1, visibility_level: 20, type: 'Group') }
let(:parent_group1_project) { projects.create!(name: 'parent_group1_project', path: 'parent_group1_project', namespace_id: parent_group1.id, visibility_level: 20) }
let(:parent_group2_project) { projects.create!(name: 'parent_group2_project', path: 'parent_group2_project', namespace_id: parent_group2.id, visibility_level: 20) }
# With 2 children per group and 3 levels, TreeGenerator creates
# 2 + 4 + 8 = 14 sub-groups (one project each) under every root group.
let(:child_nodes_count) { 2 }
let(:tree_depth) { 3 }
let(:backfilled_namespace) { nil }
before do
BackfillProjectNamespaces::TreeGenerator.new(namespaces, projects, [parent_group1, parent_group2], child_nodes_count, tree_depth).build_tree
end
describe '#up' do
shared_examples 'back-fill project namespaces' do
it 'back-fills all project namespaces' do
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
projects_count = ::Project.count
# One call to each batch step is expected per BATCH_SIZE slice of projects.
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_count = ::Namespace.where(type: 'Project').count
migration = described_class.new
expect(projects_count).not_to eq(project_namespaces_count)
# Message expectations must be registered before `perform` is invoked.
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(Namespace.where(type: 'Project'), :count)
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
check_projects_in_sync_with(Namespace.where(type: 'Project'))
end
context 'when passing specific group as parameter' do
let(:backfilled_namespace) { parent_group1 }
it 'back-fills project namespaces for the specified group hierarchy' do
backfilled_namespace_projects = base_ancestor(backfilled_namespace).first.all_projects
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
expect(project_namespaces_in_hierarchy.count).to eq(0)
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect(group_projects_count).to eq(14)
expect(project_namespaces_in_hierarchy.count).to eq(0)
migration.perform(start_id, end_id, backfilled_namespace.id, 'up')
expect(project_namespaces_in_hierarchy.count).to eq(14)
check_projects_in_sync_with(project_namespaces_in_hierarchy)
end
end
context 'when projects already have project namespaces' do
# Pre-populate project namespaces for the first hierarchy only.
before do
hierarchy1_projects = base_ancestor(parent_group1).first.all_projects
start_id = hierarchy1_projects.minimum(:id)
end_id = hierarchy1_projects.maximum(:id)
described_class.new.perform(start_id, end_id, parent_group1.id, 'up')
end
it 'does not duplicate project namespaces' do
# check there are already some project namespaces but not for all
projects_count = ::Project.count
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces = ::Namespace.where(type: 'Project')
migration = described_class.new
expect(project_namespaces_in_hierarchy(base_ancestor(parent_group1)).count).to be >= 14
expect(project_namespaces_in_hierarchy(base_ancestor(parent_group2)).count).to eq(0)
expect(projects_count).not_to eq(project_namespaces.count)
# run migration again to test we do not generate extra project namespaces
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
# Only the 14 projects of the second hierarchy should gain a namespace.
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(project_namespaces, :count).by(14)
expect(projects_count).to eq(project_namespaces.count)
end
end
end
# Sanity check of the fixture itself, before any back-fill has run.
it 'checks no project namespaces exist in the defined hierarchies' do
hierarchy1_project_namespaces = project_namespaces_in_hierarchy(base_ancestor(parent_group1))
hierarchy2_project_namespaces = project_namespaces_in_hierarchy(base_ancestor(parent_group2))
hierarchy1_projects_count = base_ancestor(parent_group1).first.all_projects.count
hierarchy2_projects_count = base_ancestor(parent_group2).first.all_projects.count
expect(hierarchy1_project_namespaces).to be_empty
expect(hierarchy2_project_namespaces).to be_empty
expect(hierarchy1_projects_count).to eq(14)
expect(hierarchy2_projects_count).to eq(14)
end
context 'back-fill project namespaces in a single batch' do
it_behaves_like 'back-fill project namespaces'
end
context 'back-fill project namespaces in batches' do
# Shrink the batch size so the generated projects span several batches.
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
end
it_behaves_like 'back-fill project namespaces'
end
end
describe '#down' do
before do
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
# back-fill first
described_class.new.perform(start_id, end_id, nil, 'up')
end
shared_examples 'cleanup project namespaces' do
it 'removes project namespaces' do
projects_count = ::Project.count
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
migration = described_class.new
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
expect(projects_count).to be > 0
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, nil, 'down')
# Projects themselves must survive the cleanup; only their namespaces go.
expect(::Project.count).to be > 0
expect(::Namespace.where(type: 'Project').count).to eq(0)
end
context 'when passing specific group as parameter' do
let(:backfilled_namespace) { parent_group1 }
it 'removes project namespaces only for the specific group hierarchy' do
backfilled_namespace_projects = base_ancestor(backfilled_namespace).first.all_projects
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
expect(project_namespaces_in_hierarchy.count).to eq(14)
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, backfilled_namespace.id, 'down')
# Project namespaces of the other hierarchy must be left in place.
expect(::Namespace.where(type: 'Project').count).to be > 0
expect(project_namespaces_in_hierarchy.count).to eq(0)
end
end
end
context 'cleanup project namespaces in a single batch' do
it_behaves_like 'cleanup project namespaces'
end
context 'cleanup project namespaces in batches' do
# Shrink the batch size so the generated projects span several batches.
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
end
it_behaves_like 'cleanup project namespaces'
end
end
end
# Returns a relation containing only the given namespace record.
def base_ancestor(ancestor)
::Namespace.where(id: ancestor.id)
end
# Returns the 'Project'-typed namespaces among `base_node` and its descendants.
def project_namespaces_in_hierarchy(base_node)
Gitlab::ObjectHierarchy.new(base_node).base_and_descendants.where(type: 'Project')
end
# Asserts that every given project namespace carries the same name, path, parent,
# visibility level and shared-runners flag as the project linked to it.
def check_projects_in_sync_with(namespaces)
project_namespaces_attrs = namespaces.order(:id).pluck(:id, :name, :path, :parent_id, :visibility_level, :shared_runners_enabled)
corresponding_projects_attrs = Project.where(project_namespace_id: project_namespaces_attrs.map(&:first))
.order(:project_namespace_id).pluck(:project_namespace_id, :name, :path, :namespace_id, :visibility_level, :shared_runners_enabled)
expect(project_namespaces_attrs).to eq(corresponding_projects_attrs)
end
end
module BackfillProjectNamespaces
  # Test helper that grows a tree of sub-groups below a set of parent groups,
  # creating one project inside every sub-group it adds.
  class TreeGenerator
    # namespaces / projects - objects responding to `create!` (rows must expose `id`)
    # parent_nodes          - records whose ids seed level 0 of the tree
    # child_nodes_count     - number of sub-groups created under every group
    # tree_depth            - number of levels generated below the parents
    def initialize(namespaces, projects, parent_nodes, child_nodes_count, tree_depth)
      @namespaces = namespaces
      @projects = projects
      @subgroups_depth = tree_depth
      @resource_count = child_nodes_count
      # Index N holds the ids of the groups living at depth N.
      @all_groups = [parent_nodes.map(&:id)]
    end

    # Walks the tree level by level; for each group on the previous level it
    # creates the configured number of sub-groups, each with a single project.
    def build_tree
      (1..@subgroups_depth).each do |depth|
        @all_groups[depth - 1].each do |owner_id|
          @resource_count.times do |index|
            group_name = "child#{index}_level#{depth}"
            project_name = "project#{index}_level#{depth}"

            group = @namespaces.create!(name: group_name, path: group_name, parent_id: owner_id, visibility_level: 20, type: 'Group')
            @projects.create!(name: project_name, path: project_name, namespace_id: group.id, visibility_level: 20)

            track_group_id(depth, group.id)
          end
        end
      end
    end

    # Remembers `group_id` as belonging to `depth_level`, so the next level
    # can use it as a parent.
    def track_group_id(depth_level, group_id)
      (@all_groups[depth_level] ||= []) << group_id
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment