Commit 191f87b1 authored by charlie ablett's avatar charlie ablett Committed by Kerri Miller

Resolve labels with duplicate title and group ID

- Add uniqueness index to DB
parent bb2d4497
---
title: Deduplicate labels with identical title and group
merge_request: 37148
author:
type: fixed
# frozen_string_literal: true
class RemoveDuplicateLabelsFromGroup < ActiveRecord::Migration[6.0]
DOWNTIME = false
CREATE = 1
RENAME = 2
disable_ddl_transaction!
class BackupLabel < ApplicationRecord
include EachBatch
self.table_name = 'backup_labels'
end
class Label < ApplicationRecord
self.table_name = 'labels'
end
class Group < ApplicationRecord
include EachBatch
self.table_name = 'namespaces'
end
BATCH_SIZE = 10_000
def up
# Split to smaller chunks
# Loop rather than background job, every 10,000
# there are ~1,800,000 groups in total (excluding personal namespaces, which can't have labels)
Group.where(type: 'Group').each_batch(of: BATCH_SIZE) do |batch|
range = batch.pluck('MIN(id)', 'MAX(id)').first
transaction do
remove_full_duplicates(*range)
end
transaction do
rename_partial_duplicates(*range)
end
end
end
DOWN_BATCH_SIZE = 1000
def down
BackupLabel.where('project_id IS NULL AND group_id IS NOT NULL').each_batch(of: DOWN_BATCH_SIZE) do |batch|
range = batch.pluck('MIN(id)', 'MAX(id)').first
restore_renamed_labels(*range)
restore_deleted_labels(*range)
end
end
def remove_full_duplicates(start_id, stop_id)
# Fields that are considered duplicate:
# group_id title template description type color
duplicate_labels = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS (
SELECT labels.*,
row_number() OVER (PARTITION BY labels.group_id, labels.title, labels.template, labels.description, labels.type, labels.color ORDER BY labels.id) AS row_number,
#{CREATE} AS restore_action
FROM labels
WHERE labels.group_id BETWEEN #{start_id} AND #{stop_id}
AND NOT EXISTS (SELECT * FROM board_labels WHERE board_labels.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_links WHERE label_links.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_priorities WHERE label_priorities.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM lists WHERE lists.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM resource_label_events WHERE resource_label_events.label_id = labels.id)
) SELECT * FROM data WHERE row_number > 1;
SQL
if duplicate_labels.any?
# create backup records
BackupLabel.insert_all!(duplicate_labels.map { |label| label.except("row_number") })
Label.unscoped.where(id: duplicate_labels.pluck("id")).delete_all
end
end
def rename_partial_duplicates(start_id, stop_id)
# We need to ensure that the new title (with `_duplicate#{ID}`) doesn't exceed the limit.
# Truncate the original title (if needed) to 245 characters minus the length of the ID
# then add `_duplicate#{ID}`
soft_duplicates = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS (
SELECT
*,
substring(title from 1 for 245 - length(id::text)) || '_duplicate' || id::text as new_title,
#{RENAME} AS restore_action,
row_number() OVER (PARTITION BY group_id, title ORDER BY id) AS row_number
FROM labels
WHERE group_id BETWEEN #{start_id} AND #{stop_id}
) SELECT * FROM data WHERE row_number > 1;
SQL
if soft_duplicates.any?
# create backup records
BackupLabel.insert_all!(soft_duplicates.map { |label| label.except("row_number") })
ApplicationRecord.connection.execute(<<-SQL.squish)
UPDATE labels SET title = substring(title from 1 for 245 - length(id::text)) || '_duplicate' || id::text
WHERE labels.id IN (#{soft_duplicates.map { |dup| dup["id"] }.join(", ")});
SQL
end
end
def restore_renamed_labels(start_id, stop_id)
# the backup label IDs are not incremental, they are copied directly from the Labels table
ApplicationRecord.connection.execute(<<-SQL.squish)
WITH backups AS (
SELECT id, title
FROM backup_labels
WHERE id BETWEEN #{start_id} AND #{stop_id}
AND restore_action = #{RENAME}
) UPDATE labels SET title = backups.title
FROM backups
WHERE labels.id = backups.id;
SQL
end
def restore_deleted_labels(start_id, stop_id)
ActiveRecord::Base.connection.execute(<<-SQL.squish)
INSERT INTO labels
SELECT id, title, color, group_id, created_at, updated_at, template, description, description_html, type, cached_markdown_version FROM backup_labels
WHERE backup_labels.id BETWEEN #{start_id} AND #{stop_id}
AND backup_labels.project_id IS NULL AND backup_labels.group_id IS NOT NULL
AND backup_labels.restore_action = #{CREATE}
SQL
end
end
# frozen_string_literal: true
class AddUniquenessIndexToLabelTitleAndGroup < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
GROUP_AND_TITLE = [:group_id, :title]
def up
add_concurrent_index :labels, GROUP_AND_TITLE, where: "labels.project_id IS NULL", unique: true, name: "index_labels_on_group_id_and_title_unique"
remove_concurrent_index :labels, GROUP_AND_TITLE, name: "index_labels_on_group_id_and_title"
end
def down
add_concurrent_index :labels, GROUP_AND_TITLE, where: "labels.project_id IS NULL", unique: false, name: "index_labels_on_group_id_and_title"
remove_concurrent_index :labels, GROUP_AND_TITLE, name: "index_labels_on_group_id_and_title_unique"
end
end
9683f55a327b9579b9b0b9484dd11a07b7ea4244b126c46e0144662cb25da6bb
\ No newline at end of file
71cd12e553b3acbb665770fe7478365f1f082e2d278c67b166f41461f689aa5e
\ No newline at end of file
......@@ -21801,7 +21801,7 @@ CREATE UNIQUE INDEX index_label_priorities_on_project_id_and_label_id ON label_p
CREATE UNIQUE INDEX index_labels_on_group_id_and_project_id_and_title ON labels USING btree (group_id, project_id, title);
CREATE INDEX index_labels_on_group_id_and_title_with_null_project_id ON labels USING btree (group_id, title) WHERE (project_id IS NULL);
CREATE UNIQUE INDEX index_labels_on_group_id_and_title_unique ON labels USING btree (group_id, title) WHERE (project_id IS NULL);
CREATE INDEX index_labels_on_project_id ON labels USING btree (project_id);
......
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20200716234259_remove_duplicate_labels_from_group.rb')
RSpec.describe RemoveDuplicateLabelsFromGroup do
let(:labels_table) { table(:labels) }
let(:labels) { labels_table.all }
let(:projects_table) { table(:projects) }
let(:projects) { projects_table.all }
let(:namespaces_table) { table(:namespaces) }
let(:namespaces) { namespaces_table.all }
let(:backup_labels_table) { table(:backup_labels) }
let(:backup_labels) { backup_labels_table.all }
# for those cases where we can't use the activerecord class because the `type` column
# makes it think it has polymorphism and should be/have a Label subclass
let(:sql_backup_labels) { ApplicationRecord.connection.execute('SELECT * from backup_labels') }
# all the possible tables with records that may have a relationship with a label
let(:analytics_cycle_analytics_group_stages_table) { table(:analytics_cycle_analytics_group_stages) }
let(:analytics_cycle_analytics_project_stages_table) { table(:analytics_cycle_analytics_project_stages) }
let(:board_labels_table) { table(:board_labels) }
let(:label_links_table) { table(:label_links) }
let(:label_priorities_table) { table(:label_priorities) }
let(:lists_table) { table(:lists) }
let(:resource_label_events_table) { table(:resource_label_events) }
let!(:group_one) { namespaces_table.create!(id: 1, type: 'Group', name: 'group', path: 'group') }
let!(:project_one) do
projects_table.create!(id: 1, name: 'project', path: 'project',
visibility_level: 0, namespace_id: group_one.id)
end
let(:label_title) { 'bug' }
let(:label_color) { 'red' }
let(:label_description) { 'nice label' }
let(:project_id) { project_one.id }
let(:group_id) { group_one.id }
let(:other_title) { 'feature' }
let(:group_label_attributes) do
{
title: label_title, color: label_color, group_id: group_id, type: 'GroupLabel', template: false, description: label_description
}
end
let(:migration) { described_class.new }
describe 'removing full duplicates' do
context 'when there are no duplicate labels' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1, title: "a different label")) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2, title: "a totally different label")) }
it 'does not remove anything' do
expect { migration.up }.not_to change { backup_labels_table.count }
end
it 'restores removed records when rolling back - no change' do
migration.up
expect { migration.down }.not_to change { labels_table.count }
end
end
context 'with duplicates with no relationships' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2)) }
let!(:third_label) { labels_table.create!(group_label_attributes.merge(id: 3, title: other_title)) }
let!(:fourth_label) { labels_table.create!(group_label_attributes.merge(id: 4, title: other_title)) }
it 'creates a backup record for each removed record' do
expect { migration.up }.to change { backup_labels_table.count }.from(0).to(2)
end
it 'creates the correct backup records with `create` restore_action' do
migration.up
expect(sql_backup_labels.find { |bl| bl["id"] == 2 }).to include(second_label.attributes.merge("restore_action" => described_class::CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
expect(sql_backup_labels.find { |bl| bl["id"] == 4 }).to include(fourth_label.attributes.merge("restore_action" => described_class::CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
end
it 'deletes all but one' do
migration.up
expect { second_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
expect { fourth_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
end
it 'restores removed records on rollback' do
second_label_attributes = modified_attributes(second_label)
fourth_label_attributes = modified_attributes(fourth_label)
migration.up
migration.down
expect(second_label.attributes).to include(second_label_attributes)
expect(fourth_label.attributes).to include(fourth_label_attributes)
end
end
context 'two duplicate records, one of which has a relationship' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2)) }
let!(:label_priority) { label_priorities_table.create!(label_id: second_label.id, project_id: project_id, priority: 1) }
it 'does not remove anything' do
expect { migration.up }.not_to change { labels_table.count }
end
it 'does not create a backup record with `create` restore_action' do
expect { migration.up }.not_to change { backup_labels_table.where(restore_action: described_class::CREATE).count }
end
it 'restores removed records when rolling back - no change' do
migration.up
expect { migration.down }.not_to change { labels_table.count }
end
end
context 'multiple duplicates, a subset of which have relationships' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2)) }
let!(:label_priority_for_second_label) { label_priorities_table.create!(label_id: second_label.id, project_id: project_id, priority: 1) }
let!(:third_label) { labels_table.create!(group_label_attributes.merge(id: 3)) }
let!(:fourth_label) { labels_table.create!(group_label_attributes.merge(id: 4)) }
let!(:label_priority_for_fourth_label) { label_priorities_table.create!(label_id: fourth_label.id, project_id: project_id, priority: 2) }
it 'creates a backup record with `create` restore_action for each removed record' do
expect { migration.up }.to change { backup_labels_table.where(restore_action: described_class::CREATE).count }.from(0).to(1)
end
it 'creates the correct backup records' do
migration.up
expect(sql_backup_labels.find { |bl| bl["id"] == 3 }).to include(third_label.attributes.merge("restore_action" => described_class::CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
end
it 'deletes the duplicate record' do
migration.up
expect { first_label.reload }.not_to raise_error
expect { second_label.reload }.not_to raise_error
expect { third_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
end
it 'restores removed records on rollback' do
third_label_attributes = modified_attributes(third_label)
migration.up
migration.down
expect(third_label.attributes).to include(third_label_attributes)
end
end
end
describe 'renaming partial duplicates' do
# partial duplicates - only group_id and title match. Distinct colour prevents deletion.
context 'when there are no duplicate labels' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1, title: "a unique label", color: 'green')) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2, title: "a totally different, unique, label", color: 'blue')) }
it 'does not rename anything' do
expect { migration.up }.not_to change { backup_labels_table.count }
end
end
context 'with duplicates with no relationships' do
let!(:first_label) { labels_table.create!(group_label_attributes.merge(id: 1, color: 'green')) }
let!(:second_label) { labels_table.create!(group_label_attributes.merge(id: 2, color: 'blue')) }
let!(:third_label) { labels_table.create!(group_label_attributes.merge(id: 3, title: other_title, color: 'purple')) }
let!(:fourth_label) { labels_table.create!(group_label_attributes.merge(id: 4, title: other_title, color: 'yellow')) }
it 'creates a backup record for each renamed record' do
expect { migration.up }.to change { backup_labels_table.count }.from(0).to(2)
end
it 'creates the correct backup records with `rename` restore_action' do
migration.up
expect(sql_backup_labels.find { |bl| bl["id"] == 2 }).to include(second_label.attributes.merge("restore_action" => described_class::RENAME, "created_at" => anything, "updated_at" => anything))
expect(sql_backup_labels.find { |bl| bl["id"] == 4 }).to include(fourth_label.attributes.merge("restore_action" => described_class::RENAME, "created_at" => anything, "updated_at" => anything))
end
it 'modifies the titles of the partial duplicates' do
migration.up
expect(second_label.reload.title).to match(/#{label_title}_duplicate#{second_label.id}$/)
expect(fourth_label.reload.title).to match(/#{other_title}_duplicate#{fourth_label.id}$/)
end
it 'restores renamed records on rollback' do
second_label_attributes = modified_attributes(second_label)
fourth_label_attributes = modified_attributes(fourth_label)
migration.up
migration.down
expect(second_label.reload.attributes).to include(second_label_attributes)
expect(fourth_label.reload.attributes).to include(fourth_label_attributes)
end
context 'when the labels have a long title that might overflow' do
let(:long_title) { "a" * 255 }
before do
first_label.update_attribute(:title, long_title)
second_label.update_attribute(:title, long_title)
end
it 'keeps the length within the limit' do
migration.up
expect(second_label.reload.title).to eq("#{"a" * 244}_duplicate#{second_label.id}")
expect(second_label.title.length).to eq(255)
end
end
end
end
def modified_attributes(label)
label.attributes.except('created_at', 'updated_at')
end
end
......@@ -520,14 +520,29 @@ RSpec.describe API::Labels do
expect(json_response['color']).to eq(label1.color)
end
it 'returns 200 if group label already exists' do
create(:group_label, title: label1.name, group: group)
context 'if group label already exists' do
let!(:group_label) { create(:group_label, title: label1.name, group: group) }
expect { put api("/projects/#{project.id}/labels/promote", user), params: { name: label1.name } }
.to change(project.labels, :count).by(-1)
.and change(group.labels, :count).by(0)
it 'returns a status of 200' do
put api("/projects/#{project.id}/labels/promote", user), params: { name: label1.name }
expect(response).to have_gitlab_http_status(:ok)
expect(response).to have_gitlab_http_status(:ok)
end
it 'does not change the group label count' do
expect { put api("/projects/#{project.id}/labels/promote", user), params: { name: label1.name } }
.not_to change(group.labels, :count)
end
it 'does not change the group label max (reuses the same ID)' do
expect { put api("/projects/#{project.id}/labels/promote", user), params: { name: label1.name } }
.not_to change(group.labels, :max)
end
it 'changes the project label count' do
expect { put api("/projects/#{project.id}/labels/promote", user), params: { name: label1.name } }
.to change(project.labels, :count).by(-1)
end
end
it 'returns 403 if guest promotes label' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment