Commit 9800171f authored by Yorick Peterse's avatar Yorick Peterse

Clean up migration to populate commit users

This cleans up any pending MigrateMergeRequestDiffCommitUsers migration
jobs. In addition, we stop using the old columns, ignore them, and
remove them in a post-deployment migration.

We don't need a separate release for the removal of the columns, as
ignoring them in the deploy ensures it's safe to remove the columns in a
post-deployment migration. In addition, the code has supported reading
from/writing to the new data for the last two or so months.

With this change we should start seeing less growth in the
merge_request_diff_commits table, as we no longer store many duplicate
names and Emails. Once we find a way to reclaim unused space in the
table, we should be able to free up roughly 500 GB of disk space as
mentioned in the original issue
(https://gitlab.com/gitlab-org/gitlab/-/issues/331823).

Changelog: added
parent 03e3c425
...@@ -6,6 +6,11 @@ class MergeRequestDiffCommit < ApplicationRecord ...@@ -6,6 +6,11 @@ class MergeRequestDiffCommit < ApplicationRecord
include BulkInsertSafe include BulkInsertSafe
include ShaAttribute include ShaAttribute
include CachedCommit include CachedCommit
include IgnorableColumns
ignore_column %i[author_name author_email committer_name committer_email],
remove_with: '14.6',
remove_after: '2021-11-22'
belongs_to :merge_request_diff belongs_to :merge_request_diff
...@@ -51,9 +56,14 @@ class MergeRequestDiffCommit < ApplicationRecord ...@@ -51,9 +56,14 @@ class MergeRequestDiffCommit < ApplicationRecord
committer = committer =
users[[commit_hash[:committer_name], commit_hash[:committer_email]]] users[[commit_hash[:committer_name], commit_hash[:committer_email]]]
# These fields are only used to determine the author/committer IDs, we
# don't store them in the DB.
commit_hash = commit_hash
.except(:author_name, :author_email, :committer_name, :committer_email)
commit_hash.merge( commit_hash.merge(
commit_author_id: author&.id, commit_author_id: author.id,
committer_id: committer&.id, committer_id: committer.id,
merge_request_diff_id: merge_request_diff_id, merge_request_diff_id: merge_request_diff_id,
relative_order: index, relative_order: index,
sha: Gitlab::Database::ShaAttribute.serialize(sha), # rubocop:disable Cop/ActiveRecordSerialize sha: Gitlab::Database::ShaAttribute.serialize(sha), # rubocop:disable Cop/ActiveRecordSerialize
...@@ -104,18 +114,18 @@ class MergeRequestDiffCommit < ApplicationRecord ...@@ -104,18 +114,18 @@ class MergeRequestDiffCommit < ApplicationRecord
end end
def author_name def author_name
commit_author_id ? commit_author.name : super commit_author.name
end end
def author_email def author_email
commit_author_id ? commit_author.email : super commit_author.email
end end
def committer_name def committer_name
committer_id ? committer.name : super committer.name
end end
def committer_email def committer_email
committer_id ? committer.email : super committer.email
end end
end end
# frozen_string_literal: true
class CleanUpMigrateMergeRequestDiffCommitUsers < Gitlab::Database::Migration[1.0]
def up
jobs = Gitlab::Database::BackgroundMigrationJob
.for_migration_class('MigrateMergeRequestDiffCommitUsers')
.pending
.to_a
return if jobs.empty?
say("#{jobs.length} MigrateMergeRequestDiffCommitUsers are still pending")
# Normally we don't process background migrations in a regular migration, as
# this could take a while to complete and thus block a deployment.
#
# In this case the jobs have all been processed for GitLab.com at the time
# of writing. In addition, it's been a few releases since this migration was
# introduced. As a result, self-hosted instances should have their
# migrations finished a long time ago.
#
# For these reasons we clean up any pending jobs (just in case) before
# deploying the code. This also allows us to immediately start using the new
# setup only, instead of having to support both the old and new approach for
# at least one more release.
jobs.each do |job|
Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers
.new
.perform(*job.arguments)
end
end
def down
end
end
# frozen_string_literal: true
class RemoveMergeRequestDiffCommitColumns < Gitlab::Database::Migration[1.0]
enable_lock_retries!
COLUMNS = %i[author_name author_email committer_name committer_email].freeze
def change
COLUMNS.each do |column|
remove_column(:merge_request_diff_commits, column, :text)
end
end
end
0f2578f0266154ad2790cc808233c71566b3a3ea87c40909feba9ccc5872927c
\ No newline at end of file
2685a534728ab1a50acb49a7a5ac7d9285fdc36ec3610b93a4219e6687c22b06
\ No newline at end of file
...@@ -15818,10 +15818,6 @@ CREATE TABLE merge_request_diff_commits ( ...@@ -15818,10 +15818,6 @@ CREATE TABLE merge_request_diff_commits (
merge_request_diff_id integer NOT NULL, merge_request_diff_id integer NOT NULL,
relative_order integer NOT NULL, relative_order integer NOT NULL,
sha bytea NOT NULL, sha bytea NOT NULL,
author_name text,
author_email text,
committer_name text,
committer_email text,
message text, message text,
trailers jsonb DEFAULT '{}'::jsonb NOT NULL, trailers jsonb DEFAULT '{}'::jsonb NOT NULL,
commit_author_id bigint, commit_author_id bigint,
...@@ -399,6 +399,10 @@ excluded_attributes: ...@@ -399,6 +399,10 @@ excluded_attributes:
- :verification_checksum - :verification_checksum
- :verification_failure - :verification_failure
merge_request_diff_commits: merge_request_diff_commits:
- :author_name
- :author_email
- :committer_name
- :committer_email
- :merge_request_diff_id - :merge_request_diff_id
- :commit_author_id - :commit_author_id
- :committer_id - :committer_id
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers do RSpec.describe Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers, schema: 20211012134316 do
let(:namespaces) { table(:namespaces) } let(:namespaces) { table(:namespaces) }
let(:projects) { table(:projects) } let(:projects) { table(:projects) }
let(:users) { table(:users) } let(:users) { table(:users) }
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::StealMigrateMergeRequestDiffCommitUsers do RSpec.describe Gitlab::BackgroundMigration::StealMigrateMergeRequestDiffCommitUsers, schema: 20211012134316 do
let(:migration) { described_class.new } let(:migration) { described_class.new }
describe '#perform' do describe '#perform' do
......
...@@ -23,7 +23,7 @@ RSpec.describe Gitlab::ImportExport::Project::TreeRestorer do ...@@ -23,7 +23,7 @@ RSpec.describe Gitlab::ImportExport::Project::TreeRestorer do
] ]
RSpec::Mocks.with_temporary_scope do RSpec::Mocks.with_temporary_scope do
@project = create(:project, :builds_enabled, :issues_disabled, name: 'project', path: 'project') @project = create(:project, :repository, :builds_enabled, :issues_disabled, name: 'project', path: 'project')
@shared = @project.import_export_shared @shared = @project.import_export_shared
stub_all_feature_flags stub_all_feature_flags
...@@ -36,7 +36,6 @@ RSpec.describe Gitlab::ImportExport::Project::TreeRestorer do ...@@ -36,7 +36,6 @@ RSpec.describe Gitlab::ImportExport::Project::TreeRestorer do
allow_any_instance_of(Gitlab::Git::Repository).to receive(:branch_exists?).and_return(false) allow_any_instance_of(Gitlab::Git::Repository).to receive(:branch_exists?).and_return(false)
expect(@shared).not_to receive(:error) expect(@shared).not_to receive(:error)
expect_any_instance_of(Gitlab::Git::Repository).to receive(:create_branch).with('feature', 'DCBA')
allow_any_instance_of(Gitlab::Git::Repository).to receive(:create_branch) allow_any_instance_of(Gitlab::Git::Repository).to receive(:create_branch)
project_tree_restorer = described_class.new(user: @user, shared: @shared, project: @project) project_tree_restorer = described_class.new(user: @user, shared: @shared, project: @project)
......
# frozen_string_literal: true
require 'spec_helper'
require_migration! 'clean_up_migrate_merge_request_diff_commit_users'
RSpec.describe CleanUpMigrateMergeRequestDiffCommitUsers, :migration do
describe '#up' do
context 'when there are pending jobs' do
it 'processes the jobs immediately' do
Gitlab::Database::BackgroundMigrationJob.create!(
class_name: 'MigrateMergeRequestDiffCommitUsers',
status: :pending,
arguments: [10, 20]
)
spy = Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers
migration = described_class.new
allow(Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers)
.to receive(:new)
.and_return(spy)
expect(migration).to receive(:say)
expect(spy).to receive(:perform).with(10, 20)
migration.up
end
end
context 'when all jobs are completed' do
it 'does nothing' do
Gitlab::Database::BackgroundMigrationJob.create!(
class_name: 'MigrateMergeRequestDiffCommitUsers',
status: :succeeded,
arguments: [10, 20]
)
migration = described_class.new
expect(migration).not_to receive(:say)
expect(Gitlab::BackgroundMigration::MigrateMergeRequestDiffCommitUsers)
.not_to receive(:new)
migration.up
end
end
end
end
...@@ -46,11 +46,7 @@ RSpec.describe MergeRequestDiffCommit do ...@@ -46,11 +46,7 @@ RSpec.describe MergeRequestDiffCommit do
{ {
"message": "Add submodule from gitlab.com\n\nSigned-off-by: Dmitriy Zaporozhets \u003cdmitriy.zaporozhets@gmail.com\u003e\n", "message": "Add submodule from gitlab.com\n\nSigned-off-by: Dmitriy Zaporozhets \u003cdmitriy.zaporozhets@gmail.com\u003e\n",
"authored_date": "2014-02-27T10:01:38.000+01:00".to_time, "authored_date": "2014-02-27T10:01:38.000+01:00".to_time,
"author_name": "Dmitriy Zaporozhets",
"author_email": "dmitriy.zaporozhets@gmail.com",
"committed_date": "2014-02-27T10:01:38.000+01:00".to_time, "committed_date": "2014-02-27T10:01:38.000+01:00".to_time,
"committer_name": "Dmitriy Zaporozhets",
"committer_email": "dmitriy.zaporozhets@gmail.com",
"commit_author_id": an_instance_of(Integer), "commit_author_id": an_instance_of(Integer),
"committer_id": an_instance_of(Integer), "committer_id": an_instance_of(Integer),
"merge_request_diff_id": merge_request_diff_id, "merge_request_diff_id": merge_request_diff_id,
...@@ -61,11 +57,7 @@ RSpec.describe MergeRequestDiffCommit do ...@@ -61,11 +57,7 @@ RSpec.describe MergeRequestDiffCommit do
{ {
"message": "Change some files\n\nSigned-off-by: Dmitriy Zaporozhets \u003cdmitriy.zaporozhets@gmail.com\u003e\n", "message": "Change some files\n\nSigned-off-by: Dmitriy Zaporozhets \u003cdmitriy.zaporozhets@gmail.com\u003e\n",
"authored_date": "2014-02-27T09:57:31.000+01:00".to_time, "authored_date": "2014-02-27T09:57:31.000+01:00".to_time,
"author_name": "Dmitriy Zaporozhets",
"author_email": "dmitriy.zaporozhets@gmail.com",
"committed_date": "2014-02-27T09:57:31.000+01:00".to_time, "committed_date": "2014-02-27T09:57:31.000+01:00".to_time,
"committer_name": "Dmitriy Zaporozhets",
"committer_email": "dmitriy.zaporozhets@gmail.com",
"commit_author_id": an_instance_of(Integer), "commit_author_id": an_instance_of(Integer),
"committer_id": an_instance_of(Integer), "committer_id": an_instance_of(Integer),
"merge_request_diff_id": merge_request_diff_id, "merge_request_diff_id": merge_request_diff_id,
...@@ -111,11 +103,7 @@ RSpec.describe MergeRequestDiffCommit do ...@@ -111,11 +103,7 @@ RSpec.describe MergeRequestDiffCommit do
[{ [{
"message": "Weird commit date\n", "message": "Weird commit date\n",
"authored_date": timestamp, "authored_date": timestamp,
"author_name": "Alejandro Rodríguez",
"author_email": "alejorro70@gmail.com",
"committed_date": timestamp, "committed_date": timestamp,
"committer_name": "Alejandro Rodríguez",
"committer_email": "alejorro70@gmail.com",
"commit_author_id": an_instance_of(Integer), "commit_author_id": an_instance_of(Integer),
"committer_id": an_instance_of(Integer), "committer_id": an_instance_of(Integer),
"merge_request_diff_id": merge_request_diff_id, "merge_request_diff_id": merge_request_diff_id,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment