Commit 5dc6368c authored by Heinrich Lee Yu

Add migration to backfill search data

Runs a batched migration job to backfill issue search data
parent cd8596fe
# frozen_string_literal: true

class BackfillIssueSearchData < Gitlab::Database::Migration[1.0]
  MIGRATION = 'BackfillIssueSearchData'

  # Queues the batched background migration over `issues.id`: batches of
  # 100,000 rows are scheduled every 5 minutes and processed in
  # sub-batches of 1,000 rows.
  def up
    queue_batched_background_migration(
      MIGRATION,
      :issues,
      :id,
      batch_size: 100_000,
      sub_batch_size: 1_000,
      job_interval: 5.minutes
    )
  end

  # Removes the queued batched migration for this configuration.
  def down
    Gitlab::Database::BackgroundMigration::BatchedMigration
      .for_configuration(MIGRATION, :issues, :id, [])
      .delete_all
  end
end
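
Note (not part of the commit): a minimal sanity-check sketch, assuming a Rails console on an instance where this migration has run. It looks up the queued record using the same configuration identity that the `down` method above uses.

# Illustrative only: confirm the batched migration was enqueued for this
# job class / table / column combination.
Gitlab::Database::BackgroundMigration::BatchedMigration
  .for_configuration('BackfillIssueSearchData', :issues, :id, [])
  .exists?
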
# frozen_string_literal: true
# rubocop:disable Style/Documentation

module Gitlab
  module BackgroundMigration
    # Backfills the new `issue_search_data` table, which contains
    # the tsvector from the issue title and description.
    class BackfillIssueSearchData
      include Gitlab::Database::DynamicModelHelpers

      def perform(start_id, stop_id, batch_table, batch_column, sub_batch_size, pause_ms)
        define_batchable_model(batch_table, connection: ActiveRecord::Base.connection)
          .where(batch_column => start_id..stop_id)
          .each_batch(of: sub_batch_size) do |sub_batch|
          update_search_data(sub_batch)

          sleep(pause_ms * 0.001)
        rescue ActiveRecord::StatementInvalid => e
          raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')

          # A row in this sub-batch exceeds the tsvector size limit, so retry
          # the sub-batch one issue at a time and skip only the oversized rows.
          update_search_data_individually(sub_batch, pause_ms)
        end
      end

      private

      def update_search_data(relation)
        relation.klass.connection.execute(
          <<~SQL
          INSERT INTO issue_search_data (issue_id, search_vector, created_at, updated_at)
          SELECT
            id,
            -- Title is weighted higher than description. Long base64-like runs
            -- are stripped from the description and the input is truncated to
            -- 1 MiB to reduce the chance of exceeding the tsvector size limit.
            setweight(to_tsvector('english', LEFT(title, 255)), 'A') ||
            setweight(to_tsvector('english', LEFT(REGEXP_REPLACE(description, '[A-Za-z0-9+/]{50,}', ' ', 'g'), 1048576)), 'B'),
            NOW(),
            NOW()
          FROM issues
          WHERE issues.id IN (#{relation.select(:id).to_sql})
          ON CONFLICT DO NOTHING
          SQL
        )
      end

      def update_search_data_individually(relation, pause_ms)
        relation.pluck(:id).each do |issue_id|
          update_search_data(relation.klass.where(id: issue_id))

          sleep(pause_ms * 0.001)
        rescue ActiveRecord::StatementInvalid => e
          raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')

          logger.error(
            message: 'Error updating search data: string is too long for tsvector',
            class: relation.klass.name,
            model_id: issue_id
          )
        end
      end

      def logger
        @logger ||= Gitlab::BackgroundMigration::Logger.build
      end
    end
  end
end
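
Usage sketch (not part of the commit): the job above can be exercised directly, for example from a Rails console. The argument order mirrors the `perform` signature; the ID range, sub-batch size, and pause below are placeholder values, not the configuration queued by the migration.

# Illustrative only: run the job synchronously over a small ID range.
Gitlab::BackgroundMigration::BackfillIssueSearchData.new.perform(
  1,        # start_id (placeholder)
  1_000,    # stop_id (placeholder)
  :issues,  # batch_table
  :id,      # batch_column
  100,      # sub_batch_size
  100       # pause_ms => sleeps 0.1s between sub-batches
)
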
# frozen_string_literal: true

require 'spec_helper'

RSpec.describe Gitlab::BackgroundMigration::BackfillIssueSearchData do
  let(:issues_table) { table(:issues) }
  let(:issue_search_data_table) { table(:issue_search_data) }

  let!(:issues) { Array.new(10) { issues_table.create!(title: 'test title', description: 'test description') } }

  let(:migration) { described_class.new }

  it 'backfills search data for the specified records' do
    # sleeps for every sub-batch
    expect(migration).to receive(:sleep).with(0.05).exactly(3).times

    migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50)

    expect(issue_search_data_table.count).to eq(6)
  end

  it 'skips issues that already have search data' do
    old_time = Time.new(2019, 1, 1).in_time_zone
    issue_search_data_table.create!(issue_id: issues[0].id, updated_at: old_time)

    migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50)

    expect(issue_search_data_table.count).to eq(6)
    expect(issue_search_data_table.find(issues[0].id).updated_at).to be_like_time(old_time)
  end

  it 'rescues batch with bad data and inserts other rows' do
    issues[1].update!(description: Array.new(30_000) { SecureRandom.hex }.join(' '))

    expect_next_instance_of(Gitlab::BackgroundMigration::Logger) do |logger|
      expect(logger).to receive(:error).with(
        a_hash_including(message: /string is too long for tsvector/, model_id: issues[1].id)
      )
    end

    expect { migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50) }.not_to raise_error

    expect(issue_search_data_table.count).to eq(5)
    expect(issue_search_data_table.find_by_issue_id(issues[1].id)).to eq(nil)
  end

  it 're-raises other errors' do
    allow(migration).to receive(:update_search_data).and_raise(ActiveRecord::StatementTimeout)

    expect { migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50) }.to raise_error(ActiveRecord::StatementTimeout)
  end
end