Commit 43234742 authored by Robert Speicher, committed by Robert Speicher

Merge branch 'update-column-in-batches-where' into 'master'

Allow customising of queries used for `update_column_in_batches`

This MR makes two changes to `add_column_with_default` and `update_column_in_batches`:

1. `add_column_with_default` no longer wraps the entire set of updates in a single transaction, preventing any locks from sticking around for the duration of the entire transaction
2. `update_column_in_batches` now takes a block which can be used to customise the queries. This uses Arel, as messing with raw SQL strings is a total pain

In !4381 there's a need for updating existing rows/columns in a table in batches using a custom `WHERE` condition. Without the changes in this MR this would not be possible.

See merge request !4680
parent 4a1b42b0
...@@ -28,65 +28,79 @@ module Gitlab ...@@ -28,65 +28,79 @@ module Gitlab
# Updates the value of a column in batches. # Updates the value of a column in batches.
# #
# This method updates the table in batches of 5% of the total row count. # This method updates the table in batches of 5% of the total row count.
# Any data inserted while running this method (or after it has finished # This method will continue updating rows until no rows remain.
# running) is _not_ updated automatically. #
# When given a block this method will yield two values to the block:
#
# 1. An instance of `Arel::Table` for the table that is being updated.
# 2. The query to run as an Arel object.
#
# By supplying a block one can add extra conditions to the queries being
# executed. Note that the same block is used for _all_ queries.
#
# Example:
#
# update_column_in_batches(:projects, :foo, 10) do |table, query|
# query.where(table[:some_column].eq('hello'))
# end
#
# This would result in this method updating only rows where
# `projects.some_column` equals "hello".
# #
# table - The name of the table. # table - The name of the table.
# column - The name of the column to update. # column - The name of the column to update.
# value - The value for the column. # value - The value for the column.
def update_column_in_batches(table, column, value)
quoted_table = quote_table_name(table)
quoted_column = quote_column_name(column)
##
# Workaround for #17711
# #
# It looks like for MySQL `ActiveRecord::Base.conntection.quote(true)` # Rubocop's Metrics/AbcSize metric is disabled for this method as Rubocop
# returns correct value (1), but `ActiveRecord::Migration.new.quote` # determines this method to be too complex while there's no way to make it
# returns incorrect value ('true'), which causes migrations to fail. # less "complex" without introducing extra methods (which actually will
# make things _more_ complex).
# #
quoted_value = connection.quote(value) # rubocop: disable Metrics/AbcSize
processed = 0 def update_column_in_batches(table, column, value)
table = Arel::Table.new(table)
total = exec_query("SELECT COUNT(*) AS count FROM #{quoted_table}"). count_arel = table.project(Arel.star.count.as('count'))
to_hash. count_arel = yield table, count_arel if block_given?
first['count'].
to_i total = exec_query(count_arel.to_sql).to_hash.first['count'].to_i
return if total == 0
# Update in batches of 5% until we run out of any rows to update. # Update in batches of 5% until we run out of any rows to update.
batch_size = ((total / 100.0) * 5.0).ceil batch_size = ((total / 100.0) * 5.0).ceil
start_arel = table.project(table[:id]).order(table[:id].asc).take(1)
start_arel = yield table, start_arel if block_given?
start_id = exec_query(start_arel.to_sql).to_hash.first['id'].to_i
loop do loop do
start_row = exec_query(%Q{ stop_arel = table.project(table[:id]).
SELECT id where(table[:id].gteq(start_id)).
FROM #{quoted_table} order(table[:id].asc).
ORDER BY id ASC take(1).
LIMIT 1 OFFSET #{processed} skip(batch_size)
}).to_hash.first
stop_arel = yield table, stop_arel if block_given?
# There are no more rows to process stop_row = exec_query(stop_arel.to_sql).to_hash.first
break unless start_row
update_arel = Arel::UpdateManager.new(ActiveRecord::Base).
stop_row = exec_query(%Q{ table(table).
SELECT id set([[table[column], value]]).
FROM #{quoted_table} where(table[:id].gteq(start_id))
ORDER BY id ASC
LIMIT 1 OFFSET #{processed + batch_size}
}).to_hash.first
query = %Q{
UPDATE #{quoted_table}
SET #{quoted_column} = #{quoted_value}
WHERE id >= #{start_row['id']}
}
if stop_row if stop_row
query += " AND id < #{stop_row['id']}" stop_id = stop_row['id'].to_i
start_id = stop_id
update_arel = update_arel.where(table[:id].lt(stop_id))
end end
execute(query) update_arel = yield table, update_arel if block_given?
processed += batch_size execute(update_arel.to_sql)
# There are no more rows left to update.
break unless stop_row
end end
end end
...@@ -95,9 +109,9 @@ module Gitlab ...@@ -95,9 +109,9 @@ module Gitlab
# This method runs the following steps: # This method runs the following steps:
# #
# 1. Add the column with a default value of NULL. # 1. Add the column with a default value of NULL.
# 2. Update all existing rows in batches. # 2. Change the default value of the column to the specified value.
# 3. Change the default value of the column to the specified value. # 3. Update all existing rows in batches.
# 4. Update any remaining rows. # 4. Set a `NOT NULL` constraint on the column if desired (the default).
# #
# These steps ensure a column can be added to a large and commonly used # These steps ensure a column can be added to a large and commonly used
# table without locking the entire table for the duration of the table # table without locking the entire table for the duration of the table
...@@ -109,7 +123,10 @@ module Gitlab ...@@ -109,7 +123,10 @@ module Gitlab
# default - The default value for the column. # default - The default value for the column.
# allow_null - When set to `true` the column will allow NULL values, the # allow_null - When set to `true` the column will allow NULL values, the
# default is to not allow NULL values. # default is to not allow NULL values.
def add_column_with_default(table, column, type, default:, allow_null: false) #
# This method can also take a block which is passed directly to the
# `update_column_in_batches` method.
def add_column_with_default(table, column, type, default:, allow_null: false, &block)
if transaction_open? if transaction_open?
raise 'add_column_with_default can not be run inside a transaction, ' \ raise 'add_column_with_default can not be run inside a transaction, ' \
'you can disable transactions by calling disable_ddl_transaction! ' \ 'you can disable transactions by calling disable_ddl_transaction! ' \
...@@ -125,11 +142,9 @@ module Gitlab ...@@ -125,11 +142,9 @@ module Gitlab
end end
begin begin
transaction do update_column_in_batches(table, column, default, &block)
update_column_in_batches(table, column, default)
change_column_null(table, column, false) unless allow_null change_column_null(table, column, false) unless allow_null
end
# We want to rescue _all_ exceptions here, even those that don't inherit # We want to rescue _all_ exceptions here, even those that don't inherit
# from StandardError. # from StandardError.
rescue Exception => error # rubocop: disable all rescue Exception => error # rubocop: disable all
......
...@@ -71,6 +71,18 @@ describe Gitlab::Database::MigrationHelpers, lib: true do ...@@ -71,6 +71,18 @@ describe Gitlab::Database::MigrationHelpers, lib: true do
expect(Project.where(archived: true).count).to eq(5) expect(Project.where(archived: true).count).to eq(5)
end end
context 'when a block is supplied' do
it 'yields an Arel table and query object to the supplied block' do
first_id = Project.first.id
model.update_column_in_batches(:projects, :archived, true) do |t, query|
query.where(t[:id].eq(first_id))
end
expect(Project.where(archived: true).count).to eq(1)
end
end
end end
describe '#add_column_with_default' do describe '#add_column_with_default' do
...@@ -78,7 +90,7 @@ describe Gitlab::Database::MigrationHelpers, lib: true do ...@@ -78,7 +90,7 @@ describe Gitlab::Database::MigrationHelpers, lib: true do
before do before do
expect(model).to receive(:transaction_open?).and_return(false) expect(model).to receive(:transaction_open?).and_return(false)
expect(model).to receive(:transaction).twice.and_yield expect(model).to receive(:transaction).and_yield
expect(model).to receive(:add_column). expect(model).to receive(:add_column).
with(:projects, :foo, :integer, default: nil) with(:projects, :foo, :integer, default: nil)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment