Commit 30019a48 authored by Andrei Elkin's avatar Andrei Elkin

MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing

           out of order at retry

The test failures were of two sorts. One is that the number of retries
what the slave thought as a temporary error exceeded
the default value of the slave retry option.
The 2nd issue was an out of order commit by transactions that
were supposed to error out instead.
Both issues are caused by the same reason that the post-temporary-error
retry did not check possibly already existing error status.

This is mended with refining conditions to retry. Specifically, a retrying
worker checks `rpl_parallel_entry::stop_on_error_sub_id` that
a potential failing predecessor could set to its own sub id.
Now should the member be set the retrying follower errors out with
ER_PRIOR_COMMIT_FAILED.
parent 76ae6e72
......@@ -120,6 +120,7 @@ connection server_2;
SET sql_log_bin=0;
CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
SET sql_log_bin=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100";
......@@ -340,4 +341,60 @@ include/start_slave.inc
connection server_1;
DROP TABLE t1, t2, t3, t4;
DROP function foo;
connection server_2;
connection server_1;
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
connection server_2;
include/stop_slave.inc
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @@GLOBAL.slave_parallel_threads=5;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET @@GLOBAL.slave_parallel_mode='aggressive';
SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
SET @@GLOBAL.innodb_lock_wait_timeout=2;
SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
SET @@GLOBAL.slave_transaction_retries=1;
# Spoilers on the slave side causing temporary errors
connect spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT;
BEGIN;
INSERT INTO t1 SET a=1,b=2;
connect spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT;
BEGIN;
INSERT INTO t1 SET a=2,b=2;
# Master payload
connection server_1;
SET @@SESSION.GTID_SEQ_NO=1000;
INSERT INTO t1 SET a=1,b=1;
SET @@SESSION.GTID_SEQ_NO=1001;
INSERT INTO t1 SET a=2,b=1;
# Start slave whose both appliers is destined to being blocked
connection server_2;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
include/start_slave.inc
# Make sure the 2nd seqno_1001 worker has gotten to waiting
# Signal to the 1st to proceed after it has reached termination state
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
connection spoiler_21;
ROLLBACK;
# Release the 2nd worker to proceed
connection spoiler_22;
ROLLBACK;
connection server_2;
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
# observe how it all ends
# Wait for the workers to go home and check the result of applying
# which is OK
connection server_2;
include/stop_slave.inc
SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
SET @@GLOBAL.debug_dbug=@old_dbug;
SET debug_sync='RESET';
include/start_slave.inc
connection server_1;
DROP TABLE t1;
connection server_2;
include/rpl_end.inc
......@@ -128,6 +128,7 @@ SELECT * FROM t1 ORDER BY a;
SET sql_log_bin=0;
CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
SET sql_log_bin=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
......@@ -371,7 +372,7 @@ SELECT * FROM t3 ORDER BY a;
SET binlog_format=@old_format;
# Clean up.
# Clean up of the above part.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
......@@ -381,4 +382,102 @@ SET GLOBAL slave_parallel_threads=@old_parallel_threads;
DROP TABLE t1, t2, t3, t4;
DROP function foo;
--sync_slave_with_master server_2
#
# MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing out of order at retry
#
--connection server_1
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
# Replicate create-t1 and prepare to re-start slave in optimistic mode
--sync_slave_with_master server_2
--source include/stop_slave.inc
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @@GLOBAL.slave_parallel_threads=5;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET @@GLOBAL.slave_parallel_mode='aggressive';
SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
SET @@GLOBAL.innodb_lock_wait_timeout=2;
SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
SET @@GLOBAL.slave_transaction_retries=1;
--echo # Spoilers on the slave side causing temporary errors
--connect (spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
INSERT INTO t1 SET a=1,b=2;
--connect (spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
INSERT INTO t1 SET a=2,b=2;
--echo # Master payload
--connection server_1
SET @@SESSION.GTID_SEQ_NO=1000;
INSERT INTO t1 SET a=1,b=1;
SET @@SESSION.GTID_SEQ_NO=1001;
INSERT INTO t1 SET a=2,b=1;
--echo # Start slave whose both appliers is destined to being blocked
--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
--source include/start_slave.inc
--echo # Make sure the 2nd seqno_1001 worker has gotten to waiting
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE '%debug sync point: now%';
--source include/wait_condition.inc
--echo # Signal to the 1st to proceed after it has reached termination state
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
--connection spoiler_21
ROLLBACK;
--echo # Release the 2nd worker to proceed
--connection spoiler_22
ROLLBACK;
--connection server_2
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
--echo # observe how it all ends
if (`SELECT count(*) = 1 FROM t1 WHERE a = 1`)
{
--echo "*** Unexpected commit by the first Worker ***"
SELECT * from t1;
--die
}
--echo # Wait for the workers to go home and check the result of applying
--let $wait_condition=SELECT count(*) = 0 FROM information_schema.processlist WHERE command = 'Slave_worker'
--source include/wait_condition.inc
if (`SELECT count(*) = 1 FROM t1 WHERE a = 2`)
{
--echo
--echo "*** Error: congrats, you hit MDEV-12746 issue. ***"
--echo
--die
}
--echo # which is OK
#
# Clean up
#
--connection server_2
--source include/stop_slave.inc
SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
SET @@GLOBAL.debug_dbug=@old_dbug;
SET debug_sync='RESET';
--source include/start_slave.inc
--connection server_1
DROP TABLE t1;
--sync_slave_with_master server_2
--source include/rpl_end.inc
......@@ -229,6 +229,14 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
entry->stop_on_error_sub_id= sub_id;
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
if (rgi->current_gtid.seq_no == 1000) {
DBUG_ASSERT(entry->stop_on_error_sub_id == sub_id);
debug_sync_set_action(thd,
STRING_WITH_LEN("now WAIT_FOR proceed_by_1000"));
}
});
if (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING)
wait_for_pending_deadlock_kill(thd, rgi);
thd->clear_error();
......@@ -716,12 +724,20 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
unregistering (and later re-registering) the wait.
*/
if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate that we get deadlock killed at this exact point. */
rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
thd->set_killed(KILL_CONNECTION);
});
DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
if (rgi->current_gtid.seq_no == 1001) {
debug_sync_set_action(thd,
STRING_WITH_LEN("rpl_parallel_simulate_wait_at_retry WAIT_FOR proceed_by_1001"));
}
DEBUG_SYNC(thd, "rpl_parallel_simulate_wait_at_retry");
});
rgi->cleanup_context(thd, 1);
wait_for_pending_deadlock_kill(thd, rgi);
thd->reset_killed();
......@@ -745,7 +761,26 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
for (;;)
{
mysql_mutex_lock(&entry->LOCK_parallel_entry);
register_wait_for_prior_event_group_commit(rgi, entry);
if (entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX ||
#ifndef DBUG_OFF
(DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) ||
#endif
rgi->gtid_sub_id < entry->stop_on_error_sub_id)
{
register_wait_for_prior_event_group_commit(rgi, entry);
}
else
{
/*
A failure of a preceeding "parent" transaction may not be
seen by the current one through its own worker_error.
Such induced error gets set by ourselves now.
*/
err= rgi->worker_error= 1;
my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
goto err;
}
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment