MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing

out of order at retry The test failures were of two sorts. One is that the number of retries what the slave thought as a temporary error exceeded the default value of the slave retry option. The 2nd issue was an out of order commit by transactions that were supposed to error out instead. Both issues are caused by the same reason that the post-temporary-error retry did not check possibly already existing error status. This is mended with refining conditions to retry. Specifically, a retrying worker checks `rpl_parallel_entry::stop_on_error_sub_id` that a potential failing predecessor could set to its own sub id. Now should the member be set the retrying follower errors out with ER_PRIOR_COMMIT_FAILED.

MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing
out of order at retry The test failures were of two sorts. One is that the number of retries what the slave thought as a temporary error exceeded the default value of the slave retry option. The 2nd issue was an out of order commit by transactions that were supposed to error out instead. Both issues are caused by the same reason that the post-temporary-error retry did not check possibly already existing error status. This is mended with refining conditions to retry. Specifically, a retrying worker checks `rpl_parallel_entry::stop_on_error_sub_id` that a potential failing predecessor could set to its own sub id. Now should the member be set the retrying follower errors out with ER_PRIOR_COMMIT_FAILED.
30019a48 · Andrei Elkin · 76ae6e72 · 30019a48 · 30019a48 · 30019a48
Commit 30019a48 authored Feb 09, 2018 by Andrei Elkin
3 changed files
--- a/mysql-test/suite/rpl/r/rpl_parallel_retry.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel_retry.result
@@ -120,6 +120,7 @@ connection server_2;
 SET sql_log_bin=0;
 CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
 CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
+CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
 SET sql_log_bin=1;
 SET @old_dbug= @@GLOBAL.debug_dbug;
 SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100";
@@ -340,4 +341,60 @@ include/start_slave.inc
 connection server_1;
 DROP TABLE t1, t2, t3, t4;
 DROP function foo;
+connection server_2;
+connection server_1;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+connection server_2;
+include/stop_slave.inc
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @@GLOBAL.slave_parallel_threads=5;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
+SET @@GLOBAL.slave_parallel_mode='aggressive';
+SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
+SET @@GLOBAL.innodb_lock_wait_timeout=2;
+SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
+SET @@GLOBAL.slave_transaction_retries=1;
+# Spoilers on the slave side causing temporary errors
+connect  spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT;
+BEGIN;
+INSERT INTO t1 SET a=1,b=2;
+connect  spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT;
+BEGIN;
+INSERT INTO t1 SET a=2,b=2;
+# Master payload
+connection server_1;
+SET @@SESSION.GTID_SEQ_NO=1000;
+INSERT INTO t1 SET a=1,b=1;
+SET @@SESSION.GTID_SEQ_NO=1001;
+INSERT INTO t1 SET a=2,b=1;
+# Start slave whose both appliers is destined to being blocked
+connection server_2;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
+include/start_slave.inc
+# Make sure the 2nd seqno_1001 worker has gotten to waiting
+# Signal to the 1st to proceed after it has reached termination state
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
+connection spoiler_21;
+ROLLBACK;
+# Release the 2nd worker to proceed
+connection spoiler_22;
+ROLLBACK;
+connection server_2;
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
+# observe how it all ends
+# Wait for the workers to go home and check the result of applying
+# which is OK
+connection server_2;
+include/stop_slave.inc
+SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
+SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
+SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
+SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
+SET @@GLOBAL.debug_dbug=@old_dbug;
+SET debug_sync='RESET';
+include/start_slave.inc
+connection server_1;
+DROP TABLE t1;
+connection server_2;
 include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_parallel_retry.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel_retry.test
@@ -128,6 +128,7 @@ SELECT * FROM t1 ORDER BY a;
 SET sql_log_bin=0;
 CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
 CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
+CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
 SET sql_log_bin=1;

 SET @old_dbug= @@GLOBAL.debug_dbug;
@@ -371,7 +372,7 @@ SELECT * FROM t3 ORDER BY a;
 SET binlog_format=@old_format;


-# Clean up.
+# Clean up of the above part.
 --connection server_2
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
@@ -381,4 +382,102 @@ SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 DROP TABLE t1, t2, t3, t4;
 DROP function foo;

+--sync_slave_with_master server_2
+
+#
+# MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing out of order at retry
+#
+
+--connection server_1
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+
+
+# Replicate create-t1 and prepare to re-start slave in optimistic mode
+--sync_slave_with_master server_2
+--source include/stop_slave.inc
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @@GLOBAL.slave_parallel_threads=5;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
+SET @@GLOBAL.slave_parallel_mode='aggressive';
+SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
+SET @@GLOBAL.innodb_lock_wait_timeout=2;
+SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
+SET @@GLOBAL.slave_transaction_retries=1;
+
+--echo # Spoilers on the slave side causing temporary errors
+--connect (spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT)
+BEGIN;
+  INSERT INTO t1 SET a=1,b=2;
+
+--connect (spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT)
+BEGIN;
+  INSERT INTO t1 SET a=2,b=2;
+
+--echo # Master payload
+--connection server_1
+SET @@SESSION.GTID_SEQ_NO=1000;
+INSERT INTO t1 SET a=1,b=1;
+SET @@SESSION.GTID_SEQ_NO=1001;
+INSERT INTO t1 SET a=2,b=1;
+
+--echo # Start slave whose both appliers is destined to being blocked
+--connection server_2
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
+--source include/start_slave.inc
+
+--echo # Make sure the 2nd seqno_1001 worker has gotten to waiting
+--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE '%debug sync point: now%';
+--source include/wait_condition.inc
+
+
+--echo # Signal to the 1st to proceed after it has reached termination state
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
+--connection spoiler_21
+ROLLBACK;
+
+--echo # Release the 2nd worker to proceed
+--connection spoiler_22
+ROLLBACK;
+--connection server_2
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
+
+--echo # observe how it all ends
+if (`SELECT count(*) = 1 FROM t1 WHERE a = 1`)
+{
+  --echo "*** Unexpected commit by the first Worker ***"
+  SELECT * from t1;
+  --die
+}
+
+--echo # Wait for the workers to go home and check the result of applying
+--let $wait_condition=SELECT count(*) = 0 FROM information_schema.processlist WHERE command = 'Slave_worker'
+--source include/wait_condition.inc
+if (`SELECT count(*) = 1 FROM t1 WHERE a = 2`)
+{
+  --echo
+  --echo "*** Error: congrats, you hit MDEV-12746 issue. ***"
+  --echo
+  --die
+}
+--echo # which is OK
+
+#
+# Clean up
+#
+--connection server_2
+--source include/stop_slave.inc
+SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
+SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
+SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
+SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
+SET @@GLOBAL.debug_dbug=@old_dbug;
+SET debug_sync='RESET';
+--source include/start_slave.inc
+
+--connection server_1
+DROP TABLE t1;
+
+--sync_slave_with_master server_2
+
 --source include/rpl_end.inc
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -229,6 +229,14 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
    entry->stop_on_error_sub_id= sub_id;
  mysql_mutex_unlock(&entry->LOCK_parallel_entry);

+  DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
+      if (rgi->current_gtid.seq_no == 1000) {
+        DBUG_ASSERT(entry->stop_on_error_sub_id == sub_id);
+        debug_sync_set_action(thd,
+                              STRING_WITH_LEN("now WAIT_FOR proceed_by_1000"));
+      }
+    });
+
  if (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING)
    wait_for_pending_deadlock_kill(thd, rgi);
  thd->clear_error();
@@ -716,12 +724,20 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
    unregistering (and later re-registering) the wait.
  */
  if(thd->wait_for_commit_ptr)
-    thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
+      thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
  DBUG_EXECUTE_IF("inject_mdev8031", {
      /* Simulate that we get deadlock killed at this exact point. */
      rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
      thd->set_killed(KILL_CONNECTION);
  });
+  DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
+      if (rgi->current_gtid.seq_no == 1001) {
+        debug_sync_set_action(thd,
+                              STRING_WITH_LEN("rpl_parallel_simulate_wait_at_retry WAIT_FOR proceed_by_1001"));
+      }
+      DEBUG_SYNC(thd, "rpl_parallel_simulate_wait_at_retry");
+    });
+
  rgi->cleanup_context(thd, 1);
  wait_for_pending_deadlock_kill(thd, rgi);
  thd->reset_killed();
@@ -745,7 +761,26 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
  for (;;)
  {
    mysql_mutex_lock(&entry->LOCK_parallel_entry);
-    register_wait_for_prior_event_group_commit(rgi, entry);
+    if (entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX ||
+#ifndef DBUG_OFF
+        (DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) ||
+#endif
+        rgi->gtid_sub_id < entry->stop_on_error_sub_id)
+    {
+      register_wait_for_prior_event_group_commit(rgi, entry);
+    }
+    else
+    {
+      /*
+        A failure of a preceeding "parent" transaction may not be
+        seen by the current one through its own worker_error.
+        Such induced error gets set by ourselves now.
+      */
+      err= rgi->worker_error= 1;
+      my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
+      mysql_mutex_unlock(&entry->LOCK_parallel_entry);
+      goto err;
+    }
    mysql_mutex_unlock(&entry->LOCK_parallel_entry);

    /*