MDEV-6549, failing to update gtid_slave_pos for a transaction that was retried.

The bug was that in some cases, if a replicated transaction was rolled back due to a deadlock, then during the subsequent retry of that transaction the gtid_slave_pos would _not_ be updated with the new GTID, leaving the GTID position of the slave incorrect. Fix this by ensuring during the retry that we reset the flag that marks that the GTID has already been recorded in gtid_slave_pos (i.e. set rgi->gtid_pending back to true), so that the update of gtid_slave_pos will be done again during the retry. In the original bug, the symptom was an assertion due to OPTION_GTID_BEGIN not being cleared during the retry of the transaction. The cause was code in the handling of a COMMIT query event, which did not clear the flag when it was not recording a GTID in gtid_slave_pos. This commit also changes that code to always clear the OPTION_GTID_BEGIN flag, for clarity, though it is actually not possible for OPTION_GTID_BEGIN to become set unless a GTID is pending for update (after fixing the bug described above).

MDEV-6549, failing to update gtid_slave_pos for a transaction that was retried.
The bug was that in some cases, if a replicated transaction was rolled back due to a deadlock, then during the subsequent retry of that transaction the gtid_slave_pos would _not_ be updated with the new GTID, leaving the GTID position of the slave incorrect. Fix this by ensuring during the retry that we reset the flag that marks that the GTID has already been recorded in gtid_slave_pos (i.e. set rgi->gtid_pending back to true), so that the update of gtid_slave_pos will be done again during the retry. In the original bug, the symptom was an assertion due to OPTION_GTID_BEGIN not being cleared during the retry of the transaction. The cause was code in the handling of a COMMIT query event, which did not clear the flag when it was not recording a GTID in gtid_slave_pos. This commit also changes that code to always clear the OPTION_GTID_BEGIN flag, for clarity, though it is actually not possible for OPTION_GTID_BEGIN to become set unless a GTID is pending for update (after fixing the bug described above).
ec05fea0 · Kristian Nielsen · 354f3f1f · ec05fea0 · ec05fea0 · ec05fea0
Commit ec05fea0 authored Aug 13, 2014 by Kristian Nielsen
5 changed files
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -793,6 +793,7 @@ SET debug_sync='now WAIT_FOR master_queued2';
 SET debug_sync='now SIGNAL master_cont1';
 SET debug_sync='RESET';
 include/start_slave.inc
+include/stop_slave.inc
 SELECT * FROM t4 ORDER BY a;
 a	b
 1	NULL
@@ -801,6 +802,42 @@ a	b
 5	NULL
 6	6
 7	NULL
+DELETE FROM t4;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+UPDATE t4 SET b=NULL WHERE a=6;
+SET debug_sync='now WAIT_FOR master_queued1';
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+DELETE FROM t4 WHERE b <= 1;
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+SET @old_format=@@GLOBAL.binlog_format;
+SET debug_sync='RESET';
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,disable_thd_need_ordering_with";
+include/start_slave.inc
+SET GLOBAL debug_dbug=@old_dbug;
+SELECT * FROM t4 ORDER BY a;
+a	b
+1	NULL
+2	2
+3	NULL
+4	4
+5	NULL
+6	NULL
+SET @last_gtid= 'GTID';
+SELECT IF(@@gtid_slave_pos LIKE CONCAT('%',@last_gtid,'%'), "GTID found ok",
+CONCAT("GTID ", @last_gtid, " not found in gtid_slave_pos=", @@gtid_slave_pos))
+AS result;
+result
+GTID found ok
+SELECT "ROW FOUND" AS `Is the row found?`
+  FROM mysql.gtid_slave_pos
+WHERE CONCAT(domain_id, "-", server_id, "-", seq_no) = @last_gtid;
+Is the row found?
+ROW FOUND
 *** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication ***
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=1;

--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -1246,8 +1246,76 @@ SET debug_sync='RESET';
 --connection server_2
 --source include/start_slave.inc
 --sync_with_master
+--source include/stop_slave.inc
+
+SELECT * FROM t4 ORDER BY a;
+
+
+# MDEV-6549, failing to update gtid_slave_pos for a transaction that was retried.
+# The problem was that when a transaction updates the mysql.gtid_slave_pos
+# table, it clears the flag that marks that there is a GTID position that
+# needs to be updated. Then, if the transaction got killed after that due
+# to a deadlock, the subsequent retry would fail to notice that the GTID needs
+# to be recorded in gtid_slave_pos.
+#
+# (In the original bug report, the symptom was an assertion; this was however
+# just a side effect of the missing update of gtid_slave_pos, which also
+# happened to cause a missing clear of OPTION_GTID_BEGIN).
+--connection server_1
+DELETE FROM t4;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+
+# Create two transactions that can run in parallel on the slave but cause
+# a deadlock if the second runs before the first.
+--connection con1
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+send UPDATE t4 SET b=NULL WHERE a=6;
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued1';
+
+--connection con2
+# Must use statement-based binlogging. Otherwise the transaction will not be
+# binlogged at all, as it modifies no rows.
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+send DELETE FROM t4 WHERE b <= 1;
+
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+
+--connection con1
+REAP;
+--connection con2
+REAP;
+SET @old_format=@@GLOBAL.binlog_format;
+SET debug_sync='RESET';
+--save_master_pos
+--let $last_gtid= `SELECT @@last_gtid`
+
+--connection server_2
+# Disable the usual skip of gap locks for transactions that are run in
+# parallel, using DBUG. This allows the deadlock to occur, and this in turn
+# triggers a retry of the second transaction, which exercises the code that
+# was buggy and caused the gtid_slave_pos update to be skipped in the retry.
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,disable_thd_need_ordering_with";
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;

 SELECT * FROM t4 ORDER BY a;
+# Check that the GTID of the second transaction was correctly recorded in
+# gtid_slave_pos, in the variable as well as in the table.
+--replace_result $last_gtid GTID
+eval SET @last_gtid= '$last_gtid';
+SELECT IF(@@gtid_slave_pos LIKE CONCAT('%',@last_gtid,'%'), "GTID found ok",
+    CONCAT("GTID ", @last_gtid, " not found in gtid_slave_pos=", @@gtid_slave_pos))
+    AS result;
+SELECT "ROW FOUND" AS `Is the row found?`
+  FROM mysql.gtid_slave_pos
+ WHERE CONCAT(domain_id, "-", server_id, "-", seq_no) = @last_gtid;


 --echo *** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication ***

--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -4265,28 +4265,31 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
        Record any GTID in the same transaction, so slave state is
        transactionally consistent.
      */
-      if (current_stmt_is_commit && rgi->gtid_pending)
+      if (current_stmt_is_commit)
      {
-        sub_id= rgi->gtid_sub_id;
-        rgi->gtid_pending= false;
-
-        gtid= rgi->current_gtid;
        thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
-        if (rpl_global_gtid_slave_state.record_gtid(thd, &gtid, sub_id, true, false))
+        if (rgi->gtid_pending)
        {
-          int errcode= thd->get_stmt_da()->sql_errno();
-          if (!is_parallel_retry_error(rgi, errcode))
-            rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
-                        rgi->gtid_info(),
-                        "Error during COMMIT: failed to update GTID state in "
-                      "%s.%s: %d: %s",
-                        "mysql", rpl_gtid_slave_state_table_name.str,
-                        errcode,
-                        thd->get_stmt_da()->message());
-          trans_rollback(thd);
-          sub_id= 0;
-          thd->is_slave_error= 1;
-          goto end;
+          sub_id= rgi->gtid_sub_id;
+          rgi->gtid_pending= false;
+
+          gtid= rgi->current_gtid;
+          if (rpl_global_gtid_slave_state.record_gtid(thd, &gtid, sub_id, true, false))
+          {
+            int errcode= thd->get_stmt_da()->sql_errno();
+            if (!is_parallel_retry_error(rgi, errcode))
+              rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
+                          rgi->gtid_info(),
+                          "Error during COMMIT: failed to update GTID state in "
+                        "%s.%s: %d: %s",
+                          "mysql", rpl_gtid_slave_state_table_name.str,
+                          errcode,
+                          thd->get_stmt_da()->message());
+            trans_rollback(thd);
+            sub_id= 0;
+            thd->is_slave_error= 1;
+            goto end;
+          }
        }
      }


--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -318,6 +318,15 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
    thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
  rgi->cleanup_context(thd, 1);

+  /*
+    If we retry due to a deadlock kill that occurred during the commit step, we
+    might have already made (but not committed) an update of table
+    mysql.gtid_slave_pos, and cleared the gtid_pending flag. Now we have
+    rolled back any such update, so we must set the gtid_pending flag back to
+    true so that we will do a new update when/if we succeed with the retry.
+  */
+  rgi->gtid_pending= true;
+
  mysql_mutex_lock(&rli->data_lock);
  ++rli->retried_trans;
  statistic_increment(slave_retried_transactions, LOCK_status);

--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -4346,6 +4346,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
 {
  rpl_group_info *rgi, *other_rgi;

+  DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;);
  if (!thd || !other_thd)
    return 1;
  rgi= thd->rgi_slave;
@@ -4361,7 +4362,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
  if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id)
    return 1;
  /*
-    These two threads are doing parallel replication within the same
+    Otherwise, these two threads are doing parallel replication within the same
    replication domain. Their commit order is already fixed, so we do not need
    gap locks or similar to otherwise enforce ordering (and in fact such locks
    could lead to unnecessary deadlocks and transaction retry).