MDEV-10863: parallel replication tries to continue from wrong position

This occured when the SQL thread (but not the IO thread) stops while GTID and parallel replication are used with multiple domain ids in the GTID position, and is restarted. In this case, the SQL needs to start some way back in the relay log, applying or skipping events within each replication domain as appropriate. The SQL threads starts at the beginning of an old relay log file, and this position may be in the middle of an event group. The bug was that such partial event group could be re-applied, causing replication corruption. This patch fixes the issue, by making sure to skip any initial events that were part of an earlier (already applied) event group.

MDEV-10863: parallel replication tries to continue from wrong position
This occured when the SQL thread (but not the IO thread) stops while GTID and parallel replication are used with multiple domain ids in the GTID position, and is restarted. In this case, the SQL needs to start some way back in the relay log, applying or skipping events within each replication domain as appropriate. The SQL threads starts at the beginning of an old relay log file, and this position may be in the middle of an event group. The bug was that such partial event group could be re-applied, causing replication corruption. This patch fixes the issue, by making sure to skip any initial events that were part of an earlier (already applied) event group.
717f2128 · Kristian Nielsen · eca8c324 · 717f2128 · 717f2128 · 717f2128
Commit 717f2128 authored Nov 04, 2016 by Kristian Nielsen
3 changed files
--- a/mysql-test/suite/rpl/r/rpl_mdev10863.result
+++ b/mysql-test/suite/rpl/r/rpl_mdev10863.result
+include/rpl_init.inc [topology=1->2]
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+SET @old_max_relay= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size = 4096;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+include/start_slave.inc
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, "a");
+*** Create a long transaction that will span a relay log file. ***
+SET @old_domain= @@gtid_domain_id;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (10000, "domain 10");
+SET gtid_domain_id=20;
+INSERT INTO t1 VALUES (20000, "domain 20");
+SET gtid_domain_id=@old_domain;
+BEGIN;
+[lots of inserts omitted]
+COMMIT;
+BEGIN;
+[lots of inserts omitted]
+COMMIT;
+include/stop_slave_sql.inc
+START SLAVE SQL_THREAD;
+include/wait_for_slave_to_start.inc
+INSERT INTO t1 VALUES (100000, "More stuffs.");
+INSERT INTO t1 VALUES (100001, "And even more");
+SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
+a	b
+100000	More stuffs.
+100001	And even more
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL max_relay_log_size= @old_max_relay;
+include/start_slave.inc
+DROP TABLE t1;
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_mdev10863.test
+++ b/mysql-test/suite/rpl/t/rpl_mdev10863.test
+--source include/have_innodb.inc
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
+# Test various aspects of parallel replication.
+--connection server_2
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+SET @old_max_relay= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size = 4096;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+--source include/start_slave.inc
+--connection server_1
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, "a");
+--save_master_pos
+--connection server_2
+--sync_with_master
+--echo *** Create a long transaction that will span a relay log file. ***
+--connection server_1
+# Add some transactions in separate domains, that will cause the need to
+# have a multi-valued restart position in the relay log for the SQL thread.
+SET @old_domain= @@gtid_domain_id;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (10000, "domain 10");
+SET gtid_domain_id=20;
+INSERT INTO t1 VALUES (20000, "domain 20");
+SET gtid_domain_id=@old_domain;
+BEGIN;
+--echo [lots of inserts omitted]
+--disable_query_log
+--let $count = 500
+while ($count) {
+  eval INSERT INTO t1 VALUES (1000+$count, REPEAT("hulubulu??!?", 8));
+  dec $count;
+}
+--enable_query_log
+COMMIT;
+--save_master_pos
+--connection server_2
+--sync_with_master
+--connection server_1
+# Now do another one, to make the inuse_relaylog proceed to somewhere inside
+# the first large transaction.
+BEGIN;
+--echo [lots of inserts omitted]
+--disable_query_log
+--let $count = 500
+while ($count) {
+  eval INSERT INTO t1 VALUES (2000+$count, REPEAT("hulubulu??!?", 8));
+  dec $count;
+}
+--enable_query_log
+COMMIT;
+--save_master_pos
+--connection server_2
+--sync_with_master
+# Stop and restart the SQL thread only.
+# The bug was that the SQL thread would restart at the start
+# of a relay log file, which could be in the middle of an event group.
+# This way, part of that event group could be wrongly re-applied.
+--source include/stop_slave_sql.inc
+START SLAVE SQL_THREAD;
+--source include/wait_for_slave_to_start.inc
+--connection server_1
+INSERT INTO t1 VALUES (100000, "More stuffs.");
+INSERT INTO t1 VALUES (100001, "And even more");
+--save_master_pos
+--connection server_2
+--sync_with_master
+SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
+# Clean up.
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL max_relay_log_size= @old_max_relay;
+--source include/start_slave.inc
+--connection server_1
+DROP TABLE t1;
+--source include/rpl_end.inc
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -4546,6 +4546,21 @@ pthread_handler_t handle_slave_sql(void *arg)
  serial_rgi->gtid_sub_id= 0;
  serial_rgi->gtid_pending= false;
+  if (mi->using_gtid != Master_info::USE_GTID_NO &&
+      opt_slave_parallel_threads > 0 &&
+      rli->restart_gtid_pos.count() > 0)
+  {
+    /*
+      With parallel replication in GTID mode, if we have a multi-domain GTID
+      position, we need to start some way back in the relay log and skip any
+      GTID that was already applied before. Since event groups can be split
+      across multiple relay logs, this earlier starting point may be in the
+      middle of an already applied event group, so we also need to skip any
+      remaining part of such group.
+    */
+    rli->gtid_skip_flag = GTID_SKIP_TRANSACTION;
+  }
+  else
    rli->gtid_skip_flag = GTID_SKIP_NOT;
  if (init_relay_log_pos(rli,
                         rli->group_relay_log_name,