Commit 717f2128 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-10863: parallel replication tries to continue from wrong position

This occurred when the SQL thread (but not the IO thread) stops while
GTID and parallel replication are used with multiple domain ids in the
GTID position, and is restarted.

In this case, the SQL thread needs to start some way back in the relay log,
applying or skipping events within each replication domain as
appropriate.

The SQL thread starts at the beginning of an old relay log file, and
this position may be in the middle of an event group. The bug was that
such a partial event group could be re-applied, causing replication
corruption.

This patch fixes the issue, by making sure to skip any initial events
that were part of an earlier (already applied) event group.
parent eca8c324
include/rpl_init.inc [topology=1->2]
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=10;
SET @old_max_relay= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size = 4096;
CHANGE MASTER TO master_use_gtid=slave_pos;
include/start_slave.inc
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1, "a");
*** Create a long transaction that will span a relay log file. ***
SET @old_domain= @@gtid_domain_id;
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (10000, "domain 10");
SET gtid_domain_id=20;
INSERT INTO t1 VALUES (20000, "domain 20");
SET gtid_domain_id=@old_domain;
BEGIN;
[lots of inserts omitted]
COMMIT;
BEGIN;
[lots of inserts omitted]
COMMIT;
include/stop_slave_sql.inc
START SLAVE SQL_THREAD;
include/wait_for_slave_to_start.inc
INSERT INTO t1 VALUES (100000, "More stuffs.");
INSERT INTO t1 VALUES (100001, "And even more");
SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
a b
100000 More stuffs.
100001 And even more
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
SET GLOBAL max_relay_log_size= @old_max_relay;
include/start_slave.inc
DROP TABLE t1;
include/rpl_end.inc
# Regression test for MDEV-10863: restarting only the SQL thread while using
# GTID and parallel replication with a multi-domain GTID position.
--source include/have_innodb.inc
# Two-server topology: server_1 acts as master, server_2 as its slave.
--let $rpl_topology=1->2
--source include/rpl_init.inc
# Configure the slave (server_2) for parallel replication in GTID mode.
--connection server_2
# Remember the original settings so the cleanup section can restore them.
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=10;
SET @old_max_relay= @@GLOBAL.max_relay_log_size;
# Use a small relay log size so that the large transactions created below
# span relay log files.
SET GLOBAL max_relay_log_size = 4096;
CHANGE MASTER TO master_use_gtid=slave_pos;
--source include/start_slave.inc
# Create the test table on the master and let it replicate to the slave.
--connection server_1
# NOTE(review): presumably switches the GTID position table to InnoDB so it
# uses the same engine as t1 -- confirm the exact motivation.
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1, "a");
# Wait until the slave has caught up with the master before continuing.
--save_master_pos
--connection server_2
--sync_with_master
--echo *** Create a long transaction that will span a relay log file. ***
--connection server_1
# Add some transactions in separate domains, that will cause the need to
# have a multi-valued restart position in the relay log for the SQL thread.
SET @old_domain= @@gtid_domain_id;
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (10000, "domain 10");
SET gtid_domain_id=20;
INSERT INTO t1 VALUES (20000, "domain 20");
# Switch back to the original domain for the large transactions.
SET gtid_domain_id=@old_domain;
# First large transaction: 500 rows (keys 1001..1500), each carrying a
# 96-character value, all inside a single BEGIN/COMMIT event group.
BEGIN;
--echo [lots of inserts omitted]
# Keep the generated INSERT statements out of the result file; only the
# placeholder line echoed above is recorded.
--disable_query_log
--let $count = 500
while ($count) {
eval INSERT INTO t1 VALUES (1000+$count, REPEAT("hulubulu??!?", 8));
dec $count;
}
--enable_query_log
COMMIT;
# Wait for the slave to apply the first large transaction.
--save_master_pos
--connection server_2
--sync_with_master
--connection server_1
# Now do another one, to make the inuse_relaylog proceed to somewhere inside
# the first large transaction.
# Same shape as the first large transaction, but with keys 2001..2500.
BEGIN;
--echo [lots of inserts omitted]
# Keep the generated INSERT statements out of the result file.
--disable_query_log
--let $count = 500
while ($count) {
eval INSERT INTO t1 VALUES (2000+$count, REPEAT("hulubulu??!?", 8));
dec $count;
}
--enable_query_log
COMMIT;
# Wait for the slave to apply the second large transaction before the
# SQL thread is stopped.
--save_master_pos
--connection server_2
--sync_with_master
# Stop and restart the SQL thread only.
# The bug was that the SQL thread would restart at the start
# of a relay log file, which could be in the middle of an event group.
# This way, part of that event group could be wrongly re-applied.
# The IO thread is deliberately left running: only the SQL thread is cycled.
--source include/stop_slave_sql.inc
START SLAVE SQL_THREAD;
--source include/wait_for_slave_to_start.inc
# Generate fresh transactions on the master after the restart.
--connection server_1
INSERT INTO t1 VALUES (100000, "More stuffs.");
INSERT INTO t1 VALUES (100001, "And even more");
# The sync only succeeds if the restarted SQL thread is still applying events.
# NOTE(review): presumably a wrongly re-applied partial event group would hit
# a duplicate key on t1.a and stop the slave before this point -- confirm.
--save_master_pos
--connection server_2
--sync_with_master
# Check that both post-restart rows arrived on the slave.
SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
# Clean up.
# Restore the slave settings saved during setup. The slave must be stopped
# while they are changed, mirroring how they were set at the start.
# (The connection is already server_2 here; the switch is kept explicit.)
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
SET GLOBAL max_relay_log_size= @old_max_relay;
--source include/start_slave.inc
# Drop the test table on the master and tear down the replication topology.
--connection server_1
DROP TABLE t1;
--source include/rpl_end.inc
...@@ -4546,7 +4546,22 @@ pthread_handler_t handle_slave_sql(void *arg) ...@@ -4546,7 +4546,22 @@ pthread_handler_t handle_slave_sql(void *arg)
serial_rgi->gtid_sub_id= 0; serial_rgi->gtid_sub_id= 0;
serial_rgi->gtid_pending= false; serial_rgi->gtid_pending= false;
rli->gtid_skip_flag = GTID_SKIP_NOT; if (mi->using_gtid != Master_info::USE_GTID_NO &&
opt_slave_parallel_threads > 0 &&
rli->restart_gtid_pos.count() > 0)
{
/*
With parallel replication in GTID mode, if we have a multi-domain GTID
position, we need to start some way back in the relay log and skip any
GTID that was already applied before. Since event groups can be split
across multiple relay logs, this earlier starting point may be in the
middle of an already applied event group, so we also need to skip any
remaining part of such group.
*/
rli->gtid_skip_flag = GTID_SKIP_TRANSACTION;
}
else
rli->gtid_skip_flag = GTID_SKIP_NOT;
if (init_relay_log_pos(rli, if (init_relay_log_pos(rli,
rli->group_relay_log_name, rli->group_relay_log_name,
rli->group_relay_log_pos, rli->group_relay_log_pos,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment