Commit 6e16e1f6 authored by unknown

MDEV-4906: When event apply fails, next SQL thread start erroneously commits...

MDEV-4906: When event apply fails, next SQL thread start erroneously commits the failing GTID to gtid_slave_pos

When a GTID event is executed, we remember the contained GTID position so that
when we have applied the entire event group we can commit it to
gtid_slave_pos.

However, if the event group failed to apply due to some error and the SQL
thread aborted, the code did not correctly clear the remembered GTID. Thus,
when the SQL thread was restarted, the old GTID of the failing event group was
incorrectly written to gtid_slave_pos when the initial rotate event was
executed, corrupting the GTID position.
parent a68dfa4b
...@@ -173,6 +173,33 @@ a ...@@ -173,6 +173,33 @@ a
SET sql_log_bin=0; SET sql_log_bin=0;
CALL mtr.add_suppression("Slave: Could not update replication slave gtid state"); CALL mtr.add_suppression("Slave: Could not update replication slave gtid state");
SET sql_log_bin=1; SET sql_log_bin=1;
*** MDEV-4906: When event apply fails, next SQL thread start errorneously commits the failing GTID to gtid_slave_pos ***
include/stop_slave.inc
SET sql_log_bin=0;
DELETE FROM t2;
SET sql_log_bin=1;
SET @old_format=@@binlog_format;
SET GLOBAL binlog_format='row';
include/start_slave.inc
SET @old_format=@@binlog_format;
SET binlog_format='row';
DELETE FROM t2;
SET binlog_format=@old_format;
include/wait_for_slave_sql_error.inc [errno=1032]
result
OK
STOP SLAVE IO_THREAD;
START SLAVE;
include/wait_for_slave_sql_error.inc [errno=1032]
result
OK
STOP SLAVE IO_THREAD;
SET sql_log_bin=0;
INSERT INTO t2 VALUES (1);
CALL mtr.add_suppression("Slave: Can't find record in 't2' Error_code: 1032");
SET sql_log_bin=1;
include/start_slave.inc
SET GLOBAL binlog_format=@old_format;
DROP TABLE t1; DROP TABLE t1;
DROP TABLE t2; DROP TABLE t2;
include/rpl_end.inc include/rpl_end.inc
...@@ -230,6 +230,57 @@ CALL mtr.add_suppression("Slave: Could not update replication slave gtid state") ...@@ -230,6 +230,57 @@ CALL mtr.add_suppression("Slave: Could not update replication slave gtid state")
SET sql_log_bin=1; SET sql_log_bin=1;
--echo *** MDEV-4906: When event apply fails, next SQL thread start errorneously commits the failing GTID to gtid_slave_pos ***
--connection slave
--source include/stop_slave.inc
SET sql_log_bin=0;
DELETE FROM t2;
SET sql_log_bin=1;
SET @old_format=@@binlog_format;
SET GLOBAL binlog_format='row';
--source include/start_slave.inc
--connection master
SET @old_format=@@binlog_format;
SET binlog_format='row';
--let $gtid_pos1=`SELECT @@GLOBAL.gtid_binlog_pos`
DELETE FROM t2;
SET binlog_format=@old_format;
--save_master_pos
--connection slave
--let $slave_sql_errno= 1032
--source include/wait_for_slave_sql_error.inc
# Disable the query log so the result file does not change if the precise GTID value changes.
--disable_query_log
SET @x=@@GLOBAL.gtid_slave_pos;
eval SELECT IF(@x='$gtid_pos1', "OK", CONCAT("ERROR: expected $gtid_pos1 got ", @x)) AS result;
--enable_query_log
# The bug was that upon restarting the SQL thread, the GTID for the
# failing event group was not cleared, so we would update it in the
# gtid_slave_pos as part of the first rotate event, corrupting the
# replication.
STOP SLAVE IO_THREAD;
START SLAVE;
--let $slave_sql_errno= 1032
--source include/wait_for_slave_sql_error.inc
# Disable the query log so the result file does not change if the precise GTID value changes.
--disable_query_log
SET @x=@@GLOBAL.gtid_slave_pos;
eval SELECT IF(@x='$gtid_pos1', "OK", CONCAT("ERROR: expected $gtid_pos1 got ", @x)) AS result;
--enable_query_log
STOP SLAVE IO_THREAD;
SET sql_log_bin=0;
INSERT INTO t2 VALUES (1);
CALL mtr.add_suppression("Slave: Can't find record in 't2' Error_code: 1032");
SET sql_log_bin=1;
--source include/start_slave.inc
--sync_with_master
SET GLOBAL binlog_format=@old_format;
--connection master --connection master
DROP TABLE t1; DROP TABLE t1;
DROP TABLE t2; DROP TABLE t2;
......
...@@ -3160,6 +3160,14 @@ int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli) ...@@ -3160,6 +3160,14 @@ int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli)
DBUG_RETURN(2); DBUG_RETURN(2);
} }
} }
else
{
/*
Make sure we do not erroneously update gtid_slave_pos with a lingering
GTID from this failed event group (MDEV-4906).
*/
rli->gtid_sub_id= 0;
}
DBUG_RETURN(exec_res ? 1 : 0); DBUG_RETURN(exec_res ? 1 : 0);
} }
...@@ -4094,6 +4102,7 @@ pthread_handler_t handle_slave_sql(void *arg) ...@@ -4094,6 +4102,7 @@ pthread_handler_t handle_slave_sql(void *arg)
rli->trans_retries= 0; // start from "no error" rli->trans_retries= 0; // start from "no error"
DBUG_PRINT("info", ("rli->trans_retries: %lu", rli->trans_retries)); DBUG_PRINT("info", ("rli->trans_retries: %lu", rli->trans_retries));
rli->gtid_sub_id= 0;
if (init_relay_log_pos(rli, if (init_relay_log_pos(rli,
rli->group_relay_log_name, rli->group_relay_log_name,
rli->group_relay_log_pos, rli->group_relay_log_pos,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment