Commit 1bcdc3e9 authored by Andrei's avatar Andrei

MDEV-27697 slave must recognize incomplete replication event group

In cases of a faulty master or an incorrect binlog event producer, that slave is working with,
sends an incomplete group of events slave must react with an error to not to log
into the relay-log any new events that do not belong to the incomplete group.

Fixed with extending received event properties check when slave connects to master
in gtid mode.
Specifically for the event that can be a part of a group its relay-logging is
permitted only when its position within the group is validated.
Otherwise slave IO thread stops with ER_SLAVE_RELAY_LOG_WRITE_FAILURE.
parent 907e4c62
......@@ -1872,6 +1872,7 @@ SET GLOBAL slave_parallel_threads=10;
SET GLOBAL slave_parallel_threads=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");
--connection server_1
INSERT INTO t2 VALUES (101);
......
include/master-slave.inc
[connection master]
connection slave;
call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
call mtr.add_suppression("Relay log write failure: could not queue event from master");
call mtr.add_suppression("The current group of events starts with a non-GTID");
include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
include/start_slave.inc
connection master;
CREATE TABLE t (a INT) ENGINE=innodb;
INSERT INTO t VALUES(1);
### A. Simulate an unnoticeable loss of Xid event
connection slave;
SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
connection master;
SET @@gtid_seq_no=1000;
set @@server_id=27697;
INSERT INTO t VALUES(1000);
set @@server_id=default;
INSERT INTO t VALUES(1001);
## Prove the error occurs.
connection slave;
include/wait_for_slave_io_error.inc [errno=1595]
## Prove the slave recovers after the simulation condtion is lifted.
SET @@global.debug_dbug=default;
include/start_slave.inc
### B. Do the same to GTID event.
connection slave;
SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
connection master;
SET @@gtid_seq_no=1002;
set @@server_id=27697;
INSERT INTO t VALUES(1002);
set @@server_id=default;
INSERT INTO t VALUES(1003);
## Prove the error occurs.
connection slave;
include/wait_for_slave_io_error.inc [errno=1595]
## Prove the slave recovers after the simulation condtion is lifted.
SET @@global.debug_dbug=default;
include/start_slave.inc
connection master;
connection slave;
include/diff_tables.inc [master:t,slave:t]
"===== Clean up ====="
connection slave;
include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=no;
include/start_slave.inc
connection master;
DROP TABLE t;
SET GLOBAL LOG_WARNINGS=default;
include/rpl_end.inc
......@@ -1378,6 +1378,7 @@ include/stop_slave.inc
SET GLOBAL slave_parallel_threads=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");
connection server_1;
INSERT INTO t2 VALUES (101);
INSERT INTO t2 VALUES (102);
......
# ==== Purpose ====
#
# Test verifies that replicated transaction boundaries are set properly
# at receiving from master time.
#
# ==== Implementation ====
#
# A. Simulate an unnoticeable loss of Xid event to observe a slave error,
# then restart slave to recover from the failure.
# B. Do the same to GTID event.
#
# ==== References ====
#
# MDEV-27697 slave must recognize incomplete replication event group
#
--source include/have_binlog_format_mixed.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/master-slave.inc
--connection slave
call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
call mtr.add_suppression("Relay log write failure: could not queue event from master");
call mtr.add_suppression("The current group of events starts with a non-GTID");
--source include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
--connection master
CREATE TABLE t (a INT) ENGINE=innodb;
INSERT INTO t VALUES(1);
save_master_pos;
--echo ### A. Simulate an unnoticeable loss of Xid event
--sync_slave_with_master
SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
--connection master
SET @@gtid_seq_no=1000;
set @@server_id=27697;
INSERT INTO t VALUES(1000);
set @@server_id=default;
INSERT INTO t VALUES(1001);
--echo ## Prove the error occurs.
--connection slave
# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
--let $slave_io_errno = 1595
--source include/wait_for_slave_io_error.inc
## EOP
--echo ## Prove the slave recovers after the simulation condtion is lifted.
SET @@global.debug_dbug=default;
--source include/start_slave.inc
--echo ### B. Do the same to GTID event.
--connection slave
SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
--connection master
SET @@gtid_seq_no=1002;
set @@server_id=27697;
INSERT INTO t VALUES(1002);
set @@server_id=default;
INSERT INTO t VALUES(1003);
--echo ## Prove the error occurs.
--connection slave
# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
--let $slave_io_errno = 1595
--source include/wait_for_slave_io_error.inc
## EOP
--echo ## Prove the slave recovers after the simulation condtion is lifted.
SET @@global.debug_dbug=default;
--source include/start_slave.inc
--connection master
save_master_pos;
--sync_slave_with_master
## EOP
--let $diff_tables=master:t,slave:t
--source include/diff_tables.inc
--echo "===== Clean up ====="
--connection slave
--source include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=no;
--source include/start_slave.inc
--connection master
DROP TABLE t;
SET GLOBAL LOG_WARNINGS=default;
--source include/rpl_end.inc
......@@ -26,6 +26,7 @@
extern const LEX_STRING rpl_gtid_slave_state_table_name;
class String;
#define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no
struct rpl_gtid
{
......
......@@ -6196,23 +6196,75 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
}
}
if (unlikely(mi->gtid_reconnect_event_skip_count))
{
goto default_action;
}
/*
We have successfully queued to relay log everything before this GTID, so
Unless the previous group is malformed,
we have successfully queued to relay log everything before this GTID, so
in case of reconnect we can start from after any previous GTID.
(Normally we would have updated gtid_current_pos earlier at the end of
the previous event group, but better leave an extra check here for
safety).
(We must have updated gtid_current_pos earlier at the end of
the previous event group. Unless ...)
*/
if (mi->events_queued_since_last_gtid)
if (unlikely(mi->events_queued_since_last_gtid >
mi->gtid_reconnect_event_skip_count))
{
mi->gtid_current_pos.update(&mi->last_queued_gtid);
mi->events_queued_since_last_gtid= 0;
/*
...unless the last group has not been completed. An assert below
can be satisfied only with the strict mode that ensures
against "genuine" gtid duplicates.
*/
rpl_gtid *gtid_in_slave_state=
mi->gtid_current_pos.find(mi->last_queued_gtid.domain_id);
// Slave gtid state must not have updated yet to the last received gtid.
DBUG_ASSERT((mi->using_gtid == Master_info::USE_GTID_NO ||
!opt_gtid_strict_mode) ||
(!gtid_in_slave_state ||
!(*gtid_in_slave_state == mi->last_queued_gtid)));
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
{
/* Inject an event group that is missing its XID commit event. */
if (mi->last_queued_gtid.domain_id == 0 &&
mi->last_queued_gtid.seq_no == 1000)
{
sql_print_warning(
"Unexpected break of being relay-logged GTID %u-%u-%llu "
"event group by the current GTID event %u-%u-%llu",
PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
goto dbug_gtid_accept;
}
});
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
sql_print_error("Unexpected break of being relay-logged GTID %u-%u-%llu "
"event group by the current GTID event %u-%u-%llu",
PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
goto err;
}
else if (unlikely(mi->gtid_reconnect_event_skip_count > 0))
{
if (mi->gtid_reconnect_event_skip_count ==
mi->events_queued_since_last_gtid)
{
DBUG_ASSERT(event_gtid == mi->last_queued_gtid);
goto default_action;
}
DBUG_ASSERT(0);
}
// else_likely{...
#ifndef DBUG_OFF
dbug_gtid_accept:
DBUG_EXECUTE_IF("slave_discard_gtid_0_x_1002",
{
if (mi->last_queued_gtid.server_id == 27697 &&
mi->last_queued_gtid.seq_no == 1002)
{
DBUG_SET("-d,slave_discard_gtid_0_x_1002");
goto skip_relay_logging;
}
});
#endif
mi->last_queued_gtid= event_gtid;
mi->last_queued_gtid_standalone=
(gtid_flag & Gtid_log_event::FL_STANDALONE) != 0;
......@@ -6222,6 +6274,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
++mi->events_queued_since_last_gtid;
inc_pos= event_len;
// ...} eof else_likely
}
break;
/*
......@@ -6274,6 +6327,12 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
case XID_EVENT:
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
{
if (mi->last_queued_gtid.server_id == 27697 &&
mi->last_queued_gtid.seq_no == 1000)
{
DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
goto skip_relay_logging;
}
/* Inject an event group that is missing its XID commit event. */
if (mi->last_queued_gtid.domain_id == 0 &&
mi->last_queued_gtid.seq_no == 1000)
......@@ -6319,15 +6378,48 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
}
};);
if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
if (mi->using_gtid != Master_info::USE_GTID_NO)
{
if (unlikely(mi->gtid_reconnect_event_skip_count))
if (likely(mi->gtid_event_seen))
{
--mi->gtid_reconnect_event_skip_count;
gtid_skip_enqueue= true;
if (unlikely(mi->gtid_reconnect_event_skip_count))
{
if (!got_gtid_event &&
mi->gtid_reconnect_event_skip_count ==
mi->events_queued_since_last_gtid)
goto gtid_not_start; // the 1st re-sent must be gtid
--mi->gtid_reconnect_event_skip_count;
gtid_skip_enqueue= true;
}
else if (likely(mi->events_queued_since_last_gtid))
{
DBUG_ASSERT(!got_gtid_event);
++mi->events_queued_since_last_gtid;
}
else if (Log_event::is_group_event((Log_event_type) (uchar)
buf[EVENT_TYPE_OFFSET]))
{
goto gtid_not_start; // no first gtid event in this group
}
}
else if (Log_event::is_group_event((Log_event_type) (uchar)
buf[EVENT_TYPE_OFFSET]))
{
gtid_not_start:
DBUG_ASSERT(!got_gtid_event);
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
sql_print_error("The current group of events starts with "
"a non-GTID %s event; "
"the last seen GTID is %u-%u-%llu",
Log_event::get_type_str((Log_event_type) (uchar)
buf[EVENT_TYPE_OFFSET]),
mi->last_queued_gtid);
goto err;
}
else if (mi->events_queued_since_last_gtid)
++mi->events_queued_since_last_gtid;
}
if (!is_compress_event)
......@@ -6500,15 +6592,35 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
Query_log_event::peek_is_commit_rollback(buf, event_len,
checksum_alg))))))
{
/*
The whole of the current event group is queued. So in case of
reconnect we can start from after the current GTID.
*/
mi->gtid_current_pos.update(&mi->last_queued_gtid);
mi->events_queued_since_last_gtid= 0;
DBUG_ASSERT(mi->events_queued_since_last_gtid > 1);
/* Reset the domain_id_filter flag. */
mi->domain_id_filter.reset_filter();
if (unlikely(gtid_skip_enqueue))
{
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
sql_print_error("Recieved a group closing %s event "
"at %llu position in the group while there are "
"still %llu events to skip upon reconnecting; "
"the last seen GTID is %u-%u-%llu",
Log_event::get_type_str((Log_event_type) (uchar)
buf[EVENT_TYPE_OFFSET]),
(mi->events_queued_since_last_gtid -
mi->gtid_reconnect_event_skip_count),
mi->events_queued_since_last_gtid,
mi->last_queued_gtid);
goto err;
}
else
{
/*
The whole of the current event group is queued. So in case of
reconnect we can start from after the current GTID.
*/
mi->gtid_current_pos.update(&mi->last_queued_gtid);
mi->events_queued_since_last_gtid= 0;
/* Reset the domain_id_filter flag. */
mi->domain_id_filter.reset_filter();
}
}
skip_relay_logging:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment