MDEV-27697 slave must recognize incomplete replication event group

When a faulty master, or an incorrect binlog event producer that the slave is working with, sends an incomplete group of events, the slave must react with an error so that it does not log into the relay log any new events that do not belong to the incomplete group. Fixed by extending the checks of received event properties when the slave connects to the master in GTID mode. Specifically, for an event that can be a part of a group, relay-logging is permitted only when its position within the group is validated. Otherwise the slave IO thread stops with ER_SLAVE_RELAY_LOG_WRITE_FAILURE.

MDEV-27697 slave must recognize incomplete replication event group
When a faulty master, or an incorrect binlog event producer that the slave is working with, sends an incomplete group of events, the slave must react with an error so that it does not log into the relay log any new events that do not belong to the incomplete group. Fixed by extending the checks of received event properties when the slave connects to the master in GTID mode. Specifically, for an event that can be a part of a group, relay-logging is permitted only when its position within the group is validated. Otherwise the slave IO thread stops with ER_SLAVE_RELAY_LOG_WRITE_FAILURE.
1bcdc3e9 · Andrei · 907e4c62 · 1bcdc3e9 · 1bcdc3e9 · 1bcdc3e9
Commit 1bcdc3e9 authored Apr 14, 2022 by Andrei
6 changed files
--- a/mysql-test/extra/rpl_tests/rpl_parallel.inc
+++ b/mysql-test/extra/rpl_tests/rpl_parallel.inc
@@ -1872,6 +1872,7 @@ SET GLOBAL slave_parallel_threads=10;
 SET GLOBAL slave_parallel_threads=1;
 SET @old_dbug= @@GLOBAL.debug_dbug;
 SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");

 --connection server_1
 INSERT INTO t2 VALUES (101);

--- a/mysql-test/suite/rpl/r/rpl_gtid_grouping.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_grouping.result
+include/master-slave.inc
+[connection master]
+connection slave;
+call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
+call mtr.add_suppression("Relay log write failure: could not queue event from master");
+call mtr.add_suppression("The current group of events starts with a non-GTID");
+include/stop_slave.inc
+CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
+include/start_slave.inc
+connection master;
+CREATE TABLE t (a INT) ENGINE=innodb;
+INSERT INTO t VALUES(1);
+### A. Simulate an unnoticeable loss of Xid event
+connection slave;
+SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+connection master;
+SET @@gtid_seq_no=1000;
+set @@server_id=27697;
+INSERT INTO t VALUES(1000);
+set @@server_id=default;
+INSERT INTO t VALUES(1001);
+## Prove the error occurs.
+connection slave;
+include/wait_for_slave_io_error.inc [errno=1595]
+## Prove the slave recovers after the simulation condtion is lifted.
+SET @@global.debug_dbug=default;
+include/start_slave.inc
+### B. Do the same to GTID event.
+connection slave;
+SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
+connection master;
+SET @@gtid_seq_no=1002;
+set @@server_id=27697;
+INSERT INTO t VALUES(1002);
+set @@server_id=default;
+INSERT INTO t VALUES(1003);
+## Prove the error occurs.
+connection slave;
+include/wait_for_slave_io_error.inc [errno=1595]
+## Prove the slave recovers after the simulation condtion is lifted.
+SET @@global.debug_dbug=default;
+include/start_slave.inc
+connection master;
+connection slave;
+include/diff_tables.inc [master:t,slave:t]
+"===== Clean up ====="
+connection slave;
+include/stop_slave.inc
+CHANGE MASTER TO MASTER_USE_GTID=no;
+include/start_slave.inc
+connection master;
+DROP TABLE t;
+SET GLOBAL LOG_WARNINGS=default;
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -1378,6 +1378,7 @@ include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=1;
 SET @old_dbug= @@GLOBAL.debug_dbug;
 SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");
 connection server_1;
 INSERT INTO t2 VALUES (101);
 INSERT INTO t2 VALUES (102);

--- a/mysql-test/suite/rpl/t/rpl_gtid_grouping.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_grouping.test
+# ==== Purpose ====
+#
+# Test verifies that the slave validates replicated transaction (event group)
+# boundaries at the time the events are received from the master.
+#
+# ==== Implementation ====
+#
+# A. Simulate an unnoticeable loss of Xid event to observe a slave error,
+#    then restart slave to recover from the failure.
+# B. Do the same to GTID event.
+#
+# ==== References ====
+#
+# MDEV-27697 slave must recognize incomplete replication event group 
+#
+--source include/have_binlog_format_mixed.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/master-slave.inc
+
+--connection slave
+call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
+call mtr.add_suppression("Relay log write failure: could not queue event from master");
+call mtr.add_suppression("The current group of events starts with a non-GTID");
+
+--source include/stop_slave.inc
+CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
+--source include/start_slave.inc
+
+--connection master
+CREATE TABLE t (a INT) ENGINE=innodb;
+INSERT INTO t VALUES(1);
+save_master_pos;
+
+--echo ### A. Simulate an unnoticeable loss of Xid event
+--sync_slave_with_master
+SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+
+--connection master
+SET @@gtid_seq_no=1000;
+set @@server_id=27697;
+INSERT INTO t VALUES(1000);
+set @@server_id=default;
+INSERT INTO t VALUES(1001);
+
+--echo ## Prove the error occurs.
+--connection slave
+# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
+--let $slave_io_errno = 1595
+--source include/wait_for_slave_io_error.inc
+## EOP
+
+--echo ## Prove the slave recovers after the simulation condtion is lifted.
+SET @@global.debug_dbug=default;
+--source include/start_slave.inc
+
+--echo ### B. Do the same to GTID event.
+--connection slave
+SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
+
+--connection master
+SET @@gtid_seq_no=1002;
+set @@server_id=27697;
+INSERT INTO t VALUES(1002);
+set @@server_id=default;
+INSERT INTO t VALUES(1003);
+
+--echo ## Prove the error occurs.
+--connection slave
+# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
+--let $slave_io_errno = 1595
+--source include/wait_for_slave_io_error.inc
+## EOP
+
+--echo ## Prove the slave recovers after the simulation condtion is lifted.
+SET @@global.debug_dbug=default;
+--source include/start_slave.inc
+
+--connection master
+save_master_pos;
+
+--sync_slave_with_master
+## EOP
+
+--let $diff_tables=master:t,slave:t
+--source include/diff_tables.inc
+
+--echo "===== Clean up ====="
+--connection slave
+--source include/stop_slave.inc
+CHANGE MASTER TO MASTER_USE_GTID=no;
+--source include/start_slave.inc
+
+--connection master
+DROP TABLE t;
+SET GLOBAL LOG_WARNINGS=default;
+--source include/rpl_end.inc
--- a/sql/rpl_gtid.h
+++ b/sql/rpl_gtid.h
@@ -26,6 +26,7 @@
 extern const LEX_STRING rpl_gtid_slave_state_table_name;

 class String;
+#define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no

 struct rpl_gtid
 {

--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -6196,23 +6196,75 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
      }
    }

-    if (unlikely(mi->gtid_reconnect_event_skip_count))
-    {
-      goto default_action;
-    }
-
    /*
-      We have successfully queued to relay log everything before this GTID, so
+      Unless the previous group is malformed,
+      we have successfully queued to relay log everything before this GTID, so
      in case of reconnect we can start from after any previous GTID.
-      (Normally we would have updated gtid_current_pos earlier at the end of
-      the previous event group, but better leave an extra check here for
-      safety).
+      (We must have updated gtid_current_pos earlier at the end of
+      the previous event group. Unless ...)
    */
-    if (mi->events_queued_since_last_gtid)
+    if (unlikely(mi->events_queued_since_last_gtid >
+                 mi->gtid_reconnect_event_skip_count))
    {
-      mi->gtid_current_pos.update(&mi->last_queued_gtid);
-      mi->events_queued_since_last_gtid= 0;
+      /*
+        ...unless the last group has not been completed. An assert below
+        can be satisfied only with the strict mode that ensures
+        against "genuine" gtid duplicates.
+      */
+      rpl_gtid *gtid_in_slave_state=
+        mi->gtid_current_pos.find(mi->last_queued_gtid.domain_id);
+
+      // Slave gtid state must not have updated yet to the last received gtid.
+      DBUG_ASSERT((mi->using_gtid == Master_info::USE_GTID_NO ||
+                   !opt_gtid_strict_mode) ||
+                  (!gtid_in_slave_state ||
+                   !(*gtid_in_slave_state == mi->last_queued_gtid)));
+
+      DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
+      {
+        /* Inject an event group that is missing its XID commit event. */
+        if (mi->last_queued_gtid.domain_id == 0 &&
+            mi->last_queued_gtid.seq_no == 1000)
+        {
+          sql_print_warning(
+            "Unexpected break of being relay-logged GTID %u-%u-%llu "
+            "event group by the current GTID event %u-%u-%llu",
+            PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
+          DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
+          goto dbug_gtid_accept;
+        }
+      });
+      error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+      sql_print_error("Unexpected break of being relay-logged GTID %u-%u-%llu "
+                      "event group by the current GTID event %u-%u-%llu",
+                      PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
+      goto err;
+    }
+    else if (unlikely(mi->gtid_reconnect_event_skip_count > 0))
+    {
+      if (mi->gtid_reconnect_event_skip_count ==
+          mi->events_queued_since_last_gtid)
+      {
+        DBUG_ASSERT(event_gtid == mi->last_queued_gtid);
+
+        goto default_action;
+      }
+
+      DBUG_ASSERT(0);
    }
+    // else_likely{...
+#ifndef DBUG_OFF
+dbug_gtid_accept:
+    DBUG_EXECUTE_IF("slave_discard_gtid_0_x_1002",
+    {
+      if (mi->last_queued_gtid.server_id == 27697 &&
+          mi->last_queued_gtid.seq_no == 1002)
+      {
+        DBUG_SET("-d,slave_discard_gtid_0_x_1002");
+        goto skip_relay_logging;
+      }
+    });
+#endif
    mi->last_queued_gtid= event_gtid;
    mi->last_queued_gtid_standalone=
      (gtid_flag & Gtid_log_event::FL_STANDALONE) != 0;
@@ -6222,6 +6274,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)

    ++mi->events_queued_since_last_gtid;
    inc_pos= event_len;
+    // ...} eof else_likely
  }
  break;
  /*
@@ -6274,6 +6327,12 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
  case XID_EVENT:
    DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
    {
+      if (mi->last_queued_gtid.server_id == 27697 &&
+          mi->last_queued_gtid.seq_no == 1000)
+      {
+        DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
+        goto skip_relay_logging;
+      }
      /* Inject an event group that is missing its XID commit event. */
      if (mi->last_queued_gtid.domain_id == 0 &&
          mi->last_queued_gtid.seq_no == 1000)
@@ -6319,15 +6378,48 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
                      }
                    };);

-    if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
+    if (mi->using_gtid != Master_info::USE_GTID_NO)
    {
-      if (unlikely(mi->gtid_reconnect_event_skip_count))
+      if (likely(mi->gtid_event_seen))
      {
-        --mi->gtid_reconnect_event_skip_count;
-        gtid_skip_enqueue= true;
+        if (unlikely(mi->gtid_reconnect_event_skip_count))
+        {
+          if (!got_gtid_event &&
+              mi->gtid_reconnect_event_skip_count ==
+              mi->events_queued_since_last_gtid)
+            goto gtid_not_start; // the 1st re-sent must be gtid
+
+          --mi->gtid_reconnect_event_skip_count;
+          gtid_skip_enqueue= true;
+        }
+        else if (likely(mi->events_queued_since_last_gtid))
+        {
+          DBUG_ASSERT(!got_gtid_event);
+
+          ++mi->events_queued_since_last_gtid;
+        }
+        else if (Log_event::is_group_event((Log_event_type) (uchar)
+                                           buf[EVENT_TYPE_OFFSET]))
+        {
+          goto gtid_not_start; // no first gtid event in this group
+        }
+      }
+      else if (Log_event::is_group_event((Log_event_type) (uchar)
+                                           buf[EVENT_TYPE_OFFSET]))
+      {
+      gtid_not_start:
+
+        DBUG_ASSERT(!got_gtid_event);
+
+        error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+        sql_print_error("The current group of events starts with "
+                        "a non-GTID %s event; "
+                        "the last seen GTID is %u-%u-%llu",
+                        Log_event::get_type_str((Log_event_type) (uchar)
+                                                buf[EVENT_TYPE_OFFSET]),
+                        mi->last_queued_gtid);
+        goto err;
      }
-      else if (mi->events_queued_since_last_gtid)
-        ++mi->events_queued_since_last_gtid;
    }

    if (!is_compress_event)
@@ -6500,15 +6592,35 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
           Query_log_event::peek_is_commit_rollback(buf, event_len,
                                                    checksum_alg))))))
    {
-      /*
-        The whole of the current event group is queued. So in case of
-        reconnect we can start from after the current GTID.
-      */
-      mi->gtid_current_pos.update(&mi->last_queued_gtid);
-      mi->events_queued_since_last_gtid= 0;
+      DBUG_ASSERT(mi->events_queued_since_last_gtid > 1);

-      /* Reset the domain_id_filter flag. */
-      mi->domain_id_filter.reset_filter();
+      if (unlikely(gtid_skip_enqueue))
+      {
+        error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+        sql_print_error("Recieved a group closing %s event "
+                        "at %llu position in the group while there are "
+                        "still %llu events to skip upon reconnecting; "
+                        "the last seen GTID is %u-%u-%llu",
+                        Log_event::get_type_str((Log_event_type) (uchar)
+                                                buf[EVENT_TYPE_OFFSET]),
+                        (mi->events_queued_since_last_gtid -
+                         mi->gtid_reconnect_event_skip_count),
+                        mi->events_queued_since_last_gtid,
+                        mi->last_queued_gtid);
+        goto err;
+      }
+      else
+      {
+        /*
+          The whole of the current event group is queued. So in case of
+          reconnect we can start from after the current GTID.
+        */
+        mi->gtid_current_pos.update(&mi->last_queued_gtid);
+        mi->events_queued_since_last_gtid= 0;
+
+        /* Reset the domain_id_filter flag. */
+        mi->domain_id_filter.reset_filter();
+      }
    }

 skip_relay_logging: