Commit 475f69b9 authored by Sujatha's avatar Sujatha

MDEV-25958: rpl_semi_sync_fail_over.test fails in buildbot

Analysis:
========
In case multi binlog truncation scenario debug sync points are in the
following order.

Two inserts are done on master as shown below.

INSERT INTO t1 VALUES (4, REPEAT("x", 4100)
commit_after_release_LOCK_after_binlog_sync

INSERT INTO t1 VALUES (5, REPEAT("x", 4100)
commit_after_release_LOCK_log

First insert debug sync ensures that transaction is synced to binlog and
not committed but it reached slave through semi sync.

Second insert debug sync ensures that transaction is synced to binlog and
not committed. It doesn't ensure that 'INSERT 5' reached slave.

Most of the times INSERT 5 reaches slave, hence when it is promoted as
master it sends 4,5 to slave. But occasionally 5 may not reach slave in
those cases post recovery master will have only 4. When row 6 is inserted
Master has 4-6 and Slave has 4,5,6.

This results in test failure.

Fix:
===
For the first insert use 'commit_before_get_LOCK_commit_ordered' debug sync
point, it will ensure that binlog was sent to slave and slave has
acknowledged the receipt. Now enable debug code such that when the next
transaction is written to binary log, dump thread will read and send it
across the network and notify the server to be get killed. Insert row 5
and wait for notification from dump thread. Kill the server. This ensures
that both 4 and 5 have reached the semi-sync slave.

Added a new test case:
Insert two rows on master such that first is present in master's binlog and
reached semi sync slave. Second insert should be flushed to binlog but not
sent to slave. Now crash and fail over to slave. The promoted master will send
the extra transaction to slave.
parent 583516bb
......@@ -20,36 +20,70 @@ if (!$failover_to_slave)
# Hold insert after write to binlog and before "run_commit_ordered" in engine
SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL con1_ready WAIT_FOR con1_go";
--send_eval $query_to_crash
if ($case == 1)
{
SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL con1_ready WAIT_FOR con1_go";
--send_eval $query_to_crash
--connection server_$server_to_crash
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
--source include/kill_mysqld.inc
}
# complicate recovery with an extra binlog file
if (!$failover_to_slave)
if ($case == 2)
{
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL con1_ready WAIT_FOR con1_go";
--send_eval $query_to_crash
--connect (conn_client_2,127.0.0.1,root,,test,$SERVER_MYPORT_2,)
# use the same signal with $query_to_crash
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
SET DEBUG_SYNC= "commit_after_release_LOCK_log SIGNAL con1_ready WAIT_FOR con2_go";
SET GLOBAL debug_dbug="d,Notify_binlog_EOF";
--send_eval $query2_to_crash
--connection server_$server_to_crash
SET DEBUG_SYNC= "now WAIT_FOR eof_reached";
--source include/kill_mysqld.inc
}
--connection server_$server_to_crash
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
--source include/kill_mysqld.inc
# complicate recovery with an extra binlog file
if ($case == 3)
{
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL con1_ready WAIT_FOR con1_go";
--send_eval $query_to_crash
--connect (conn_client_3,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
# use the same signal with $query_to_crash
SET DEBUG_SYNC= "commit_before_update_binlog_end_pos SIGNAL con3_ready WAIT_FOR con1_go";
--send_eval $query2_to_crash
--connection server_$server_to_crash
SET DEBUG_SYNC= "now WAIT_FOR con3_ready";
--source include/kill_mysqld.inc
}
--connection server_$server_to_promote
--let $slave_param= Slave_SQL_Running_State
--let $slave_param_value= Slave has read all relay log; waiting for more updates
source include/wait_for_slave_param.inc;
--error 2003
--source include/stop_slave.inc
--let $assert_cond= COUNT(*) = $expected_rows_on_slave FROM t1
--let $assert_text= Table t1 should have $expected_rows_on_slave rows.
--source include/assert.inc
SELECT @@GLOBAL.gtid_current_pos;
--let $restart_parameters=--rpl-semi-sync-slave-enabled=1
--let $restart_parameters=--skip-slave-start=1 --rpl-semi-sync-slave-enabled=1
--let $allow_rpl_inited=1
--source include/start_mysqld.inc
--connection server_$server_to_crash
--enable_reconnect
--source include/wait_until_connected_again.inc
--let $assert_cond= COUNT(*) = $expected_rows_on_master FROM t1
--let $assert_text= Table t1 should have $expected_rows_on_master rows.
--source include/assert.inc
# Check error log for correct messages.
let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.$server_to_crash.err;
--let SEARCH_FILE=$log_error_
......
......@@ -5,7 +5,9 @@
[mysqld.1]
log-slave-updates
gtid-strict-mode=1
sync-binlog=1
[mysqld.2]
log-slave-updates
gtid-strict-mode=1
sync-binlog=1
# ==== Purpose ====
#
# Test verifies replication failover scenario.
#
# ==== Implementation ====
#
# Steps:
# 0 - Having two servers 1 and 2 enable semi-sync replication with
# with the master wait 'after_sync'.
# 1 - Insert a row. While inserting second row simulate
# a server crash at once the transaction is written to binlog, flushed
# and synced but the binlog position is not updated.
# 2 - Post crash-recovery on the old master execute there CHANGE MASTER
# TO command to connect to server id 2.
# 3 - The old master new slave server 1 must connect to the new
# master server 2.
# 4 - repeat the above to crash the new master and restore in role the old one
#
# ==== References ====
#
# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
#
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/have_binlog_format_row.inc
--source include/master-slave.inc
......@@ -43,36 +26,53 @@ set @@global.gtid_slave_pos = "";
CHANGE MASTER TO master_use_gtid= slave_pos;
--source include/start_slave.inc
--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
set @@global.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_wait_point=AFTER_SYNC;
call mtr.add_suppression("Can.t init tc log");
call mtr.add_suppression("Aborting");
call mtr.add_suppression("1 client is using or hasn.t closed the table properly");
call mtr.add_suppression("Table './mtr/test_suppressions' is marked as crashed and should be repaired");
CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
INSERT INTO t1 VALUES (1, 'dummy1');
--save_master_pos
#
# CRASH the original master, and FAILOVER to the new
#
--connection server_2
--sync_with_master
--connection server_1
--let $case = 1
--echo #
--echo # Case:$case
--echo #
--echo # CRASH the original master, and FAILOVER to the new
# value 1 for server id 1 -> 2 failover
--let $failover_to_slave=1
--let $query_to_crash= INSERT INTO t1 VALUES (2, REPEAT("x", 4100))
--echo # $query_to_crash
--echo # Row - 2 will be in master's binlog but not committed, gets replicated
--echo # to slave and applied. On crash master should have 1 row and slave
--echo # should have 2 rows.
--echo #
--echo # Expected State post crash:
--echo #=================================================================
--echo # Master | Slave |
--echo # 0-1-4 (Not committed) | 0-1-4 (Received through semi-sync |
--echo # | replication and applied) |
--echo #=================================================================
--let $log_search_pattern=truncated binlog file:.*master.*000001
--let $expected_rows_on_master= 1
--let $expected_rows_on_slave= 2
--source rpl_semi_sync_crash.inc
--echo #
--echo # Server_2 promoted as master will send 0-1-4 to new slave Server_1
--echo #
--connection server_2
--let $rows_so_far=3
--eval INSERT INTO t1 VALUES ($rows_so_far, 'dummy3')
--save_master_pos
--echo # The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
SHOW VARIABLES LIKE 'gtid_slave_pos';
--connection server_1
--sync_with_master
......@@ -82,23 +82,44 @@ SHOW VARIABLES LIKE 'gtid_slave_pos';
SHOW VARIABLES LIKE 'gtid_binlog_pos';
--connection server_2
#
# CRASH the new master and FAILOVER back to the original
#
--let $case = 2
--echo #
--echo # Case:$case
--echo #
--echo # CRASH the new master, and FAILOVER back to the original
# value 0 for the reverse server id 2 -> 1 failover
--let $failover_to_slave=0
--let $query_to_crash = INSERT INTO t1 VALUES (4, REPEAT("x", 4100))
--let $query2_to_crash= INSERT INTO t1 VALUES (5, REPEAT("x", 4100))
--let $log_search_pattern=truncated binlog file:.*slave.*000001
--echo # $query_to_crash
--echo # $query2_to_crash
--echo # Rows 4 and 5 will be in master's binlog but not committed, they get
--echo # replicated to slave and applied. On crash master should have 3 rows
--echo # and slave should have 5 rows.
--echo #
--echo # Expected State post crash:
--echo #=================================================================
--echo # Master | Slave |
--echo # 0-2-6 (Not commited) | 0-2-6 (Received through semi-sync |
--echo # | replication and applied) |
--echo # 0-2-7 (Not commited) | 0-2-7 (Received through semi-sync |
--echo # | replication and applied) |
--echo #=================================================================
--let $log_search_pattern=truncated binlog file:.*slave.*000002
--let $expected_rows_on_master= 3
--let $expected_rows_on_slave= 5
--source rpl_semi_sync_crash.inc
--echo #
--echo # Server_1 promoted as master will send 0-2-6 and 0-2-7 to slave Server_2
--echo #
--connection server_1
--let $rows_so_far=6
--eval INSERT INTO t1 VALUES ($rows_so_far, 'Done')
--eval INSERT INTO t1 VALUES ($rows_so_far, 'dummy6')
--save_master_pos
--echo # The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
SHOW VARIABLES LIKE 'gtid_slave_pos';
--connection server_2
--sync_with_master
......@@ -107,36 +128,83 @@ SHOW VARIABLES LIKE 'gtid_binlog_pos';
SHOW VARIABLES LIKE 'gtid_slave_pos';
SHOW VARIABLES LIKE 'gtid_binlog_pos';
--let $diff_tables=server_1:t1, server_2:t1
--source include/diff_tables.inc
#
--echo # Cleanup
#
--connection server_1
DROP TABLE t1;
--save_master_pos
--let $case = 3
--echo #
--echo # Case:$case
--echo #
--echo # CRASH the master and FAILOVER to slave
--let $failover_to_slave=1
--let $query_to_crash = INSERT INTO t1 VALUES (7, REPEAT("x", 4100))
--let $query2_to_crash= INSERT INTO t1 VALUES (8, REPEAT("x", 4100))
--echo # $query_to_crash
--echo # $query2_to_crash
--echo # Rows 7 and 8 will be in master's binlog but not committed, only 7
--echo # gets replicated to slave and applied. On crash master should have 6
--echo # rows and slave should have 7 rows.
--echo #
--echo # Expected State post crash:
--echo #=================================================================
--echo # Master | Slave |
--echo # 0-1-9 (Not commited) | 0-1-9 (Received through semi-sync |
--echo # | replication and applied) |
--echo # 0-1-10 (Not commited - | |
--echo # never sent to slave) | |
--echo #=================================================================
--let $log_search_pattern=truncated binlog file:.*master.*000003
--let $expected_rows_on_master= 6
--let $expected_rows_on_slave= 7
--source rpl_semi_sync_crash.inc
--echo #
--echo # Server_2 promoted as master will send 0-1-9 to slave Server_1
--echo #
--connection server_2
--sync_with_master
--source include/stop_slave.inc
--let $rows_so_far=8
--eval INSERT INTO t1 VALUES ($rows_so_far, 'Done')
--source include/save_master_gtid.inc
--echo # The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
SHOW VARIABLES LIKE 'gtid_slave_pos';
--connection server_1
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
RESET SLAVE;
--source include/sync_with_master_gtid.inc
--eval SELECT COUNT(*) = $rows_so_far as 'true' FROM t1
--echo # ... the gtid states on the slave:
SHOW VARIABLES LIKE 'gtid_slave_pos';
SHOW VARIABLES LIKE 'gtid_binlog_pos';
--echo #
--echo # Cleanup
--echo #
--source include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 0;
set global rpl_semi_sync_master_enabled = 0;
set global rpl_semi_sync_master_wait_point=default;
RESET MASTER;
RESET SLAVE;
--connection server_2
RESET MASTER;
RESET SLAVE;
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
evalp CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_use_gtid=no;
evalp CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_use_gtid=SLAVE_POS;
set @@global.gtid_slave_pos=@@global.gtid_binlog_pos;
--source include/start_slave.inc
--connection server_1
DROP TABLE t1;
--save_master_pos
--connection server_2
--sync_with_master
connection default;
--enable_reconnect
--source include/wait_until_connected_again.inc
......
......@@ -8316,6 +8316,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
}
else
{
DEBUG_SYNC(leader->thd, "commit_before_update_binlog_end_pos");
bool any_error= false;
mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
......
......@@ -2828,6 +2828,12 @@ static int send_one_binlog_file(binlog_send_info *info,
*/
if (send_events(info, log, linfo, end_pos))
return 1;
DBUG_EXECUTE_IF("Notify_binlog_EOF",
{
const char act[]= "now signal eof_reached";
DBUG_ASSERT(!debug_sync_set_action(current_thd,
STRING_WITH_LEN(act)));
};);
}
return 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment