Commit 68938d2b authored by Brandon Nesterenko's avatar Brandon Nesterenko

MDEV-33500 (part 2): rpl.rpl_parallel_sbm can still fail

The failing test case validates Seconds_Behind_Master for a delayed
slave, while STOP SLAVE is executed during a delay. The test fixes
initially added to the test (commit b04c8575) added a table lock
to ensure a transaction could not finish before validating the
Seconds_Behind_Master field after SLAVE START, but did not address a
possibility that the transaction could finish before running the
STOP SLAVE command, which invalidates the validations for the rest
of the test case. Specifically, this would result in 1) a timeout in
“Waiting for table metadata lock” on the replica, which expects the
transaction to retry after slave restart and hit a lock conflict on
the locked tables (added in b04c8575), and 2) that
Seconds_Behind_Master should have increased, but did not.

The failure can be reproduced by synchronizing the slave to the master
before the MDEV-32265 echo statement (i.e. before the SLAVE STOP).

This patch fixes the test by adding a mechanism to use DEBUG_SYNC to
synchronize a MASTER_DELAY, rather than continually increase the
duration of the delay each time the test fails on buildbot. This is
to ensure that on slow machines, a delay does not pass before the
test gets a chance to validate results. Additionally, it decreases
overall test time because the test can continue immediately after
validation, thereby bypassing the remainder of a full delay for each
transaction.
parent a1adabdd
# ==== Purpose ====
#
# If using DEBUG_SYNC to coordinate a slave's SQL DELAY via the DEBUG_DBUG
# identifier "sql_delay_by_debug_sync", this helper file will help synchronize
# a slave with the master for statements which don't need to be delayed. This
# can be helpful, for example, for setup/cleanup statements, if they must be
# run in the same lifetime as the statements used for the test.
#
# The actual synchronization will take place based on the input parameter
# slave_sync_method, which can be "gtid", "file_coord", or "none"; and will use
# the helper files sync_with_master_gtid.inc or sync_with_master.inc (or none
# at all), respectively.
#
#
# ==== Requirements ====
#
# --source include/have_debug.inc
# --source include/have_debug_sync.inc
# set @@GLOBAL.debug_dbug= "+d,sql_delay_by_debug_sync";
#
#
# ==== Usage ====
#
# --let $slave_sync_method= gtid|file_coord|none
# [--let $num_event_groups= NUMBER]
# --source include/sync_with_master_sql_delay_debug_sync.inc
#
#
# Parameters:
# $slave_sync_method
# Value can be gtid, file_coord, or none; and will synchronize the slave
# with the master via this method (i.e. using sync_with_master_gtid.inc
# or sync_with_master.inc, respectively), after synchronizing the SQL
# delay
#
# $num_event_groups
# Number of event groups to synchronize the SQL delay for. If unset, will
# be default to 1.
#
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
--source include/begin_include_file.inc
if (!$slave_sync_method)
{
--die Parameter slave_sync_method must be set
}
if (`select "$slave_sync_method" not like "gtid" and "$slave_sync_method" not like "file_coord" and "$slave_sync_method" not like "none"`)
{
--die Parameter slave_sync_method must have value "gtid", "file_coord" or "none"
}
if (`select "$slave_sync_method" not like "none" and strcmp("$master_pos", "") = 0`)
{
--die sync_with_master.inc or sync_with_master_gtid.inc was not called to populate variable master_pos
}
if (!$num_event_groups)
{
--let $num_event_groups= 1
}
while ($num_event_groups)
{
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
--dec $num_event_groups
}
if (`select "$slave_sync_method" LIKE "gtid"`)
{
--source include/sync_with_master_gtid.inc
}
if (`select "$slave_sync_method" LIKE "file_coord"`)
{
--source include/sync_with_master.inc
}
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
--source include/end_include_file.inc
......@@ -5,14 +5,20 @@ include/master-slave.inc
#
connection slave;
include/stop_slave.inc
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
change master to master_delay=3, master_use_gtid=Slave_Pos;
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
change master to master_delay=1, master_use_gtid=Slave_Pos;
include/start_slave.inc
connection master;
create table t1 (a int);
create table t2 (a int);
include/sync_slave_sql_with_master.inc
include/save_master_gtid.inc
connection slave;
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
#
# Pt 1) Ensure SBM is updated immediately upon arrival of the next event
connection master;
......@@ -21,12 +27,23 @@ insert into t1 values (0);
include/save_master_gtid.inc
connection slave;
# Waiting for transaction to arrive on slave and begin SQL Delay..
set debug_sync= "now WAIT_FOR at_sql_delay";
# Validating SBM is updated on event arrival..
# ..done
# MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently
# delaying a transaction; then when the reciprocal START SLAVE occurs,
# if the event is still to be delayed, SBM should resume accordingly
include/stop_slave.inc
connection server_2;
# Ensure the kill from STOP SLAVE will be received before continuing the
# SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
STOP SLAVE;
connection slave;
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
connection server_2;
include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
# Lock t1 on slave to ensure the event can't finish (and thereby update
# Seconds_Behind_Master) so slow running servers don't accidentally
# catch up to the master before checking SBM.
......@@ -34,6 +51,10 @@ connection server_2;
LOCK TABLES t1 WRITE;
include/start_slave.inc
connection slave;
# SQL delay has no impact for the rest of the test case, so ignore it
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Waiting for replica to get blocked by the table lock
# Sleeping 1s to increment SBM
# Ensuring Seconds_Behind_Master increases after sleeping..
......@@ -54,6 +75,13 @@ insert into t1 values (2);
include/save_master_pos.inc
connection slave;
# Wait for first transaction to complete SQL delay and begin execution..
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Wait for second transaction to complete SQL delay..
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
# ..and that SBM wasn't calculated using prior committed transactions
# ..done
......@@ -63,6 +91,8 @@ include/wait_for_slave_param.inc [Relay_Master_Log_File]
include/wait_for_slave_param.inc [Exec_Master_Log_Pos]
# Cleanup
include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
CHANGE MASTER TO master_delay=0;
include/start_slave.inc
#
......
......@@ -3,6 +3,7 @@
#
--source include/have_log_bin.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/master-slave.inc
--echo #
......@@ -16,14 +17,23 @@
# Seconds_Behind_Master is based on the timestamp of the new transaction,
# rather than the last committed transaction.
#
# Note that the test doesn't actually use the value of MASTER_DELAY, but
# rather uses debug_sync to coordinate the end of the delay. This is to ensure
# that on slow machines, a delay doesn't pass before the test gets a chance to
# validate results. Additionally, it lets us continue testing after validation
# so we don't have to wait out a full delay unnecessarily. The debug_sync point
# is enabled via sql_delay_by_debug_sync, which will delay transactions based
# only on GTID events, so only one synchronization is needed per transaction.
#
--connection slave
--source include/stop_slave.inc
--let $save_dbug= `SELECT @@GLOBAL.debug_dbug`
--let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode`
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
--let $master_delay= 3
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
--let $master_delay= 1
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
--source include/start_slave.inc
......@@ -31,7 +41,11 @@ set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
--let insert_ctr= 0
create table t1 (a int);
create table t2 (a int);
--source include/sync_slave_sql_with_master.inc
--source include/save_master_gtid.inc
--connection slave
--let $slave_sync_method= gtid
--let $num_event_groups= 2
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo #
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
......@@ -48,8 +62,7 @@ sleep 2;
--connection slave
--echo # Waiting for transaction to arrive on slave and begin SQL Delay..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting until MASTER_DELAY seconds after master executed event';
--source include/wait_condition.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
--echo # Validating SBM is updated on event arrival..
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
......@@ -66,7 +79,20 @@ if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
--echo # delaying a transaction; then when the reciprocal START SLAVE occurs,
--echo # if the event is still to be delayed, SBM should resume accordingly
--source include/stop_slave.inc
--connection server_2
--echo # Ensure the kill from STOP SLAVE will be received before continuing the
--echo # SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
--send STOP SLAVE
--connection slave
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
--connection server_2
--reap
--source include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
--echo # Lock t1 on slave to ensure the event can't finish (and thereby update
--echo # Seconds_Behind_Master) so slow running servers don't accidentally
......@@ -77,6 +103,11 @@ LOCK TABLES t1 WRITE;
--source include/start_slave.inc
--connection slave
--echo # SQL delay has no impact for the rest of the test case, so ignore it
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Waiting for replica to get blocked by the table lock
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc
......@@ -120,9 +151,14 @@ sleep 3;
--connection slave
--echo # Wait for first transaction to complete SQL delay and begin execution..
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
--source include/wait_condition.inc
--echo # Wait for second transaction to complete SQL delay..
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
......@@ -147,6 +183,8 @@ UNLOCK TABLES;
--echo # Cleanup
--source include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
--eval CHANGE MASTER TO master_delay=0
--source include/start_slave.inc
......
......@@ -1088,6 +1088,8 @@ terminate_slave_thread(THD *thd,
mysql_mutex_unlock(&thd->LOCK_thd_kill);
mysql_mutex_unlock(&thd->LOCK_thd_data);
DEBUG_SYNC(current_thd, "after_thd_awake_kill");
/*
There is a small chance that slave thread might miss the first
alarm. To protect againts it, resend the signal until it reacts
......@@ -3794,13 +3796,26 @@ sql_delay_event(Log_event *ev, THD *thd, rpl_group_info *rgi)
sql_delay, (long)ev->when,
rli->mi->clock_diff_with_master,
(long)now, (ulonglong)sql_delay_end, (long)nap_time));
if (sql_delay_end > now)
/* if using debug_sync for sql_delay, only delay once per event group */
if (DBUG_EVALUATE_IF("sql_delay_by_debug_sync", type == GTID_EVENT,
sql_delay_end > now))
{
DBUG_PRINT("info", ("delaying replication event %lu secs",
nap_time));
rli->start_sql_delay(sql_delay_end);
mysql_mutex_unlock(&rli->data_lock);
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("sql_delay_by_debug_sync", {
DBUG_ASSERT(!debug_sync_set_action(
thd, STRING_WITH_LEN(
"now SIGNAL at_sql_delay WAIT_FOR continue_sql_thread")));
// Skip the actual sleep if using DEBUG_SYNC to coordinate SQL_DELAY
DBUG_RETURN(0);
};);
#endif
DBUG_RETURN(slave_sleep(thd, nap_time, sql_slave_killed, rgi));
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment