Commit 2b84e1c9, authored by Leandro Pacheco, committed by Jan Lindström

MDEV-23080: desync and pause node on BACKUP STAGE BLOCK_DDL

Make BACKUP STAGE behave like FTWRL: starting with BACKUP STAGE BLOCK_DDL,
desync and pause the node to prevent BF threads (appliers) from interfering
with the blocking stages. This is needed because BF threads do not respect
BACKUP MDL locks. The node is resumed and resynced on BACKUP STAGE END.
Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
parent 389f5cf7
......@@ -930,7 +930,7 @@ bool lock_tables(MYSQL *connection)
if (have_galera_enabled)
{
xb_mysql_query(connection, "SET SESSION wsrep_causal_reads=0", false);
xb_mysql_query(connection, "SET SESSION wsrep_sync_wait=0", false);
}
xb_mysql_query(connection, "BACKUP STAGE START", true);
......
......@@ -2,14 +2,14 @@ connection node_2;
connection node_1;
FLUSH TABLES WITH READ LOCK;
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
ERROR 08S01: Aborting TOI: Global Read-Lock (FTWRL) in place.
ERROR 08S01: Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.
SET wsrep_OSU_method=RSU;
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
ERROR 08S01: Aborting TOI: Global Read-Lock (FTWRL) in place.
ERROR 08S01: Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.
SET wsrep_OSU_method=TOI;
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
ERROR 08S01: Aborting TOI: Global Read-Lock (FTWRL) in place.
ERROR 08S01: Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.
connection node_1;
UNLOCK TABLES;
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
......
connection node_2;
connection node_1;
connection node_1;
CREATE TABLE t1 (f1 varchar(10)) ENGINE=InnoDB;
BACKUP STAGE START;
BACKUP STAGE FLUSH;
BACKUP STAGE END;
BACKUP STAGE START;
BACKUP STAGE FLUSH;
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1a;
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_retry_autocommit=0;
INSERT INTO t1 (f1) values ("node1_1");
ALTER TABLE t1 ADD COLUMN (f2 int(10));
connection node_2;
INSERT INTO t1 (f1) values ("node2_1");
ALTER TABLE t1 ADD COLUMN (f3 int(10));
connection node_1;
BACKUP STAGE BLOCK_DDL;
connect node_1c, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1c;
SET SESSION wsrep_sync_wait=0;
connection node_2;
INSERT INTO t1 (f1) values("node2_2");
ALTER TABLE t1 ADD COLUMN (f5 int(10));
connection node_1a;
ALTER TABLE t1 ADD COLUMN (f4 int(10));
ERROR 08S01: Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.
INSERT INTO t1 (f1) values("node1a");;
connection node_1c;
connection node_1;
BACKUP STAGE BLOCK_COMMIT;
connection node_1c;
SELECT variable_value="Donor/Desynced" FROM information_schema.global_status WHERE variable_name="wsrep_local_state_comment";
variable_value="Donor/Desynced"
1
connection node_2;
INSERT INTO t1 (f1) values("node2_3");
ALTER TABLE t1 ADD COLUMN (f6 int(10));
connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1b;
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_retry_autocommit=0;
ALTER TABLE t1 ADD COLUMN (f4 int(10));
ERROR 08S01: Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.
INSERT INTO t1 (f1) values("node1b");;
connection node_1c;
SELECT COUNT(*)=2 FROM t1;
COUNT(*)=2
1
SELECT COUNT(*)=3 FROM information_schema.columns WHERE table_name = 't1';
COUNT(*)=3
1
connection node_1;
BACKUP STAGE END;
connection node_1a;
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
connection node_1b;
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
connection node_1;
SELECT COUNT(*)=4 FROM t1;
COUNT(*)=4
1
SELECT COUNT(*)=5 FROM information_schema.columns WHERE table_name = 't1';
COUNT(*)=5
1
connection node_2;
SELECT COUNT(*)=4 FROM t1;
COUNT(*)=4
1
SELECT COUNT(*)=5 FROM information_schema.columns WHERE table_name = 't1';
COUNT(*)=5
1
connection node_1;
DROP TABLE t1;
call mtr.add_suppression("WSREP: ALTER TABLE isolation failure");
call mtr.add_suppression("greater than drain seqno");
#
# Check that BACKUP STAGE BLOCK_DDL desyncs and pauses the node until BACKUP STAGE END:
# - Local DDLs will fail immediately
# - Local DMLs will block until resync
# - Remote txns will be applied after resync (STAGE END).
#
--source include/galera_cluster.inc
--source include/have_innodb.inc
--source include/have_metadata_lock_info.inc
--connection node_1
CREATE TABLE t1 (f1 varchar(10)) ENGINE=InnoDB;
# First, check that BACKUP STAGE END works when the desyncing stage
# (BLOCK_DDL) was never reached, i.e. the node was not desynced
BACKUP STAGE START;
BACKUP STAGE FLUSH;
BACKUP STAGE END;
BACKUP STAGE START;
BACKUP STAGE FLUSH;
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1a
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_retry_autocommit=0;
INSERT INTO t1 (f1) values ("node1_1");
ALTER TABLE t1 ADD COLUMN (f2 int(10));
--connection node_2
INSERT INTO t1 (f1) values ("node2_1");
ALTER TABLE t1 ADD COLUMN (f3 int(10));
# BLOCK_DDL desyncs and pauses the node
--connection node_1
BACKUP STAGE BLOCK_DDL;
--connect node_1c, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1c
SET SESSION wsrep_sync_wait=0;
--let $wait_condition = SELECT variable_value="Donor/Desynced" FROM information_schema.global_status WHERE variable_name="wsrep_local_state_comment"
--source include/wait_condition.inc
--connection node_2
INSERT INTO t1 (f1) values("node2_2");
ALTER TABLE t1 ADD COLUMN (f5 int(10));
--connection node_1a
--error ER_UNKNOWN_COM_ERROR
ALTER TABLE t1 ADD COLUMN (f4 int(10));
--let $insert_id = `SELECT CONNECTION_ID()`
--send INSERT INTO t1 (f1) values("node1a");
# The insert will block during commit inside the provider, in certify. We
# cannot verify that it is blocked exactly there, so we wait until the thread
# has at least reached the commit stage. In the unlikely case that the
# interleaving is different, the result of the test should not change.
--connection node_1c
--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE State='Commit' AND ID=$insert_id
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id
--source include/wait_condition.inc
--connection node_1
BACKUP STAGE BLOCK_COMMIT;
# The node stays desynced through BLOCK_COMMIT; it only resumes/resyncs
# upon BACKUP STAGE END
--connection node_1c
SELECT variable_value="Donor/Desynced" FROM information_schema.global_status WHERE variable_name="wsrep_local_state_comment";
--connection node_2
INSERT INTO t1 (f1) values("node2_3");
ALTER TABLE t1 ADD COLUMN (f6 int(10));
--connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1b
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_retry_autocommit=0;
--error ER_UNKNOWN_COM_ERROR
ALTER TABLE t1 ADD COLUMN (f4 int(10));
--let $insert_id = `SELECT CONNECTION_ID()`
--send INSERT INTO t1 (f1) values("node1b");
# wait for insert to get blocked
--connection node_1c
--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE State='Commit' AND ID=$insert_id
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*)=2 FROM information_schema.processlist WHERE Info like 'INSERT INTO t1 (f1) values("node1%")' AND State = 'Commit'
--source include/wait_condition.inc
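# Nothing issued after BLOCK_DDL is visible on node_1 yet: t1 still has only
# the 2 rows (node1_1, node2_1) and 3 columns (f1, f2, f3) from before it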
SELECT COUNT(*)=2 FROM t1;
SELECT COUNT(*)=3 FROM information_schema.columns WHERE table_name = 't1';
# STAGE END resumes and resyncs the node
--connection node_1
BACKUP STAGE END;
# Upon resume, the blocked inserts continue but conflict with the ALTERs
# being applied, so both fail with a deadlock error
--connection node_1a
--error ER_LOCK_DEADLOCK
--reap
--connection node_1b
--error ER_LOCK_DEADLOCK
--reap
--connection node_1
SELECT COUNT(*)=4 FROM t1;
SELECT COUNT(*)=5 FROM information_schema.columns WHERE table_name = 't1';
--connection node_2
SELECT COUNT(*)=4 FROM t1;
SELECT COUNT(*)=5 FROM information_schema.columns WHERE table_name = 't1';
--connection node_1
DROP TABLE t1;
call mtr.add_suppression("WSREP: ALTER TABLE isolation failure");
call mtr.add_suppression("greater than drain seqno");
......@@ -34,6 +34,7 @@
#include "sql_insert.h" // kill_delayed_threads
#include "sql_handler.h" // mysql_ha_cleanup_no_free
#include <my_sys.h>
#include "wsrep_mysqld.h"
static const char *stage_names[]=
{"START", "FLUSH", "BLOCK_DDL", "BLOCK_COMMIT", "END", 0};
......@@ -254,6 +255,21 @@ static bool backup_block_ddl(THD *thd)
(void) flush_tables(thd, FLUSH_NON_TRANS_TABLES);
thd->clear_error();
#ifdef WITH_WSREP
/*
Desync and pause the node for BACKUP STAGE BLOCK_DDL, because applier
threads bypass backup MDL locks (see MDL_lock::can_grant_lock)
*/
if (WSREP_NNULL(thd))
{
Wsrep_server_state &server_state= Wsrep_server_state::instance();
if (server_state.desync_and_pause().is_undefined()) {
DBUG_RETURN(1);
}
thd->wsrep_desynced_backup_stage= true;
}
#endif /* WITH_WSREP */
/*
block new DDL's, in addition to all previous blocks
We didn't do this lock above, as we wanted DDL's to be executed while
......@@ -318,6 +334,14 @@ bool backup_end(THD *thd)
ha_end_backup();
thd->current_backup_stage= BACKUP_FINISHED;
thd->mdl_context.release_lock(backup_flush_ticket);
#ifdef WITH_WSREP
if (WSREP_NNULL(thd) && thd->wsrep_desynced_backup_stage)
{
Wsrep_server_state &server_state= Wsrep_server_state::instance();
server_state.resume_and_resync();
thd->wsrep_desynced_backup_stage= false;
}
#endif /* WITH_WSREP */
}
DBUG_RETURN(0);
}
......
......@@ -1281,6 +1281,7 @@ void THD::init()
m_wsrep_next_trx_id = WSREP_UNDEFINED_TRX_ID;
wsrep_replicate_GTID = false;
wsrep_aborter = 0;
wsrep_desynced_backup_stage= false;
#endif /* WITH_WSREP */
if (variables.sql_log_bin)
......
......@@ -3011,6 +3011,9 @@ class THD: public THD_count, /* this must be first */
uint server_status,open_options;
enum enum_thread_type system_thread;
enum backup_stages current_backup_stage;
#ifdef WITH_WSREP
bool wsrep_desynced_backup_stage;
#endif /* WITH_WSREP */
/*
Current or next transaction isolation level.
When a connection is established, the value is taken from
......
......@@ -2168,7 +2168,7 @@ int wsrep_to_isolation_begin(THD *thd, const char *db_, const char *table_,
if (Wsrep_server_state::instance().desynced_on_pause())
{
my_message(ER_UNKNOWN_COM_ERROR,
"Aborting TOI: Global Read-Lock (FTWRL) in place.", MYF(0));
"Aborting TOI: Replication paused on node for FTWRL/BACKUP STAGE.", MYF(0));
return -1;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment