Commit ac2857a5 authored by Daniele Sciascia, committed by Jan Lindström

MDEV-25717 Assertion `owning_thread_id_ == wsrep::this_thread::get_id()'

A test case to reproduce the issue. The actual fix is in the Galera
library.
Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
parent 112b2396
connection node_2;
connection node_1;
connection node_1;
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) Engine=InnoDB;
INSERT INTO t1 VALUES (1), (2), (3);
connection node_2;
SET SESSION wsrep_trx_fragment_size = 1;
START TRANSACTION;
INSERT INTO t1 VALUES (4);
connection node_1;
SELECT COUNT(*) FROM t1;
COUNT(*)
3
connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2;
connection node_2a;
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_toi";
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1a;
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_bf_abort";
connection node_1;
TRUNCATE TABLE t1;
connection node_1a;
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_bf_abort_reached";
connection node_2a;
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_toi_reached";
connection node_2;
INSERT INTO t1 VALUES (5);
connection node_2a;
SET SESSION wsrep_sync_wait = 0;
SET SESSION wsrep_sync_wait = DEFAULT;
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_toi";
connection node_2;
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
connection node_1a;
SET SESSION wsrep_sync_wait=0;
SET GLOBAL DEBUG_DBUG = "+d,sync.wsrep_log_dummy_write_set";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_bf_abort";
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_log_dummy_write_set_reached";
connection node_1;
connection node_2;
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "RESET";
connection node_1;
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "RESET";
DROP TABLE t1;
#
# MDEV-25717 Assertion `owning_thread_id_ == wsrep::this_thread::get_id()'
#
# This test exposes a race condition between the rollbacker thread and
# rollback fragment processing.
#
--source include/galera_cluster.inc
--source include/have_debug_sync.inc
--connection node_1
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) Engine=InnoDB;
INSERT INTO t1 VALUES (1), (2), (3);
#
# On node_2 we start an SR transaction; it is going to
# be BF aborted later on.
#
--connection node_2
SET SESSION wsrep_trx_fragment_size = 1;
START TRANSACTION;
INSERT INTO t1 VALUES (4);
--connection node_1
SELECT COUNT(*) FROM t1; # Sync wait
#
# Issue a conflicting TRUNCATE statement on node_1:
# - on node_2, block it before it is going to apply
# - on node_1, block it before it BF aborts the INSERT
#
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_toi";
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1a
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_bf_abort";
--connection node_1
--send TRUNCATE TABLE t1
--connection node_1a
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_bf_abort_reached";
--connection node_2a
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_toi_reached";
#
# Generate one more fragment on the SR transaction.
# This is going to fail certification and result
# in a rollback fragment.
#
--connection node_2
--let $expected_cert_failures = `SELECT VARIABLE_VALUE + 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'`
--send INSERT INTO t1 VALUES (5)
#
# Wait until after certify and observe the certification
# failure. Let both continue and we are done on node_2.
#
--connection node_2a
SET SESSION wsrep_sync_wait = 0;
--let $wait_condition = SELECT VARIABLE_VALUE = $expected_cert_failures FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'
--source include/wait_condition.inc
SET SESSION wsrep_sync_wait = DEFAULT;
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_toi";
--connection node_2
--error ER_LOCK_DEADLOCK
--reap
#
# On node_1 we expect the following things:
# - the TRUNCATE should successfully bf abort the transaction
# - a rollback fragment should be delivered as a result of the
# certification failure. We expect the rollback fragment to
# be delivered after TRUNCATE has BF aborted; therefore the rollback
# fragment logs a dummy writeset.
#
--connection node_1a
SET SESSION wsrep_sync_wait=0;
SET GLOBAL DEBUG_DBUG = "+d,sync.wsrep_log_dummy_write_set";
# Signal the TRUNCATE to continue and observe the BF abort
--let $expected_bf_aborts = `SELECT VARIABLE_VALUE + 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_bf_aborts'`
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_bf_abort";
# Expect a timeout if the bug is present
--let $wait_condition = SELECT VARIABLE_VALUE = $expected_bf_aborts FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_bf_aborts'
--source include/wait_condition.inc
# Observe logging of dummy writeset
SET DEBUG_SYNC = "now WAIT_FOR sync.wsrep_log_dummy_write_set_reached";
# TRUNCATE succeeds
--connection node_1
--reap
#
# Cleanup
#
--connection node_2
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "RESET";
--connection node_1
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "RESET";
DROP TABLE t1;
......@@ -379,6 +379,16 @@ int Wsrep_high_priority_service::apply_toi(const wsrep::ws_meta& ws_meta,
WSREP_DEBUG("Wsrep_high_priority_service::apply_toi: %lld",
client_state.toi_meta().seqno().get());
DBUG_EXECUTE_IF("sync.wsrep_apply_toi",
{
const char act[]=
"now "
"SIGNAL sync.wsrep_apply_toi_reached "
"WAIT_FOR signal.wsrep_apply_toi";
DBUG_ASSERT(!debug_sync_set_action(thd,
STRING_WITH_LEN(act)));
};);
int ret= wsrep_apply_events(thd, m_rli, data.data(), data.size());
if (ret != 0 || thd->wsrep_has_ignored_error)
{
......@@ -427,6 +437,15 @@ int Wsrep_high_priority_service::log_dummy_write_set(const wsrep::ws_handle& ws_
DBUG_PRINT("info",
("Wsrep_high_priority_service::log_dummy_write_set: seqno=%lld",
ws_meta.seqno().get()));
DBUG_EXECUTE_IF("sync.wsrep_log_dummy_write_set",
{
const char act[]=
"now "
"SIGNAL sync.wsrep_log_dummy_write_set_reached ";
DBUG_ASSERT(!debug_sync_set_action(m_thd,
STRING_WITH_LEN(act)));
};);
if (ws_meta.ordered())
{
wsrep::client_state& cs(m_thd->wsrep_cs());
......
......@@ -340,11 +340,20 @@ int wsrep_abort_thd(THD *bf_thd_ptr, THD *victim_thd_ptr, my_bool signal)
DBUG_RETURN(1);
}
bool wsrep_bf_abort(const THD* bf_thd, THD* victim_thd)
bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd)
{
WSREP_LOG_THD(bf_thd, "BF aborter before");
WSREP_LOG_THD(victim_thd, "victim before");
wsrep::seqno bf_seqno(bf_thd->wsrep_trx().ws_meta().seqno());
DBUG_EXECUTE_IF("sync.wsrep_bf_abort",
{
const char act[]=
"now "
"SIGNAL sync.wsrep_bf_abort_reached "
"WAIT_FOR signal.wsrep_bf_abort";
DBUG_ASSERT(!debug_sync_set_action(bf_thd,
STRING_WITH_LEN(act)));
};);
if (WSREP(victim_thd) && !victim_thd->wsrep_trx().active())
{
......@@ -362,6 +371,8 @@ bool wsrep_bf_abort(const THD* bf_thd, THD* victim_thd)
}
bool ret;
wsrep::seqno bf_seqno(bf_thd->wsrep_trx().ws_meta().seqno());
if (wsrep_thd_is_toi(bf_thd))
{
ret= victim_thd->wsrep_cs().total_order_bf_abort(bf_seqno);
......
......@@ -87,7 +87,7 @@ int wsrep_show_bf_aborts (THD *thd, SHOW_VAR *var, char *buff,
bool wsrep_create_appliers(long threads, bool mutex_protected=false);
void wsrep_create_rollbacker();
bool wsrep_bf_abort(const THD*, THD*);
bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd);
int wsrep_abort_thd(THD *bf_thd_ptr, THD *victim_thd_ptr, my_bool signal);
extern void wsrep_thd_set_PA_safe(void *thd_ptr, my_bool safe);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment