Commit beaea31a authored by sjaakola's avatar sjaakola Committed by Jan Lindström

MDEV-23851 BF-BF Conflict issue because of UK GAP locks

Some DML operations on tables having unique secondary keys cause scanning
in the secondary index, for instance to find potential unique key violations
in the secondary index. This scanning may involve GAP locking in the index.
As this locking happens also when applying replication events in high priority
applier threads, there is a probability for lock conflicts between two wsrep
high priority threads.

This PR avoids lock conflicts of high priority wsrep threads, which do
secondary index scanning e.g. for duplicate key detection.

The actual fix is the patch in sql_class.cc:thd_need_ordering_with(), where
we allow relaxed GAP locking protocol between wsrep high priority threads.
wsrep high priority threads (replication appliers, replayers and TOI processors)
are ordered by the replication provider, and they will not need serializability
support gained by secondary index GAP locks.

PR contains also a mtr test, which exercises a scenario where two replication
applier threads have a false positive conflict in GAP of unique secondary index.
The conflicting local committing transaction has to replay, and the test verifies
also that the replaying phase will not conflict with the latter replication applier.
Commit also contains new test scenario for galera.galera_UK_conflict.test,
where replayer starts applying after a slave applier thread, with later seqno,
has advanced to commit phase. The applier and replayer have false positive GAP
lock conflict on secondary unique index, and replayer should ignore this.
This test scenario caused crash with earlier version in this PR, and to fix this,
the secondary index uniqueness checking has been relaxed even further.

Now innodb trx_t structure has new member: bool wsrep_UK_scan, which is set to
true, when high priority thread is performing unique secondary index scanning.
The member trx_t::wsrep_UK_scan is defined inside WITH_WSREP directive, to make
it possible to prepare a MariaDB build where this additional trx_t member is
not present and is not used in the code base. trx->wsrep_UK_scan is set to true
only for the duration of the lock_rec_lock() function call. trx->wsrep_UK_scan
is used only in the lock_rec_has_to_wait() function, to relax the need to wait if
wsrep_UK_scan is set and the conflicting transaction is also high priority.
Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
parent cf6114eb
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 int, unique key keyj (f2));
INSERT INTO t1 VALUES (1, 1, 0);
INSERT INTO t1 VALUES (3, 3, 0);
INSERT INTO t1 VALUES (10, 10, 0);
SET GLOBAL wsrep_slave_threads = 3;
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
connection node_1;
SET SESSION wsrep_sync_wait=0;
START TRANSACTION;
DELETE FROM t1 WHERE f2 = 3;
INSERT INTO t1 VALUES (3, 3, 1);
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1a;
SET SESSION wsrep_sync_wait=0;
connection node_2;
INSERT INTO t1 VALUES (5, 5, 2);
connection node_1a;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync';
connection node_2;
INSERT INTO t1 VALUES (4, 4, 2);
connection node_1a;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_enter_sync';
connection node_1;
COMMIT;
connection node_1a;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync';
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
connection node_1;
SELECT * FROM t1;
f1 f2 f3
1 1 0
3 3 1
4 4 2
5 5 2
10 10 0
wsrep_local_replays
1
SET GLOBAL wsrep_slave_threads = DEFAULT;
connection node_2;
SELECT * FROM t1;
f1 f2 f3
1 1 0
3 3 1
4 4 2
5 5 2
10 10 0
INSERT INTO t1 VALUES (7,7,7);
INSERT INTO t1 VALUES (8,8,8);
SELECT * FROM t1;
f1 f2 f3
1 1 0
3 3 1
4 4 2
5 5 2
7 7 7
8 8 8
10 10 0
connection node_1;
SELECT * FROM t1;
f1 f2 f3
1 1 0
3 3 1
4 4 2
5 5 2
7 7 7
10 10 0
DROP TABLE t1;
#
# This test tests the operation of transaction replay with a scenario
# where two subsequent write sets in applying conflict with local transaction
# in commit phase. The conflict is "false positive" conflict on GAP lock in
# secondary unique index.
# The first applier will cause BF abort for the local committer, which
# starts replaying because of positive certification.
# In buggy version, scenario continues so that while the local transaction
# is replaying, the latter applier experiences similar UK GAP lock conflict
# and forces the replayer to abort second time.
# In fixed version, this latter BF abort should not happen.
#
--source include/galera_cluster.inc
--source include/have_innodb.inc
--source include/have_debug_sync.inc
--source include/galera_have_debug_sync.inc
--let $wsrep_local_replays_old = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'`
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 int, unique key keyj (f2));
INSERT INTO t1 VALUES (1, 1, 0);
INSERT INTO t1 VALUES (3, 3, 0);
INSERT INTO t1 VALUES (10, 10, 0);
# we will need 2 applier threads for applying two write sets in parallel in node1
# and 1 applier thread for handling replaying
SET GLOBAL wsrep_slave_threads = 3;
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
--connection node_1
# starting a transaction, which deletes and inserts the middle row in test table
# this will be victim of false positive conflict with appliers
SET SESSION wsrep_sync_wait=0;
START TRANSACTION;
DELETE FROM t1 WHERE f2 = 3;
INSERT INTO t1 VALUES (3, 3, 1);
# Control connection to manage sync points for appliers
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1a
SET SESSION wsrep_sync_wait=0;
# send from node 2 first INSERT transaction, which will conflict on GAP lock in node 1
--connection node_2
INSERT INTO t1 VALUES (5, 5, 2);
--connection node_1a
# wait to see the INSERT in apply_cb sync point
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
# first applier seen in wait point, set sync point for the second INSERT
--let $galera_sync_point = apply_monitor_slave_enter_sync
--source include/galera_set_sync_point.inc
--connection node_2
# send second insert into same GAP in test table
INSERT INTO t1 VALUES (4, 4, 2);
--connection node_1a
# wait for the second insert to arrive in its sync point
--let $galera_sync_point = apply_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
--source include/galera_clear_sync_point.inc
# both appliers are now waiting in separate sync points
# Block the local commit, send the COMMIT and wait until it gets blocked
--let $galera_sync_point = commit_monitor_enter_sync
--source include/galera_set_sync_point.inc
--connection node_1
--send COMMIT
--connection node_1a
# wait for the local commit to enter in commit monitor wait state
--let $galera_sync_point = apply_monitor_slave_enter_sync commit_monitor_enter_sync
--source include/galera_wait_sync_point.inc
--source include/galera_clear_sync_point.inc
# release the local transaction to continue with commit
--let $galera_sync_point = commit_monitor_enter_sync
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
# and now release the first applier, it should force local trx to abort
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
# set another sync point for second applier
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
# letting the second applier move forward
--let $galera_sync_point = apply_monitor_slave_enter_sync
--source include/galera_signal_sync_point.inc
# waiting until second applier is in wait
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
# stopping second applier before commit
--let $galera_sync_point = commit_monitor_enter_sync
--source include/galera_set_sync_point.inc
--source include/galera_clear_sync_point.inc
# releasing the second insert, with buggy version it will conflict with
# replayer
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
# with fixed version, second applier has reached commit monitor, and we can
# release it to complete
--let $galera_sync_point = commit_monitor_enter_sync
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
# local commit should succeed
--connection node_1
--reap
SELECT * FROM t1;
# wsrep_local_replays has increased by 1
--let $wsrep_local_replays_new = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'`
--disable_query_log
--eval SELECT $wsrep_local_replays_new - $wsrep_local_replays_old = 1 AS wsrep_local_replays;
--enable_query_log
# returning original slave thread count
SET GLOBAL wsrep_slave_threads = DEFAULT;
--connection node_2
SELECT * FROM t1;
# replicate some transactions, so that wsrep slave thread count can reach
# original state in node 1
INSERT INTO t1 VALUES (7,7,7);
INSERT INTO t1 VALUES (8,8,8);
SELECT * FROM t1;
--connection node_1
SELECT * FROM t1;
DROP TABLE t1;
......@@ -1082,7 +1082,7 @@ MDL_wait::timed_wait(MDL_context_owner *owner, struct timespec *abs_timeout,
DBUG_ASSERT(!debug_sync_set_action((owner->get_thd()),
STRING_WITH_LEN(act)));
};);
if (wsrep_thd_is_BF(owner->get_thd(), false))
if (WSREP_ON && wsrep_thd_is_BF(owner->get_thd(), false))
{
wait_result= mysql_cond_wait(&m_COND_wait_status, &m_LOCK_wait_status);
}
......@@ -1155,7 +1155,7 @@ void MDL_lock::Ticket_list::add_ticket(MDL_ticket *ticket)
*/
DBUG_ASSERT(ticket->get_lock());
#ifdef WITH_WSREP
if ((this == &(ticket->get_lock()->m_waiting)) &&
if (WSREP_ON && (this == &(ticket->get_lock()->m_waiting)) &&
wsrep_thd_is_BF(ticket->get_ctx()->get_thd(), false))
{
Ticket_iterator itw(ticket->get_lock()->m_waiting);
......@@ -1581,7 +1581,7 @@ MDL_lock::can_grant_lock(enum_mdl_type type_arg,
ticket->is_incompatible_when_granted(type_arg))
{
#ifdef WITH_WSREP
if (wsrep_thd_is_BF(requestor_ctx->get_thd(),false) &&
if (WSREP_ON && wsrep_thd_is_BF(requestor_ctx->get_thd(),false) &&
key.mdl_namespace() == MDL_key::GLOBAL)
{
WSREP_DEBUG("global lock granted for BF: %lu %s",
......@@ -1615,7 +1615,7 @@ MDL_lock::can_grant_lock(enum_mdl_type type_arg,
}
else
{
if (wsrep_thd_is_BF(requestor_ctx->get_thd(), false) &&
if (WSREP_ON && wsrep_thd_is_BF(requestor_ctx->get_thd(), false) &&
key.mdl_namespace() == MDL_key::GLOBAL)
{
WSREP_DEBUG("global lock granted for BF (waiting queue): %lu %s",
......
/*
Copyright (c) 2000, 2015, Oracle and/or its affiliates.
Copyright (c) 2008, 2020, MariaDB Corporation.
Copyright (c) 2008, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -4730,6 +4730,16 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;);
if (!thd || !other_thd)
return 1;
#ifdef WITH_WSREP
/* wsrep applier, replayer and TOI processing threads are ordered
by replication provider, relaxed GAP locking protocol can be used
between high priority wsrep threads
*/
if (WSREP_ON &&
wsrep_thd_is_BF(const_cast<THD *>(thd), false) &&
wsrep_thd_is_BF(const_cast<THD *>(other_thd), true))
return 0;
#endif /* WITH_WSREP */
rgi= thd->rgi_slave;
other_rgi= other_thd->rgi_slave;
if (!rgi || !other_rgi)
......
......@@ -22,6 +22,7 @@
//#include "global_threads.h" // LOCK_thread_count, etc.
#include "sql_base.h" // close_thread_tables()
#include "mysqld.h" // start_wsrep_THD();
#include "debug_sync.h"
#include "slave.h" // opt_log_slave_updates
#include "rpl_filter.h"
......@@ -371,6 +372,19 @@ void wsrep_replay_transaction(THD *thd)
thd->variables.option_bits|= OPTION_BEGIN;
thd->server_status|= SERVER_STATUS_IN_TRANS;
/* Allow tests to block the replayer thread using the DBUG facilities */
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("sync.wsrep_replay_cb",
{
const char act[]=
"now "
"SIGNAL sync.wsrep_replay_cb_reached "
"WAIT_FOR signal.wsrep_replay_cb";
DBUG_ASSERT(!debug_sync_set_action(thd,
STRING_WITH_LEN(act)));
};);
#endif /* ENABLED_DEBUG_SYNC */
int rcode = wsrep->replay_trx(wsrep,
&thd->wsrep_ws_handle,
(void *)thd);
......
......@@ -3,7 +3,7 @@
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2015, 2020, MariaDB Corporation.
Copyright (c) 2015, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
......@@ -67,6 +67,9 @@ Created 10/16/1994 Heikki Tuuri
#include "srv0start.h"
#include "mysql_com.h"
#include "dict0stats.h"
#ifdef WITH_WSREP
#include "mysql/service_wsrep.h"
#endif /* WITH_WSREP */
/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
......@@ -2941,7 +2944,8 @@ btr_cur_ins_lock_and_undo(
/* Check if there is predicate or GAP lock preventing the insertion */
if (!(flags & BTR_NO_LOCKING_FLAG)) {
if (dict_index_is_spatial(index)) {
const unsigned type = index->type;
if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
lock_prdt_t prdt;
rtr_mbr_t mbr;
......@@ -2958,9 +2962,30 @@ btr_cur_ins_lock_and_undo(
index, thr, mtr, &prdt);
*inherit = false;
} else {
#ifdef WITH_WSREP
trx_t* trx= thr_get_trx(thr);
/* If transaction scanning an unique secondary
key is wsrep high priority thread (brute
force) this scanning may involve GAP-locking
in the index. As this locking happens also
when applying replication events in high
priority applier threads, there is a
probability for lock conflicts between two
wsrep high priority threads. To avoid this
GAP-locking we mark that this transaction
is using unique key scan here. */
if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
&& trx->is_wsrep()
&& wsrep_thd_is_BF(trx->mysql_thd, false)) {
trx->wsrep_UK_scan= true;
}
#endif /* WITH_WSREP */
err = lock_rec_insert_check_and_lock(
flags, rec, btr_cur_get_block(cursor),
index, thr, mtr, inherit);
#ifdef WITH_WSREP
trx->wsrep_UK_scan= false;
#endif /* WITH_WSREP */
}
}
......
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2015, 2020, MariaDB Corporation.
Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -875,6 +875,9 @@ struct trx_t {
/** whether wsrep_on(mysql_thd) held at the start of transaction */
bool wsrep;
bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
/** true, if BF thread is performing unique secondary index scanning */
bool wsrep_UK_scan;
bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); }
#else /* WITH_WSREP */
bool is_wsrep() const { return false; }
#endif /* WITH_WSREP */
......
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2020, MariaDB Corporation.
Copyright (c) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -816,6 +816,17 @@ lock_rec_has_to_wait(
}
#ifdef WITH_WSREP
/* New lock request from a transaction is using unique key
scan and this transaction is a wsrep high priority transaction
(brute force). If conflicting transaction is also wsrep high
priority transaction we should avoid lock conflict because
ordering of these transactions is already decided and
conflicting transaction will be later replayed. */
if (trx->is_wsrep_UK_scan()
&& wsrep_thd_is_BF(lock2->trx->mysql_thd, true)) {
return (FALSE);
}
/* There should not be two conflicting locks that are
brute force. If there is it is a bug. */
wsrep_assert_no_bf_bf_wait(NULL, lock2, trx);
......@@ -5928,6 +5939,19 @@ lock_sec_rec_modify_check_and_lock(
heap_no = page_rec_get_heap_no(rec);
#ifdef WITH_WSREP
trx_t *trx= thr_get_trx(thr);
/* If transaction scanning an unique secondary key is wsrep
high priority thread (brute force) this scanning may involve
GAP-locking in the index. As this locking happens also when
applying replication events in high priority applier threads,
there is a probability for lock conflicts between two wsrep
high priority threads. To avoid this GAP-locking we mark that
this transaction is using unique key scan here. */
if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
trx->wsrep_UK_scan= true;
#endif /* WITH_WSREP */
/* Another transaction cannot have an implicit lock on the record,
because when we come here, we already have modified the clustered
index record, and this would not have been possible if another active
......@@ -5943,6 +5967,9 @@ lock_sec_rec_modify_check_and_lock(
MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
lock_mutex_exit();
#ifdef WITH_WSREP
trx->wsrep_UK_scan= false;
#endif /* WITH_WSREP */
#ifdef UNIV_DEBUG
{
......@@ -6032,6 +6059,18 @@ lock_sec_rec_read_check_and_lock(
lock_rec_convert_impl_to_expl(block, rec, index, offsets);
}
#ifdef WITH_WSREP
trx_t *trx= thr_get_trx(thr);
/* If transaction scanning an unique secondary key is wsrep
high priority thread (brute force) this scanning may involve
GAP-locking in the index. As this locking happens also when
applying replication events in high priority applier threads,
there is a probability for lock conflicts between two wsrep
high priority threads. To avoid this GAP-locking we mark that
this transaction is using unique key scan here. */
if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
trx->wsrep_UK_scan= true;
#endif /* WITH_WSREP */
lock_mutex_enter();
ut_ad(mode != LOCK_X
......@@ -6045,6 +6084,9 @@ lock_sec_rec_read_check_and_lock(
MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
lock_mutex_exit();
#ifdef WITH_WSREP
trx->wsrep_UK_scan= false;
#endif /* WITH_WSREP */
ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
......
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2015, 2020, MariaDB Corporation.
Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -154,6 +154,11 @@ trx_init(
trx->lock.rec_cached = 0;
trx->lock.table_cached = 0;
#ifdef WITH_WSREP
ut_ad(!trx->wsrep);
ut_ad(!trx->wsrep_event);
ut_ad(!trx->wsrep_UK_scan);
#endif /* WITH_WSREP */
ut_ad(trx->get_flush_observer() == NULL);
}
......@@ -355,6 +360,7 @@ trx_t *trx_allocate_for_background()
#ifdef WITH_WSREP
trx->wsrep_event = NULL;
ut_ad(!trx->wsrep_UK_scan);
#endif /* WITH_WSREP */
return(trx);
......@@ -466,6 +472,8 @@ inline void trx_t::free()
MEM_NOACCESS(&flush_observer, sizeof flush_observer);
#ifdef WITH_WSREP
MEM_NOACCESS(&wsrep_event, sizeof wsrep_event);
ut_ad(!wsrep_UK_scan);
MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
#endif /* WITH_WSREP */
MEM_NOACCESS(&magic_n, sizeof magic_n);
trx_pools->mem_free(this);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment