Commit 8d3d85e5 authored by Vlad Lesin's avatar Vlad Lesin

MDEV-34690 lock_rec_unlock_unmodified() causes deadlock

lock_rec_unlock_unmodified() is executed either under lock_sys.wr_lock()
or under a combination of lock_sys.rd_lock() + record locks hash table
cell latch. It also requests page latch to check if locked records were
changed by the current transaction or not.

Usually InnoDB requests page latch to find the certain record on the
page, and then requests lock_sys and/or record lock hash cell latch to
request record lock. lock_rec_unlock_unmodified() requests the latches
in the opposite order, what causes deadlocks. One of the possible
scenario for the deadlock is the following:

thread 1 - lock_rec_unlock_unmodified() is invoked under locks hash table
           cell latch, the latch is acquired;
thread 2 - purge thread acquires page latch and tries to remove
           delete-marked record, it invokes lock_update_delete(), which
           requests locks hash table cell latch, held by thread 1;
thread 1 - requests page latch, held by thread 2.

To fix it we need to release lock_sys.latch and/or lock hash cell latch,
acquire page latch and re-acquire lock_sys related latches.

When lock_sys.latch and/or lock hash cell latch are released in
lock_release_on_prepare() and lock_release_on_prepare_try(), the page on
which the current lock is held, can be merged. In this case the bitmap
of the current lock must be cleared, and the new lock must be added to
the end of trx->lock.trx_locks list, or bitmap of already existing lock
must be changed.

The new field trx_lock_t::set_nth_bit_calls indicates if new locks
(bits in existing lock bitmaps or new lock objects) were created during
the period when lock_sys was released in trx->lock.trx_locks list
iteration loop in lock_release_on_prepare() or
lock_release_on_prepare_try(). And, if so, we traverse the list again.

The block can be freed during pages merging, what causes assertion
failure in buf_page_get_gen(), as btr_block_get() passes BUF_GET as page
get mode to it. That's why page_get_mode parameter was added to
btr_block_get() to pass BUF_GET_POSSIBLY_FREED from
lock_release_on_prepare() and lock_release_on_prepare_try() to
buf_page_get_gen().

As searching for id of trx, which modified secondary index record, is
quite expensive operation, restrict its usage for master. System variable
was added to remove the restriction for testing simplifying. The
variable exists only either for debug build or for build with
-DINNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY option to increase the
probability of catching bugs for release build with RQG.

Reviewed by Marko Mäkelä, Debarun Banerjee.
parent cdde97db
SET @old_innodb_enable_xap_unlock_unmodified_for_primary_debug=
@@GLOBAL.innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug= 1;
CREATE TABLE t1 (a int, b int, c int,
INDEX i1(a),
INDEX i2(b))
......@@ -19,4 +22,6 @@ disconnect con1;
connection default;
XA COMMIT "t1";
DROP TABLE t1;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug=
@old_innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_lock_wait_timeout= @old_timeout;
SET @old_innodb_enable_xap_unlock_unmodified_for_primary_debug=
@@GLOBAL.innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug= 1;
SET @saved_dbug = @@GLOBAL.debug_dbug;
CREATE TABLE t(id INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=0;
InnoDB 0 transactions not purged
INSERT INTO t VALUES (10), (20), (30);
connect prevent_purge,localhost,root,,;
start transaction with consistent snapshot;
connection default;
DELETE FROM t WHERE id = 20;
SET @@GLOBAL.debug_dbug=
"+d,enable_row_purge_remove_clust_if_poss_low_sync_point";
XA START '1';
UPDATE t SET id=40 WHERE id=30;
XA END '1';
connection prevent_purge;
COMMIT;
SET DEBUG_SYNC=
'now WAIT_FOR row_purge_remove_clust_if_poss_low_before_delete';
SET @@GLOBAL.debug_dbug=
"-d,enable_row_purge_remove_clust_if_poss_low_sync_point";
connection default;
SET DEBUG_SYNC=
"lock_rec_unlock_unmodified_start SIGNAL lock_sys_latched WAIT_FOR cont";
XA PREPARE '1';
connection prevent_purge;
SET DEBUG_SYNC= 'now SIGNAL row_purge_remove_clust_if_poss_low_cont';
SET DEBUG_SYNC= 'now SIGNAL cont';
disconnect prevent_purge;
connection default;
XA COMMIT '1';
SET DEBUG_SYNC="RESET";
TRUNCATE TABLE t;
InnoDB 0 transactions not purged
INSERT INTO t VALUES (10), (20), (30);
connect prevent_purge,localhost,root,,;
start transaction with consistent snapshot;
connection default;
DELETE FROM t WHERE id = 20;
SET @@GLOBAL.debug_dbug=
"+d,enable_row_purge_remove_clust_if_poss_low_sync_point";
SET @@GLOBAL.debug_dbug="+d,skip_lock_release_on_prepare_try";
XA START '1';
UPDATE t SET id=40 WHERE id=30;
XA END '1';
connection prevent_purge;
COMMIT;
SET DEBUG_SYNC=
'now WAIT_FOR row_purge_remove_clust_if_poss_low_before_delete';
SET @@GLOBAL.debug_dbug=
"-d,enable_row_purge_remove_clust_if_poss_low_sync_point";
connection default;
SET DEBUG_SYNC=
"lock_rec_unlock_unmodified_start SIGNAL lock_sys_latched WAIT_FOR cont";
XA PREPARE '1';
connection prevent_purge;
SET DEBUG_SYNC= 'now SIGNAL row_purge_remove_clust_if_poss_low_cont';
SET DEBUG_SYNC= 'now SIGNAL cont';
disconnect prevent_purge;
connection default;
XA COMMIT '1';
SET DEBUG_SYNC="RESET";
TRUNCATE TABLE t;
SET @@GLOBAL.debug_dbug = @saved_dbug;
DROP TABLE t;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug=
@old_innodb_enable_xap_unlock_unmodified_for_primary_debug;
--source include/have_innodb.inc
--source include/count_sessions.inc
--source include/have_debug.inc
SET @old_innodb_enable_xap_unlock_unmodified_for_primary_debug=
@@GLOBAL.innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug= 1;
CREATE TABLE t1 (a int, b int, c int,
INDEX i1(a),
......@@ -30,5 +35,8 @@ SELECT * FROM t1 FORCE INDEX (i1) WHERE a=2 AND b=1 FOR UPDATE;
XA COMMIT "t1";
DROP TABLE t1;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug=
@old_innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_lock_wait_timeout= @old_timeout;
--source include/wait_until_count_sessions.inc
--source include/have_innodb.inc
--source include/count_sessions.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
SET @old_innodb_enable_xap_unlock_unmodified_for_primary_debug=
@@GLOBAL.innodb_enable_xap_unlock_unmodified_for_primary_debug;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug= 1;
SET @saved_dbug = @@GLOBAL.debug_dbug;
CREATE TABLE t(id INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=0;
--let $i = 2
while ($i) {
--source include/wait_all_purged.inc
INSERT INTO t VALUES (10), (20), (30);
--connect(prevent_purge,localhost,root,,)
start transaction with consistent snapshot;
--connection default
DELETE FROM t WHERE id = 20;
SET @@GLOBAL.debug_dbug=
"+d,enable_row_purge_remove_clust_if_poss_low_sync_point";
# Cover both lock_release_on_prepare() and lock_release_on_prepare_try()
# functions
if ($i == 1) {
SET @@GLOBAL.debug_dbug="+d,skip_lock_release_on_prepare_try";
}
XA START '1';
UPDATE t SET id=40 WHERE id=30;
XA END '1';
--connection prevent_purge
COMMIT;
# stop purge worker after it requested page X-latch, but before
# lock_update_delete() call
SET DEBUG_SYNC=
'now WAIT_FOR row_purge_remove_clust_if_poss_low_before_delete';
SET @@GLOBAL.debug_dbug=
"-d,enable_row_purge_remove_clust_if_poss_low_sync_point";
--connection default
# lock_rec_unlock_unmodified() is executed either under lock_sys.wr_lock() or
# under a combination of lock_sys.rd_lock() + record locks hash table cell
# latch. Stop it before page latch request.
SET DEBUG_SYNC=
"lock_rec_unlock_unmodified_start SIGNAL lock_sys_latched WAIT_FOR cont";
--send XA PREPARE '1'
--connection prevent_purge
# let purge thread to continue execution and invoke lock_update_delete(),
# which, in turns, requests locks_sys related latches.
SET DEBUG_SYNC= 'now SIGNAL row_purge_remove_clust_if_poss_low_cont';
SET DEBUG_SYNC= 'now SIGNAL cont';
--disconnect prevent_purge
--connection default
# deadlock if the bug is not fixed, as lock_rec_unlock_unmodified() requests
# page latch acquired by purge worker, and the purge worker requests lock_sys
# related latches in lock_update_delete() call, acquired by the current XA
# in lock_rec_unlock_unmodified() caller.
--reap
XA COMMIT '1';
SET DEBUG_SYNC="RESET";
TRUNCATE TABLE t;
--dec $i
}
SET @@GLOBAL.debug_dbug = @saved_dbug;
DROP TABLE t;
SET GLOBAL innodb_enable_xap_unlock_unmodified_for_primary_debug=
@old_innodb_enable_xap_unlock_unmodified_for_primary_debug;
--source include/wait_until_count_sessions.inc
......@@ -535,6 +535,18 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY YES
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY_DEBUG
SESSION_VALUE NULL
DEFAULT_VALUE OFF
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Unlock unmodified records on XA PREPARE for primary.
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME INNODB_ENCRYPTION_ROTATE_KEY_AGE
SESSION_VALUE NULL
DEFAULT_VALUE 1
......
......@@ -5642,6 +5642,10 @@ thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
return 0;
}
/* Returns whether the thd is slave worker thread */
extern "C" bool thd_is_slave(const MYSQL_THD thd) {
return thd->rgi_slave;
}
extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
{
......
......@@ -497,6 +497,14 @@ IF(MSVC)
TARGET_COMPILE_OPTIONS(innobase PRIVATE "/wd4065")
ENDIF()
OPTION(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY "Enable innodb_enable_xap_unlock_unmodified_for_primary_debug system variable even for release build" OFF)
IF(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY)
ADD_COMPILE_FLAGS(
lock/lock0lock.cc
handler/ha_innodb.cc
COMPILE_FLAGS "-DINNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY")
ENDIF()
IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
TARGET_LINK_LIBRARIES(innobase tpool mysys)
ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
......
......@@ -208,26 +208,36 @@ ATTRIBUTE_COLD void btr_read_failed(dberr_t err, const dict_index_t &index)
}
/** Get an index page and declare its latching order level.
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
@param[out] first set if this is a first-time access to the page
@param index index tree
@param page page number
@param latch_mode latch mode
@param merge whether change buffer merge should be attempted
@param mtr mini-transaction
@param err error code
@param first set if this is a first-time access to the page
@param page_get_mode page get mode
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err, bool *first)
buf_block_t *btr_block_get(const dict_index_t &index, uint32_t page,
rw_lock_type_t latch_mode, bool merge, mtr_t *mtr,
dberr_t *err, bool *first
#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
, ulint page_get_mode
#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
)
{
ut_ad(mode != RW_NO_LATCH);
ut_ad(latch_mode != RW_NO_LATCH);
dberr_t local_err;
if (!err)
err= &local_err;
buf_block_t *block=
buf_page_get_gen(page_id_t{index.table->space->id, page},
index.table->space->zip_size(), mode, nullptr, BUF_GET,
mtr, err, merge && !index.is_clust());
buf_page_get_gen(page_id_t{index.table->space->id, page},
index.table->space->zip_size(), latch_mode, nullptr,
#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
page_get_mode,
#else
BUF_GET,
#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
mtr, err, merge && !index.is_clust());
ut_ad(!block == (*err != DB_SUCCESS));
if (UNIV_LIKELY(block != nullptr))
......
......@@ -253,6 +253,7 @@ void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
/* commit_persist() already reset this. */
ut_ad(!lock.was_chosen_as_deadlock_victim);
lock.n_rec_locks= 0;
lock.set_nth_bit_calls= 0;
while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
{
UT_LIST_REMOVE(lock.evicted_tables, table);
......
......@@ -202,6 +202,12 @@ extern uint srv_n_fil_crypt_iops;
my_bool innodb_evict_tables_on_commit_debug;
#endif
#if defined(UNIV_DEBUG) || \
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY)
my_bool innodb_enable_xap_unlock_unmodified_for_primary_debug;
#endif /* defined(UNIV_DEBUG) ||
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY) */
/** File format constraint for ALTER TABLE */
ulong innodb_instant_alter_column_allowed;
......@@ -19722,6 +19728,15 @@ static MYSQL_SYSVAR_UINT(saved_page_number_debug,
NULL, NULL, 0, 0, UINT_MAX32, 0);
#endif /* UNIV_DEBUG */
#if defined(UNIV_DEBUG) || \
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY)
static MYSQL_SYSVAR_BOOL(enable_xap_unlock_unmodified_for_primary_debug,
innodb_enable_xap_unlock_unmodified_for_primary_debug, PLUGIN_VAR_NOCMDARG,
"Unlock unmodified records on XA PREPARE for primary.",
NULL, NULL, FALSE);
#endif /* defined(UNIV_DEBUG) ||
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY) */
static MYSQL_SYSVAR_BOOL(force_primary_key,
srv_force_primary_key,
PLUGIN_VAR_OPCMDARG,
......@@ -19959,6 +19974,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(fil_make_page_dirty_debug),
MYSQL_SYSVAR(saved_page_number_debug),
#endif /* UNIV_DEBUG */
#if defined(UNIV_DEBUG) || \
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY)
MYSQL_SYSVAR(enable_xap_unlock_unmodified_for_primary_debug),
#endif /* defined(UNIV_DEBUG) ||
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY) */
MYSQL_SYSVAR(force_primary_key),
MYSQL_SYSVAR(fatal_semaphore_wait_threshold),
/* Table page compression feature */
......
......@@ -85,18 +85,22 @@ btr_root_adjust_on_import(
ATTRIBUTE_COLD void btr_read_failed(dberr_t err, const dict_index_t &index);
/** Get an index page and declare its latching order level.
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
@param[out] first set if this is a first-time access to the page
@param index index tree
@param page page number
@param latch_mode latch mode
@param merge whether change buffer merge should be attempted
@param mtr mini-transaction
@param err error code
@param first set if this is a first-time access to the page
@param page_get_mode page get mode
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err= nullptr,
bool *first= nullptr);
buf_block_t *btr_block_get(const dict_index_t &index, uint32_t page,
rw_lock_type_t latch_mode, bool merge, mtr_t *mtr,
dberr_t *err= nullptr, bool *first= nullptr
#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
, ulint page_get_mode= BUF_GET
#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
);
/**************************************************************//**
Gets the index id field of a page.
......
......@@ -820,6 +820,15 @@ class lock_sys_t
bool is_writer() const { return latch.have_wr(); }
/** @return whether the current thread is holding lock_sys.latch */
bool is_holder() const { return latch.have_any(); }
/** Returns whether a cell of hash table where the lock is stored is latched.
@param lock the lock which cell is checked
@return whether the lock's cell is latched */
bool is_cell_locked(const lock_t &lock) {
return hash_table::latch(
hash_get(lock.type_mode)
.cell_get(lock.un_member.rec_lock.page_id.fold()))
->is_locked();
}
/** Assert that a lock shard is exclusively latched (by some thread) */
void assert_locked(const lock_t &lock) const;
/** Assert that a table lock shard is exclusively latched by this thread */
......
......@@ -474,10 +474,12 @@ inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
{
ut_ad(!lock->is_table());
#ifdef SUX_LOCK_GENERIC
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
|| lock_sys.is_cell_locked(*lock));
#else
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
|| (xtest() && !lock->trx->mutex_is_locked()));
|| lock_sys.is_cell_locked(*lock)
|| (xtest() && !lock->trx->mutex_is_locked()));
#endif
ut_ad(i < lock->un_member.rec_lock.n_bits);
......
......@@ -98,6 +98,7 @@ lock_rec_set_nth_bit(
|| (xtest() && !lock->trx->mutex_is_locked()));
#endif
lock->trx->lock.n_rec_locks++;
lock->trx->lock.set_nth_bit_calls++;
}
/*********************************************************************//**
......
......@@ -434,6 +434,13 @@ extern uint srv_sys_space_size_debug;
extern bool srv_log_file_created;
#endif /* UNIV_DEBUG */
#if defined(UNIV_DEBUG) || \
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY)
/** Enable unmodified records unlocking on XA PREPARE for master. */
extern my_bool innodb_enable_xap_unlock_unmodified_for_primary_debug;
#endif /* defined(UNIV_DEBUG) ||
defined(INNODB_ENABLE_XAP_UNLOCK_UNMODIFIED_FOR_PRIMARY) */
extern ulint srv_dml_needed_delay;
/** innodb_purge_threads; the number of purge tasks to use */
......
......@@ -399,6 +399,9 @@ struct trx_lock_t
/** number of record locks; protected by lock_sys.assert_locked(page_id) */
ulint n_rec_locks;
/** number of lock_rec_set_nth_bit() calls since the start of transaction;
protected by lock_sys.is_writer() or trx->mutex_is_owner(). */
ulint set_nth_bit_calls;
};
/** Logical first modification time of a table in a transaction */
......
This diff is collapsed.
......@@ -215,6 +215,17 @@ row_purge_remove_clust_if_poss_low(
always refer to an existing undo log record. */
ut_ad(row_get_rec_trx_id(rec, index, offsets));
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("enable_row_purge_remove_clust_if_poss_low_sync_point",
debug_sync_set_action
(current_thd,
STRING_WITH_LEN(
"now SIGNAL "
"row_purge_remove_clust_if_poss_low_before_delete "
"WAIT_FOR "
"row_purge_remove_clust_if_poss_low_cont"));
);
#endif
if (mode == BTR_MODIFY_LEAF) {
success = DB_FAIL != btr_cur_optimistic_delete(
btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
......
......@@ -114,6 +114,8 @@ trx_init(
trx->lock.n_rec_locks = 0;
trx->lock.set_nth_bit_calls = 0;
trx->dict_operation = false;
trx->error_state = DB_SUCCESS;
......@@ -353,6 +355,7 @@ trx_t *trx_create()
ut_ad(trx->mod_tables.empty());
ut_ad(trx->lock.n_rec_locks == 0);
ut_ad(trx->lock.set_nth_bit_calls == 0);
ut_ad(trx->lock.table_cached == 0);
ut_ad(trx->lock.rec_cached == 0);
ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment