Commit c68007d9 authored by Marko Mäkelä

MDEV-24738 Improve the InnoDB deadlock checker

A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.

The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.

Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx will be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.

Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.

To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().

The old DeadlockChecker was holding lock_sys.latch, even though
lock_sys.wait_mutex should be less contended (and thus preferable)
in the likely case that no deadlock is present.

lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().

DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.

lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.

Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.

Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (up to 2 times in each of the 2 find_cycle() invocations, plus
1 time for reporting and choosing the victim).

Deadlock::check_and_resolve(): Find and resolve a deadlock.

lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.

Reviewed by: Vladislav Vaintroub
parent 3ddb4fdd
......@@ -104,6 +104,10 @@ disconnect con2;
connection default;
SET @@global.innodb_strict_mode = @old_innodb_strict_mode;
SET @@global.innodb_file_per_table = @old_innodb_file_per_table;
SET @save_detect= @@GLOBAL.innodb_deadlock_detect;
SET @save_report= @@GLOBAL.innodb_deadlock_report;
SET GLOBAL innodb_deadlock_detect=ON;
SET GLOBAL innodb_deadlock_report=BASIC;
SET NAMES utf8;
CREATE TABLE `t``\""e` (a INT, PRIMARY KEY (a))
ENGINE=InnoDB
......@@ -148,6 +152,8 @@ set sql_mode = 'ANSI_QUOTES';
SHOW ENGINE InnoDB STATUS;
Type Name Status
InnoDB index PRIMARY of table `test`.`t``\""e` /* Partition `p0``\""e`, Subpartition `sp0``\""e` */
SET GLOBAL innodb_deadlock_detect= @save_detect;
SET GLOBAL innodb_deadlock_report= @save_report;
set @@sql_mode = @old_sql_mode;
connection con1;
ROLLBACK;
......
......@@ -100,6 +100,10 @@ SET GLOBAL innodb_read_only_compressed=@save_innodb_read_only_compressed;
#
# Bug#32430 - show engine innodb status causes errors
#
SET @save_detect= @@GLOBAL.innodb_deadlock_detect;
SET @save_report= @@GLOBAL.innodb_deadlock_report;
SET GLOBAL innodb_deadlock_detect=ON;
SET GLOBAL innodb_deadlock_report=BASIC;
SET NAMES utf8;
CREATE TABLE `t``\""e` (a INT, PRIMARY KEY (a))
ENGINE=InnoDB
......@@ -150,6 +154,8 @@ set @old_sql_mode = @@sql_mode;
set sql_mode = 'ANSI_QUOTES';
--replace_regex /.*RECORD LOCKS space id [0-9]* page no [0-9]* n bits [0-9]* // / trx id .*// /.*index .* in // /trx table locks [0-9]* // /total table locks [0-9]* //
SHOW ENGINE InnoDB STATUS;
SET GLOBAL innodb_deadlock_detect= @save_detect;
SET GLOBAL innodb_deadlock_report= @save_report;
set @@sql_mode = @old_sql_mode;
connection con1;
REAP;
......
@@ -16,7 +16,10 @@
connection default;
SELECT * FROM t1 WHERE id = 2 FOR UPDATE;
connection con2;
+connection con1;
+COMMIT;
disconnect con1;
+connection con2;
ROLLBACK;
disconnect con2;
connection default;
......@@ -16,9 +16,9 @@ SELECT * FROM t1 WHERE id = 1 FOR UPDATE;
connection default;
SELECT * FROM t1 WHERE id = 2 FOR UPDATE;
connection con2;
disconnect con1;
ROLLBACK;
disconnect con2;
disconnect con1;
connection default;
ROLLBACK;
DROP TABLE t1;
......@@ -39,14 +39,18 @@ connection con2;
if (!$have_deadlock) {
--error ER_LOCK_WAIT_TIMEOUT
reap;
disconnect con1;
}
if ($have_deadlock) {
--error 0,ER_LOCK_DEADLOCK
connection con1;
COMMIT;
disconnect con1;
connection con2;
--error 0,ER_LOCK_DEADLOCK,ER_LOCK_WAIT_TIMEOUT
reap;
}
ROLLBACK;
disconnect con2;
disconnect con1;
#
# Note here that con1 is the older transaction as it
......
......@@ -417,6 +417,18 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME INNODB_DEADLOCK_REPORT
SESSION_VALUE NULL
DEFAULT_VALUE full
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE ENUM
VARIABLE_COMMENT How to report deadlocks (if innodb_deadlock_detect=ON).
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST off,basic,full
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_DEFAULT_ENCRYPTION_KEY_ID
SESSION_VALUE 1
DEFAULT_VALUE 1
......
......@@ -372,6 +372,26 @@ TYPELIB innodb_flush_method_typelib = {
NULL
};
/** Names of allowed values of innodb_deadlock_report.
The position of each name in this array is expected to equal the
corresponding Deadlock::REPORT_* constant (see the static_assert checks
below), so the order of the entries must not be changed independently of
that enumeration. The list is terminated by NullS. */
static const char *innodb_deadlock_report_names[]= {
"off", /* Do not report any details of deadlocks */
"basic", /* Report waiting transactions and lock requests */
"full", /* Also report blocking locks */
NullS
};
/* Compile-time checks that the enumeration values match the order of
the names above. */
static_assert(Deadlock::REPORT_OFF == 0, "compatibility");
static_assert(Deadlock::REPORT_BASIC == 1, "compatibility");
static_assert(Deadlock::REPORT_FULL == 2, "compatibility");
/** Enumeration of innodb_deadlock_report; describes the set of names in
innodb_deadlock_report_names for the innodb_deadlock_report variable. */
static TYPELIB innodb_deadlock_report_typelib = {
/* number of names; the trailing NullS terminator is not counted */
array_elements(innodb_deadlock_report_names) - 1,
"innodb_deadlock_report_typelib",
innodb_deadlock_report_names,
/* NOTE(review): presumably the optional array of name lengths;
confirm against the TYPELIB declaration */
NULL
};
/** Allowed values of innodb_change_buffering */
static const char* innodb_change_buffering_names[] = {
"none", /* IBUF_USE_NONE */
......@@ -4476,6 +4496,7 @@ static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
trx->mutex_unlock();
}
}
lock_sys.deadlock_check();
mysql_mutex_unlock(&lock_sys.wait_mutex);
}
}
......@@ -18601,13 +18622,18 @@ static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
" when flushing a block",
NULL, NULL, 1, 0, 2, 0);
static MYSQL_SYSVAR_BOOL(deadlock_detect, innobase_deadlock_detect,
static MYSQL_SYSVAR_BOOL(deadlock_detect, innodb_deadlock_detect,
PLUGIN_VAR_NOCMDARG,
"Enable/disable InnoDB deadlock detector (default ON)."
" if set to OFF, deadlock detection is skipped,"
" and we rely on innodb_lock_wait_timeout in case of deadlock.",
NULL, NULL, TRUE);
/** innodb_deadlock_report: selects how much detail is reported about a
deadlock. The value is stored in innodb_deadlock_report and is one of the
names listed in innodb_deadlock_report_typelib; the default is
Deadlock::REPORT_FULL.
NOTE(review): the two NULL arguments are presumably the check and update
callbacks of the plugin variable (none installed here); confirm against
the MYSQL_SYSVAR_ENUM definition. */
static MYSQL_SYSVAR_ENUM(deadlock_report, innodb_deadlock_report,
PLUGIN_VAR_RQCMDARG,
"How to report deadlocks (if innodb_deadlock_detect=ON).",
NULL, NULL, Deadlock::REPORT_FULL, &innodb_deadlock_report_typelib);
static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor,
PLUGIN_VAR_RQCMDARG,
"Percentage of B-tree page filled during bulk insert",
......@@ -19190,6 +19216,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(force_load_corrupted),
MYSQL_SYSVAR(lock_wait_timeout),
MYSQL_SYSVAR(deadlock_detect),
MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size),
MYSQL_SYSVAR(log_file_size),
......
......@@ -43,7 +43,15 @@ Created 5/7/1996 Heikki Tuuri
class ReadView;
/** The value of innodb_deadlock_detect */
extern my_bool innobase_deadlock_detect;
extern my_bool innodb_deadlock_detect;
/** The value of innodb_deadlock_report */
extern ulong innodb_deadlock_report;
namespace Deadlock
{
/** The allowed values of innodb_deadlock_report.
REPORT_OFF: do not report any details of deadlocks;
REPORT_BASIC: report transactions and waiting locks;
REPORT_FULL: report also the blocking locks. */
enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
}
/*********************************************************************//**
Gets the heap_no of the smallest user record on a page.
......@@ -704,22 +712,21 @@ class lock_sys_t
hash_table prdt_hash;
/** page locks for SPATIAL INDEX */
hash_table prdt_page_hash;
/** number of deadlocks detected; protected by mutex */
ulint deadlocks;
/** mutex covering lock waits; @see trx_lock_t::wait_lock */
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
private:
/** Pending number of lock waits; protected by wait_mutex */
ulint wait_pending;
/** Cumulative number of lock waits; protected by wait_mutex */
ulint wait_count;
/** Pending number of lock waits; protected by wait_mutex */
uint32_t wait_pending;
/** Cumulative wait time; protected by wait_mutex */
ulint wait_time;
uint32_t wait_time;
/** Longest wait time; protected by wait_mutex */
ulint wait_time_max;
uint32_t wait_time_max;
public:
/** number of deadlocks detected; protected by wait_mutex */
ulint deadlocks;
/**
Constructor.
......@@ -821,6 +828,10 @@ class lock_sys_t
void close();
/** Check for deadlocks */
static void deadlock_check();
/** Note that a record lock wait started */
inline void wait_start();
......@@ -940,8 +951,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
#ifdef WITH_WSREP
que_thr_t* thr, /*!< thread owning trx */
#endif
unsigned type_mode,/*!< in: lock mode and wait flag */
......@@ -961,6 +972,7 @@ void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
/** Create a new record lock and inserts it to the lock queue,
without checking for deadlocks or conflicts.
@param[in] c_lock conflicting lock, or NULL
@param[in] type_mode lock mode and wait flag
@param[in] page_id index page number
@param[in] page R-tree index page, or NULL
......@@ -971,8 +983,8 @@ without checking for deadlocks or conflicts.
@return created lock */
lock_t*
lock_rec_create_low(
lock_t* c_lock,
#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
que_thr_t* thr, /*!< thread owning trx */
#endif
unsigned type_mode,
......@@ -985,6 +997,7 @@ lock_rec_create_low(
/** Enqueue a waiting request for a lock which cannot be granted immediately.
Check for deadlocks.
@param[in] c_lock conflicting lock
@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
possibly ORed with LOCK_GAP or
LOCK_REC_NOT_GAP, ORed with
......@@ -1002,9 +1015,7 @@ Check for deadlocks.
@retval DB_DEADLOCK if this transaction was chosen as the victim */
dberr_t
lock_rec_enqueue_waiting(
#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
#endif
lock_t* c_lock,
unsigned type_mode,
const page_id_t id,
const page_t* page,
......
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -60,8 +60,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
#ifdef WITH_WSREP
que_thr_t* thr, /*!< thread owning trx */
#endif
unsigned type_mode,/*!< in: lock mode and wait flag */
......@@ -76,8 +76,9 @@ lock_rec_create(
{
btr_assert_not_corrupted(block, index);
return lock_rec_create_low(
c_lock,
#ifdef WITH_WSREP
c_lock, thr,
thr,
#endif
type_mode, block->page.id(), block->frame, heap_no,
index, trx, caller_owns_trx_mutex);
......
......@@ -416,14 +416,6 @@ lock_rec_get_prev(
const lock_t* in_lock,/*!< in: record lock */
ulint heap_no);/*!< in: heap number of the record */
/*********************************************************************//**
Cancels a waiting lock request and releases possible other transactions
waiting behind it. */
void
lock_cancel_waiting_and_release(
/*============================*/
lock_t* lock); /*!< in/out: waiting lock request */
/*********************************************************************//**
Checks if some transaction has an implicit x-lock on a record in a clustered
index.
......
......@@ -425,15 +425,13 @@ struct trx_lock_t
trx->mutex, by the thread that is executing the transaction.
Set to nullptr when holding lock_sys.wait_mutex. */
Atomic_relaxed<lock_t*> wait_lock;
/** Transaction being waited for; protected by lock_sys.wait_mutex */
trx_t *wait_trx;
/** condition variable for !wait_lock; used with lock_sys.wait_mutex */
pthread_cond_t cond;
/** lock wait start time, protected only by lock_sys.wait_mutex */
my_hrtime_t suspend_time;
/** DeadlockChecker::search() uses this to keep track of visited locks.
Protected by lock_sys.is_writer(). */
uint64_t deadlock_mark;
#ifdef WITH_WSREP
/** 2=high priority wsrep thread has marked this trx to abort;
1=another transaction chose this as a victim in deadlock resolution. */
......
This diff is collapsed.
......@@ -463,12 +463,16 @@ lock_prdt_add_to_queue(
}
create:
lock_t* lock = lock_rec_create(
/* Note: We will not pass any conflicting lock to lock_rec_create(),
because we should be moving an existing waiting lock request. */
ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx);
lock_t* lock = lock_rec_create(nullptr,
#ifdef WITH_WSREP
NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
nullptr,
#endif
type_mode, block, PRDT_HEAPNO, index, trx,
caller_owns_trx_mutex);
type_mode, block, PRDT_HEAPNO, index,
trx, caller_owns_trx_mutex);
if (lock->type_mode & LOCK_PREDICATE) {
lock_prdt_set_prdt(lock, prdt);
......@@ -529,11 +533,8 @@ lock_prdt_insert_check_and_lock(
trx->mutex_lock();
/* Allocate MBR on the lock heap */
lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap);
err= lock_rec_enqueue_waiting(
#ifdef WITH_WSREP
c_lock,
#endif
mode, id, block->frame, PRDT_HEAPNO, index, thr, prdt);
err= lock_rec_enqueue_waiting(c_lock, mode, id, block->frame,
PRDT_HEAPNO, index, thr, prdt);
trx->mutex_unlock();
}
}
......@@ -732,8 +733,9 @@ lock_prdt_lock(
if (lock == NULL) {
lock = lock_rec_create(
NULL,
#ifdef WITH_WSREP
NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
NULL, /* FIXME: replicate SPATIAL INDEX locks */
#endif
prdt_mode, block, PRDT_HEAPNO,
index, trx, FALSE);
......@@ -762,10 +764,7 @@ lock_prdt_lock(
if (wait_for != NULL) {
err = lock_rec_enqueue_waiting(
#ifdef WITH_WSREP
NULL, /* FIXME: replicate
SPATIAL INDEX locks */
#endif
wait_for,
prdt_mode,
id, block->frame, PRDT_HEAPNO,
index, thr, prdt);
......@@ -835,8 +834,9 @@ lock_place_prdt_page_lock(
if (lock == NULL) {
lock = lock_rec_create_low(
NULL,
#ifdef WITH_WSREP
NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
NULL, /* FIXME: replicate SPATIAL INDEX locks */
#endif
mode, page_id, NULL, PRDT_HEAPNO,
index, trx, FALSE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment