Commit cf2480dd authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-21452: Retain the watchdog only on dict_sys.mutex, for performance

Most hangs seem to involve dict_sys.mutex. While holding lock_sys.mutex
we rarely acquire any buffer pool page latches, which are a frequent
source of potential hangs.
parent ff5d306e
connect con1,localhost,root,,;
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
SET DEBUG_SYNC='create_table SIGNAL stuck WAIT_FOR ever';
CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
connect con2,localhost,root,,;
drop table if exists t1;
connection con1;
create table t1 (id integer, x integer) engine = InnoDB;
insert into t1 values(0, 0);
set DEBUG_DBUG='+d,fatal-semaphore-timeout';
set autocommit=0;
# Sending query on con1,
# the session will hold lock table mutex and sleep
SELECT * from t1 where id = 0 FOR UPDATE;
connection con2;
set autocommit=0;
# Sending query on con2,
# the session will be blocked on the lock table mutex and
# thus be put into sync arry
SELECT * from t1 where id = 0 FOR UPDATE;
SET DEBUG_SYNC='now WAIT_FOR stuck';
FLUSH TABLES;
SELECT * FROM t1;
connection default;
# Waitting for mysqld to crash
# Mysqld crash was detected
# Waitting for reconnect after mysqld restarts
# Reconnected after mysqld was successfully restarted
# Cleaning up before exit
drop table if exists t1;
# Clean exit
# Waiting for mariadbd to crash
# Crash was detected
DROP TABLE t1, t2;
Warnings:
Warning 1932 Table 'test.t2' doesn't exist in engine
# Only test in debug mode since DBUG_EXECUTE_IF is used
--source include/have_debug.inc
--source include/have_debug_sync.inc
# Can't test this with embedded server
--source include/not_embedded.inc
......@@ -11,56 +11,17 @@
--source include/have_innodb.inc
connect (con1,localhost,root,,);
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
SET DEBUG_SYNC='create_table SIGNAL stuck WAIT_FOR ever';
send CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
connect (con2,localhost,root,,);
SET DEBUG_SYNC='now WAIT_FOR stuck';
FLUSH TABLES;
--disable_warnings
drop table if exists t1;
--enable_warnings
connection con1;
eval create table t1 (id integer, x integer) engine = InnoDB;
insert into t1 values(0, 0);
# Enable the debug injection.
set DEBUG_DBUG='+d,fatal-semaphore-timeout';
set autocommit=0;
# The following query will hang for an hour since the debug injection
# code will sleep an hour after holding the lock table mutex
--echo # Sending query on con1,
--echo # the session will hold lock table mutex and sleep
--send
SELECT * from t1 where id = 0 FOR UPDATE;
# To make sure con1 holding the lock table mutex and sleeping
--sleep 2
connection con2;
set autocommit=0;
# The following query will be blocked on the lock table mutex held by
# con1 so it will be put into sync array.
--echo # Sending query on con2,
--echo # the session will be blocked on the lock table mutex and
--echo # thus be put into sync arry
--send
SELECT * from t1 where id = 0 FOR UPDATE;
# Waitting for mysqld to abort due to fatal semaphore timeout.
# Please note that, in the master.opt file, the fatal timeout
# was set to 1 second, but in mysqld debug mode, this timeout
# value will be timed 10 because UNIV_DEBUG_VALGRIND is set
# (see sync_array_print_long_waits_low() in storage/innobase/sync/sync0arr.cc)
# so the actual timeout will be 1 * 10 = 10 seconds. Besides,
# mysqld will abort after detecting this fatal timeout 10 times in
# a loop with interval of 1 second (see srv_error_monitor_thread
# thread in torage/innobase/srv/srv0srv.cc), so mysqld will abort
# in 1 * 10 + 1 * 10 = 20 seconds after con2 being blocked on
# the lock table mutex.
#
# P.S. the default fatal sempahore timeout is 600 seconds,
# so mysqld will abort after 600 * 10 + 1 * 10 = 6010 seconds
# in debug mode and 600 + 1 * 10 = 610 seconds in release mode.
# The following query will be blocked on the dict_sys.mutex held by con1
send SELECT * FROM t1;
connection default;
......@@ -73,10 +34,10 @@ call mtr.add_suppression(".*");
# The crash is expected
exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
--echo # Waitting for mysqld to crash
--echo # Waiting for mariadbd to crash
# It will take 20 seconds to detect the long semaphore and mysqld to abort.
# This test will be treated as pass as long as mysqld crash/restart is dectected
# It may take 20 seconds to detect the long semaphore and mysqld to abort.
# This test will be treated as pass as long as mysqld crash/restart is detected
# in 80 seconds.
let $counter= 80;
let $mysql_errno= 0;
......@@ -94,8 +55,7 @@ while (!$mysql_errno)
--sleep 1
}
--echo # Mysqld crash was detected
--echo # Waitting for reconnect after mysqld restarts
--echo # Crash was detected
enable_reconnect;
connection default;
......@@ -105,11 +65,4 @@ connection default;
# Call script that will poll the server waiting for it to be back online again
source include/wait_until_connected_again.inc;
--echo # Reconnected after mysqld was successfully restarted
--echo # Cleaning up before exit
--disable_warnings
drop table if exists t1;
--enable_warnings
--echo # Clean exit
DROP TABLE t1, t2;
......@@ -706,15 +706,9 @@ class lock_sys_t
{
bool m_initialised;
/** The my_hrtime_coarse().val of the oldest mutex_lock_wait() start, or 0 */
std::atomic<ulonglong> mutex_wait_start;
/** mutex proteting the locks */
MY_ALIGNED(CACHE_LINE_SIZE) mysql_mutex_t mutex;
public:
/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
static const char fatal_msg[];
/** record locks */
hash_table_t rec_hash;
/** predicate locks for SPATIAL INDEX */
......@@ -747,10 +741,6 @@ class lock_sys_t
bool is_initialised() { return m_initialised; }
private:
/** Acquire lock_sys.mutex */
ATTRIBUTE_NOINLINE void mutex_lock_wait();
public:
#ifdef HAVE_PSI_MUTEX_INTERFACE
/** Try to acquire lock_sys.mutex */
ATTRIBUTE_NOINLINE int mutex_trylock();
......@@ -762,7 +752,7 @@ class lock_sys_t
/** Try to acquire lock_sys.mutex */
int mutex_trylock() { return mysql_mutex_trylock(&mutex); }
/** Aqcuire lock_sys.mutex */
void mutex_lock() { if (mutex_trylock()) mutex_lock_wait(); }
void mutex_lock() { mysql_mutex_lock(&mutex); }
/** Release lock_sys.mutex */
void mutex_unlock() { mysql_mutex_unlock(&mutex); }
#endif
......@@ -771,11 +761,6 @@ class lock_sys_t
/** Assert that mutex_lock() has not been invoked */
void mutex_assert_unlocked() const { mysql_mutex_assert_not_owner(&mutex); }
/** @return the my_hrtime_coarse().val of the oldest mutex_lock_wait() start,
assuming that requests are served on a FIFO basis */
ulonglong oldest_wait() const
{ return mutex_wait_start.load(std::memory_order_relaxed); }
/** Wait for a lock to be granted */
void wait_lock(lock_t **lock, mysql_cond_t *cond)
{ while (*lock) mysql_cond_wait(cond, &mutex); }
......
......@@ -304,12 +304,6 @@ static bool lock_rec_validate_page(const buf_block_t *block, bool latched)
/* The lock system */
lock_sys_t lock_sys;
/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
const char lock_sys_t::fatal_msg[]=
"innodb_fatal_semaphore_wait_threshold was exceeded for lock_sys.mutex. "
"Please refer to "
"https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/";
/** We store info on the latest deadlock error to this buffer. InnoDB
Monitor will then fetch it and print */
static bool lock_deadlock_found = false;
......@@ -476,36 +470,12 @@ void lock_sys_t::create(ulint n_cells)
timeout_timer_active = false;
}
void lock_sys_t::mutex_lock_wait()
{
ulonglong now= my_hrtime_coarse().val, old= 0;
if (mutex_wait_start.compare_exchange_strong
(old, now, std::memory_order_relaxed, std::memory_order_relaxed))
{
mysql_mutex_lock(&mutex);
mutex_wait_start.store(0, std::memory_order_relaxed);
return;
}
ut_ad(old);
/* We could have old > now due to our use of my_hrtime_coarse(). */
ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
const ulong threshold= srv_fatal_semaphore_wait_threshold;
if (waited >= threshold)
ib::fatal() << fatal_msg;
if (waited > threshold / 4)
ib::warn() << "A long wait (" << waited
<< " seconds) was observed for lock_sys.mutex";
mysql_mutex_lock(&mutex);
}
#ifdef HAVE_PSI_MUTEX_INTERFACE
/** Try to acquire lock_sys.mutex */
int lock_sys_t::mutex_trylock() { return mysql_mutex_trylock(&mutex); }
/** Acquire lock_sys.mutex */
void lock_sys_t::mutex_lock() { if (mutex_trylock()) mutex_lock_wait(); }
void lock_sys_t::mutex_lock() { mysql_mutex_lock(&mutex); }
/** Release lock_sys.mutex */
void lock_sys_t::mutex_unlock() { mysql_mutex_unlock(&mutex); }
#endif
......@@ -3571,9 +3541,6 @@ lock_table(
lock_sys.mutex_lock();
DBUG_EXECUTE_IF("fatal-semaphore-timeout",
{ os_thread_sleep(3600000000LL); });
/* We have to check if the new lock is compatible with any locks
other transactions have in the table lock queue. */
......
......@@ -2304,6 +2304,8 @@ row_create_table_for_mysql(
ut_d(dict_sys.assert_locked());
ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
DEBUG_SYNC_C("create_table");
DBUG_EXECUTE_IF(
"ib_create_table_fail_at_start_of_row_create_table_for_mysql",
goto err_exit;
......
......@@ -1349,20 +1349,6 @@ void srv_monitor_task(void*)
const ulonglong now = my_hrtime_coarse().val;
const ulong threshold = srv_fatal_semaphore_wait_threshold;
if (ulonglong start = lock_sys.oldest_wait()) {
ulong waited = static_cast<ulong>((now - start) / 1000000);
if (waited >= threshold) {
ib::fatal() << lock_sys.fatal_msg;
}
if (waited == threshold / 4
|| waited == threshold / 2
|| waited == threshold / 4 * 3) {
ib::warn() << "Long wait (" << waited
<< " seconds) for lock_sys.mutex";
}
}
if (ulonglong start = dict_sys.oldest_wait()) {
ulong waited = static_cast<ulong>((now - start) / 1000000);
if (waited >= threshold) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment