Commit ba025501 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-7818: Deadlock occurring with parallel replication and FTWRL

Problem is that FLUSH TABLES WITH READ LOCK first blocks threads from
starting new commits, then waits for running commits to complete. But
in-order parallel replication needs commits to happen in a particular
order, so this can easily deadlock.

To fix this problem, this patch introduces a way to temporarily pause
the parallel replication worker threads. Before starting FTWRL, we let
all worker threads complete in-progress transactions, and then
wait. Then we proceed to take the global read lock. Once the lock is
obtained, we unpause the worker threads. Now commits are blocked from
starting by the global read lock, so the deadlock will no longer occur.
parent 6d96fab7
...@@ -6,6 +6,7 @@ user1 statement/sql/flush flush tables with read lock ...@@ -6,6 +6,7 @@ user1 statement/sql/flush flush tables with read lock
username event_name nesting_event_type username event_name nesting_event_type
username event_name nesting_event_type username event_name nesting_event_type
user1 stage/sql/init STATEMENT user1 stage/sql/init STATEMENT
user1 stage/sql/init STATEMENT
user1 stage/sql/query end STATEMENT user1 stage/sql/query end STATEMENT
user1 stage/sql/closing tables STATEMENT user1 stage/sql/closing tables STATEMENT
user1 stage/sql/freeing items STATEMENT user1 stage/sql/freeing items STATEMENT
......
...@@ -29,8 +29,98 @@ include/start_slave.inc ...@@ -29,8 +29,98 @@ include/start_slave.inc
SELECT * FROM t1 WHERE a >= 10 ORDER BY a; SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
a b a b
10 0 10 0
*** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
include/stop_slave.inc
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 4242;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
BEGIN;
UPDATE t2 SET b=b+1 WHERE a=2;
COMMIT;
BEGIN;
INSERT INTO t2 VALUES (4,10);
COMMIT;
SET SESSION debug_dbug= @old_dbug;
INSERT INTO t2 VALUES (5,0);
INSERT INTO t2 VALUES (6,0);
INSERT INTO t2 VALUES (7,0);
INSERT INTO t2 VALUES (8,0);
INSERT INTO t2 VALUES (9,0);
INSERT INTO t2 VALUES (10,0);
INSERT INTO t2 VALUES (11,0);
INSERT INTO t2 VALUES (12,0);
INSERT INTO t2 VALUES (13,0);
INSERT INTO t2 VALUES (14,0);
INSERT INTO t2 VALUES (15,0);
INSERT INTO t2 VALUES (16,0);
INSERT INTO t2 VALUES (17,0);
INSERT INTO t2 VALUES (18,0);
INSERT INTO t2 VALUES (19,0);
BEGIN;
SELECT * FROM t2 WHERE a=2 FOR UPDATE;
a b
2 0
include/start_slave.inc
FLUSH TABLES WITH READ LOCK;
COMMIT;
STOP SLAVE;
SELECT * FROM t2 ORDER BY a;
a b
1 0
2 1
3 0
4 10
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
UNLOCK TABLES;
include/wait_for_slave_to_stop.inc
include/start_slave.inc
SELECT * FROM t2 ORDER BY a;
a b
1 0
2 1
3 0
4 10
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
*** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
LOCK TABLE t2 WRITE;
FLUSH TABLES WITH READ LOCK;
FLUSH TABLES WITH READ LOCK;
KILL QUERY CID;
ERROR 70100: Query execution was interrupted
UNLOCK TABLES;
UNLOCK TABLES;
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads; SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc include/start_slave.inc
DROP TABLE t1; DROP TABLE t1, t2;
include/rpl_end.inc include/rpl_end.inc
--source include/have_debug.inc
--source include/have_innodb.inc
--source include/have_binlog_format_statement.inc --source include/have_binlog_format_statement.inc
--let $rpl_topology=1->2 --let $rpl_topology=1->2
--source include/rpl_init.inc --source include/rpl_init.inc
...@@ -78,13 +80,144 @@ SET GLOBAL sql_slave_skip_counter= 1; ...@@ -78,13 +80,144 @@ SET GLOBAL sql_slave_skip_counter= 1;
SELECT * FROM t1 WHERE a >= 10 ORDER BY a; SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
# Clean up --echo *** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
--connection server_1
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
--save_master_pos
--connection server_2
--sync_with_master
--source include/stop_slave.inc
--connection server_1
# Create a group commit with two transactions, will be used to provoke the
# problematic thread interaction with FTWRL on the slave.
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 4242;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
BEGIN;
UPDATE t2 SET b=b+1 WHERE a=2;
COMMIT;
BEGIN;
INSERT INTO t2 VALUES (4,10);
COMMIT;
SET SESSION debug_dbug= @old_dbug;
INSERT INTO t2 VALUES (5,0);
INSERT INTO t2 VALUES (6,0);
INSERT INTO t2 VALUES (7,0);
INSERT INTO t2 VALUES (8,0);
INSERT INTO t2 VALUES (9,0);
INSERT INTO t2 VALUES (10,0);
INSERT INTO t2 VALUES (11,0);
INSERT INTO t2 VALUES (12,0);
INSERT INTO t2 VALUES (13,0);
INSERT INTO t2 VALUES (14,0);
INSERT INTO t2 VALUES (15,0);
INSERT INTO t2 VALUES (16,0);
INSERT INTO t2 VALUES (17,0);
INSERT INTO t2 VALUES (18,0);
INSERT INTO t2 VALUES (19,0);
--save_master_pos
--connection server_2
--connect (s1, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
# Block one transaction on a row lock.
BEGIN;
SELECT * FROM t2 WHERE a=2 FOR UPDATE;
--connection server_2
# Wait for slave thread of the other transaction to have the commit lock.
--source include/start_slave.inc
--let $wait_condition= SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
--source include/wait_condition.inc
--connect (s2, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
send FLUSH TABLES WITH READ LOCK;
# The bug was that at this point we were deadlocked.
# The FTWRL command would wait forever for T2 to commit.
# T2 would wait for T1 to commit first, but T1 is waiting for
# the global read lock to be released.
--connection s1
# Release the lock that blocs T1 from replicating.
COMMIT;
--connection s1
send STOP SLAVE;
--connection s2
reap;
--connection server_1
SELECT * FROM t2 ORDER BY a;
--connection s2
UNLOCK TABLES;
--connection s1
reap;
--connection server_2
--source include/wait_for_slave_to_stop.inc
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t2 ORDER BY a;
--echo *** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
--connection server_1
LOCK TABLE t2 WRITE;
--connect (m1,localhost,root,,test)
--connection m1
--let $cid=`SELECT CONNECTION_ID()`
send FLUSH TABLES WITH READ LOCK;
--connect (m2,localhost,root,,test)
# We cannot force the race with DEBUG_SYNC, because the race does not
# exist after fixing the bug. At best we could force a debug sync to
# time out, which is effectively just a sleep.
# So just put a small sleep here; it is enough to trigger the bug in
# most run before the bug fix, and the code should work correctly
# however the thread scheduling happens.
--sleep 0.1
send FLUSH TABLES WITH READ LOCK;
--connection server_1
--replace_result $cid CID
eval KILL QUERY $cid;
--connection m1
--error ER_QUERY_INTERRUPTED
reap;
--connection server_1
UNLOCK TABLES;
--connection m2
reap;
UNLOCK TABLES;
# Clean up.
--connection server_2 --connection server_2
--source include/stop_slave.inc --source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads; SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc --source include/start_slave.inc
--connection server_1 --connection server_1
DROP TABLE t1; DROP TABLE t1, t2;
--source include/rpl_end.inc --source include/rpl_end.inc
...@@ -9525,6 +9525,9 @@ PSI_stage_info stage_waiting_for_prior_transaction_to_commit= { 0, "Waiting for ...@@ -9525,6 +9525,9 @@ PSI_stage_info stage_waiting_for_prior_transaction_to_commit= { 0, "Waiting for
PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit= { 0, "Waiting for prior transaction to start commit before starting next transaction", 0}; PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit= { 0, "Waiting for prior transaction to start commit before starting next transaction", 0};
PSI_stage_info stage_waiting_for_room_in_worker_thread= { 0, "Waiting for room in worker thread event queue", 0}; PSI_stage_info stage_waiting_for_room_in_worker_thread= { 0, "Waiting for room in worker thread event queue", 0};
PSI_stage_info stage_waiting_for_workers_idle= { 0, "Waiting for worker threads to be idle", 0}; PSI_stage_info stage_waiting_for_workers_idle= { 0, "Waiting for worker threads to be idle", 0};
PSI_stage_info stage_waiting_for_ftwrl= { 0, "Waiting due to global read lock", 0};
PSI_stage_info stage_waiting_for_ftwrl_threads_to_pause= { 0, "Waiting for worker threads to pause for global read lock", 0};
PSI_stage_info stage_waiting_for_rpl_thread_pool= { 0, "Waiting while replication worker thread pool is busy", 0};
PSI_stage_info stage_master_gtid_wait_primary= { 0, "Waiting in MASTER_GTID_WAIT() (primary waiter)", 0}; PSI_stage_info stage_master_gtid_wait_primary= { 0, "Waiting in MASTER_GTID_WAIT() (primary waiter)", 0};
PSI_stage_info stage_master_gtid_wait= { 0, "Waiting in MASTER_GTID_WAIT()", 0}; PSI_stage_info stage_master_gtid_wait= { 0, "Waiting in MASTER_GTID_WAIT()", 0};
PSI_stage_info stage_gtid_wait_other_connection= { 0, "Waiting for other master connection to process GTID received on multiple master connections", 0}; PSI_stage_info stage_gtid_wait_other_connection= { 0, "Waiting for other master connection to process GTID received on multiple master connections", 0};
......
...@@ -454,6 +454,9 @@ extern PSI_stage_info stage_waiting_for_prior_transaction_to_commit; ...@@ -454,6 +454,9 @@ extern PSI_stage_info stage_waiting_for_prior_transaction_to_commit;
extern PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit; extern PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit;
extern PSI_stage_info stage_waiting_for_room_in_worker_thread; extern PSI_stage_info stage_waiting_for_room_in_worker_thread;
extern PSI_stage_info stage_waiting_for_workers_idle; extern PSI_stage_info stage_waiting_for_workers_idle;
extern PSI_stage_info stage_waiting_for_ftwrl;
extern PSI_stage_info stage_waiting_for_ftwrl_threads_to_pause;
extern PSI_stage_info stage_waiting_for_rpl_thread_pool;
extern PSI_stage_info stage_master_gtid_wait_primary; extern PSI_stage_info stage_master_gtid_wait_primary;
extern PSI_stage_info stage_master_gtid_wait; extern PSI_stage_info stage_master_gtid_wait;
extern PSI_stage_info stage_gtid_wait_other_connection; extern PSI_stage_info stage_gtid_wait_other_connection;
......
This diff is collapsed.
...@@ -70,6 +70,7 @@ struct rpl_parallel_thread { ...@@ -70,6 +70,7 @@ struct rpl_parallel_thread {
bool delay_start; bool delay_start;
bool running; bool running;
bool stop; bool stop;
bool pause_for_ftwrl;
mysql_mutex_t LOCK_rpl_thread; mysql_mutex_t LOCK_rpl_thread;
mysql_cond_t COND_rpl_thread; mysql_cond_t COND_rpl_thread;
mysql_cond_t COND_rpl_thread_queue; mysql_cond_t COND_rpl_thread_queue;
...@@ -199,12 +200,18 @@ struct rpl_parallel_thread { ...@@ -199,12 +200,18 @@ struct rpl_parallel_thread {
struct rpl_parallel_thread_pool { struct rpl_parallel_thread_pool {
uint32 count;
struct rpl_parallel_thread **threads; struct rpl_parallel_thread **threads;
struct rpl_parallel_thread *free_list; struct rpl_parallel_thread *free_list;
mysql_mutex_t LOCK_rpl_thread_pool; mysql_mutex_t LOCK_rpl_thread_pool;
mysql_cond_t COND_rpl_thread_pool; mysql_cond_t COND_rpl_thread_pool;
uint32 count;
bool inited; bool inited;
/*
While FTWRL runs, this counter is incremented to make SQL thread or
STOP/START slave not try to start new activity while that operation
is in progress.
*/
bool busy;
rpl_parallel_thread_pool(); rpl_parallel_thread_pool();
int init(uint32 size); int init(uint32 size);
...@@ -219,6 +226,12 @@ struct rpl_parallel_entry { ...@@ -219,6 +226,12 @@ struct rpl_parallel_entry {
mysql_mutex_t LOCK_parallel_entry; mysql_mutex_t LOCK_parallel_entry;
mysql_cond_t COND_parallel_entry; mysql_cond_t COND_parallel_entry;
uint32 domain_id; uint32 domain_id;
/*
Incremented by wait_for_workers_idle() and rpl_pause_for_ftwrl() to show
that they are waiting, so that finish_event_group knows to signal them
when last_committed_sub_id is increased.
*/
uint32 need_sub_id_signal;
uint64 last_commit_id; uint64 last_commit_id;
bool active; bool active;
/* /*
...@@ -227,12 +240,6 @@ struct rpl_parallel_entry { ...@@ -227,12 +240,6 @@ struct rpl_parallel_entry {
waiting for event groups to complete. waiting for event groups to complete.
*/ */
bool force_abort; bool force_abort;
/*
Set in wait_for_workers_idle() to show that it is waiting, so that
finish_event_group knows to signal it when last_committed_sub_id is
increased.
*/
bool need_sub_id_signal;
/* /*
At STOP SLAVE (force_abort=true), we do not want to process all events in At STOP SLAVE (force_abort=true), we do not want to process all events in
the queue (which could unnecessarily delay stop, if a lot of events happen the queue (which could unnecessarily delay stop, if a lot of events happen
...@@ -273,6 +280,15 @@ struct rpl_parallel_entry { ...@@ -273,6 +280,15 @@ struct rpl_parallel_entry {
queued for execution by a worker thread. queued for execution by a worker thread.
*/ */
uint64 current_sub_id; uint64 current_sub_id;
/*
The largest sub_id that has started its transaction. Protected by
LOCK_parallel_entry.
(Transactions can start out-of-order, so this value signifies that no
transactions with larger sub_id have started, but not necessarily that all
transactions with smaller sub_id have started).
*/
uint64 largest_started_sub_id;
rpl_group_info *current_group_info; rpl_group_info *current_group_info;
/* /*
If we get an error in some event group, we set the sub_id of that event If we get an error in some event group, we set the sub_id of that event
...@@ -282,6 +298,12 @@ struct rpl_parallel_entry { ...@@ -282,6 +298,12 @@ struct rpl_parallel_entry {
The value is ULONGLONG_MAX when no error occured. The value is ULONGLONG_MAX when no error occured.
*/ */
uint64 stop_on_error_sub_id; uint64 stop_on_error_sub_id;
/*
During FLUSH TABLES WITH READ LOCK, transactions with sub_id larger than
this value must not start, but wait until the global read lock is released.
The value is set to ULONGLONG_MAX when no FTWRL is pending.
*/
uint64 pause_sub_id;
/* Total count of event groups queued so far. */ /* Total count of event groups queued so far. */
uint64 count_queued_event_groups; uint64 count_queued_event_groups;
/* /*
...@@ -322,5 +344,7 @@ extern struct rpl_parallel_thread_pool global_rpl_thread_pool; ...@@ -322,5 +344,7 @@ extern struct rpl_parallel_thread_pool global_rpl_thread_pool;
extern int rpl_parallel_activate_pool(rpl_parallel_thread_pool *pool); extern int rpl_parallel_activate_pool(rpl_parallel_thread_pool *pool);
extern int rpl_parallel_inactivate_pool(rpl_parallel_thread_pool *pool); extern int rpl_parallel_inactivate_pool(rpl_parallel_thread_pool *pool);
extern bool process_gtid_for_restart_pos(Relay_log_info *rli, rpl_gtid *gtid); extern bool process_gtid_for_restart_pos(Relay_log_info *rli, rpl_gtid *gtid);
extern int rpl_pause_for_ftwrl(THD *thd);
extern void rpl_unpause_after_ftwrl(THD *thd);
#endif /* RPL_PARALLEL_H */ #endif /* RPL_PARALLEL_H */
...@@ -4284,6 +4284,17 @@ case SQLCOM_PREPARE: ...@@ -4284,6 +4284,17 @@ case SQLCOM_PREPARE:
break; break;
} }
if (lex->type & REFRESH_READ_LOCK)
{
/*
We need to pause any parallel replication slave workers during FLUSH
TABLES WITH READ LOCK. Otherwise we might cause a deadlock, as
worker threads eun run in arbitrary order but need to commit in a
specific given order.
*/
if (rpl_pause_for_ftwrl(thd))
goto error;
}
/* /*
reload_acl_and_cache() will tell us if we are allowed to write to the reload_acl_and_cache() will tell us if we are allowed to write to the
binlog or not. binlog or not.
...@@ -4314,6 +4325,8 @@ case SQLCOM_PREPARE: ...@@ -4314,6 +4325,8 @@ case SQLCOM_PREPARE:
if (!res) if (!res)
my_ok(thd); my_ok(thd);
} }
if (lex->type & REFRESH_READ_LOCK)
rpl_unpause_after_ftwrl(thd);
break; break;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment