Commit c2dd88ac authored by Kristian Nielsen

Merge MDEV-8031 into 10.1

parents fc1f301e b616991a
...@@ -1591,7 +1591,7 @@ a b ...@@ -1591,7 +1591,7 @@ a b
120 0 120 0
121 0 121 0
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug; SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc include/start_slave.inc
*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. *** *** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
include/stop_slave.inc include/stop_slave.inc
...@@ -1620,7 +1620,88 @@ a b ...@@ -1620,7 +1620,88 @@ a b
130 0 130 0
131 0 131 0
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug; SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
INSERT INTO t3 VALUES (201,0), (202,0);
include/save_master_gtid.inc
include/sync_with_master_gtid.inc
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10200;
INSERT INTO t3 VALUES (203, 1);
INSERT INTO t3 VALUES (204, 1);
INSERT INTO t3 VALUES (205, 1);
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=205;
UPDATE t3 SET b=b+1 WHERE a=205;
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** Check getting deadlock killed inside open_binlog() during retry. ***
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
SET @old_max= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size= 4096;
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10210;
Omit long queries that cause relaylog rotations and transaction retries...
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
SET GLOBAL max_relay_log_size= @old_max;
include/start_slave.inc include/start_slave.inc
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads; SET GLOBAL slave_parallel_threads=@old_parallel_threads;
......
...@@ -2215,7 +2215,7 @@ SELECT * FROM t3 WHERE a >= 120 ORDER BY a; ...@@ -2215,7 +2215,7 @@ SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
SELECT * FROM t3 WHERE a >= 120 ORDER BY a; SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
--source include/stop_slave.inc --source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug; SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc --source include/start_slave.inc
...@@ -2262,10 +2262,119 @@ SELECT * FROM t3 WHERE a >= 130 ORDER BY a; ...@@ -2262,10 +2262,119 @@ SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
SELECT * FROM t3 WHERE a >= 130 ORDER BY a; SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
--source include/stop_slave.inc --source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug; SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc --source include/start_slave.inc
# Regression test for MDEV-8031: the slave must retry (not stop with a
# "connection killed" error) when a deadlock kill arrives during the retry
# of an event group.
--echo *** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***

# Seed the two rows that the conflicting UPDATEs below operate on, and let
# the slave catch up before it is stopped.
--connection server_1
INSERT INTO t3 VALUES (201,0), (202,0);
--source include/save_master_gtid.inc

--connection server_2
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Save the slave's global debug_dbug so it can be restored at the end of this
# section, then enable the inject_mdev8031 debug injection.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_mdev8031';

--connection server_1
# We artificially create a situation that hopefully resembles the original
# bug which was only seen "in the wild", and only once.
# Setup a fake group commit with lots of conflicts that will lead to deadlock
# kill. The slave DBUG injection causes the slave to be deadlock killed at
# a particular point during the retry, and then later do a small sleep at
# another critical point where the prior transaction then has a chance to
# complete. Finally an extra KILL check catches an unhandled, lingering
# deadlock kill. So rather artificial, but at least it exercises the
# relevant code paths.

# This @old_dbug belongs to the server_1 connection (user variables are
# per-connection), so it does not overwrite the value saved on server_2.
SET @old_dbug= @@SESSION.debug_dbug;
# NOTE(review): binlog_force_commit_id together with @commit_id presumably
# makes the statements below share one commit id, i.e. the "fake group
# commit" described above -- confirm against the server's DBUG hooks.
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10200;
# Rows 203-205 start with b=1; rows 201-202 were seeded with b=0 above.
INSERT INTO t3 VALUES (203, 1);
INSERT INTO t3 VALUES (204, 1);
INSERT INTO t3 VALUES (205, 1);
# Repeated single-row UPDATEs on the same keys create the conflicts.
# Increments per row: 201 +3, 202 +4, 203 +3, 204 +3, 205 +2, giving final
# values 201=3, 202=4, 203=4, 204=4, 205=3.
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=205;
UPDATE t3 SET b=b+1 WHERE a=205;
SET SESSION debug_dbug=@old_dbug;
# Record the master's view of the rows for comparison with the slave.
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/save_master_gtid.inc

# Start the slave with the injection active; it must reach the master's GTID
# position and show the same rows.
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;

# Stop the slave, restore the saved global debug_dbug, and start it again.
--source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc
# Test that a deadlock kill arriving while the retry code opens the next
# relay log file leads to another retry rather than a slave error.
--echo *** Check getting deadlock killed inside open_binlog() during retry. ***

--connection server_2
--source include/stop_slave.inc
# Save the slave settings changed by this section so they can be restored
# at the end.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
SET @old_max= @@GLOBAL.max_relay_log_size;
# A small relay log size limit, so the oversized statements below force
# relay log rotations.
SET GLOBAL max_relay_log_size= 4096;

--connection server_1
# This @old_dbug belongs to the server_1 connection (user variables are
# per-connection), so it does not overwrite the value saved on server_2.
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
# $large is an 8192-character string; embedding it as an SQL comment makes
# every UPDATE below bigger than the 4096-byte relay log limit set above.
--let $large= `SELECT REPEAT("*", 8192)`
SET @commit_id= 10210;
# The padded statements are kept out of the result file; the --echo line
# stands in for them.
--echo Omit long queries that cause relaylog rotations and transaction retries...
--disable_query_log
# Increments per row: 201 +3, 202 +4, 203 +3, 204 +3, 205 +2.
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
--enable_query_log
SET SESSION debug_dbug=@old_dbug;
# Record the master's view of the rows for comparison with the slave.
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/save_master_gtid.inc

# Start the slave with the injection active; it must reach the master's GTID
# position and show the same rows.
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;

# Stop the slave and restore the settings saved at the top of this section.
# The restore must read @old_dbug, the variable assigned above; a different
# name would be an unset user variable and would not restore the saved value.
--source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
SET GLOBAL max_relay_log_size= @old_max;
--source include/start_slave.inc
# Clean up. # Clean up.
--connection server_2 --connection server_2
--source include/stop_slave.inc --source include/stop_slave.inc
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "rpl_parallel.h" #include "rpl_parallel.h"
#include "slave.h" #include "slave.h"
#include "rpl_mi.h" #include "rpl_mi.h"
#include "sql_parse.h"
#include "debug_sync.h" #include "debug_sync.h"
/* /*
...@@ -333,7 +334,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, ...@@ -333,7 +334,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
IO_CACHE rlog; IO_CACHE rlog;
LOG_INFO linfo; LOG_INFO linfo;
File fd= (File)-1; File fd= (File)-1;
const char *errmsg= NULL; const char *errmsg;
inuse_relaylog *ir= rgi->relay_log; inuse_relaylog *ir= rgi->relay_log;
uint64 event_count; uint64 event_count;
uint64 events_to_execute= rgi->retry_event_count; uint64 events_to_execute= rgi->retry_event_count;
...@@ -349,6 +350,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, ...@@ -349,6 +350,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
do_retry: do_retry:
event_count= 0; event_count= 0;
err= 0; err= 0;
errmsg= NULL;
/* /*
If we already started committing before getting the deadlock (or other If we already started committing before getting the deadlock (or other
...@@ -384,7 +386,16 @@ do_retry: ...@@ -384,7 +386,16 @@ do_retry:
*/ */
if(thd->wait_for_commit_ptr) if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate that we get deadlock killed at this exact point. */
rgi->killed_for_retry= true;
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->killed= KILL_CONNECTION;
mysql_mutex_unlock(&thd->LOCK_thd_data);
});
rgi->cleanup_context(thd, 1); rgi->cleanup_context(thd, 1);
thd->reset_killed();
thd->clear_error();
/* /*
If we retry due to a deadlock kill that occurred during the commit step, we If we retry due to a deadlock kill that occurred during the commit step, we
...@@ -428,10 +439,22 @@ do_retry: ...@@ -428,10 +439,22 @@ do_retry:
complete its commit. complete its commit.
*/ */
thd->clear_error(); thd->clear_error();
thd->reset_killed();
if(thd->wait_for_commit_ptr) if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Inject a small sleep to give prior transaction a chance to commit. */
my_sleep(100000);
});
} }
/*
Let us clear any lingering deadlock kill one more time, here after
wait_for_prior_commit() has completed. This should rule out any
possibility of an old deadlock kill lingering on beyond this point.
*/
thd->reset_killed();
strmake_buf(log_name, ir->name); strmake_buf(log_name, ir->name);
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
{ {
...@@ -447,6 +470,14 @@ do_retry: ...@@ -447,6 +470,14 @@ do_retry:
err= 1; err= 1;
goto err; goto err;
} }
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate pending KILL caught in read_relay_log_description_event(). */
if (thd->check_killed()) {
thd->send_kill_message();
err= 1;
goto err;
}
});
my_b_seek(&rlog, cur_offset); my_b_seek(&rlog, cur_offset);
do do
...@@ -469,7 +500,7 @@ do_retry: ...@@ -469,7 +500,7 @@ do_retry:
{ {
errmsg= "slave SQL thread aborted because of I/O error"; errmsg= "slave SQL thread aborted because of I/O error";
err= 1; err= 1;
goto err; goto check_retry;
} }
if (rlog.error > 0) if (rlog.error > 0)
{ {
...@@ -498,10 +529,25 @@ do_retry: ...@@ -498,10 +529,25 @@ do_retry:
} }
strmake_buf(log_name ,linfo.log_file_name); strmake_buf(log_name ,linfo.log_file_name);
DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", {
if (retries < 2)
{
/* Simulate that we get deadlock killed during open_binlog(). */
mysql_reset_thd_for_next_command(thd);
rgi->killed_for_retry= true;
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->killed= KILL_CONNECTION;
mysql_mutex_unlock(&thd->LOCK_thd_data);
thd->send_kill_message();
fd= (File)-1;
err= 1;
goto check_retry;
}
});
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
{ {
err= 1; err= 1;
goto err; goto check_retry;
} }
/* Loop to try again on the new log file. */ /* Loop to try again on the new log file. */
} }
...@@ -544,26 +590,31 @@ do_retry: ...@@ -544,26 +590,31 @@ do_retry:
if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd);); if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd););
DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100", DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100",
err= dbug_simulate_tmp_error(rgi, thd);); err= dbug_simulate_tmp_error(rgi, thd););
if (err) if (!err)
continue;
check_retry:
convert_kill_to_deadlock_error(rgi);
if (has_temporary_error(thd))
{ {
convert_kill_to_deadlock_error(rgi); ++retries;
if (has_temporary_error(thd)) if (retries < slave_trans_retries)
{ {
++retries; if (fd >= 0)
if (retries < slave_trans_retries)
{ {
end_io_cache(&rlog); end_io_cache(&rlog);
mysql_file_close(fd, MYF(MY_WME)); mysql_file_close(fd, MYF(MY_WME));
fd= (File)-1; fd= (File)-1;
goto do_retry;
} }
sql_print_error("Slave worker thread retried transaction %lu time(s) " goto do_retry;
"in vain, giving up. Consider raising the value of "
"the slave_transaction_retries variable.",
slave_trans_retries);
} }
goto err; sql_print_error("Slave worker thread retried transaction %lu time(s) "
"in vain, giving up. Consider raising the value of "
"the slave_transaction_retries variable.",
slave_trans_retries);
} }
goto err;
} while (event_count < events_to_execute); } while (event_count < events_to_execute);
err: err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment