Bug#37780: Make KILL reliable (main.kill fails randomly)

- A prerequisite cleanup patch for making KILL reliable. The test case main.kill did not work reliably. The following problems have been identified: 1. A kill signal could get lost if it came in shortly before a thread started reading on the client connection. 2. A kill signal could get lost if it came in shortly before a thread started waiting on a condition variable. These problems have been solved as follows. Please see also the added code comments for more details. 1. There is no safe way to detect when a thread enters the blocking state of a read(2) or recv(2) system call, where it can be interrupted by a signal. Hence it is not possible to wait for the right moment to send a kill signal. It has been decided not to fix it in the code. Instead, the test case repeats the KILL statement until the connection terminates. 2. Before waiting on a condition variable, we register it together with a synchronizing mutex in THD::mysys_var. After this, we need to test THD::killed again. At some places we only tested it in a loop condition before the registration. When THD::killed had been set between this test and the registration, we entered waiting without noticing the killed flag. Additional checks have been introduced where required. In addition to the above, a re-write of the main.kill test case has been done. All sleeps have been replaced by Debug Sync Facility synchronization. A couple of sync points have been added to the server code. To avoid further problems, if the test case fails in spite of the fixes, the test case has been added to the "experimental" list for now. - Most of the work on this patch is authored by Ingo Struewing

Bug#37780: Make KILL reliable (main.kill fails randomly)
- A prerequisite cleanup patch for making KILL reliable. The test case main.kill did not work reliably. The following problems have been identified: 1. A kill signal could get lost if it came in shortly before a thread started reading on the client connection. 2. A kill signal could get lost if it came in shortly before a thread started waiting on a condition variable. These problems have been solved as follows. Please see also the added code comments for more details. 1. There is no safe way to detect when a thread enters the blocking state of a read(2) or recv(2) system call, where it can be interrupted by a signal. Hence it is not possible to wait for the right moment to send a kill signal. It has been decided not to fix it in the code. Instead, the test case repeats the KILL statement until the connection terminates. 2. Before waiting on a condition variable, we register it together with a synchronizing mutex in THD::mysys_var. After this, we need to test THD::killed again. At some places we only tested it in a loop condition before the registration. When THD::killed had been set between this test and the registration, we entered waiting without noticing the killed flag. Additional checks have been introduced where required. In addition to the above, a re-write of the main.kill test case has been done. All sleeps have been replaced by Debug Sync Facility synchronization. A couple of sync points have been added to the server code. To avoid further problems, if the test case fails in spite of the fixes, the test case has been added to the "experimental" list for now. - Most of the work on this patch is authored by Ingo Struewing
ae6801eb · Davi Arnaut · 26e7ee2f · ae6801eb · ae6801eb · ae6801eb
Commit ae6801eb authored Oct 22, 2010 by Davi Arnaut
10 changed files
--- a/mysql-test/r/kill.result
+++ b/mysql-test/r/kill.result
-set @old_concurrent_insert= @@global.concurrent_insert;
+SET DEBUG_SYNC = 'RESET';
-set @@global.concurrent_insert= 0;
+DROP TABLE IF EXISTS t1, t2, t3;
-drop table if exists t1, t2, t3;
+DROP FUNCTION IF EXISTS MY_KILL;
-create table t1 (kill_id int);
+CREATE FUNCTION MY_KILL(tid INT) RETURNS INT
-insert into t1 values(connection_id());
+BEGIN
-select ((@id := kill_id) - kill_id) from t1;
+DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
-((@id := kill_id) - kill_id)
+KILL tid;
-0
+RETURN (SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE ID = tid);
-kill @id;
+END|
-select ((@id := kill_id) - kill_id) from t1;
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
-((@id := kill_id) - kill_id)
+SET DEBUG_SYNC= 'before_do_command_net_read SIGNAL con1_read';
-0
+SET DEBUG_SYNC='now WAIT_FOR con1_read';
-select @id != connection_id();
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
-@id != connection_id()
+SET DEBUG_SYNC = 'RESET';
+SELECT 1;
+Got one of the listed errors
+SELECT 1;
+1
 1
-select 4;
+SELECT @id != CONNECTION_ID();
+@id != CONNECTION_ID()
+1
+SELECT 4;
 4
 4
-drop table t1;
+KILL (SELECT COUNT(*) FROM mysql.user);
-kill (select count(*) from mysql.user);
 ERROR 42000: This version of MySQL doesn't yet support 'Usage of subqueries or stored function calls as part of this statement'
-create table t1 (id int primary key);
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
-create table t2 (id int unsigned not null);
+SET DEBUG_SYNC= 'before_do_command_net_read SIGNAL con1_read WAIT_FOR kill';
-insert into t2 select id from t1;
+SET DEBUG_SYNC= 'now WAIT_FOR con1_read';
-create table t3 (kill_id int);
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
-insert into t3 values(connection_id());
+SET DEBUG_SYNC = 'RESET';
-select id from t1 where id in (select distinct a.id from t2 a, t2 b, t2 c, t2 d group by a.id, b.id, c.id, d.id having a.id between 10 and 20);
+SELECT 1;
-select ((@id := kill_id) - kill_id) from t3;
-((@id := kill_id) - kill_id)
-0
-kill @id;
 Got one of the listed errors
-drop table t1, t2, t3;
+SELECT 1;
-select get_lock("a", 10);
+1
-get_lock("a", 10)
+1
-1
+SELECT @id != CONNECTION_ID();
-select get_lock("a", 10);
+@id != CONNECTION_ID()
-get_lock("a", 10)
+1
-NULL
+SELECT 4;
-select 1;
+4
-1
+4
-1
+CREATE TABLE t1 (id INT PRIMARY KEY AUTO_INCREMENT);
-select RELEASE_LOCK("a");
+CREATE TABLE t2 (id INT UNSIGNED NOT NULL);
-RELEASE_LOCK("a")
+INSERT INTO t1 VALUES
-1
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-create table t1(f1 int);
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-create function bug27563() returns int(11)
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-deterministic
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0);
-begin
+INSERT t1 SELECT 0 FROM t1 AS a1, t1 AS a2 LIMIT 4032;
-declare continue handler for sqlstate '70100' set @a:= 'killed';
+INSERT INTO t2 SELECT id FROM t1;
-declare continue handler for sqlexception set @a:= 'exception';
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
-set @a= get_lock("lock27563", 10);
+SET DEBUG_SYNC= 'before_acos_function SIGNAL in_sync';
-return 1;
+SELECT id FROM t1 WHERE id IN
-end|
+(SELECT DISTINCT a.id FROM t2 a, t2 b, t2 c, t2 d
-select get_lock("lock27563",10);
+GROUP BY ACOS(1/a.id), b.id, c.id, d.id
-get_lock("lock27563",10)
+HAVING a.id BETWEEN 10 AND 20);
-1
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-insert into t1 values (bug27563());
+KILL @id;
-ERROR 70100: Query execution was interrupted
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
-select @a;
+Got one of the listed errors
-@a
+SELECT 1;
-NULL
+1
-select * from t1;
+1
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t1, t2;
+SET DEBUG_SYNC= 'before_acos_function SIGNAL in_sync WAIT_FOR kill';
+SELECT ACOS(0);
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+ACOS(0)
+1.5707963267948966
+SELECT 1;
+1
+1
+SELECT @id = CONNECTION_ID();
+@id = CONNECTION_ID()
+1
+SET DEBUG_SYNC = 'RESET';
+CREATE TABLE t1 (f1 INT);
+CREATE FUNCTION bug27563() RETURNS INT(11)
+DETERMINISTIC
+BEGIN
+DECLARE CONTINUE HANDLER FOR SQLSTATE '70100' SET @a:= 'killed';
+DECLARE CONTINUE HANDLER FOR SQLEXCEPTION SET @a:= 'exception';
+SET DEBUG_SYNC= 'now SIGNAL in_sync WAIT_FOR kill';
+RETURN 1;
+END|
+INSERT INTO t1 VALUES (bug27563());
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+ERROR 70100: Query execution was interrupted
+SELECT * FROM t1;
 f1
-insert into t1 values(0);
+SET DEBUG_SYNC = 'RESET';
-update t1 set f1= bug27563();
+INSERT INTO t1 VALUES(0);
+UPDATE t1 SET f1= bug27563();
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
 ERROR 70100: Query execution was interrupted
-select @a;
+SELECT * FROM t1;
-@a
-NULL
-select * from t1;
 f1
 0
-insert into t1 values(1);
+SET DEBUG_SYNC = 'RESET';
-delete from t1 where bug27563() is null;
+INSERT INTO t1 VALUES(1);
+DELETE FROM t1 WHERE bug27563() IS NULL;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
 ERROR 70100: Query execution was interrupted
-select @a;
+SELECT * FROM t1;
-@a
-NULL
-select * from t1;
 f1
 0
 1
-select * from t1 where f1= bug27563();
+SET DEBUG_SYNC = 'RESET';
-ERROR 70100: Query execution was interrupted
+SELECT * FROM t1 WHERE f1= bug27563();
-select @a;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-@a
+KILL QUERY @id;
-NULL
+ERROR 70100: Query execution was interrupted
-create procedure proc27563()
+SELECT * FROM t1;
-begin
-declare continue handler for sqlstate '70100' set @a:= 'killed';
-declare continue handler for sqlexception set @a:= 'exception';
-select get_lock("lock27563",10);
-select "shouldn't be selected";
-end|
-call proc27563();
-get_lock("lock27563",10)
-NULL
-ERROR 70100: Query execution was interrupted
-select @a;
-@a
-NULL
-create table t2 (f2 int);
-create trigger trg27563 before insert on t1 for each row 
-begin 
-declare continue handler for sqlstate '70100' set @a:= 'killed';
-declare continue handler for sqlexception set @a:= 'exception';
-set @a:= get_lock("lock27563",10);
-insert into t2 values(1);
-end|
-insert into t1 values(2),(3);
-ERROR 70100: Query execution was interrupted
-select @a;
-@a
-NULL
-select * from t1;
 f1
 0
 1
-select * from t2;
+SET DEBUG_SYNC = 'RESET';
-f2
+DROP FUNCTION bug27563;
-select release_lock("lock27563");
+CREATE TABLE t2 (f2 INT);
-release_lock("lock27563")
+CREATE TRIGGER trg27563 BEFORE INSERT ON t1 FOR EACH ROW
+BEGIN
+DECLARE CONTINUE HANDLER FOR SQLSTATE '70100' SET @a:= 'killed';
+DECLARE CONTINUE HANDLER FOR SQLEXCEPTION SET @a:= 'exception';
+INSERT INTO t2 VALUES(0);
+SET DEBUG_SYNC= 'now SIGNAL in_sync WAIT_FOR kill';
+INSERT INTO t2 VALUES(1);
+END|
+INSERT INTO t1 VALUES(2),(3);
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+ERROR 70100: Query execution was interrupted
+SELECT * FROM t1;
+f1
+0
 1
-drop table t1, t2;
+SELECT * FROM t2;
-drop function bug27563;
+f2
-drop procedure proc27563;
+0
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t1, t2;
+SET DEBUG_SYNC= 'before_join_optimize SIGNAL in_sync';
 PREPARE stmt FROM 'EXPLAIN SELECT * FROM t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22,t23,t24,t25,t26,t27,t28,t29,t30,t31,t32,t33,t34,t35,t36,t37,t38,t39,t40 WHERE a1=a2 AND a2=a3 AND a3=a4 AND a4=a5 AND a5=a6 AND a6=a7 AND a7=a8 AND a8=a9 AND a9=a10 AND a10=a11 AND a11=a12 AND a12=a13 AND a13=a14 AND a14=a15 AND a15=a16 AND a16=a17 AND a17=a18 AND a18=a19 AND a19=a20 AND a20=a21 AND a21=a22 AND a22=a23 AND a23=a24 AND a24=a25 AND a25=a26 AND a26=a27 AND a27=a28 AND a28=a29 AND a29=a30 AND a30=a31 AND a31=a32 AND a32=a33 AND a33=a34 AND a34=a35 AND a35=a36 AND a36=a37 AND a37=a38 AND a38=a39 AND a39=a40  ';
 EXECUTE stmt;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+ERROR 70100: Query execution was interrupted
+SET DEBUG_SYNC = 'RESET';
 #
 # Bug#19723: kill of active connection yields different error code
 # depending on platform.
 #
-# Connection: con2.
+# Connection: con1.
-KILL CONNECTION_ID();
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
-# CR_SERVER_LOST, CR_SERVER_GONE_ERROR, depending on the timing 
+KILL @id;
-# of close of the connection socket
+ERROR 70100: Query execution was interrupted
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
+# ER_SERVER_SHUTDOWN, CR_SERVER_GONE_ERROR, CR_SERVER_LOST,
+# depending on the timing of close of the connection socket
 SELECT 1;
 Got one of the listed errors
+SELECT 1;
+1
+1
+SELECT @id != CONNECTION_ID();
+@id != CONNECTION_ID()
+1
+SET DEBUG_SYNC = 'RESET';
 #
 # Additional test for WL#3726 "DDL locking for all metadata objects"
 # Check that DDL and DML statements waiting for metadata locks can
@@ -208,13 +243,11 @@ ERROR 70100: Query execution was interrupted
 # Test for DML waiting for meta-data lock
 # Switching to connection 'blocker'
 unlock tables;
-drop table t2;
-create table t2 (k int);
 lock tables t1 read;
 # Switching to connection 'ddl'
-rename tables t1 to t3, t2 to t1;
+truncate table t1;
 # Switching to connection 'dml'
-insert into t2 values (1);
+insert into t1 values (1);
 # Switching to connection 'default'
 kill query ID2;
 # Switching to connection 'dml'
@@ -239,6 +272,7 @@ unlock tables;
 # Switching to connection 'ddl'
 # Cleanup.
 # Switching to connection 'default'
-drop table t3;
 drop table t1;
-set @@global.concurrent_insert= @old_concurrent_insert;
+drop table t2;
+SET DEBUG_SYNC = 'RESET';
+DROP FUNCTION MY_KILL;
--- a/mysql-test/t/disabled.def
+++ b/mysql-test/t/disabled.def
@@ -9,7 +9,6 @@
 #  Do not use any TAB characters for whitespace.
 #
 ##############################################################################
-kill                     : Bug#37780 2008-12-03 HHunger need some changes to be robust enough for pushbuild.
 lowercase_table3         : Bug#54845 2010-06-30 alik main.lowercase_table3 on Mac OSX
 mysqlhotcopy_myisam      : Bug#54129 2010-08-31 alik mysqlhotcopy* fails
 mysqlhotcopy_archive     : Bug#54129 2010-08-31 alik mysqlhotcopy* fails

--- a/mysql-test/t/kill.test
+++ b/mysql-test/t/kill.test
-# This test doesn't work with the embedded version as this code
-# assumes that one query is running while we are doing queries on
-# a second connection.
-# This would work if mysqltest run would be threaded and handle each
-# connection in a separate thread.
 #
+# Test KILL and KILL QUERY statements.
+#
+# Killing a connection in an embedded server does not work like in a normal
+# server, if it is waiting for a new statement. In an embedded server, the
+# connection does not read() from a socket, but returns control to the
+# application. 'mysqltest' does not handle the kill request.
+#
 -- source include/not_embedded.inc
+-- source include/have_debug_sync.inc
-# Disable concurrent inserts to avoid test failures when reading the
+--disable_warnings
-# connection id which was inserted into a table by another thread.
+SET DEBUG_SYNC = 'RESET';
-set @old_concurrent_insert= @@global.concurrent_insert;
+DROP TABLE IF EXISTS t1, t2, t3;
-set @@global.concurrent_insert= 0;
+DROP FUNCTION IF EXISTS MY_KILL;
+--enable_warnings
+delimiter |;
+# Helper function used to repeatedly kill a session.
+CREATE FUNCTION MY_KILL(tid INT) RETURNS INT
+BEGIN
+  DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
+  KILL tid;
+  RETURN (SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE ID = tid);
+END|
+delimiter ;|
 connect (con1, localhost, root,,);
 connect (con2, localhost, root,,);
-#remember id of con1
+# Save id of con1
 connection con1;
--disable_warnings
-drop table if exists t1, t2, t3;
--enable_warnings
 --disable_reconnect
-create table t1 (kill_id int);
+let $ID= `SELECT @id := CONNECTION_ID()`;
-insert into t1 values(connection_id());
+connection con2;
+let $ignore= `SELECT @id := $ID`;
+connection con1;
+# Signal when this connection is terminating.
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
+# See if we can kill read().
+# Run into read() immediately after hitting 'before_do_command_net_read'.
+SET DEBUG_SYNC= 'before_do_command_net_read SIGNAL con1_read';
-#kill con1
+# Kill con1
 connection con2;
-select ((@id := kill_id) - kill_id) from t1; 
+SET DEBUG_SYNC='now WAIT_FOR con1_read';
-kill @id;
+# At this point we have no way to figure out, when con1 is blocked in
+# reading from the socket. Sending KILL too early would not terminate
+# con1. So we repeat KILL until con1 terminates.
+let $wait_condition= SELECT MY_KILL(@id);
+--source include/wait_condition.inc
+# If KILL missed the read(), sync point wait will time out.
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
+SET DEBUG_SYNC = 'RESET';
 connection con1;
--sleep 2
+--error 1053,2006,2013
+SELECT 1;
--disable_query_log
--disable_result_log
-# One of the following statements should fail
--error 0,2006,2013
-select 1;
--error 0,2006,2013
-select 1;
--enable_query_log
--enable_result_log
 --enable_reconnect
 # this should work, and we should have a new connection_id()
-select ((@id := kill_id) - kill_id) from t1;
+SELECT 1;
-select @id != connection_id();
+let $ignore= `SELECT @id := $ID`;
+SELECT @id != CONNECTION_ID();
 #make sure the server is still alive
 connection con2;
-select 4;
+SELECT 4;
-drop table t1;
 connection default;
 --error ER_NOT_SUPPORTED_YET
-kill (select count(*) from mysql.user);
+KILL (SELECT COUNT(*) FROM mysql.user);
+connection con1;
+let $ID= `SELECT @id := CONNECTION_ID()`;
+connection con2;
+let $ignore= `SELECT @id := $ID`;
+connection con1;
+disable_reconnect;
+# Signal when this connection is terminating.
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
+# See if we can kill the sync point itself.
+# Wait in 'before_do_command_net_read' until killed.
+# It doesn't wait for a signal 'kill' but waits to be killed.
+# The signal name doesn't matter here.
+SET DEBUG_SYNC= 'before_do_command_net_read SIGNAL con1_read WAIT_FOR kill';
+connection con2;
+SET DEBUG_SYNC= 'now WAIT_FOR con1_read';
+# Repeat KILL until con1 terminates.
+let $wait_condition= SELECT MY_KILL(@id);
+--source include/wait_condition.inc
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
+SET DEBUG_SYNC = 'RESET';
+connection con1;
+--error 1053,2006,2013
+SELECT 1;
+enable_reconnect;
+SELECT 1;
+let $ignore= `SELECT @id := $ID`;
+SELECT @id != CONNECTION_ID();
+connection con2;
+SELECT 4;
+connection default;
 #
 # BUG#14851: killing long running subquery processed via a temporary table.
 #
-create table t1 (id int primary key);
-create table t2 (id int unsigned not null);
-connect (conn1, localhost, root,,);
+CREATE TABLE t1 (id INT PRIMARY KEY AUTO_INCREMENT);
-connection conn1;
+CREATE TABLE t2 (id INT UNSIGNED NOT NULL);
-- disable_result_log
+INSERT INTO t1 VALUES
-- disable_query_log
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-let $1 = 4096;
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-while ($1)
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0),
-{
+(0),(0),(0),(0),(0),(0),(0),(0), (0),(0),(0),(0),(0),(0),(0),(0);
-  eval insert into t1 values ($1);
+INSERT t1 SELECT 0 FROM t1 AS a1, t1 AS a2 LIMIT 4032;
-  dec $1;
-}
-- enable_query_log
-- enable_result_log
-insert into t2 select id from t1;
+INSERT INTO t2 SELECT id FROM t1;
-create table t3 (kill_id int);
+connection con1;
-insert into t3 values(connection_id());
+let $ID= `SELECT @id := CONNECTION_ID()`;
+connection con2;
-connect (conn2, localhost, root,,);
+let $ignore= `SELECT @id := $ID`;
-connection conn2;
-connection conn1;
+connection con1;
-- disable_result_log
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
-# This is a very long running query. If this test start failing, it may
+SET DEBUG_SYNC= 'before_acos_function SIGNAL in_sync';
-# be necessary to change to an even longer query.
+# This is a very long running query. If this test starts failing,
-send select id from t1 where id in (select distinct a.id from t2 a, t2 b, t2 c, t2 d group by a.id, b.id, c.id, d.id having a.id between 10 and 20);
+# it may be necessary to change to an even longer query.
-- enable_result_log
+send SELECT id FROM t1 WHERE id IN
+       (SELECT DISTINCT a.id FROM t2 a, t2 b, t2 c, t2 d
+          GROUP BY ACOS(1/a.id), b.id, c.id, d.id
+          HAVING a.id BETWEEN 10 AND 20);
-connection conn2;
+connection con2;
-select ((@id := kill_id) - kill_id) from t3;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-- sleep 1
+KILL @id;
-kill @id;
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
-connection conn1;
+connection con1;
-- error 1317,2013
+--error 1053,2006,2013
 reap;
+SELECT 1;
 connection default;
+SET DEBUG_SYNC = 'RESET';
-drop table t1, t2, t3;
+DROP TABLE t1, t2;
-# End of 4.1 tests
 #
-# test of blocking of sending ERROR after OK or EOF
+# Test of blocking of sending ERROR after OK or EOF
 #
 connection con1;
-select get_lock("a", 10);
+let $ID= `SELECT @id := CONNECTION_ID()`;
 connection con2;
-let $ID= `select connection_id()`;
+let $ignore= `SELECT @id := $ID`;
-send select get_lock("a", 10);
-real_sleep 2;
 connection con1;
-disable_query_log;
+SET DEBUG_SYNC= 'before_acos_function SIGNAL in_sync WAIT_FOR kill';
-eval kill query $ID;
+send SELECT ACOS(0);
-enable_query_log;
 connection con2;
-reap;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-select 1;
+KILL QUERY @id;
 connection con1;
-select RELEASE_LOCK("a");
+reap;
+SELECT 1;
+SELECT @id = CONNECTION_ID();
+connection default;
+SET DEBUG_SYNC = 'RESET';
 #
 # Bug#27563: Stored functions and triggers wasn't throwing an error when killed.
 #
-create table t1(f1 int);
+CREATE TABLE t1 (f1 INT);
 delimiter |;
-create function bug27563() returns int(11)
+CREATE FUNCTION bug27563() RETURNS INT(11)
-deterministic
+DETERMINISTIC
-begin
+BEGIN
-  declare continue handler for sqlstate '70100' set @a:= 'killed';
+  DECLARE CONTINUE HANDLER FOR SQLSTATE '70100' SET @a:= 'killed';
-  declare continue handler for sqlexception set @a:= 'exception';
+  DECLARE CONTINUE HANDLER FOR SQLEXCEPTION SET @a:= 'exception';
-  set @a= get_lock("lock27563", 10);
+  SET DEBUG_SYNC= 'now SIGNAL in_sync WAIT_FOR kill';
-  return 1;
+  RETURN 1;
-end|
+END|
 delimiter ;|
 # Test stored functions
 # Test INSERT
 connection con1;
-select get_lock("lock27563",10);
+let $ID= `SELECT @id := CONNECTION_ID()`;
 connection con2;
-let $ID= `select connection_id()`;
+let $ignore= `SELECT @id := $ID`;
-send insert into t1 values (bug27563());
-real_sleep 2;
 connection con1;
-disable_query_log;
+send INSERT INTO t1 VALUES (bug27563());
-eval kill query $ID;
-enable_query_log;
 connection con2;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+connection con1;
 --error 1317
 reap;
-select @a;
+SELECT * FROM t1;
-connection con1;
+connection default;
-select * from t1;
+SET DEBUG_SYNC = 'RESET';
 # Test UPDATE
-insert into t1 values(0);
+INSERT INTO t1 VALUES(0);
-connection con2;
-send update t1 set f1= bug27563();
-real_sleep 2;
 connection con1;
-disable_query_log;
+send UPDATE t1 SET f1= bug27563();
-eval kill query $ID;
-enable_query_log;
 connection con2;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+connection con1;
 --error 1317
 reap;
-select @a;
+SELECT * FROM t1;
-connection con1;
+connection default;
-select * from t1;
+SET DEBUG_SYNC = 'RESET';
 # Test DELETE
-insert into t1 values(1);
+INSERT INTO t1 VALUES(1);
-connection con2;
-send delete from t1 where bug27563() is null;
-real_sleep 2;
 connection con1;
-disable_query_log;
+send DELETE FROM t1 WHERE bug27563() IS NULL;
-eval kill query $ID;
-enable_query_log;
 connection con2;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+connection con1;
 --error 1317
 reap;
-select @a;
+SELECT * FROM t1;
-connection con1;
+connection default;
-select * from t1;
+SET DEBUG_SYNC = 'RESET';
 # Test SELECT
-connection con2;
-send select * from t1 where f1= bug27563();
-real_sleep 2;
 connection con1;
-disable_query_log;
+send SELECT * FROM t1 WHERE f1= bug27563();
-eval kill query $ID;
-enable_query_log;
 connection con2;
--error 1317
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-reap;
+KILL QUERY @id;
-select @a;
-# Test PROCEDURE
-connection con2;
-delimiter |;
-create procedure proc27563()
-begin
-  declare continue handler for sqlstate '70100' set @a:= 'killed';
-  declare continue handler for sqlexception set @a:= 'exception';
-  select get_lock("lock27563",10);
-  select "shouldn't be selected";
-end|
-delimiter ;|
-send call proc27563();
-real_sleep 2;
 connection con1;
-disable_query_log;
-eval kill query $ID;
-enable_query_log;
-connection con2;
 --error 1317
 reap;
-select @a;
+SELECT * FROM t1;
+connection default;
+SET DEBUG_SYNC = 'RESET';
+DROP FUNCTION bug27563;
 # Test TRIGGERS
-connection con2;
+CREATE TABLE t2 (f2 INT);
-create table t2 (f2 int);
 delimiter |;
-create trigger trg27563 before insert on t1 for each row 
+CREATE TRIGGER trg27563 BEFORE INSERT ON t1 FOR EACH ROW
-begin 
+BEGIN
-  declare continue handler for sqlstate '70100' set @a:= 'killed';
+  DECLARE CONTINUE HANDLER FOR SQLSTATE '70100' SET @a:= 'killed';
-  declare continue handler for sqlexception set @a:= 'exception';
+  DECLARE CONTINUE HANDLER FOR SQLEXCEPTION SET @a:= 'exception';
-  set @a:= get_lock("lock27563",10);
+  INSERT INTO t2 VALUES(0);
-  insert into t2 values(1);
+  SET DEBUG_SYNC= 'now SIGNAL in_sync WAIT_FOR kill';
-end|
+  INSERT INTO t2 VALUES(1);
+END|
 delimiter ;|
-send insert into t1 values(2),(3);
-real_sleep 2;
 connection con1;
-disable_query_log;
+send INSERT INTO t1 VALUES(2),(3);
-eval kill query $ID;
-enable_query_log;
 connection con2;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
+KILL QUERY @id;
+connection con1;
 --error 1317
 reap;
-select @a;
+SELECT * FROM t1;
-connection con1;
+SELECT * FROM t2;
-select * from t1;
+connection default;
-select * from t2;
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t1, t2;
-# Cleanup
-select release_lock("lock27563");
-drop table t1, t2;
-drop function bug27563;
-drop procedure proc27563;
 #
 # Bug#28598: mysqld crash when killing a long-running explain query.
 #
--disable_query_log
 connection con1;
-let $ID= `select connection_id()`;
+let $ID= `SELECT @id := CONNECTION_ID()`;
+connection con2;
+let $ignore= `SELECT @id := $ID`;
+connection con1;
+--disable_query_log
 let $tab_count= 40;
 let $i= $tab_count;
 while ($i)
 {
-  eval CREATE TABLE t$i (a$i int, KEY(a$i));
+  eval CREATE TABLE t$i (a$i INT, KEY(a$i));
  eval INSERT INTO t$i VALUES (1),(2),(3),(4),(5),(6),(7);
  dec $i ;
 }
-set session optimizer_search_depth=0;
+SET SESSION optimizer_search_depth=0;
 let $i=$tab_count;
 while ($i)
@@ -295,13 +310,17 @@ while ($i)
 }
 --enable_query_log
+SET DEBUG_SYNC= 'before_join_optimize SIGNAL in_sync';
 eval PREPARE stmt FROM 'EXPLAIN SELECT * $from $where';
 send EXECUTE stmt;
--disable_query_log
 connection con2;
-real_sleep 2;
+SET DEBUG_SYNC= 'now WAIT_FOR in_sync';
-eval kill query $ID;
+KILL QUERY @id;
+connection con1;
+--error 1317
+reap;
+--disable_query_log
 let $i= $tab_count;
 while ($i)
 {
@@ -309,8 +328,8 @@ while ($i)
  dec $i ;
 }
 --enable_query_log
+connection default;
-###########################################################################
+SET DEBUG_SYNC = 'RESET';
 --echo #
 --echo # Bug#19723: kill of active connection yields different error code
@@ -318,16 +337,27 @@ while ($i)
 --echo #
 --echo
--echo # Connection: con2.
+--echo # Connection: con1.
--connection con2
+--connection con1
+let $ID= `SELECT @id := CONNECTION_ID()`;
-KILL CONNECTION_ID();
+SET DEBUG_SYNC= 'thread_end SIGNAL con1_end';
+--disable_reconnect
+--error ER_QUERY_INTERRUPTED
+KILL @id;
--echo # CR_SERVER_LOST, CR_SERVER_GONE_ERROR, depending on the timing 
+connection con2;
--echo # of close of the connection socket
+SET DEBUG_SYNC= 'now WAIT_FOR con1_end';
--error 2013, 2006 
+connection con1;
+--echo # ER_SERVER_SHUTDOWN, CR_SERVER_GONE_ERROR, CR_SERVER_LOST,
+--echo # depending on the timing of close of the connection socket
+--error 1053,2006,2013
+SELECT 1;
+--enable_reconnect
 SELECT 1;
--connection default
+let $ignore= `SELECT @id := $ID`;
+SELECT @id != CONNECTION_ID();
+connection default;
+SET DEBUG_SYNC = 'RESET';
 --echo #
 --echo # Additional test for WL#3726 "DDL locking for all metadata objects"
@@ -489,28 +519,26 @@ connection ddl;
 --echo # Switching to connection 'blocker'
 connection blocker;
 unlock tables;
-drop table t2;
-create table t2 (k int);
 lock tables t1 read;
 --echo # Switching to connection 'ddl'
 connection ddl;
 # Let us add pending exclusive metadata lock on t1
--send rename tables t1 to t3, t2 to t1
+--send truncate table t1
 --echo # Switching to connection 'dml'
 connection dml;
 let $wait_condition=
  select count(*) = 1 from information_schema.processlist
  where state = "Waiting for table metadata lock" and
-        info = "rename tables t1 to t3, t2 to t1";
+        info = "truncate table t1";
 --source include/wait_condition.inc
 let $ID2= `select connection_id()`;
--send insert into t2 values (1)
+--send insert into t1 values (1)
 --echo # Switching to connection 'default'
 connection default;
 let $wait_condition=
  select count(*) = 1 from information_schema.processlist
  where state = "Waiting for table metadata lock" and
-        info = "insert into t2 values (1)";
+        info = "insert into t1 values (1)";
 --source include/wait_condition.inc
 --replace_result $ID2 ID2
 eval kill query $ID2;
@@ -564,10 +592,10 @@ connection ddl;
 --echo # Cleanup.
 --echo # Switching to connection 'default'
 connection default;
-drop table t3;
 drop table t1;
+drop table t2;
 ###########################################################################
-# Restore global concurrent_insert value. Keep in the end of the test file.
+SET DEBUG_SYNC = 'RESET';
-set @@global.concurrent_insert= @old_concurrent_insert;
+DROP FUNCTION MY_KILL;
--- a/sql/event_queue.cc
+++ b/sql/event_queue.cc
@@ -741,11 +741,13 @@ Event_queue::cond_wait(THD *thd, struct timespec *abstime, const char* msg,
  thd->enter_cond(&COND_queue_state, &LOCK_event_queue, msg);
-  DBUG_PRINT("info", ("mysql_cond_%swait", abstime? "timed":""));
+  if (!thd->killed)
+  {
    if (!abstime)
      mysql_cond_wait(&COND_queue_state, &LOCK_event_queue);
    else
      mysql_cond_timedwait(&COND_queue_state, &LOCK_event_queue, abstime);
+  }
  mutex_last_locked_in_func= func;
  mutex_last_locked_at_line= line;

--- a/sql/lock.cc
+++ b/sql/lock.cc
@@ -989,6 +989,14 @@ bool Global_read_lock::lock_global_read_lock(THD *thd)
    const char *new_message= "Waiting to get readlock";
    (void) mysql_mutex_lock(&LOCK_global_read_lock);
+    old_message= thd->enter_cond(&COND_global_read_lock,
+                                 &LOCK_global_read_lock, new_message);
+    DBUG_PRINT("info",
+               ("waiting_for: %d  protect_against: %d",
+                waiting_for_read_lock, protect_against_global_read_lock));
+    waiting_for_read_lock++;
 #if defined(ENABLED_DEBUG_SYNC)
    /*
      The below sync point fires if we have to wait for
@@ -997,27 +1005,18 @@ bool Global_read_lock::lock_global_read_lock(THD *thd)
      WARNING: Beware to use WAIT_FOR with this sync point. We hold
      LOCK_global_read_lock here.
-      Call the sync point before calling enter_cond() as it does use
+      The sync point is after enter_cond() so that proc_info is
-      enter_cond() and exit_cond() itself if a WAIT_FOR action is
+      available immediately after the sync point sends a SIGNAL. This
-      executed in spite of the above warning.
+      can make tests more reliable.
-      Pre-set proc_info so that it is available immediately after the
+      The sync point is before the loop so that it is executed only once.
-      sync point sends a SIGNAL. This makes tests more reliable.
    */
-    if (protect_against_global_read_lock)
+    if (protect_against_global_read_lock && !thd->killed)
    {
-      thd_proc_info(thd, new_message);
      DEBUG_SYNC(thd, "wait_lock_global_read_lock");
    }
 #endif /* defined(ENABLED_DEBUG_SYNC) */
-    old_message=thd->enter_cond(&COND_global_read_lock, &LOCK_global_read_lock,
-                                new_message);
-    DBUG_PRINT("info",
-	       ("waiting_for: %d  protect_against: %d",
-		waiting_for_read_lock, protect_against_global_read_lock));
-    waiting_for_read_lock++;
    while (protect_against_global_read_lock && !thd->killed)
      mysql_cond_wait(&COND_global_read_lock, &LOCK_global_read_lock);
    waiting_for_read_lock--;

--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -721,8 +721,16 @@ int start_slave_thread(
    while (start_id == *slave_run_id)
    {
      DBUG_PRINT("sleep",("Waiting for slave thread to start"));
-      const char* old_msg = thd->enter_cond(start_cond,cond_lock,
+      const char *old_msg= thd->enter_cond(start_cond, cond_lock,
                                           "Waiting for slave thread to start");
+      /*
+        It is not sufficient to test thd->killed at loop bottom. We must
+        also test it after registering the mutex in enter_cond(). If the
+        kill happens after the test of thd->killed and before the mutex
+        is registered, we would otherwise start waiting even though
+        thd->killed is set.
+      */
+      if (!thd->killed)
        mysql_cond_wait(start_cond, cond_lock);
      thd->exit_cond(old_msg);
      mysql_mutex_lock(cond_lock); // re-acquire it as exit_cond() released

--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -1179,36 +1179,70 @@ void add_diff_to_status(STATUS_VAR *to_var, STATUS_VAR *from_var,
 }
+/**
+  Awake a thread.
+  @param[in]  state_to_set    value for THD::killed
+  This is normally called from another thread's THD object.
+  @note Always call this while holding LOCK_thd_data.
+*/
 void THD::awake(THD::killed_state state_to_set)
 {
  DBUG_ENTER("THD::awake");
-  DBUG_PRINT("enter", ("this: 0x%lx", (long) this));
+  DBUG_PRINT("enter", ("this: %p current_thd: %p", this, current_thd));
  THD_CHECK_SENTRY(this);
  mysql_mutex_assert_owner(&LOCK_thd_data);
+  /* Set the 'killed' flag of 'this', which is the target THD object. */
  killed= state_to_set;
  if (state_to_set != THD::KILL_QUERY)
  {
-    thr_alarm_kill(thread_id);
-    if (!slave_thread)
-      MYSQL_CALLBACK(thread_scheduler, post_kill_notification, (this));
 #ifdef SIGNAL_WITH_VIO_CLOSE
    if (this != current_thd)
    {
      /*
-        In addition to a signal, let's close the socket of the thread that
+        Before sending a signal, let's close the socket of the thread
-        is being killed. This is to make sure it does not block if the
+        that is being killed ("this", which is not the current thread).
-        signal is lost. This needs to be done only on platforms where
+        This is to make sure it does not block if the signal is lost.
-        signals are not a reliable interruption mechanism.
+        This needs to be done only on platforms where signals are not
+        a reliable interruption mechanism.
-        If we're killing ourselves, we know that we're not blocked, so this
-        hack is not used.
+        Note that the downside of this mechanism is that we could close
+        the connection while "this" target thread is in the middle of
+        sending a result to the application, thus violating the client-
+        server protocol.
+        On the other hand, without closing the socket we have a race
+        condition. If "this" target thread passes the check of
+        thd->killed, and then the current thread runs through
+        THD::awake(), sets the 'killed' flag and completes the
+        signaling, and then the target thread runs into read(), it will
+        block on the socket. As a result of the discussions around
+        Bug#37780, it has been decided that we accept the race
+        condition. A second KILL awakes the target from read().
+        If we are killing ourselves, we know that we are not blocked.
+        We also know that we will check thd->killed before we go for
+        reading the next statement.
      */
      close_active_vio();
    }
 #endif
+    /* Mark the target thread's alarm request expired, and signal alarm. */
+    thr_alarm_kill(thread_id);
+    /* Send an event to the scheduler that a thread should be killed. */
+    if (!slave_thread)
+      MYSQL_CALLBACK(thread_scheduler, post_kill_notification, (this));
  }
+  /* Broadcast a condition to kick the target if it is waiting on it. */
  if (mysys_var)
  {
    mysql_mutex_lock(&mysys_var->mutex);
@@ -1232,6 +1266,11 @@ void THD::awake(THD::killed_state state_to_set)
      we issue a second KILL or the status it's waiting for happens).
      It's true that we have set its thd->killed but it may not
      see it immediately and so may have time to reach the cond_wait().
+      However, where possible, we test thd->killed once again after
+      enter_cond(). This should make the signaling as safe as possible.
+      Even so, there is still a small chance of failure on platforms with
+      instruction or memory write reordering.
    */
    if (mysys_var->current_cond && mysys_var->current_mutex)
    {

--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1953,6 +1953,12 @@ public:
  DYNAMIC_ARRAY user_var_events;        /* For user variables replication */
  MEM_ROOT      *user_var_events_alloc; /* Allocate above array elements here */
+  /*
+    If checking this in conjunction with a wait condition, please
+    include a check after enter_cond() if you want to avoid a race
+    condition. For details see the implementation of awake(),
+    especially the "broadcast" part.
+  */
  enum killed_state
  {
    NOT_KILLED=0,

--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -712,6 +712,22 @@ bool do_command(THD *thd)
  net_new_transaction(net);
+  /*
+    Synchronization point for testing of KILL_CONNECTION.
+    This sync point can wait here, to simulate slow code execution
+    between the last test of thd->killed and blocking in read().
+    The goal of this test is to verify that a connection does not
+    hang, if it is killed at this point of execution.
+    (Bug#37780 - main.kill fails randomly)
+    Note that the sync point wait itself will be terminated by a
+    kill. In this case it consumes a condition broadcast, but does
+    not change anything else. The consumed broadcast should not
+    matter here, because the read/recv() below doesn't use it.
+  */
+  DEBUG_SYNC(thd, "before_do_command_net_read");
  if ((packet_length= my_net_read(net)) == packet_error)
  {
    DBUG_PRINT("info",("Got error %d reading command from socket %s",

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -47,6 +47,7 @@
 #include "records.h"             // init_read_record, end_read_record
 #include "filesort.h"            // filesort_free_buffers
 #include "sql_union.h"           // mysql_union
+#include "debug_sync.h"          // DEBUG_SYNC
 #include <m_ctype.h>
 #include <my_bit.h>
 #include <hash.h>
@@ -852,6 +853,7 @@ JOIN::optimize()
  if (optimized)
    DBUG_RETURN(0);
  optimized= 1;
+  DEBUG_SYNC(thd, "before_join_optimize");
  thd_proc_info(thd, "optimizing");
  row_limit= ((select_distinct || order || group_list) ? HA_POS_ERROR :