Commit aa4b8939 authored by unknown's avatar unknown

Bug #45214 get_master_version_and_clock does not report error when queries fail

        
The "get_master_version_and_clock(...)" function in sql/slave.cc ignores 
error and passes directly when queries fail, or queries succeed 
but the result retrieved is empty.
  
The "get_master_version_and_clock(...)" function should try to reconnect master
if queries fail because of transient network problems, and fail otherwise.
The I/O thread should print a warning if the some system variables do not 
exist on master (very old master)

mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test:
  Added test file for bug #45214
mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result:
  Added test result for bug #45214
mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test:
  Added test file for bug #45214
sql/slave.cc:
  The 'is_network_error()' function is added for checking if the error is caused by network.
  Added a new return value (2) to 'get_master_version_and_clock()' function result set 
  to indicate transient network errors when queries fail, and the caller should 
  try to reconnect in this case.
parent 5f9440d2
#
# BUG#45214
# The common part of the "rpl_get_master_version_and_clock" test.
# Restart slave under network disconnection between slave and master
# following the steps:
# 1 - Got DBUG_SYNC_POINT lock
# 2 - Set DBUG_SYNC_POINT before call mysql_real_query(...) function in get_master_version_and_clock(...) function and hang here
# 3 - shutdown master server for simulating network disconnection
# 4 - Release DBUG_SYNC_POINT lock
# 5 - Check if the slave I/O thread tries to reconnect to master.
#
# Note: Please make sure initialize the $debug_lock when call the test script.
#
connection slave;
if (`SELECT '$debug_lock' = ''`)
{
--die Cannot continue. Please set value for $debug_lock.
}
# Restart slave
--disable_warnings
stop slave;
source include/wait_for_slave_to_stop.inc;
start slave;
source include/wait_for_slave_to_start.inc;
connection master;
# Write file to make mysql-test-run.pl expect the "crash", but don't start
# it until it's told to
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait
EOF
# Send shutdown to the connected server and give
# it 10 seconds to die before zapping it
shutdown_server 10;
connection slave;
eval SELECT RELEASE_LOCK($debug_lock);
# Show slave last IO errno
connection slave;
source include/wait_for_slave_io_error.inc;
let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
echo Slave_IO_Errno= $last_io_errno;
# Write file to make mysql-test-run.pl start up the server again
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
restart
EOF
connection master;
# Turn on reconnect
--enable_reconnect
# Call script that will poll the server waiting for it to be back online again
--source include/wait_until_connected_again.inc
# Turn off reconnect again
--disable_reconnect
connection slave;
source include/wait_for_slave_to_start.inc;
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
SELECT IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
1
SELECT GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000);
GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000)
1
set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
stop slave;
start slave;
SELECT RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
1
Slave_IO_Errno= 2013
SELECT IS_FREE_LOCK("debug_lock.before_get_SERVER_ID");
IS_FREE_LOCK("debug_lock.before_get_SERVER_ID")
1
SELECT GET_LOCK("debug_lock.before_get_SERVER_ID", 1000);
GET_LOCK("debug_lock.before_get_SERVER_ID", 1000)
1
set global debug= 'd,debug_lock.before_get_SERVER_ID';
stop slave;
start slave;
SELECT RELEASE_LOCK("debug_lock.before_get_SERVER_ID");
RELEASE_LOCK("debug_lock.before_get_SERVER_ID")
1
Slave_IO_Errno= 2013
set global debug= '';
reset master;
include/stop_slave.inc
change master to master_port=SLAVE_PORT;
start slave;
*** must be having the replicate-same-server-id IO thread error ***
Slave_IO_Errno= 1593
Slave_IO_Error= Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work (or the --replicate-same-server-id option must be used on slave but this does not always make sense; please check the manual before using it).
#
# BUG#45214
# This test verifies if the slave I/O tread tries to reconnect to
# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID,
# COLLATION_SERVER and TIME_ZONE from master under network disconnection.
# The COLLATION_SERVER and TIME_ZONE are got only on master server version 4.
# So they can't be verified by test case here.
# Finish the following tests by calling its common test script:
# extra/rpl_tests/rpl_get_master_version_and_clock.test.
# And meanwhile this test checks that the slave I/O thread refuses to start if slave
# and master have the same server id (because this is a useless setup,
# and otherwise SHOW SLAVE STATUS shows progress but all queries are
# ignored, which has caught our customers), unless
# --replicate-same-server-id.
#
source include/master-slave.inc;
#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under network disconnection
connection slave;
let $debug_saved= `select @@global.debug`;
let $debug_lock= "debug_lock.before_get_UNIX_TIMESTAMP";
eval SELECT IS_FREE_LOCK($debug_lock);
eval SELECT GET_LOCK($debug_lock, 1000);
set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
source extra/rpl_tests/rpl_get_master_version_and_clock.test;
#Test case 2: Try to get the value of the SERVER_ID from master under network disconnection
connection slave;
let $debug_lock= "debug_lock.before_get_SERVER_ID";
eval SELECT IS_FREE_LOCK($debug_lock);
eval SELECT GET_LOCK($debug_lock, 1000);
set global debug= 'd,debug_lock.before_get_SERVER_ID';
source extra/rpl_tests/rpl_get_master_version_and_clock.test;
eval set global debug= '$debug_saved';
#Test case 3: This test checks that the slave I/O thread refuses to start
#if slave and master have the same server id.
connection slave;
reset master;
# replicate ourselves
source include/stop_slave.inc;
--replace_result $SLAVE_MYPORT SLAVE_PORT
eval change master to master_port=$SLAVE_MYPORT;
start slave;
let $slave_param= Last_IO_Errno;
let $slave_param_value= 1593;
source include/wait_for_slave_param.inc;
--echo *** must be having the replicate-same-server-id IO thread error ***
let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
let $last_io_error= query_get_value("show slave status", Last_IO_Error, 1);
echo Slave_IO_Errno= $last_io_errno;
echo Slave_IO_Error= $last_io_error;
# End of tests
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include <my_dir.h> #include <my_dir.h>
#include <sql_common.h> #include <sql_common.h>
#include <errmsg.h> #include <errmsg.h>
#include <mysqld_error.h>
#include <mysys_err.h> #include <mysys_err.h>
#ifdef HAVE_REPLICATION #ifdef HAVE_REPLICATION
...@@ -859,6 +860,29 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val) ...@@ -859,6 +860,29 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val)
DBUG_RETURN(1); DBUG_RETURN(1);
} }
/*
Check if the error is caused by network.
@param[in] errorno Number of the error.
RETURNS:
TRUE network error
FALSE not network error
*/
bool is_network_error(uint errorno)
{
if (errorno == CR_CONNECTION_ERROR ||
errorno == CR_CONN_HOST_ERROR ||
errorno == CR_SERVER_GONE_ERROR ||
errorno == CR_SERVER_LOST ||
errorno == ER_CON_COUNT_ERROR ||
errorno == ER_SERVER_SHUTDOWN)
return TRUE;
return FALSE;
}
/* /*
Note that we rely on the master's version (3.23, 4.0.14 etc) instead of Note that we rely on the master's version (3.23, 4.0.14 etc) instead of
relying on the binlog's version. This is not perfect: imagine an upgrade relying on the binlog's version. This is not perfect: imagine an upgrade
...@@ -871,6 +895,7 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val) ...@@ -871,6 +895,7 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val)
RETURNS RETURNS
0 ok 0 ok
1 error 1 error
2 transient network problem, the caller should try to reconnect
*/ */
static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
...@@ -881,6 +906,7 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) ...@@ -881,6 +906,7 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
MYSQL_RES *master_res= 0; MYSQL_RES *master_res= 0;
MYSQL_ROW master_row; MYSQL_ROW master_row;
DBUG_ENTER("get_master_version_and_clock"); DBUG_ENTER("get_master_version_and_clock");
int query_re= 0;
/* /*
Free old description_event_for_queue (that is needed if we are in Free old description_event_for_queue (that is needed if we are in
...@@ -956,6 +982,8 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) ...@@ -956,6 +982,8 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
unavailable (very old master not supporting UNIX_TIMESTAMP()?). unavailable (very old master not supporting UNIX_TIMESTAMP()?).
*/ */
DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10);
master_res= NULL;
if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) && if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) &&
(master_res= mysql_store_result(mysql)) && (master_res= mysql_store_result(mysql)) &&
(master_row= mysql_fetch_row(master_res))) (master_row= mysql_fetch_row(master_res)))
...@@ -963,7 +991,13 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) ...@@ -963,7 +991,13 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
mi->clock_diff_with_master= mi->clock_diff_with_master=
(long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10)); (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10));
} }
else if (!check_io_slave_killed(mi->io_thd, mi, NULL)) else if (is_network_error(mysql_errno(mysql)))
{
mi->report(WARNING_LEVEL, mysql_errno(mysql),
"Get master clock failed with error: %s", mysql_error(mysql));
goto network_err;
}
else
{ {
mi->clock_diff_with_master= 0; /* The "most sensible" value */ mi->clock_diff_with_master= 0; /* The "most sensible" value */
sql_print_warning("\"SELECT UNIX_TIMESTAMP()\" failed on master, " sql_print_warning("\"SELECT UNIX_TIMESTAMP()\" failed on master, "
...@@ -972,7 +1006,10 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) ...@@ -972,7 +1006,10 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
mysql_error(mysql), mysql_errno(mysql)); mysql_error(mysql), mysql_errno(mysql));
} }
if (master_res) if (master_res)
{
mysql_free_result(master_res); mysql_free_result(master_res);
master_res= NULL;
}
/* /*
Check that the master's server id and ours are different. Because if they Check that the master's server id and ours are different. Because if they
...@@ -984,12 +1021,15 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) ...@@ -984,12 +1021,15 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
Note: we could have put a @@SERVER_ID in the previous SELECT Note: we could have put a @@SERVER_ID in the previous SELECT
UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters. UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters.
*/ */
DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10);
master_res= NULL;
master_row= NULL;
if (!mysql_real_query(mysql, if (!mysql_real_query(mysql,
STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) && STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) &&
(master_res= mysql_store_result(mysql))) (master_res= mysql_store_result(mysql)) &&
(master_row= mysql_fetch_row(master_res)))
{ {
if ((master_row= mysql_fetch_row(master_res)) && if ((::server_id == strtoul(master_row[1], 0, 10)) &&
(::server_id == strtoul(master_row[1], 0, 10)) &&
!mi->rli.replicate_same_server_id) !mi->rli.replicate_same_server_id)
{ {
errmsg= "The slave I/O thread stops because master and slave have equal \ errmsg= "The slave I/O thread stops because master and slave have equal \
...@@ -998,11 +1038,35 @@ the --replicate-same-server-id option must be used on slave but this does \ ...@@ -998,11 +1038,35 @@ the --replicate-same-server-id option must be used on slave but this does \
not always make sense; please check the manual before using it)."; not always make sense; please check the manual before using it).";
err_code= ER_SLAVE_FATAL_ERROR; err_code= ER_SLAVE_FATAL_ERROR;
sprintf(err_buff, ER(err_code), errmsg); sprintf(err_buff, ER(err_code), errmsg);
goto err;
} }
mysql_free_result(master_res); }
if (errmsg) else if (mysql_errno(mysql))
{
if (is_network_error(mysql_errno(mysql)))
{
mi->report(WARNING_LEVEL, mysql_errno(mysql),
"Get master SERVER_ID failed with error: %s", mysql_error(mysql));
goto network_err;
}
/* Fatal error */
errmsg= "The slave I/O thread stops because a fatal error is encountered \
when it try to get the value of SERVER_ID variable from master.";
err_code= mysql_errno(mysql);
sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
goto err; goto err;
} }
else if (!master_row && master_res)
{
mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE,
"Unknown system variable 'SERVER_ID' on master, \
maybe it is a *VERY OLD MASTER*.");
}
if (master_res)
{
mysql_free_result(master_res);
master_res= NULL;
}
/* /*
Check that the master's global character_set_server and ours are the same. Check that the master's global character_set_server and ours are the same.
...@@ -1025,24 +1089,51 @@ not always make sense; please check the manual before using it)."; ...@@ -1025,24 +1089,51 @@ not always make sense; please check the manual before using it).";
if (*mysql->server_version == '3') if (*mysql->server_version == '3')
goto err; goto err;
if ((*mysql->server_version == '4') && if (*mysql->server_version == '4')
!mysql_real_query(mysql, {
master_res= NULL;
if (!mysql_real_query(mysql,
STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) && STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) &&
(master_res= mysql_store_result(mysql))) (master_res= mysql_store_result(mysql)) &&
(master_row= mysql_fetch_row(master_res)))
{ {
if ((master_row= mysql_fetch_row(master_res)) && if (strcmp(master_row[0], global_system_variables.collation_server->name))
strcmp(master_row[0], global_system_variables.collation_server->name))
{ {
errmsg= "The slave I/O thread stops because master and slave have \ errmsg= "The slave I/O thread stops because master and slave have \
different values for the COLLATION_SERVER global variable. The values must \ different values for the COLLATION_SERVER global variable. The values must \
be equal for replication to work"; be equal for the Statement-format replication to work";
err_code= ER_SLAVE_FATAL_ERROR; err_code= ER_SLAVE_FATAL_ERROR;
sprintf(err_buff, ER(err_code), errmsg); sprintf(err_buff, ER(err_code), errmsg);
goto err;
} }
mysql_free_result(master_res); }
if (errmsg) else if (is_network_error(mysql_errno(mysql)))
{
mi->report(WARNING_LEVEL, mysql_errno(mysql),
"Get master COLLATION_SERVER failed with error: %s", mysql_error(mysql));
goto network_err;
}
else if (mysql_errno(mysql) != ER_UNKNOWN_SYSTEM_VARIABLE)
{
/* Fatal error */
errmsg= "The slave I/O thread stops because a fatal error is encountered \
when it try to get the value of COLLATION_SERVER global variable from master.";
err_code= mysql_errno(mysql);
sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
goto err; goto err;
} }
else
mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE,
"Unknown system variable 'COLLATION_SERVER' on master, \
maybe it is a *VERY OLD MASTER*. *NOTE*: slave may experience \
inconsistency if replicated data deals with collation.");
if (master_res)
{
mysql_free_result(master_res);
master_res= NULL;
}
}
/* /*
Perform analogous check for time zone. Theoretically we also should Perform analogous check for time zone. Theoretically we also should
...@@ -1059,35 +1150,62 @@ be equal for replication to work"; ...@@ -1059,35 +1150,62 @@ be equal for replication to work";
This check is only necessary for 4.x masters (and < 5.0.4 masters but This check is only necessary for 4.x masters (and < 5.0.4 masters but
those were alpha). those were alpha).
*/ */
if ((*mysql->server_version == '4') && if (*mysql->server_version == '4')
!mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) && {
(master_res= mysql_store_result(mysql))) master_res= NULL;
if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) &&
(master_res= mysql_store_result(mysql)) &&
(master_row= mysql_fetch_row(master_res)))
{ {
if ((master_row= mysql_fetch_row(master_res)) && if (strcmp(master_row[0],
strcmp(master_row[0],
global_system_variables.time_zone->get_name()->ptr())) global_system_variables.time_zone->get_name()->ptr()))
{ {
errmsg= "The slave I/O thread stops because master and slave have \ errmsg= "The slave I/O thread stops because master and slave have \
different values for the TIME_ZONE global variable. The values must \ different values for the TIME_ZONE global variable. The values must \
be equal for replication to work"; be equal for the Statement-format replication to work";
err_code= ER_SLAVE_FATAL_ERROR; err_code= ER_SLAVE_FATAL_ERROR;
sprintf(err_buff, ER(err_code), errmsg); sprintf(err_buff, ER(err_code), errmsg);
goto err;
} }
mysql_free_result(master_res); }
else if (is_network_error(mysql_errno(mysql)))
if (errmsg) {
mi->report(WARNING_LEVEL, mysql_errno(mysql),
"Get master TIME_ZONE failed with error: %s", mysql_error(mysql));
goto network_err;
}
else
{
/* Fatal error */
errmsg= "The slave I/O thread stops because a fatal error is encountered \
when it try to get the value of TIME_ZONE global variable from master.";
err_code= mysql_errno(mysql);
sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
goto err; goto err;
} }
if (master_res)
{
mysql_free_result(master_res);
master_res= NULL;
}
}
err: err:
if (errmsg) if (errmsg)
{ {
if (master_res)
mysql_free_result(master_res);
DBUG_ASSERT(err_code != 0); DBUG_ASSERT(err_code != 0);
mi->report(ERROR_LEVEL, err_code, err_buff); mi->report(ERROR_LEVEL, err_code, err_buff);
DBUG_RETURN(1); DBUG_RETURN(1);
} }
DBUG_RETURN(0); DBUG_RETURN(0);
network_err:
if (master_res)
mysql_free_result(master_res);
DBUG_RETURN(2);
} }
/* /*
...@@ -2374,6 +2492,7 @@ pthread_handler_t handle_slave_io(void *arg) ...@@ -2374,6 +2492,7 @@ pthread_handler_t handle_slave_io(void *arg)
char llbuff[22]; char llbuff[22];
uint retry_count; uint retry_count;
bool suppress_warnings; bool suppress_warnings;
int ret;
#ifndef DBUG_OFF #ifndef DBUG_OFF
uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0; uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
#endif #endif
...@@ -2454,8 +2573,23 @@ connected: ...@@ -2454,8 +2573,23 @@ connected:
mi->slave_running= MYSQL_SLAVE_RUN_CONNECT; mi->slave_running= MYSQL_SLAVE_RUN_CONNECT;
thd->slave_net = &mysql->net; thd->slave_net = &mysql->net;
thd_proc_info(thd, "Checking master version"); thd_proc_info(thd, "Checking master version");
if (get_master_version_and_clock(mysql, mi)) ret= get_master_version_and_clock(mysql, mi);
if (ret == 1)
/* Fatal error */
goto err;
if (ret == 2)
{
if (check_io_slave_killed(mi->io_thd, mi, "Slave I/O thread killed"
"while calling get_master_version_and_clock(...)"))
goto err; goto err;
suppress_warnings= FALSE;
/* Try to reconnect because the error was caused by a transient network problem */
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
reconnect_messages[SLAVE_RECON_ACT_REG]))
goto err;
goto connected;
}
if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1) if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment