Commit 0c9aaecc authored by Luis Soares's avatar Luis Soares

BUG#38826 Race in MYSQL_LOG::purge_logs is impossible to debug in production

BUG#39325 Server crash inside MYSQL_LOG::purge_first_log halts replicaiton

The patch reverses the order of the purging and updating events for log and
relay-log.info/index files respectively.

This solves the problem of having holes caused by crashes happening between updating
info/index files and purging logs.

This patch also contains an aditional test case for testing the crashing before purge logs.
      
NOTE1: This is a combined patch for BUG#38826 and BUG#39325. This patch is based on
bugteam tree and takes into account reviewers suggestions.

NOTE2: Merge from 5.0-bugteam
parents 3059c95c 5726574b
......@@ -6,7 +6,7 @@
let $counter= 500;
while ($mysql_errno)
{
--error 0,2002,2006
--error 0,2002,2006,2003
show status;
dec $counter;
......
......@@ -21,6 +21,7 @@ flush logs;
*** must be a warning master-bin.000001 was not found ***
Warnings:
Warning 1612 Being purged log MYSQLTEST_VARDIR/log/master-bin.000001 was not found
Warning 1612 Being purged log MYSQLTEST_VARDIR/log/master-bin.000001 was not found
*** must show one record, of the active binlog, left in the index file after PURGE ***
show binary logs;
Log_name File_size
......
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
stop slave;
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (id INT);
start slave;
SET GLOBAL debug= "+d,crash_before_purge_logs";
FLUSH LOGS;
ERROR HY000: Lost connection to MySQL server during query
start slave;
--source include/master-slave.inc
--source include/have_debug.inc
--disable_reconnect
# We have to sync with master, to ensure slave had time to start properly
# # before we stop it. If not, we get errors about UNIX_TIMESTAMP() in the
# log.
sync_slave_with_master;
stop slave;
--source include/wait_for_slave_to_stop.inc
# ON MASTER
connection master;
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
CREATE TABLE t1 (id INT);
let $1=100;
disable_query_log;
begin;
while ($1)
{
eval INSERT INTO t1 VALUES( $1 );
dec $1;
}
DROP TABLE t1;
save_master_pos;
enable_query_log;
## ON SLAVE
connection slave;
start slave;
--source include/wait_for_slave_to_start.inc
sync_with_master 0;
connection master;
save_master_pos;
connection slave;
--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/slave0.expect
SET GLOBAL debug= "+d,crash_before_purge_logs";
--error 2013
# try to rotate logs
FLUSH LOGS;
--enable_reconnect
--source include/wait_until_connected_again.inc
start slave;
--source include/wait_for_slave_to_start.inc
sync_with_master 0;
......@@ -2359,6 +2359,7 @@ MYSQL_BIN_LOG::MYSQL_BIN_LOG()
*/
index_file_name[0] = 0;
bzero((char*) &index_file, sizeof(index_file));
bzero((char*) &purge_temp, sizeof(purge_temp));
}
/* this is called only once */
......@@ -2953,6 +2954,7 @@ err:
int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
{
int error;
char *to_purge_if_included= NULL;
DBUG_ENTER("purge_first_log");
DBUG_ASSERT(is_open());
......@@ -2960,36 +2962,20 @@ int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
DBUG_ASSERT(!strcmp(rli->linfo.log_file_name,rli->event_relay_log_name));
pthread_mutex_lock(&LOCK_index);
pthread_mutex_lock(&rli->log_space_lock);
rli->relay_log.purge_logs(rli->group_relay_log_name, included,
0, 0, &rli->log_space_total);
// Tell the I/O thread to take the relay_log_space_limit into account
rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock);
/*
Ok to broadcast after the critical region as there is no risk of
the mutex being destroyed by this thread later - this helps save
context switches
*/
pthread_cond_broadcast(&rli->log_space_cond);
to_purge_if_included= my_strdup(rli->group_relay_log_name, MYF(0));
/*
Read the next log file name from the index file and pass it back to
the caller
If included is true, we want the first relay log;
otherwise we want the one after event_relay_log_name.
the caller.
*/
if ((included && (error=find_log_pos(&rli->linfo, NullS, 0))) ||
(!included &&
((error=find_log_pos(&rli->linfo, rli->event_relay_log_name, 0)) ||
(error=find_next_log(&rli->linfo, 0)))))
if((error=find_log_pos(&rli->linfo, rli->event_relay_log_name, 0)) ||
(error=find_next_log(&rli->linfo, 0)))
{
char buff[22];
sql_print_error("next log error: %d offset: %s log: %s included: %d",
error,
llstr(rli->linfo.index_file_offset,buff),
rli->group_relay_log_name,
rli->event_relay_log_name,
included);
goto err;
}
......@@ -3017,7 +3003,42 @@ int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
/* Store where we are in the new file for the execution thread */
flush_relay_log_info(rli);
DBUG_EXECUTE_IF("crash_before_purge_logs", abort(););
pthread_mutex_lock(&rli->log_space_lock);
rli->relay_log.purge_logs(to_purge_if_included, included,
0, 0, &rli->log_space_total);
// Tell the I/O thread to take the relay_log_space_limit into account
rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock);
/*
Ok to broadcast after the critical region as there is no risk of
the mutex being destroyed by this thread later - this helps save
context switches
*/
pthread_cond_broadcast(&rli->log_space_cond);
/*
* Need to update the log pos because purge logs has been called
* after fetching initially the log pos at the begining of the method.
*/
if(error=find_log_pos(&rli->linfo, rli->event_relay_log_name, 0))
{
char buff[22];
sql_print_error("next log error: %d offset: %s log: %s included: %d",
error,
llstr(rli->linfo.index_file_offset,buff),
rli->group_relay_log_name,
included);
goto err;
}
/* If included was passed, rli->linfo should be the first entry. */
DBUG_ASSERT(!included || rli->linfo.index_file_start_offset == 0);
err:
my_free(to_purge_if_included, MYF(0));
pthread_mutex_unlock(&LOCK_index);
DBUG_RETURN(error);
}
......@@ -3078,7 +3099,35 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log,
if (need_mutex)
pthread_mutex_lock(&LOCK_index);
if ((error=find_log_pos(&log_info, to_log, 0 /*no mutex*/)))
{
sql_print_error("MYSQL_LOG::purge_logs was called with file %s not "
"listed in the index.", to_log);
goto err;
}
/*
For crash recovery reasons the index needs to be updated before
any files are deleted. Move files to be deleted into a temp file
to be processed after the index is updated.
*/
if (!my_b_inited(&purge_temp))
{
if (error=open_cached_file(&purge_temp, mysql_tmpdir, TEMP_PREFIX,
DISK_BUFFER_SIZE, MYF(MY_WME)))
{
sql_print_error("MYSQL_LOG::purge_logs failed to open purge_temp");
goto err;
}
}
else
{
if (error=reinit_io_cache(&purge_temp, WRITE_CACHE, 0, 0, 1))
{
sql_print_error("MYSQL_LOG::purge_logs failed to reinit purge_temp "
"for write");
goto err;
}
}
/*
File name exists in index file; delete until we find this file
......@@ -3089,6 +3138,61 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log,
while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)) &&
!log_in_use(log_info.log_file_name))
{
if ((error=my_b_write(&purge_temp, (const uchar*)log_info.log_file_name,
strlen(log_info.log_file_name))) ||
(error=my_b_write(&purge_temp, (const uchar*)"\n", 1)))
{
sql_print_error("MYSQL_LOG::purge_logs failed to copy %s to purge_temp",
log_info.log_file_name);
goto err;
}
if (find_next_log(&log_info, 0) || exit_loop)
break;
}
/* We know how many files to delete. Update index file. */
if (error=update_log_index(&log_info, need_update_threads))
{
sql_print_error("MSYQL_LOG::purge_logs failed to update the index file");
goto err;
}
DBUG_EXECUTE_IF("crash_after_update_index", abort(););
/* Switch purge_temp for read. */
if (error=reinit_io_cache(&purge_temp, READ_CACHE, 0, 0, 0))
{
sql_print_error("MSYQL_LOG::purge_logs failed to reinit purge_temp "
"for read");
goto err;
}
/* Read each entry from purge_temp and delete the file. */
for (;;)
{
uint length;
if ((length=my_b_gets(&purge_temp, log_info.log_file_name,
FN_REFLEN)) <= 1)
{
if (purge_temp.error)
{
error= purge_temp.error;
sql_print_error("MSYQL_LOG::purge_logs error %d reading from "
"purge_temp", error);
goto err;
}
/* Reached EOF */
break;
}
/* Get rid of the trailing '\n' */
log_info.log_file_name[length-1]= 0;
ha_binlog_index_purge_file(current_thd, log_info.log_file_name);
MY_STAT s;
if (!my_stat(log_info.log_file_name, &s, MYF(0)))
{
......@@ -3189,20 +3293,6 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log,
}
}
}
ha_binlog_index_purge_file(current_thd, log_info.log_file_name);
if (find_next_log(&log_info, 0) || exit_loop)
break;
}
/*
If we get killed -9 here, the sysadmin would have to edit
the log index file after restart - otherwise, this should be safe
*/
error= update_log_index(&log_info, need_update_threads);
if (error == 0) {
error = ret;
}
err:
......@@ -3216,7 +3306,7 @@ err:
index file.
@param thd Thread pointer
@param before_date Delete all log files before given date.
@param purge_time Delete all log files before given date.
@note
If any of the logs before the deleted one is in use,
......@@ -3233,6 +3323,7 @@ err:
int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
{
int error;
char to_log[FN_REFLEN];
LOG_INFO log_info;
MY_STAT stat_area;
THD *thd= current_thd;
......@@ -3240,12 +3331,8 @@ int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
DBUG_ENTER("purge_logs_before_date");
pthread_mutex_lock(&LOCK_index);
to_log[0]= 0;
/*
Delete until we find curren file
or a file that is used or a file
that is older than purge_time.
*/
if ((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/)))
goto err;
......@@ -3295,55 +3382,18 @@ int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
}
else
{
if (stat_area.st_mtime >= purge_time)
break;
if (my_delete(log_info.log_file_name, MYF(0)))
{
if (my_errno == ENOENT)
{
/* It's not fatal even if we can't delete a log file */
if (thd)
{
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
ER_LOG_PURGE_NO_FILE, ER(ER_LOG_PURGE_NO_FILE),
log_info.log_file_name);
}
sql_print_information("Failed to delete file '%s'",
log_info.log_file_name);
my_errno= 0;
}
else
{
if (thd)
{
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_ERROR,
ER_BINLOG_PURGE_FATAL_ERR,
"a problem with deleting %s; "
"consider examining correspondence "
"of your binlog index file "
"to the actual binlog files",
log_info.log_file_name);
}
if (stat_area.st_mtime < purge_time)
strmake(to_log,
log_info.log_file_name,
sizeof(log_info.log_file_name));
else
{
sql_print_information("Failed to delete log file '%s'",
log_info.log_file_name);
}
error= LOG_INFO_FATAL;
goto err;
}
}
ha_binlog_index_purge_file(current_thd, log_info.log_file_name);
break;
}
if (find_next_log(&log_info, 0))
break;
}
/*
If we get killed -9 here, the sysadmin would have to edit
the log index file after restart - otherwise, this should be safe
*/
error= update_log_index(&log_info, 1);
error= (to_log[0] ? purge_logs(to_log, 1, 0, 1, (ulonglong *) 0) : 0);
err:
pthread_mutex_unlock(&LOCK_index);
......
......@@ -233,6 +233,13 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
pthread_cond_t update_cond;
ulonglong bytes_written;
IO_CACHE index_file;
/*
purge_temp is a temp file used in purge_logs so that the index file
can be updated before deleting files from disk, yielding better crash
recovery. It is created on demand the first time purge_logs is called
and then reused for subsequent calls. It is cleaned up in cleanup().
*/
IO_CACHE purge_temp;
char index_file_name[FN_REFLEN];
/*
The max size before rotation (usable only if log_type == LOG_BIN: binary
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment