Commit 55aab082 authored by Luis Soares's avatar Luis Soares

BUG#50364: FLUSH LOGS crashes the server (rpl.rpl_heartbeat_basic

fails in PB sporadically)
      
The IO thread can concurrently access the relay log IO_CACHE
while another thread is performing an FLUSH LOGS procedure.
      
FLUSH LOGS closes and reopens the relay log and while doing so it
(re)initializes its IO_CACHE. During this procedure the IO_CACHE
mutex is also reinitialized, which can cause problems if some
other thread (namely the IO THREAD) is concurrently accessing it
at the time .
      
This patch fixes the problem by extending the interface of the
flush_master_info function to also include a second paramater, 
"need_relay_log_lock", stating whether the thread should grab the 
relay log lock or not before actually flushing the relay log. 
Also, IO thread now calls flush_master_info with this flag set 
when it flushes master info with in the event read_event loop.

Finally, we also increase loop time in rpl_heartbeat_basic test 
case, so that the number of calls to flush logs doubles, stressing
this part of the code a little more.

mysql-test/suite/rpl/t/rpl_heartbeat_basic.test:
  Doubled the number of iterations on the FLUSH LOGS loop by
  doubling the time available to perform all iterations.
sql/repl_failsafe.cc:
  Updating flush_master_info call so that it uses two parameters
  instead of one.
sql/rpl_mi.cc:
  Updating flush_master_info call so that it uses two parameters
  instead of one.
sql/rpl_mi.h:
  Changed flush_master_info interface. Now takes a second parameter
  instead of just one. The second parameter is: need_lock_relay_log.
sql/rpl_rli.cc:
  Small fix in comment.
sql/slave.cc:
  Updating flush_master_info call so that it uses two parameters
  instead of one.
sql/sql_repl.cc:
  Updating flush_master_info call so that it uses two parameters
  instead of one.
parent 54fc7997
...@@ -382,7 +382,7 @@ let $slave_param_comparison= =; ...@@ -382,7 +382,7 @@ let $slave_param_comparison= =;
let $rcvd_heartbeats_before= query_get_value(SHOW STATUS LIKE 'slave_received_heartbeats', Value, 1); let $rcvd_heartbeats_before= query_get_value(SHOW STATUS LIKE 'slave_received_heartbeats', Value, 1);
# Flush logs every 0.1 second during 5 sec # Flush logs every 0.1 second during 5 sec
--disable_query_log --disable_query_log
let $i=50; let $i=100;
while ($i) { while ($i) {
FLUSH LOGS; FLUSH LOGS;
dec $i; dec $i;
......
...@@ -977,7 +977,7 @@ bool load_master_data(THD* thd) ...@@ -977,7 +977,7 @@ bool load_master_data(THD* thd)
host was specified; there could have been a problem when replication host was specified; there could have been a problem when replication
started, which led to relay log's IO_CACHE to not be inited. started, which led to relay log's IO_CACHE to not be inited.
*/ */
if (flush_master_info(active_mi, 0)) if (flush_master_info(active_mi, FALSE, FALSE))
sql_print_error("Failed to flush master info file"); sql_print_error("Failed to flush master info file");
} }
mysql_free_result(master_status_res); mysql_free_result(master_status_res);
......
...@@ -387,7 +387,7 @@ file '%s')", fname); ...@@ -387,7 +387,7 @@ file '%s')", fname);
mi->rli.is_relay_log_recovery= FALSE; mi->rli.is_relay_log_recovery= FALSE;
// now change cache READ -> WRITE - must do this before flush_master_info // now change cache READ -> WRITE - must do this before flush_master_info
reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1); reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1);
if ((error=test(flush_master_info(mi, 1)))) if ((error=test(flush_master_info(mi, TRUE, TRUE))))
sql_print_error("Failed to flush master info file"); sql_print_error("Failed to flush master info file");
pthread_mutex_unlock(&mi->data_lock); pthread_mutex_unlock(&mi->data_lock);
DBUG_RETURN(error); DBUG_RETURN(error);
...@@ -413,7 +413,9 @@ err: ...@@ -413,7 +413,9 @@ err:
1 - flush master info failed 1 - flush master info failed
0 - all ok 0 - all ok
*/ */
int flush_master_info(Master_info* mi, bool flush_relay_log_cache) int flush_master_info(Master_info* mi,
bool flush_relay_log_cache,
bool need_lock_relay_log)
{ {
IO_CACHE* file = &mi->file; IO_CACHE* file = &mi->file;
char lbuf[22]; char lbuf[22];
...@@ -436,8 +438,19 @@ int flush_master_info(Master_info* mi, bool flush_relay_log_cache) ...@@ -436,8 +438,19 @@ int flush_master_info(Master_info* mi, bool flush_relay_log_cache)
*/ */
if (flush_relay_log_cache) if (flush_relay_log_cache)
{ {
pthread_mutex_t *log_lock= mi->rli.relay_log.get_log_lock();
IO_CACHE *log_file= mi->rli.relay_log.get_log_file(); IO_CACHE *log_file= mi->rli.relay_log.get_log_file();
if (flush_io_cache(log_file))
if (need_lock_relay_log)
pthread_mutex_lock(log_lock);
safe_mutex_assert_owner(log_lock);
err= flush_io_cache(log_file);
if (need_lock_relay_log)
pthread_mutex_unlock(log_lock);
if (err)
DBUG_RETURN(2); DBUG_RETURN(2);
} }
......
...@@ -120,7 +120,9 @@ int init_master_info(Master_info* mi, const char* master_info_fname, ...@@ -120,7 +120,9 @@ int init_master_info(Master_info* mi, const char* master_info_fname,
bool abort_if_no_master_info_file, bool abort_if_no_master_info_file,
int thread_mask); int thread_mask);
void end_master_info(Master_info* mi); void end_master_info(Master_info* mi);
int flush_master_info(Master_info* mi, bool flush_relay_log_cache); int flush_master_info(Master_info* mi,
bool flush_relay_log_cache,
bool need_lock_relay_log);
int change_master_server_id_cmp(ulong *id1, ulong *id2); int change_master_server_id_cmp(ulong *id1, ulong *id2);
#endif /* HAVE_REPLICATION */ #endif /* HAVE_REPLICATION */
......
...@@ -121,7 +121,7 @@ int init_relay_log_info(Relay_log_info* rli, ...@@ -121,7 +121,7 @@ int init_relay_log_info(Relay_log_info* rli,
/* /*
The relay log will now be opened, as a SEQ_READ_APPEND IO_CACHE. The relay log will now be opened, as a SEQ_READ_APPEND IO_CACHE.
Note that the I/O thread flushes it to disk after writing every Note that the I/O thread flushes it to disk after writing every
event, in flush_master_info(mi, 1). event, in flush_master_info(mi, 1, ?).
*/ */
/* /*
......
...@@ -1726,7 +1726,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi) ...@@ -1726,7 +1726,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi)
" to the relay log, SHOW SLAVE STATUS may be" " to the relay log, SHOW SLAVE STATUS may be"
" inaccurate"); " inaccurate");
rli->relay_log.harvest_bytes_written(&rli->log_space_total); rli->relay_log.harvest_bytes_written(&rli->log_space_total);
if (flush_master_info(mi, 1)) if (flush_master_info(mi, TRUE, FALSE))
sql_print_error("Failed to flush master info file"); sql_print_error("Failed to flush master info file");
delete ev; delete ev;
} }
...@@ -3047,7 +3047,7 @@ Stopping slave I/O thread due to out-of-memory error from master"); ...@@ -3047,7 +3047,7 @@ Stopping slave I/O thread due to out-of-memory error from master");
goto err; goto err;
} }
if (flush_master_info(mi, 1)) if (flush_master_info(mi, TRUE, TRUE))
{ {
sql_print_error("Failed to flush master info file"); sql_print_error("Failed to flush master info file");
goto err; goto err;
......
...@@ -1532,7 +1532,7 @@ bool change_master(THD* thd, Master_info* mi) ...@@ -1532,7 +1532,7 @@ bool change_master(THD* thd, Master_info* mi)
Relay log's IO_CACHE may not be inited, if rli->inited==0 (server was never Relay log's IO_CACHE may not be inited, if rli->inited==0 (server was never
a slave before). a slave before).
*/ */
if (flush_master_info(mi, 0)) if (flush_master_info(mi, FALSE, FALSE))
{ {
my_error(ER_RELAY_LOG_INIT, MYF(0), "Failed to flush master info file"); my_error(ER_RELAY_LOG_INIT, MYF(0), "Failed to flush master info file");
ret= TRUE; ret= TRUE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment