Commit 9682ff8a authored by Alfranio Correia

BUG#43075 rpl.rpl_sync fails sporadically on pushbuild

NOTE: Backporting the patch to next-mr.
      
The slave was crashing while failing to execute the init_slave() function.
      
The issue has two distinct causes:
      
1 - A failure while allocating the master info structure generated a
    segfault due to a NULL pointer.
      
2 - A failure during recovery generated a segfault due to an uninitialized
    relay log file. In other words, mi->inited and rli->inited were both set to true
    before the recovery process was executed, creating an inconsistent state because
    the relay log file was not initialized.
      
To circumvent these problems, we refactored the recovery process, which is now executed
while initializing the relay log. The patch ensures that the master info structure is
created before it is accessed and that any error is propagated, so that mi->inited
and rli->inited are not set to true when, for instance, the relay log is not initialized
or the relay info is not flushed.
      
The changes related to the refactoring are described below:
      
1 - Removed call to init_recovery from init_slave.
      
2 - Changed the signature of the function init_recovery.
      
3 - Removed flushes. They are called while initializing the relay log and master
    info.
      
4 - Made sure that if the relay info is not flushed, mi->inited and rli->inited are not
    set to true.
      
In this patch, we also replaced the exit(1) in the fault injection with DBUG_ABORT()
to make it compliant with the coding guidelines.
parent 9581d628
...@@ -310,6 +310,7 @@ file '%s')", fname); ...@@ -310,6 +310,7 @@ file '%s')", fname);
goto err; goto err;
mi->inited = 1; mi->inited = 1;
mi->rli.is_relay_log_recovery= FALSE;
// now change cache READ -> WRITE - must do this before flush_master_info // now change cache READ -> WRITE - must do this before flush_master_info
reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1); reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1);
if ((error=test(flush_master_info(mi, 1)))) if ((error=test(flush_master_info(mi, 1))))
......
...@@ -259,8 +259,10 @@ Failed to open the existing relay log info file '%s' (errno %d)", ...@@ -259,8 +259,10 @@ Failed to open the existing relay log info file '%s' (errno %d)",
rli->group_relay_log_pos= rli->event_relay_log_pos= relay_log_pos; rli->group_relay_log_pos= rli->event_relay_log_pos= relay_log_pos;
rli->group_master_log_pos= master_log_pos; rli->group_master_log_pos= master_log_pos;
if (!rli->is_relay_log_recovery && if (rli->is_relay_log_recovery && init_recovery(rli->mi, &msg))
init_relay_log_pos(rli, goto err;
if (init_relay_log_pos(rli,
rli->group_relay_log_name, rli->group_relay_log_name,
rli->group_relay_log_pos, rli->group_relay_log_pos,
0 /* no data lock*/, 0 /* no data lock*/,
...@@ -275,7 +277,6 @@ Failed to open the existing relay log info file '%s' (errno %d)", ...@@ -275,7 +277,6 @@ Failed to open the existing relay log info file '%s' (errno %d)",
} }
#ifndef DBUG_OFF #ifndef DBUG_OFF
if (!rli->is_relay_log_recovery)
{ {
char llbuf1[22], llbuf2[22]; char llbuf1[22], llbuf2[22];
DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s rli->event_relay_log_pos=%s", DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s rli->event_relay_log_pos=%s",
...@@ -292,7 +293,10 @@ Failed to open the existing relay log info file '%s' (errno %d)", ...@@ -292,7 +293,10 @@ Failed to open the existing relay log info file '%s' (errno %d)",
*/ */
reinit_io_cache(&rli->info_file, WRITE_CACHE,0L,0,1); reinit_io_cache(&rli->info_file, WRITE_CACHE,0L,0,1);
if ((error= flush_relay_log_info(rli))) if ((error= flush_relay_log_info(rli)))
sql_print_error("Failed to flush relay log info file"); {
msg= "Failed to flush relay log info file";
goto err;
}
if (count_relay_log_space(rli)) if (count_relay_log_space(rli))
{ {
msg="Error counting relay log space"; msg="Error counting relay log space";
......
...@@ -129,7 +129,6 @@ static bool wait_for_relay_log_space(Relay_log_info* rli); ...@@ -129,7 +129,6 @@ static bool wait_for_relay_log_space(Relay_log_info* rli);
static inline bool io_slave_killed(THD* thd,Master_info* mi); static inline bool io_slave_killed(THD* thd,Master_info* mi);
static inline bool sql_slave_killed(THD* thd,Relay_log_info* rli); static inline bool sql_slave_killed(THD* thd,Relay_log_info* rli);
static int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type); static int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type);
static int init_recovery(Master_info* mi);
static void print_slave_skip_errors(void); static void print_slave_skip_errors(void);
static int safe_connect(THD* thd, MYSQL* mysql, Master_info* mi); static int safe_connect(THD* thd, MYSQL* mysql, Master_info* mi);
static int safe_reconnect(THD* thd, MYSQL* mysql, Master_info* mi, static int safe_reconnect(THD* thd, MYSQL* mysql, Master_info* mi,
...@@ -264,12 +263,6 @@ int init_slave() ...@@ -264,12 +263,6 @@ int init_slave()
goto err; goto err;
} }
if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi))
{
error= 1;
goto err;
}
if (server_id && !master_host && active_mi->host[0]) if (server_id && !master_host && active_mi->host[0])
master_host= active_mi->host; master_host= active_mi->host;
...@@ -291,7 +284,6 @@ int init_slave() ...@@ -291,7 +284,6 @@ int init_slave()
} }
err: err:
active_mi->rli.is_relay_log_recovery= FALSE;
pthread_mutex_unlock(&LOCK_active_mi); pthread_mutex_unlock(&LOCK_active_mi);
DBUG_RETURN(error); DBUG_RETURN(error);
} }
...@@ -323,9 +315,8 @@ err: ...@@ -323,9 +315,8 @@ err:
If there is an error, it returns (1), otherwise returns (0). If there is an error, it returns (1), otherwise returns (0).
*/ */
static int init_recovery(Master_info* mi) int init_recovery(Master_info* mi, const char** errmsg)
{ {
const char *errmsg= 0;
DBUG_ENTER("init_recovery"); DBUG_ENTER("init_recovery");
Relay_log_info *rli= &mi->rli; Relay_log_info *rli= &mi->rli;
...@@ -345,26 +336,8 @@ static int init_recovery(Master_info* mi) ...@@ -345,26 +336,8 @@ static int init_recovery(Master_info* mi)
sizeof(mi->rli.event_relay_log_name)-1); sizeof(mi->rli.event_relay_log_name)-1);
rli->group_relay_log_pos= rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE; rli->group_relay_log_pos= rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
if (init_relay_log_pos(rli,
rli->group_relay_log_name,
rli->group_relay_log_pos,
0 /*no data lock*/,
&errmsg, 0))
DBUG_RETURN(1);
if (flush_master_info(mi, 0))
{
sql_print_error("Failed to flush master info file");
DBUG_RETURN(1);
}
if (flush_relay_log_info(rli))
{
sql_print_error("Failed to flush relay info file");
DBUG_RETURN(1);
}
} }
DBUG_RETURN(0); DBUG_RETURN(0);
} }
......
...@@ -134,6 +134,7 @@ extern ulonglong relay_log_space_limit; ...@@ -134,6 +134,7 @@ extern ulonglong relay_log_space_limit;
#define SLAVE_FORCE_ALL 4 #define SLAVE_FORCE_ALL 4
int init_slave(); int init_slave();
int init_recovery(Master_info* mi, const char** errmsg);
void init_slave_skip_errors(const char* arg); void init_slave_skip_errors(const char* arg);
bool flush_relay_log_info(Relay_log_info* rli); bool flush_relay_log_info(Relay_log_info* rli);
int register_slave_on_master(MYSQL* mysql); int register_slave_on_master(MYSQL* mysql);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment