Commit be569826 authored by Jan Lindström

MDEV-15607: mysqld crashed shortly after the node joined with SST

This is a typical systemd response where it tries to shut down the
joiner (due to "timeout") before the joiner manages to complete SST.

wsrep_sst_wait
wsrep_SE_init_wait
	While waiting for the operation to finish, use mysql_cond_timedwait
	instead of mysql_cond_wait and, if the operation has not finished,
	extend the systemd timeout (if needed).
parent c6392d52
...@@ -30,6 +30,10 @@ ...@@ -30,6 +30,10 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#if MYSQL_VERSION_ID < 100200
# include <my_service_manager.h>
#endif
static char wsrep_defaults_file[FN_REFLEN * 2 + 10 + 30 + static char wsrep_defaults_file[FN_REFLEN * 2 + 10 + 30 +
sizeof(WSREP_SST_OPT_CONF) + sizeof(WSREP_SST_OPT_CONF) +
sizeof(WSREP_SST_OPT_CONF_SUFFIX) + sizeof(WSREP_SST_OPT_CONF_SUFFIX) +
...@@ -186,6 +190,9 @@ bool wsrep_before_SE() ...@@ -186,6 +190,9 @@ bool wsrep_before_SE()
static bool sst_complete = false; static bool sst_complete = false;
static bool sst_needed = false; static bool sst_needed = false;
#define WSREP_EXTEND_TIMEOUT_INTERVAL 30
#define WSREP_TIMEDWAIT_SECONDS 10
void wsrep_sst_grab () void wsrep_sst_grab ()
{ {
WSREP_INFO("wsrep_sst_grab()"); WSREP_INFO("wsrep_sst_grab()");
...@@ -197,11 +204,25 @@ void wsrep_sst_grab () ...@@ -197,11 +204,25 @@ void wsrep_sst_grab ()
// Wait for end of SST // Wait for end of SST
bool wsrep_sst_wait () bool wsrep_sst_wait ()
{ {
if (mysql_mutex_lock (&LOCK_wsrep_sst)) abort(); struct timespec wtime = {WSREP_TIMEDWAIT_SECONDS, 0};
uint32 total_wtime = 0;
if (mysql_mutex_lock (&LOCK_wsrep_sst))
abort();
WSREP_INFO("Waiting for SST to complete.");
while (!sst_complete) while (!sst_complete)
{ {
WSREP_INFO("Waiting for SST to complete."); mysql_cond_timedwait (&COND_wsrep_sst, &LOCK_wsrep_sst, &wtime);
mysql_cond_wait (&COND_wsrep_sst, &LOCK_wsrep_sst);
if (!sst_complete)
{
total_wtime += wtime.tv_sec;
WSREP_DEBUG("Waiting for SST to complete. waited %u secs.", total_wtime);
service_manager_extend_timeout(WSREP_EXTEND_TIMEOUT_INTERVAL,
"WSREP state transfer ongoing, current seqno: %ld", local_seqno);
}
} }
if (local_seqno >= 0) if (local_seqno >= 0)
...@@ -1298,10 +1319,22 @@ void wsrep_SE_init_grab() ...@@ -1298,10 +1319,22 @@ void wsrep_SE_init_grab()
void wsrep_SE_init_wait() void wsrep_SE_init_wait()
{ {
struct timespec wtime = {WSREP_TIMEDWAIT_SECONDS, 0};
uint32 total_wtime=0;
while (SE_initialized == false) while (SE_initialized == false)
{ {
mysql_cond_wait (&COND_wsrep_sst_init, &LOCK_wsrep_sst_init); mysql_cond_timedwait (&COND_wsrep_sst_init, &LOCK_wsrep_sst_init, &wtime);
if (!SE_initialized)
{
total_wtime += wtime.tv_sec;
WSREP_DEBUG("Waiting for SST to complete. waited %u secs.", total_wtime);
service_manager_extend_timeout(WSREP_EXTEND_TIMEOUT_INTERVAL,
"WSREP SE initialization ongoing.");
} }
}
mysql_mutex_unlock (&LOCK_wsrep_sst_init); mysql_mutex_unlock (&LOCK_wsrep_sst_init);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment