Commit 90cac28e authored by unknown's avatar unknown

Bug#32025 ndb_waiter does too many roundtrips to ndb_mgmd


ndb/tools/waiter.cpp:
  - Only contact ndb_mgmd once per loop
  - Program only cares about ndbd nodes -> remove the api and mgm vectors
  - Program can not wait for "starting" -> remove that code
  - Remove unused includes
  - Protect against SIGPIPE(writing to a socket where the mgmsrv was
    gone, silently killed the program)
  - Don't sleep one second if if all nodes are in the wanted state  
  - Use 100 milliseconds sleep between each poll
parent de9fe35c
...@@ -21,13 +21,11 @@ ...@@ -21,13 +21,11 @@
#include <NdbMain.h> #include <NdbMain.h>
#include <NdbOut.hpp> #include <NdbOut.hpp>
#include <NdbSleep.h> #include <NdbSleep.h>
#include <kernel/ndb_limits.h>
#include <NDBT.hpp> #include <NDBT.hpp>
int static int
waitClusterStatus(const char* _addr, ndb_mgm_node_status _status, waitClusterStatus(const char* _addr, ndb_mgm_node_status _status);
unsigned int _timeout);
enum ndb_waiter_options { enum ndb_waiter_options {
OPT_WAIT_STATUS_NOT_STARTED = NDB_STD_OPTIONS_LAST, OPT_WAIT_STATUS_NOT_STARTED = NDB_STD_OPTIONS_LAST,
...@@ -55,12 +53,13 @@ static struct my_option my_long_options[] = ...@@ -55,12 +53,13 @@ static struct my_option my_long_options[] =
"Wait for cluster to enter single user mode", "Wait for cluster to enter single user mode",
(gptr*) &_single_user, (gptr*) &_single_user, 0, (gptr*) &_single_user, (gptr*) &_single_user, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 }, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 },
{ "timeout", 't', "Timeout to wait", { "timeout", 't', "Timeout to wait in seconds",
(gptr*) &_timeout, (gptr*) &_timeout, 0, (gptr*) &_timeout, (gptr*) &_timeout, 0,
GET_INT, REQUIRED_ARG, 120, 0, 0, 0, 0, 0 }, GET_INT, REQUIRED_ARG, 120, 0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
}; };
static void usage() static void usage()
{ {
ndb_std_print_version(); ndb_std_print_version();
...@@ -70,16 +69,18 @@ static void usage() ...@@ -70,16 +69,18 @@ static void usage()
my_print_variables(my_long_options); my_print_variables(my_long_options);
} }
int main(int argc, char** argv){ int main(int argc, char** argv){
NDB_INIT(argv[0]); NDB_INIT(argv[0]);
load_defaults("my",load_default_groups,&argc,&argv); load_defaults("my",load_default_groups,&argc,&argv);
const char* _hostName = NULL; const char* _hostName = NULL;
int ho_error;
#ifndef DBUG_OFF #ifndef DBUG_OFF
opt_debug= "d:t:O,/tmp/ndb_waiter.trace"; opt_debug= "d:t:O,/tmp/ndb_waiter.trace";
#endif #endif
if ((ho_error=handle_options(&argc, &argv, my_long_options,
ndb_std_get_one_option))) if (handle_options(&argc, &argv, my_long_options,
ndb_std_get_one_option))
return NDBT_ProgramExit(NDBT_WRONGARGS); return NDBT_ProgramExit(NDBT_WRONGARGS);
_hostName = argv[0]; _hostName = argv[0];
...@@ -105,7 +106,7 @@ int main(int argc, char** argv){ ...@@ -105,7 +106,7 @@ int main(int argc, char** argv){
wait_status= NDB_MGM_NODE_STATUS_STARTED; wait_status= NDB_MGM_NODE_STATUS_STARTED;
} }
if (waitClusterStatus(_hostName, wait_status, _timeout) != 0) if (waitClusterStatus(_hostName, wait_status) != 0)
return NDBT_ProgramExit(NDBT_FAILED); return NDBT_ProgramExit(NDBT_FAILED);
return NDBT_ProgramExit(NDBT_OK); return NDBT_ProgramExit(NDBT_OK);
} }
...@@ -118,8 +119,6 @@ int main(int argc, char** argv){ ...@@ -118,8 +119,6 @@ int main(int argc, char** argv){
NdbMgmHandle handle= NULL; NdbMgmHandle handle= NULL;
Vector<ndb_mgm_node_state> ndbNodes; Vector<ndb_mgm_node_state> ndbNodes;
Vector<ndb_mgm_node_state> mgmNodes;
Vector<ndb_mgm_node_state> apiNodes;
int int
getStatus(){ getStatus(){
...@@ -128,8 +127,6 @@ getStatus(){ ...@@ -128,8 +127,6 @@ getStatus(){
struct ndb_mgm_node_state * node; struct ndb_mgm_node_state * node;
ndbNodes.clear(); ndbNodes.clear();
mgmNodes.clear();
apiNodes.clear();
while(retries < 10){ while(retries < 10){
status = ndb_mgm_get_status(handle); status = ndb_mgm_get_status(handle);
...@@ -153,18 +150,16 @@ getStatus(){ ...@@ -153,18 +150,16 @@ getStatus(){
ndbNodes.push_back(*node); ndbNodes.push_back(*node);
break; break;
case NDB_MGM_NODE_TYPE_MGM: case NDB_MGM_NODE_TYPE_MGM:
mgmNodes.push_back(*node); /* Don't care about MGM nodes */
break; break;
case NDB_MGM_NODE_TYPE_API: case NDB_MGM_NODE_TYPE_API:
apiNodes.push_back(*node); /* Don't care about API nodes */
break; break;
default: default:
if(node->node_status == NDB_MGM_NODE_STATUS_UNKNOWN || if(node->node_status == NDB_MGM_NODE_STATUS_UNKNOWN ||
node->node_status == NDB_MGM_NODE_STATUS_NO_CONTACT){ node->node_status == NDB_MGM_NODE_STATUS_NO_CONTACT){
retries++; retries++;
ndbNodes.clear(); ndbNodes.clear();
mgmNodes.clear();
apiNodes.clear();
free(status); free(status);
status = NULL; status = NULL;
count = 0; count = 0;
...@@ -183,24 +178,22 @@ getStatus(){ ...@@ -183,24 +178,22 @@ getStatus(){
free(status); free(status);
return 0; return 0;
} }
g_err << "getStatus failed" << endl;
return -1; return -1;
} }
int static int
waitClusterStatus(const char* _addr, waitClusterStatus(const char* _addr,
ndb_mgm_node_status _status, ndb_mgm_node_status _status)
unsigned int _timeout)
{ {
int _startphase = -1; int _startphase = -1;
int _nodes[MAX_NDB_NODES]; /* Ignore SIGPIPE */
int _num_nodes = 0; signal(SIGPIPE, SIG_IGN);
handle = ndb_mgm_create_handle(); handle = ndb_mgm_create_handle();
if (handle == NULL){ if (handle == NULL){
g_err << "handle == NULL" << endl; g_err << "Could not create ndb_mgm handle" << endl;
return -1; return -1;
} }
g_info << "Connecting to mgmsrv at " << _addr << endl; g_info << "Connecting to mgmsrv at " << _addr << endl;
...@@ -216,19 +209,11 @@ waitClusterStatus(const char* _addr, ...@@ -216,19 +209,11 @@ waitClusterStatus(const char* _addr,
return -1; return -1;
} }
if (getStatus() != 0) int attempts = 0;
return -1; int resetAttempts = 0;
const int MAX_RESET_ATTEMPTS = 10;
// Collect all nodes into nodes bool allInState = false;
for (size_t i = 0; i < ndbNodes.size(); i++){ int timeout_ms= _timeout * 10; /* In number of 100 milliseconds */
_nodes[i] = ndbNodes[i].node_id;
_num_nodes++;
}
unsigned int attempts = 0;
unsigned int resetAttempts = 0;
const unsigned int MAX_RESET_ATTEMPTS = 10;
bool allInState = false;
while (allInState == false){ while (allInState == false){
if (_timeout > 0 && attempts > _timeout){ if (_timeout > 0 && attempts > _timeout){
/** /**
...@@ -236,8 +221,8 @@ waitClusterStatus(const char* _addr, ...@@ -236,8 +221,8 @@ waitClusterStatus(const char* _addr,
* the state we want * the state we want
*/ */
bool waitMore = false; bool waitMore = false;
/** /**
* Make special check if we are waiting for * Make special check if we are waiting for
* cluster to become started * cluster to become started
*/ */
if(_status == NDB_MGM_NODE_STATUS_STARTED){ if(_status == NDB_MGM_NODE_STATUS_STARTED){
...@@ -252,7 +237,7 @@ waitClusterStatus(const char* _addr, ...@@ -252,7 +237,7 @@ waitClusterStatus(const char* _addr,
waitMore = false; waitMore = false;
} }
} }
if (!waitMore || resetAttempts > MAX_RESET_ATTEMPTS){ if (!waitMore || resetAttempts > MAX_RESET_ATTEMPTS){
g_err << "waitNodeState(" g_err << "waitNodeState("
...@@ -260,7 +245,7 @@ waitClusterStatus(const char* _addr, ...@@ -260,7 +245,7 @@ waitClusterStatus(const char* _addr,
<<", "<<_startphase<<")" <<", "<<_startphase<<")"
<< " timeout after " << attempts <<" attemps" << endl; << " timeout after " << attempts <<" attemps" << endl;
return -1; return -1;
} }
g_err << "waitNodeState(" g_err << "waitNodeState("
<< ndb_mgm_get_node_status_string(_status) << ndb_mgm_get_node_status_string(_status)
...@@ -269,62 +254,34 @@ waitClusterStatus(const char* _addr, ...@@ -269,62 +254,34 @@ waitClusterStatus(const char* _addr,
<< resetAttempts << endl; << resetAttempts << endl;
attempts = 0; attempts = 0;
resetAttempts++; resetAttempts++;
} }
allInState = true;
if (getStatus() != 0){ if (getStatus() != 0){
g_err << "getStatus != 0" << endl;
return -1; return -1;
} }
// ndbout << "waitNodeState; _num_nodes = " << _num_nodes << endl; /* Assume all nodes are in state(if there is any) */
// for (int i = 0; i < _num_nodes; i++) allInState = (ndbNodes.size() > 0);
// ndbout << " node["<<i<<"] =" <<_nodes[i] << endl;
for (int i = 0; i < _num_nodes; i++){ /* Loop through all nodes and check their state */
ndb_mgm_node_state* ndbNode = NULL; for (size_t n = 0; n < ndbNodes.size(); n++) {
for (size_t n = 0; n < ndbNodes.size(); n++){ ndb_mgm_node_state* ndbNode = &ndbNodes[n];
if (ndbNodes[n].node_id == _nodes[i])
ndbNode = &ndbNodes[n];
}
if(ndbNode == NULL){ assert(ndbNode != NULL);
allInState = false;
continue;
}
g_info << "State node " << ndbNode->node_id << " " g_info << "Node " << ndbNode->node_id << ": "
<< ndb_mgm_get_node_status_string(ndbNode->node_status)<< endl; << ndb_mgm_get_node_status_string(ndbNode->node_status)<< endl;
assert(ndbNode != NULL); if (ndbNode->node_status != _status)
if(_status == NDB_MGM_NODE_STATUS_STARTING &&
((ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTING &&
ndbNode->start_phase >= _startphase) ||
(ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTED)))
continue;
if (_status == NDB_MGM_NODE_STATUS_STARTING){
g_info << "status = "
<< ndb_mgm_get_node_status_string(ndbNode->node_status)
<<", start_phase="<<ndbNode->start_phase<<endl;
if (ndbNode->node_status != _status) {
if (ndbNode->node_status < _status)
allInState = false;
else
g_info << "node_status(" << (unsigned)ndbNode->node_status
<< ") != _status("<< (unsigned)_status << ")" <<endl;
} else if (ndbNode->start_phase < _startphase)
allInState = false;
} else {
if (ndbNode->node_status != _status)
allInState = false; allInState = false;
}
} }
g_info << "Waiting for cluster enter state "
<< ndb_mgm_get_node_status_string(_status)<< endl; if (!allInState) {
NdbSleep_SecSleep(1); g_info << "Waiting for cluster enter state "
<< ndb_mgm_get_node_status_string(_status)<< endl;
NdbSleep_MilliSleep(100);
}
attempts++; attempts++;
} }
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment