BUG#13985 ndb_mgm "status" command can return incorrect data node status

Second half of the fix for this bug.

This patch forces a heartbeat to be sent and then waits (a little while) for the replies.

This way we can get output such as:

    > all status
    X starting
    Y started
    X started
    >

which is okay, as the new status always comes after the old status. There is still the slimmest of opportunities to get output like the above, where only half the cluster appears started. This is about the best we can do with an interactive command-line program.

ndb/src/mgmsrv/MgmtSrvr.cpp:
  Add updateStatus method to MgmtSrvr. Used to force an update of node status for the nodes.

ndb/src/mgmsrv/MgmtSrvr.hpp:
  Add prototype for the updateStatus(NodeBitmask) method.

ndb/src/mgmsrv/Services.cpp:
  When status is queried, force an update of the status in the mgm server (i.e. send heartbeats).

ndb/src/ndbapi/ClusterMgr.cpp:
  New DEBUG_REG define for debugging registration and HB code. Add ClusterMgr::forceHB(NodeBitmask), which sends a HB signal to each node in the bitmask and then waits for a REGCONF from them. It will only wait for a total of 1 second, so as not to block an end client for too long. On receipt of a HB, clear the nodeId in the waiting-for bitmask and signal any waiting threads.

ndb/src/ndbapi/ClusterMgr.hpp:
  Add ::forceHB(NodeBitmask) and associated variables.

BUG#13985 ndb_mgm "status" command can return incorrect data node status
Second half of the fix for this bug.

This patch forces a heartbeat to be sent and then waits (a little while) for the replies.

This way we can get output such as:

    > all status
    X starting
    Y started
    X started
    >

which is okay, as the new status always comes after the old status. There is still the slimmest of opportunities to get output like the above, where only half the cluster appears started. This is about the best we can do with an interactive command-line program.

ndb/src/mgmsrv/MgmtSrvr.cpp:
  Add updateStatus method to MgmtSrvr. Used to force an update of node status for the nodes.

ndb/src/mgmsrv/MgmtSrvr.hpp:
  Add prototype for the updateStatus(NodeBitmask) method.

ndb/src/mgmsrv/Services.cpp:
  When status is queried, force an update of the status in the mgm server (i.e. send heartbeats).

ndb/src/ndbapi/ClusterMgr.cpp:
  New DEBUG_REG define for debugging registration and HB code. Add ClusterMgr::forceHB(NodeBitmask), which sends a HB signal to each node in the bitmask and then waits for a REGCONF from them. It will only wait for a total of 1 second, so as not to block an end client for too long. On receipt of a HB, clear the nodeId in the waiting-for bitmask and signal any waiting threads.

ndb/src/ndbapi/ClusterMgr.hpp:
  Add ::forceHB(NodeBitmask) and associated variables.
3cea3705 · unknown · 746fc2f4 · 3cea3705 · 3cea3705 · 3cea3705
Commit 3cea3705 authored Jul 03, 2006 by unknown
5 changed files
--- a/ndb/src/mgmsrv/MgmtSrvr.cpp
+++ b/ndb/src/mgmsrv/MgmtSrvr.cpp
@@ -1412,6 +1412,12 @@ MgmtSrvr::exitSingleUser(int * stopCount, bool abort)

 #include <ClusterMgr.hpp>

+/**
+ * Ask the cluster manager for a forced heartbeat round towards the given
+ * nodes, so that their status is refreshed before it is read.
+ *
+ * @param nodes  bitmask of the nodes to send a heartbeat to
+ */
+void MgmtSrvr::updateStatus(NodeBitmask nodes)
+{
+  ClusterMgr &clusterMgr= *theFacade->theClusterMgr;
+  clusterMgr.forceHB(nodes);
+}
+
 int 
 MgmtSrvr::status(int nodeId, 
                 ndb_mgm_node_status * _status, 

--- a/ndb/src/mgmsrv/MgmtSrvr.hpp
+++ b/ndb/src/mgmsrv/MgmtSrvr.hpp
@@ -487,6 +487,8 @@ public:
  void get_connected_nodes(NodeBitmask &connected_nodes) const;
  SocketServer *get_socket_server() { return m_socket_server; }

+  void updateStatus(NodeBitmask nodes);
+
  //**************************************************************************
 private:
  //**************************************************************************

--- a/ndb/src/mgmsrv/Services.cpp
+++ b/ndb/src/mgmsrv/Services.cpp
@@ -951,6 +951,9 @@ printNodeStatus(OutputStream *output,
 		MgmtSrvr &mgmsrv,
 		enum ndb_mgm_node_type type) {
  NodeId nodeId = 0;
+  NodeBitmask hbnodes;
+  mgmsrv.get_connected_nodes(hbnodes);
+  mgmsrv.updateStatus(hbnodes);
  while(mgmsrv.getNextNodeId(&nodeId, type)) {
    enum ndb_mgm_node_status status;
    Uint32 startPhase = 0, 

--- a/ndb/src/ndbapi/ClusterMgr.cpp
+++ b/ndb/src/ndbapi/ClusterMgr.cpp
@@ -39,6 +39,8 @@

 int global_flag_send_heartbeat_now= 0;

+//#define DEBUG_REG
+
 // Just a C wrapper for threadMain
 extern "C" 
 void*
@@ -67,6 +69,8 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade):
  DBUG_ENTER("ClusterMgr::ClusterMgr");
  ndbSetOwnVersion();
  clusterMgrThreadMutex = NdbMutex_Create();
+  waitForHBMutex= NdbMutex_Create();
+  waitForHBCond= NdbCondition_Create();
  noOfAliveNodes= 0;
  noOfConnectedNodes= 0;
  theClusterMgrThread= 0;
@@ -78,6 +82,8 @@ ClusterMgr::~ClusterMgr()
 {
  DBUG_ENTER("ClusterMgr::~ClusterMgr");
  doStop();
+  NdbCondition_Destroy(waitForHBCond);
+  NdbMutex_Destroy(waitForHBMutex);
  NdbMutex_Destroy(clusterMgrThreadMutex);
  DBUG_VOID_RETURN;
 }
@@ -163,6 +169,49 @@ ClusterMgr::doStop( ){
  DBUG_VOID_RETURN;
 }

+/**
+ * Force an immediate heartbeat round and wait briefly for the replies.
+ *
+ * Sends an API_REGREQ to every node set in waitFor, records those nodes in
+ * waitForHBFromNodes, and then blocks until waitForHBCond is signalled or
+ * the timeout (1000, i.e. the one second mentioned in the commit message)
+ * expires, whichever comes first.  Returns immediately after sending when
+ * waitFor is empty.
+ *
+ * @param waitFor  bitmask of the nodes to send a heartbeat to and wait for
+ */
+void
+ClusterMgr::forceHB(NodeBitmask waitFor)
+{
+    theFacade.lock_mutex();
+    global_flag_send_heartbeat_now= 1;
+
+    waitForHBFromNodes= waitFor;
+#ifdef DEBUG_REG
+    char buf[128];
+    ndbout << "Waiting for HB from " << waitForHBFromNodes.getText(buf) << endl;
+#endif
+    NdbApiSignal signal(numberToRef(API_CLUSTERMGR, theFacade.ownId()));
+
+    signal.theVerId_signalNumber   = GSN_API_REGREQ;
+    signal.theReceiversBlockNumber = QMGR;
+    signal.theTrace                = 0;
+    signal.theLength               = ApiRegReq::SignalLength;
+
+    ApiRegReq * req = CAST_PTR(ApiRegReq, signal.getDataPtrSend());
+    req->ref = numberToRef(API_CLUSTERMGR, theFacade.ownId());
+    req->version = NDB_VERSION;
+
+    // Walk the set bits of the bitmask: find(i) yields the next set node id
+    // at or after i, or NotFound once the mask is exhausted.
+    int nodeId= 0;
+    for(int i=0;
+        NodeBitmask::NotFound!=(nodeId= waitForHBFromNodes.find(i));
+        i= nodeId+1)
+    {
+#ifdef DEBUG_REG
+      ndbout << "FORCE HB to " << nodeId << endl;
+#endif
+      theFacade.sendSignalUnCond(&signal, nodeId);
+    }
+
+    /*
+     * Take waitForHBMutex BEFORE releasing the facade mutex.  The reply
+     * handlers signal waitForHBCond while holding waitForHBMutex, so a reply
+     * that is processed straight away can no longer signal before this
+     * thread has started waiting (a lost wakeup, which used to cost the
+     * full timeout).  The handlers take no other lock while holding
+     * waitForHBMutex, so this ordering cannot deadlock against them.
+     */
+    NdbMutex_Lock(waitForHBMutex);
+    theFacade.unlock_mutex();
+
+    // With no nodes requested nobody will ever signal: skip the wait
+    // instead of sleeping for the whole timeout.
+    if (!waitFor.isclear())
+      NdbCondition_WaitTimeout(waitForHBCond, waitForHBMutex, 1000);
+    NdbMutex_Unlock(waitForHBMutex);
+#ifdef DEBUG_REG
+    ndbout << "Still waiting for HB from " << waitForHBFromNodes.getText(buf) << endl;
+#endif
+}
+
 void
 ClusterMgr::threadMain( ){
  NdbApiSignal signal(numberToRef(API_CLUSTERMGR, theFacade.ownId()));
@@ -226,7 +275,7 @@ ClusterMgr::threadMain( ){
 	if (theNode.m_info.m_type == NodeInfo::REP) {
 	  signal.theReceiversBlockNumber = API_CLUSTERMGR;
 	}
-#if 0 
+#ifdef DEBUG_REG
 	ndbout_c("ClusterMgr: Sending API_REGREQ to node %d", (int)nodeId);
 #endif
 	theFacade.sendSignalUnCond(&signal, nodeId);
@@ -278,7 +327,7 @@ ClusterMgr::execAPI_REGREQ(const Uint32 * theData){
  const ApiRegReq * const apiRegReq = (ApiRegReq *)&theData[0];
  const NodeId nodeId = refToNode(apiRegReq->ref);

-#if 0
+#ifdef DEBUG_REG
  ndbout_c("ClusterMgr: Recd API_REGREQ from node %d", nodeId);
 #endif

@@ -319,7 +368,7 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
  const ApiRegConf * const apiRegConf = (ApiRegConf *)&theData[0];
  const NodeId nodeId = refToNode(apiRegConf->qmgrRef);
  
-#if 0 
+#ifdef DEBUG_REG
  ndbout_c("ClusterMgr: Recd API_REGCONF from node %d", nodeId);
 #endif

@@ -351,6 +400,13 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
  if (node.m_info.m_type != NodeInfo::REP) {
    node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50;
  }
+  waitForHBFromNodes.clear(nodeId);
+  if(waitForHBFromNodes.isclear())
+  {
+    NdbMutex_Lock(waitForHBMutex);
+    NdbCondition_Signal(waitForHBCond);
+    NdbMutex_Unlock(waitForHBMutex);
+  }
 }

 void
@@ -379,6 +435,13 @@ ClusterMgr::execAPI_REGREF(const Uint32 * theData){
  default:
    break;
  }
+  waitForHBFromNodes.clear(nodeId);
+  if(waitForHBFromNodes.isclear())
+  {
+    NdbMutex_Lock(waitForHBMutex);
+    NdbCondition_Signal(waitForHBCond);
+    NdbMutex_Unlock(waitForHBMutex);
+  }
 }

 void

--- a/ndb/src/ndbapi/ClusterMgr.hpp
+++ b/ndb/src/ndbapi/ClusterMgr.hpp
@@ -50,6 +50,8 @@ public:
  void doStop();
  void startThread();

+  void forceHB(NodeBitmask waitFor);
+
 private:
  void threadMain();
  
@@ -86,6 +88,10 @@ private:
  Node          theNodes[MAX_NODES];
  NdbThread*    theClusterMgrThread;

+  NodeBitmask   waitForHBFromNodes; // used in forcing HBs
+  NdbMutex*     waitForHBMutex;
+  NdbCondition* waitForHBCond;
+
  /**
   * Used for controlling start/stop of the thread
   */