Commit bcd2abaa authored by unknown's avatar unknown

bug#28445 - Heartbeat does not start until first API_REGREQ is received

- move api failure handling into own method
- add START_ORD so that hb checking can start really early


storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
  - make sure qmgr is "fully" informed about connections so that it can handle hb correctly
  - don't allow API/mysqld node to reconnect if we have not started yet (sp 8)
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  - move api failure handling into own method
  - add START_ORD so that hb checking can start really early
storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp:
  - move api failure handling into own method
  - add START_ORD so that hb checking can start really early
  - Init datastructures in constructor
  - as CONNECT_REP may occur before start phases
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  - Init datastructures in constructor
  - as CONNECT_REP may occur before start phases
  - start hb handling directly on connect rep (instead of first hb)
parent 41362e64
...@@ -421,9 +421,10 @@ void Cmvmi::execCLOSE_COMREQ(Signal* signal) ...@@ -421,9 +421,10 @@ void Cmvmi::execCLOSE_COMREQ(Signal* signal)
// Uint32 noOfNodes = closeCom->noOfNodes; // Uint32 noOfNodes = closeCom->noOfNodes;
jamEntry(); jamEntry();
for (unsigned i = 0; i < MAX_NODES; i++){ for (unsigned i = 0; i < MAX_NODES; i++)
if(NodeBitmask::get(closeCom->theNodes, i)){ {
if(NodeBitmask::get(closeCom->theNodes, i))
{
jam(); jam();
//----------------------------------------------------- //-----------------------------------------------------
...@@ -437,7 +438,9 @@ void Cmvmi::execCLOSE_COMREQ(Signal* signal) ...@@ -437,7 +438,9 @@ void Cmvmi::execCLOSE_COMREQ(Signal* signal)
globalTransporterRegistry.do_disconnect(i); globalTransporterRegistry.do_disconnect(i);
} }
} }
if (failNo != 0) {
if (failNo != 0)
{
jam(); jam();
signal->theData[0] = userRef; signal->theData[0] = userRef;
signal->theData[1] = failNo; signal->theData[1] = failNo;
...@@ -456,13 +459,21 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal) ...@@ -456,13 +459,21 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal)
jamEntry(); jamEntry();
const Uint32 len = signal->getLength(); const Uint32 len = signal->getLength();
if(len == 2){ if(len == 2)
{
#ifdef ERROR_INSERT #ifdef ERROR_INSERT
if (! ((ERROR_INSERTED(9000) || ERROR_INSERTED(9002)) if (! ((ERROR_INSERTED(9000) || ERROR_INSERTED(9002))
&& c_error_9000_nodes_mask.get(tStartingNode))) && c_error_9000_nodes_mask.get(tStartingNode)))
#endif #endif
{ {
if (globalData.theStartLevel != NodeState::SL_STARTED &&
(getNodeInfo(tStartingNode).m_type != NodeInfo::DB &&
getNodeInfo(tStartingNode).m_type != NodeInfo::MGM))
{
jam();
goto done;
}
globalTransporterRegistry.do_connect(tStartingNode); globalTransporterRegistry.do_connect(tStartingNode);
globalTransporterRegistry.setIOState(tStartingNode, HaltIO); globalTransporterRegistry.setIOState(tStartingNode, HaltIO);
...@@ -475,9 +486,11 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal) ...@@ -475,9 +486,11 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal)
//----------------------------------------------------- //-----------------------------------------------------
} }
} else { } else {
for(unsigned int i = 1; i < MAX_NODES; i++ ) { for(unsigned int i = 1; i < MAX_NODES; i++ )
{
jam(); jam();
if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2){ if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2)
{
jam(); jam();
#ifdef ERROR_INSERT #ifdef ERROR_INSERT
...@@ -496,6 +509,7 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal) ...@@ -496,6 +509,7 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal)
} }
} }
done:
if (userRef != 0) { if (userRef != 0) {
jam(); jam();
signal->theData[0] = tStartingNode; signal->theData[0] = tStartingNode;
...@@ -536,24 +550,10 @@ void Cmvmi::execDISCONNECT_REP(Signal *signal) ...@@ -536,24 +550,10 @@ void Cmvmi::execDISCONNECT_REP(Signal *signal)
setNodeInfo(hostId).m_connectCount++; setNodeInfo(hostId).m_connectCount++;
const NodeInfo::NodeType type = getNodeInfo(hostId).getType(); const NodeInfo::NodeType type = getNodeInfo(hostId).getType();
ndbrequire(type != NodeInfo::INVALID); ndbrequire(type != NodeInfo::INVALID);
if(type == NodeInfo::DB || globalData.theStartLevel == NodeState::SL_STARTED){
jam();
DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
rep->nodeId = hostId;
rep->err = errNo;
sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal,
DisconnectRep::SignalLength, JBA);
} else if((globalData.theStartLevel == NodeState::SL_CMVMI ||
globalData.theStartLevel == NodeState::SL_STARTING)
&& type == NodeInfo::MGM) {
/**
* Someone disconnected during cmvmi period
*/
jam();
globalTransporterRegistry.do_connect(hostId);
}
sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal,
DisconnectRep::SignalLength, JBA);
cancelSubscription(hostId); cancelSubscription(hostId);
signal->theData[0] = NDB_LE_Disconnected; signal->theData[0] = NDB_LE_Disconnected;
...@@ -587,6 +587,8 @@ void Cmvmi::execCONNECT_REP(Signal *signal){ ...@@ -587,6 +587,8 @@ void Cmvmi::execCONNECT_REP(Signal *signal){
*/ */
if(type == NodeInfo::MGM){ if(type == NodeInfo::MGM){
jam(); jam();
signal->theData[0] = hostId;
sendSignal(QMGR_REF, GSN_CONNECT_REP, signal, 1, JBA);
} else { } else {
/** /**
* Dont allow api nodes to connect * Dont allow api nodes to connect
...@@ -802,6 +804,8 @@ Cmvmi::execSTART_ORD(Signal* signal) { ...@@ -802,6 +804,8 @@ Cmvmi::execSTART_ORD(Signal* signal) {
} }
} }
} }
EXECUTE_DIRECT(QMGR, GSN_START_ORD, signal, 1);
return ; return ;
} }
...@@ -829,9 +833,6 @@ Cmvmi::execSTART_ORD(Signal* signal) { ...@@ -829,9 +833,6 @@ Cmvmi::execSTART_ORD(Signal* signal) {
* *
* Do Restart * Do Restart
*/ */
globalScheduler.clear();
globalTimeQueue.clear();
// Disconnect all nodes as part of the system restart. // Disconnect all nodes as part of the system restart.
// We need to ensure that we are starting up // We need to ensure that we are starting up
......
...@@ -265,6 +265,8 @@ private: ...@@ -265,6 +265,8 @@ private:
void execALLOC_NODEID_CONF(Signal *); void execALLOC_NODEID_CONF(Signal *);
void execALLOC_NODEID_REF(Signal *); void execALLOC_NODEID_REF(Signal *);
void completeAllocNodeIdReq(Signal *); void completeAllocNodeIdReq(Signal *);
void execSTART_ORD(Signal*);
// Arbitration signals // Arbitration signals
void execARBIT_CFG(Signal* signal); void execARBIT_CFG(Signal* signal);
...@@ -281,6 +283,7 @@ private: ...@@ -281,6 +283,7 @@ private:
void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn); void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
Uint32 check_startup(Signal* signal); Uint32 check_startup(Signal* signal);
void api_failed(Signal* signal, Uint32 aFailedNode);
void node_failed(Signal* signal, Uint16 aFailedNode); void node_failed(Signal* signal, Uint16 aFailedNode);
void checkStartInterface(Signal* signal); void checkStartInterface(Signal* signal);
void failReport(Signal* signal, void failReport(Signal* signal,
......
...@@ -31,10 +31,6 @@ void Qmgr::initData() ...@@ -31,10 +31,6 @@ void Qmgr::initData()
cnoCommitFailedNodes = 0; cnoCommitFailedNodes = 0;
c_maxDynamicId = 0; c_maxDynamicId = 0;
c_clusterNodes.clear(); c_clusterNodes.clear();
Uint32 hbDBAPI = 500;
setHbApiDelay(hbDBAPI);
c_connectedNodes.set(getOwnNodeId());
c_stopReq.senderRef = 0; c_stopReq.senderRef = 0;
/** /**
...@@ -43,6 +39,27 @@ void Qmgr::initData() ...@@ -43,6 +39,27 @@ void Qmgr::initData()
ndbrequire((Uint32)NodeInfo::DB == 0); ndbrequire((Uint32)NodeInfo::DB == 0);
ndbrequire((Uint32)NodeInfo::API == 1); ndbrequire((Uint32)NodeInfo::API == 1);
ndbrequire((Uint32)NodeInfo::MGM == 2); ndbrequire((Uint32)NodeInfo::MGM == 2);
NodeRecPtr nodePtr;
nodePtr.i = getOwnNodeId();
ptrAss(nodePtr, nodeRec);
nodePtr.p->blockRef = reference();
c_connectedNodes.set(getOwnNodeId());
setNodeInfo(getOwnNodeId()).m_version = NDB_VERSION;
/**
* Timeouts
*/
const ndb_mgm_configuration_iterator * p =
m_ctx.m_config.getOwnConfigIterator();
ndbrequire(p != 0);
Uint32 hbDBAPI = 1500;
ndb_mgm_get_int_parameter(p, CFG_DB_API_HEARTBEAT_INTERVAL, &hbDBAPI);
setHbApiDelay(hbDBAPI);
}//Qmgr::initData() }//Qmgr::initData()
void Qmgr::initRecords() void Qmgr::initRecords()
...@@ -113,6 +130,7 @@ Qmgr::Qmgr(Block_context& ctx) ...@@ -113,6 +130,7 @@ Qmgr::Qmgr(Block_context& ctx)
addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF); addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF);
addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF); addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF);
addRecSignal(GSN_NODE_VERSION_REP, &Qmgr::execNODE_VERSION_REP); addRecSignal(GSN_NODE_VERSION_REP, &Qmgr::execNODE_VERSION_REP);
addRecSignal(GSN_START_ORD, &Qmgr::execSTART_ORD);
initData(); initData();
}//Qmgr::Qmgr() }//Qmgr::Qmgr()
......
...@@ -238,6 +238,38 @@ Qmgr::execREAD_CONFIG_REQ(Signal* signal) ...@@ -238,6 +238,38 @@ Qmgr::execREAD_CONFIG_REQ(Signal* signal)
ReadConfigConf::SignalLength, JBB); ReadConfigConf::SignalLength, JBB);
} }
/**
 * START_ORD handler.
 *
 * Sends CONTINUEB(ZTIMER_HANDLING) to QMGR itself and then resets every
 * node record with id 1 .. MAX_NODES-1 to its initial state.
 *
 * @param signal  Signal buffer; theData[0] is overwritten with
 *                ZTIMER_HANDLING before the CONTINUEB is sent.
 */
void
Qmgr::execSTART_ORD(Signal* signal)
{
  // Start timer handling.
  signal->theData[0] = ZTIMER_HANDLING;
  sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 1, JBB);

  NodeRecPtr cur;
  for (cur.i = 1; cur.i < MAX_NODES; ++cur.i)
  {
    ptrAss(cur, nodeRec);

    // Nodes configured as DB start in ZINIT and are recorded as defined;
    // every other node type starts out as an inactive API node.
    if (getNodeInfo(cur.i).m_type != NodeInfo::DB)
    {
      cur.p->phase = ZAPI_INACTIVE;
    }
    else
    {
      cur.p->phase = ZINIT;
      c_definedNodes.set(cur.i);
    }

    // Clear the remaining per-node state.
    cur.p->ndynamicId = 0;
    setNodeInfo(cur.i).m_heartbeat_cnt = 0;
    cur.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
    cur.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
    cur.p->sendPresToStatus = Q_NOT_ACTIVE;
    cur.p->failState = NORMAL;
    cur.p->rcv[0] = 0;
    cur.p->rcv[1] = 0;
  }
}
/* /*
4.2 ADD NODE MODULE*/ 4.2 ADD NODE MODULE*/
/*##########################################################################*/ /*##########################################################################*/
...@@ -298,8 +330,6 @@ void Qmgr::startphase1(Signal* signal) ...@@ -298,8 +330,6 @@ void Qmgr::startphase1(Signal* signal)
nodePtr.i = getOwnNodeId(); nodePtr.i = getOwnNodeId();
ptrAss(nodePtr, nodeRec); ptrAss(nodePtr, nodeRec);
nodePtr.p->phase = ZSTARTING; nodePtr.p->phase = ZSTARTING;
nodePtr.p->blockRef = reference();
c_connectedNodes.set(nodePtr.i);
signal->theData[0] = reference(); signal->theData[0] = reference();
sendSignal(DBDIH_REF, GSN_DIH_RESTARTREQ, signal, 1, JBB); sendSignal(DBDIH_REF, GSN_DIH_RESTARTREQ, signal, 1, JBB);
...@@ -371,11 +401,14 @@ void Qmgr::execCONNECT_REP(Signal* signal) ...@@ -371,11 +401,14 @@ void Qmgr::execCONNECT_REP(Signal* signal)
case ZFAIL_CLOSING: case ZFAIL_CLOSING:
jam(); jam();
return; return;
case ZINIT:
ndbrequire(false);
case ZAPI_ACTIVE: case ZAPI_ACTIVE:
case ZAPI_INACTIVE: case ZAPI_INACTIVE:
return; return;
case ZINIT:
ndbrequire(getNodeInfo(nodeId).m_type == NodeInfo::MGM);
break;
default:
ndbrequire(false);
} }
if (getNodeInfo(nodeId).getType() != NodeInfo::DB) if (getNodeInfo(nodeId).getType() != NodeInfo::DB)
...@@ -1212,12 +1245,6 @@ void Qmgr::execCM_REGREF(Signal* signal) ...@@ -1212,12 +1245,6 @@ void Qmgr::execCM_REGREF(Signal* signal)
{ {
jam(); jam();
electionWon(signal); electionWon(signal);
/**
* Start timer handling
*/
signal->theData[0] = ZTIMER_HANDLING;
sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
} }
return; return;
...@@ -1855,12 +1882,6 @@ Qmgr::joinedCluster(Signal* signal, NodeRecPtr nodePtr){ ...@@ -1855,12 +1882,6 @@ Qmgr::joinedCluster(Signal* signal, NodeRecPtr nodePtr){
sendSttorryLab(signal); sendSttorryLab(signal);
/**
* Start timer handling
*/
signal->theData[0] = ZTIMER_HANDLING;
sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
sendCmAckAdd(signal, getOwnNodeId(), CmAdd::CommitNew); sendCmAckAdd(signal, getOwnNodeId(), CmAdd::CommitNew);
} }
...@@ -2094,25 +2115,6 @@ void Qmgr::findNeighbours(Signal* signal) ...@@ -2094,25 +2115,6 @@ void Qmgr::findNeighbours(Signal* signal)
/*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/
void Qmgr::initData(Signal* signal) void Qmgr::initData(Signal* signal)
{ {
NodeRecPtr nodePtr;
for (nodePtr.i = 1; nodePtr.i < MAX_NODES; nodePtr.i++) {
ptrAss(nodePtr, nodeRec);
nodePtr.p->ndynamicId = 0;
if(getNodeInfo(nodePtr.i).m_type == NodeInfo::DB){
nodePtr.p->phase = ZINIT;
c_definedNodes.set(nodePtr.i);
} else {
nodePtr.p->phase = ZAPI_INACTIVE;
}
setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
nodePtr.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
nodePtr.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
nodePtr.p->sendPresToStatus = Q_NOT_ACTIVE;
nodePtr.p->failState = NORMAL;
nodePtr.p->rcv[0] = 0;
nodePtr.p->rcv[1] = 0;
}//for
cfailureNr = 1; cfailureNr = 1;
ccommitFailureNr = 1; ccommitFailureNr = 1;
cprepareFailureNr = 1; cprepareFailureNr = 1;
...@@ -2146,13 +2148,11 @@ void Qmgr::initData(Signal* signal) ...@@ -2146,13 +2148,11 @@ void Qmgr::initData(Signal* signal)
ndbrequire(p != 0); ndbrequire(p != 0);
Uint32 hbDBDB = 1500; Uint32 hbDBDB = 1500;
Uint32 hbDBAPI = 1500;
Uint32 arbitTimeout = 1000; Uint32 arbitTimeout = 1000;
c_restartPartialTimeout = 30000; c_restartPartialTimeout = 30000;
c_restartPartionedTimeout = 60000; c_restartPartionedTimeout = 60000;
c_restartFailureTimeout = ~0; c_restartFailureTimeout = ~0;
ndb_mgm_get_int_parameter(p, CFG_DB_HEARTBEAT_INTERVAL, &hbDBDB); ndb_mgm_get_int_parameter(p, CFG_DB_HEARTBEAT_INTERVAL, &hbDBDB);
ndb_mgm_get_int_parameter(p, CFG_DB_API_HEARTBEAT_INTERVAL, &hbDBAPI);
ndb_mgm_get_int_parameter(p, CFG_DB_ARBIT_TIMEOUT, &arbitTimeout); ndb_mgm_get_int_parameter(p, CFG_DB_ARBIT_TIMEOUT, &arbitTimeout);
ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTIAL_TIMEOUT, ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTIAL_TIMEOUT,
&c_restartPartialTimeout); &c_restartPartialTimeout);
...@@ -2177,7 +2177,6 @@ void Qmgr::initData(Signal* signal) ...@@ -2177,7 +2177,6 @@ void Qmgr::initData(Signal* signal)
} }
setHbDelay(hbDBDB); setHbDelay(hbDBDB);
setHbApiDelay(hbDBAPI);
setArbitTimeout(arbitTimeout); setArbitTimeout(arbitTimeout);
arbitRec.state = ARBIT_NULL; // start state for all nodes arbitRec.state = ARBIT_NULL; // start state for all nodes
...@@ -2204,7 +2203,6 @@ void Qmgr::initData(Signal* signal) ...@@ -2204,7 +2203,6 @@ void Qmgr::initData(Signal* signal)
execARBIT_CFG(signal); execARBIT_CFG(signal);
} }
setNodeInfo(getOwnNodeId()).m_version = NDB_VERSION;
}//Qmgr::initData() }//Qmgr::initData()
...@@ -2237,20 +2235,22 @@ void Qmgr::timerHandlingLab(Signal* signal) ...@@ -2237,20 +2235,22 @@ void Qmgr::timerHandlingLab(Signal* signal)
hb_check_timer.reset(); hb_check_timer.reset();
} }
} }
if (interface_check_timer.check(TcurrentTime)) { if (interface_check_timer.check(TcurrentTime)) {
jam(); jam();
interface_check_timer.reset(); interface_check_timer.reset();
checkStartInterface(signal); checkStartInterface(signal);
} }
if (hb_api_timer.check(TcurrentTime))
{
jam();
hb_api_timer.reset();
apiHbHandlingLab(signal);
}
if (cactivateApiCheck != 0) { if (cactivateApiCheck != 0) {
jam(); jam();
if (hb_api_timer.check(TcurrentTime)) {
jam();
hb_api_timer.reset();
apiHbHandlingLab(signal);
}//if
if (clatestTransactionCheck == 0) { if (clatestTransactionCheck == 0) {
//------------------------------------------------------------- //-------------------------------------------------------------
// Initialise the Transaction check timer. // Initialise the Transaction check timer.
...@@ -2367,18 +2367,21 @@ void Qmgr::apiHbHandlingLab(Signal* signal) ...@@ -2367,18 +2367,21 @@ void Qmgr::apiHbHandlingLab(Signal* signal)
if(type == NodeInfo::INVALID) if(type == NodeInfo::INVALID)
continue; continue;
if (TnodePtr.p->phase == ZAPI_ACTIVE){ if (c_connectedNodes.get(nodeId))
{
jam(); jam();
setNodeInfo(TnodePtr.i).m_heartbeat_cnt++; setNodeInfo(TnodePtr.i).m_heartbeat_cnt++;
if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2){ if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2)
{
signal->theData[0] = NDB_LE_MissedHeartbeat; signal->theData[0] = NDB_LE_MissedHeartbeat;
signal->theData[1] = nodeId; signal->theData[1] = nodeId;
signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1; signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB); sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
} }
if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) { if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4)
{
jam(); jam();
/*------------------------------------------------------------------*/ /*------------------------------------------------------------------*/
/* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS. /* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS.
...@@ -2390,8 +2393,8 @@ void Qmgr::apiHbHandlingLab(Signal* signal) ...@@ -2390,8 +2393,8 @@ void Qmgr::apiHbHandlingLab(Signal* signal)
signal->theData[0] = NDB_LE_DeadDueToHeartbeat; signal->theData[0] = NDB_LE_DeadDueToHeartbeat;
signal->theData[1] = nodeId; signal->theData[1] = nodeId;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
node_failed(signal, nodeId); api_failed(signal, nodeId);
}//if }//if
}//if }//if
}//for }//for
...@@ -2480,26 +2483,6 @@ void Qmgr::sendApiFailReq(Signal* signal, Uint16 failedNodeNo) ...@@ -2480,26 +2483,6 @@ void Qmgr::sendApiFailReq(Signal* signal, Uint16 failedNodeNo)
sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBA); sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBA);
sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBA); sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBA);
sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA); sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
/**-------------------------------------------------------------------------
* THE OTHER NODE WAS AN API NODE. THE COMMUNICATION LINK IS ALREADY
* BROKEN AND THUS NO ACTION IS NEEDED TO BREAK THE CONNECTION.
* WE ONLY NEED TO SET PARAMETERS TO ENABLE A NEW CONNECTION IN A FEW
* SECONDS.
*-------------------------------------------------------------------------*/
setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
setNodeInfo(failedNodePtr.i).m_version = 0;
recompute_version_info(getNodeInfo(failedNodePtr.i).m_type);
CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
closeCom->xxxBlockRef = reference();
closeCom->failNo = 0;
closeCom->noOfNodes = 1;
NodeBitmask::clear(closeCom->theNodes);
NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
CloseComReqConf::SignalLength, JBA);
}//Qmgr::sendApiFailReq() }//Qmgr::sendApiFailReq()
void Qmgr::execAPI_FAILREQ(Signal* signal) void Qmgr::execAPI_FAILREQ(Signal* signal)
...@@ -2512,20 +2495,7 @@ void Qmgr::execAPI_FAILREQ(Signal* signal) ...@@ -2512,20 +2495,7 @@ void Qmgr::execAPI_FAILREQ(Signal* signal)
ndbrequire(getNodeInfo(failedNodePtr.i).getType() != NodeInfo::DB); ndbrequire(getNodeInfo(failedNodePtr.i).getType() != NodeInfo::DB);
// ignore if api not active api_failed(signal, signal->theData[0]);
if (failedNodePtr.p->phase != ZAPI_ACTIVE)
{
jam();
// But send to SUMA anyway...
sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
return;
}
signal->theData[0] = NDB_LE_Disconnected;
signal->theData[1] = failedNodePtr.i;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
node_failed(signal, failedNodePtr.i);
} }
void Qmgr::execAPI_FAILCONF(Signal* signal) void Qmgr::execAPI_FAILCONF(Signal* signal)
...@@ -2649,6 +2619,13 @@ void Qmgr::execDISCONNECT_REP(Signal* signal) ...@@ -2649,6 +2619,13 @@ void Qmgr::execDISCONNECT_REP(Signal* signal)
ndbrequire(false); ndbrequire(false);
} }
if (getNodeInfo(nodeId).getType() != NodeInfo::DB)
{
jam();
api_failed(signal, nodeId);
return;
}
switch(nodePtr.p->phase){ switch(nodePtr.p->phase){
case ZRUNNING: case ZRUNNING:
jam(); jam();
...@@ -2685,66 +2662,109 @@ void Qmgr::node_failed(Signal* signal, Uint16 aFailedNode) ...@@ -2685,66 +2662,109 @@ void Qmgr::node_failed(Signal* signal, Uint16 aFailedNode)
failedNodePtr.i = aFailedNode; failedNodePtr.i = aFailedNode;
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB){ ndbrequire(getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB);
/**---------------------------------------------------------------------
* THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT
* FAILURE WAS DISCOVERED.
*---------------------------------------------------------------------*/
switch(failedNodePtr.p->phase){
case ZRUNNING:
jam(); jam();
/**--------------------------------------------------------------------- failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
* THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT
* FAILURE WAS DISCOVERED.
*---------------------------------------------------------------------*/
switch(failedNodePtr.p->phase){
case ZRUNNING:
jam();
failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
return;
case ZFAIL_CLOSING:
jam();
return;
case ZSTARTING:
c_start.reset();
// Fall-through
default:
jam();
/*---------------------------------------------------------------------*/
// The other node is still not in the cluster but disconnected.
// We must restart communication in three seconds.
/*---------------------------------------------------------------------*/
failedNodePtr.p->failState = NORMAL;
failedNodePtr.p->phase = ZFAIL_CLOSING;
setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
CloseComReqConf * const closeCom =
(CloseComReqConf *)&signal->theData[0];
closeCom->xxxBlockRef = reference();
closeCom->failNo = 0;
closeCom->noOfNodes = 1;
NodeBitmask::clear(closeCom->theNodes);
NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
CloseComReqConf::SignalLength, JBA);
}//if
return; return;
} case ZFAIL_CLOSING:
jam();
/** return;
* API code case ZSTARTING:
*/ c_start.reset();
jam(); // Fall-through
if (failedNodePtr.p->phase != ZFAIL_CLOSING){ default:
jam(); jam();
//------------------------------------------------------------------------- /*---------------------------------------------------------------------*/
// The API was active and has now failed. We need to initiate API failure // The other node is still not in the cluster but disconnected.
// handling. If the API had already failed then we can ignore this // We must restart communication in three seconds.
// discovery. /*---------------------------------------------------------------------*/
//------------------------------------------------------------------------- failedNodePtr.p->failState = NORMAL;
failedNodePtr.p->phase = ZFAIL_CLOSING; failedNodePtr.p->phase = ZFAIL_CLOSING;
setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
sendApiFailReq(signal, aFailedNode);
arbitRec.code = ArbitCode::ApiFail; CloseComReqConf * const closeCom =
handleArbitApiFail(signal, aFailedNode); (CloseComReqConf *)&signal->theData[0];
closeCom->xxxBlockRef = reference();
closeCom->failNo = 0;
closeCom->noOfNodes = 1;
NodeBitmask::clear(closeCom->theNodes);
NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
CloseComReqConf::SignalLength, JBA);
}//if }//if
return; return;
}//Qmgr::node_failed() }
/**
 * Failure handling for the node identified by nodeId.
 *
 * - Phase ZFAIL_CLOSING: failure handling is already running; do nothing.
 * - Phase ZAPI_ACTIVE: run sendApiFailReq() and the arbitration API-fail
 *   handling.
 * - Any other phase: only SUMA receives API_FAILREQ and failState is set
 *   to NORMAL.
 *
 * Unless it returned early, the node is then moved to ZFAIL_CLOSING, its
 * heartbeat count and version are cleared, version info is recomputed and
 * CLOSE_COMREQ for the node is sent to CMVMI.
 *
 * @param signal  Signal buffer, reused for every outgoing signal.
 * @param nodeId  Id of the failed node; must index a valid node record
 *                (checked by ptrCheckGuard against MAX_NODES).
 */
void
Qmgr::api_failed(Signal* signal, Uint32 nodeId)
{
  /**
   * A communication link has been disconnected; look up the record of the
   * node at the other end.
   */
  NodeRecPtr apiPtr;
  apiPtr.i = nodeId;
  ptrCheckGuard(apiPtr, MAX_NODES, nodeRec);

  switch (apiPtr.p->phase)
  {
  case ZFAIL_CLOSING:
    // Failure handling already in progress.
    jam();
    return;

  case ZAPI_ACTIVE:
    jam();
    sendApiFailReq(signal, nodeId);
    arbitRec.code = ArbitCode::ApiFail;
    handleArbitApiFail(signal, nodeId);
    break;

  default:
    // Always inform SUMA, even if the node never became active.
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = QMGR_REF;
    sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
    apiPtr.p->failState = NORMAL;
    break;
  }

  apiPtr.p->phase = ZFAIL_CLOSING;
  setNodeInfo(apiPtr.i).m_heartbeat_cnt = 0;
  setNodeInfo(apiPtr.i).m_version = 0;
  recompute_version_info(getNodeInfo(apiPtr.i).m_type);

  // Ask CMVMI to close the communication link to exactly this node.
  CloseComReqConf * const closeReq =
    reinterpret_cast<CloseComReqConf *>(&signal->theData[0]);
  closeReq->xxxBlockRef = reference();
  closeReq->failNo = 0;
  closeReq->noOfNodes = 1;
  NodeBitmask::clear(closeReq->theNodes);
  NodeBitmask::set(closeReq->theNodes, apiPtr.i);
  sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
             CloseComReqConf::SignalLength, JBA);

  if (getNodeInfo(apiPtr.i).getType() == NodeInfo::MGM)
  {
    /**
     * Allow MGM to reconnect "directly": its heartbeat count is set
     * to 3 instead of being left at 0.
     */
    jam();
    setNodeInfo(apiPtr.i).m_heartbeat_cnt = 3;
  }
}
/**-------------------------------------------------------------------------- /**--------------------------------------------------------------------------
* AN API NODE IS REGISTERING. IF FOR THE FIRST TIME WE WILL ENABLE * AN API NODE IS REGISTERING. IF FOR THE FIRST TIME WE WILL ENABLE
...@@ -4963,43 +4983,39 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) ...@@ -4963,43 +4983,39 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal)
c_start.m_president_candidate_gci); c_start.m_president_candidate_gci);
infoEvent("ctoStatus = %d\n", ctoStatus); infoEvent("ctoStatus = %d\n", ctoStatus);
for(Uint32 i = 1; i<MAX_NDB_NODES; i++){ for(Uint32 i = 1; i<MAX_NDB_NODES; i++){
if(getNodeInfo(i).getType() == NodeInfo::DB){ NodeRecPtr nodePtr;
NodeRecPtr nodePtr; nodePtr.i = i;
nodePtr.i = i; ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec); char buf[100];
char buf[100]; switch(nodePtr.p->phase){
switch(nodePtr.p->phase){ case ZINIT:
case ZINIT: sprintf(buf, "Node %d: ZINIT(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZINIT(%d)", i, nodePtr.p->phase); break;
break; case ZSTARTING:
case ZSTARTING: sprintf(buf, "Node %d: ZSTARTING(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZSTARTING(%d)", i, nodePtr.p->phase); break;
break; case ZRUNNING:
case ZRUNNING: sprintf(buf, "Node %d: ZRUNNING(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZRUNNING(%d)", i, nodePtr.p->phase); break;
break; case ZPREPARE_FAIL:
case ZPREPARE_FAIL: sprintf(buf, "Node %d: ZPREPARE_FAIL(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZPREPARE_FAIL(%d)", i, nodePtr.p->phase); break;
break; case ZFAIL_CLOSING:
case ZFAIL_CLOSING: sprintf(buf, "Node %d: ZFAIL_CLOSING(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZFAIL_CLOSING(%d)", i, nodePtr.p->phase); break;
break; case ZAPI_INACTIVE:
case ZAPI_INACTIVE: sprintf(buf, "Node %d: ZAPI_INACTIVE(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZAPI_INACTIVE(%d)", i, nodePtr.p->phase); break;
break; case ZAPI_ACTIVE:
case ZAPI_ACTIVE: sprintf(buf, "Node %d: ZAPI_ACTIVE(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: ZAPI_ACTIVE(%d)", i, nodePtr.p->phase); break;
break; default:
default: sprintf(buf, "Node %d: <UNKNOWN>(%d)", i, nodePtr.p->phase);
sprintf(buf, "Node %d: <UNKNOWN>(%d)", i, nodePtr.p->phase); break;
break;
}
infoEvent(buf);
} }
infoEvent(buf);
} }
default: }
;
}//switch
#ifdef ERROR_INSERT #ifdef ERROR_INSERT
if (signal->theData[0] == 935 && signal->getLength() == 2) if (signal->theData[0] == 935 && signal->getLength() == 2)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment