ndb - bug#22893

  Check for sufficient REDO earlier during SR,
      so that a take-over of the node can be performed
      if it can't be restarted using logs
      (which, by the way, is really weird... as it _should_ be able to use the logs of another node in the node group)

  Otherwise the cluster could be started even though one fragment on one node could not be restored,
  making the cluster inconsistent. VERY BAD.
parent 9cad0a01
...@@ -1044,6 +1044,8 @@ private: ...@@ -1044,6 +1044,8 @@ private:
void removeStoredReplica(FragmentstorePtr regFragptr, void removeStoredReplica(FragmentstorePtr regFragptr,
ReplicaRecordPtr replicaPtr); ReplicaRecordPtr replicaPtr);
void searchStoredReplicas(FragmentstorePtr regFragptr); void searchStoredReplicas(FragmentstorePtr regFragptr);
bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*,
ConstPtr<ReplicaRecord>);
void updateNodeInfo(FragmentstorePtr regFragptr); void updateNodeInfo(FragmentstorePtr regFragptr);
//------------------------------------ //------------------------------------
......
...@@ -8344,14 +8344,30 @@ Dbdih::resetReplicaSr(TabRecordPtr tabPtr){ ...@@ -8344,14 +8344,30 @@ Dbdih::resetReplicaSr(TabRecordPtr tabPtr){
resetReplicaLcp(replicaPtr.p, newestRestorableGCI); resetReplicaLcp(replicaPtr.p, newestRestorableGCI);
/* ----------------------------------------------------------------- /**
* LINK THE REPLICA INTO THE STORED REPLICA LIST. WE WILL USE THIS * Make sure we can also find REDO for restoring replica...
* NODE AS A STORED REPLICA. */
* WE MUST FIRST LINK IT OUT OF THE LIST OF OLD STORED REPLICAS. {
* --------------------------------------------------------------- */ CreateReplicaRecord createReplica;
ConstPtr<ReplicaRecord> constReplicaPtr;
constReplicaPtr.i = replicaPtr.i;
constReplicaPtr.p = replicaPtr.p;
if (setup_create_replica(fragPtr,
&createReplica, constReplicaPtr))
{
removeOldStoredReplica(fragPtr, replicaPtr); removeOldStoredReplica(fragPtr, replicaPtr);
linkStoredReplica(fragPtr, replicaPtr); linkStoredReplica(fragPtr, replicaPtr);
}
else
{
infoEvent("Forcing take-over of node %d due to unsufficient REDO"
" for table %d fragment: %d",
nodePtr.i, tabPtr.i, i);
setNodeActiveStatus(nodePtr.i,
Sysfile::NS_NotActive_NotTakenOver);
}
}
} }
default: default:
jam(); jam();
...@@ -12282,37 +12298,14 @@ void Dbdih::removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr) ...@@ -12282,37 +12298,14 @@ void Dbdih::removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr)
/* CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/ /* CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/
/* A MAXIMUM OF FOUR NODES IS RETRIEVED. */ /* A MAXIMUM OF FOUR NODES IS RETRIEVED. */
/*************************************************************************/ /*************************************************************************/
void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) bool
Dbdih::setup_create_replica(FragmentstorePtr fragPtr,
CreateReplicaRecord* createReplicaPtrP,
ConstPtr<ReplicaRecord> replicaPtr)
{ {
Uint32 nextReplicaPtrI; createReplicaPtrP->dataNodeId = replicaPtr.p->procNode;
ConstPtr<ReplicaRecord> replicaPtr; createReplicaPtrP->replicaRec = replicaPtr.i;
replicaPtr.i = fragPtr.p->storedReplicas;
while (replicaPtr.i != RNIL) {
jam();
ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
nextReplicaPtrI = replicaPtr.p->nextReplica;
NodeRecordPtr nodePtr;
nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
jam();
switch (nodePtr.p->activeStatus) {
case Sysfile::NS_Active:
case Sysfile::NS_ActiveMissed_1:
case Sysfile::NS_ActiveMissed_2:{
/* ----------------------------------------------------------------- */
/* INITIALISE THE CREATE REPLICA STRUCTURE THAT IS USED FOR SENDING*/
/* TO LQH START_FRAGREQ. */
/* SET THE DATA NODE WHERE THE LOCAL CHECKPOINT IS FOUND. ALSO */
/* SET A REFERENCE TO THE REPLICA POINTER OF THAT. */
/* ----------------------------------------------------------------- */
CreateReplicaRecordPtr createReplicaPtr;
createReplicaPtr.i = cnoOfCreateReplicas;
ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
cnoOfCreateReplicas++;
createReplicaPtr.p->dataNodeId = replicaPtr.p->procNode;
createReplicaPtr.p->replicaRec = replicaPtr.i;
/* ----------------------------------------------------------------- */ /* ----------------------------------------------------------------- */
/* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */ /* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */
/* SYSTEM RESTART. */ /* SYSTEM RESTART. */
...@@ -12324,7 +12317,8 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) ...@@ -12324,7 +12317,8 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
stopGci, stopGci,
startGci, startGci,
startLcpNo); startLcpNo);
if (!result) { if (!result)
{
jam(); jam();
/* --------------------------------------------------------------- */ /* --------------------------------------------------------------- */
/* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/ /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
...@@ -12336,22 +12330,20 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) ...@@ -12336,22 +12330,20 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
/* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */ /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */
/* LOCAL CHECKPOINT TO ZNIL. */ /* LOCAL CHECKPOINT TO ZNIL. */
/* --------------------------------------------------------------- */ /* --------------------------------------------------------------- */
createReplicaPtr.p->lcpNo = ZNIL; createReplicaPtrP->lcpNo = ZNIL;
} else { }
else
{
jam(); jam();
/* --------------------------------------------------------------- */ /* --------------------------------------------------------------- */
/* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */ /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */
/* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */ /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */
/* --------------------------------------------------------------- */ /* --------------------------------------------------------------- */
createReplicaPtr.p->lcpNo = startLcpNo; createReplicaPtrP->lcpNo = startLcpNo;
arrGuard(startLcpNo, MAX_LCP_STORED); arrGuard(startLcpNo, MAX_LCP_STORED);
createReplicaPtr.p->createLcpId = replicaPtr.p->lcpId[startLcpNo]; createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo];
}//if }//if
if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){
jam();
nodePtr.p->nodeStatus = NodeRecord::DEAD;
}
/* ----------------------------------------------------------------- */ /* ----------------------------------------------------------------- */
/* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */ /* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */
...@@ -12359,21 +12351,48 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) ...@@ -12359,21 +12351,48 @@ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
/* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */ /* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */
/* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */ /* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */
/* -_--------------------------------------------------------------- */ /* -_--------------------------------------------------------------- */
if (!findLogNodes(createReplicaPtr.p, fragPtr, startGci, stopGci)) { return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci);
jam(); }
/* --------------------------------------------------------------- */
/* WE WERE NOT ABLE TO FIND ANY WAY OF RESTORING THIS REPLICA. */ void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
/* THIS IS A POTENTIAL SYSTEM ERROR. */ {
/* --------------------------------------------------------------- */ Uint32 nextReplicaPtrI;
cnoOfCreateReplicas--; Ptr<ReplicaRecord> replicaPtr;
return;
}//if
if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){ replicaPtr.i = fragPtr.p->storedReplicas;
while (replicaPtr.i != RNIL) {
jam(); jam();
nodePtr.p->nodeStatus = NodeRecord::ALIVE; ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
} nextReplicaPtrI = replicaPtr.p->nextReplica;
ConstPtr<ReplicaRecord> constReplicaPtr;
constReplicaPtr.i = replicaPtr.i;
constReplicaPtr.p = replicaPtr.p;
NodeRecordPtr nodePtr;
nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
jam();
switch (nodePtr.p->activeStatus) {
case Sysfile::NS_Active:
case Sysfile::NS_ActiveMissed_1:
case Sysfile::NS_ActiveMissed_2:{
/* ----------------------------------------------------------------- */
/* INITIALISE THE CREATE REPLICA STRUCTURE THAT IS USED FOR SENDING*/
/* TO LQH START_FRAGREQ. */
/* SET THE DATA NODE WHERE THE LOCAL CHECKPOINT IS FOUND. ALSO */
/* SET A REFERENCE TO THE REPLICA POINTER OF THAT. */
/* ----------------------------------------------------------------- */
CreateReplicaRecordPtr createReplicaPtr;
createReplicaPtr.i = cnoOfCreateReplicas;
ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
cnoOfCreateReplicas++;
/**
* Should have been checked in resetReplicaSr
*/
ndbrequire(setup_create_replica(fragPtr,
createReplicaPtr.p,
constReplicaPtr));
break; break;
} }
default: default:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment