ndb - bug#31525

  Fix bug where a node that had missed 2 LCPs was not included in the next LCP after a system restart (SR)
parent c9f7d224
......@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029
Next DBLQH 5045
Next DBDICT 6007
Next DBDIH 7186
Next DBDIH 7193
Next DBTC 8054
Next CMVMI 9000
Next BACKUP 10038
......@@ -155,6 +155,9 @@ And crash when all have "not" been sent
7027: Crash in master when changing state to LCP_TAB_SAVED
7018: Crash in master when changing state to LCP_TAB_SAVED
7191: Crash when receiving LCP_COMPLETE_REP
7192: Crash in setLcpActiveStatusStart - when dead node missed two LCPs
ERROR CODES FOR TESTING NODE FAILURE, FAILURE IN COPY FRAGMENT PROCESS:
-----------------------------------------------------------------------
......
......@@ -10853,6 +10853,8 @@ void Dbdih::execLCP_COMPLETE_REP(Signal* signal)
{
jamEntry();
CRASH_INSERTION(7191);
#if 0
g_eventLogger.info("LCP_COMPLETE_REP");
printLCP_COMPLETE_REP(stdout,
......@@ -13603,6 +13605,7 @@ void Dbdih::setLcpActiveStatusStart(Signal* signal)
// It must be taken over with the copy fragment process after a system
// crash. We indicate this by setting the active status to TAKE_OVER.
/*-------------------------------------------------------------------*/
c_lcpState.m_participatingLQH.set(nodePtr.i);
nodePtr.p->activeStatus = Sysfile::NS_TakeOver;
//break; // Fall through
case Sysfile::NS_TakeOver:{
......@@ -13645,6 +13648,7 @@ void Dbdih::setLcpActiveStatusStart(Signal* signal)
break;
case Sysfile::NS_ActiveMissed_2:
jam();
CRASH_INSERTION(7192);
if ((nodePtr.p->nodeStatus == NodeRecord::ALIVE) &&
(!nodePtr.p->copyCompleted)) {
jam();
......
......@@ -1668,6 +1668,80 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step)
return NDBT_OK;
}
/**
 * Regression test for bug#31525.
 *
 * Sequence driven through NdbRestarter:
 *  1. Pick the master node and the next node in master succession, and
 *     restart the latter until both belong to the same node group.
 *  2. Send the DihMinTimeBetweenLCP dump to all nodes and enable
 *     CmvmiSetRestartOnErrorInsert.
 *  3. Insert error 932 in all nodes, error 7192 (crash in
 *     setLcpActiveStatusStart) in the next-master node and error 7191
 *     (crash when receiving LCP_COMPLETE_REP) in the master.
 *  4. Wait for the whole cluster to reach the no-start state, start all
 *     nodes again and wait for the cluster to come up.
 *  5. Restart the next-master node once more and wait for the cluster to
 *     be started.
 *
 * @param ctx   test context (not used by this test)
 * @param step  test step (not used by this test)
 * @return NDBT_OK when every restarter call succeeds or when the cluster
 *         has fewer than two data nodes; NDBT_FAILED as soon as any
 *         restarter call reports an error.
 */
int
runBug31525(NDBT_Context* ctx, NDBT_Step* step)
{
  NdbRestarter res;

  if (res.getNumDbNodes() < 2)
  {
    // The scenario needs two distinct data nodes; treat as passed.
    return NDBT_OK;
  }

  // nodes[0]: current master, nodes[1]: next node in master succession.
  int nodes[2];
  nodes[0] = res.getMasterNodeId();
  nodes[1] = res.getNextMasterNodeId(nodes[0]);

  // Restart the candidate until the node following the master is in the
  // master's own node group.
  while (res.getNodeGroup(nodes[0]) != res.getNodeGroup(nodes[1]))
  {
    ndbout_c("Restarting %u as it not in same node group as %u",
             nodes[1], nodes[0]);
    if (res.restartOneDbNode(nodes[1], false, true, true))
      return NDBT_FAILED;

    if (res.waitNodesNoStart(nodes+1, 1))
      return NDBT_FAILED;

    if (res.startNodes(nodes+1, 1))
      return NDBT_FAILED;

    if (res.waitClusterStarted())
      return NDBT_FAILED;

    nodes[1] = res.getNextMasterNodeId(nodes[0]);
  }

  ndbout_c("nodes[0]: %u nodes[1]: %u", nodes[0], nodes[1]);

  int val = DumpStateOrd::DihMinTimeBetweenLCP;
  if (res.dumpStateAllNodes(&val, 1))
    return NDBT_FAILED;

  // NOTE(review): presumably the argument 1 makes nodes stay in no-start
  // after an error-insert crash instead of restarting on their own -
  // confirm against the CMVMI dump handler.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  if (res.dumpStateAllNodes(val2, 2))
    return NDBT_FAILED;

  // NOTE(review): the meaning of error code 932 is not visible here;
  // verify against the error code list.
  if (res.insertErrorInAllNodes(932))
    return NDBT_FAILED;

  // 7192: crash in setLcpActiveStatusStart (next-master node).
  if (res.insertErrorInNode(nodes[1], 7192))
    return NDBT_FAILED;

  // 7191: crash when receiving LCP_COMPLETE_REP (master node).
  if (res.insertErrorInNode(nodes[0], 7191))
    return NDBT_FAILED;

  // All nodes are expected to go down and end up in no-start state.
  if (res.waitClusterNoStart())
    return NDBT_FAILED;

  if (res.startAll())
    return NDBT_FAILED;

  if (res.waitClusterStarted())
    return NDBT_FAILED;

  // Final restart of the next-master node after the cluster restart.
  if (res.restartOneDbNode(nodes[1], false, false, true))
    return NDBT_FAILED;

  if (res.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
......@@ -1991,6 +2065,9 @@ TESTCASE("Bug21271",
STEP(runPkUpdateUntilStopped);
FINALIZER(runClearTable);
}
// Test case "Bug31525": its only step is the initializer runBug31525;
// the description string is left empty.
TESTCASE("Bug31525", ""){
INITIALIZER(runBug31525);
}
// Test case "Bug24717": its only step is the initializer runBug24717;
// the description string is left empty.
TESTCASE("Bug24717", ""){
INITIALIZER(runBug24717);
}
......
......@@ -934,3 +934,7 @@ max-time: 1500
cmd: testSystemRestart
args: -n SR_DD_2b_LCP D2
max-time: 600
cmd: testNodeRestart
args: -n Bug31525 T1
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment