ndb - bug#27003

  Handle random (out-of-order) LQHKEYREQ failures during node restart
parent 9370d6a3
...@@ -489,3 +489,15 @@ Dbdict: ...@@ -489,3 +489,15 @@ Dbdict:
6003 Crash in participant @ CreateTabReq::Prepare 6003 Crash in participant @ CreateTabReq::Prepare
6004 Crash in participant @ CreateTabReq::Commit 6004 Crash in participant @ CreateTabReq::Commit
6005 Crash in participant @ CreateTabReq::CreateDrop 6005 Crash in participant @ CreateTabReq::CreateDrop
TUP:
----
4025: Fail all inserts with out of memory
4026: Fail one insert with oom
4027: Fail inserts randomly with oom
4028: Fail one random insert with oom
NDBCNTR:
1000: Crash insertion on SystemError::CopyFragRef
...@@ -9641,6 +9641,15 @@ void Dblqh::copyCompletedLab(Signal* signal) ...@@ -9641,6 +9641,15 @@ void Dblqh::copyCompletedLab(Signal* signal)
closeCopyLab(signal); closeCopyLab(signal);
return; return;
}//if }//if
if (scanptr.p->scanState == ScanRecord::WAIT_LQHKEY_COPY &&
scanptr.p->scanErrorCounter)
{
jam();
closeCopyLab(signal);
return;
}
if (scanptr.p->scanState == ScanRecord::WAIT_LQHKEY_COPY) { if (scanptr.p->scanState == ScanRecord::WAIT_LQHKEY_COPY) {
jam(); jam();
/*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/
...@@ -9717,13 +9726,16 @@ void Dblqh::continueCopyAfterBlockedLab(Signal* signal) ...@@ -9717,13 +9726,16 @@ void Dblqh::continueCopyAfterBlockedLab(Signal* signal)
void Dblqh::copyLqhKeyRefLab(Signal* signal) void Dblqh::copyLqhKeyRefLab(Signal* signal)
{ {
ndbrequire(tcConnectptr.p->transid[1] == signal->theData[4]); ndbrequire(tcConnectptr.p->transid[1] == signal->theData[4]);
tcConnectptr.p->copyCountWords -= signal->theData[3]; Uint32 copyWords = signal->theData[3];
scanptr.i = tcConnectptr.p->tcScanRec; scanptr.i = tcConnectptr.p->tcScanRec;
c_scanRecordPool.getPtr(scanptr); c_scanRecordPool.getPtr(scanptr);
scanptr.p->scanErrorCounter++; scanptr.p->scanErrorCounter++;
tcConnectptr.p->errorCode = terrorCode; tcConnectptr.p->errorCode = terrorCode;
closeCopyLab(signal);
return; LqhKeyConf* conf = (LqhKeyConf*)signal->getDataPtrSend();
conf->transId1 = copyWords;
conf->transId2 = tcConnectptr.p->transid[1];
copyCompletedLab(signal);
}//Dblqh::copyLqhKeyRefLab() }//Dblqh::copyLqhKeyRefLab()
void Dblqh::closeCopyLab(Signal* signal) void Dblqh::closeCopyLab(Signal* signal)
...@@ -9734,6 +9746,7 @@ void Dblqh::closeCopyLab(Signal* signal) ...@@ -9734,6 +9746,7 @@ void Dblqh::closeCopyLab(Signal* signal)
// Wait until all of those have arrived until we start the // Wait until all of those have arrived until we start the
// close process. // close process.
/*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/
scanptr.p->scanState = ScanRecord::WAIT_LQHKEY_COPY;
jam(); jam();
return; return;
}//if }//if
......
...@@ -213,6 +213,30 @@ void Dbtup::execTUP_ALLOCREQ(Signal* signal) ...@@ -213,6 +213,30 @@ void Dbtup::execTUP_ALLOCREQ(Signal* signal)
//--------------------------------------------------- //---------------------------------------------------
PagePtr pagePtr; PagePtr pagePtr;
Uint32 pageOffset; Uint32 pageOffset;
if (ERROR_INSERTED(4025))
{
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4026))
{
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4027) && (rand() % 100) > 25)
{
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4028) && (rand() % 100) > 25)
{
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 827;
return;
}
if (!allocTh(regFragPtr.p, if (!allocTh(regFragPtr.p,
regTabPtr.p, regTabPtr.p,
NORMAL_PAGE, NORMAL_PAGE,
......
...@@ -66,6 +66,7 @@ void Dbtup::initData() ...@@ -66,6 +66,7 @@ void Dbtup::initData()
undoPage = 0; undoPage = 0;
totNoOfPagesAllocated = 0; totNoOfPagesAllocated = 0;
cnoOfAllocatedPages = 0; cnoOfAllocatedPages = 0;
CLEAR_ERROR_INSERT_VALUE;
// Records with constant sizes // Records with constant sizes
}//Dbtup::initData() }//Dbtup::initData()
...@@ -570,7 +571,6 @@ void Dbtup::execSTTOR(Signal* signal) ...@@ -570,7 +571,6 @@ void Dbtup::execSTTOR(Signal* signal)
switch (startPhase) { switch (startPhase) {
case ZSTARTPHASE1: case ZSTARTPHASE1:
ljam(); ljam();
CLEAR_ERROR_INSERT_VALUE;
cownref = calcTupBlockRef(0); cownref = calcTupBlockRef(0);
break; break;
default: default:
......
...@@ -180,6 +180,7 @@ void Ndbcntr::execSYSTEM_ERROR(Signal* signal) ...@@ -180,6 +180,7 @@ void Ndbcntr::execSYSTEM_ERROR(Signal* signal)
break; break;
case SystemError::CopyFragRefError: case SystemError::CopyFragRefError:
CRASH_INSERTION(1000);
BaseString::snprintf(buf, sizeof(buf), BaseString::snprintf(buf, sizeof(buf),
"Killed by node %d as " "Killed by node %d as "
"copyfrag failed, error: %u", "copyfrag failed, error: %u",
......
...@@ -1125,6 +1125,59 @@ runBug26481(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1125,6 +1125,59 @@ runBug26481(NDBT_Context* ctx, NDBT_Step* step)
return NDBT_OK; return NDBT_OK;
} }
/**
 * Regression test for bug#27003.
 *
 * Picks a random non-master data node, restarts it, and then repeatedly
 * starts it with one of the TUP error inserts 4025..4028 armed (out-of-memory
 * failures on insert) together with error insert 1000 (crash insertion on
 * SystemError::CopyFragRef).  After each attempt the test waits for the node
 * to be back in the "no start" state before arming the next error.  Finally
 * the node is started without any error insert and the whole cluster must
 * come up.
 *
 * @param ctx   test context; only the configured loop count is read
 * @param step  unused
 * @return NDBT_OK on success, NDBT_FAILED as soon as a checked restarter
 *         call reports failure
 */
int
runBug27003(NDBT_Context* ctx, NDBT_Step* step)
{
  const int loops = ctx->getNumLoops();
  NdbRestarter res;

  // Zero-terminated list of TUP error inserts to exercise.
  static const int errnos[] = { 4025, 4026, 4027, 4028, 0 };

  int node = res.getRandomNotMasterNodeId(rand());
  ndbout_c("node: %d", node);

  // NOTE(review): the flag arguments presumably request a restart into the
  // "no start" state (the loop below waits for exactly that) -- confirm
  // against NdbRestarter::restartOneDbNode.
  if (res.restartOneDbNode(node, false, true, true))
    return NDBT_FAILED;

  for (int i = 0; i < loops; i++)
  {
    for (Uint32 pos = 0; errnos[pos] != 0; pos++)
    {
      ndbout_c("Testing err: %d", errnos[pos]);

      if (res.waitNodesNoStart(&node, 1))
        return NDBT_FAILED;

      // 1000: crash insertion on SystemError::CopyFragRef.
      if (res.insertErrorInNode(node, 1000))
        return NDBT_FAILED;

      if (res.insertErrorInNode(node, errnos[pos]))
        return NDBT_FAILED;

      // NOTE(review): value 1 presumably makes the node come back in the
      // "no start" state after the error insert fires -- confirm against the
      // CmvmiSetRestartOnErrorInsert documentation.
      int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
      if (res.dumpStateOneNode(node, val2, 2))
        return NDBT_FAILED;

      // Results of these two calls are intentionally not checked (as in the
      // original); the next iteration's waitNodesNoStart() is the actual
      // synchronisation point.
      res.startNodes(&node, 1);
      res.waitNodesStartPhase(&node, 1, 2);
    }
  }

  if (res.waitNodesNoStart(&node, 1))
    return NDBT_FAILED;

  // Final start with no error insert armed: the cluster must come up.
  res.startNodes(&node, 1);
  if (res.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\ "Test that one node at a time can be stopped and then restarted "\
...@@ -1452,6 +1505,9 @@ TESTCASE("Bug26457", ""){ ...@@ -1452,6 +1505,9 @@ TESTCASE("Bug26457", ""){
TESTCASE("Bug26481", ""){ TESTCASE("Bug26481", ""){
INITIALIZER(runBug26481); INITIALIZER(runBug26481);
} }
TESTCASE("Bug27003", ""){
INITIALIZER(runBug27003);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -425,6 +425,10 @@ max-time: 500 ...@@ -425,6 +425,10 @@ max-time: 500
cmd: testScan cmd: testScan
args: -n Bug24447 T1 args: -n Bug24447 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug27003 T1
max-time: 500 max-time: 500
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug15587 T1 args: -n Bug15587 T1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment