Commit 19340f22 authored by unknown

ndb - bug#18414

  Fix timeout during ABORT when ZABORT_TIMEOUT_BREAK is outstanding


ndb/src/kernel/blocks/ERROR_codes.txt:
  New error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  remove dumping of LCP info during NF
ndb/src/kernel/blocks/dbtc/DbtcMain.cpp:
  Fix timeout during ABORT when ZABORT_TIMEOUT_BREAK is outstanding
ndb/test/ndbapi/testNodeRestart.cpp:
  Add testcase for bug18414
ndb/test/ndbapi/testTimeout.cpp:
  Fix error code checking
ndb/test/run-test/daily-basic-tests.txt:
  Add testcase for bug18414
parent 058019f6
......@@ -226,6 +226,8 @@ Delay execution of COMPLETECONF signal 2 seconds to generate time-out.
8045: (ABORTCONF only as part of take-over)
Delay execution of ABORTCONF signal 2 seconds to generate time-out.
8050: Send ZABORT_TIMEOUT_BREAK delayed
ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC
-------------------------------------------------
......
......@@ -5982,10 +5982,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
signal->theData[0] = 7012;
execDUMP_STATE_ORD(signal);
signal->theData[0] = 7015;
signal->theData[1] = 0;
execDUMP_STATE_ORD(signal);
c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);
checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER);
......
......@@ -6386,6 +6386,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
return;
}
bool found = false;
OperationState tmp[16];
Uint32 TloopCount = 0;
......@@ -6393,7 +6394,31 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
jam();
if (tcConnectptr.i == RNIL) {
jam();
if (Tcheck == 0) {
#ifdef VM_TRACE
ndbout_c("found: %d Tcheck: %d apiConnectptr.p->counter: %d",
found, Tcheck, apiConnectptr.p->counter);
#endif
if (found || apiConnectptr.p->counter)
{
jam();
/**
* We sent at least one ABORT/ABORTED
* or ZABORT_TIMEOUT_BREAK is in job buffer
* wait for reception...
*/
return;
}
if (Tcheck == 1)
{
jam();
releaseAbortResources(signal);
return;
}
if (Tcheck == 0)
{
jam();
/*------------------------------------------------------------------
* All nodes had already reported ABORTED for all tcConnect records.
......@@ -6402,9 +6427,11 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
*------------------------------------------------------------------*/
char buf[96]; buf[0] = 0;
char buf2[96];
BaseString::snprintf(buf, sizeof(buf), "TC %d: %d ops:",
__LINE__, apiConnectptr.i);
for(Uint32 i = 0; i<TloopCount; i++){
BaseString::snprintf(buf, sizeof(buf), "TC %d: %d counter: %d ops:",
__LINE__, apiConnectptr.i,
apiConnectptr.p->counter);
for(Uint32 i = 0; i<TloopCount; i++)
{
BaseString::snprintf(buf2, sizeof(buf2), "%s %d", buf, tmp[i]);
BaseString::snprintf(buf, sizeof(buf), buf2);
}
......@@ -6412,7 +6439,9 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
ndbout_c(buf);
ndbrequire(false);
releaseAbortResources(signal);
return;
}
return;
}//if
TloopCount++;
......@@ -6427,7 +6456,16 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
signal->theData[0] = TcContinueB::ZABORT_TIMEOUT_BREAK;
signal->theData[1] = tcConnectptr.i;
signal->theData[2] = apiConnectptr.i;
sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
if (ERROR_INSERTED(8050))
{
ndbout_c("sending ZABORT_TIMEOUT_BREAK delayed (%d %d)",
Tcheck, apiConnectptr.p->counter);
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 2000, 3);
}
else
{
sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
}
return;
}//if
ptrCheckGuard(tcConnectptr, ctcConnectFilesize, tcConnectRecord);
......@@ -6450,7 +6488,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck)
jam();
if (tcConnectptr.p->tcNodedata[Ti] != 0) {
TloopCount += 31;
Tcheck = 1;
found = true;
hostptr.i = tcConnectptr.p->tcNodedata[Ti];
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
if (hostptr.p->hostStatus == HS_ALIVE) {
......@@ -7007,8 +7045,6 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal)
hostptr.i = tfailedNodeId;
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
ndbout_c("received execTAKE_OVERTCCONF(%d) from %x (%x)",
tfailedNodeId, signal->getSendersBlockRef(), reference());
if (signal->getSendersBlockRef() != reference())
{
jam();
......
......@@ -581,6 +581,73 @@ runBug16772(NDBT_Context* ctx, NDBT_Step* step){
return ret ? NDBT_OK : NDBT_FAILED;
}
/**
 * Test case for bug#18414: abort handling in the transaction coordinator
 * while a ZABORT_TIMEOUT_BREAK signal is still outstanding.
 *
 * Each of the five iterations:
 *   1. starts a transaction, updates 128 records and executes without
 *      committing,
 *   2. picks node1 (the node the transaction is connected to) and node2
 *      (a random node in the same node group as node1),
 *   3. on odd iterations inserts error 8050 in node1 ("Send
 *      ZABORT_TIMEOUT_BREAK delayed" according to ERROR_codes.txt),
 *   4. inserts error 5003 in node2 (presumably makes node2 fail -- the
 *      test subsequently waits for it to reach the no-start state;
 *      TODO confirm against ERROR_codes.txt),
 *   5. rolls the transaction back, waits for node2 to stop, clears all
 *      error inserts, restarts node2 and waits for the cluster,
 *   6. verifies the table is usable again with a scan update.
 *
 * @param ctx   test context; supplies the table under test
 * @param step  test step; supplies the Ndb object
 * @return NDBT_OK on success or when the test cannot run (fewer than two
 *         data nodes, or no suitable node pair), NDBT_FAILED otherwise
 */
int
runBug18414(NDBT_Context* ctx, NDBT_Step* step){

  NdbRestarter restarter;
  if (restarter.getNumDbNodes() < 2)
  {
    // A second data node is required to pick node2; nothing to test.
    ctx->stopTest();
    return NDBT_OK;
  }

  Ndb* pNdb = GETNDB(step);
  HugoOperations hugoOps(*ctx->getTab());
  HugoTransactions hugoTrans(*ctx->getTab());
  int loop = 0;
  do
  {
    if (hugoOps.startTransaction(pNdb) != 0)
      goto err;

    if (hugoOps.pkUpdateRecord(pNdb, 0, 128, rand()) != 0)
      goto err;

    if (hugoOps.execute_NoCommit(pNdb) != 0)
      goto err;

    const int node1 = hugoOps.getTransaction()->getConnectedNodeId();
    const int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());

    if (node1 == -1 || node2 == -1)
    {
      // No usable node pair: stop without failing, but do not leave the
      // transaction started above open (every other exit path closes it).
      hugoOps.closeTransaction(pNdb);
      break;
    }

    if (loop & 1)
    {
      // Odd iterations only: delay ZABORT_TIMEOUT_BREAK in node1.
      if (restarter.insertErrorInNode(node1, 8050))
        goto err;
    }

    if (restarter.insertErrorInNode(node2, 5003))
      goto err;

    // The outcome of the rollback is deliberately ignored; the test only
    // checks that the cluster recovers afterwards.
    (void)hugoOps.execute_Rollback(pNdb);

    if (restarter.waitNodesNoStart(&node2, 1) != 0)
      goto err;

    // Clear the error inserts before bringing node2 back.
    if (restarter.insertErrorInAllNodes(0))
      goto err;

    if (restarter.startNodes(&node2, 1) != 0)
      goto err;

    if (restarter.waitClusterStarted() != 0)
      goto err;

    if (hugoTrans.scanUpdateRecords(pNdb, 128) != 0)
      goto err;

    hugoOps.closeTransaction(pNdb);

  } while (++loop < 5);

  return NDBT_OK;

err:
  hugoOps.closeTransaction(pNdb);
  return NDBT_FAILED;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -870,6 +937,12 @@ TESTCASE("Bug16772",
"Test bug with restarting before NF handling is complete"){
STEP(runBug16772);
}
// Registers test case "Bug18414" in the testNodeRestart suite:
// runLoadTable is declared as its initializer, runBug18414 as its step
// and runClearTable as its finalizer.
TESTCASE("Bug18414",
"Test bug with NF during NR"){
INITIALIZER(runLoadTable);
STEP(runBug18414);
FINALIZER(runClearTable);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -173,8 +173,11 @@ int runTimeoutTrans(NDBT_Context* ctx, NDBT_Step* step){
NdbSleep_MilliSleep(sleep);
// Expect that transaction has timed-out
CHECK(hugoOps.execute_Commit(pNdb) == 237);
int ret = hugoOps.execute_Commit(pNdb);
CHECK(ret != 0);
NdbError err = pNdb->getNdbError(ret);
CHECK(err.classification == NdbError::TimeoutExpired);
} while(false);
hugoOps.closeTransaction(pNdb);
......
......@@ -458,6 +458,10 @@ max-time: 500
cmd: testSystemRestart
args: -n Bug18385 T1
max-time: 500
cmd: testNodeRestart
args: -n Bug18414 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment