From 327cd42a7f0cf18b8a6b13ac7cf4b04733df07e3 Mon Sep 17 00:00:00 2001 From: unknown <jonas@perch.ndb.mysql.com> Date: Wed, 7 Nov 2007 20:57:21 +0100 Subject: [PATCH] ndb - bug#32160 (recommit to 5.0) fix lcp master take over bug ndb/src/kernel/blocks/ERROR_codes.txt: new error codes ndb/src/kernel/blocks/dbdih/Dbdih.hpp: add debug code ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: fix master lcp bug add 2 new error codes ndb/test/ndbapi/testNodeRestart.cpp: testcase ndb/test/run-test/daily-basic-tests.txt: testcase --- ndb/src/kernel/blocks/ERROR_codes.txt | 7 ++- ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 10 ++++ ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 60 ++++++++++++++++++++--- ndb/test/ndbapi/testNodeRestart.cpp | 48 ++++++++++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 ++ 5 files changed, 122 insertions(+), 7 deletions(-) diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index e45c608b60..2599bf4098 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -5,7 +5,7 @@ Next DBACC 3002 Next DBTUP 4014 Next DBLQH 5043 Next DBDICT 6007 -Next DBDIH 7183 +Next DBDIH 7195 Next DBTC 8052 Next CMVMI 9000 Next BACKUP 10022 @@ -73,6 +73,11 @@ Delay GCP_SAVEREQ by 10 secs 7180: Crash master during master-take-over in execMASTER_LCPCONF +7193: Dont send LCP_FRAG_ORD to self, and crash when sending first + LCP_FRAG_ORD(last) + +7194: Force removeNodeFromStored to complete in the middle of MASTER_LCPCONF + ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ----------------------------------------------------------------- diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index ca91f56909..e471a95339 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -1291,7 +1291,17 @@ private: LcpStatus lcpStatus; Uint32 lcpStatusUpdatedPlace; + struct Save { + LcpStatus m_status; + Uint32 m_place; + } m_saveState[10]; + void setLcpStatus(LcpStatus status, Uint32 line){ + for (Uint32 i = 9; i > 0; i--) + m_saveState[i] = m_saveState[i-1]; + m_saveState[0].m_status = lcpStatus; + m_saveState[0].m_place = lcpStatusUpdatedPlace; + lcpStatus = status; lcpStatusUpdatedPlace = line; } diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 9191bb3fb9..88d167f098 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -4764,11 +4764,19 @@ void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr) } jam(); - signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; - signal->theData[1] = failedNodePtr.i; - signal->theData[2] = 0; // Tab id - sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); - + + if (!ERROR_INSERTED(7194)) + { + signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; + signal->theData[1] = failedNodePtr.i; + signal->theData[2] = 0; // Tab id + sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); + } + else + { + ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE"); + } + setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE); }//Dbdih::startRemoveFailedNode() @@ -5676,12 +5684,22 @@ Dbdih::checkEmptyLcpComplete(Signal *signal){ signal->theData[0] = 7012; execDUMP_STATE_ORD(signal); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE"); + signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; + signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId; + signal->theData[2] = 0; // Tab id + sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); + } c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__); MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0]; req->masterRef = reference(); req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId; sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ); + } else { sendMASTER_LCPCONF(signal); } @@ -5998,6 +6016,15 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal) { const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0]; jamEntry(); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("delaying MASTER_LCPCONF due to error 7194"); + sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal, + 300, signal->getLength()); + return; + } + Uint32 senderNodeId = conf->senderNodeId; MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState; const Uint32 failedNodeId = conf->failedNodeId; @@ -6132,7 +6159,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) #endif c_lcpState.keepGci = SYSFILE->keepGCI; - c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__); startLcpRoundLoopLab(signal, 0, 0); break; } @@ -9924,6 +9950,8 @@ void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal) if(ERROR_INSERTED(7075)){ continue; } + + CRASH_INSERTION(7193); BlockReference ref = calcLqhBlockRef(nodePtr.i); sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB); } @@ -10121,6 +10149,13 @@ Dbdih::checkLcpAllTablesDoneInLqh(){ CRASH_INSERTION2(7017, !isMaster()); c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("CLEARING 7194"); + CLEAR_ERROR_INSERT_VALUE; + } + return true; } @@ -10276,6 +10311,11 @@ Dbdih::sendLCP_FRAG_ORD(Signal* signal, BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode); + if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId()) + { + return; + } + LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0]; lcpFragOrd->tableId = info.tableId; lcpFragOrd->fragmentId = info.fragId; @@ -13686,6 +13726,14 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) ("immediateLcpStart = %d masterLcpNodeId = %d", c_lcpState.immediateLcpStart, refToNode(c_lcpState.m_masterLcpDihRef)); + + for (Uint32 i = 0; i<10; i++) + { + infoEvent("%u : status: %u place: %u", i, + c_lcpState.m_saveState[i].m_status, + c_lcpState.m_saveState[i].m_place); + } + infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); } diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index 03a60b1b52..12b0187b71 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -1347,6 +1347,51 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step) return NDBT_OK; } +int +runBug32160(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + Ndb* pNdb = GETNDB(step); + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + { + return NDBT_OK; + } + + int master = res.getMasterNodeId(); + int next = res.getNextMasterNodeId(master); + + if (res.insertErrorInNode(next, 7194)) + { + return NDBT_FAILED; + } + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + if (res.dumpStateOneNode(master, val2, 2)) + return NDBT_FAILED; + + if (res.insertErrorInNode(master, 7193)) + return NDBT_FAILED; + + int val3[] = { 7099 }; + if (res.dumpStateOneNode(master, val3, 1)) + return NDBT_FAILED; + + if (res.waitNodesNoStart(&master, 1)) + return NDBT_FAILED; + + if (res.startNodes(&master, 1)) + return NDBT_FAILED; + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -1686,6 +1731,9 @@ TESTCASE("Bug28717", ""){ TESTCASE("Bug29364", ""){ INITIALIZER(runBug29364); } +TESTCASE("Bug32160", ""){ + INITIALIZER(runBug32160); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 4f7ba26bf2..7b4a4ca0e2 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -497,6 +497,10 @@ max-time: 1000 cmd: testNodeRestart args: -n Bug26481 T1 +max-time: 300 +cmd: testNodeRestart +args: -n Bug32160 T1 + # OLD FLEX max-time: 500 cmd: flexBench -- 2.30.9