Commit 4a0c8322 authored by unknown's avatar unknown

ndb - bug#29167

  Fix the case where all nodes in a node group die before they have saved
  their sysfile (wrt gcp), and Qmgr incorrectly thinks that "node group is missing"


storage/ndb/src/kernel/blocks/ERROR_codes.txt:
  code
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  fix
storage/ndb/test/ndbapi/testSystemRestart.cpp:
  test
storage/ndb/test/run-test/daily-basic-tests.txt:
  test
parent b26fa854
...@@ -5,7 +5,7 @@ Next DBACC 3002 ...@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029 Next DBTUP 4029
Next DBLQH 5045 Next DBLQH 5045
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7183 Next DBDIH 7184
Next DBTC 8040 Next DBTC 8040
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10038 Next BACKUP 10038
...@@ -75,6 +75,8 @@ Delay GCP_SAVEREQ by 10 secs ...@@ -75,6 +75,8 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF 7180: Crash master during master-take-over in execMASTER_LCPCONF
7183: Crash when receiving COPY_GCIREQ
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -747,6 +747,8 @@ done: ...@@ -747,6 +747,8 @@ done:
} }
ndbrequire(ok); ndbrequire(ok);
CRASH_INSERTION(7183);
/* ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */ /* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */
...@@ -1230,6 +1232,17 @@ void Dbdih::execDIH_RESTARTREQ(Signal* signal) ...@@ -1230,6 +1232,17 @@ void Dbdih::execDIH_RESTARTREQ(Signal* signal)
Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups); Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
ndbrequire(ng < MAX_NDB_NODES); ndbrequire(ng < MAX_NDB_NODES);
Uint32 gci = node_gcis[i]; Uint32 gci = node_gcis[i];
if (gci < SYSFILE->lastCompletedGCI[i])
{
jam();
/**
 * Handle the case where *I* know that the node completed a GCI
 * but the node itself does not (bug#29167),
 * i.e. the node died before it wrote its own sysfile.
 */
gci = SYSFILE->lastCompletedGCI[i];
}
if (gci > node_group_gcis[ng]) if (gci > node_group_gcis[ng])
{ {
jam(); jam();
......
...@@ -1219,6 +1219,48 @@ runBug24664(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1219,6 +1219,48 @@ runBug24664(NDBT_Context* ctx, NDBT_Step* step)
return result; return result;
} }
/**
 * Regression test for bug#29167.
 *
 * Sequence:
 *  1. Pick two nodes: node1 via getRandomNodeOtherNodeGroup(master, ...) and
 *     node2 via getRandomNodeSameNodeGroup(node1, ...).
 *  2. Make all nodes restart on error insert and dump
 *     DihSetTimeBetweenGcp = 30000 on all nodes.
 *  3. Wait until a NDB_LE_GlobalCheckpointCompleted log event is seen.
 *  4. Insert error 932 in all nodes and error 7183 in node1 and node2.
 *  5. Wait for the cluster to reach "no start", start all nodes again and
 *     wait for the cluster to be started.
 *
 * @return NDBT_OK when fewer than two data nodes are available (nothing to
 *         test), NDBT_FAILED when the log event handle cannot be created,
 *         otherwise the value of `result` after the CHECK steps.
 *         NOTE(review): CHECK is defined outside this block; it presumably
 *         sets `result` and leaves the do/while(false) on failure - confirm.
 */
int
runBug29167(NDBT_Context* ctx, NDBT_Step* step)
{
  int result = NDBT_OK;
  NdbRestarter restarter;
  // NOTE(review): pNdb is not used below; kept because GETNDB is defined
  // elsewhere and its side effects (if any) are not visible here.
  Ndb* pNdb = GETNDB(step);
  const Uint32 nodeCount = restarter.getNumDbNodes();

  if (nodeCount < 2)
    return NDBT_OK;

  int filter[] = { 15, NDB_MGM_EVENT_CATEGORY_CHECKPOINT, 0 };
  NdbLogEventHandle handle =
    ndb_mgm_create_logevent_handle(restarter.handle, filter);
  if (handle == 0)
  {
    // Without a handle the wait loop below would dereference a null pointer.
    return NDBT_FAILED;
  }

  struct ndb_logevent event;
  int master = restarter.getMasterNodeId();
  do {
    // NOTE(review): with a single node group (or a single replica) these
    // lookups presumably cannot find a node; the CHECKs on insertErrorInNode
    // below are then expected to fail - confirm against NdbRestarter.
    int node1 = restarter.getRandomNodeOtherNodeGroup(master, rand());
    int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());

    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
    restarter.dumpStateAllNodes(val2, 2);

    int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 30000 };
    restarter.dumpStateAllNodes(dump, 2);

    // Drain log events until a global checkpoint completion is reported
    // (or reading the next event fails).
    while (ndb_logevent_get_next(handle, &event, 0) >= 0 &&
           event.type != NDB_LE_GlobalCheckpointCompleted)
    {
      /* keep waiting */
    }

    // The handle is only needed for the wait above; release it here, before
    // any CHECK can leave the block, so that it is never leaked.
    ndb_mgm_destroy_logevent_handle(&handle);

    CHECK(restarter.insertErrorInAllNodes(932) == 0);

    // 7183: crash when receiving COPY_GCIREQ (see ERROR_codes.txt).
    CHECK(restarter.insertErrorInNode(node1, 7183) == 0);
    CHECK(restarter.insertErrorInNode(node2, 7183) == 0);

    CHECK(restarter.waitClusterNoStart() == 0);
    restarter.startAll();
    CHECK(restarter.waitClusterStarted() == 0);
  } while(false);

  return result;
}
NDBT_TESTSUITE(testSystemRestart); NDBT_TESTSUITE(testSystemRestart);
TESTCASE("SR1", TESTCASE("SR1",
"Basic system restart test. Focus on testing restart from REDO log.\n" "Basic system restart test. Focus on testing restart from REDO log.\n"
...@@ -1399,6 +1441,12 @@ TESTCASE("Bug24664", ...@@ -1399,6 +1441,12 @@ TESTCASE("Bug24664",
STEP(runBug24664); STEP(runBug24664);
FINALIZER(runClearTable); FINALIZER(runClearTable);
} }
// Test case "Bug29167" (registered with an empty description string):
// runs runWaitStarted as initializer and then runBug29167 as its only step.
// No FINALIZER is registered for this test case.
TESTCASE("Bug29167", "")
{
INITIALIZER(runWaitStarted);
STEP(runBug29167);
}
NDBT_TESTSUITE_END(testSystemRestart); NDBT_TESTSUITE_END(testSystemRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -485,6 +485,10 @@ max-time: 1000 ...@@ -485,6 +485,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug27003 T1 args: -n Bug27003 T1
max-time: 300
cmd: testSystemRestart
args: -n Bug29167 T1
max-time: 1000 max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug27283 T1 args: -n Bug27283 T1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment