Commit 4a0c8322 authored by unknown's avatar unknown

ndb - bug#29167

  Fix case where all node in node group dies before they saved sysfile (wrt gcp)
  and Qmgr incorrectly thinks that "node group is missing"


storage/ndb/src/kernel/blocks/ERROR_codes.txt:
  code
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  fix
storage/ndb/test/ndbapi/testSystemRestart.cpp:
  test
storage/ndb/test/run-test/daily-basic-tests.txt:
  test
parent b26fa854
......@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029
Next DBLQH 5045
Next DBDICT 6007
Next DBDIH 7183
Next DBDIH 7184
Next DBTC 8040
Next CMVMI 9000
Next BACKUP 10038
......@@ -75,6 +75,8 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF
7183: Crash when receiving COPY_GCIREQ
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
......
......@@ -747,6 +747,8 @@ done:
}
ndbrequire(ok);
CRASH_INSERTION(7183);
/* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */
......@@ -1230,6 +1232,17 @@ void Dbdih::execDIH_RESTARTREQ(Signal* signal)
Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
ndbrequire(ng < MAX_NDB_NODES);
Uint32 gci = node_gcis[i];
if (gci < SYSFILE->lastCompletedGCI[i])
{
jam();
/**
* Handle case, where *I* know that node complete GCI
* but node does not...bug#29167
* i.e node died before it wrote own sysfile
*/
gci = SYSFILE->lastCompletedGCI[i];
}
if (gci > node_group_gcis[ng])
{
jam();
......
......@@ -1219,6 +1219,48 @@ runBug24664(NDBT_Context* ctx, NDBT_Step* step)
return result;
}
int
runBug29167(NDBT_Context* ctx, NDBT_Step* step)
{
int result = NDBT_OK;
NdbRestarter restarter;
Ndb* pNdb = GETNDB(step);
const Uint32 nodeCount = restarter.getNumDbNodes();
if (nodeCount < 2)
return NDBT_OK;
int filter[] = { 15, NDB_MGM_EVENT_CATEGORY_CHECKPOINT, 0 };
NdbLogEventHandle handle =
ndb_mgm_create_logevent_handle(restarter.handle, filter);
struct ndb_logevent event;
int master = restarter.getMasterNodeId();
do {
int node1 = restarter.getRandomNodeOtherNodeGroup(master, rand());
int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
restarter.dumpStateAllNodes(val2, 2);
int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 30000 };
restarter.dumpStateAllNodes(dump, 2);
while(ndb_logevent_get_next(handle, &event, 0) >= 0 &&
event.type != NDB_LE_GlobalCheckpointCompleted);
CHECK(restarter.insertErrorInAllNodes(932) == 0);
CHECK(restarter.insertErrorInNode(node1, 7183) == 0);
CHECK(restarter.insertErrorInNode(node2, 7183) == 0);
CHECK(restarter.waitClusterNoStart() == 0);
restarter.startAll();
CHECK(restarter.waitClusterStarted() == 0);
} while(false);
return result;
}
NDBT_TESTSUITE(testSystemRestart);
TESTCASE("SR1",
"Basic system restart test. Focus on testing restart from REDO log.\n"
......@@ -1399,6 +1441,12 @@ TESTCASE("Bug24664",
STEP(runBug24664);
FINALIZER(runClearTable);
}
TESTCASE("Bug29167", "")
{
INITIALIZER(runWaitStarted);
STEP(runBug29167);
}
NDBT_TESTSUITE_END(testSystemRestart);
int main(int argc, const char** argv){
......
......@@ -485,6 +485,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug27003 T1
max-time: 300
cmd: testSystemRestart
args: -n Bug29167 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug27283 T1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment