ndb - bug#18781 (5.0) handle rolling upgrade, minor fixes, logging, docs

parent cdc421dc
...@@ -55,7 +55,9 @@ public: ...@@ -55,7 +55,9 @@ public:
// Error codes passed to sendDictLockRef() by Dbdict::execDICT_LOCK_REQ.
// NOTE(review): this listing fuses the old (left) and new (right) side of
// a diff on each line; the new side inserts BadUserRef and TooLate and
// moves TooManyRequests from 3 to 5. Confirm that no released version
// relies on the old value 3.
enum ErrorCode { enum ErrorCode {
// the node receiving the request is not the master
NotMaster = 1, NotMaster = 1,
// presumably sent when the requested lock type is unknown - TODO confirm
InvalidLockType = 2, InvalidLockType = 2,
// userRef differs from the sender's block reference, or the sending
// node is not of type DB
TooManyRequests = 3 BadUserRef = 3,
// the requesting node is already in the master's c_aliveNodes
TooLate = 4,
// presumably sent when no lock queue record can be seized - TODO confirm
TooManyRequests = 5
}; };
private: private:
Uint32 userPtr; Uint32 userPtr;
......
...@@ -60,5 +60,7 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ]; ...@@ -60,5 +60,7 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ];
#define NDBD_INCL_NODECONF_VERSION_4 MAKE_VERSION(4,1,17) #define NDBD_INCL_NODECONF_VERSION_4 MAKE_VERSION(4,1,17)
#define NDBD_INCL_NODECONF_VERSION_5 MAKE_VERSION(5,0,18) #define NDBD_INCL_NODECONF_VERSION_5 MAKE_VERSION(5,0,18)
#define NDBD_DICT_LOCK_VERSION_5 MAKE_VERSION(5,0,23)
#endif #endif
...@@ -5,7 +5,7 @@ Next DBACC 3002 ...@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4013 Next DBTUP 4013
Next DBLQH 5043 Next DBLQH 5043
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7175 Next DBDIH 7177
Next DBTC 8037 Next DBTC 8037
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10022 Next BACKUP 10022
...@@ -312,7 +312,9 @@ Test Crashes in handling node restarts ...@@ -312,7 +312,9 @@ Test Crashes in handling node restarts
7170: Crash when receiving START_PERMREF (InitialStartRequired) 7170: Crash when receiving START_PERMREF (InitialStartRequired)
7174: Send one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR) 7174: Crash starting node before sending DICT_LOCK_REQ
7175: Master sends one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
7176: Slave NR pretends master does not support DICT lock (rolling upgrade)
DICT: DICT:
6000 Crash during NR when receiving DICTSTARTREQ 6000 Crash during NR when receiving DICTSTARTREQ
......
...@@ -205,7 +205,7 @@ void Dbdict::execCONTINUEB(Signal* signal) ...@@ -205,7 +205,7 @@ void Dbdict::execCONTINUEB(Signal* signal)
case ZDICT_LOCK_POLL: case ZDICT_LOCK_POLL:
jam(); jam();
checkDictLockQueue(signal); checkDictLockQueue(signal, true);
break; break;
default : default :
...@@ -2836,7 +2836,6 @@ void Dbdict::execNODE_FAILREP(Signal* signal) ...@@ -2836,7 +2836,6 @@ void Dbdict::execNODE_FAILREP(Signal* signal)
case BS_NODE_RESTART: case BS_NODE_RESTART:
jam(); jam();
ok = true; ok = true;
removeStaleDictLocks(signal, theFailedNodes);
break; break;
} }
ndbrequire(ok); ndbrequire(ok);
...@@ -2860,6 +2859,15 @@ void Dbdict::execNODE_FAILREP(Signal* signal) ...@@ -2860,6 +2859,15 @@ void Dbdict::execNODE_FAILREP(Signal* signal)
}//if }//if
}//for }//for
/*
* NODE_FAILREP guarantees that no "in flight" signal from
* a dead node is accepted, and also that the job buffer contains
* no such (un-executed) signals. Therefore no DICT_UNLOCK_ORD
* from a dead node (leading to master crash) is possible after
* this clean-up removes the lock record.
*/
removeStaleDictLocks(signal, theFailedNodes);
}//execNODE_FAILREP() }//execNODE_FAILREP()
...@@ -12210,7 +12218,7 @@ Dbdict::getIndexAttrMask(TableRecordPtr indexPtr, AttributeMask& mask) ...@@ -12210,7 +12218,7 @@ Dbdict::getIndexAttrMask(TableRecordPtr indexPtr, AttributeMask& mask)
const Dbdict::DictLockType* const Dbdict::DictLockType*
Dbdict::getDictLockType(Uint32 lockType) Dbdict::getDictLockType(Uint32 lockType)
{ {
static DictLockType lt[] = { static const DictLockType lt[] = {
{ DictLockReq::NodeRestartLock, BS_NODE_RESTART, "NodeRestart" } { DictLockReq::NodeRestartLock, BS_NODE_RESTART, "NodeRestart" }
}; };
for (int i = 0; i < sizeof(lt)/sizeof(lt[0]); i++) { for (int i = 0; i < sizeof(lt)/sizeof(lt[0]); i++) {
...@@ -12220,12 +12228,40 @@ Dbdict::getDictLockType(Uint32 lockType) ...@@ -12220,12 +12228,40 @@ Dbdict::getDictLockType(Uint32 lockType)
return NULL; return NULL;
} }
// Log a one-line summary of the DICT lock state via infoEvent():
// block state, pool size minus free count of the op record pool,
// the polling flag, the given poll count, and the lock queue.
// The queue is rendered in iteration order as space-separated node ids,
// each followed by "L" when that entry's `locked` flag is set.
void
Dbdict::sendDictLockInfoEvent(Uint32 pollCount)
{
  char queueText[100];
  queueText[0] = 0;
  char* cursor = queueText;
  const char* const bufEnd = queueText + sizeof(queueText);

  unsigned entryNo = 0;
  DictLockPtr iterPtr;
  for (c_dictLockQueue.first(iterPtr);
       iterPtr.i != RNIL;
       c_dictLockQueue.next(iterPtr)) {
    jam();
    entryNo++;
    const char* const separator = (entryNo == 1) ? "" : " ";
    const char* const lockedMark = iterPtr.p->locked ? "L" : "";
    // each write is limited to the space left in the buffer
    my_snprintf(cursor, bufEnd - cursor, "%s%u%s",
                separator,
                (unsigned)refToNode(iterPtr.p->req.userRef),
                lockedMark);
    cursor += strlen(cursor);
  }

  infoEvent("DICT: lock bs: %d ops: %d poll: %d cnt: %d queue: %s",
            (int)c_blockState,
            c_opRecordPool.getSize() - c_opRecordPool.getNoOfFree(),
            c_dictLockPoll, (int)pollCount, queueText);
}
// Log an info event of the form "DICT: <text> <node id> for <lock type>",
// where the node id is taken from the lock request's userRef and the
// lock type name from the lock's DictLockType entry.
// NOTE(review): this listing fuses the old (left) and new (right) side of
// a diff on each line; the only change is the spelling of the cast.
void void
Dbdict::sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text) Dbdict::sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text)
{ {
infoEvent("DICT: %s %u for %s", infoEvent("DICT: %s %u for %s",
text, text,
(unsigned int)refToNode(lockPtr.p->req.userRef), lockPtr.p->lt->text); (unsigned)refToNode(lockPtr.p->req.userRef), lockPtr.p->lt->text);
} }
void void
...@@ -12234,6 +12270,8 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal) ...@@ -12234,6 +12270,8 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
jamEntry(); jamEntry();
const DictLockReq* req = (const DictLockReq*)&signal->theData[0]; const DictLockReq* req = (const DictLockReq*)&signal->theData[0];
// make sure bad request crashes slave, not master (us)
if (getOwnNodeId() != c_masterNodeId) { if (getOwnNodeId() != c_masterNodeId) {
jam(); jam();
sendDictLockRef(signal, *req, DictLockRef::NotMaster); sendDictLockRef(signal, *req, DictLockRef::NotMaster);
...@@ -12247,6 +12285,19 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal) ...@@ -12247,6 +12285,19 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
return; return;
} }
if (req->userRef != signal->getSendersBlockRef() ||
getNodeInfo(refToNode(req->userRef)).m_type != NodeInfo::DB) {
jam();
sendDictLockRef(signal, *req, DictLockRef::BadUserRef);
return;
}
if (c_aliveNodes.get(refToNode(req->userRef))) {
jam();
sendDictLockRef(signal, *req, DictLockRef::TooLate);
return;
}
DictLockPtr lockPtr; DictLockPtr lockPtr;
if (! c_dictLockQueue.seize(lockPtr)) { if (! c_dictLockQueue.seize(lockPtr)) {
jam(); jam();
...@@ -12258,21 +12309,23 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal) ...@@ -12258,21 +12309,23 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
lockPtr.p->locked = false; lockPtr.p->locked = false;
lockPtr.p->lt = lt; lockPtr.p->lt = lt;
checkDictLockQueue(signal); checkDictLockQueue(signal, false);
if (! lockPtr.p->locked) if (! lockPtr.p->locked)
sendDictLockInfoEvent(lockPtr, "lock request by node"); sendDictLockInfoEvent(lockPtr, "lock request by node");
} }
void void
Dbdict::checkDictLockQueue(Signal* signal) Dbdict::checkDictLockQueue(Signal* signal, bool poll)
{ {
Uint32 pollCount = ! poll ? 0 : signal->theData[1];
DictLockPtr lockPtr; DictLockPtr lockPtr;
do { do {
if (! c_dictLockQueue.first(lockPtr)) { if (! c_dictLockQueue.first(lockPtr)) {
jam(); jam();
setDictLockPoll(signal, false); setDictLockPoll(signal, false, pollCount);
return; return;
} }
...@@ -12299,7 +12352,7 @@ Dbdict::checkDictLockQueue(Signal* signal) ...@@ -12299,7 +12352,7 @@ Dbdict::checkDictLockQueue(Signal* signal)
// this routine is called again when it is removed for any reason // this routine is called again when it is removed for any reason
bool on = ! lockPtr.p->locked; bool on = ! lockPtr.p->locked;
setDictLockPoll(signal, on); setDictLockPoll(signal, on, pollCount);
} }
void void
...@@ -12326,7 +12379,7 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* signal) ...@@ -12326,7 +12379,7 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* signal)
c_dictLockQueue.release(lockPtr); c_dictLockQueue.release(lockPtr);
checkDictLockQueue(signal); checkDictLockQueue(signal, false);
} }
void void
...@@ -12359,21 +12412,32 @@ Dbdict::sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode) ...@@ -12359,21 +12412,32 @@ Dbdict::sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode)
// control polling // control polling
// Turn the CONTINUEB-driven poll of the DICT lock queue on or off and
// log the lock state.
//   on        - true to schedule another ZDICT_LOCK_POLL to this block
//   pollCount - number of polls so far; checkDictLockQueue passes 0 when
//               it was not invoked from the poll
// NOTE(review): this listing fuses the old (left) and new (right) side of
// a diff on each line; the comments describe the new three-argument form.
void void
Dbdict::setDictLockPoll(Signal* signal, bool on) Dbdict::setDictLockPoll(Signal* signal, bool on, Uint32 pollCount)
{ {
if (on) { if (on) {
jam(); jam();
signal->theData[0] = ZDICT_LOCK_POLL; signal->theData[0] = ZDICT_LOCK_POLL;
// new side: carry the incremented poll count in theData[1], so the
// delayed signal is sent with length 2 instead of 1
sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1); signal->theData[1] = pollCount + 1;
sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 2);
} }
// remember whether the polling state actually changes
if (c_dictLockPoll != on) { bool change = (c_dictLockPoll != on);
if (change) {
jam(); jam();
// NOTE(review): cannot tell from this listing which side of the diff
// the VM_TRACE-only message belongs to - TODO confirm
#ifdef VM_TRACE
infoEvent("DICT: lock polling %s", on ? "On" : "Off");
#endif
c_dictLockPoll = on; c_dictLockPoll = on;
} }
// avoid too many messages if master is stuck busy (BS_NODE_FAILURE)
// Thinning schedule: every poll below 8, then every 8th below 64, every
// 64th below 512, every 512th below 4096, and every 4096th after that.
// && binds tighter than ||, so each line below is one alternative.
// pollCount == 0 always satisfies the first alternative.
bool periodic =
pollCount < 8 ||
pollCount < 64 && pollCount % 8 == 0 ||
pollCount < 512 && pollCount % 64 == 0 ||
pollCount < 4096 && pollCount % 512 == 0 ||
pollCount % 4096 == 0; // about every 6 minutes
// log on every state change, otherwise only at the thinned intervals
if (change || periodic)
sendDictLockInfoEvent(pollCount);
} }
// NF handling // NF handling
...@@ -12384,6 +12448,11 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes) ...@@ -12384,6 +12448,11 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes)
DictLockPtr loopPtr; DictLockPtr loopPtr;
c_dictLockQueue.first(loopPtr); c_dictLockQueue.first(loopPtr);
if (getOwnNodeId() != c_masterNodeId) {
ndbrequire(loopPtr.i == RNIL);
return;
}
while (loopPtr.i != RNIL) { while (loopPtr.i != RNIL) {
jam(); jam();
DictLockPtr lockPtr = loopPtr; DictLockPtr lockPtr = loopPtr;
...@@ -12409,7 +12478,7 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes) ...@@ -12409,7 +12478,7 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes)
} }
} }
checkDictLockQueue(signal); checkDictLockQueue(signal, false);
} }
......
...@@ -1804,14 +1804,15 @@ private: ...@@ -1804,14 +1804,15 @@ private:
bool c_dictLockPoll; bool c_dictLockPoll;
static const DictLockType* getDictLockType(Uint32 lockType); static const DictLockType* getDictLockType(Uint32 lockType);
void sendDictLockInfoEvent(Uint32 pollCount);
void sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text); void sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text);
void checkDictLockQueue(Signal* signal); void checkDictLockQueue(Signal* signal, bool poll);
void sendDictLockConf(Signal* signal, DictLockPtr lockPtr); void sendDictLockConf(Signal* signal, DictLockPtr lockPtr);
void sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode); void sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode);
// control polling i.e. continueB loop // control polling i.e. continueB loop
void setDictLockPoll(Signal* signal, bool on); void setDictLockPoll(Signal* signal, bool on, Uint32 pollCount);
// NF handling // NF handling
void removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes); void removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes);
......
Lock master DICT against schema operations
Implementation
--------------
[ see comments in Dbdict.hpp ]
Use case: Node startup INR / NR
-------------------------------
Master DICT (like any block) keeps a list of alive nodes (c_aliveNodes).
These nodes are the participants in schema ops.
(1) c_aliveNodes is initialized when DICT starts
in sp3 in READ_NODESCONF from CNTR
(2) when a slave node fails (in any sp of the slave node)
it is removed from c_aliveNodes in NODE_FAILREP
(3) when a slave node starts, it is added to c_aliveNodes
in sp4 of the starting node in INCL_NODEREQ
The slave DIH locks the master DICT in sp2 and releases the lock once the node has started (sp7).
This is based on the following constraints:
- the lock is taken when master DICT is known
DIH reads this in sp2 in READ_NODESCONF
- the lock is taken before (3)
- the lock is taken before copying starts and held until it is done
in sp4 DIH meta, DICT meta, tuple data
- on INR, START_PERMREQ in sp2 makes every DIH erase the LCP info of the slave
in invalidateNodeLCP(), which is not safe under schema ops
Signals:
All signals except the DICT_LOCK ones (DICT_LOCK_REQ, DICT_LOCK_CONF, DICT_UNLOCK_ORD) are standard v5.0 signals.
Legend: s=starting node, m=master, a=all participants, l=local block, sp=start phase.
* sp2 - DICT_LOCK and START_PERM
DIH/s
DICT_LOCK_REQ
DICT/m
DICT_LOCK_CONF
DIH/s
START_PERMREQ
DIH/m
START_INFOREQ
DIH/a
invalidateNodeLCP() if INR
DIH/a
START_INFOCONF
DIH/m
START_PERMCONF
DIH/s
* sp4 - START_ME (copy metadata, no changes)
DIH/s
START_MEREQ
DIH/m
COPY_TABREQ
DIH/s
COPY_TABCONF
DIH/m
DICTSTARTREQ
DICT/s
GET_SCHEMA_INFOREQ
DICT/m
SCHEMA_INFO
DICT/s
DICTSTARTCONF
DIH/m
INCL_NODEREQ
DIH/a
INCL_NODEREQ
ANY/l
INCL_NODECONF
DIH/a
INCL_NODECONF
DIH/m
START_MECONF
DIH/s
* sp7 - release DICT lock
DIH/s
DICT_UNLOCK_ORD
DICT/m
# vim: set et sw=4:
...@@ -1594,6 +1594,9 @@ void Dbdih::nodeRestartPh2Lab(Signal* signal) ...@@ -1594,6 +1594,9 @@ void Dbdih::nodeRestartPh2Lab(Signal* signal)
*/ */
ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL); ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);
// check that we are not yet taking part in schema ops
CRASH_INSERTION(7174);
Uint32 lockType = DictLockReq::NodeRestartLock; Uint32 lockType = DictLockReq::NodeRestartLock;
Callback c = { safe_cast(&Dbdih::recvDictLockConf_nodeRestart), 0 }; Callback c = { safe_cast(&Dbdih::recvDictLockConf_nodeRestart), 0 };
sendDictLockReq(signal, lockType, c); sendDictLockReq(signal, lockType, c);
...@@ -1746,7 +1749,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) ...@@ -1746,7 +1749,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
ndbrequire(refToNode(retRef) == nodeId); ndbrequire(refToNode(retRef) == nodeId);
if ((c_nodeStartMaster.activeState) || if ((c_nodeStartMaster.activeState) ||
(c_nodeStartMaster.wait != ZFALSE) || (c_nodeStartMaster.wait != ZFALSE) ||
ERROR_INSERTED_CLEAR(7174)) { ERROR_INSERTED_CLEAR(7175)) {
jam(); jam();
signal->theData[0] = nodeId; signal->theData[0] = nodeId;
signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR; signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
...@@ -14709,6 +14712,34 @@ Dbdih::sendDictLockReq(Signal* signal, Uint32 lockType, Callback c) ...@@ -14709,6 +14712,34 @@ Dbdih::sendDictLockReq(Signal* signal, Uint32 lockType, Callback c)
lockPtr.p->locked = false; lockPtr.p->locked = false;
lockPtr.p->callback = c; lockPtr.p->callback = c;
// handle rolling upgrade
{
Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;
unsigned int get_major = getMajor(masterVersion);
unsigned int get_minor = getMinor(masterVersion);
unsigned int get_build = getBuild(masterVersion);
ndbrequire(get_major == 4 || get_major == 5);
if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
ERROR_INSERTED(7176)) {
jam();
infoEvent("DIH: detect upgrade: master node %u old version %u.%u.%u",
(unsigned int)cmasterNodeId, get_major, get_minor, get_build);
DictLockConf* conf = (DictLockConf*)&signal->theData[0];
conf->userPtr = lockPtr.i;
conf->lockType = lockType;
conf->lockPtr = ZNIL;
sendSignal(reference(), GSN_DICT_LOCK_CONF, signal,
DictLockConf::SignalLength, JBB);
return;
}
}
BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId); BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
sendSignal(dictMasterRef, GSN_DICT_LOCK_REQ, signal, sendSignal(dictMasterRef, GSN_DICT_LOCK_REQ, signal,
DictLockReq::SignalLength, JBB); DictLockReq::SignalLength, JBB);
...@@ -14758,6 +14789,19 @@ Dbdih::sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI) ...@@ -14758,6 +14789,19 @@ Dbdih::sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI)
c_dictLockSlavePool.release(lockPtr); c_dictLockSlavePool.release(lockPtr);
// handle rolling upgrade
{
Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;
unsigned int get_major = getMajor(masterVersion);
ndbrequire(get_major == 4 || get_major == 5);
if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
ERROR_INSERTED(7176)) {
return;
}
}
BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId); BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
sendSignal(dictMasterRef, GSN_DICT_UNLOCK_ORD, signal, sendSignal(dictMasterRef, GSN_DICT_UNLOCK_ORD, signal,
DictUnlockOrd::SignalLength, JBB); DictUnlockOrd::SignalLength, JBB);
......
...@@ -1590,17 +1590,18 @@ recv_dict_ops_run(NDBT_Context* ctx) ...@@ -1590,17 +1590,18 @@ recv_dict_ops_run(NDBT_Context* ctx)
int int
runRestarts(NDBT_Context* ctx, NDBT_Step* step) runRestarts(NDBT_Context* ctx, NDBT_Step* step)
{ {
static int err_master[] = { // non-crashing static int errlst_master[] = { // non-crashing
0, 7175, // send one fake START_PERMREF
7174 // send one fake START_PERMREF 0
}; };
static int err_node[] = { static int errlst_node[] = {
0, 7174, // crash before sending DICT_LOCK_REQ
7121, // crash on START_PERMCONF 7176, // pretend master does not support DICT lock
7130 // crash on START_MECONF 7121, // crash at receive START_PERMCONF
0
}; };
const uint err_master_cnt = sizeof(err_master)/sizeof(err_master[0]); const uint errcnt_master = sizeof(errlst_master)/sizeof(errlst_master[0]);
const uint err_node_cnt = sizeof(err_node)/sizeof(err_node[0]); const uint errcnt_node = sizeof(errlst_node)/sizeof(errlst_node[0]);
myRandom48Init(NdbTick_CurrentMillisecond()); myRandom48Init(NdbTick_CurrentMillisecond());
NdbRestarter restarter; NdbRestarter restarter;
...@@ -1632,7 +1633,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1632,7 +1633,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
nodeIdList[nodeIdCnt++] = nodeId; nodeIdList[nodeIdCnt++] = nodeId;
} }
if (numnodes >= 4) { if (numnodes >= 4 && myRandom48(2) == 0) {
int rand = myRandom48(numnodes); int rand = myRandom48(numnodes);
int nodeId = restarter.getRandomNodeOtherNodeGroup(nodeIdList[0], rand); int nodeId = restarter.getRandomNodeOtherNodeGroup(nodeIdList[0], rand);
CHECK(nodeId != -1); CHECK(nodeId != -1);
...@@ -1642,6 +1643,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1642,6 +1643,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
g_info << "1: master=" << masterNodeId << " nodes=" << nodeIdList[0] << "," << nodeIdList[1] << endl; g_info << "1: master=" << masterNodeId << " nodes=" << nodeIdList[0] << "," << nodeIdList[1] << endl;
const uint timeout = 60; //secs for node wait
const unsigned maxsleep = 2000; //ms const unsigned maxsleep = 2000; //ms
bool NF_ops = ctx->getProperty("Restart_NF_ops"); bool NF_ops = ctx->getProperty("Restart_NF_ops");
...@@ -1655,9 +1657,8 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1655,9 +1657,8 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
NdbSleep_MilliSleep(myRandom48(maxsleep)); NdbSleep_MilliSleep(myRandom48(maxsleep));
{ {
int i = 0; for (int i = 0; i < nodeIdCnt; i++) {
while (i < nodeIdCnt) { int nodeId = nodeIdList[i];
int nodeId = nodeIdList[i++];
bool nostart = true; bool nostart = true;
bool abort = NF_type == 0 ? myRandom48(2) : (NF_type == 2); bool abort = NF_type == 0 ? myRandom48(2) : (NF_type == 2);
...@@ -1676,9 +1677,31 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1676,9 +1677,31 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
} }
g_info << "1: wait for nostart" << endl; g_info << "1: wait for nostart" << endl;
CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt) == 0); CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt, timeout) == 0);
NdbSleep_MilliSleep(myRandom48(maxsleep)); NdbSleep_MilliSleep(myRandom48(maxsleep));
int err_master = 0;
int err_node[2] = { 0, 0 };
if (NR_error) {
err_master = errlst_master[l % errcnt_master];
// limitation: cannot have 2 node restarts and crash_insert
// one node may die for real (NF during startup)
for (int i = 0; i < nodeIdCnt && nodeIdCnt == 1; i++) {
err_node[i] = errlst_node[l % errcnt_node];
// 7176 - no DICT lock protection
if (err_node[i] == 7176) {
g_info << "1: no dict ops due to error insert "
<< err_node[i] << endl;
NR_ops = false;
}
}
}
g_info << "1: " << (NR_ops ? "run" : "pause") << " dict ops" << endl; g_info << "1: " << (NR_ops ? "run" : "pause") << " dict ops" << endl;
if (! send_dict_ops_cmd(ctx, NR_ops ? 1 : 2)) if (! send_dict_ops_cmd(ctx, NR_ops ? 1 : 2))
break; break;
...@@ -1689,23 +1712,17 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1689,23 +1712,17 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
if (NR_error) { if (NR_error) {
{ {
int rand = myRandom48(err_master_cnt); int err = err_master;
int err = err_master[rand];
if (err != 0) { if (err != 0) {
g_info << "1: insert master error " << err << endl; g_info << "1: insert master error " << err << endl;
CHECK(restarter.insertErrorInNode(masterNodeId, err) == 0); CHECK(restarter.insertErrorInNode(masterNodeId, err) == 0);
} }
} }
// limitation: cannot have 2 node restarts and crash_insert for (int i = 0; i < nodeIdCnt; i++) {
// one node may die for real (NF during startup) int nodeId = nodeIdList[i];
int i = 0; int err = err_node[i];
while (i < nodeIdCnt && nodeIdCnt == 1) {
int nodeId = nodeIdList[i++];
int rand = myRandom48(err_node_cnt);
int err = err_node[rand];
if (err != 0) { if (err != 0) {
g_info << "1: insert node " << nodeId << " error " << err << endl; g_info << "1: insert node " << nodeId << " error " << err << endl;
CHECK(restarter.insertErrorInNode(nodeId, err) == 0); CHECK(restarter.insertErrorInNode(nodeId, err) == 0);
...@@ -1715,7 +1732,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1715,7 +1732,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
NdbSleep_MilliSleep(myRandom48(maxsleep)); NdbSleep_MilliSleep(myRandom48(maxsleep));
g_info << "1: wait cluster started" << endl; g_info << "1: wait cluster started" << endl;
CHECK(restarter.waitClusterStarted() == 0); CHECK(restarter.waitClusterStarted(timeout) == 0);
NdbSleep_MilliSleep(myRandom48(maxsleep)); NdbSleep_MilliSleep(myRandom48(maxsleep));
g_info << "1: restart done" << endl; g_info << "1: restart done" << endl;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment