Commit be0d6c94 authored by unknown's avatar unknown

ndb - wl1760/bug#18216

  add two new start options that will decrease likelyhood of bug#18612
  push cntr-sp2 logic down into qmgr-sp1 to decrease likelyhood of bug#18612


ndb/include/kernel/signaldata/CmRegSignalData.hpp:
  Expand CmRegReq with lots of stuff
ndb/include/mgmapi/ndb_logevent.h:
  Add Start report during sp1
ndb/src/common/debugger/EventLogger.cpp:
  Add Start report during sp1
ndb/src/kernel/blocks/dbdih/DbdihInit.cpp:
  Init cntrref
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Fix small bugs related to partial initial start
ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  Remove timeouts as they are handled in QMGR nowadays
ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  Push cntr sp2 logic down to QMGR to prevent the forming of multiple QMGR cluster
    that will lead to start problems (crashes...)
ndb/src/kernel/blocks/qmgr/QmgrInit.cpp:
  Push cntr sp2 logic down to QMGR to prevent the forming of multiple QMGR cluster
    that will lead to start problems (crashes...)
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  Push cntr sp2 logic down to QMGR to prevent the forming of multiple QMGR cluster
    that will lead to start problems (crashes...)
ndb/src/kernel/vm/Configuration.cpp:
  Add new flags
    --initial-start
    --nowait-nodes
parent e40bdb5b
......@@ -30,12 +30,17 @@ class CmRegReq {
friend class Qmgr;
public:
STATIC_CONST( SignalLength = 3 );
STATIC_CONST( SignalLength = 5 + NdbNodeBitmask::Size );
private:
Uint32 blockRef;
Uint32 nodeId;
Uint32 version; // See ndb_version.h
Uint32 version; // See ndb_version.h
Uint32 start_type; // As specified by cmd-line or mgm, NodeState::StartType
Uint32 latest_gci; // 0 means no fs
Uint32 skip_nodes[NdbNodeBitmask::Size]; // Nodes that does not _need_
// to be part of restart
};
/**
......@@ -59,8 +64,7 @@ private:
* The dynamic id that the node reciving this signal has
*/
Uint32 dynamicId;
Uint32 allNdbNodes[NdbNodeBitmask::Size];
Uint32 allNdbNodes[NdbNodeBitmask::Size];
};
/**
......@@ -73,7 +77,7 @@ class CmRegRef {
friend class Qmgr;
public:
STATIC_CONST( SignalLength = 4 );
STATIC_CONST( SignalLength = 7 + NdbNodeBitmask::Size );
enum ErrorCode {
ZBUSY = 0, /* Only the president can send this */
......@@ -85,14 +89,27 @@ public:
* as president. */
ZNOT_PRESIDENT = 5, /* We are not president */
ZNOT_DEAD = 6, /* We are not dead when we are starting */
ZINCOMPATIBLE_VERSION = 7
ZINCOMPATIBLE_VERSION = 7,
ZINCOMPATIBLE_START_TYPE = 8
};
private:
Uint32 blockRef;
Uint32 nodeId;
Uint32 errorCode;
/**
* Applicable if ZELECTION
*/
Uint32 presidentCandidate;
Uint32 candidate_latest_gci; // 0 means non
/**
* Data for sending node sending node
*/
Uint32 latest_gci;
Uint32 start_type;
Uint32 skip_nodes[NdbNodeBitmask::Size]; // Nodes that does not _need_
// to be part of restart
};
class CmAdd {
......
......@@ -166,10 +166,14 @@ extern "C" {
/** NDB_MGM_EVENT_CATEGORY_BACKUP */
NDB_LE_BackupCompleted = 56,
/** NDB_MGM_EVENT_CATEGORY_BACKUP */
NDB_LE_BackupAborted = 57
NDB_LE_BackupAborted = 57,
/* 58 used in 5.1 */
/* 59 used */
/** NDB_MGM_EVENT_CATEGORY_STARTUP */
NDB_LE_StartReport = 60
/* 60 unused */
/* 61 unused */
/* 62 unused */
......@@ -625,6 +629,13 @@ extern "C" {
unsigned type;
unsigned node_id;
} SingleUser;
/** Log even data @ref NDB_LE_StartReport */
struct {
unsigned report_type;
unsigned remaining_time;
unsigned bitmask_size;
unsigned bitmask_data[1];
} StartReport;
#ifndef DOXYGEN_FIX
};
#else
......
......@@ -707,6 +707,90 @@ void getTextSingleUser(QQQQ) {
}
}
void getTextStartReport(QQQQ) {
Uint32 time = theData[2];
Uint32 sz = theData[3];
char mask1[100];
char mask2[100];
char mask3[100];
char mask4[100];
BitmaskImpl::getText(sz, theData + 4 + (0 * sz), mask1);
BitmaskImpl::getText(sz, theData + 4 + (1 * sz), mask2);
BitmaskImpl::getText(sz, theData + 4 + (2 * sz), mask3);
BitmaskImpl::getText(sz, theData + 4 + (3 * sz), mask4);
switch(theData[1]){
case 1: // Wait initial
BaseString::snprintf
(m_text, m_text_len,
"Initial start, waiting for %s to connect, "
" nodes [ all: %s connected: %s no-wait: %s ]",
mask4, mask1, mask2, mask3);
break;
case 2: // Wait partial
BaseString::snprintf
(m_text, m_text_len,
"Waiting until nodes: %s connects, "
"nodes [ all: %s connected: %s no-wait: %s ]",
mask4, mask1, mask2, mask3);
break;
case 3: // Wait partial timeout
BaseString::snprintf
(m_text, m_text_len,
"Waiting %u sec for nodes %s to connect, "
"nodes [ all: %s connected: %s no-wait: %s ]",
time, mask4, mask1, mask2, mask3);
break;
case 4: // Wait partioned
BaseString::snprintf
(m_text, m_text_len,
"Waiting for non partitioned start, "
"nodes [ all: %s connected: %s missing: %s no-wait: %s ]",
mask1, mask2, mask4, mask3);
break;
case 5:
BaseString::snprintf
(m_text, m_text_len,
"Waiting %u sec for non partitioned start, "
"nodes [ all: %s connected: %s missing: %s no-wait: %s ]",
time, mask1, mask2, mask4, mask3);
break;
case 0x8000: // Do initial
BaseString::snprintf
(m_text, m_text_len,
"Initial start with nodes %s [ missing: %s no-wait: %s ]",
mask2, mask4, mask3);
break;
case 0x8001: // Do start
BaseString::snprintf
(m_text, m_text_len,
"Start with all nodes %s",
mask2);
break;
case 0x8002: // Do partial
BaseString::snprintf
(m_text, m_text_len,
"Start with nodes %s [ missing: %s no-wait: %s ]",
mask2, mask4, mask3);
break;
case 0x8003: // Do partioned
BaseString::snprintf
(m_text, m_text_len,
"Start potentially partitioned with nodes %s "
" [ missing: %s no-wait: %s ]",
mask2, mask4, mask3);
break;
default:
BaseString::snprintf
(m_text, m_text_len,
"Unknown startreport: 0x%x [ %s %s %s %s ]",
theData[1],
mask1, mask2, mask3, mask4);
}
}
#if 0
BaseString::snprintf(m_text,
m_text_len,
......@@ -755,6 +839,7 @@ const EventLoggerBase::EventRepLogLevelMatrix EventLoggerBase::matrix[] = {
ROW(StartREDOLog, LogLevel::llStartUp, 10, Logger::LL_INFO ),
ROW(StartLog, LogLevel::llStartUp, 10, Logger::LL_INFO ),
ROW(UNDORecordsExecuted, LogLevel::llStartUp, 15, Logger::LL_INFO ),
ROW(StartReport, LogLevel::llStartUp, 4, Logger::LL_INFO ),
// NODERESTART
ROW(NR_CopyDict, LogLevel::llNodeRestart, 8, Logger::LL_INFO ),
......
......@@ -71,6 +71,7 @@ void Dbdih::initData()
cwaitLcpSr = false;
c_blockCommit = false;
c_blockCommitNo = 1;
cntrlblockref = RNIL;
}//Dbdih::initData()
void Dbdih::initRecords()
......
......@@ -11659,7 +11659,7 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[])
Uint32 tmngNode;
Uint32 tmngNodeGroup;
Uint32 tmngLimit;
Uint32 i;
Uint32 i, j;
/**-----------------------------------------------------------------------
* ASSIGN ALL ACTIVE NODES INTO NODE GROUPS. HOT SPARE NODES ARE ASSIGNED
......@@ -11705,6 +11705,38 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[])
Sysfile::setNodeGroup(mngNodeptr.i, SYSFILE->nodeGroups, mngNodeptr.p->nodeGroup);
}//if
}//for
for (i = 0; i<cnoOfNodeGroups; i++)
{
jam();
bool alive = false;
NodeGroupRecordPtr NGPtr;
NGPtr.i = i;
ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
for (j = 0; j<NGPtr.p->nodeCount; j++)
{
jam();
mngNodeptr.i = NGPtr.p->nodesInGroup[j];
ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
if (checkNodeAlive(NGPtr.p->nodesInGroup[j]))
{
alive = true;
break;
}
}
if (!alive)
{
char buf[255];
BaseString::snprintf
(buf, sizeof(buf),
"Illegal initial start, no alive node in nodegroup %u", i);
progError(__LINE__,
NDBD_EXIT_SR_RESTARTCONFLICT,
buf);
}
}
}//Dbdih::makeNodeGroups()
/**
......@@ -12512,7 +12544,6 @@ void Dbdih::sendStartFragreq(Signal* signal,
void Dbdih::setInitialActiveStatus()
{
NodeRecordPtr siaNodeptr;
Uint32 tsiaNodeActiveStatus;
Uint32 tsiaNoActiveNodes;
tsiaNoActiveNodes = csystemnodes - cnoHotSpare;
......@@ -12520,39 +12551,34 @@ void Dbdih::setInitialActiveStatus()
SYSFILE->nodeStatus[i] = 0;
for (siaNodeptr.i = 1; siaNodeptr.i < MAX_NDB_NODES; siaNodeptr.i++) {
ptrAss(siaNodeptr, nodeRecord);
if (siaNodeptr.p->nodeStatus == NodeRecord::ALIVE) {
switch(siaNodeptr.p->nodeStatus){
case NodeRecord::ALIVE:
case NodeRecord::DEAD:
if (tsiaNoActiveNodes == 0) {
jam();
siaNodeptr.p->activeStatus = Sysfile::NS_HotSpare;
} else {
jam();
tsiaNoActiveNodes = tsiaNoActiveNodes - 1;
siaNodeptr.p->activeStatus = Sysfile::NS_Active;
}//if
} else {
jam();
siaNodeptr.p->activeStatus = Sysfile::NS_NotDefined;
}//if
switch (siaNodeptr.p->activeStatus) {
case Sysfile::NS_Active:
jam();
tsiaNodeActiveStatus = Sysfile::NS_Active;
break;
case Sysfile::NS_HotSpare:
jam();
tsiaNodeActiveStatus = Sysfile::NS_HotSpare;
break;
case Sysfile::NS_NotDefined:
jam();
tsiaNodeActiveStatus = Sysfile::NS_NotDefined;
if (siaNodeptr.p->nodeStatus == NodeRecord::ALIVE)
{
jam();
siaNodeptr.p->activeStatus = Sysfile::NS_Active;
}
else
{
siaNodeptr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
}
}
break;
default:
ndbrequire(false);
return;
jam();
siaNodeptr.p->activeStatus = Sysfile::NS_NotDefined;
break;
}//switch
Sysfile::setNodeStatus(siaNodeptr.i, SYSFILE->nodeStatus,
tsiaNodeActiveStatus);
}//if
Sysfile::setNodeStatus(siaNodeptr.i,
SYSFILE->nodeStatus,
siaNodeptr.p->activeStatus);
}//for
}//Dbdih::setInitialActiveStatus()
......
......@@ -820,17 +820,9 @@ Ndbcntr::trySystemRestart(Signal* signal){
return false;
}
if(!allNodes && c_start.m_startPartialTimeout > now){
jam();
return false;
}
NodeState::StartType srType = NodeState::ST_SYSTEM_RESTART;
if(c_start.m_waiting.equal(c_start.m_withoutLog)){
if(!allNodes){
jam();
return false;
}
if(c_start.m_waiting.equal(c_start.m_withoutLog))
{
jam();
srType = NodeState::ST_INITIAL_START;
c_start.m_starting = c_start.m_withoutLog; // Used for starting...
......@@ -860,10 +852,6 @@ Ndbcntr::trySystemRestart(Signal* signal){
ndbrequire(false); // All nodes -> partitioning, which is not allowed
}
if(c_start.m_startPartitionedTimeout > now){
jam();
return false;
}
break;
}
......
......@@ -50,6 +50,7 @@
#define ZAPI_HB_HANDLING 3
#define ZTIMER_HANDLING 4
#define ZARBIT_HANDLING 5
#define ZSTART_FAILURE_LIMIT 6
/* Error Codes ------------------------------*/
#define ZERRTOOMANY 1101
......@@ -113,8 +114,19 @@ public:
Uint32 m_gsn;
SignalCounter m_nodes;
} c_start;
Uint32 m_latest_gci;
Uint32 m_start_type;
NdbNodeBitmask m_skip_nodes;
NdbNodeBitmask m_starting_nodes;
NdbNodeBitmask m_starting_nodes_w_log;
Uint16 m_president_candidate;
Uint32 m_president_candidate_gci;
Uint16 m_regReqReqSent;
Uint16 m_regReqReqRecv;
} c_start;
NdbNodeBitmask c_definedNodes; // DB nodes in config
NdbNodeBitmask c_clusterNodes; // DB nodes in cluster
NodeBitmask c_connectedNodes; // All kinds of connected nodes
......@@ -125,7 +137,7 @@ public:
* i.e. nodes that connect to use, when we already have elected president
*/
NdbNodeBitmask c_readnodes_nodes;
Uint32 c_maxDynamicId;
// Records
......@@ -236,6 +248,9 @@ private:
void execREAD_NODESREF(Signal* signal);
void execREAD_NODESCONF(Signal* signal);
void execDIH_RESTARTREF(Signal* signal);
void execDIH_RESTARTCONF(Signal* signal);
void execAPI_VERSION_REQ(Signal* signal);
void execAPI_BROADCAST_REP(Signal* signal);
......@@ -252,6 +267,7 @@ private:
// Statement blocks
void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
Uint32 check_startup(Signal* signal);
void node_failed(Signal* signal, Uint16 aFailedNode);
void checkStartInterface(Signal* signal);
......@@ -374,12 +390,12 @@ private:
/* Status flags ----------------------------------*/
Uint32 c_restartPartialTimeout;
Uint32 c_restartPartionedTimeout;
Uint32 c_restartFailureTimeout;
Uint64 c_start_election_time;
Uint16 creadyDistCom;
Uint16 c_regReqReqSent;
Uint16 c_regReqReqRecv;
Uint64 c_stopElectionTime;
Uint16 cpresidentCandidate;
Uint16 cdelayRegreq;
Uint16 cpresidentAlive;
Uint16 cnoFailedNodes;
......
......@@ -98,6 +98,9 @@ Qmgr::Qmgr(const class Configuration & conf)
addRecSignal(GSN_READ_NODESREF, &Qmgr::execREAD_NODESREF);
addRecSignal(GSN_READ_NODESCONF, &Qmgr::execREAD_NODESCONF);
addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF);
addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF);
initData();
}//Qmgr::Qmgr()
......
This diff is collapsed.
......@@ -55,6 +55,12 @@ enum ndbd_options {
NDB_STD_OPTS_VARS;
// XXX should be my_bool ???
static int _daemon, _no_daemon, _foreground, _initial, _no_start;
static int _initialstart;
static const char* _nowait_nodes;
extern Uint32 g_start_type;
extern NdbNodeBitmask g_nowait_nodes;
/**
* Arguments to NDB process
*/
......@@ -82,6 +88,14 @@ static struct my_option my_long_options[] =
" (implies --nodaemon)",
(gptr*) &_foreground, (gptr*) &_foreground, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 },
{ "nowait-nodes", NO_ARG,
"Nodes that will not be waited for during start",
(gptr*) &_nowait_nodes, (gptr*) &_nowait_nodes, 0,
GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0 },
{ "initial-start", NO_ARG,
"Perform initial start",
(gptr*) &_initialstart, (gptr*) &_initialstart, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};
static void short_usage_sub(void)
......@@ -150,6 +164,37 @@ Configuration::init(int argc, char** argv)
globalData.ownId= 0;
if (_nowait_nodes)
{
BaseString str(_nowait_nodes);
Vector<BaseString> arr;
str.split(arr, ",");
for (Uint32 i = 0; i<arr.size(); i++)
{
char *endptr = 0;
long val = strtol(arr[i].c_str(), &endptr, 10);
if (*endptr)
{
ndbout_c("Unable to parse nowait-nodes argument: %s : %s",
arr[i].c_str(), _nowait_nodes);
exit(-1);
}
if (! (val > 0 && val < MAX_NDB_NODES))
{
ndbout_c("Invalid nodeid specified in nowait-nodes: %d : %s",
val, _nowait_nodes);
exit(-1);
}
g_nowait_nodes.set(val);
}
}
if (_initialstart)
{
_initialStart = true;
g_start_type |= (1 << NodeState::ST_INITIAL_START);
}
return true;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment