ndb - bug#34216

  During TC-take-over (NF) the new-TC builds up a new transaction state
  And commits operation according to this state.
  However, in the new state that is build, the operations does not have to be in same order, as "real" state
  In the multi-update-case, this means that operations can be commit in "incorrect" order

  i.e update A, delete A, insert A is normally commited in same order as prepared
      but can be committed in any order

  This patch changes TUP handling of these out-order commits, and previous implementation
    could confuse the TUX triggers
parent 83b4c04a
...@@ -1331,6 +1331,11 @@ typedef Ptr<HostBuffer> HostBufferPtr; ...@@ -1331,6 +1331,11 @@ typedef Ptr<HostBuffer> HostBufferPtr;
struct Tuple_header struct Tuple_header
{ {
union { union {
/**
* List of prepared operations for this tuple.
* Points to most recent/last operation, ie. to walk the list must follow
* regOperPtr->prevActiveOp links.
*/
Uint32 m_operation_ptr_i; // OperationPtrI Uint32 m_operation_ptr_i; // OperationPtrI
Uint32 m_base_record_ref; // For disk tuple, ref to MM tuple Uint32 m_base_record_ref; // For disk tuple, ref to MM tuple
}; };
...@@ -2882,7 +2887,7 @@ private: ...@@ -2882,7 +2887,7 @@ private:
void verify_page_lists(Disk_alloc_info&) {} void verify_page_lists(Disk_alloc_info&) {}
#endif #endif
void fix_commit_order(OperationrecPtr); void findFirstOp(OperationrecPtr&);
void commit_operation(Signal*, Uint32, Tuple_header*, PagePtr, void commit_operation(Signal*, Uint32, Tuple_header*, PagePtr,
Operationrec*, Fragrecord*, Tablerec*); Operationrec*, Fragrecord*, Tablerec*);
......
...@@ -385,3 +385,38 @@ void Dbtup::send_TUPKEYREF(Signal* signal, ...@@ -385,3 +385,38 @@ void Dbtup::send_TUPKEYREF(Signal* signal,
TupKeyRef::SignalLength, JBB); TupKeyRef::SignalLength, JBB);
} }
/**
* Unlink one operation from the m_operation_ptr_i list in the tuple.
*/
void Dbtup::removeActiveOpList(Operationrec* const regOperPtr,
Tuple_header *tuple_ptr)
{
OperationrecPtr raoOperPtr;
if(!regOperPtr->m_copy_tuple_location.isNull())
{
jam();
c_undo_buffer.free_copy_tuple(&regOperPtr->m_copy_tuple_location);
}
if (regOperPtr->op_struct.in_active_list) {
regOperPtr->op_struct.in_active_list= false;
if (regOperPtr->nextActiveOp != RNIL) {
jam();
raoOperPtr.i= regOperPtr->nextActiveOp;
c_operation_pool.getPtr(raoOperPtr);
raoOperPtr.p->prevActiveOp= regOperPtr->prevActiveOp;
} else {
jam();
tuple_ptr->m_operation_ptr_i = regOperPtr->prevActiveOp;
}
if (regOperPtr->prevActiveOp != RNIL) {
jam();
raoOperPtr.i= regOperPtr->prevActiveOp;
c_operation_pool.getPtr(raoOperPtr);
raoOperPtr.p->nextActiveOp= regOperPtr->nextActiveOp;
}
regOperPtr->prevActiveOp= RNIL;
regOperPtr->nextActiveOp= RNIL;
}
}
...@@ -97,42 +97,6 @@ void Dbtup::execTUP_WRITELOG_REQ(Signal* signal) ...@@ -97,42 +97,6 @@ void Dbtup::execTUP_WRITELOG_REQ(Signal* signal)
} while (true); } while (true);
} }
void Dbtup::removeActiveOpList(Operationrec* const regOperPtr,
Tuple_header *tuple_ptr)
{
OperationrecPtr raoOperPtr;
/**
* Release copy tuple
*/
if(!regOperPtr->m_copy_tuple_location.isNull())
{
jam();
c_undo_buffer.free_copy_tuple(&regOperPtr->m_copy_tuple_location);
}
if (regOperPtr->op_struct.in_active_list) {
regOperPtr->op_struct.in_active_list= false;
if (regOperPtr->nextActiveOp != RNIL) {
jam();
raoOperPtr.i= regOperPtr->nextActiveOp;
c_operation_pool.getPtr(raoOperPtr);
raoOperPtr.p->prevActiveOp= regOperPtr->prevActiveOp;
} else {
jam();
tuple_ptr->m_operation_ptr_i = regOperPtr->prevActiveOp;
}
if (regOperPtr->prevActiveOp != RNIL) {
jam();
raoOperPtr.i= regOperPtr->prevActiveOp;
c_operation_pool.getPtr(raoOperPtr);
raoOperPtr.p->nextActiveOp= regOperPtr->nextActiveOp;
}
regOperPtr->prevActiveOp= RNIL;
regOperPtr->nextActiveOp= RNIL;
}
}
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* INITIALIZATION OF ONE CONNECTION RECORD TO PREPARE FOR NEXT OP. */ /* INITIALIZATION OF ONE CONNECTION RECORD TO PREPARE FOR NEXT OP. */
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
...@@ -145,6 +109,7 @@ void Dbtup::initOpConnection(Operationrec* regOperPtr) ...@@ -145,6 +109,7 @@ void Dbtup::initOpConnection(Operationrec* regOperPtr)
regOperPtr->op_struct.m_disk_preallocated= 0; regOperPtr->op_struct.m_disk_preallocated= 0;
regOperPtr->op_struct.m_load_diskpage_on_commit= 0; regOperPtr->op_struct.m_load_diskpage_on_commit= 0;
regOperPtr->op_struct.m_wait_log_buffer= 0; regOperPtr->op_struct.m_wait_log_buffer= 0;
regOperPtr->op_struct.in_active_list = false;
regOperPtr->m_undo_buffer_space= 0; regOperPtr->m_undo_buffer_space= 0;
} }
...@@ -426,39 +391,21 @@ Dbtup::disk_page_log_buffer_callback(Signal* signal, ...@@ -426,39 +391,21 @@ Dbtup::disk_page_log_buffer_callback(Signal* signal,
c_lqh->tupcommit_conf_callback(signal, regOperPtr.p->userpointer); c_lqh->tupcommit_conf_callback(signal, regOperPtr.p->userpointer);
} }
/**
* Move to the first operation performed on this tuple
*/
void void
Dbtup::fix_commit_order(OperationrecPtr opPtr) Dbtup::findFirstOp(OperationrecPtr & firstPtr)
{ {
jam(); jam();
ndbassert(!opPtr.p->is_first_operation()); printf("Detect out-of-order commit(%u) -> ", firstPtr.i);
OperationrecPtr firstPtr = opPtr; ndbassert(!firstPtr.p->is_first_operation());
while(firstPtr.p->prevActiveOp != RNIL) while(firstPtr.p->prevActiveOp != RNIL)
{ {
firstPtr.i = firstPtr.p->prevActiveOp; firstPtr.i = firstPtr.p->prevActiveOp;
c_operation_pool.getPtr(firstPtr); c_operation_pool.getPtr(firstPtr);
} }
ndbout_c("%u", firstPtr.i);
ndbout_c("fix_commit_order (swapping %d and %d)",
opPtr.i, firstPtr.i);
/**
* Swap data between first and curr
*/
Uint32 prev= opPtr.p->prevActiveOp;
Uint32 next= opPtr.p->nextActiveOp;
Uint32 seco= firstPtr.p->nextActiveOp;
Operationrec tmp = *opPtr.p;
* opPtr.p = * firstPtr.p;
* firstPtr.p = tmp;
c_operation_pool.getPtr(seco)->prevActiveOp = opPtr.i;
c_operation_pool.getPtr(prev)->nextActiveOp = firstPtr.i;
if(next != RNIL)
{
jam();
c_operation_pool.getPtr(next)->prevActiveOp = firstPtr.i;
}
} }
/* ----------------------------------------------------------------- */ /* ----------------------------------------------------------------- */
...@@ -471,22 +418,17 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal) ...@@ -471,22 +418,17 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal)
TablerecPtr regTabPtr; TablerecPtr regTabPtr;
KeyReqStruct req_struct; KeyReqStruct req_struct;
TransState trans_state; TransState trans_state;
Uint32 no_of_fragrec, no_of_tablerec, hash_value, gci; Uint32 no_of_fragrec, no_of_tablerec;
TupCommitReq * const tupCommitReq= (TupCommitReq *)signal->getDataPtr(); TupCommitReq * const tupCommitReq= (TupCommitReq *)signal->getDataPtr();
regOperPtr.i= tupCommitReq->opPtr; regOperPtr.i= tupCommitReq->opPtr;
Uint32 hash_value= tupCommitReq->hashValue;
Uint32 gci = tupCommitReq->gci;
jamEntry(); jamEntry();
c_operation_pool.getPtr(regOperPtr); c_operation_pool.getPtr(regOperPtr);
if(!regOperPtr.p->is_first_operation())
{
/**
* Out of order commit XXX check effect on triggers
*/
fix_commit_order(regOperPtr);
}
//ndbassert(regOperPtr.p->is_first_operation());
regFragPtr.i= regOperPtr.p->fragmentPtr; regFragPtr.i= regOperPtr.p->fragmentPtr;
trans_state= get_trans_state(regOperPtr.p); trans_state= get_trans_state(regOperPtr.p);
...@@ -509,8 +451,10 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal) ...@@ -509,8 +451,10 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal)
#ifdef VM_TRACE #ifdef VM_TRACE
if (tupCommitReq->diskpage == RNIL) if (tupCommitReq->diskpage == RNIL)
{ {
m_pgman.m_ptr.setNull(); m_pgman.m_ptr.i = RNIL;
req_struct.m_disk_page_ptr.setNull(); m_pgman.m_ptr.p = 0;
req_struct.m_disk_page_ptr.i = RNIL;
req_struct.m_disk_page_ptr.p = 0;
} }
#endif #endif
...@@ -519,14 +463,56 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal) ...@@ -519,14 +463,56 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal)
PagePtr page; PagePtr page;
Tuple_header* tuple_ptr= (Tuple_header*) Tuple_header* tuple_ptr= (Tuple_header*)
get_ptr(&page, &regOperPtr.p->m_tuple_location, regTabPtr.p); get_ptr(&page, &regOperPtr.p->m_tuple_location, regTabPtr.p);
/**
* NOTE: This has to be run before potential time-slice when
* waiting for disk, as otherwise the "other-ops" in a multi-op
* commit might run while we're waiting for disk
*
*/
if (!regTabPtr.p->tuxCustomTriggers.isEmpty())
{
if(get_tuple_state(regOperPtr.p) == TUPLE_PREPARED)
{
jam();
OperationrecPtr loopPtr = regOperPtr;
if (unlikely(!regOperPtr.p->is_first_operation()))
{
findFirstOp(loopPtr);
}
/**
* Execute all tux triggers at first commit
* since previous tuple is otherwise removed...
*/
jam();
goto first;
while(loopPtr.i != RNIL)
{
c_operation_pool.getPtr(loopPtr);
first:
executeTuxCommitTriggers(signal,
loopPtr.p,
regFragPtr.p,
regTabPtr.p);
set_tuple_state(loopPtr.p, TUPLE_TO_BE_COMMITTED);
loopPtr.i = loopPtr.p->nextActiveOp;
}
}
}
bool get_page = false; bool get_page = false;
if(regOperPtr.p->op_struct.m_load_diskpage_on_commit) if(regOperPtr.p->op_struct.m_load_diskpage_on_commit)
{ {
jam(); jam();
Page_cache_client::Request req; Page_cache_client::Request req;
ndbassert(regOperPtr.p->is_first_operation() &&
regOperPtr.p->is_last_operation()); /**
* Only last op on tuple needs "real" commit,
* hence only this one should have m_load_diskpage_on_commit
*/
ndbassert(tuple_ptr->m_operation_ptr_i == regOperPtr.i);
/** /**
* Check for page * Check for page
...@@ -611,8 +597,11 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal) ...@@ -611,8 +597,11 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal)
if(regOperPtr.p->op_struct.m_wait_log_buffer) if(regOperPtr.p->op_struct.m_wait_log_buffer)
{ {
jam(); jam();
ndbassert(regOperPtr.p->is_first_operation() && /**
regOperPtr.p->is_last_operation()); * Only last op on tuple needs "real" commit,
* hence only this one should have m_wait_log_buffer
*/
ndbassert(tuple_ptr->m_operation_ptr_i == regOperPtr.i);
Callback cb; Callback cb;
cb.m_callbackData= regOperPtr.i; cb.m_callbackData= regOperPtr.i;
...@@ -636,42 +625,23 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal) ...@@ -636,42 +625,23 @@ void Dbtup::execTUP_COMMITREQ(Signal* signal)
} }
} }
if(!tuple_ptr) assert(tuple_ptr);
{
jam();
tuple_ptr = (Tuple_header*)
get_ptr(&page, &regOperPtr.p->m_tuple_location,regTabPtr.p);
}
skip_disk: skip_disk:
req_struct.m_tuple_ptr = tuple_ptr; req_struct.m_tuple_ptr = tuple_ptr;
if(get_tuple_state(regOperPtr.p) == TUPLE_PREPARED) Uint32 nextOp = regOperPtr.p->nextActiveOp;
{ Uint32 prevOp = regOperPtr.p->prevActiveOp;
jam(); /**
/** * The trigger code (which is shared between detached/imediate)
* Execute all tux triggers at first commit * check op-list to check were to read before values from
* since previous tuple is otherwise removed... * detached triggers should always read from original tuple value
* btw...is this a "good" solution?? * from before transaction start, not from any intermediate update
* *
* why can't we instead remove "own version" (when approriate ofcourse) * Setting the op-list has this effect
*/ */
if (!regTabPtr.p->tuxCustomTriggers.isEmpty()) { regOperPtr.p->nextActiveOp = RNIL;
jam(); regOperPtr.p->prevActiveOp = RNIL;
OperationrecPtr loopPtr= regOperPtr; if(tuple_ptr->m_operation_ptr_i == regOperPtr.i)
while(loopPtr.i != RNIL)
{
c_operation_pool.getPtr(loopPtr);
executeTuxCommitTriggers(signal,
loopPtr.p,
regFragPtr.p,
regTabPtr.p);
set_tuple_state(loopPtr.p, TUPLE_TO_BE_COMMITTED);
loopPtr.i = loopPtr.p->nextActiveOp;
}
}
}
if(regOperPtr.p->is_last_operation())
{ {
jam(); jam();
/** /**
...@@ -682,27 +652,38 @@ skip_disk: ...@@ -682,27 +652,38 @@ skip_disk:
checkDetachedTriggers(&req_struct, regOperPtr.p, regTabPtr.p, checkDetachedTriggers(&req_struct, regOperPtr.p, regTabPtr.p,
disk != RNIL); disk != RNIL);
tuple_ptr->m_operation_ptr_i = RNIL;
if(regOperPtr.p->op_struct.op_type != ZDELETE) if(regOperPtr.p->op_struct.op_type != ZDELETE)
{ {
jam(); jam();
commit_operation(signal, gci, tuple_ptr, page, commit_operation(signal, gci, tuple_ptr, page,
regOperPtr.p, regFragPtr.p, regTabPtr.p); regOperPtr.p, regFragPtr.p, regTabPtr.p);
removeActiveOpList(regOperPtr.p, tuple_ptr);
} }
else else
{ {
jam(); jam();
removeActiveOpList(regOperPtr.p, tuple_ptr);
if (get_page) if (get_page)
ndbassert(tuple_ptr->m_header_bits & Tuple_header::DISK_PART); ndbassert(tuple_ptr->m_header_bits & Tuple_header::DISK_PART);
dealloc_tuple(signal, gci, page.p, tuple_ptr, dealloc_tuple(signal, gci, page.p, tuple_ptr,
regOperPtr.p, regFragPtr.p, regTabPtr.p); regOperPtr.p, regFragPtr.p, regTabPtr.p);
} }
} }
else
if (nextOp != RNIL)
{
c_operation_pool.getPtr(nextOp)->prevActiveOp = prevOp;
}
if (prevOp != RNIL)
{
c_operation_pool.getPtr(prevOp)->nextActiveOp = nextOp;
}
if(!regOperPtr.p->m_copy_tuple_location.isNull())
{ {
jam(); jam();
removeActiveOpList(regOperPtr.p, tuple_ptr); c_undo_buffer.free_copy_tuple(&regOperPtr.p->m_copy_tuple_location);
} }
initOpConnection(regOperPtr.p); initOpConnection(regOperPtr.p);
......
...@@ -229,7 +229,7 @@ Dbtup::calculateChecksum(Tuple_header* tuple_ptr, ...@@ -229,7 +229,7 @@ Dbtup::calculateChecksum(Tuple_header* tuple_ptr,
// includes tupVersion // includes tupVersion
//printf("%p - ", tuple_ptr); //printf("%p - ", tuple_ptr);
for (i= 0; i < rec_size-2; i++) { for (i= 0; i < rec_size-Tuple_header::HeaderSize; i++) {
checksum ^= tuple_header[i]; checksum ^= tuple_header[i];
//printf("%.8x ", tuple_header[i]); //printf("%.8x ", tuple_header[i]);
} }
......
...@@ -1052,7 +1052,7 @@ args: -n Bug33793 T1 ...@@ -1052,7 +1052,7 @@ args: -n Bug33793 T1
max-time: 600 max-time: 600
cmd: testNodeRestart cmd: testNodeRestart
args: --nologging -n Bug34216 -l 10 T1 I3 D2 args: -n Bug34216 -l 10 T1 I3 D2
max-time: 1200 max-time: 1200
cmd: testNodeRestart cmd: testNodeRestart
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment