Commit 9a6ba1aa authored by John Esmet

refs #5770 Only check one basement node on pin, remove the assumption that adjacent available nodes are query-able.
parent 06d56d51
...@@ -193,6 +193,11 @@ toku_create_new_ftnode ( ...@@ -193,6 +193,11 @@ toku_create_new_ftnode (
NULL); NULL);
} }
//
// On success, this function assumes that the caller is trying to pin the node
// with a PL_READ lock. If message application is needed,
// then a PL_WRITE_CHEAP lock is grabbed
//
int int
toku_pin_ftnode_batched( toku_pin_ftnode_batched(
FT_HANDLE brt, FT_HANDLE brt,
...@@ -202,15 +207,22 @@ toku_pin_ftnode_batched( ...@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS bounds, const PIVOT_BOUNDS bounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p, FTNODE *node_p,
bool* msgs_applied) bool* msgs_applied)
{ {
void *node_v; void *node_v;
*msgs_applied = false; *msgs_applied = false;
pair_lock_type needed_lock_type = lock_type; FTNODE node = nullptr;
try_again_for_write_lock: MSN max_msn_in_path = ZERO_MSN;
bool needs_ancestors_messages = false;
// this function assumes that if you want ancestor messages applied,
// you are doing a read for a query. This is so we can make some optimizations
// below.
if (apply_ancestor_messages) {
paranoid_invariant(bfe->type == ftnode_fetch_subset);
}
int r = toku_cachetable_get_and_pin_nonblocking_batched( int r = toku_cachetable_get_and_pin_nonblocking_batched(
brt->ft->cf, brt->ft->cf,
blocknum, blocknum,
...@@ -221,25 +233,52 @@ try_again_for_write_lock: ...@@ -221,25 +233,52 @@ try_again_for_write_lock:
toku_ftnode_fetch_callback, toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback, toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback, toku_ftnode_pf_callback,
needed_lock_type, PL_READ,
bfe, //read_extraargs bfe, //read_extraargs
unlockers); unlockers);
if (r==0) { if (r != 0) {
FTNODE node = static_cast<FTNODE>(node_v); assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
MSN max_msn_in_path; goto exit;
bool needs_ancestors_messages = false;
if (apply_ancestor_messages && node->height == 0) {
needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
if (needs_ancestors_messages && needed_lock_type == PL_READ) {
toku_unpin_ftnode_read_only(brt->ft, node);
needed_lock_type = PL_WRITE_CHEAP;
goto try_again_for_write_lock;
}
} }
node = static_cast<FTNODE>(node_v);
if (apply_ancestor_messages && node->height == 0) { if (apply_ancestor_messages && node->height == 0) {
needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
brt->ft,
node,
ancestors,
bounds,
&max_msn_in_path,
bfe->child_to_read
);
if (needs_ancestors_messages) { if (needs_ancestors_messages) {
invariant(needed_lock_type != PL_READ); toku_unpin_ftnode_read_only(brt->ft, node);
toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied); int rr = toku_cachetable_get_and_pin_nonblocking_batched(
brt->ft->cf,
blocknum,
fullhash,
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
PL_WRITE_CHEAP,
bfe, //read_extraargs
unlockers);
if (rr != 0) {
assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
r = TOKUDB_TRY_AGAIN;
goto exit;
}
node = static_cast<FTNODE>(node_v);
toku_apply_ancestors_messages_to_node(
brt,
node,
ancestors,
bounds,
msgs_applied,
bfe->child_to_read
);
} else { } else {
// At this point, we aren't going to run // At this point, we aren't going to run
// toku_apply_ancestors_messages_to_node but that doesn't // toku_apply_ancestors_messages_to_node but that doesn't
...@@ -264,20 +303,12 @@ try_again_for_write_lock: ...@@ -264,20 +303,12 @@ try_again_for_write_lock:
// requires a write lock, and a write lock requires you to // requires a write lock, and a write lock requires you to
// resolve checkpointing. // resolve checkpointing.
if (!node->dirty) { if (!node->dirty) {
toku_ft_bn_update_max_msn(node, max_msn_in_path); toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
} }
} }
invariant(needed_lock_type != PL_READ || !*msgs_applied);
}
if ((lock_type != PL_READ) && node->height > 0) {
toku_move_ftnode_messages_to_stale(brt->ft, node);
} }
*node_p = node; *node_p = node;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b); exit:
} else {
assert(r==TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
// printf("%*sPin %ld try again\n", 8, "", blocknum.b);
}
return r; return r;
} }
......
...@@ -150,7 +150,6 @@ toku_pin_ftnode_batched( ...@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS pbounds, const PIVOT_BOUNDS pbounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p, FTNODE *node_p,
bool* msgs_applied bool* msgs_applied
......
...@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode); ...@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
#define VERIFY_NODE(t,n) ((void)0) #define VERIFY_NODE(t,n) ((void)0)
#endif #endif
//#define FT_TRACE
#ifdef FT_TRACE
#define WHEN_FTTRACE(x) x
#else
#define WHEN_FTTRACE(x) ((void)0)
#endif
void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe); void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint); void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time); void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
...@@ -982,11 +975,11 @@ struct pivot_bounds { ...@@ -982,11 +975,11 @@ struct pivot_bounds {
__attribute__((nonnull)) __attribute__((nonnull))
void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node); void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied); void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
__attribute__((nonnull)) __attribute__((nonnull))
bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path); bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
__attribute__((nonnull)) __attribute__((nonnull))
void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied); void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);
__attribute__((const,nonnull)) __attribute__((const,nonnull))
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd); size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);
......
...@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node( ...@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
} }
} }
// Bring a single basement node of a leaf up to date with respect to the
// messages held by the nodes on the `ancestors` list.
//
// Parameters:
//   t                     - handle forwarded to bnc_apply_messages_to_basement_node.
//   node                  - the leaf whose basement node `childnum` is updated
//                           (fetched with BLB(node, childnum)).
//   childnum              - index of the basement node within `node`.
//   ancestors             - linked list (via ->next) of ancestor entries; each
//                           entry names an ancestor node and the child slot used
//                           on the path.
//   bounds                - pivot bounds of `node`; narrowed for this basement
//                           node by next_pivot_keys (presumably to the key range
//                           covered by `childnum` -- confirm against that helper).
//   oldest_referenced_xid - forwarded unchanged to the message-application helper.
//   msgs_applied          - out-parameter forwarded unchanged to
//                           bnc_apply_messages_to_basement_node; this function
//                           never writes it directly (presumably the helper sets
//                           it when a message is actually applied -- confirm).
//
// Side effects visible here: may raise curr_bn->max_msn_applied, and always
// sets curr_bn->stale_ancestor_messages_applied to true before returning.
static void
apply_ancestors_messages_to_bn(
FT_HANDLE t,
FTNODE node,
int childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
TXNID oldest_referenced_xid,
bool* msgs_applied
)
{
BASEMENTNODE curr_bn = BLB(node, childnum);
struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds);
// Walk every ancestor on the list, starting from the head passed in.
for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
// Only ancestors whose on-disk max MSN is newer than what this basement
// node has already recorded are considered.
if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
// The ancestor's partition on the path must be in memory (PT_AVAIL).
paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
bnc_apply_messages_to_basement_node(
t,
curr_bn,
curr_ancestors->node,
curr_ancestors->childnum,
&curr_bounds,
oldest_referenced_xid,
msgs_applied
);
// We don't want to check this ancestor node again if the
// next time we query it, the msn hasn't changed.
curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
}
}
// At this point, we know all the stale messages above this
// basement node have been applied, and any new messages will be
// fresh, so we don't need to look at stale messages for this
// basement node, unless it gets evicted (and this field becomes
// false when it's read in again).
curr_bn->stale_ancestor_messages_applied = true;
}
void void
toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied) toku_apply_ancestors_messages_to_node (
FT_HANDLE t,
FTNODE node,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
bool* msgs_applied,
int child_to_read
)
// Effect: // Effect:
// Bring a leaf node up-to-date according to all the messages in the ancestors. // Bring a leaf node up-to-date according to all the messages in the ancestors.
// If the leaf node is already up-to-date then do nothing. // If the leaf node is already up-to-date then do nothing.
...@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances ...@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
// The entire root-to-leaf path is pinned and appears in the ancestors list. // The entire root-to-leaf path is pinned and appears in the ancestors list.
{ {
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
invariant(node->height == 0); paranoid_invariant(node->height == 0);
TXNID oldest_referenced_xid = ancestors->node->oldest_referenced_xid_known; TXNID oldest_referenced_xid = ancestors->node->oldest_referenced_xid_known;
for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) { for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
...@@ -4530,69 +4575,53 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances ...@@ -4530,69 +4575,53 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
} }
} }
if (!node->dirty && child_to_read >= 0) {
paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
apply_ancestors_messages_to_bn(
t,
node,
child_to_read,
ancestors,
bounds,
oldest_referenced_xid,
msgs_applied
);
}
else {
// know we are a leaf node // know we are a leaf node
// An important invariant: // An important invariant:
// We MUST bring every available basement node up to date. // We MUST bring every available basement node for a dirty node up to date.
// flushing on the cleaner thread depends on this. This invariant // flushing on the cleaner thread depends on this. This invariant
// allows the cleaner thread to just pick an internal node and flush it // allows the cleaner thread to just pick an internal node and flush it
// as opposed to being forced to start from the root. // as opposed to being forced to start from the root.
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
if (BP_STATE(node, i) != PT_AVAIL) { continue; } if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE curr_bn = BLB(node, i); apply_ancestors_messages_to_bn(
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
bnc_apply_messages_to_basement_node(
t, t,
curr_bn, node,
curr_ancestors->node, i,
curr_ancestors->childnum, ancestors,
&curr_bounds, bounds,
oldest_referenced_xid, oldest_referenced_xid,
msgs_applied msgs_applied
); );
// We don't want to check this ancestor node again if the
// next time we query it, the msn hasn't changed.
curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
} }
} }
// At this point, we know all the stale messages above this
// basement node have been applied, and any new messages will be
// fresh, so we don't need to look at stale messages for this
// basement node, unless it gets evicted (and this field becomes
// false when it's read in again).
curr_bn->stale_ancestor_messages_applied = true;
}
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
} }
bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path) static bool bn_needs_ancestors_messages(
// Effect: Determine whether there are messages in a node's ancestors FT ft,
// which must be applied to it. These messages are in the correct FTNODE node,
// keyrange for any available basement nodes, and are in nodes with the int childnum,
// correct max_msn_applied_to_node_on_disk. struct pivot_bounds const * const bounds,
// Notes: ANCESTORS ancestors,
// This is an approximate query. MSN* max_msn_applied
// Output: )
// max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
// ancestors. This is used later to update basement nodes'
// max_msn_applied values in case we don't do the full algorithm.
// Returns:
// true if there may be some such messages
// false only if there are definitely no such messages
// Rationale:
// When we pin a node with a read lock, we want to quickly determine if
// we should exchange it for a write lock in preparation for applying
// messages. If there are no messages, we don't need the write lock.
{ {
invariant(node->height == 0); BASEMENTNODE bn = BLB(node, childnum);
MSN max_msn_applied = ZERO_MSN; struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds);
bool needs_ancestors_messages = false; bool needs_ancestors_messages = false;
for (int i = 0; i < node->n_children; ++i) {
if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE bn = BLB(node, i);
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) { for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn) { if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn) {
paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL); paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
...@@ -4627,19 +4656,86 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto ...@@ -4627,19 +4656,86 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
needs_ancestors_messages = true; needs_ancestors_messages = true;
goto cleanup; goto cleanup;
} }
if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied.msn) { if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied->msn) {
max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk; max_msn_applied->msn = curr_ancestors->node->max_msn_applied_to_node_on_disk.msn;
}
}
}
cleanup:
return needs_ancestors_messages;
}
bool toku_ft_leaf_needs_ancestors_messages(
FT ft,
FTNODE node,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
MSN *const max_msn_in_path,
int child_to_read
)
// Effect: Determine whether there are messages in a node's ancestors
// which must be applied to it. These messages are in the correct
// keyrange for any available basement nodes, and are in nodes with the
// correct max_msn_applied_to_node_on_disk.
// Notes:
// This is an approximate query.
// Output:
// max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
// ancestors. This is used later to update basement nodes'
// max_msn_applied values in case we don't do the full algorithm.
// Returns:
// true if there may be some such messages
// false only if there are definitely no such messages
// Rationale:
// When we pin a node with a read lock, we want to quickly determine if
// we should exchange it for a write lock in preparation for applying
// messages. If there are no messages, we don't need the write lock.
{
paranoid_invariant(node->height == 0);
bool needs_ancestors_messages = false;
// child_to_read may be -1 in test cases
if (!node->dirty && child_to_read >= 0) {
paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
needs_ancestors_messages = bn_needs_ancestors_messages(
ft,
node,
child_to_read,
bounds,
ancestors,
max_msn_in_path
);
} }
else {
for (int i = 0; i < node->n_children; ++i) {
if (BP_STATE(node, i) != PT_AVAIL) { continue; }
needs_ancestors_messages = bn_needs_ancestors_messages(
ft,
node,
i,
bounds,
ancestors,
max_msn_in_path
);
if (needs_ancestors_messages) {
goto cleanup;
} }
} }
} }
*max_msn_in_path = max_msn_applied;
cleanup: cleanup:
return needs_ancestors_messages; return needs_ancestors_messages;
} }
void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) { void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read) {
invariant(node->height == 0); invariant(node->height == 0);
if (!node->dirty && child_to_read >= 0) {
paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
BASEMENTNODE bn = BLB(node, child_to_read);
if (max_msn_applied.msn > bn->max_msn_applied.msn) {
// see comment below
(void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
}
}
else {
for (int i = 0; i < node->n_children; ++i) { for (int i = 0; i < node->n_children; ++i) {
if (BP_STATE(node, i) != PT_AVAIL) { continue; } if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE bn = BLB(node, i); BASEMENTNODE bn = BLB(node, i);
...@@ -4652,6 +4748,7 @@ void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) { ...@@ -4652,6 +4748,7 @@ void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
(void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn); (void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
} }
} }
}
} }
struct copy_to_stale_extra { struct copy_to_stale_extra {
...@@ -4779,6 +4876,11 @@ got_a_good_value: ...@@ -4779,6 +4876,11 @@ got_a_good_value:
ftcursor->leaf_info.to_be.omt = bn->buffer; ftcursor->leaf_info.to_be.omt = bn->buffer;
ftcursor->leaf_info.to_be.index = idx; ftcursor->leaf_info.to_be.index = idx;
//
// IMPORTANT: bulk fetch CANNOT go past the current basement node,
// because there is no guarantee that messages have been applied
// to other basement nodes, as part of #5770
//
if (r == TOKUDB_CURSOR_CONTINUE && can_bulk_fetch) { if (r == TOKUDB_CURSOR_CONTINUE && can_bulk_fetch) {
r = ft_cursor_shortcut( r = ft_cursor_shortcut(
ftcursor, ftcursor,
...@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F ...@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum); BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum); uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum);
FTNODE childnode; FTNODE childnode = nullptr;
// If the current node's height is greater than 1, then its child is an internal node. // If the current node's height is greater than 1, then its child is an internal node.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot. // Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
...@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F ...@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
unlockers, unlockers,
&next_ancestors, bounds, &next_ancestors, bounds,
&bfe, &bfe,
PL_READ, // we try to get a read lock, but we may upgrade to a write lock on a leaf for message application.
true, true,
&childnode, &childnode,
&msgs_applied); &msgs_applied);
...@@ -5090,19 +5191,6 @@ ft_search_node( ...@@ -5090,19 +5191,6 @@ ft_search_node(
// At this point, we must have the necessary partition available to continue the search // At this point, we must have the necessary partition available to continue the search
// //
assert(BP_STATE(node,child_to_search) == PT_AVAIL); assert(BP_STATE(node,child_to_search) == PT_AVAIL);
while (child_to_search >= 0 && child_to_search < node->n_children) {
//
// Normally, the child we want to use is available, as we checked
// before entering this while loop. However, if we pass through
// the loop once, getting DB_NOTFOUND for this first value
// of child_to_search, we enter the while loop again with a
// child_to_search that may not be in memory. If it is not,
// we need to return TOKUDB_TRY_AGAIN so the query can
// read the appropriate partition into memory
//
if (BP_STATE(node,child_to_search) != PT_AVAIL) {
return TOKUDB_TRY_AGAIN;
}
const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds);
if (node->height > 0) { if (node->height > 0) {
r = ft_search_child( r = ft_search_child(
...@@ -5131,7 +5219,9 @@ ft_search_node( ...@@ -5131,7 +5219,9 @@ ft_search_node(
can_bulk_fetch can_bulk_fetch
); );
} }
if (r == 0) return r; //Success if (r == 0) {
return r; //Success
}
if (r != DB_NOTFOUND) { if (r != DB_NOTFOUND) {
return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN) return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
...@@ -5143,17 +5233,19 @@ ft_search_node( ...@@ -5143,17 +5233,19 @@ ft_search_node(
// we have a new pivotkey // we have a new pivotkey
if (node->height == 0) { if (node->height == 0) {
// when we run off the end of a basement, try to lock the range up to the pivot. solves #3529 // when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
const DBT *pivot = NULL; const DBT *pivot = nullptr;
if (search->direction == FT_SEARCH_LEFT) if (search->direction == FT_SEARCH_LEFT) {
pivot = next_bounds.upper_bound_inclusive; // left -> right pivot = next_bounds.upper_bound_inclusive; // left -> right
else } else {
pivot = next_bounds.lower_bound_exclusive; // right -> left pivot = next_bounds.lower_bound_exclusive; // right -> left
if (pivot) { }
int rr = getf(pivot->size, pivot->data, 0, NULL, getf_v, true); if (pivot != nullptr) {
if (rr != 0) int rr = getf(pivot->size, pivot->data, 0, nullptr, getf_v, true);
if (rr != 0) {
return rr; // lock was not granted return rr; // lock was not granted
} }
} }
}
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible. // If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but // This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
...@@ -5162,15 +5254,15 @@ ft_search_node( ...@@ -5162,15 +5254,15 @@ ft_search_node(
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left). // If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object. // So save the pivot key in the search object.
maybe_search_save_bound(node, child_to_search, search); maybe_search_save_bound(node, child_to_search, search);
// as part of #5770, if we can continue searching,
// We're about to pin some more nodes, but we thought we were done before. // we MUST return TOKUDB_TRY_AGAIN,
if (search->direction == FT_SEARCH_LEFT) { // because there is no guarantee that messages have been applied
child_to_search++; // on any other path.
} if ((search->direction == FT_SEARCH_LEFT && child_to_search < node->n_children-1) ||
else { (search->direction == FT_SEARCH_RIGHT && child_to_search > 0)) {
child_to_search--; r = TOKUDB_TRY_AGAIN;
}
} }
return r; return r;
} }
...@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node, ...@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
&next_ancestors, &next_ancestors,
bounds, bounds,
child_may_find_right ? match_bfe : min_bfe, child_may_find_right ? match_bfe : min_bfe,
PL_READ, // may_modify_node is false, because node guaranteed to not change
false, false,
&childnode, &childnode,
&msgs_applied &msgs_applied
...@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO ...@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
uint32_t fullhash = compute_child_fullhash(ft->cf, node, childnum); uint32_t fullhash = compute_child_fullhash(ft->cf, node, childnum);
FTNODE child; FTNODE child;
bool msgs_applied = false; bool msgs_applied = false;
r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, PL_READ, false, &child, &msgs_applied); r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, false, &child, &msgs_applied);
paranoid_invariant(!msgs_applied); paranoid_invariant(!msgs_applied);
if (r == TOKUDB_TRY_AGAIN) { if (r == TOKUDB_TRY_AGAIN) {
return r; return r;
......
...@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) { ...@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL }; struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL }; const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
bool msgs_applied; bool msgs_applied;
toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied); toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied, -1);
FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
{ {
...@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) { ...@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
.upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7]) .upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7])
}; };
bool msgs_applied; bool msgs_applied;
toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied); toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied, -1);
FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
{ {
...@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) { ...@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL }; struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL }; const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
bool msgs_applied; bool msgs_applied;
toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied); toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied, -1);
FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment