Commit 9a6ba1aa authored by John Esmet

refs #5770 Only check one basement node on pin, remove the assumption that adjacent available nodes are query-able.
parent 06d56d51
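
The substantive change: `toku_ft_leaf_needs_ancestors_messages`, `toku_apply_ancestors_messages_to_node`, and `toku_ft_bn_update_max_msn` each gain an `int child_to_read`, so a pin performed for a query checks (and freshens) only the basement node the read will actually touch, instead of assuming every adjacent basement node must be query-able. A minimal stand-alone sketch of that narrowing; all types and names below are invented for illustration, and the `-1` convention is inferred from the updated tests, which pass `-1` where no single target exists:

```cpp
// Sketch only -- not TokuFT code. Models a leaf whose basement nodes each
// track the highest message serial number (MSN) applied to them.
#include <cstdint>
#include <vector>

struct BasementNode {
    uint64_t max_msn_applied;  // newest ancestor message applied so far
};

struct Leaf {
    std::vector<BasementNode> basements;
};

// true if a basement node in scope still lags the newest message on the
// root-to-leaf path; child_to_read == -1 means "check every basement node".
static bool needs_ancestors_messages(const Leaf &leaf,
                                     uint64_t max_msn_in_path,
                                     int child_to_read) {
    if (child_to_read >= 0) {
        // Query path: only the one basement node this read targets matters.
        return leaf.basements[static_cast<size_t>(child_to_read)]
                   .max_msn_applied < max_msn_in_path;
    }
    for (const BasementNode &bn : leaf.basements) {
        if (bn.max_msn_applied < max_msn_in_path) {
            return true;
        }
    }
    return false;
}
```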
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
         NULL);
 }
 
+//
+// On success, this function assumes that the caller is trying to pin the node
+// with a PL_READ lock. If message application is needed,
+// then a PL_WRITE_CHEAP lock is grabbed
+//
 int
 toku_pin_ftnode_batched(
     FT_HANDLE brt,
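
The comment added above describes an optimistic scheme: pin with PL_READ, and only when the leaf turns out to need ancestor messages, unpin and re-pin with PL_WRITE_CHEAP (the rewritten body below does this with a second pin call instead of the old `goto try_again_for_write_lock` loop). A self-contained sketch of the same read-then-upgrade pattern, using `std::shared_mutex` as a stand-in for the cachetable's pair locks; nothing here is TokuFT API:

```cpp
#include <shared_mutex>

struct NodeSketch {
    std::shared_mutex lock;
    bool needs_message_application = false;
};

void pin_for_query(NodeSketch &node) {
    {
        // Optimistic path: most pins only read, so take the cheap
        // shared lock first.
        std::shared_lock<std::shared_mutex> read_lock(node.lock);
        if (!node.needs_message_application) {
            return;  // common case: the read lock was enough
        }
    }  // must drop the read lock; shared_mutex has no in-place upgrade

    // Slow path: re-acquire exclusively and re-check, because another
    // thread may have done the work while we held no lock at all.
    std::unique_lock<std::shared_mutex> write_lock(node.lock);
    if (node.needs_message_application) {
        // ... apply pending ancestor messages here ...
        node.needs_message_application = false;
    }
}
```

Note the re-check after reacquiring: the diff's version skips it and simply applies, which is presumably safe because message application filters on MSN, as the retained comment about max_msn further down alludes to.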
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
     ANCESTORS ancestors,
     const PIVOT_BOUNDS bounds,
     FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
     bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
     FTNODE *node_p,
     bool* msgs_applied)
 {
     void *node_v;
     *msgs_applied = false;
-    pair_lock_type needed_lock_type = lock_type;
-try_again_for_write_lock:
+    FTNODE node = nullptr;
+    MSN max_msn_in_path = ZERO_MSN;
+    bool needs_ancestors_messages = false;
+    // this function assumes that if you want ancestor messages applied,
+    // you are doing a read for a query. This is so we can make some optimizations
+    // below.
+    if (apply_ancestor_messages) {
+        paranoid_invariant(bfe->type == ftnode_fetch_subset);
+    }
     int r = toku_cachetable_get_and_pin_nonblocking_batched(
         brt->ft->cf,
         blocknum,
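
The `paranoid_invariant` just added encodes a precondition: ancestor-message application only happens on a query-style subset read, which is what guarantees `bfe->child_to_read` identifies a real target basement node later in the function. A hypothetical sketch of that precondition style, with a plain `assert` standing in for TokuFT's debug-only `paranoid_invariant` and invented types throughout:

```cpp
#include <cassert>

enum fetch_type { fetch_all, fetch_subset, fetch_prefetch };  // illustrative

struct FetchExtraSketch {
    fetch_type type;
    int child_to_read;  // only meaningful for subset (query) reads
};

void pin_sketch(const FetchExtraSketch &bfe, bool apply_ancestor_messages) {
    if (apply_ancestor_messages) {
        // A caller asking for message application must be doing a query,
        // otherwise child_to_read below would be garbage.
        assert(bfe.type == fetch_subset);
    }
    // ... pinning logic would consult bfe.child_to_read here ...
}
```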
@@ -221,63 +233,82 @@ toku_pin_ftnode_batched(
         toku_ftnode_fetch_callback,
         toku_ftnode_pf_req_callback,
         toku_ftnode_pf_callback,
-        needed_lock_type,
+        PL_READ,
         bfe, //read_extraargs
         unlockers);
-    if (r==0) {
-        FTNODE node = static_cast<FTNODE>(node_v);
-        MSN max_msn_in_path;
-        bool needs_ancestors_messages = false;
-        if (apply_ancestor_messages && node->height == 0) {
-            needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
-            if (needs_ancestors_messages && needed_lock_type == PL_READ) {
-                toku_unpin_ftnode_read_only(brt->ft, node);
-                needed_lock_type = PL_WRITE_CHEAP;
-                goto try_again_for_write_lock;
-            }
-        }
-        if (apply_ancestor_messages && node->height == 0) {
-            if (needs_ancestors_messages) {
-                invariant(needed_lock_type != PL_READ);
-                toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied);
-            } else {
-                // At this point, we aren't going to run
-                // toku_apply_ancestors_messages_to_node but that doesn't
-                // mean max_msn_applied shouldn't be updated if possible
-                // (this saves the CPU work involved in
-                // toku_ft_leaf_needs_ancestors_messages).
-                //
-                // We still have a read lock, so we have not resolved
-                // checkpointing. If the node is pending and dirty, we
-                // can't modify anything, including max_msn, until we
-                // resolve checkpointing. If we do, the node might get
-                // written out that way as part of a checkpoint with a
-                // root that was already written out with a smaller
-                // max_msn. During recovery, we would then inject a
-                // message based on the root's max_msn, and that message
-                // would get filtered by the leaf because it had too high
-                // a max_msn value. (see #5407)
-                //
-                // So for simplicity we only update the max_msn if the
-                // node is clean. That way, in order for the node to get
-                // written out, it would have to be dirtied. That
-                // requires a write lock, and a write lock requires you to
-                // resolve checkpointing.
-                if (!node->dirty) {
-                    toku_ft_bn_update_max_msn(node, max_msn_in_path);
-                }
-            }
-            invariant(needed_lock_type != PL_READ || !*msgs_applied);
-        }
-        if ((lock_type != PL_READ) && node->height > 0) {
-            toku_move_ftnode_messages_to_stale(brt->ft, node);
-        }
-        *node_p = node;
-        // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
-    } else {
-        assert(r==TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
-        // printf("%*sPin %ld try again\n", 8, "", blocknum.b);
-    }
+    if (r != 0) {
+        assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+        goto exit;
+    }
+    node = static_cast<FTNODE>(node_v);
+    if (apply_ancestor_messages && node->height == 0) {
+        needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
+            brt->ft,
+            node,
+            ancestors,
+            bounds,
+            &max_msn_in_path,
+            bfe->child_to_read
+            );
+        if (needs_ancestors_messages) {
+            toku_unpin_ftnode_read_only(brt->ft, node);
+            int rr = toku_cachetable_get_and_pin_nonblocking_batched(
+                brt->ft->cf,
+                blocknum,
+                fullhash,
+                &node_v,
+                NULL,
+                get_write_callbacks_for_node(brt->ft),
+                toku_ftnode_fetch_callback,
+                toku_ftnode_pf_req_callback,
+                toku_ftnode_pf_callback,
+                PL_WRITE_CHEAP,
+                bfe, //read_extraargs
+                unlockers);
+            if (rr != 0) {
+                assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+                r = TOKUDB_TRY_AGAIN;
+                goto exit;
+            }
+            node = static_cast<FTNODE>(node_v);
+            toku_apply_ancestors_messages_to_node(
+                brt,
+                node,
+                ancestors,
+                bounds,
+                msgs_applied,
+                bfe->child_to_read
+                );
+        } else {
+            // At this point, we aren't going to run
+            // toku_apply_ancestors_messages_to_node but that doesn't
+            // mean max_msn_applied shouldn't be updated if possible
+            // (this saves the CPU work involved in
+            // toku_ft_leaf_needs_ancestors_messages).
+            //
+            // We still have a read lock, so we have not resolved
+            // checkpointing. If the node is pending and dirty, we
+            // can't modify anything, including max_msn, until we
+            // resolve checkpointing. If we do, the node might get
+            // written out that way as part of a checkpoint with a
+            // root that was already written out with a smaller
+            // max_msn. During recovery, we would then inject a
+            // message based on the root's max_msn, and that message
+            // would get filtered by the leaf because it had too high
+            // a max_msn value. (see #5407)
+            //
+            // So for simplicity we only update the max_msn if the
+            // node is clean. That way, in order for the node to get
+            // written out, it would have to be dirtied. That
+            // requires a write lock, and a write lock requires you to
+            // resolve checkpointing.
+            if (!node->dirty) {
+                toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
+            }
+        }
+    }
+    *node_p = node;
+exit:
     return r;
 }
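
The long comment preserved in the else-branch justifies a subtle rule: under a read lock, max_msn may only be advanced on a clean node, because a dirty node could be written out by an in-flight checkpoint whose root carries a smaller max_msn, corrupting recovery (#5407). A stand-alone sketch of just that guard, with illustrative types:

```cpp
#include <algorithm>
#include <cstdint>

struct LeafSketch {
    bool dirty = false;            // dirty nodes may be mid-checkpoint
    uint64_t max_msn_applied = 0;  // persisted with the node
};

// Safe to call while holding only a read lock: a clean node cannot reach
// disk without first being dirtied, which requires a write lock and
// checkpoint resolution, so this update can never race a checkpoint write.
static void maybe_update_max_msn(LeafSketch &leaf, uint64_t max_msn_in_path) {
    if (leaf.dirty) {
        return;  // leave max_msn alone; see the #5407 scenario above
    }
    leaf.max_msn_applied = std::max(leaf.max_msn_applied, max_msn_in_path);
}
```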
...
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
     ANCESTORS ancestors,
     const PIVOT_BOUNDS pbounds,
     FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
     bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
     FTNODE *node_p,
     bool* msgs_applied
...
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
 #define VERIFY_NODE(t,n) ((void)0)
 #endif
 
-//#define FT_TRACE
-#ifdef FT_TRACE
-#define WHEN_FTTRACE(x) x
-#else
-#define WHEN_FTTRACE(x) ((void)0)
-#endif
-
 void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
 void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
 void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
@@ -982,11 +975,11 @@ struct pivot_bounds {
 __attribute__((nonnull))
 void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
-void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied);
+void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
 __attribute__((nonnull))
-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path);
+bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
 __attribute__((nonnull))
-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied);
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);
 __attribute__((const,nonnull))
 size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);
...
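
The re-prototyped functions above leave callers with two styles; this is a hedged reading of the diff, not a documented contract. The pin path forwards `bfe->child_to_read` to confine work to the query's target basement node, while callers with no single target, such as the unit tests below, pass `-1`:

```cpp
// Abbreviated call shapes from this commit (see the prototypes above):
//
// Query path, inside toku_pin_ftnode_batched:
//   toku_ft_leaf_needs_ancestors_messages(..., &max_msn_in_path,
//                                         bfe->child_to_read);
//
// No specific target (the tests below):
//   toku_apply_ancestors_messages_to_node(t, child, &ancestors,
//                                         &infinite_bounds,
//                                         &msgs_applied, -1);
```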
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
     struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
     const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
     bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied, -1);
     FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
     {
...
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
         .upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7])
     };
     bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied, -1);
     FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
     {
...
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
     struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
     const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
     bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied, -1);
     FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
     {
...