refs #5770 Only check one basement node on pin, remove the assumption that adjacent

available nodes are query-able.

refs #5770 Only check one basement node on pin, remove the assumption that adjacent
available nodes are query-able.
9a6ba1aa · John Esmet · 06d56d51 · 9a6ba1aa · 9a6ba1aa · 9a6ba1aa
Commit 9a6ba1aa authored Jun 28, 2013 by John Esmet
5 changed files
--- a/ft/ft-cachetable-wrappers.cc
+++ b/ft/ft-cachetable-wrappers.cc
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
        NULL);
 }

+//
+// On success, this function assumes that the caller is trying to pin the node
+// with a PL_READ lock. If message application is needed,
+// then a PL_WRITE_CHEAP lock is grabbed
+//
 int
 toku_pin_ftnode_batched(
    FT_HANDLE brt,
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS bounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied)
 {
    void *node_v;
    *msgs_applied = false;
-    pair_lock_type needed_lock_type = lock_type;
-try_again_for_write_lock:
+    FTNODE node = nullptr;
+    MSN max_msn_in_path = ZERO_MSN;
+    bool needs_ancestors_messages = false;
+    // this function assumes that if you want ancestor messages applied,
+    // you are doing a read for a query. This is so we can make some optimizations
+    // below.
+    if (apply_ancestor_messages) {
+        paranoid_invariant(bfe->type == ftnode_fetch_subset);
+    }
+    
    int r = toku_cachetable_get_and_pin_nonblocking_batched(
            brt->ft->cf,
            blocknum,
@@ -221,63 +233,82 @@ toku_pin_ftnode_batched(
            toku_ftnode_fetch_callback,
            toku_ftnode_pf_req_callback,
            toku_ftnode_pf_callback,
-            needed_lock_type,
+            PL_READ,
            bfe, //read_extraargs
            unlockers);
-    if (r==0) {
-        FTNODE node = static_cast<FTNODE>(node_v);
-        MSN max_msn_in_path;
-        bool needs_ancestors_messages = false;
-        if (apply_ancestor_messages && node->height == 0) {
-            needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
-            if (needs_ancestors_messages && needed_lock_type == PL_READ) {
-                toku_unpin_ftnode_read_only(brt->ft, node);
-                needed_lock_type = PL_WRITE_CHEAP;
-                goto try_again_for_write_lock;
+    if (r != 0) {
+        assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+        goto exit;
+    }
+    node = static_cast<FTNODE>(node_v);
+    if (apply_ancestor_messages && node->height == 0) {
+        needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
+            brt->ft, 
+            node, 
+            ancestors, 
+            bounds, 
+            &max_msn_in_path, 
+            bfe->child_to_read
+            );
+        if (needs_ancestors_messages) {
+            toku_unpin_ftnode_read_only(brt->ft, node);
+            int rr = toku_cachetable_get_and_pin_nonblocking_batched(
+                    brt->ft->cf,
+                    blocknum,
+                    fullhash,
+                    &node_v,
+                    NULL,
+                    get_write_callbacks_for_node(brt->ft),
+                    toku_ftnode_fetch_callback,
+                    toku_ftnode_pf_req_callback,
+                    toku_ftnode_pf_callback,
+                    PL_WRITE_CHEAP,
+                    bfe, //read_extraargs
+                    unlockers);
+            if (rr != 0) {
+                assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+                r = TOKUDB_TRY_AGAIN;
+                goto exit;
            }
-        }
-        if (apply_ancestor_messages && node->height == 0) {
-            if (needs_ancestors_messages) {
-                invariant(needed_lock_type != PL_READ);
-                toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied);
-            } else {
-                // At this point, we aren't going to run
-                // toku_apply_ancestors_messages_to_node but that doesn't
-                // mean max_msn_applied shouldn't be updated if possible
-                // (this saves the CPU work involved in
-                // toku_ft_leaf_needs_ancestors_messages).
-                //
-                // We still have a read lock, so we have not resolved
-                // checkpointing.  If the node is pending and dirty, we
-                // can't modify anything, including max_msn, until we
-                // resolve checkpointing.  If we do, the node might get
-                // written out that way as part of a checkpoint with a
-                // root that was already written out with a smaller
-                // max_msn.  During recovery, we would then inject a
-                // message based on the root's max_msn, and that message
-                // would get filtered by the leaf because it had too high
-                // a max_msn value. (see #5407)
-                //
-                // So for simplicity we only update the max_msn if the
-                // node is clean.  That way, in order for the node to get
-                // written out, it would have to be dirtied.  That
-                // requires a write lock, and a write lock requires you to
-                // resolve checkpointing.
-                if (!node->dirty) {
-                    toku_ft_bn_update_max_msn(node, max_msn_in_path);
-                }
+            node = static_cast<FTNODE>(node_v);
+            toku_apply_ancestors_messages_to_node(
+                brt, 
+                node, 
+                ancestors, 
+                bounds, 
+                msgs_applied,
+                bfe->child_to_read
+                );
+        } else {
+            // At this point, we aren't going to run
+            // toku_apply_ancestors_messages_to_node but that doesn't
+            // mean max_msn_applied shouldn't be updated if possible
+            // (this saves the CPU work involved in
+            // toku_ft_leaf_needs_ancestors_messages).
+            //
+            // We still have a read lock, so we have not resolved
+            // checkpointing.  If the node is pending and dirty, we
+            // can't modify anything, including max_msn, until we
+            // resolve checkpointing.  If we do, the node might get
+            // written out that way as part of a checkpoint with a
+            // root that was already written out with a smaller
+            // max_msn.  During recovery, we would then inject a
+            // message based on the root's max_msn, and that message
+            // would get filtered by the leaf because it had too high
+            // a max_msn value. (see #5407)
+            //
+            // So for simplicity we only update the max_msn if the
+            // node is clean.  That way, in order for the node to get
+            // written out, it would have to be dirtied.  That
+            // requires a write lock, and a write lock requires you to
+            // resolve checkpointing.
+            if (!node->dirty) {
+                toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
            }
-            invariant(needed_lock_type != PL_READ || !*msgs_applied);
-        }
-        if ((lock_type != PL_READ) && node->height > 0) {
-            toku_move_ftnode_messages_to_stale(brt->ft, node);
        }
-        *node_p = node;
-        // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
-    } else {
-        assert(r==TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
-        // printf("%*sPin %ld try again\n", 8, "", blocknum.b);
    }
+    *node_p = node;
+exit:
    return r;
 }


--- a/ft/ft-cachetable-wrappers.h
+++ b/ft/ft-cachetable-wrappers.h
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS pbounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied

--- a/ft/ft-internal.h
+++ b/ft/ft-internal.h
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
 #define VERIFY_NODE(t,n) ((void)0)
 #endif

-//#define FT_TRACE
-#ifdef FT_TRACE
-#define WHEN_FTTRACE(x) x
-#else
-#define WHEN_FTTRACE(x) ((void)0)
-#endif
-
 void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
 void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
 void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
@@ -982,11 +975,11 @@ struct pivot_bounds {

 __attribute__((nonnull))
 void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
-void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied);
+void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
 __attribute__((nonnull))
-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path);
+bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
 __attribute__((nonnull))
-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied);
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);

 __attribute__((const,nonnull))
 size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);

--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
    }
 }

+static void
+apply_ancestors_messages_to_bn(
+    FT_HANDLE t,
+    FTNODE node,
+    int childnum,
+    ANCESTORS ancestors,
+    struct pivot_bounds const * const bounds, 
+    TXNID oldest_referenced_xid,
+    bool* msgs_applied
+    )
+{
+    BASEMENTNODE curr_bn = BLB(node, childnum);
+    struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds);
+    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
+        if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
+            paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
+            bnc_apply_messages_to_basement_node(
+                t,
+                curr_bn,
+                curr_ancestors->node,
+                curr_ancestors->childnum,
+                &curr_bounds,
+                oldest_referenced_xid,
+                msgs_applied
+                );
+            // We don't want to check this ancestor node again if the
+            // next time we query it, the msn hasn't changed.
+            curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
+        }
+    }
+    // At this point, we know all the stale messages above this
+    // basement node have been applied, and any new messages will be
+    // fresh, so we don't need to look at stale messages for this
+    // basement node, unless it gets evicted (and this field becomes
+    // false when it's read in again).
+    curr_bn->stale_ancestor_messages_applied = true;
+}
+
 void
-toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied)
+toku_apply_ancestors_messages_to_node (
+    FT_HANDLE t, 
+    FTNODE node, 
+    ANCESTORS ancestors, 
+    struct pivot_bounds const * const bounds, 
+    bool* msgs_applied, 
+    int child_to_read
+    )
 // Effect:
 //   Bring a leaf node up-to-date according to all the messages in the ancestors.
 //   If the leaf node is already up-to-date then do nothing.
@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
 //   The entire root-to-leaf path is pinned and appears in the ancestors list.
 {
    VERIFY_NODE(t, node);
-    invariant(node->height == 0);
+    paranoid_invariant(node->height == 0);

    TXNID oldest_referenced_xid = ancestors->node->oldest_referenced_xid_known;
    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
@@ -4530,44 +4575,104 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
        }
    }

-    // know we are a leaf node
-    // An important invariant:
-    // We MUST bring every available basement node up to date.
-    // flushing on the cleaner thread depends on this. This invariant
-    // allows the cleaner thread to just pick an internal node and flush it
-    // as opposed to being forced to start from the root.
-    for (int i = 0; i < node->n_children; i++) {
-        if (BP_STATE(node, i) != PT_AVAIL) { continue; }
-        BASEMENTNODE curr_bn = BLB(node, i);
-        struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
-        for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
-            if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
-                paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
-                bnc_apply_messages_to_basement_node(
-                    t,
-                    curr_bn,
-                    curr_ancestors->node,
-                    curr_ancestors->childnum,
-                    &curr_bounds,
-                    oldest_referenced_xid,
-                    msgs_applied
-                    );
-                // We don't want to check this ancestor node again if the
-                // next time we query it, the msn hasn't changed.
-                curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
-            }
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        apply_ancestors_messages_to_bn(
+            t,
+            node,
+            child_to_read,
+            ancestors,
+            bounds,
+            oldest_referenced_xid,
+            msgs_applied
+            );
+    }
+    else {
+        // know we are a leaf node
+        // An important invariant:
+        // We MUST bring every available basement node for a dirty node up to date.
+        // flushing on the cleaner thread depends on this. This invariant
+        // allows the cleaner thread to just pick an internal node and flush it
+        // as opposed to being forced to start from the root.
+        for (int i = 0; i < node->n_children; i++) {
+            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
+            apply_ancestors_messages_to_bn(
+                t,
+                node,
+                i,
+                ancestors,
+                bounds,
+                oldest_referenced_xid,
+                msgs_applied
+                );
        }
-        // At this point, we know all the stale messages above this
-        // basement node have been applied, and any new messages will be
-        // fresh, so we don't need to look at stale messages for this
-        // basement node, unless it gets evicted (and this field becomes
-        // false when it's read in again).
-        curr_bn->stale_ancestor_messages_applied = true;
    }
    VERIFY_NODE(t, node);
 }

-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path)
+static bool bn_needs_ancestors_messages(
+    FT ft,
+    FTNODE node,
+    int childnum,
+    struct pivot_bounds const * const bounds,
+    ANCESTORS ancestors, 
+    MSN* max_msn_applied
+    ) 
+{
+    BASEMENTNODE bn = BLB(node, childnum);
+    struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds);
+    bool needs_ancestors_messages = false;
+    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
+        if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn) {
+            paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
+            NONLEAF_CHILDINFO bnc = BNC(curr_ancestors->node, curr_ancestors->childnum);
+            if (bnc->broadcast_list.size() > 0) {
+                needs_ancestors_messages = true;
+                goto cleanup;
+            }
+            if (!bn->stale_ancestor_messages_applied) {
+                uint32_t stale_lbi, stale_ube;
+                find_bounds_within_message_tree(&ft->cmp_descriptor,
+                                                ft->compare_fun,
+                                                bnc->stale_message_tree,
+                                                bnc->buffer,
+                                                &curr_bounds,
+                                                &stale_lbi,
+                                                &stale_ube);
+                if (stale_lbi < stale_ube) {
+                    needs_ancestors_messages = true;
+                    goto cleanup;
+                }
+            }
+            uint32_t fresh_lbi, fresh_ube;
+            find_bounds_within_message_tree(&ft->cmp_descriptor,
+                                            ft->compare_fun,
+                                            bnc->fresh_message_tree,
+                                            bnc->buffer,
+                                            &curr_bounds,
+                                            &fresh_lbi,
+                                            &fresh_ube);
+            if (fresh_lbi < fresh_ube) {
+                needs_ancestors_messages = true;
+                goto cleanup;
+            }
+            if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied->msn) {
+                max_msn_applied->msn = curr_ancestors->node->max_msn_applied_to_node_on_disk.msn;
+            }
+        }
+    }
+cleanup:
+    return needs_ancestors_messages;
+}
+
+bool toku_ft_leaf_needs_ancestors_messages(
+    FT ft, 
+    FTNODE node, 
+    ANCESTORS ancestors, 
+    struct pivot_bounds const * const bounds, 
+    MSN *const max_msn_in_path, 
+    int child_to_read
+    )
 // Effect: Determine whether there are messages in a node's ancestors
 //  which must be applied to it.  These messages are in the correct
 //  keyrange for any available basement nodes, and are in nodes with the
@@ -4586,72 +4691,64 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
 //  we should exchange it for a write lock in preparation for applying
 //  messages.  If there are no messages, we don't need the write lock.
 {
-    invariant(node->height == 0);
-    MSN max_msn_applied = ZERO_MSN;
+    paranoid_invariant(node->height == 0);
    bool needs_ancestors_messages = false;
-    for (int i = 0; i < node->n_children; ++i) {
-        if (BP_STATE(node, i) != PT_AVAIL) { continue; }
-        BASEMENTNODE bn = BLB(node, i);
-        struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
-        for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
-            if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn) {
-                paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
-                NONLEAF_CHILDINFO bnc = BNC(curr_ancestors->node, curr_ancestors->childnum);
-                if (bnc->broadcast_list.size() > 0) {
-                    needs_ancestors_messages = true;
-                    goto cleanup;
-                }
-                if (!bn->stale_ancestor_messages_applied) {
-                    uint32_t stale_lbi, stale_ube;
-                    find_bounds_within_message_tree(&ft->cmp_descriptor,
-                                                    ft->compare_fun,
-                                                    bnc->stale_message_tree,
-                                                    bnc->buffer,
-                                                    &curr_bounds,
-                                                    &stale_lbi,
-                                                    &stale_ube);
-                    if (stale_lbi < stale_ube) {
-                        needs_ancestors_messages = true;
-                        goto cleanup;
-                    }
-                }
-                uint32_t fresh_lbi, fresh_ube;
-                find_bounds_within_message_tree(&ft->cmp_descriptor,
-                                                ft->compare_fun,
-                                                bnc->fresh_message_tree,
-                                                bnc->buffer,
-                                                &curr_bounds,
-                                                &fresh_lbi,
-                                                &fresh_ube);
-                if (fresh_lbi < fresh_ube) {
-                    needs_ancestors_messages = true;
-                    goto cleanup;
-                }
-                if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied.msn) {
-                    max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
-                }
+    // child_to_read may be -1 in test cases
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        needs_ancestors_messages = bn_needs_ancestors_messages(
+            ft,
+            node,
+            child_to_read,
+            bounds,
+            ancestors,
+            max_msn_in_path
+            );
+    }
+    else {
+        for (int i = 0; i < node->n_children; ++i) {
+            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
+            needs_ancestors_messages = bn_needs_ancestors_messages(
+                ft,
+                node,
+                i,
+                bounds,
+                ancestors,
+                max_msn_in_path
+                );
+            if (needs_ancestors_messages) {
+                goto cleanup;
            }
        }
    }
-    *max_msn_in_path = max_msn_applied;
 cleanup:
    return needs_ancestors_messages;
 }

-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read) {
    invariant(node->height == 0);
-    for (int i = 0; i < node->n_children; ++i) {
-        if (BP_STATE(node, i) != PT_AVAIL) { continue; }
-        BASEMENTNODE bn = BLB(node, i);
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        BASEMENTNODE bn = BLB(node, child_to_read);
        if (max_msn_applied.msn > bn->max_msn_applied.msn) {
-            // This function runs in a shared access context, so to silence tools
-            // like DRD, we use a CAS and ignore the result.
-            // Any threads trying to update these basement nodes should be
-            // updating them to the same thing (since they all have a read lock on
-            // the same root-to-leaf path) so this is safe.
+            // see comment below
            (void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
        }
    }
+    else {
+        for (int i = 0; i < node->n_children; ++i) {
+            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
+            BASEMENTNODE bn = BLB(node, i);
+            if (max_msn_applied.msn > bn->max_msn_applied.msn) {
+                // This function runs in a shared access context, so to silence tools
+                // like DRD, we use a CAS and ignore the result.
+                // Any threads trying to update these basement nodes should be
+                // updating them to the same thing (since they all have a read lock on
+                // the same root-to-leaf path) so this is safe.
+                (void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
+            }
+        }
+    }
 }

 struct copy_to_stale_extra {
@@ -4779,6 +4876,11 @@ ok: ;
            ftcursor->leaf_info.to_be.omt   = bn->buffer;
            ftcursor->leaf_info.to_be.index = idx;

+            // 
+            // IMPORTANT: bulk fetch CANNOT go past the current basement node,
+            // because there is no guarantee that messages have been applied
+            // to other basement nodes, as part of #5770
+            //
            if (r == TOKUDB_CURSOR_CONTINUE && can_bulk_fetch) {
                r = ft_cursor_shortcut(
                    ftcursor,
@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F

    BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
    uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum);
-    FTNODE childnode;
+    FTNODE childnode = nullptr;

    // If the current node's height is greater than 1, then its child is an internal node.
    // Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
                                         unlockers,
                                         &next_ancestors, bounds,
                                         &bfe,
-                                         PL_READ, // we try to get a read lock, but we may upgrade to a write lock on a leaf for message application.
                                         true,
                                         &childnode,
                                         &msgs_applied);
@@ -5090,87 +5191,78 @@ ft_search_node(
    // At this point, we must have the necessary partition available to continue the search
    //
    assert(BP_STATE(node,child_to_search) == PT_AVAIL);
-    while (child_to_search >= 0 && child_to_search < node->n_children) {
-        //
-        // Normally, the child we want to use is available, as we checked
-        // before entering this while loop. However, if we pass through
-        // the loop once, getting DB_NOTFOUND for this first value
-        // of child_to_search, we enter the while loop again with a
-        // child_to_search that may not be in memory. If it is not,
-        // we need to return TOKUDB_TRY_AGAIN so the query can
-        // read the appropriate partition into memory
-        //
-        if (BP_STATE(node,child_to_search) != PT_AVAIL) {
-            return TOKUDB_TRY_AGAIN;
-        }
-        const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds);
-        if (node->height > 0) {
-            r = ft_search_child(
-                brt,
-                node,
-                child_to_search,
-                search,
-                getf,
-                getf_v,
-                doprefetch,
-                ftcursor,
-                unlockers,
-                ancestors,
-                &next_bounds,
-                can_bulk_fetch
-                );
-        }
-        else {
-            r = ft_search_basement_node(
-                BLB(node, child_to_search),
-                search,
-                getf,
-                getf_v,
-                doprefetch,
-                ftcursor,
-                can_bulk_fetch
-                );
-        }
-        if (r == 0) return r; //Success
+    const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds);
+    if (node->height > 0) {
+        r = ft_search_child(
+            brt,
+            node,
+            child_to_search,
+            search,
+            getf,
+            getf_v,
+            doprefetch,
+            ftcursor,
+            unlockers,
+            ancestors,
+            &next_bounds,
+            can_bulk_fetch
+            );
+    }
+    else {
+        r = ft_search_basement_node(
+            BLB(node, child_to_search),
+            search,
+            getf,
+            getf_v,
+            doprefetch,
+            ftcursor,
+            can_bulk_fetch
+            );
+    }
+    if (r == 0) {
+        return r; //Success
+    }

-        if (r != DB_NOTFOUND) {
-            return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
+    if (r != DB_NOTFOUND) {
+        return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
+    }
+    // not really necessary, just put this here so that reading the
+    // code becomes simpler. The point is at this point in the code,
+    // we know that we got DB_NOTFOUND and we have to continue
+    assert(r == DB_NOTFOUND);
+    // we have a new pivotkey
+    if (node->height == 0) {
+        // when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
+        const DBT *pivot = nullptr;
+        if (search->direction == FT_SEARCH_LEFT) {
+            pivot = next_bounds.upper_bound_inclusive; // left -> right
+        } else {
+            pivot = next_bounds.lower_bound_exclusive; // right -> left
        }
-        // not really necessary, just put this here so that reading the
-        // code becomes simpler. The point is at this point in the code,
-        // we know that we got DB_NOTFOUND and we have to continue
-        assert(r == DB_NOTFOUND);
-        // we have a new pivotkey
-        if (node->height == 0) {
-            // when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
-            const DBT *pivot = NULL;
-            if (search->direction == FT_SEARCH_LEFT)
-                pivot = next_bounds.upper_bound_inclusive; // left -> right
-            else
-                pivot = next_bounds.lower_bound_exclusive; // right -> left
-            if (pivot) {
-                int rr = getf(pivot->size, pivot->data, 0, NULL, getf_v, true);
-                if (rr != 0)
-                    return rr; // lock was not granted
+        if (pivot != nullptr) {
+            int rr = getf(pivot->size, pivot->data, 0, nullptr, getf_v, true);
+            if (rr != 0) {
+                return rr; // lock was not granted
            }
        }
+    }

-        // If we got a DB_NOTFOUND then we have to search the next record.        Possibly everything present is not visible.
-        // This way of doing DB_NOTFOUND is a kludge, and ought to be simplified.  Something like this is needed for DB_NEXT, but
-        //        for point queries, it's overkill.  If we got a DB_NOTFOUND on a point query then we should just stop looking.
-        // When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
-        // If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
-        // So save the pivot key in the search object.
-        maybe_search_save_bound(node, child_to_search, search);
-
-        // We're about to pin some more nodes, but we thought we were done before.
-        if (search->direction == FT_SEARCH_LEFT) {
-            child_to_search++;
-        }
-        else {
-            child_to_search--;
-        }
+    // If we got a DB_NOTFOUND then we have to search the next record.        Possibly everything present is not visible.
+    // This way of doing DB_NOTFOUND is a kludge, and ought to be simplified.  Something like this is needed for DB_NEXT, but
+    //        for point queries, it's overkill.  If we got a DB_NOTFOUND on a point query then we should just stop looking.
+    // When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
+    // If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
+    // So save the pivot key in the search object.
+    maybe_search_save_bound(node, child_to_search, search);
+    // as part of #5770, if we can continue searching,
+    // we MUST return TOKUDB_TRY_AGAIN,
+    // because there is no guarantee that messages have been applied
+    // on any other path.
+    if ((search->direction == FT_SEARCH_LEFT && child_to_search < node->n_children-1) ||
+        (search->direction == FT_SEARCH_RIGHT && child_to_search > 0)) {
+        r = TOKUDB_TRY_AGAIN;
    }
+
    return r;
 }

@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
            &next_ancestors,
            bounds,
            child_may_find_right ? match_bfe : min_bfe,
-            PL_READ, // may_modify_node is false, because node guaranteed to not change
            false,
            &childnode,
            &msgs_applied
@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
    uint32_t fullhash = compute_child_fullhash(ft->cf, node, childnum);
    FTNODE child;
    bool msgs_applied = false;
-    r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, PL_READ, false, &child, &msgs_applied);
+    r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, false, &child, &msgs_applied);
    paranoid_invariant(!msgs_applied);
    if (r == TOKUDB_TRY_AGAIN) {
        return r;

--- a/ft/tests/orthopush-flush.cc
+++ b/ft/tests/orthopush-flush.cc
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
        struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
        const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
        bool msgs_applied;
-        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied);
+        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied, -1);

        FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                     {
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
        .upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7])
    };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied, -1);

    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
    struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
    const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied, -1);

    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {