refs #5770 Only check one basement node on pin, remove the assumption that adjacent available nodes are query-able.

refs #5770 Only check one basement node on pin, remove the assumption that adjacent
available nodes are query-able.
9a6ba1aa · John Esmet · 06d56d51 · 9a6ba1aa · 9a6ba1aa · 9a6ba1aa
Commit 9a6ba1aa authored Jun 28, 2013 by John Esmet
5 changed files
--- a/ft/ft-cachetable-wrappers.cc
+++ b/ft/ft-cachetable-wrappers.cc
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
        NULL);
 }
+//
+// This function assumes the caller wants to pin the node with a PL_READ lock.
+// If ancestor messages must be applied, the read pin is released and the node
+// is re-pinned with a PL_WRITE_CHEAP lock instead.
+//
 int
 toku_pin_ftnode_batched(
    FT_HANDLE brt,
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS bounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied)
 {
    void *node_v;
    *msgs_applied = false;
-    pair_lock_type needed_lock_type = lock_type;
+    FTNODE node = nullptr;
-try_again_for_write_lock:
+    MSN max_msn_in_path = ZERO_MSN;
+    bool needs_ancestors_messages = false;
+    // this function assumes that if you want ancestor messages applied,
+    // you are doing a read for a query. This is so we can make some optimizations
+    // below.
+    if (apply_ancestor_messages) {
+        paranoid_invariant(bfe->type == ftnode_fetch_subset);
+    }
    int r = toku_cachetable_get_and_pin_nonblocking_batched(
            brt->ft->cf,
            blocknum,
@@ -221,25 +233,52 @@ try_again_for_write_lock:
            toku_ftnode_fetch_callback,
            toku_ftnode_pf_req_callback,
            toku_ftnode_pf_callback,
-            needed_lock_type,
+            PL_READ,
            bfe, //read_extraargs
            unlockers);
-    if (r==0) {
+    if (r != 0) {
-        FTNODE node = static_cast<FTNODE>(node_v);
+        assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
-        MSN max_msn_in_path;
+        goto exit;
-        bool needs_ancestors_messages = false;
-        if (apply_ancestor_messages && node->height == 0) {
-            needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
-            if (needs_ancestors_messages && needed_lock_type == PL_READ) {
-                toku_unpin_ftnode_read_only(brt->ft, node);
-                needed_lock_type = PL_WRITE_CHEAP;
-                goto try_again_for_write_lock;
-            }
    }
+    node = static_cast<FTNODE>(node_v);
    if (apply_ancestor_messages && node->height == 0) {
+        needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
+            brt->ft, 
+            node, 
+            ancestors, 
+            bounds, 
+            &max_msn_in_path, 
+            bfe->child_to_read
+            );
        if (needs_ancestors_messages) {
-                invariant(needed_lock_type != PL_READ);
+            toku_unpin_ftnode_read_only(brt->ft, node);
-                toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied);
+            int rr = toku_cachetable_get_and_pin_nonblocking_batched(
+                    brt->ft->cf,
+                    blocknum,
+                    fullhash,
+                    &node_v,
+                    NULL,
+                    get_write_callbacks_for_node(brt->ft),
+                    toku_ftnode_fetch_callback,
+                    toku_ftnode_pf_req_callback,
+                    toku_ftnode_pf_callback,
+                    PL_WRITE_CHEAP,
+                    bfe, //read_extraargs
+                    unlockers);
+            if (rr != 0) {
+                assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+                r = TOKUDB_TRY_AGAIN;
+                goto exit;
+            }
+            node = static_cast<FTNODE>(node_v);
+            toku_apply_ancestors_messages_to_node(
+                brt, 
+                node, 
+                ancestors, 
+                bounds, 
+                msgs_applied,
+                bfe->child_to_read
+                );
        } else {
            // At this point, we aren't going to run
            // toku_apply_ancestors_messages_to_node but that doesn't
@@ -264,20 +303,12 @@ try_again_for_write_lock:
            // requires a write lock, and a write lock requires you to
            // resolve checkpointing.
            if (!node->dirty) {
-                    toku_ft_bn_update_max_msn(node, max_msn_in_path);
+                toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
            }
        }
-            invariant(needed_lock_type != PL_READ || !*msgs_applied);
-        }
-        if ((lock_type != PL_READ) && node->height > 0) {
-            toku_move_ftnode_messages_to_stale(brt->ft, node);
    }
    *node_p = node;
-        // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
+exit:
-    } else {
-        assert(r==TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
-        // printf("%*sPin %ld try again\n", 8, "", blocknum.b);
-    }
    return r;
 }

--- a/ft/ft-cachetable-wrappers.h
+++ b/ft/ft-cachetable-wrappers.h
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS pbounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied

--- a/ft/ft-internal.h
+++ b/ft/ft-internal.h
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
 #define VERIFY_NODE(t,n) ((void)0)
 #endif
-//#define FT_TRACE
-#ifdef FT_TRACE
-#define WHEN_FTTRACE(x) x
-#else
-#define WHEN_FTTRACE(x) ((void)0)
-#endif
 void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
 void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
 void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
@@ -982,11 +975,11 @@ struct pivot_bounds {
 __attribute__((nonnull))
 void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
-void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied);
+void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
 __attribute__((nonnull))
-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path);
+bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
 __attribute__((nonnull))
-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied);
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);
 __attribute__((const,nonnull))
 size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);

--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
    }
 }
+static void
+apply_ancestors_messages_to_bn( // Apply pending ancestor messages to a single basement node of `node`.
+    FT_HANDLE t, // handle forwarded unchanged to bnc_apply_messages_to_basement_node
+    FTNODE node, // node that owns the basement node being updated
+    int childnum, // index of the basement node inside `node`
+    ANCESTORS ancestors, // linked list of ancestors, walked through ->next until null
+    struct pivot_bounds const * const bounds, // bounds of `node`; narrowed to `childnum` via next_pivot_keys
+    TXNID oldest_referenced_xid, // forwarded unchanged to bnc_apply_messages_to_basement_node
+    bool* msgs_applied // out-parameter forwarded to the apply call; presumably set when a message is applied -- TODO confirm
+    )
+{
+    BASEMENTNODE curr_bn = BLB(node, childnum); // the one basement node this call works on
+    struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds); // key bounds restricted to this basement node
+    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
+        // Only visit ancestors whose on-disk max MSN is newer than what this
+        // basement node has already recorded as applied.
+        if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
+            paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL); // the ancestor's partition on this path must be available
+            bnc_apply_messages_to_basement_node(
+                t,
+                curr_bn,
+                curr_ancestors->node,
+                curr_ancestors->childnum,
+                &curr_bounds,
+                oldest_referenced_xid,
+                msgs_applied
+                );
+            // Record this ancestor's MSN on the basement node so that the
+            // ancestor is skipped on the next query unless its MSN has
+            // changed in the meantime.
+            curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
+        }
+    }
+    // At this point, we know all the stale messages above this
+    // basement node have been applied, and any new messages will be
+    // fresh, so we don't need to look at stale messages for this
+    // basement node, unless it gets evicted (and this field becomes
+    // false when it's read in again).
+    curr_bn->stale_ancestor_messages_applied = true;
+}
 void
-toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied)
+toku_apply_ancestors_messages_to_node (
+    FT_HANDLE t, 
+    FTNODE node, 
+    ANCESTORS ancestors, 
+    struct pivot_bounds const * const bounds, 
+    bool* msgs_applied, 
+    int child_to_read
+    )
 // Effect:
 //   Bring a leaf node up-to-date according to all the messages in the ancestors.
 //   If the leaf node is already up-to-date then do nothing.
@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
 //   The entire root-to-leaf path is pinned and appears in the ancestors list.
 {
    VERIFY_NODE(t, node);
-    invariant(node->height == 0);
+    paranoid_invariant(node->height == 0);
    TXNID oldest_referenced_xid = ancestors->node->oldest_referenced_xid_known;
    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
@@ -4530,69 +4575,53 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
        }
    }
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        apply_ancestors_messages_to_bn(
+            t,
+            node,
+            child_to_read,
+            ancestors,
+            bounds,
+            oldest_referenced_xid,
+            msgs_applied
+            );
+    }
+    else {
        // know we are a leaf node
        // An important invariant:
-    // We MUST bring every available basement node up to date.
+        // We MUST bring every available basement node for a dirty node up to date.
        // flushing on the cleaner thread depends on this. This invariant
        // allows the cleaner thread to just pick an internal node and flush it
        // as opposed to being forced to start from the root.
        for (int i = 0; i < node->n_children; i++) {
            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
-        BASEMENTNODE curr_bn = BLB(node, i);
+            apply_ancestors_messages_to_bn(
-        struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
-        for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
-            if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > curr_bn->max_msn_applied.msn) {
-                paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
-                bnc_apply_messages_to_basement_node(
                t,
-                    curr_bn,
+                node,
-                    curr_ancestors->node,
+                i,
-                    curr_ancestors->childnum,
+                ancestors,
-                    &curr_bounds,
+                bounds,
                oldest_referenced_xid,
                msgs_applied
                );
-                // We don't want to check this ancestor node again if the
-                // next time we query it, the msn hasn't changed.
-                curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
        }
    }
-        // At this point, we know all the stale messages above this
-        // basement node have been applied, and any new messages will be
-        // fresh, so we don't need to look at stale messages for this
-        // basement node, unless it gets evicted (and this field becomes
-        // false when it's read in again).
-        curr_bn->stale_ancestor_messages_applied = true;
-    }
    VERIFY_NODE(t, node);
 }
-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path)
+static bool bn_needs_ancestors_messages(
-// Effect: Determine whether there are messages in a node's ancestors
+    FT ft,
-//  which must be applied to it.  These messages are in the correct
+    FTNODE node,
-//  keyrange for any available basement nodes, and are in nodes with the
+    int childnum,
-//  correct max_msn_applied_to_node_on_disk.
+    struct pivot_bounds const * const bounds,
-// Notes:
+    ANCESTORS ancestors, 
-//  This is an approximate query.
+    MSN* max_msn_applied
-// Output:
+    ) 
-//  max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
-//    ancestors.  This is used later to update basement nodes'
-//    max_msn_applied values in case we don't do the full algorithm.
-// Returns:
-//  true if there may be some such messages
-//  false only if there are definitely no such messages
-// Rationale:
-//  When we pin a node with a read lock, we want to quickly determine if
-//  we should exchange it for a write lock in preparation for applying
-//  messages.  If there are no messages, we don't need the write lock.
 {
-    invariant(node->height == 0);
+    BASEMENTNODE bn = BLB(node, childnum);
-    MSN max_msn_applied = ZERO_MSN;
+    struct pivot_bounds curr_bounds = next_pivot_keys(node, childnum, bounds);
    bool needs_ancestors_messages = false;
-    for (int i = 0; i < node->n_children; ++i) {
-        if (BP_STATE(node, i) != PT_AVAIL) { continue; }
-        BASEMENTNODE bn = BLB(node, i);
-        struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
    for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
        if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn) {
            paranoid_invariant(BP_STATE(curr_ancestors->node, curr_ancestors->childnum) == PT_AVAIL);
@@ -4627,19 +4656,86 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
                needs_ancestors_messages = true;
                goto cleanup;
            }
-                if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied.msn) {
+            if (curr_ancestors->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied->msn) {
-                    max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
+                max_msn_applied->msn = curr_ancestors->node->max_msn_applied_to_node_on_disk.msn;
+            }
+        }
+    }
+cleanup:
+    return needs_ancestors_messages;
+}
+bool toku_ft_leaf_needs_ancestors_messages(
+    FT ft, 
+    FTNODE node, 
+    ANCESTORS ancestors, 
+    struct pivot_bounds const * const bounds, 
+    MSN *const max_msn_in_path, 
+    int child_to_read
+    )
+// Effect: Determine whether there are messages in a node's ancestors
+//  which must be applied to it.  These messages are in the correct
+//  keyrange for any available basement nodes, and are in nodes with the
+//  correct max_msn_applied_to_node_on_disk.
+// Notes:
+//  This is an approximate query.
+// Output:
+//  max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
+//    ancestors.  This is used later to update basement nodes'
+//    max_msn_applied values in case we don't do the full algorithm.
+// Returns:
+//  true if there may be some such messages
+//  false only if there are definitely no such messages
+// Rationale:
+//  When we pin a node with a read lock, we want to quickly determine if
+//  we should exchange it for a write lock in preparation for applying
+//  messages.  If there are no messages, we don't need the write lock.
+{
+    paranoid_invariant(node->height == 0);
+    bool needs_ancestors_messages = false;
+    // child_to_read may be -1 in test cases
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        needs_ancestors_messages = bn_needs_ancestors_messages(
+            ft,
+            node,
+            child_to_read,
+            bounds,
+            ancestors,
+            max_msn_in_path
+            );
    }
+    else {
+        for (int i = 0; i < node->n_children; ++i) {
+            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
+            needs_ancestors_messages = bn_needs_ancestors_messages(
+                ft,
+                node,
+                i,
+                bounds,
+                ancestors,
+                max_msn_in_path
+                );
+            if (needs_ancestors_messages) {
+                goto cleanup;
            }
        }
    }
-    *max_msn_in_path = max_msn_applied;
 cleanup:
    return needs_ancestors_messages;
 }
-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read) {
    invariant(node->height == 0);
+    if (!node->dirty && child_to_read >= 0) {
+        paranoid_invariant(BP_STATE(node, child_to_read) == PT_AVAIL);
+        BASEMENTNODE bn = BLB(node, child_to_read);
+        if (max_msn_applied.msn > bn->max_msn_applied.msn) {
+            // see comment below
+            (void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
+        }
+    }
+    else {
        for (int i = 0; i < node->n_children; ++i) {
            if (BP_STATE(node, i) != PT_AVAIL) { continue; }
            BASEMENTNODE bn = BLB(node, i);
@@ -4652,6 +4748,7 @@ void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
                (void) toku_sync_val_compare_and_swap(&bn->max_msn_applied.msn, bn->max_msn_applied.msn, max_msn_applied.msn);
            }
        }
+    }
 }
 struct copy_to_stale_extra {
@@ -4779,6 +4876,11 @@ got_a_good_value:
            ftcursor->leaf_info.to_be.omt   = bn->buffer;
            ftcursor->leaf_info.to_be.index = idx;
+            // 
+            // IMPORTANT: bulk fetch CANNOT go past the current basement node,
+            // because there is no guarantee that messages have been applied
+            // to other basement nodes, as part of #5770
+            //
            if (r == TOKUDB_CURSOR_CONTINUE && can_bulk_fetch) {
                r = ft_cursor_shortcut(
                    ftcursor,
@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
    BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
    uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum);
-    FTNODE childnode;
+    FTNODE childnode = nullptr;
    // If the current node's height is greater than 1, then its child is an internal node.
    // Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
                                         unlockers,
                                         &next_ancestors, bounds,
                                         &bfe,
-                                         PL_READ, // we try to get a read lock, but we may upgrade to a write lock on a leaf for message application.
                                         true,
                                         &childnode,
                                         &msgs_applied);
@@ -5090,19 +5191,6 @@ ft_search_node(
    // At this point, we must have the necessary partition available to continue the search
    //
    assert(BP_STATE(node,child_to_search) == PT_AVAIL);
-    while (child_to_search >= 0 && child_to_search < node->n_children) {
-        //
-        // Normally, the child we want to use is available, as we checked
-        // before entering this while loop. However, if we pass through
-        // the loop once, getting DB_NOTFOUND for this first value
-        // of child_to_search, we enter the while loop again with a
-        // child_to_search that may not be in memory. If it is not,
-        // we need to return TOKUDB_TRY_AGAIN so the query can
-        // read the appropriate partition into memory
-        //
-        if (BP_STATE(node,child_to_search) != PT_AVAIL) {
-            return TOKUDB_TRY_AGAIN;
-        }
    const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds);
    if (node->height > 0) {
        r = ft_search_child(
@@ -5131,7 +5219,9 @@ ft_search_node(
            can_bulk_fetch
            );
    }
-        if (r == 0) return r; //Success
+    if (r == 0) {
+        return r; //Success
+    }
    if (r != DB_NOTFOUND) {
        return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
@@ -5143,17 +5233,19 @@ ft_search_node(
    // we have a new pivotkey
    if (node->height == 0) {
        // when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
-            const DBT *pivot = NULL;
+        const DBT *pivot = nullptr;
-            if (search->direction == FT_SEARCH_LEFT)
+        if (search->direction == FT_SEARCH_LEFT) {
            pivot = next_bounds.upper_bound_inclusive; // left -> right
-            else
+        } else {
            pivot = next_bounds.lower_bound_exclusive; // right -> left
-            if (pivot) {
+        }
-                int rr = getf(pivot->size, pivot->data, 0, NULL, getf_v, true);
+        if (pivot != nullptr) {
-                if (rr != 0)
+            int rr = getf(pivot->size, pivot->data, 0, nullptr, getf_v, true);
+            if (rr != 0) {
                return rr; // lock was not granted
            }
        }
+    }
    // If we got a DB_NOTFOUND then we have to search the next record.        Possibly everything present is not visible.
    // This way of doing DB_NOTFOUND is a kludge, and ought to be simplified.  Something like this is needed for DB_NEXT, but
@@ -5162,15 +5254,15 @@ ft_search_node(
    // If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
    // So save the pivot key in the search object.
    maybe_search_save_bound(node, child_to_search, search);
+    // as part of #5770, if we can continue searching,
-        // We're about to pin some more nodes, but we thought we were done before.
+    // we MUST return TOKUDB_TRY_AGAIN,
-        if (search->direction == FT_SEARCH_LEFT) {
+    // because there is no guarantee that messages have been applied
-            child_to_search++;
+    // on any other path.
-        }
+    if ((search->direction == FT_SEARCH_LEFT && child_to_search < node->n_children-1) ||
-        else {
+        (search->direction == FT_SEARCH_RIGHT && child_to_search > 0)) {
-            child_to_search--;
+        r = TOKUDB_TRY_AGAIN;
-        }
    }
    return r;
 }
@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
            &next_ancestors,
            bounds,
            child_may_find_right ? match_bfe : min_bfe,
-            PL_READ, // may_modify_node is false, because node guaranteed to not change
            false,
            &childnode,
            &msgs_applied
@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
    uint32_t fullhash = compute_child_fullhash(ft->cf, node, childnum);
    FTNODE child;
    bool msgs_applied = false;
-    r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, PL_READ, false, &child, &msgs_applied);
+    r = toku_pin_ftnode_batched(ft_h, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, false, &child, &msgs_applied);
    paranoid_invariant(!msgs_applied);
    if (r == TOKUDB_TRY_AGAIN) {
        return r;

--- a/ft/tests/orthopush-flush.cc
+++ b/ft/tests/orthopush-flush.cc
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
        struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
        const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
        bool msgs_applied;
-        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied);
+        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied, -1);
        FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                     {
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
        .upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7])
    };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied, -1);
    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
    struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
    const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied, -1);
    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {