Commit 2b4da5c0 authored by Leif Walsh, committed by Yoni Fogel

[t:3315] merge indexed buffers work into mainline

git-svn-id: file:///svn/toku/tokudb@33979 c7de825b-a66e-492c-adef-691d508d4ae1
parent b246fcf4
......@@ -67,6 +67,7 @@ BRT_SOURCES = \
recover \
roll \
rollback \
sort \
sub_block \
ule \
threadpool \
......
......@@ -107,7 +107,8 @@ struct brtnode_fetch_extra {
// used in the case where type == brtnode_fetch_subset
// parameters needed to find out which child needs to be decompressed (so it can be read)
brt_search_t* search;
BRT brt;
DB *cmp_extra;
brt_compare_func cmp;
DBT *range_lock_left_key, *range_lock_right_key;
BOOL left_is_neg_infty, right_is_pos_infty;
// this value will be set during the fetch_callback call by toku_brtnode_fetch_callback or toku_brtnode_pf_req_callback
......@@ -121,11 +122,12 @@ struct brtnode_fetch_extra {
// necessary. Used in cases where the entire node
// is required, such as for flushes.
//
static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struct brt_header *h) {
static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp) {
bfe->type = brtnode_fetch_all;
bfe->h = h;
bfe->search = NULL;
bfe->brt = NULL;
bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = NULL;
bfe->range_lock_right_key = NULL;
bfe->left_is_neg_infty = FALSE;
......@@ -133,7 +135,7 @@ static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struc
bfe->child_to_read = -1;
}
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, BRT brt, BRT_CURSOR c);
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp, BRT_CURSOR c);
//
// Helper function to fill a brtnode_fetch_extra with data
......@@ -142,20 +144,22 @@ static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct
// such as for a point query.
//
static inline void fill_bfe_for_subset_read(
struct brtnode_fetch_extra *bfe,
struct brtnode_fetch_extra *bfe,
struct brt_header *h,
BRT brt,
DB *cmp_extra,
brt_compare_func cmp,
brt_search_t* search,
DBT *left,
DBT *right,
BOOL left_is_neg_infty,
BOOL right_is_pos_infty
)
)
{
bfe->type = brtnode_fetch_subset;
bfe->h = h;
bfe->search = search;
bfe->brt = brt;
bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = (left->data ? left : NULL);
bfe->range_lock_right_key = (right->data ? right : NULL);
bfe->left_is_neg_infty = left_is_neg_infty;
......@@ -169,11 +173,12 @@ static inline void fill_bfe_for_subset_read(
// necessary, only the pivots and/or subtree estimates.
// Currently used for stat64.
//
static inline void fill_bfe_for_min_read(struct brtnode_fetch_extra *bfe, struct brt_header *h) {
static inline void fill_bfe_for_min_read(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp) {
bfe->type = brtnode_fetch_none;
bfe->h = h;
bfe->search = NULL;
bfe->brt = NULL;
bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = NULL;
bfe->range_lock_right_key = NULL;
bfe->left_is_neg_infty = FALSE;
......@@ -197,9 +202,35 @@ static inline void destroy_bfe_for_prefetch(struct brtnode_fetch_extra *bfe) {
}
}
struct toku_fifo_entry_key_msn_heaviside_extra {
DB *cmp_extra;
brt_compare_func cmp;
FIFO fifo;
bytevec key;
ITEMLEN keylen;
MSN msn;
};
// comparison function for inserting messages into a
// brtnode_nonleaf_childinfo's message_tree
int
toku_fifo_entry_key_msn_heaviside(OMTVALUE v, void *extrap);
struct toku_fifo_entry_key_msn_cmp_extra {
DB *cmp_extra;
brt_compare_func cmp;
FIFO fifo;
};
// same thing for qsort_r
int
toku_fifo_entry_key_msn_cmp(void *extrap, const void *ap, const void *bp);
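These two declarations are the same (key, MSN) ordering expressed in the two comparator shapes the new indexes need. A minimal sketch on plain long values (all names below are illustrative, not from the patch): an OMT heaviside compares one stored value against a target carried in extra, while a mergesort_r/qsort_r comparator compares two stored values directly.

    // Heaviside form: where does stored value v sit relative to the target?
    static int long_heaviside(OMTVALUE v, void *extrap) {
        long target = *(long *) extrap;
        long x = (long) v;
        return (x > target) - (x < target);
    }

    // qsort_r form: order two stored values against each other.
    static int long_cmp_r(void *extra, const void *ap, const void *bp) {
        (void) extra; // unused in this sketch
        long a = *(const long *) ap, b = *(const long *) bp;
        return (a > b) - (a < b);
    }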
// data of an available partition of a nonleaf brtnode
struct brtnode_nonleaf_childinfo {
FIFO buffer;
FIFO buffer;
OMT broadcast_buffer;
OMT message_tree;
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
};
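The new OMTs do not copy messages; they store byte offsets into the child's FIFO, so message_tree yields the applies-once messages in (key, MSN) order and broadcast_buffer yields the broadcast messages in arrival order. A hedged sketch of walking one index, assuming the usual toku_omt_iterate(omt, callback, extra) signature from the OMT layer:

    // Visit child i's buffered messages in key/MSN order. Each OMTVALUE is a
    // FIFO byte offset, so visiting is just a lookup into the FIFO arena.
    static int visit_entry(OMTVALUE v, u_int32_t idx, void *vfifo) {
        (void) idx;
        FIFO fifo = vfifo;
        const struct fifo_entry *e = toku_fifo_get_entry(fifo, (long) v);
        (void) e; // inspect e->msn, key bytes, etc., here
        return 0;
    }
    // toku_omt_iterate(BNC_MESSAGE_TREE(node, i), visit_entry, BNC_BUFFER(node, i));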
......@@ -210,7 +241,6 @@ struct brtnode_leaf_basement_node {
unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node.) */
unsigned int seqinsert; /* number of sequential inserts to this leaf */
MSN max_msn_applied; // max message sequence number applied
DSN max_dsn_applied; // max deserialization sequence number applied
};
#define PT_INVALID 0
......@@ -277,7 +307,6 @@ struct __attribute__((__packed__)) brtnode_partition {
struct brtnode {
MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
DSN dsn; // deserialization sequence number
unsigned int nodesize;
unsigned int flags;
BLOCKNUM thisnodename; // Which block number is this node?
......@@ -374,6 +403,8 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) {
// macros for brtnode_nonleaf_childinfo
#define BNC_BUFFER(node,i) (BNC(node,i)->buffer)
#define BNC_BROADCAST_BUFFER(node,i) (BNC(node,i)->broadcast_buffer)
#define BNC_MESSAGE_TREE(node, i) (BNC(node,i)->message_tree)
#define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer)
// brtnode leaf basementnode macros,
......@@ -443,8 +474,6 @@ struct brt_header {
struct toku_list live_brts;
struct toku_list zombie_brts;
struct toku_list checkpoint_before_commit_link;
DSN curr_dsn;
};
struct brt {
......@@ -488,7 +517,7 @@ int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE
BOOL for_checkpoint);
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, struct brt_header *h);
void toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe);
void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum);
void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum, DB *cmp_extra, brt_compare_func cmp);
int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brtnode_fetch_extra* bfe);
unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
......@@ -514,7 +543,7 @@ void toku_assert_entire_node_in_memory(BRTNODE node);
void toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize);
// append a cmd to a nonleaf node child buffer
void toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val);
void toku_brt_append_to_child_buffer(BRT brt, BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val);
#if 1
#define DEADBEEF ((void*)0xDEADBEEF)
......@@ -568,15 +597,20 @@ struct brt_cursor {
};
// this is in a strange place because it needs the cursor struct to be defined
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, BRT brt, BRT_CURSOR c) {
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe,
struct brt_header *h,
DB *cmp_extra,
brt_compare_func cmp,
BRT_CURSOR c) {
bfe->type = brtnode_fetch_prefetch;
bfe->h = h;
bfe->search = NULL;
bfe->brt = brt;
bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
{
const DBT *left = &c->range_lock_left_key;
const DBT *right = &c->range_lock_right_key;
if (left->data) {
if (left->data) {
MALLOC(bfe->range_lock_left_key); resource_assert(bfe->range_lock_left_key);
toku_fill_dbt(bfe->range_lock_left_key, toku_xmemdup(left->data, left->size), left->size);
} else {
......@@ -607,12 +641,13 @@ struct pivot_bounds {
int
toku_brt_search_which_child(
BRT brt,
BRTNODE node,
DB *cmp_extra,
brt_compare_func cmp,
BRTNODE node,
brt_search_t *search
);
bool
bool
toku_bfe_wants_child_available (struct brtnode_fetch_extra* bfe, int childnum);
int
......@@ -645,7 +680,8 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
struct brtnode_fetch_extra *bfe,
BRTNODE *node_p);
void toku_unpin_brtnode (BRT brt, BRTNODE node);
unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t)
unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
DB *cmp_extra, brt_compare_func cmp)
__attribute__((__warn_unused_result__));
/* Stuff for testing */
......
......@@ -4,6 +4,7 @@
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "includes.h"
#include "sort.h"
#include "toku_atomic.h"
#include "threadpool.h"
#include <compress.h>
......@@ -485,7 +486,6 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) {
// Because all messages above have been applied, setting msn of all new basements
// to max msn of existing basements is correct. (There cannot be any messages in
// buffers above that still need to be applied.)
// TODO: assert that all basement DSNs are the same.
static void
rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
{
......@@ -539,11 +539,8 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1);
MSN max_msn = MIN_MSN;
DSN min_dsn = MAX_DSN;
for (int i = 0; i < node->n_children; i++) {
DSN curr_dsn = BLB_MAX_DSN_APPLIED(node,i);
MSN curr_msn = BLB_MAX_MSN_APPLIED(node,i);
min_dsn = (curr_dsn.dsn < min_dsn.dsn) ? curr_dsn : min_dsn;
max_msn = (curr_msn.msn > max_msn.msn) ? curr_msn : max_msn;
}
......@@ -604,7 +601,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
BP_STATE(node,i) = PT_AVAIL;
BP_TOUCH_CLOCK(node,i);
BLB_MAX_DSN_APPLIED(node,i) = min_dsn;
BLB_MAX_MSN_APPLIED(node,i) = max_msn;
}
node->max_msn_applied_to_node_on_disk = max_msn;
......@@ -826,20 +822,46 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
}
static void
deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf) {
deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf,
DB *cmp_extra, brt_compare_func cmp) {
int r;
int n_bytes_in_buffer = 0;
int n_in_this_buffer = rbuf_int(rbuf);
void **offsets;
void **broadcast_offsets;
int noffsets = 0;
int nbroadcast_offsets = 0;
if (cmp) {
MALLOC_N(n_in_this_buffer, offsets);
MALLOC_N(n_in_this_buffer, broadcast_offsets);
}
for (int i = 0; i < n_in_this_buffer; i++) {
bytevec key; ITEMLEN keylen;
bytevec val; ITEMLEN vallen;
int type = rbuf_char(rbuf);
// this is weird, but reading the type as an unsigned char first is necessary to compile cleanly under both icc and gcc
unsigned char ctype = rbuf_char(rbuf);
enum brt_msg_type type = (enum brt_msg_type) ctype;
MSN msn = rbuf_msn(rbuf);
XIDS xids;
xids_create_from_buffer(rbuf, &xids);
rbuf_bytes(rbuf, &key, &keylen); /* Returns a pointer into the rbuf. */
rbuf_bytes(rbuf, &val, &vallen);
//printf("Found %s,%s\n", (char*)key, (char*)val);
int r = toku_fifo_enq(BNC_BUFFER(node, cnum), key, keylen, val, vallen, type, msn, xids); /* Copies the data into the fifo */
long *dest;
if (cmp) {
if (brt_msg_type_applies_once(type)) {
dest = (long *) &offsets[noffsets];
noffsets++;
} else if (brt_msg_type_applies_all(type) || brt_msg_type_does_nothing(type)) {
dest = (long *) &broadcast_offsets[nbroadcast_offsets];
nbroadcast_offsets++;
} else {
assert(FALSE);
}
} else {
dest = NULL;
}
r = toku_fifo_enq(BNC_BUFFER(node, cnum), key, keylen, val, vallen, type, msn, xids, dest); /* Copies the data into the fifo */
lazy_assert_zero(r);
n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
//printf("Inserted\n");
......@@ -847,6 +869,17 @@ deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf) {
}
invariant(rbuf->ndone == rbuf->size);
if (cmp) {
struct toku_fifo_entry_key_msn_cmp_extra extra = { .cmp_extra = cmp_extra, .cmp = cmp, .fifo = BNC_BUFFER(node, cnum) };
r = mergesort_r(offsets, noffsets, sizeof offsets[0], &extra, toku_fifo_entry_key_msn_cmp);
assert_zero(r);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, cnum));
r = toku_omt_create_steal_sorted_array(&BNC_MESSAGE_TREE(node, cnum), &offsets, noffsets, n_in_this_buffer);
assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, cnum));
r = toku_omt_create_steal_sorted_array(&BNC_BROADCAST_BUFFER(node, cnum), &broadcast_offsets, nbroadcast_offsets, n_in_this_buffer);
assert_zero(r);
}
BNC_NBYTESINBUF(node, cnum) = n_bytes_in_buffer;
BP_WORKDONE(node, cnum) = 0;
}
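On deserialization the offsets are collected unsorted, sorted once with mergesort_r, and then handed wholesale to the OMT; toku_omt_create_steal_sorted_array takes ownership of the array, which is why offsets is not freed here and why the index is built in one pass rather than by n individual inserts. A sketch of the mergesort_r calling convention, inferred from the call site above (array, count, element size, extra, comparator) and reusing the illustrative long_cmp_r from earlier:

    static void mergesort_r_example(void) {
        long offs[3] = { 30, 10, 20 };
        int r = mergesort_r(offs, 3, sizeof offs[0], NULL, long_cmp_r);
        assert_zero(r); // offs is now { 10, 20, 30 }
    }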
......@@ -897,7 +930,6 @@ BASEMENTNODE toku_create_empty_bn(void) {
BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
BASEMENTNODE XMALLOC(bn);
bn->max_dsn_applied = MIN_DSN;
bn->max_msn_applied.msn = 0;
bn->buffer = NULL;
bn->n_bytes_in_buffer = 0;
......@@ -910,7 +942,11 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) {
NONLEAF_CHILDINFO XMALLOC(cn);
cn->n_bytes_in_buffer = 0;
int r = toku_fifo_create(&cn->buffer);
assert(r==0);
assert_zero(r);
r = toku_omt_create(&cn->message_tree);
assert_zero(r);
r = toku_omt_create(&cn->broadcast_buffer);
assert_zero(r);
return cn;
}
......@@ -926,6 +962,8 @@ void destroy_basement_node (BASEMENTNODE bn)
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl)
{
toku_fifo_free(&nl->buffer);
toku_omt_destroy(&nl->message_tree);
toku_omt_destroy(&nl->broadcast_buffer);
toku_free(nl);
}
......@@ -1023,8 +1061,6 @@ deserialize_brtnode_info(
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size);
node->dsn = INVALID_DSN;
node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb);
node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb);
......@@ -1087,7 +1123,6 @@ setup_available_brtnode_partition(BRTNODE node, int i) {
if (node->height == 0) {
set_BLB(node, i, toku_create_empty_bn());
BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk;
BLB_MAX_DSN_APPLIED(node,i).dsn = 0;
}
else {
set_BNC(node, i, toku_create_empty_nl());
......@@ -1102,10 +1137,11 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) {
// we can possibly require is a single basement node
// we find out what basement node the query cares about
// and check if it is available
assert(bfe->brt);
assert(bfe->cmp);
assert(bfe->search);
bfe->child_to_read = toku_brt_search_which_child(
bfe->brt,
bfe->cmp_extra,
bfe->cmp,
node,
bfe->search
);
......@@ -1142,31 +1178,32 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) {
}
}
static void
static void
deserialize_brtnode_partition(
struct sub_block *sb,
BRTNODE node,
int index
struct sub_block *sb,
BRTNODE node,
int index,
DB *cmp_extra,
brt_compare_func cmp
)
{
verify_brtnode_sub_block(sb);
u_int32_t data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
// now with the data verified, we can read the information into the node
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size);
u_int32_t start_of_data;
if (node->height > 0) {
unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_FIFO_MSG);
deserialize_child_buffer(node, index, &rb);
deserialize_child_buffer(node, index, &rb, cmp_extra, cmp);
}
else {
unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_OMT_LEAVES);
BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb);
// dont need to set max_dsn_applied because creation of basement node set it to correct value
BLB_SEQINSERT(node, index) = 0;
u_int32_t num_entries = rbuf_int(&rb);
OMTVALUE *XMALLOC_N(num_entries, array);
......@@ -1191,11 +1228,11 @@ deserialize_brtnode_partition(
}
static void
decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, BRTNODE node, int child)
decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, BRTNODE node, int child, DB *cmp_extra, brt_compare_func cmp)
{
read_and_decompress_sub_block(&curr_rbuf, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, child);
deserialize_brtnode_partition(&curr_sb, node, child, cmp_extra, cmp);
toku_free(curr_sb.uncompressed_ptr);
}
......@@ -1306,7 +1343,7 @@ deserialize_brtnode_from_rbuf(
// deserialize_brtnode_info figures out what the state
// should be and sets up the memory so that we are ready to use it
if (BP_STATE(node,i) == PT_AVAIL) {
cilk_spawn decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i);
cilk_spawn decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i, bfe->cmp_extra, bfe->cmp);
}
// case where we leave the partition in the compressed state
else if (BP_STATE(node,i) == PT_COMPRESSED) {
......@@ -1358,13 +1395,13 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset);
lazy_assert((DISKOFF)rlen == curr_size);
}
struct sub_block curr_sb;
sub_block_init(&curr_sb);
read_and_decompress_sub_block(&rb, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, childnum);
deserialize_brtnode_partition(&curr_sb, node, childnum, bfe->cmp_extra, bfe->cmp);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
......@@ -1374,13 +1411,14 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
// Take a brtnode partition that is in the compressed state, and make it avail
void
toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) {
toku_deserialize_bp_from_compressed(BRTNODE node, int childnum,
DB *cmp_extra, brt_compare_func cmp) {
assert(BP_STATE(node, childnum) == PT_COMPRESSED);
SUB_BLOCK curr_sb = BSB(node, childnum);
assert(curr_sb->uncompressed_ptr == NULL);
curr_sb->uncompressed_ptr = toku_xmalloc(curr_sb->uncompressed_size);
setup_available_brtnode_partition(node, childnum);
BP_STATE(node,childnum) = PT_AVAIL;
// decompress the sub_block
......@@ -1390,7 +1428,7 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) {
curr_sb->compressed_ptr,
curr_sb->compressed_size
);
deserialize_brtnode_partition(curr_sb, node, childnum);
deserialize_brtnode_partition(curr_sb, node, childnum, cmp_extra, cmp);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
......@@ -1784,7 +1822,6 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
h->dirty=0;
h->panic = 0;
h->panic_string = 0;
h->curr_dsn.dsn = MIN_DSN.dsn+1;
toku_list_init(&h->live_brts);
toku_list_init(&h->zombie_brts);
toku_list_init(&h->checkpoint_before_commit_link);
......
......@@ -78,18 +78,18 @@ int toku_testsetup_get_sersize(BRT brt, BLOCKNUM diskoff) // Return the size on
assert(testsetup_initialized);
void *node_v;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin(
brt->cf, diskoff,
toku_cachetable_hash(brt->cf, diskoff),
&node_v,
brt->cf, diskoff,
toku_cachetable_hash(brt->cf, diskoff),
&node_v,
NULL,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback,
&bfe,
&bfe,
brt->h
);
assert(r==0);
......@@ -103,21 +103,21 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke
int r;
assert(testsetup_initialized);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
r = toku_cachetable_get_and_pin(
brt->cf,
blocknum,
toku_cachetable_hash(brt->cf, blocknum),
&node_v,
brt->cf,
blocknum,
toku_cachetable_hash(brt->cf, blocknum),
&node_v,
NULL,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback,
&bfe,
&bfe,
brt->h
);
if (r!=0) return r;
......@@ -176,19 +176,19 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t
assert(testsetup_initialized);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
r = toku_cachetable_get_and_pin(
brt->cf,
blocknum,
toku_cachetable_hash(brt->cf, blocknum),
&node_v,
brt->cf,
blocknum,
toku_cachetable_hash(brt->cf, blocknum),
&node_v,
NULL,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_flush_callback,
toku_brtnode_fetch_callback,
toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback,
&bfe,
&bfe,
brt->h
);
if (r!=0) return r;
......@@ -197,12 +197,12 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t
DBT k;
int childnum = toku_brtnode_which_child(node,
toku_fill_dbt(&k, key, keylen),
brt);
toku_fill_dbt(&k, key, keylen),
brt->db, brt->compare_fun);
XIDS xids_0 = xids_get_root_xids();
MSN msn = next_dummymsn();
r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0);
r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0, NULL);
assert(r==0);
// Hack to get the test working. The problem is that this test
// is directly queueing something in a FIFO instead of
......
......@@ -114,7 +114,7 @@ toku_verify_brtnode (BRT brt,
u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum);
{
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin(
brt->cf,
blocknum,
......
......@@ -109,6 +109,7 @@ Split_or_merge (node, childnum) {
#include "roll.h"
#include "toku_atomic.h"
#include "sub_block.h"
#include "sort.h"
#if defined(HAVE_CILK)
#include <cilk/cilk.h>
......@@ -153,16 +154,6 @@ toku_assert_entire_node_in_memory(BRTNODE node) {
}
}
//
// MUST be called with the ydb lock held
//
static void
set_new_DSN_for_node(BRTNODE node, BRT t) {
assert(t->h->curr_dsn.dsn > MIN_DSN.dsn);
node->dsn = t->h->curr_dsn;
t->h->curr_dsn.dsn++;
}
static u_int32_t
get_leaf_num_entries(BRTNODE node) {
u_int32_t result = 0;
......@@ -275,8 +266,8 @@ static long brtnode_memory_size (BRTNODE node);
//
// The intent of toku_pin_brtnode(_holding_lock) is to abstract the process of retrieving a node from
// the rest of brt.c, so that there is only one place where we need to worry about setting
// the DSN and applying ancestor messages to a leaf node. The idea is for all of brt.c (search, splits, merges, flushes, etc)
// the rest of brt.c, so that there is only one place where we need to worry about applying ancestor
// messages to a leaf node. The idea is for all of brt.c (search, splits, merges, flushes, etc)
// to access a node via toku_pin_brtnode(_holding_lock)
//
int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
......@@ -301,9 +292,6 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
unlockers);
if (r==0) {
BRTNODE node = node_v;
if (node->dsn.dsn == INVALID_DSN.dsn) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
......@@ -336,9 +324,6 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
);
assert(r==0);
BRTNODE node = node_v;
if (node->dsn.dsn == INVALID_DSN.dsn) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node;
}
......@@ -427,19 +412,19 @@ fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL d
estimates.exact = TRUE;
int i;
for (i=0; i<child->n_children; i++) {
SUBTREE_EST child_se = &BP_SUBTREE_EST(child,i);
estimates.nkeys += child_se->nkeys;
estimates.ndata += child_se->ndata;
estimates.dsize += child_se->dsize;
if (!child_se->exact) estimates.exact = FALSE;
if (child->height>0) {
if (BP_STATE(child,i) != PT_AVAIL ||
toku_fifo_n_entries(BNC_BUFFER(child,i))!=0)
SUBTREE_EST child_se = &BP_SUBTREE_EST(child,i);
estimates.nkeys += child_se->nkeys;
estimates.ndata += child_se->ndata;
estimates.dsize += child_se->dsize;
if (!child_se->exact) estimates.exact = FALSE;
if (child->height>0) {
if (BP_STATE(child,i) != PT_AVAIL ||
toku_fifo_n_entries(BNC_BUFFER(child,i))!=0)
{
estimates.exact=FALSE;
}
}
}
}
}
}
// We only call this function if we have reason to believe that the child changed.
BP_SUBTREE_EST(node,childnum_of_node) = estimates;
if (dirty_it) {
......@@ -483,7 +468,7 @@ toku_verify_estimates (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bo
u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum);
BRTNODE childnode;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h);
fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, childblocknum, fullhash, &next_ancestors, &next_bounds, &bfe, &childnode);
for (int i=0; i<childnode->n_children; i++) {
child_estimate += BP_SUBTREE_EST(childnode, i).ndata;
......@@ -531,6 +516,8 @@ brtnode_memory_size (BRTNODE node)
NONLEAF_CHILDINFO childinfo = BNC(node, i);
retval += sizeof(*childinfo);
retval += toku_fifo_memory_size(BNC_BUFFER(node, i));
retval += toku_omt_memory_size(BNC_BROADCAST_BUFFER(node, i));
retval += toku_omt_memory_size(BNC_MESSAGE_TREE(node, i));
}
else {
BASEMENTNODE bn = BLB(node, i);
......@@ -584,7 +571,7 @@ toku_bfe_leftmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node)
} else if (bfe->range_lock_left_key == NULL) {
return -1;
} else {
return toku_brtnode_which_child(node, bfe->range_lock_left_key, bfe->brt);
return toku_brtnode_which_child(node, bfe->range_lock_left_key, bfe->cmp_extra, bfe->cmp);
}
}
......@@ -597,7 +584,7 @@ toku_bfe_rightmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node)
} else if (bfe->range_lock_right_key == NULL) {
return -1;
} else {
return toku_brtnode_which_child(node, bfe->range_lock_right_key, bfe->brt);
return toku_brtnode_which_child(node, bfe->range_lock_right_key, bfe->cmp_extra, bfe->cmp);
}
}
......@@ -609,7 +596,7 @@ brt_cursor_rightmost_child_wanted(BRT_CURSOR cursor, BRT brt, BRTNODE node)
} else if (cursor->range_lock_right_key.data == NULL) {
return -1;
} else {
return toku_brtnode_which_child(node, &cursor->range_lock_right_key, brt);
return toku_brtnode_which_child(node, &cursor->range_lock_right_key, brt->db, brt->compare_fun);
}
}
......@@ -792,10 +779,11 @@ BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs) {
// we can possibly require is a single basement node
// we find out what basement node the query cares about
// and check if it is available
assert(bfe->brt);
assert(bfe->cmp);
assert(bfe->search);
bfe->child_to_read = toku_brt_search_which_child(
bfe->brt,
bfe->cmp_extra,
bfe->cmp,
node,
bfe->search
);
......@@ -847,7 +835,7 @@ int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, lon
}
if ((lc <= i && i <= rc) || toku_bfe_wants_child_available(bfe, i)) {
if (BP_STATE(node,i) == PT_COMPRESSED) {
cilk_spawn toku_deserialize_bp_from_compressed(node, i);
cilk_spawn toku_deserialize_bp_from_compressed(node, i, bfe->cmp_extra, bfe->cmp);
}
else if (BP_STATE(node,i) == PT_ON_DISK) {
cilk_spawn toku_deserialize_bp_from_disk(node, i, fd, bfe);
......@@ -897,17 +885,17 @@ toku_cmd_leafval_heaviside (OMTVALUE lev, void *extra) {
}
static int
brt_compare_pivot(BRT brt, const DBT *key, bytevec ck)
brt_compare_pivot(DB *cmp_extra, brt_compare_func cmp, const DBT *key, bytevec ck)
__attribute__((__warn_unused_result__));
static int
brt_compare_pivot(BRT brt, const DBT *key, bytevec ck)
brt_compare_pivot(DB *cmp_extra, brt_compare_func cmp, const DBT *key, bytevec ck)
{
int cmp;
int r;
DBT mydbt;
struct kv_pair *kv = (struct kv_pair *) ck;
cmp = brt->compare_fun(brt->db, key, toku_fill_dbt(&mydbt, kv_pair_key(kv), kv_pair_keylen(kv)));
return cmp;
r = cmp(cmp_extra, key, toku_fill_dbt(&mydbt, kv_pair_key(kv), kv_pair_keylen(kv)));
return r;
}
// destroys the internals of the brtnode, but it does not free the values
......@@ -1025,7 +1013,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
assert(height >= 0);
n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others
n->dsn = INVALID_DSN; // the owner of the node should take responsibility for properly setting this
n->nodesize = nodesize;
n->flags = flags;
n->thisnodename = nodename;
......@@ -1089,12 +1076,6 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
invariant(msna.msn == msnb.msn);
newroot->max_msn_applied_to_node_on_disk = msna;
}
{
DSN dsna = nodea->dsn;
DSN dsnb = nodeb->dsn;
invariant(dsna.dsn == dsnb.dsn);
newroot->dsn = dsna;
}
BP_STATE(newroot,0) = PT_AVAIL;
BP_STATE(newroot,1) = PT_AVAIL;
newroot->dirty = 1;
......@@ -1121,7 +1102,6 @@ toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) {
BRTNODE XMALLOC(n);
toku_initialize_empty_brtnode(n, name, height, n_children, t->h->layout_version, t->h->nodesize, t->flags);
assert(n->nodesize > 0);
set_new_DSN_for_node(n, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, n->thisnodename);
n->fullhash = fullhash;
......@@ -1268,7 +1248,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
// Effect: Split a leaf node.
{
BRTNODE B;
DSN dsn = node->dsn;
//printf("%s:%d splitting leaf %" PRIu64 " which is size %u (targetsize = %u)\n", __FILE__, __LINE__, node->thisnodename.b, toku_serialize_brtnode_size(node), node->nodesize);
assert(node->height==0);
......@@ -1354,8 +1333,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
);
BLB_NBYTESINBUF(node, split_node) -= diff_size;
BLB_NBYTESINBUF(B, 0) += diff_size;
BLB_MAX_DSN_APPLIED(B,0) = BLB_MAX_DSN_APPLIED(node, split_node);
BLB_MAX_MSN_APPLIED(B,0) = BLB_MAX_MSN_APPLIED(node, split_node);
subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff);
add_estimates(&BP_SUBTREE_EST(B,0), &se_diff);
......@@ -1399,9 +1376,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
node->max_msn_applied_to_node_on_disk= max_msn_applied_to_node;
B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1;
B->dirty = 1;
......@@ -1430,7 +1404,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
int n_children_in_a = old_n_children/2;
int n_children_in_b = old_n_children-n_children_in_a;
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
DSN dsn = node->dsn;
BRTNODE B;
assert(node->height>0);
assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split.
......@@ -1481,9 +1454,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
node->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1;
B ->dirty = 1;
toku_assert_entire_node_in_memory(node);
......@@ -1603,7 +1573,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react, ANCESTORS a
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h);
fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t,
BP_BLOCKNUM(node, childnum),
compute_child_fullhash(t->cf, node, childnum),
......@@ -1892,13 +1862,6 @@ brt_leaf_put_cmd (
LEAFENTRY storeddata;
OMTVALUE storeddatav=NULL;
if (cmd->msn.msn <= bn->max_msn_applied.msn) {
brt_status.msn_discards++;
return;
}
else {
bn->max_msn_applied = cmd->msn;
}
u_int32_t omt_size;
int r;
......@@ -2094,34 +2057,87 @@ brt_leaf_put_cmd (
return;
}
static inline int
key_msn_cmp(const DBT *a, const DBT *b, const MSN amsn, const MSN bmsn,
DB *key_cmp_extra, brt_compare_func key_cmp)
{
int r = key_cmp(key_cmp_extra, a, b);
if (r == 0) {
r = (amsn.msn > bmsn.msn) - (amsn.msn < bmsn.msn);
}
return r;
}
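key_msn_cmp orders first by key and breaks ties by MSN, so messages with equal keys keep their arrival order in the tree. The subtraction-of-comparisons idiom, isolated below, yields -1, 0, or +1 and cannot overflow the way casting (a - b) to int could on 64-bit MSN values:

    static inline int msn_sign(MSN a, MSN b) {
        return (a.msn > b.msn) - (a.msn < b.msn);
    }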
int
toku_fifo_entry_key_msn_heaviside(OMTVALUE v, void *extrap)
{
const struct toku_fifo_entry_key_msn_heaviside_extra *extra = extrap;
const long offset = (long) v;
const struct fifo_entry *query = toku_fifo_get_entry(extra->fifo, offset);
DBT qdbt, tdbt;
const DBT *query_key = fill_dbt_for_fifo_entry(&qdbt, query);
const DBT *target_key = toku_fill_dbt(&tdbt, extra->key, extra->keylen);
return key_msn_cmp(query_key, target_key, query->msn, extra->msn,
extra->cmp_extra, extra->cmp);
}
int
toku_fifo_entry_key_msn_cmp(void *extrap, const void *ap, const void *bp)
{
const struct toku_fifo_entry_key_msn_cmp_extra *extra = extrap;
const long ao = *(long *) ap;
const long bo = *(long *) bp;
const struct fifo_entry *a = toku_fifo_get_entry(extra->fifo, ao);
const struct fifo_entry *b = toku_fifo_get_entry(extra->fifo, bo);
DBT adbt, bdbt;
const DBT *akey = fill_dbt_for_fifo_entry(&adbt, a);
const DBT *bkey = fill_dbt_for_fifo_entry(&bdbt, b);
return key_msn_cmp(akey, bkey, a->msn, b->msn,
extra->cmp_extra, extra->cmp);
}
// append a cmd to a nonleaf node's child buffer
// should be static, but used by test programs
void
toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) {
toku_brt_append_to_child_buffer(BRT brt, BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) {
assert(BP_STATE(node,childnum) == PT_AVAIL);
int diff = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
int r = toku_fifo_enq(BNC_BUFFER(node,childnum), key->data, key->size, val->data, val->size, type, msn, xids);
long offset;
int r = toku_fifo_enq(BNC_BUFFER(node, childnum), key->data, key->size, val->data, val->size, type, msn, xids, &offset);
assert_zero(r);
enum brt_msg_type etype = (enum brt_msg_type) type;
if (brt_msg_type_applies_once(etype)) {
struct toku_fifo_entry_key_msn_heaviside_extra extra = { .cmp_extra = brt->db, .cmp = brt->compare_fun, .fifo = BNC_BUFFER(node, childnum), .key = key->data, .keylen = key->size, .msn = msn };
r = toku_omt_insert(BNC_MESSAGE_TREE(node, childnum), (OMTVALUE) offset, toku_fifo_entry_key_msn_heaviside, &extra, NULL);
assert_zero(r);
} else if (brt_msg_type_applies_all(etype) || brt_msg_type_does_nothing(etype)) {
u_int32_t idx = toku_omt_size(BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_insert_at(BNC_BROADCAST_BUFFER(node, childnum), (OMTVALUE) offset, idx);
assert_zero(r);
} else {
assert(FALSE);
}
BNC_NBYTESINBUF(node, childnum) += diff;
node->dirty = 1;
}
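Every enqueued message now lands in the FIFO and is mirrored into exactly one index: message_tree for applies-once types (kept sorted via the heaviside insert) or broadcast_buffer for broadcast types (appended at the end). A hypothetical call site, using only names that appear elsewhere in this patch:

    static void append_example(BRT brt, BRTNODE node) {
        DBT k, v;
        toku_fill_dbt(&k, "hello", 6);
        toku_fill_dbt(&v, "world", 6);
        // Buffers one insert in child 0 and indexes it in that child's message_tree.
        toku_brt_append_to_child_buffer(brt, node, 0, BRT_INSERT, next_dummymsn(),
                                        xids_get_root_xids(), &k, &v);
    }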
static void brt_nonleaf_cmd_once_to_child (BRTNODE node, unsigned int childnum, BRT_MSG cmd)
static void brt_nonleaf_cmd_once_to_child (BRT brt, BRTNODE node, unsigned int childnum, BRT_MSG cmd)
// Previously we had passive-aggressive promotion, but that causes a lot of I/O at the checkpoint. So now we are just putting it in the buffer here.
// Also we don't worry about the node getting overfull here. It's the caller's problem.
{
toku_brt_append_to_child_buffer(node, childnum, cmd->type, cmd->msn, cmd->xids, cmd->u.id.key, cmd->u.id.val);
toku_brt_append_to_child_buffer(brt, node, childnum, cmd->type, cmd->msn, cmd->xids, cmd->u.id.key, cmd->u.id.val);
}
/* find the leftmost child that may contain the key */
unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
DB *cmp_extra, brt_compare_func cmp) {
#define DO_PIVOT_SEARCH_LR 0
#if DO_PIVOT_SEARCH_LR
int i;
for (i=0; i<node->n_children-1; i++) {
int cmp = brt_compare_pivot(t, k, d, node->childkeys[i]);
if (cmp > 0) continue;
if (cmp < 0) return i;
int c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[i]);
if (c > 0) continue;
if (c < 0) return i;
return i;
}
return node->n_children-1;
......@@ -2133,8 +2149,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
// random keys
int i;
for (i = node->n_children-2; i >= 0; i--) {
int cmp = brt_compare_pivot(t, k, d, node->childkeys[i]);
if (cmp > 0) return i+1;
int c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[i]);
if (c > 0) return i+1;
}
return 0;
#endif
......@@ -2145,8 +2161,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
// check the last key to optimize seq insertions
int n = node->n_children-1;
int cmp = brt_compare_pivot(t, k, node->childkeys[n-1]);
if (cmp > 0) return n;
int c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[n-1]);
if (c > 0) return n;
// binary search the pivots
int lo = 0;
......@@ -2154,12 +2170,12 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
int mi;
while (lo < hi) {
mi = (lo + hi) / 2;
cmp = brt_compare_pivot(t, k, node->childkeys[mi]);
if (cmp > 0) {
c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[mi]);
if (c > 0) {
lo = mi+1;
continue;
}
if (cmp < 0) {
if (c < 0) {
hi = mi;
continue;
}
......@@ -2177,87 +2193,39 @@ static void brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_MSG cmd)
/* find the right subtree */
//TODO: accesses key, val directly
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t->db, t->compare_fun);
brt_nonleaf_cmd_once_to_child (node, childnum, cmd);
brt_nonleaf_cmd_once_to_child (t, node, childnum, cmd);
}
static void
brt_nonleaf_cmd_all (BRTNODE node, BRT_MSG cmd)
brt_nonleaf_cmd_all (BRT t, BRTNODE node, BRT_MSG cmd)
// Effect: Put the cmd into a nonleaf node. We put it into all children, possibly causing the children to become reactive.
// We don't do the splitting and merging. That's up to the caller after doing all the puts it wants to do.
// The re_array[i] gets set to the reactivity of any modified child i. (And there may be several such children.)
{
int i;
for (i = 0; i < node->n_children; i++) {
brt_nonleaf_cmd_once_to_child(node, i, cmd);
brt_nonleaf_cmd_once_to_child(t, node, i, cmd);
}
}
static BOOL
brt_msg_applies_once(BRT_MSG cmd)
{
BOOL ret_val;
//TODO: Accessing type directly
switch (cmd->type) {
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = TRUE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
case BRT_NONE:
ret_val = FALSE;
break;
default:
assert(FALSE);
}
return ret_val;
return brt_msg_type_applies_once(cmd->type);
}
static BOOL
brt_msg_applies_all(BRT_MSG cmd)
{
BOOL ret_val;
//TODO: Accessing type directly
switch (cmd->type) {
case BRT_NONE:
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = FALSE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
ret_val = TRUE;
break;
default:
assert(FALSE);
}
return ret_val;
return brt_msg_type_applies_all(cmd->type);
}
static BOOL
brt_msg_does_nothing(BRT_MSG cmd)
{
return (cmd->type == BRT_NONE);
return brt_msg_type_does_nothing(cmd->type);
}
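The three wrappers above now delegate to type-level predicates so the same classification can be reused where only an enum value is in hand (e.g. in deserialize_child_buffer). A sketch of the applies-once predicate, assuming it partitions enum brt_msg_type exactly as the deleted switch statements did:

    static BOOL brt_msg_type_applies_once_sketch(enum brt_msg_type type) {
        switch (type) {
        case BRT_INSERT_NO_OVERWRITE:
        case BRT_INSERT:
        case BRT_DELETE_ANY:
        case BRT_ABORT_ANY:
        case BRT_COMMIT_ANY:
        case BRT_UPDATE:
            return TRUE;
        default:
            return FALSE;
        }
    }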
static void
......@@ -2287,7 +2255,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
brt_nonleaf_cmd_all (node, cmd); // send message to all children
brt_nonleaf_cmd_all (t, node, cmd); // send message to all children
return;
case BRT_NONE:
return;
......@@ -2469,7 +2437,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes.
{
MSN msn_max;
DSN dsn_max;
assert(a->height == b->height);
toku_assert_entire_node_in_memory(parent);
toku_assert_entire_node_in_memory(a);
......@@ -2483,7 +2450,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
invariant(msn_max.msn <= parent->max_msn_applied_to_node_on_disk.msn); // parent msn must be >= children's msn
}
}
dsn_max = (a->dsn.dsn > b->dsn.dsn) ? a->dsn : b->dsn; // this value is ignored for leafnodes, only basement dsn is use for leafnodes
if (a->height == 0) {
maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk);
} else {
......@@ -2494,8 +2460,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// accurate for non-leaf nodes because buffer immediately above each node has been flushed
a->max_msn_applied_to_node_on_disk = msn_max;
b->max_msn_applied_to_node_on_disk = msn_max;
a->dsn = dsn_max;
b->dsn = dsn_max;
}
}
......@@ -2540,7 +2504,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
struct ancestors next_ancestors = {node, childnuma, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnuma, bounds);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h);
fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnuma), childfullhash, &next_ancestors, &next_bounds, &bfe, &childa);
}
{
......@@ -2548,7 +2512,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
struct ancestors next_ancestors = {node, childnumb, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnumb, bounds);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h);
fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnumb), childfullhash, &next_ancestors, &next_bounds, &bfe, &childb);
}
......@@ -2698,14 +2662,11 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive
static void assert_leaf_up_to_date(BRTNODE node) {
assert(node->height == 0);
toku_assert_entire_node_in_memory(node);
for (int i=0; i < node->n_children; i++) {
assert(BLB_MAX_DSN_APPLIED(node, i).dsn >= MIN_DSN.dsn);
}
}
static void
flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, BOOL is_first_flush, BOOL flush_recursively,
ANCESTORS ancestors, struct pivot_bounds const * const bounds)
ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Effect: Push everything in the CHILDNUMth buffer of node down into the child.
// The child may split or merge as a result of the activity.
// The IS_FIRST_FLUSH variable is a way to prevent the flushing from walking the entire tree. If IS_FIRST_FLUSH==TRUE then we are allowed to flush more than one child, otherwise
......@@ -2721,7 +2682,7 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum);
BRTNODE child;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h);
fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, targetchild, childfullhash, &next_ancestors, &next_bounds, &bfe, &child); // get that child node in, and apply the ancestor messages if it's a leaf.
toku_assert_entire_node_in_memory(node);
......@@ -2729,90 +2690,99 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
VERIFY_NODE(t, child);
FIFO fifo = BNC_BUFFER(node,childnum);
int r;
if (child->height==0) {
// The child is a leaf node.
assert_leaf_up_to_date(child); // The child has all the messages applied to it.
// We've arranged that the path from the root to this child is empty, except for the childnum fifo in node.
// We must empty the fifo, and arrange for the child to be written to disk, and then mark it as clean and up-to-date.
bytevec key, val;
ITEMLEN keylen, vallen;
u_int32_t type;
MSN msn;
XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
int r = toku_fifo_deq(fifo);
assert(r==0);
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
}
toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
node->dirty=TRUE;
child->dirty=TRUE;
fixup_child_estimates(node, childnum, child, TRUE);
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
// The child is a leaf node.
assert_leaf_up_to_date(child); // The child has all the messages applied to it.
// We've arranged that the path from the root to this child is empty, except for the childnum fifo in node.
// We must empty the fifo, and arrange for the child to be written to disk, and then mark it as clean and up-to-date.
bytevec key, val;
ITEMLEN keylen, vallen;
u_int32_t type;
MSN msn;
XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
r = toku_fifo_deq(fifo);
assert(r==0);
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
}
toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
node->dirty=TRUE;
child->dirty=TRUE;
fixup_child_estimates(node, childnum, child, TRUE);
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
} else {
bytevec key,val;
ITEMLEN keylen, vallen;
//printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum));
assert(toku_fifo_n_entries(fifo)>0);
u_int32_t type;
MSN msn;
XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
DBT hk,hv;
bytevec key,val;
ITEMLEN keylen, vallen;
//printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum));
assert(toku_fifo_n_entries(fifo)>0);
u_int32_t type;
MSN msn;
XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
DBT hk,hv;
//TODO: Factor out (into a function) conversion of fifo_entry to message
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id= {toku_fill_dbt(&hk, key, keylen),
toku_fill_dbt(&hv, val, vallen)} };
//TODO: Factor out (into a function) conversion of fifo_entry to message
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id= {toku_fill_dbt(&hk, key, keylen),
toku_fill_dbt(&hv, val, vallen)} };
int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
//printf("%s:%d random_picked\n", __FILE__, __LINE__);
brtnode_put_cmd (t, child, &brtcmd);
//printf("%s:%d random_picked\n", __FILE__, __LINE__);
brtnode_put_cmd (t, child, &brtcmd);
//printf("%s:%d %d=push_a_brt_cmd_down=(); child_did_split=%d (weight=%d)\n", __FILE__, __LINE__, r, child_did_split, BNC_NBYTESINBUF(node, childnum));
//printf("%s:%d %d=push_a_brt_cmd_down=(); child_did_split=%d (weight=%d)\n", __FILE__, __LINE__, r, child_did_split, BNC_NBYTESINBUF(node, childnum));
{
int r = toku_fifo_deq(fifo);
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
assert(r==0);
}
{
r = toku_fifo_deq(fifo);
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
assert(r==0);
}
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
node->dirty = 1;
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
node->dirty = 1;
}
toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__);
// Having pushed all that stuff to a child, do we need to flush the child? We may have to flush it many times if there were lots of messages that just got pushed down.
// If we were to only flush one child, we could possibly end up with a very big node after a while.
// This repeated flushing can cause some inserts to take a long time (possibly walking all over the tree).
// When we get the background flushing working, it may be OK if that happens, but for now, we just flush a little.
if (flush_recursively) {
int n_flushed = 0;
while (nonleaf_node_is_gorged(child) && (is_first_flush || n_flushed==0)) {
// don't do more than one child unless this is the first flush.
flush_some_child(t, child, is_first_flush && n_flushed==0, flush_recursively,
&next_ancestors, &next_bounds);
n_flushed++;
}
}
fixup_child_estimates(node, childnum, child, TRUE);
// Now it's possible that the child needs to be merged or split.
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
}
toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__);
// Having pushed all that stuff to a child, do we need to flush the child? We may have to flush it many times if there were lots of messages that just got pushed down.
// If we were to only flush one child, we could possibly end up with a very big node after a while.
// This repeated flushing can cause some inserts to take a long time (possibly walking all over the tree).
// When we get the background flushing working, it may be OK if that happens, but for now, we just flush a little.
if (flush_recursively) {
int n_flushed = 0;
while (nonleaf_node_is_gorged(child) && (is_first_flush || n_flushed==0)) {
// don't do more than one child unless this is the first flush.
flush_some_child(t, child, is_first_flush && n_flushed==0, flush_recursively,
&next_ancestors, &next_bounds);
n_flushed++;
}
}
fixup_child_estimates(node, childnum, child, TRUE);
// Now it's possible that the child needs to be merged or split.
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
}
}
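Draining a child's FIFO must also empty both OMT indexes, otherwise they would hold dangling offsets into recycled FIFO space; both branches above (and flush_this_height1_child below) repeat the same destroy-then-create sequence. A hypothetical helper that factors out that pattern, using only calls from this patch:

    static void bnc_reset_message_indexes(BRTNODE node, int childnum) {
        int r;
        toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
        r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum));
        resource_assert_zero(r);
        toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
        r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum));
        resource_assert_zero(r);
    }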
......@@ -2863,6 +2833,10 @@ flush_this_height1_child (BRT t, BRTNODE node, int childnum, BRTNODE child)
}
invariant(BNC_NBYTESINBUF(node, childnum) == 0);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
node->dirty=TRUE;
......@@ -2911,73 +2885,56 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd)
brt_nonleaf_put_cmd(t, node, cmd);
}
static BOOL
partition_requires_msg_application(BRTNODE leaf, int childnum, ANCESTORS ancestors) {
invariant(leaf->height == 0);
BOOL requires_msg_application = FALSE;
if (BP_STATE(leaf,childnum) != PT_AVAIL) return FALSE;
for (
ANCESTORS curr_ancestors = ancestors;
curr_ancestors;
curr_ancestors = curr_ancestors->next
)
{
// Note, we compare DSN of each nonleaf ancestor to DSN of relevant basement.
if (curr_ancestors->node->dsn.dsn > BLB_MAX_DSN_APPLIED(leaf,childnum).dsn) {
requires_msg_application = TRUE;
brt_status.dsn_gap++;
break;
}
}
return requires_msg_application;
}
// Effect: applies the cmd to the leaf if the appropriate basement node is in memory.
// If the appropriate basement node is not in memory, then nothing gets applied
// If the appropriate basement node must be in memory, it is the caller's responsibility to ensure
// that it is
void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, ANCESTORS ancestors, uint64_t *workdone) {
void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, ANCESTORS UU(ancestors), uint64_t *workdone) {
VERIFY_NODE(t, node);
// ignore messages that have already been applied to this leaf
if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
BOOL req_msg_app = partition_requires_msg_application(node, childnum, ancestors);
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t->db, t->compare_fun);
// only apply the message if we have an available basement node that is up to date
// we know it is up to date if partition_requires_msg_application returns FALSE
if (BP_STATE(node,childnum) == PT_AVAIL && !req_msg_app) {
brt_leaf_put_cmd(t,
BLB(node, childnum),
&BP_SUBTREE_EST(node, childnum),
cmd,
made_change,
workdone
);
if (BP_STATE(node,childnum) == PT_AVAIL) {
if (cmd->msn.msn > BLB(node, childnum)->max_msn_applied.msn) {
BLB(node, childnum)->max_msn_applied = cmd->msn;
brt_leaf_put_cmd(t,
BLB(node, childnum),
&BP_SUBTREE_EST(node, childnum),
cmd,
made_change,
workdone);
} else {
brt_status.msn_discards++;
}
}
}
else if (brt_msg_applies_all(cmd)) {
bool bn_made_change = false;
for (int childnum=0; childnum<node->n_children; childnum++) {
BOOL req_msg_app = partition_requires_msg_application(node, childnum, ancestors);
bool bn_made_change = false;
for (int childnum=0; childnum<node->n_children; childnum++) {
// only apply the message if we have an available basement node that is up to date
// we know it is up to date if partition_requires_msg_application returns FALSE
if (BP_STATE(node,childnum) == PT_AVAIL && !req_msg_app) {
brt_leaf_put_cmd(
t,
BLB(node, childnum),
&BP_SUBTREE_EST(node,childnum),
cmd,
&bn_made_change,
workdone
);
if (bn_made_change) *made_change = 1;
if (BP_STATE(node,childnum) == PT_AVAIL) {
if (cmd->msn.msn > BLB(node, childnum)->max_msn_applied.msn) {
BLB(node, childnum)->max_msn_applied = cmd->msn;
brt_leaf_put_cmd(t,
BLB(node, childnum),
&BP_SUBTREE_EST(node,childnum),
cmd,
&bn_made_change,
workdone);
if (bn_made_change) *made_change = 1;
} else {
brt_status.msn_discards++;
}
}
}
}
}
else if (!brt_msg_does_nothing(cmd)) {
assert(FALSE);
assert(FALSE);
}
VERIFY_NODE(t, node);
}
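With the DSN machinery gone, freshness is decided per basement node by comparing the message's MSN against max_msn_applied, which only ever moves forward; a message can therefore be applied at most once no matter how many times ancestor buffers are replayed. The guard, isolated as a sketch:

    static inline BOOL bn_should_apply_cmd(BASEMENTNODE bn, BRT_MSG cmd) {
        return (BOOL) (cmd->msn.msn > bn->max_msn_applied.msn);
    }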
......@@ -3051,105 +3008,6 @@ static u_int32_t get_roothash (BRT brt) {
return rh->fullhash;
}
static void apply_cmd_to_in_memory_non_root_leaves (
BRT t,
CACHEKEY nodenum,
u_int32_t fullhash,
BRT_MSG cmd,
BRTNODE parent,
int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone,
bool *made_change_p
);
static void apply_cmd_to_in_memory_non_root_leaves_starting_at_node (BRT t,
BRTNODE node,
BRT_MSG cmd,
BOOL is_root,
BRTNODE parent,
int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone,
bool *made_change_p) {
bool made_change = false;
if (made_change_p == NULL) {
made_change_p = &made_change;
}
// internal node
if (node->height>0) {
if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
if (is_root) // record workdone in root only, if not root then this is a recursive call so just pass along pointer
workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone, made_change_p);
}
else if (brt_msg_applies_all(cmd)) {
for (int childnum=0; childnum<node->n_children; childnum++) {
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
if (is_root)
workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone, made_change_p);
}
}
}
// leaf node
else {
invariant(!is_root);
toku_apply_cmd_to_leaf(t, node, cmd, made_change_p, ancestors, workdone);
}
if (*made_change_p) {
if (parent) {
fixup_child_estimates(parent, parents_childnum, node, FALSE);
} else {
invariant(is_root); // only root has no parent
}
}
}
// apply a single message, stored in root's buffer(s), to all relevant leaves that are in memory
static void apply_cmd_to_in_memory_non_root_leaves (
BRT t,
CACHEKEY nodenum,
u_int32_t fullhash,
BRT_MSG cmd,
BRTNODE parent,
int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone,
bool *made_change_p
)
{
BRTNODE node = NULL;
void *node_v;
int r = toku_cachetable_get_and_pin_if_in_memory(
t->cf,
nodenum,
fullhash,
&node_v
);
if (r) { goto exit; }
node = node_v;
apply_cmd_to_in_memory_non_root_leaves_starting_at_node(t, node, cmd, FALSE, parent, parents_childnum, ancestors, bounds, workdone, made_change_p);
toku_unpin_brtnode(t, node);
exit:
return;
}
CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *roothash) {
*roothash = get_roothash(brt);
return &brt->h->root;
......@@ -3173,7 +3031,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// get the root node
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &bfe, &node);
toku_assert_entire_node_in_memory(node);
cmd->msn.msn = node->max_msn_applied_to_node_on_disk.msn + 1;
......@@ -3190,7 +3048,6 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock)
invariant(cmd->msn.msn == node->max_msn_applied_to_node_on_disk.msn);
if (node->height > 0) {
apply_cmd_to_in_memory_non_root_leaves_starting_at_node(brt, node, cmd, TRUE, NULL, -1, (ANCESTORS)NULL, &infinite_bounds, NULL, NULL);
if (nonleaf_node_is_gorged(node)) {
// No need for a loop here. We only inserted one message, so flushing a single child suffices.
flush_some_child(brt, node, TRUE, TRUE,
......@@ -3634,7 +3491,6 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) {
BRTNODE XMALLOC(node);
toku_initialize_empty_brtnode(node, blocknum, 0, 1, t->h->layout_version, t->h->nodesize, t->flags);
BP_STATE(node,0) = PT_AVAIL;
set_new_DSN_for_node(node, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum);
node->fullhash = fullhash;
......@@ -3794,8 +3650,6 @@ brt_alloc_init_header(BRT t, TOKUTXN txn) {
memset(&t->h->descriptor, 0, sizeof(t->h->descriptor));
t->h->curr_dsn.dsn = MIN_DSN.dsn + 1; // start at MIN_DSN + 1, as MIN_DSN is reserved for basement nodes
r = brt_init_header(t, txn);
if (r != 0) goto died2;
return r;
......@@ -5038,13 +4892,13 @@ static void search_save_bound (brt_search_t *search, DBT *pivot) {
search->have_pivot_bound = TRUE;
}
static BOOL search_pivot_is_bounded (brt_search_t *search, BRT brt, DBT *pivot)
static BOOL search_pivot_is_bounded (brt_search_t *search, DB *cmp_extra, brt_compare_func cmp, DBT *pivot)
// Effect: Return TRUE iff the pivot has already been searched (for fixing #3522.)
// If searching from left to right, if we have already searched all the values less than pivot, we don't want to search again.
// If searching from right to left, if we have already searched all the values greater than pivot, we don't want to search again.
{
if (!search->have_pivot_bound) return TRUE; // isn't bounded.
int comp = brt->compare_fun(brt->db, pivot, &search->pivot_bound);
int comp = cmp(cmp_extra, pivot, &search->pivot_bound);
if (search->direction == BRT_SEARCH_LEFT) {
// searching from left to right. If the comparison function says the pivot is <= something we already compared, don't do it again.
return comp>0;
......@@ -5074,13 +4928,90 @@ static BOOL msg_type_has_key (enum brt_msg_type m) {
assert(0);
}
struct store_fifo_offset_extra {
long *offsets;
int i;
};
static int
store_fifo_offset(OMTVALUE v, u_int32_t UU(idx), void *extrap)
{
struct store_fifo_offset_extra *extra = extrap;
const long offset = (long) v;
extra->offsets[extra->i] = offset;
extra->i++;
return 0;
}
static int
fifo_offset_msn_cmp(void *extrap, const void *va, const void *vb)
{
FIFO fifo = extrap;
const long *ao = va;
const long *bo = vb;
const struct fifo_entry *a = toku_fifo_get_entry(fifo, *ao);
const struct fifo_entry *b = toku_fifo_get_entry(fifo, *bo);
return (a->msn.msn > b->msn.msn) - (a->msn.msn < b->msn.msn);
}
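//
// This comparator is shaped for mergesort_r (added by this commit in sort.h):
// the elements are long offsets into a single FIFO, the FIFO rides along as
// the extra parameter, and offsets sort by ascending MSN. A usage sketch
// mirroring the call site in apply_buffer_messages_to_basement_node below:
//
//   r = mergesort_r(offsets, buffer_size, sizeof offsets[0],
//                   BNC_BUFFER(ancestor, childnum), fifo_offset_msn_cmp);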
static void
do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, int childnum, DBT *lbe_ptr, DBT *ubi_ptr, MSN *max_msn_applied, const struct fifo_entry *entry)
{
ITEMLEN keylen = entry->keylen;
ITEMLEN vallen = entry->vallen;
enum brt_msg_type type = (enum brt_msg_type)entry->type;
MSN msn = entry->msn;
const XIDS xids = (XIDS) &entry->xids_s;
bytevec key = xids_get_end_of_array(xids);
bytevec val = (u_int8_t*)key + entry->keylen;
DBT hk;
toku_fill_dbt(&hk, key, keylen);
assert(!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr));
DBT hv;
BRT_MSG_S brtcmd = { type, msn, xids, .u.id = { &hk, toku_fill_dbt(&hv, val, vallen) } };
bool made_change;
// the messages are in (key,msn) order so all the messages for one key
// in one buffer are in ascending msn order, so it's ok that we don't
// update the basement node's msn until the end
if (brtcmd.msn.msn > bn->max_msn_applied.msn) {
if (brtcmd.msn.msn > max_msn_applied->msn) {
*max_msn_applied = brtcmd.msn;
}
brt_leaf_put_cmd(t, bn, se, &brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum));
} else {
brt_status.msn_discards++;
}
}
struct iterate_do_brt_leaf_put_cmd_extra {
BRT t;
BASEMENTNODE bn;
SUBTREE_EST se;
BRTNODE ancestor;
int childnum;
DBT *lbe_ptr;
DBT *ubi_ptr;
MSN *max_msn_applied;
};
static int
iterate_do_brt_leaf_put_cmd(OMTVALUE v, u_int32_t UU(idx), void *extrap)
{
struct iterate_do_brt_leaf_put_cmd_extra *e = extrap;
const long offset = (long) v;
const struct fifo_entry *entry = toku_fifo_get_entry(BNC_BUFFER(e->ancestor, e->childnum), offset);
do_brt_leaf_put_cmd(e->t, e->bn, e->se, e->ancestor, e->childnum, e->lbe_ptr, e->ubi_ptr, e->max_msn_applied, entry);
return 0;
}
static int
apply_buffer_messages_to_basement_node (
BRT t,
BASEMENTNODE bn,
SUBTREE_EST se,
BRTNODE ancestor,
int childnum,
struct pivot_bounds const * const bounds
)
// Effect: For each message in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node.
......@@ -5090,40 +5021,106 @@ apply_buffer_messages_to_basement_node (
{
assert(0 <= childnum && childnum < ancestor->n_children);
int r = 0;
DBT lbe, ubi; // lbe is lower bound exclusive, ubi is upper bound inclusive
MSN max_msn_applied = MIN_MSN;
u_int32_t lbe, ubi;
DBT lbedbt, ubidbt; // lbe is lower bound exclusive, ubi is upper bound inclusive
DBT *lbe_ptr, *ubi_ptr;
if (bounds->lower_bound_exclusive==NULL) {
lbe_ptr = NULL;
if (bounds->lower_bound_exclusive) {
struct toku_fifo_entry_key_msn_heaviside_extra lbe_extra = {
.cmp_extra = t->db, .cmp = t->compare_fun,
.fifo = BNC_BUFFER(ancestor, childnum),
.key = kv_pair_key((struct kv_pair *) bounds->lower_bound_exclusive),
.keylen = kv_pair_keylen((struct kv_pair *) bounds->lower_bound_exclusive),
.msn = MAX_MSN };
// TODO: get this value and compare it with ubi to see if we even
// need to continue
OMTVALUE found_lb;
r = toku_omt_find(BNC_MESSAGE_TREE(ancestor, childnum),
toku_fifo_entry_key_msn_heaviside, &lbe_extra,
+1, &found_lb, &lbe);
if (r == DB_NOTFOUND) {
// no relevant data, we're done
if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) == 0) {
return 0;
} else {
lbe = 0;
lbe_ptr = NULL;
ubi = 0;
ubi_ptr = NULL;
goto just_apply_broadcast_messages;
}
}
if (bounds->upper_bound_inclusive) {
DBT ubidbt_tmp = kv_pair_key_to_dbt((struct kv_pair *) bounds->upper_bound_inclusive);
const long offset = (long) found_lb;
DBT found_lbedbt;
fill_dbt_for_fifo_entry(&found_lbedbt, toku_fifo_get_entry(BNC_BUFFER(ancestor, childnum), offset));
int c = t->compare_fun(t->db, &found_lbedbt, &ubidbt_tmp);
if (c > 0) {
if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) == 0) {
return 0;
} else {
lbe = 0;
lbe_ptr = NULL;
ubi = 0;
ubi_ptr = NULL;
goto just_apply_broadcast_messages;
}
}
}
lbedbt = kv_pair_key_to_dbt((struct kv_pair *) bounds->lower_bound_exclusive);
lbe_ptr = &lbedbt;
} else {
lbe = kv_pair_key_to_dbt(bounds->lower_bound_exclusive);
lbe_ptr = &lbe;
}
if (bounds->upper_bound_inclusive==NULL) {
ubi_ptr = NULL;
lbe = 0;
lbe_ptr = NULL;
}
if (bounds->upper_bound_inclusive) {
struct toku_fifo_entry_key_msn_heaviside_extra ubi_extra = {
.cmp_extra = t->db, .cmp = t->compare_fun,
.fifo = BNC_BUFFER(ancestor, childnum),
.key = kv_pair_key((struct kv_pair *) bounds->upper_bound_inclusive),
.keylen = kv_pair_keylen((struct kv_pair *) bounds->upper_bound_inclusive),
.msn = MAX_MSN };
r = toku_omt_find(BNC_MESSAGE_TREE(ancestor, childnum),
toku_fifo_entry_key_msn_heaviside, &ubi_extra,
+1, NULL, &ubi);
if (r == DB_NOTFOUND) {
ubi = toku_omt_size(BNC_MESSAGE_TREE(ancestor, childnum));
}
ubidbt = kv_pair_key_to_dbt((struct kv_pair *) bounds->upper_bound_inclusive);
ubi_ptr = &ubidbt;
} else {
ubi = kv_pair_key_to_dbt(bounds->upper_bound_inclusive);
ubi_ptr = &ubi;
}
assert(BP_STATE(ancestor,childnum) == PT_AVAIL);
FIFO_ITERATE(BNC_BUFFER(ancestor, childnum), key, keylen, val, vallen, type, msn, xids,
({
DBT hk;
toku_fill_dbt(&hk, key, keylen);
if ((!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) {
DBT hv;
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk,
toku_fill_dbt(&hv, val, vallen)} };
bool made_change;
brt_leaf_put_cmd(t,
bn, se,
&brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum));
}
}));
//F uint64_t end_workdone = BP_WORKDONE(ancestor, childnum);
// printf(" workdone = %"PRIu64", msndiff = 0x%"PRIx64", ancestorworkdone start, end = %"PRIu64", %"PRIu64"\n",
// workdone_this_leaf_total, node->max_msn_applied_to_node.msn - start_msn.msn, start_workdone, end_workdone);
ubi = toku_omt_size(BNC_MESSAGE_TREE(ancestor, childnum));
ubi_ptr = NULL;
}
just_apply_broadcast_messages:
if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) > 0) {
const int buffer_size = ubi - lbe + toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum));
long *MALLOC_N(buffer_size, offsets);
struct store_fifo_offset_extra sfo_extra = { .offsets = offsets, .i = 0 };
r = toku_omt_iterate_on_range(BNC_MESSAGE_TREE(ancestor, childnum), lbe, ubi, store_fifo_offset, &sfo_extra); assert_zero(r);
r = toku_omt_iterate(BNC_BROADCAST_BUFFER(ancestor, childnum), store_fifo_offset, &sfo_extra); assert_zero(r);
invariant(sfo_extra.i == buffer_size);
r = mergesort_r(offsets, buffer_size, sizeof offsets[0], BNC_BUFFER(ancestor, childnum), fifo_offset_msn_cmp); assert_zero(r);
assert(BP_STATE(ancestor, childnum) == PT_AVAIL);
for (int i = 0; i < buffer_size; ++i) {
const struct fifo_entry *entry = toku_fifo_get_entry(BNC_BUFFER(ancestor, childnum), offsets[i]);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, lbe_ptr, ubi_ptr, &max_msn_applied, entry);
}
toku_free(offsets);
} else {
assert(BP_STATE(ancestor, childnum) == PT_AVAIL);
struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .se = se, .ancestor = ancestor, .childnum = childnum, .lbe_ptr = lbe_ptr, .ubi_ptr = ubi_ptr, .max_msn_applied = &max_msn_applied };
r = toku_omt_iterate_on_range(BNC_MESSAGE_TREE(ancestor, childnum), lbe, ubi, iterate_do_brt_leaf_put_cmd, &iter_extra);
assert_zero(r);
}
if (max_msn_applied.msn > bn->max_msn_applied.msn) {
bn->max_msn_applied = max_msn_applied;
}
return r;
}
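//
// Recap of the path above, as a sketch: the heaviside searches translate the
// pivot bounds into an index range [lbe, ubi) of the partition's message
// tree (keys strictly greater than the exclusive lower bound and at most the
// inclusive upper bound). If broadcast messages exist, their offsets are
// merged with that range and the combined array is sorted by MSN via
// mergesort_r, so messages replay in their original enqueue order; otherwise
// the keyed range is applied directly with toku_omt_iterate_on_range.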
......@@ -5264,17 +5261,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// need to apply messages to each basement node
// TODO: (Zardosht) cilkify this
for (int i = 0; i < node->n_children; i++) {
BOOL requires_msg_application = partition_requires_msg_application(
node,
i,
ancestors
);
if (!requires_msg_application) {
continue;
}
update_stats = TRUE;
int height = 0;
if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE curr_bn = BLB(node, i);
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
......@@ -5292,10 +5280,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// we don't want to check this node again if the next time
// we query it, the msn hasn't changed.
curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
update_stats = TRUE;
}
curr_bn->max_dsn_applied = (curr_ancestors->node->dsn.dsn > curr_bn->max_dsn_applied.dsn)
? curr_ancestors->node->dsn
: curr_bn->max_dsn_applied;
}
}
// Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are
......@@ -5343,8 +5329,6 @@ brt_search_basement_node(
BOOL can_bulk_fetch
)
{
assert(bn->max_dsn_applied.dsn >= MIN_DSN.dsn);
// Now we have to convert from brt_search_t to the heaviside function with a direction. What a pain...
int direction;
......@@ -5483,7 +5467,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso
BLOCKNUM nextchildblocknum = BP_BLOCKNUM(node, i);
u_int32_t nextfullhash = compute_child_fullhash(brt->cf, node, i);
struct brtnode_fetch_extra *MALLOC(bfe);
fill_bfe_for_prefetch(bfe, brt->h, brt, brtcursor);
fill_bfe_for_prefetch(bfe, brt->h, brt->db, brt->compare_fun, brtcursor);
BOOL doing_prefetch = FALSE;
toku_cachefile_prefetch(
brt->cf,
......@@ -5540,7 +5524,8 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
fill_bfe_for_subset_read(
&bfe,
brt->h,
brt,
brt->db,
brt->compare_fun,
search,
&brtcursor->range_lock_left_key,
&brtcursor->range_lock_right_key,
......@@ -5592,18 +5577,19 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
int
toku_brt_search_which_child(
BRT brt,
BRTNODE node,
DB *cmp_extra,
brt_compare_func cmp,
BRTNODE node,
brt_search_t *search
)
{
int c;
DBT pivotkey;
toku_init_dbt(&pivotkey);
/* binary search is overkill for a small array */
int child[node->n_children];
/* scan left to right or right to left depending on the search direction */
for (c = 0; c < node->n_children; c++) {
child[c] = (search->direction == BRT_SEARCH_LEFT) ? c : node->n_children - 1 - c;
......@@ -5612,7 +5598,7 @@ toku_brt_search_which_child(
int p = (search->direction == BRT_SEARCH_LEFT) ? child[c] : child[c] - 1;
struct kv_pair *pivot = node->childkeys[p];
toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
if (search_pivot_is_bounded(search, brt, &pivotkey) && search->compare(search, &pivotkey)) {
if (search_pivot_is_bounded(search, cmp_extra, cmp, &pivotkey) && search->compare(search, &pivotkey)) {
return child[c];
}
}
......@@ -5781,7 +5767,8 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf,
fill_bfe_for_subset_read(
&bfe,
brt->h,
brt,
brt->db,
brt->compare_fun,
search,
&brtcursor->range_lock_left_key,
&brtcursor->range_lock_right_key,
......@@ -6230,7 +6217,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename,
{
//assert(fullhash == toku_cachetable_hash(brt->cf, nodename));
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h);
fill_bfe_for_min_read(&bfe, brt->h, brt->db, brt->compare_fun);
toku_pin_brtnode_holding_lock(brt, nodename, fullhash,
ancestors, bounds, &bfe,
&node);
......@@ -6318,7 +6305,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY root = *rootp;
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h);
fill_bfe_for_min_read(&bfe, brt->h, brt->db, brt->compare_fun);
BRTNODE node;
toku_pin_brtnode_holding_lock(brt, root, fullhash, (ANCESTORS)NULL, &infinite_bounds, &bfe, &node);
......@@ -6344,7 +6331,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
void *node_v;
u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin(
brt->cf,
blocknum,
......@@ -6661,7 +6648,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) {
BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum);
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int rr = toku_cachetable_get_and_pin(
brt->cf,
childblocknum,
......@@ -6706,7 +6693,7 @@ BOOL toku_brt_is_empty_fast (BRT brt)
{
void *node_v;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h);
fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int rr = toku_cachetable_get_and_pin(
brt->cf,
*rootp,
......
......@@ -120,7 +120,7 @@ static void
dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
BRTNODE n;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, h);
fill_bfe_for_full_read(&bfe, h, NULL, NULL);
int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe);
assert(r==0);
assert(n!=0);
......@@ -230,7 +230,7 @@ fragmentation_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra)
frag_help_extra *info = extra;
BRTNODE n;
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->h);
fill_bfe_for_full_read(&bfe, info->h, NULL, NULL);
int r = toku_deserialize_brtnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe);
if (r==0) {
info->blocksizes += size;
......
......@@ -11,6 +11,7 @@
#endif
#define _FILE_OFFSET_BITS 64
#include "toku_assert.h"
#include <db.h>
#include <inttypes.h>
......@@ -60,11 +61,6 @@ typedef struct __toku_msn { u_int64_t msn; } MSN;
#define MIN_MSN ((MSN){(u_int64_t)1000*1000*1000}) // first 1B values reserved for messages created before Dr. No (for upgrade)
#define MAX_MSN ((MSN){UINT64_MAX})
typedef struct __toku_dsn { int64_t dsn; } DSN; // DESERIALIZATION sequence number
#define INVALID_DSN ((DSN){-1})
#define MIN_DSN ((DSN){0})
#define MAX_DSN ((DSN){INT64_MAX})
/* At the brt layer, a FILENUM uniquely identifies an open file.
* At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary.
* With the introduction of the loader (ticket 2216), it is possible for the file that holds
......@@ -123,6 +119,68 @@ enum brt_msg_type {
BRT_UPDATE_BROADCAST_ALL = 15
};
static inline BOOL
brt_msg_type_applies_once(enum brt_msg_type type)
{
BOOL ret_val;
switch (type) {
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = TRUE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
case BRT_NONE:
ret_val = FALSE;
break;
default:
assert(FALSE);
}
return ret_val;
}
static inline BOOL
brt_msg_type_applies_all(enum brt_msg_type type)
{
BOOL ret_val;
switch (type) {
case BRT_NONE:
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = FALSE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
ret_val = TRUE;
break;
default:
assert(FALSE);
}
return ret_val;
}
static inline BOOL
brt_msg_type_does_nothing(enum brt_msg_type type)
{
return (type == BRT_NONE);
}
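//
// The apply path calls brt_msg_applies_once/_all/_does_nothing on a whole
// BRT_MSG; those wrappers are outside this hunk, but presumably delegate to
// the type-based helpers above. A sketch, assuming the message exposes its
// type as cmd->type:
//
//   static inline BOOL brt_msg_applies_all(BRT_MSG cmd) {
//       return brt_msg_type_applies_all((enum brt_msg_type) cmd->type);
//   }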
typedef struct xids_t *XIDS;
typedef struct fifo_msg_t *FIFO_MSG;
/* tree commands */
......
......@@ -42,7 +42,6 @@ static u_int64_t cachetable_puts; // how many times has a newly created
static u_int64_t cachetable_prefetches; // how many times has a block been prefetched into the cachetable?
static u_int64_t cachetable_maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called?
static u_int64_t cachetable_maybe_get_and_pin_hits; // how many times has get_and_pin(_clean) returned with a node?
static u_int64_t cachetable_get_and_pin_if_in_memorys; // how many times has get_and_pin_if_in_memorys been called?
static u_int64_t cachetable_wait_checkpoint; // number of times get_and_pin waits for a node to be written for a checkpoint
static u_int64_t cachetable_misstime; // time spent waiting for disk read
static u_int64_t cachetable_waittime; // time spent waiting for another thread to release lock (e.g. prefetch, writing)
......@@ -1733,42 +1732,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3
return r;
}
int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value)
// Effect: Lookup a key in the cachetable. If it is found then acquire a read lock on the pair, don't update the LRU list, and return success.
// Unlike toku_cachetable_maybe_get_and_pin, which gives up if there is any blocking (e.g., the node is waiting to be checkpointed), this
// version waits.
// Rationale: orthodox pushing needs to get the in-memory state right.
// Don't update the LRU list because we don't want this operation to cause something to stick in memory longer.
{
CACHETABLE ct = cachefile->cachetable;
PAIR p;
int count = 0;
int r = -1;
cachetable_lock(ct);
cachetable_get_and_pin_if_in_memorys++;
for (p=ct->table[fullhash&(ct->table_size-1)]; p; p=p->hash_chain) {
count++;
if (p->key.b==key.b && p->cachefile==cachefile) {
// It's the right block. Now we must wait.
if (p->checkpoint_pending) {
write_pair_for_checkpoint(ct, p, FALSE);
}
rwlock_read_lock(&p->rwlock, ct->mutex);
if (p->state == CTPAIR_INVALID) {
assert(0); // This is the branch that returns ENODEV in the get_and_pin code in the 5.0 branch. Let's just crash now.
}
// do not increment PAIR's clock count.
*value = p->value;
cachetable_hit++;
r = 0;
break;
}
}
note_hash_count(count);
cachetable_unlock(ct);
return r;
}
//Used by shortcut query path.
//Same as toku_cachetable_maybe_get_and_pin except that we don't care if the node is clean or dirty (return the node regardless).
//All other conditions remain the same.
......@@ -2955,7 +2918,6 @@ void toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS s) {
s->prefetches = cachetable_prefetches;
s->maybe_get_and_pins = cachetable_maybe_get_and_pins;
s->maybe_get_and_pin_hits = cachetable_maybe_get_and_pin_hits;
s->get_and_pin_if_in_memorys = cachetable_get_and_pin_if_in_memorys;
s->size_current = ct->size_current;
s->size_limit = ct->size_limit;
s->size_max = ct->size_max;
......
......@@ -232,12 +232,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, u_int32_t /*fullhash
// Returns: If the item is already in memory, then return 0 and store it in the
// void**. If the item is not in memory, then return a nonzero error number.
int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE /*cachefile*/, CACHEKEY /*key*/, u_int32_t /*fullhash*/, void**/*value*/);
// Effect: Get and pin an object if it is in memory, (even if doing so would require blocking, e.g., to wait on a checkpoint).
// This is similar to maybe_get_and_pin except that maybe_get_and_pin won't block waiting on a checkpoint.
// Returns: 0 iff the item is in memory (otherwise return a error)
// Modifies: *value (if returning 0, then the pointer to the value is stored in *value.)
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**);
// Effect: Like maybe get and pin, but may pin a clean pair.
......@@ -403,7 +397,6 @@ typedef struct cachetable_status {
u_int64_t prefetches; // how many times has a block been prefetched into the cachetable?
u_int64_t maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called?
u_int64_t maybe_get_and_pin_hits; // how many times has maybe_get_and_pin(_clean) returned with a node?
u_int64_t get_and_pin_if_in_memorys; // how many times has get_and_pin_if_in_memory been called?
int64_t size_current; // the sum of the sizes of the nodes represented in the cachetable
int64_t size_limit; // the limit to the sum of the node sizes
int64_t size_max; // high water mark of size_current (max value size_current ever had)
......
......@@ -69,7 +69,7 @@ void toku_fifo_size_hint(FIFO fifo, size_t size) {
}
}
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, MSN msn, XIDS xids) {
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, MSN msn, XIDS xids, long *dest) {
int need_space_here = sizeof(struct fifo_entry)
+ keylen + datalen
+ xids_get_size(xids)
......@@ -80,24 +80,26 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d
fifo->memory = toku_malloc(fifo->memory_size);
}
if (fifo->memory_start+need_space_total > fifo->memory_size) {
// Out of memory at the end.
int next_2 = next_power_of_two(need_space_total);
if ((2*next_2 > fifo->memory_size)
|| (8*next_2 < fifo->memory_size)) {
// resize the fifo
char *newmem = toku_malloc(next_2);
char *oldmem = fifo->memory;
if (newmem==0) return ENOMEM;
memcpy(newmem, oldmem+fifo->memory_start, fifo->memory_used);
fifo->memory_size = next_2;
fifo->memory_start = 0;
fifo->memory = newmem;
toku_free(oldmem);
} else {
// slide things over
memmove(fifo->memory, fifo->memory+fifo->memory_start, fifo->memory_used);
fifo->memory_start = 0;
}
// Out of memory at the end.
int next_2 = next_power_of_two(need_space_total);
if ((2*next_2 > fifo->memory_size)
|| (8*next_2 < fifo->memory_size)) {
// resize the fifo
char *newmem = toku_malloc(next_2);
char *oldmem = fifo->memory;
if (newmem==0) return ENOMEM;
memcpy(newmem, oldmem+fifo->memory_start, fifo->memory_used);
fifo->memory_size = next_2;
assert(fifo->memory_start == 0);
fifo->memory_start = 0;
fifo->memory = newmem;
toku_free(oldmem);
} else {
// slide things over
memmove(fifo->memory, fifo->memory+fifo->memory_start, fifo->memory_used);
assert(fifo->memory_start == 0);
fifo->memory_start = 0;
}
}
struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_start + fifo->memory_used);
entry->type = (unsigned char)type;
......@@ -108,13 +110,17 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d
memcpy(e_key, key, keylen);
entry->vallen = datalen;
memcpy(e_key + keylen, data, datalen);
if (dest) {
assert(fifo->memory_start == 0);
*dest = fifo->memory_used;
}
fifo->n_items_in_fifo++;
fifo->memory_used += need_space_here;
return 0;
}
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd) {
return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->msn, cmd->xids);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd, long *dest) {
return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->msn, cmd->xids, dest);
}
/* peek at the head (the oldest entry) of the fifo */
......@@ -193,3 +199,10 @@ unsigned long toku_fifo_memory_size(FIFO fifo) {
return sizeof(*fifo)+fifo->memory_size;
}
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry) {
return toku_fill_dbt(dbt, xids_get_end_of_array((XIDS) &entry->xids_s), entry->keylen);
}
const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off) {
return toku_fifo_iterate_internal_get_entry(fifo, off);
}
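//
// A usage sketch of the new dest out-parameter: a caller that passes a
// non-NULL dest learns the offset of the entry it just enqueued, so the
// offset can be indexed (e.g. in an OMT keyed by (key, msn)) and the entry
// re-read later without scanning the FIFO. The bookkeeping shown here is
// illustrative, not part of this hunk.
//
//   long offset;
//   r = toku_fifo_enq(fifo, key, keylen, val, vallen, type, msn, xids, &offset);
//   if (r == 0) {
//       const struct fifo_entry *e = toku_fifo_get_entry(fifo, offset);
//       assert(e->msn.msn == msn.msn);  // the entry is addressable by its offset
//       // ... record offset in the partition's message tree ...
//   }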
......@@ -44,9 +44,9 @@ void toku_fifo_size_is_stabilized(FIFO);
int toku_fifo_n_entries(FIFO);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd, long *dest);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, MSN msn, XIDS xids);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, MSN msn, XIDS xids, long *dest);
int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, MSN *msn, XIDS *xids);
......@@ -81,6 +81,9 @@ int toku_fifo_iterate_internal_has_more(FIFO fifo, int off);
int toku_fifo_iterate_internal_next(FIFO fifo, int off);
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off);
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry);
const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off);
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "includes.h"
#include "sort.h"
#if defined(HAVE_CILK)
#include <cilk/cilk.h>
#define cilk_worker_count (__cilkrts_get_nworkers())
#else
#define cilk_spawn
#define cilk_sync
#define cilk_for for
#define cilk_worker_count 1
#endif
static int
merge_c(void *vdest, void *va, int an, void *vb, int bn, int width,
void *extra, int (*cmp)(void *, const void *, const void *))
{
char *dest = vdest, *a = va, *b = vb;
while (an > 0 && bn > 0) {
int c = cmp(extra, a, b);
if (c < 0) {
memcpy(dest, a, width);
dest+=width; a+=width; an--;
} else {
memcpy(dest, b, width);
dest+=width; b+=width; bn--;
}
}
if (an > 0) {
memcpy(dest, a, an * width);
}
if (bn > 0) {
memcpy(dest, b, bn * width);
}
return 0;
}
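// binsearch returns the position (offset by abefore) at which key would
// insert into the sorted array a[0..n); merge() below uses it to split the
// shorter run at the longer run's median before recursing on both halves in
// parallel.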
static int
binsearch(void *key, void *va, int n, int abefore, int width,
void *extra, int (*cmp)(void *, const void *, const void *))
{
if (n == 0) {
return abefore;
}
char *a = va;
int mid = n / 2;
void *akey = a + mid * width;
int c = cmp(extra, key, akey);
if (c == 0) {
// this won't happen because msns are unique, but is here for completeness
return abefore + mid;
} else if (c < 0) {
if (n == 1) {
return abefore;
} else {
return binsearch(key, a, mid, abefore, width, extra, cmp);
}
} else {
if (n == 1) {
return abefore + 1;
} else {
return binsearch(key, a+mid*width, n-mid, abefore+mid, width, extra, cmp);
}
}
}
static int
merge(void *vdest, void *va, int an, void *vb, int bn, int width,
void *extra, int (*cmp)(void *, const void *, const void *))
{
if (an + bn < 10000) {
return merge_c(vdest, va, an, vb, bn, width, extra, cmp);
}
char *dest = vdest, *a = va, *b = vb;
if (an < bn) {
char *tmp1 = a; a = b; b = tmp1;
int tmp2 = an; an = bn; bn = tmp2;
}
int a2 = an/2;
void *akey = a + a2 * width;
int b2 = binsearch(akey, b, bn, 0, width, extra, cmp);
int ra, rb;
ra = cilk_spawn merge(dest, a, a2, b, b2, width, extra, cmp);
rb = merge(dest+(a2+b2)*width, a+a2*width, an-a2, b+b2*width, bn-b2, width, extra, cmp);
cilk_sync;
if (ra != 0) return ra;
return rb;
}
int
mergesort_r(void *va, int n, int width,
void *extra, int (*cmp)(void *, const void *, const void *))
{
const BOOL use_cilk = (n > 10000);
if (n <= 1) { return 0; }
unsigned char *a = va;
int mid = n/2;
int r1, r2;
if (use_cilk) {
r1 = cilk_spawn mergesort_r(a, mid, width, extra, cmp);
} else {
r1 = mergesort_r(a, mid, width, extra, cmp);
}
r2 = mergesort_r(a+mid*width, n-mid, width, extra, cmp);
cilk_sync;
if (r1 != 0) return r1;
if (r2 != 0) return r2;
void *tmp = toku_xmalloc(n * width);
int r;
if (use_cilk) {
r = merge(tmp, a, mid, a+mid*width, n-mid, width, extra, cmp);
} else {
r = merge_c(tmp, a, mid, a+mid*width, n-mid, width, extra, cmp);
}
if (r != 0) {
toku_free(tmp);
return r;
}
memcpy(a, tmp, n*width);
toku_free(tmp);
return 0;
}
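// Note: on equal elements merge_c emits from b first, so the sort is not
// stable; callers in this commit rely on MSNs being unique (see the comment
// in binsearch), so stability never matters here.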
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef SORT_H
#define SORT_H
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#if defined(__cplusplus) || defined(__cilkplusplus)
extern "C" {
#endif
// apes qsort_r, which is not available in CentOS 5's version of libc
// parallelized with Cilk, so probably faster than qsort_r on large arrays
// TODO: switch to qsort_r for small arrays (at the bottom of the recursion);
// this requires figuring out what to do about libc
//
// a: array of elements
// n: number of elements
// width: size of each element in bytes
// extra: extra data for comparison function (passed in as first parameter)
// cmp: comparison function, compatible with qsort_r
//
// Returns 0 on success.
int
mergesort_r(void *a, int n, int width,
void *extra, int (*cmp)(void *, const void *, const void *));
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif
#endif
......@@ -84,18 +84,25 @@ enum brtnode_verify_type {
read_none
};
static int
string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
char *s = a->data, *t = b->data;
return strcmp(s, t);
}
static void
setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE *dn) {
int r;
if (bft == read_all) {
struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt_h);
fill_bfe_for_full_read(&bfe, brt_h, NULL, string_key_cmp);
r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe);
assert(r==0);
}
else if (bft == read_compressed || bft == read_none) {
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt_h);
fill_bfe_for_min_read(&bfe, brt_h, NULL, string_key_cmp);
r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe);
assert(r==0);
// assert all bp's are compressed
......@@ -118,7 +125,7 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE
}
}
// now decompress them
fill_bfe_for_full_read(&bfe, brt_h);
fill_bfe_for_full_read(&bfe, brt_h, NULL, string_key_cmp);
assert(toku_brtnode_pf_req_callback(*dn, &bfe));
long size;
r = toku_brtnode_pf_callback(*dn, &bfe, fd, &size);
......@@ -1067,9 +1074,9 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
r = xids_create_child(xids_123, &xids_234, (TXNID)234);
CKERR(r);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, next_dummymsn(), xids_0); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, next_dummymsn(), xids_123); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, next_dummymsn(), xids_0, NULL); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, next_dummymsn(), xids_123, NULL); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234, NULL); assert(r==0);
BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123);
BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234);
//Cleanup:
......
......@@ -154,7 +154,7 @@ static void verify_cachetable_against_present (void) {
for (i=0; i<my_n_present; i++) {
void *v;
u_int32_t fullhash = toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key);
int r=toku_cachetable_get_and_pin_if_in_memory(my_present_items[i].cf,
int r=toku_cachetable_maybe_get_and_pin_clean(my_present_items[i].cf,
my_present_items[i].key,
toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key),
&v);
......
......@@ -57,7 +57,7 @@ test_fifo_enq (int n) {
MSN msn = next_dummymsn();
if (startmsn.msn == ZERO_MSN.msn)
startmsn = msn;
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids); assert(r == 0);
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids, NULL); assert(r == 0);
xids_destroy(&xids);
}
......
......@@ -58,13 +58,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
MSN msn = next_dummymsn();
unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
node->max_msn_applied_to_node_on_disk = msn;
}
}
......@@ -89,7 +89,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
}
toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]);
insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
}
*minkey = minkeys[0];
*maxkey = maxkeys[0];
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "test.h"
#include <stdlib.h>
#include "sort.h"
const int MAX_NUM = 0x0fffffffL;
int MAGIC_EXTRA = 0xd3adb00f;
static int
int_cmp(void *ve, const void *va, const void *vb)
{
int *e = ve;
assert(e);
assert(*e == MAGIC_EXTRA);
const int *a = va, *b = vb;
assert(*a < MAX_NUM);
assert(*b < MAX_NUM);
return (*a > *b) - (*a < *b);
}
static void
check_int_array(int a[], int nelts)
{
assert(a[0] < MAX_NUM);
for (int i = 1; i < nelts; ++i) {
assert(a[i] < MAX_NUM);
assert(a[i-1] <= a[i]);
}
}
static void
zero_array_test(void)
{
mergesort_r(NULL, 0, sizeof(int), NULL, int_cmp);
}
static void
already_sorted_test(int nelts)
{
int *MALLOC_N(nelts, a);
for (int i = 0; i < nelts; ++i) {
a[i] = i;
}
mergesort_r(a, nelts, sizeof a[0], &MAGIC_EXTRA, int_cmp);
check_int_array(a, nelts);
toku_free(a);
}
static void
random_array_test(int nelts)
{
int *MALLOC_N(nelts, a);
for (int i = 0; i < nelts; ++i) {
a[i] = rand() % MAX_NUM;
}
mergesort_r(a, nelts, sizeof a[0], &MAGIC_EXTRA, int_cmp);
check_int_array(a, nelts);
toku_free(a);
}
int
test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__)))
{
zero_array_test();
already_sorted_test(10);
already_sorted_test(1000);
already_sorted_test(10001);
already_sorted_test(10000000);
random_array_test(10);
random_array_test(1000);
random_array_test(10001);
random_array_test(10000000);
return 0;
}
......@@ -44,7 +44,7 @@ static void test_3748 (void) {
if (startmsn.msn == ZERO_MSN.msn)
startmsn = msn;
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids); assert(r == 0);
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids, NULL); assert(r == 0);
xids_destroy(&xids);
}
for (int i=N/10; i<N; i++) {
......
......@@ -62,13 +62,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
MSN msn = next_dummymsn();
unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
// Create bad tree (don't do following):
// node->max_msn_applied_to_node = msn;
......@@ -95,7 +95,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
}
toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]);
insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
}
*minkey = minkeys[0];
*maxkey = maxkeys[0];
......
......@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
}
}
......
......@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
}
}
......
......@@ -48,7 +48,7 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
int k = htonl(maxkey);
maxkey = htonl(k+1);
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
......@@ -56,7 +56,7 @@ insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
}
}
......@@ -80,7 +80,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
}
toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]);
insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
}
*minkey = minkeys[0];
*maxkey = maxkeys[0];
......
......@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
}
static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
}
}
......
......@@ -265,6 +265,7 @@ garbage_collection(ULE ule, OMT snapshot_xids, OMT live_list_reverse) {
done:;
}
/////////////////////////////////////////////////////////////////////////////////
// This is the big enchilada. (Bring Tums.) Note that this level of abstraction
// has no knowledge of the inner structure of either leafentry or msg. It makes
......
......@@ -43,6 +43,13 @@ TXNID uxr_get_txnid(UXRHANDLE uxr);
//1 does much slower debugging
#define GARBAGE_COLLECTION_DEBUG 0
void fast_msg_to_leafentry(
BRT_MSG msg, // message to apply to leafentry
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p) ;
int apply_msg_to_leafentry(BRT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize,
......