Commit f165ee02 authored by Leif Walsh's avatar Leif Walsh Committed by Yoni Fogel

[t:4002] Committing HOT to main.

git-svn-id: file:///svn/toku/tokudb@38549 c7de825b-a66e-492c-adef-691d508d4ae1
parent e44c7d7a
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -541,6 +545,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......@@ -549,7 +554,7 @@ struct __toku_db {
int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going);
int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags);
int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags);
void* __toku_dummy0[11];
void* __toku_dummy0[10];
char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=236 size=4, 64=bit offset=376 size=8 */
void* __toku_dummy2[5];
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -551,6 +555,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......@@ -559,7 +564,7 @@ struct __toku_db {
int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going);
int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags);
int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags);
void* __toku_dummy0[14];
void* __toku_dummy0[13];
char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=248 size=4, 64=bit offset=400 size=8 */
void* __toku_dummy2[5];
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -553,6 +557,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......@@ -561,7 +566,7 @@ struct __toku_db {
int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going);
int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags);
int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags);
void* __toku_dummy0[16];
void* __toku_dummy0[15];
char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=256 size=4, 64=bit offset=416 size=8 */
void* __toku_dummy2[5];
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -553,6 +557,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......@@ -561,7 +566,7 @@ struct __toku_db {
int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going);
int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags);
int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags);
void* __toku_dummy0[19];
void* __toku_dummy0[18];
char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=268 size=4, 64=bit offset=440 size=8 */
void* __toku_dummy2[5];
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -556,6 +560,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......@@ -564,7 +569,7 @@ struct __toku_db {
int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going);
int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags);
int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags);
void* __toku_dummy1[23];
void* __toku_dummy1[22];
char __toku_dummy2[80];
void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */
void* __toku_dummy3[5];
......
......@@ -590,7 +590,7 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf(" uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */\n");
printf(" uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */\n");
printf(" uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */\n");
printf(" uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */\n");
printf(" uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the \"flush from root\" process to merge a leaf node */\n");
printf(" uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */\n");
printf(" uint64_t flush_in_memory; /* number of in memory flushes */\n");
printf(" uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */\n");
......@@ -616,6 +616,10 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf(" uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */\n");
printf(" uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */\n");
printf(" uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */\n");
printf(" uint64_t hot_num_started; /* number of HOT operations that have begun */\n");
printf(" uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */\n");
printf(" uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */\n");
printf(" uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */\n");
printf(" uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/\n");
printf(" uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/\n");
printf(" uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/\n");
......@@ -804,6 +808,7 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
"int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */",
"int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */",
"int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */",
"int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra)",
"int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION)",
"int (*get_readpagesize)(DB*,u_int32_t*)",
"int (*set_readpagesize)(DB*,u_int32_t)",
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -525,6 +529,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......
......@@ -197,7 +197,7 @@ typedef struct __toku_engine_status {
uint64_t cleaner_max_buffer_workdone; /* max workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_min_buffer_workdone; /* min workdone value of any message buffer flushed by cleaner thread */
uint64_t cleaner_total_buffer_workdone; /* total workdone value of message buffers flushed by cleaner thread */
uint64_t cleaner_num_leaves_unmerged; /* number of leaves left unmerged by the cleaner thread */
uint64_t cleaner_num_dirtied_for_leaf_merge; /* nodes dirtied by the "flush from root" process to merge a leaf node */
uint64_t flush_total; /* total number of flushes done by flusher threads or cleaner threads */
uint64_t flush_in_memory; /* number of in memory flushes */
uint64_t flush_needed_io; /* number of flushes that had to read a child (or part) off disk */
......@@ -223,6 +223,10 @@ typedef struct __toku_engine_status {
uint64_t dirty_leaf; /* number of times leaf nodes are dirtied when previously clean */
uint64_t dirty_nonleaf; /* number of times nonleaf nodes are dirtied when previously clean */
uint64_t balance_leaf; /* number of times a leaf node is balanced inside brt */
uint64_t hot_num_started; /* number of HOT operations that have begun */
uint64_t hot_num_completed; /* number of HOT operations that have successfully completed */
uint64_t hot_num_aborted; /* number of HOT operations that have been aborted */
uint64_t hot_max_root_flush_count; /* max number of flushes from root ever required to optimize a tree */
uint64_t msg_bytes_in; /* how many bytes of messages injected at root (for all trees)*/
uint64_t msg_bytes_out; /* how many bytes of messages flushed from h1 nodes to leaves*/
uint64_t msg_bytes_curr; /* how many bytes of messages currently in trees (estimate)*/
......@@ -525,6 +529,7 @@ struct __toku_db {
int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */;
int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */;
int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */;
int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra);
int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION);
int (*get_readpagesize)(DB*,u_int32_t*);
int (*set_readpagesize)(DB*,u_int32_t);
......
......@@ -51,6 +51,7 @@ BRT_SOURCES = \
brt \
brt-cachetable-wrappers \
brt-flusher \
brt-hot-flusher \
brt_msg \
brt-test-helpers \
cachetable \
......
......@@ -8,7 +8,7 @@ static int brt_root_put_cmd_XY (BRT brt, BRT_MSG *md, TOKUTXN txn) {
if (0) { died0: toku_unpin_brt_header(brt); }
return r;
}
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h);
if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) {
goto died0;
......
......@@ -6,6 +6,7 @@
#include <brt-cachetable-wrappers.h>
#include <brttypes.h>
#include <brt-flusher.h>
#include <brt-internal.h>
#include <cachetable.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef BRT_FLUSHER_INTERNAL
#define BRT_FLUSHER_INTERNAL
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <brttypes.h>
#include <c_dialects.h>
C_BEGIN
typedef struct flusher_advice FLUSHER_ADVICE;
/**
 * Choose a child to flush to. Returns a childnum, or -1 if we should
 * go no further.
 *
 * Flusher threads: pick the heaviest child buffer
 * Cleaner threads: pick the heaviest child buffer
 * Cleaner thread merging leaf nodes: follow down to a key
 * Hot optimize table: follow down to the right of a key
 */
typedef int (*FA_PICK_CHILD)(struct brt_header *h, BRTNODE parent, void* extra);
/**
 * Decide whether to call `flush_some_child` on the child if it is
 * stable and a nonleaf node.
 *
 * Flusher threads: yes if child is gorged
 * Cleaner threads: yes if child is gorged
 * Cleaner thread merging leaf nodes: always yes
 * Hot optimize table: always yes
 */
typedef bool (*FA_SHOULD_RECURSIVELY_FLUSH)(BRTNODE child, void* extra);
/**
 * Called if the child needs merging. Should do something to get the
 * child out of a fusible state. Must unpin parent and child.
 *
 * Flusher threads: just do the merge
 * Cleaner threads: if nonleaf, just merge, otherwise start a "cleaner
 * thread merge"
 * Cleaner thread merging leaf nodes: just do the merge
 * Hot optimize table: just do the merge
 */
typedef void (*FA_MAYBE_MERGE_CHILD)(struct flusher_advice *fa,
struct brt_header *h,
BRTNODE parent,
int childnum,
BRTNODE child,
void* extra);
/**
 * Cleaner threads may need to destroy basement nodes which have been
 * brought more up to date than the height 1 node flushing to them.
 * This function is used to determine if we need to check for basement
 * nodes that are too up to date, and then destroy them if we find
 * them.
 *
 * Flusher threads: no
 * Cleaner threads: yes
 * Cleaner thread merging leaf nodes: no
 * Hot optimize table: no
 */
typedef bool (*FA_SHOULD_DESTROY_BN)(void* extra);
/**
 * Update `brt_flusher_status` in whatever way necessary. Called once
 * by `flush_some_child` right before choosing what to do next (split,
 * merge, recurse), with the number of nodes that were dirtied by this
 * execution of `flush_some_child`.
 */
typedef void (*FA_UPDATE_STATUS)(BRTNODE child, int dirtied, void* extra);
/**
 * Choose whether to go to the left or right child after a split. Called
 * by `brt_split_child`. If -1 is returned, `brt_split_child` defaults to
 * the old behavior.
 */
typedef int (*FA_PICK_CHILD_AFTER_SPLIT)(struct brt_header* h,
BRTNODE node,
int childnuma,
int childnumb,
void* extra);
/**
 * A collection of callbacks used by the flushing machinery to make
 * various decisions. There are implementations of each of these
 * functions for flusher threads (ft_*), cleaner threads (ct_*), and hot
 * optimize table (hot_*).
 */
struct flusher_advice {
FA_PICK_CHILD pick_child;
FA_SHOULD_RECURSIVELY_FLUSH should_recursively_flush;
FA_MAYBE_MERGE_CHILD maybe_merge_child;
FA_SHOULD_DESTROY_BN should_destroy_basement_nodes;
FA_UPDATE_STATUS update_status;
FA_PICK_CHILD_AFTER_SPLIT pick_child_after_split;
void* extra; // parameter passed into callbacks
};
/**
 * Fill in `fa` with the given callback set; `extra` is handed to every
 * callback unchanged.
 */
void
flusher_advice_init(
struct flusher_advice *fa,
FA_PICK_CHILD pick_child,
FA_SHOULD_DESTROY_BN should_destroy_basement_nodes,
FA_SHOULD_RECURSIVELY_FLUSH should_recursively_flush,
FA_MAYBE_MERGE_CHILD maybe_merge_child,
FA_UPDATE_STATUS update_status,
FA_PICK_CHILD_AFTER_SPLIT pick_child_after_split,
void* extra
);
/**
 * Core flushing entry point: flush from `parent` toward one child,
 * making split/merge/recurse decisions through the callbacks in `fa`.
 */
void
flush_some_child(
struct brt_header* h,
BRTNODE parent,
struct flusher_advice *fa);
/* Stock FA_SHOULD_RECURSIVELY_FLUSH callback: always recurse. */
bool
always_recursively_flush(BRTNODE child, void* extra);
/* Stock FA_SHOULD_DESTROY_BN callback: never destroy basement nodes. */
bool
dont_destroy_basement_nodes(void* extra);
/* Stock FA_MAYBE_MERGE_CHILD callback: unpin `child` and perform the merge. */
void
default_merge_child(struct flusher_advice *fa,
struct brt_header *h,
BRTNODE parent,
int childnum,
BRTNODE child,
void* extra);
/* Stock FA_PICK_CHILD_AFTER_SPLIT callback: returns -1 (use old behavior). */
int
default_pick_child_after_split(struct brt_header *h,
BRTNODE parent,
int childnuma,
int childnumb,
void *extra);
C_END
#endif // End of header guardian.
......@@ -3,9 +3,23 @@
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <brt-internal.h>
#include <brt-flusher.h>
#include <brt-flusher-internal.h>
#include <brt-cachetable-wrappers.h>
#include <brt-internal.h>
static BRT_FLUSHER_STATUS_S brt_flusher_status;
// One-time initialization of the file-local flusher status counters.
// The "min" statistics start at UINT64_MAX so the first observed value
// always becomes the new minimum; all other counters rely on the
// zero-initialization of the static struct.
void toku_brt_flusher_status_init(void)
{
brt_flusher_status.cleaner_min_buffer_size = UINT64_MAX;
brt_flusher_status.cleaner_min_buffer_workdone = UINT64_MAX;
}
// Copy the current flusher status counters out to the caller.
// NOTE(review): this is a plain struct copy with no locking visible
// here — presumably a racy-but-harmless snapshot of statistics; confirm
// against callers.
void toku_brt_flusher_get_status(BRT_FLUSHER_STATUS status)
{
*status = brt_flusher_status;
}
#define ft_flush_before_applying_inbox 1
#define ft_flush_before_child_pin 2
......@@ -36,8 +50,8 @@ static void call_flusher_thread_callback(int ft_state) {
}
}
static void
find_heaviest_child(BRTNODE node, int *childnum)
static int
find_heaviest_child(BRTNODE node)
{
int max_child = 0;
int max_weight = toku_bnc_nbytesinbuf(BNC(node, 0)) + BP_WORKDONE(node, 0);
......@@ -56,30 +70,29 @@ find_heaviest_child(BRTNODE node, int *childnum)
max_weight = this_weight;
}
}
*childnum = max_child;
if (0) printf("\n");
return max_child;
}
static void
update_flush_status(BRTNODE UU(parent), BRTNODE child, int cascades, BRT_STATUS brt_status)
update_flush_status(BRTNODE child, int cascades)
{
lazy_assert(brt_status);
brt_status->flush_total++;
brt_flusher_status.flush_total++;
if (cascades > 0) {
brt_status->flush_cascades++;
brt_flusher_status.flush_cascades++;
switch (cascades) {
case 1:
brt_status->flush_cascades_1++; break;
brt_flusher_status.flush_cascades_1++; break;
case 2:
brt_status->flush_cascades_2++; break;
brt_flusher_status.flush_cascades_2++; break;
case 3:
brt_status->flush_cascades_3++; break;
brt_flusher_status.flush_cascades_3++; break;
case 4:
brt_status->flush_cascades_4++; break;
brt_flusher_status.flush_cascades_4++; break;
case 5:
brt_status->flush_cascades_5++; break;
brt_flusher_status.flush_cascades_5++; break;
default:
brt_status->flush_cascades_gt_5++; break;
brt_flusher_status.flush_cascades_gt_5++; break;
}
}
bool flush_needs_io = false;
......@@ -89,9 +102,9 @@ update_flush_status(BRTNODE UU(parent), BRTNODE child, int cascades, BRT_STATUS
}
}
if (flush_needs_io) {
brt_status->flush_needed_io++;
brt_flusher_status.flush_needed_io++;
} else {
brt_status->flush_in_memory++;
brt_flusher_status.flush_in_memory++;
}
}
......@@ -113,6 +126,267 @@ maybe_destroy_child_blbs(BRTNODE node, BRTNODE child)
}
}
static void
brt_merge_child(
struct brt_header* h,
BRTNODE node,
int childnum_to_merge,
BOOL *did_react,
struct flusher_advice *fa);
// FA_PICK_CHILD callback: select the child whose message buffer has the
// greatest weight (buffer bytes plus workdone, per find_heaviest_child).
// The chosen buffer must be non-empty — otherwise there is nothing to
// flush and the caller should not have come here.
static int
pick_heaviest_child(struct brt_header *UU(h),
BRTNODE parent,
void* UU(extra))
{
int childnum = find_heaviest_child(parent);
assert(toku_bnc_n_entries(BNC(parent, childnum))>0);
return childnum;
}
// FA_SHOULD_DESTROY_BN callback: never check for (or destroy)
// overly-up-to-date basement nodes.
bool
dont_destroy_basement_nodes(void* UU(extra))
{
return false;
}
// FA_SHOULD_DESTROY_BN callback: always check for basement nodes that
// are more up to date than the node flushing to them.
static bool
do_destroy_basement_nodes(void* UU(extra))
{
return true;
}
// FA_SHOULD_RECURSIVELY_FLUSH callback: unconditionally continue
// flushing into the child.
bool
always_recursively_flush(BRTNODE UU(child), void* UU(extra))
{
return true;
}
// FA_SHOULD_RECURSIVELY_FLUSH callback: only keep flushing while the
// (nonleaf) child is gorged, as judged by toku_brt_nonleaf_is_gorged.
static bool
recurse_if_child_is_gorged(BRTNODE child, void* UU(extra))
{
return toku_brt_nonleaf_is_gorged(child);
}
// FA_PICK_CHILD_AFTER_SPLIT callback: return -1, which tells
// brt_split_child to fall back to its pre-existing behavior.
int
default_pick_child_after_split(struct brt_header* UU(h),
BRTNODE UU(parent),
int UU(childnuma),
int UU(childnumb),
void* UU(extra))
{
return -1;
}
// FA_MAYBE_MERGE_CHILD callback: merge `child` into a sibling via
// brt_merge_child.  Pin/unpin protocol is delicate here — `child` is
// unpinned first, and brt_merge_child is responsible for unlocking
// `parent` (see comments below).
void
default_merge_child(struct flusher_advice *fa,
struct brt_header *h,
BRTNODE parent,
int childnum,
BRTNODE child,
void* UU(extra))
{
//
// There is probably a way to pass BRTNODE child
// into brt_merge_child, but for simplicity for now,
// we are just going to unpin child and
// let brt_merge_child pin it again
//
toku_unpin_brtnode_off_client_thread(h, child);
//
//
// it is responsibility of brt_merge_child to unlock parent
//
BOOL did_react;
brt_merge_child(h, parent, childnum, &did_react, fa);
}
// Populate a flusher_advice with the given callbacks and opaque `extra`
// pointer.  The advice struct parameterizes flush_some_child and friends:
//   pick_child                    - which child buffer of a node to flush
//   should_destroy_basement_nodes - whether to evict child basements first
//   should_recursively_flush      - whether to keep flushing into the child
//   maybe_merge_child             - how to handle a fusible child
//   update_status                 - per-flush status-counter bookkeeping
//   pick_child_after_split        - which half to continue into post-split
void
flusher_advice_init(
    struct flusher_advice *fa,
    FA_PICK_CHILD pick_child,
    FA_SHOULD_DESTROY_BN should_destroy_basement_nodes,
    FA_SHOULD_RECURSIVELY_FLUSH should_recursively_flush,
    FA_MAYBE_MERGE_CHILD maybe_merge_child,
    FA_UPDATE_STATUS update_status,
    FA_PICK_CHILD_AFTER_SPLIT pick_child_after_split,
    void* extra
    )
{
    fa->pick_child = pick_child;
    fa->should_destroy_basement_nodes = should_destroy_basement_nodes;
    fa->should_recursively_flush = should_recursively_flush;
    fa->maybe_merge_child = maybe_merge_child;
    fa->update_status = update_status;
    fa->pick_child_after_split = pick_child_after_split;
    fa->extra = extra;
}
/**
 * Flusher thread ("normal" flushing) implementation.
 */

// Cascade-depth state threaded through flusher_advice.extra by both the
// flusher-thread (ft_*) and cleaner-thread (ct_*) update_status callbacks.
struct flush_status_update_extra {
    int cascades;  // how many cascading flushes deep the current flush is
};
// flusher_advice update_status callback for flusher threads: record this
// flush (at the current cascade depth) in the global flush counters.
static void
ft_update_status(BRTNODE child,
                 int UU(dirtied),
                 void* extra)
{
    struct flush_status_update_extra *fste = extra;
    update_flush_status(child, fste->cascades);
    // If `flush_some_child` decides to recurse after this, we'll need
    // cascades to increase. If not it doesn't matter.
    fste->cascades++;
}
// Set up the flusher-thread advice: flush the heaviest child, keep
// basement nodes in memory, recurse only while the child is gorged, and
// merge fusible children via default_merge_child.
static void
ft_flusher_advice_init(struct flusher_advice *fa, struct flush_status_update_extra *fste)
{
    fste->cascades = 0;
    flusher_advice_init(fa,
                        pick_heaviest_child,
                        dont_destroy_basement_nodes,
                        recurse_if_child_is_gorged,
                        default_merge_child,
                        ft_update_status,
                        default_pick_child_after_split,
                        fste);
}
// State for the cleaner's "flush from root to merge a leaf" pass
// (ctm = cleaner-thread merge): the pivot key identifying the leaf we
// must reach, and whether that leaf is its parent's last child (in which
// case there is no pivot on its right).
struct ctm_extra {
    BOOL is_last_child;
    DBT target_key;
};
/**
 * flusher_advice pick_child callback for the cleaner's leaf-merge descent:
 * follow the saved pivot key down the tree.  At height 1, when the target
 * leaf is its parent's last child there is no pivot identifying it, so we
 * descend into the last child directly.
 */
static int
ctm_pick_child(struct brt_header *h,
               BRTNODE parent,
               void* extra)
{
    struct ctm_extra* ctme = extra;
    if (parent->height == 1 && ctme->is_last_child) {
        return parent->n_children - 1;
    }
    return toku_brtnode_which_child(parent,
                                    &ctme->target_key,
                                    &h->descriptor,
                                    h->compare_fun);
}
// flusher_advice update_status callback for the cleaner's leaf-merge
// descent: count nodes dirtied on the way down from the root.
static void
ctm_update_status(
    BRTNODE UU(child),
    int dirtied,
    void* UU(extra)
    )
{
    brt_flusher_status.cleaner_num_dirtied_for_leaf_merge += dirtied;
}
// flusher_advice maybe_merge_child callback for the cleaner thread.
//
// Nonleaf children are merged the normal way (default_merge_child).  A
// leaf child is handled indirectly: we save the pivot key identifying the
// leaf, unpin everything, and start a fresh flush from the root with
// advice (ctm_*) that descends along that key and merges the leaf once it
// arrives there.
static void
ct_maybe_merge_child(struct flusher_advice *fa,
                     struct brt_header *h,
                     BRTNODE parent,
                     int childnum,
                     BRTNODE child,
                     void* extra)
{
    if (child->height > 0) {
        default_merge_child(fa, h, parent, childnum, child, extra);
    }
    else {
        struct ctm_extra ctme;
        assert(parent->n_children > 1);
        int pivot_to_save;
        //
        // we have two cases, one where the childnum
        // is the last child, and therefore the pivot we
        // save is not of the pivot which we wish to descend
        // and another where it is not the last child,
        // so the pivot is sufficient for identifying the leaf
        // to be merged
        //
        if (childnum == (parent->n_children - 1)) {
            ctme.is_last_child = TRUE;
            pivot_to_save = childnum - 1;
        }
        else {
            ctme.is_last_child = FALSE;
            pivot_to_save = childnum;
        }
        // Copy the pivot key out of the parent: the parent is unpinned
        // below, so ctme needs its own buffer (freed after the descent).
        struct kv_pair *pivot = parent->childkeys[pivot_to_save];
        size_t pivotlen = kv_pair_keylen(pivot);
        char *buf = toku_xmemdup(kv_pair_key_const(pivot), pivotlen);
        toku_fill_dbt(&ctme.target_key, buf, pivotlen);
        // at this point, ctme is properly setup, now we can do the merge
        struct flusher_advice new_fa;
        flusher_advice_init(
            &new_fa,
            ctm_pick_child,
            dont_destroy_basement_nodes,
            always_recursively_flush,
            default_merge_child,
            ctm_update_status,
            default_pick_child_after_split,
            &ctme);
        // Release our pins before re-entering the tree at the root.
        toku_unpin_brtnode_off_client_thread(h, parent);
        toku_unpin_brtnode_off_client_thread(h, child);
        // grab ydb lock, if it exists, if we are running a brt
        // layer test, there may be no ydb lock and that is ok
        toku_cachetable_call_ydb_lock(h->cf);
        CACHEKEY *rootp;
        u_int32_t fullhash;
        rootp = toku_calculate_root_offset_pointer(h, &fullhash);
        struct brtnode_fetch_extra bfe;
        fill_bfe_for_full_read(&bfe, h);
        BRTNODE root_node;
        toku_pin_brtnode_off_client_thread(h, *rootp, fullhash, &bfe, 0,NULL, &root_node);
        toku_assert_entire_node_in_memory(root_node);
        // release ydb lock, if it exists, if we are running a brt
        // layer test, there may be no ydb lock and that is ok
        toku_cachetable_call_ydb_unlock(h->cf);
        // flush_some_child takes responsibility for unpinning root_node
        // (and everything it pins on the way down).
        flush_some_child(h, root_node, &new_fa);
        toku_free(buf);
    }
}
// flusher_advice update_status callback for the cleaner thread: record the
// flush in the global counters and track how many nodes the cleaner
// dirtied along the way.
static void
ct_update_status(BRTNODE child,
                 int dirtied,
                 void* extra)
{
    struct flush_status_update_extra* fste = extra;
    update_flush_status(child, fste->cascades);
    brt_flusher_status.cleaner_nodes_dirtied += dirtied;
    // Incrementing this in case `flush_some_child` decides to recurse.
    fste->cascades++;
}
// Set up the cleaner-thread advice: like the flusher-thread advice, but
// child basement nodes may be destroyed before flushing, and fusible leaf
// children are merged via ct_maybe_merge_child's flush-from-root scheme.
static void
ct_flusher_advice_init(struct flusher_advice *fa, struct flush_status_update_extra* fste)
{
    fste->cascades = 0;
    flusher_advice_init(fa,
                        pick_heaviest_child,
                        do_destroy_basement_nodes,
                        recurse_if_child_is_gorged,
                        ct_maybe_merge_child,
                        ct_update_status,
                        default_pick_child_after_split,
                        fste);
}
//
// This returns true if the node MAY be reactive,
......@@ -379,8 +653,7 @@ brtleaf_split(
DBT *splitk,
BOOL create_new_node,
u_int32_t num_dependent_nodes,
BRTNODE* dependent_nodes,
BRT_STATUS brt_status)
BRTNODE* dependent_nodes)
// Effect: Split a leaf node.
// Argument "node" is node to be split.
// Upon return:
......@@ -390,7 +663,7 @@ brtleaf_split(
{
invariant(node->height == 0);
brt_status->split_leaf++;
brt_flusher_status.split_leaf++;
if (node->n_children) {
// First move all the accumulated stat64info deltas into the first basement.
// After the split, either both nodes or neither node will be included in the next checkpoint.
......@@ -592,11 +865,10 @@ brt_nonleaf_split(
BRTNODE *nodeb,
DBT *splitk,
u_int32_t num_dependent_nodes,
BRTNODE* dependent_nodes,
BRT_STATUS brt_status)
BRTNODE* dependent_nodes)
{
//VERIFY_NODE(t,node);
brt_status->split_nonleaf++;
brt_flusher_status.split_nonleaf++;
toku_assert_entire_node_in_memory(node);
int old_n_children = node->n_children;
int n_children_in_a = old_n_children/2;
......@@ -660,15 +932,6 @@ brt_nonleaf_split(
*nodeb = B;
}
static void
flush_some_child(
struct brt_header* h,
BRTNODE parent,
int *n_dirtied,
int cascades,
bool started_at_root,
BRT_STATUS brt_status);
//
// responsibility of brt_split_child is to take locked BRTNODEs node and child
// and do the following:
......@@ -683,8 +946,7 @@ brt_split_child(
BRTNODE node,
int childnum,
BRTNODE child,
bool started_at_root,
BRT_STATUS brt_status)
struct flusher_advice *fa)
{
assert(node->height>0);
assert(toku_bnc_nbytesinbuf(BNC(node, childnum))==0); // require that the buffer for this child is empty
......@@ -700,9 +962,9 @@ brt_split_child(
dep_nodes[0] = node;
dep_nodes[1] = child;
if (child->height==0) {
brtleaf_split(h, child, &nodea, &nodeb, &splitk, TRUE, 2, dep_nodes, brt_status);
brtleaf_split(h, child, &nodea, &nodeb, &splitk, TRUE, 2, dep_nodes);
} else {
brt_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes, brt_status);
brt_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
}
// printf("%s:%d child did split\n", __FILE__, __LINE__);
handle_split_of_child (node, childnum, nodea, nodeb, &splitk);
......@@ -714,14 +976,17 @@ brt_split_child(
// now we need to unlock node,
// and possibly continue
// flushing one of the children
int picked_child = fa->pick_child_after_split(h, node, childnum, childnum + 1, fa->extra);
toku_unpin_brtnode_off_client_thread(h, node);
if (nodea->height > 0 && toku_brt_nonleaf_is_gorged(nodea)) {
if (picked_child == childnum ||
(picked_child < 0 && nodea->height > 0 && fa->should_recursively_flush(nodea, fa->extra))) {
toku_unpin_brtnode_off_client_thread(h, nodeb);
flush_some_child(h, nodea, NULL, 0, started_at_root, brt_status);
flush_some_child(h, nodea, fa);
}
else if (nodeb->height > 0 && toku_brt_nonleaf_is_gorged(nodeb)) {
else if (picked_child == childnum + 1 ||
(picked_child < 0 && nodeb->height > 0 && fa->should_recursively_flush(nodeb, fa->extra))) {
toku_unpin_brtnode_off_client_thread(h, nodea);
flush_some_child(h, nodeb, NULL, 0, started_at_root, brt_status);
flush_some_child(h, nodeb, fa);
}
else {
toku_unpin_brtnode_off_client_thread(h, nodea);
......@@ -735,14 +1000,13 @@ flush_this_child(
BRTNODE node,
BRTNODE child,
int childnum,
bool started_at_root,
BRT_STATUS brt_status)
struct flusher_advice *fa)
// Effect: Push everything in the CHILDNUMth buffer of node down into the child.
{
update_flush_status(node, child, 0, brt_status);
update_flush_status(child, 0);
int r;
toku_assert_entire_node_in_memory(node);
if (!started_at_root) {
if (fa->should_destroy_basement_nodes(fa)) {
maybe_destroy_child_blbs(node, child);
}
bring_node_fully_into_memory(child, h);
......@@ -764,9 +1028,9 @@ flush_this_child(
}
static void
merge_leaf_nodes(BRTNODE a, BRTNODE b, BRT_STATUS brt_status)
merge_leaf_nodes(BRTNODE a, BRTNODE b)
{
brt_status->merge_leaf++;
brt_flusher_status.merge_leaf++;
toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b);
assert(a->height == 0);
......@@ -841,19 +1105,18 @@ static int
balance_leaf_nodes(
BRTNODE a,
BRTNODE b,
struct kv_pair **splitk,
BRT_STATUS brt_status)
struct kv_pair **splitk)
// Effect:
// If b is bigger then move stuff from b to a until b is the smaller.
// If a is bigger then move stuff from a to b until a is the smaller.
{
brt_status->balance_leaf++;
brt_flusher_status.balance_leaf++;
DBT splitk_dbt;
// first merge all the data into a
merge_leaf_nodes(a,b, brt_status);
merge_leaf_nodes(a,b);
// now split them
// because we are not creating a new node, we can pass in no dependent nodes
brtleaf_split(NULL, a, &a, &b, &splitk_dbt, FALSE, 0, NULL, brt_status);
brtleaf_split(NULL, a, &a, &b, &splitk_dbt, FALSE, 0, NULL);
*splitk = splitk_dbt.data;
return 0;
......@@ -866,8 +1129,7 @@ maybe_merge_pinned_leaf_nodes(
struct kv_pair *parent_splitk,
BOOL *did_merge,
BOOL *did_rebalance,
struct kv_pair **splitk,
BRT_STATUS brt_status)
struct kv_pair **splitk)
// Effect: Either merge a and b into one one node (merge them into a) and set *did_merge = TRUE.
// (We do this if the resulting node is not fissible)
// or distribute the leafentries evenly between a and b, and set *did_rebalance = TRUE.
......@@ -887,7 +1149,7 @@ maybe_merge_pinned_leaf_nodes(
// one is less than 1/4 of a node, and together they are more than 3/4 of a node.
toku_free(parent_splitk); // We don't need the parent_splitk any more. If we need a splitk (if we don't merge) we'll malloc a new one.
*did_rebalance = TRUE;
int r = balance_leaf_nodes(a, b, splitk, brt_status);
int r = balance_leaf_nodes(a, b, splitk);
assert(r==0);
} else {
// we are merging them.
......@@ -895,7 +1157,7 @@ maybe_merge_pinned_leaf_nodes(
*did_rebalance = FALSE;
*splitk = 0;
toku_free(parent_splitk); // if we are merging, the splitk gets freed.
merge_leaf_nodes(a, b, brt_status);
merge_leaf_nodes(a, b);
}
}
......@@ -906,8 +1168,7 @@ maybe_merge_pinned_nonleaf_nodes(
BRTNODE b,
BOOL *did_merge,
BOOL *did_rebalance,
struct kv_pair **splitk,
BRT_STATUS brt_status)
struct kv_pair **splitk)
{
toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b);
......@@ -938,7 +1199,7 @@ maybe_merge_pinned_nonleaf_nodes(
*did_rebalance = FALSE;
*splitk = NULL;
brt_status->merge_nonleaf++;
brt_flusher_status.merge_nonleaf++;
}
static void
......@@ -949,8 +1210,7 @@ maybe_merge_pinned_nodes(
BRTNODE b,
BOOL *did_merge,
BOOL *did_rebalance,
struct kv_pair **splitk,
BRT_STATUS brt_status)
struct kv_pair **splitk)
// Effect: either merge a and b into one node (merge them into a) and set *did_merge = TRUE.
// (We do this if the resulting node is not fissible)
// or distribute a and b evenly and set *did_merge = FALSE and *did_rebalance = TRUE
......@@ -984,9 +1244,9 @@ maybe_merge_pinned_nodes(
}
}
if (a->height == 0) {
maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, did_rebalance, splitk, brt_status);
maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, did_rebalance, splitk);
} else {
maybe_merge_pinned_nonleaf_nodes(parent_splitk, a, b, did_merge, did_rebalance, splitk, brt_status);
maybe_merge_pinned_nonleaf_nodes(parent_splitk, a, b, did_merge, did_rebalance, splitk);
}
if (*did_merge || *did_rebalance) {
// accurate for leaf nodes because all msgs above have been
......@@ -998,12 +1258,11 @@ maybe_merge_pinned_nodes(
}
static void merge_remove_key_callback(
BLOCKNUM* bp,
BLOCKNUM *bp,
BOOL for_checkpoint,
void* extra
)
void *extra)
{
struct brt_header* h = extra;
struct brt_header *h = extra;
toku_free_blocknum(h->blocktable, bp, h, for_checkpoint);
}
......@@ -1013,12 +1272,11 @@ static void merge_remove_key_callback(
//
static void
brt_merge_child(
struct brt_header* h,
struct brt_header *h,
BRTNODE node,
int childnum_to_merge,
BOOL *did_react,
bool started_at_root,
BRT_STATUS brt_status)
struct flusher_advice *fa)
{
// this function should not be called
// if the child is not mergable
......@@ -1039,7 +1297,6 @@ brt_merge_child(
assert(node->height>0);
// We suspect that at least one of the children is fusible, but they might not be.
// for test
call_flusher_thread_callback(ft_flush_before_merge);
......@@ -1062,10 +1319,10 @@ brt_merge_child(
}
if (toku_bnc_n_entries(BNC(node,childnuma))>0) {
flush_this_child(h, node, childa, childnuma, started_at_root, brt_status);
flush_this_child(h, node, childa, childnuma, fa);
}
if (toku_bnc_n_entries(BNC(node,childnumb))>0) {
flush_this_child(h, node, childb, childnumb, started_at_root, brt_status);
flush_this_child(h, node, childb, childnumb, fa);
}
// now we have both children pinned in main memory, and cachetable locked,
......@@ -1076,7 +1333,7 @@ brt_merge_child(
struct kv_pair *splitk_kvpair = 0;
struct kv_pair *old_split_key = node->childkeys[childnuma];
unsigned int deleted_size = toku_brt_pivot_key_len(old_split_key);
maybe_merge_pinned_nodes(node, node->childkeys[childnuma], childa, childb, &did_merge, &did_rebalance, &splitk_kvpair, brt_status);
maybe_merge_pinned_nodes(node, node->childkeys[childnuma], childa, childb, &did_merge, &did_rebalance, &splitk_kvpair);
if (childa->height>0) { int i; for (i=0; i+1<childa->n_children; i++) assert(childa->childkeys[i]); }
//toku_verify_estimates(t,childa);
// the tree did react if a merge (did_merge) or rebalance (new spkit key) occurred
......@@ -1138,8 +1395,8 @@ brt_merge_child(
toku_unpin_brtnode_off_client_thread(h, node);
toku_unpin_brtnode_off_client_thread(h, childb);
}
if (childa->height > 0 && toku_brt_nonleaf_is_gorged(childa)) {
flush_some_child(h, childa, NULL, 0, started_at_root, brt_status);
if (childa->height > 0 && fa->should_recursively_flush(childa, fa->extra)) {
flush_some_child(h, childa, fa);
}
else {
toku_unpin_brtnode_off_client_thread(h, childa);
......@@ -1172,14 +1429,11 @@ brt_merge_child(
// will have started_at_root==false and anything started by the flusher
// thread will have started_at_root==true, but future mechanisms need to
// be mindful of this issue.
static void
void
flush_some_child(
struct brt_header* h,
struct brt_header *h,
BRTNODE parent,
int *n_dirtied,
int cascades,
bool started_at_root,
BRT_STATUS brt_status)
struct flusher_advice *fa)
// Effect: This function does the following:
// - Pick a child of parent (the heaviest child),
// - flush from parent to child,
......@@ -1189,17 +1443,13 @@ flush_some_child(
// Upon exit of this function, parent is unlocked and no new
// new nodes (such as a child) remain locked
{
bool parent_unpinned = false;
int dirtied = 0;
NONLEAF_CHILDINFO bnc = NULL;
assert(parent->height>0);
toku_assert_entire_node_in_memory(parent);
if (n_dirtied && !parent->dirty) {
(*n_dirtied)++;
}
// pick the child we want to flush to
int childnum;
find_heaviest_child(parent, &childnum);
assert(toku_bnc_n_entries(BNC(parent, childnum))>0);
int childnum = fa->pick_child(h, parent, fa->extra);
// for test
call_flusher_thread_callback(ft_flush_before_child_pin);
......@@ -1216,15 +1466,10 @@ flush_some_child(
fill_bfe_for_min_read(&bfe, h);
toku_pin_brtnode_off_client_thread(h, targetchild, childfullhash, &bfe, 1, &parent, &child);
if (n_dirtied && !child->dirty) {
(*n_dirtied)++;
}
update_flush_status(parent, child, cascades, brt_status);
// for test
call_flusher_thread_callback(ft_flush_after_child_pin);
if (!started_at_root) {
if (fa->should_destroy_basement_nodes(fa)) {
maybe_destroy_child_blbs(parent, child);
}
......@@ -1237,12 +1482,17 @@ flush_some_child(
assert(child->thisnodename.b!=0);
//VERIFY_NODE(brt, child);
// only do the following work if there is a flush to perform
if (toku_bnc_n_entries(BNC(parent, childnum)) > 0) {
if (!parent->dirty) {
dirtied++;
toku_mark_node_dirty(parent);
}
// detach buffer
BP_WORKDONE(parent, childnum) = 0; // this buffer is drained, no work has been done by its contents
NONLEAF_CHILDINFO bnc = BNC(parent, childnum);
bnc = BNC(parent, childnum);
set_BNC(parent, childnum, toku_create_empty_nl());
}
//
// at this point, the buffer has been detached from the parent
......@@ -1252,7 +1502,7 @@ flush_some_child(
//
if (!may_child_be_reactive) {
toku_unpin_brtnode_off_client_thread(h, parent);
parent_unpinned = true;
parent = NULL;
}
//
......@@ -1260,7 +1510,6 @@ flush_some_child(
// so that we can proceed and apply the flush
//
bring_node_fully_into_memory(child, h);
toku_mark_node_dirty(child);
// It is possible after reading in the entire child,
// that we now know that the child is not reactive
......@@ -1269,12 +1518,22 @@ flush_some_child(
// and we have already replaced the bnc
// for the root with a fresh one
enum reactivity child_re = get_node_reactivity(child);
if (!parent_unpinned && child_re == RE_STABLE) {
if (parent && child_re == RE_STABLE) {
toku_unpin_brtnode_off_client_thread(h, parent);
parent_unpinned = true;
parent = NULL;
}
// now we have a bnc to flush to the child
// from above, we know at this point that either the bnc
// is detached from the parent (which may be unpinned),
// and we have to apply the flush, or there was no data
// in the buffer to flush, and as a result, flushing is not necessary
// and bnc is NULL
if (bnc != NULL) {
if (!child->dirty) {
dirtied++;
toku_mark_node_dirty(child);
}
// do the actual flush
r = toku_bnc_flush_to_child(
h->compare_fun,
h->update_fun,
......@@ -1285,34 +1544,31 @@ flush_some_child(
);
assert_zero(r);
destroy_nonleaf_childinfo(bnc);
}
fa->update_status(child, dirtied, fa->extra);
// let's get the reactivity of the child again,
// it is possible that the flush got rid of some values
// and now the parent is no longer reactive
child_re = get_node_reactivity(child);
if (!started_at_root && child->height == 0 && child_re == RE_FUSIBLE) {
// prevent merging leaf nodes, sometimes (when the cleaner thread
// called us)
child_re = RE_STABLE;
brt_status->cleaner_num_leaves_unmerged++;
}
// if the parent has been unpinned above, then
// this is our only option, even if the child is not stable
// if the child is not stable, we'll handle it the next
// time we need to flush to the child
if (parent_unpinned ||
if (!parent ||
child_re == RE_STABLE ||
(child_re == RE_FUSIBLE && parent->n_children == 1)
)
{
if (!parent_unpinned) {
if (parent) {
toku_unpin_brtnode_off_client_thread(h, parent);
parent = NULL;
}
//
// it is the responsibility of flush_some_child to unpin parent
// it is the responsibility of flush_some_child to unpin child
//
if (child->height > 0 && toku_brt_nonleaf_is_gorged(child)) {
flush_some_child(h, child, n_dirtied, cascades+1, started_at_root, brt_status);
if (child->height > 0 && fa->should_recursively_flush(child, fa->extra)) {
flush_some_child(h, child, fa);
}
else {
toku_unpin_brtnode_off_client_thread(h, child);
......@@ -1320,75 +1576,66 @@ flush_some_child(
}
else if (child_re == RE_FISSIBLE) {
//
// it is responsibility of brt_split_child to unlock nodes
// of parent and child as it sees fit
// it is responsibility of `brt_split_child` to unlock nodes of
// parent and child as it sees fit
//
brt_split_child(h, parent, childnum, child, started_at_root, brt_status);
assert(parent); // just make sure we have not accidentally unpinned parent
brt_split_child(h, parent, childnum, child, fa);
}
else if (child_re == RE_FUSIBLE) {
BOOL did_react;
//
// There is probably a way to pass BRTNODE child
// into brt_merge_child, but for simplicity for now,
// we are just going to unpin child and
// let brt_merge_child pin it again
//
toku_unpin_brtnode_off_client_thread(h, child);
//
// it is responsibility of `maybe_merge_child to unlock nodes of
// parent and child as it sees fit
//
// it is responsibility of brt_merge_child to unlock parent
//
brt_merge_child(h, parent, childnum, &did_react, started_at_root, brt_status);
assert(parent); // just make sure we have not accidentally unpinned parent
fa->maybe_merge_child(fa, h, parent, childnum, child, fa->extra);
}
else {
assert(FALSE);
}
}
// TODO 3988 Leif set cleaner_nodes_dirtied
static void
update_cleaner_status(
BRTNODE node,
int childnum,
BRT_STATUS brt_status)
int childnum)
{
brt_status->cleaner_total_nodes++;
brt_flusher_status.cleaner_total_nodes++;
if (node->height == 1) {
brt_status->cleaner_h1_nodes++;
brt_flusher_status.cleaner_h1_nodes++;
} else {
brt_status->cleaner_hgt1_nodes++;
brt_flusher_status.cleaner_hgt1_nodes++;
}
unsigned int nbytesinbuf = toku_bnc_nbytesinbuf(BNC(node, childnum));
if (nbytesinbuf == 0) {
brt_status->cleaner_empty_nodes++;
brt_flusher_status.cleaner_empty_nodes++;
} else {
if (nbytesinbuf > brt_status->cleaner_max_buffer_size) {
brt_status->cleaner_max_buffer_size = nbytesinbuf;
if (nbytesinbuf > brt_flusher_status.cleaner_max_buffer_size) {
brt_flusher_status.cleaner_max_buffer_size = nbytesinbuf;
}
if (nbytesinbuf < brt_status->cleaner_min_buffer_size) {
brt_status->cleaner_min_buffer_size = nbytesinbuf;
if (nbytesinbuf < brt_flusher_status.cleaner_min_buffer_size) {
brt_flusher_status.cleaner_min_buffer_size = nbytesinbuf;
}
brt_status->cleaner_total_buffer_size += nbytesinbuf;
brt_flusher_status.cleaner_total_buffer_size += nbytesinbuf;
uint64_t workdone = BP_WORKDONE(node, childnum);
if (workdone > brt_status->cleaner_max_buffer_workdone) {
brt_status->cleaner_max_buffer_workdone = workdone;
if (workdone > brt_flusher_status.cleaner_max_buffer_workdone) {
brt_flusher_status.cleaner_max_buffer_workdone = workdone;
}
if (workdone < brt_status->cleaner_min_buffer_workdone) {
brt_status->cleaner_min_buffer_workdone = workdone;
if (workdone < brt_flusher_status.cleaner_min_buffer_workdone) {
brt_flusher_status.cleaner_min_buffer_workdone = workdone;
}
brt_status->cleaner_total_buffer_workdone += workdone;
brt_flusher_status.cleaner_total_buffer_workdone += workdone;
}
}
int
toku_brtnode_cleaner_callback_internal(
toku_brtnode_cleaner_callback(
void *brtnode_pv,
BLOCKNUM blocknum,
u_int32_t fullhash,
void *extraargs,
BRT_STATUS brt_status)
void *extraargs)
{
BRTNODE node = brtnode_pv;
invariant(node->thisnodename.b == blocknum.b);
......@@ -1396,15 +1643,15 @@ toku_brtnode_cleaner_callback_internal(
invariant(node->height > 0); // we should never pick a leaf node (for now at least)
struct brt_header *h = extraargs;
bring_node_fully_into_memory(node, h);
int childnum;
find_heaviest_child(node, &childnum);
update_cleaner_status(node, childnum, brt_status);
int childnum = find_heaviest_child(node);
update_cleaner_status(node, childnum);
// Either flush_some_child will unlock the node, or we do it here.
if (toku_bnc_nbytesinbuf(BNC(node, childnum)) > 0) {
int n_dirtied = 0;
flush_some_child(h, node, &n_dirtied, 0, false, brt_status);
brt_status->cleaner_nodes_dirtied += n_dirtied;
struct flusher_advice fa;
struct flush_status_update_extra fste;
ct_flusher_advice_init(&fa, &fste);
flush_some_child(h, node, &fa);
} else {
toku_unpin_brtnode_off_client_thread(h, node);
}
......@@ -1415,7 +1662,6 @@ struct flusher_extra {
struct brt_header* h;
BRTNODE node;
NONLEAF_CHILDINFO bnc;
BRT_STATUS brt_status;
};
//
......@@ -1440,6 +1686,10 @@ static void flush_node_fun(void *fe_v)
bring_node_fully_into_memory(fe->node,fe->h);
toku_mark_node_dirty(fe->node);
struct flusher_advice fa;
struct flush_status_update_extra fste;
ft_flusher_advice_init(&fa, &fste);
if (fe->bnc) {
// In this case, we have a bnc to flush to a node
......@@ -1462,7 +1712,7 @@ static void flush_node_fun(void *fe_v)
// of flush_some_child to unlock the node
// otherwise, we unlock the node here.
if (fe->node->height > 0 && toku_brt_nonleaf_is_gorged(fe->node)) {
flush_some_child(fe->h, fe->node, NULL, 0, true, fe->brt_status);
flush_some_child(fe->h, fe->node, &fa);
}
else {
toku_unpin_brtnode_off_client_thread(fe->h,fe->node);
......@@ -1473,7 +1723,7 @@ static void flush_node_fun(void *fe_v)
// bnc, which means we are tasked with flushing some
// buffer in the node.
// It is the responsibility of flush_some_child to unlock the node
flush_some_child(fe->h, fe->node, NULL, 0, true, fe->brt_status);
flush_some_child(fe->h, fe->node, &fa);
}
remove_background_job(fe->h->cf, false);
toku_free(fe);
......@@ -1483,9 +1733,7 @@ static void
place_node_and_bnc_on_background_thread(
BRT brt,
BRTNODE node,
NONLEAF_CHILDINFO bnc,
BRT_STATUS brt_status
)
NONLEAF_CHILDINFO bnc)
{
struct flusher_extra* fe = NULL;
fe = toku_xmalloc(sizeof(struct flusher_extra));
......@@ -1493,7 +1741,6 @@ place_node_and_bnc_on_background_thread(
fe->h = brt->h;
fe->node = node;
fe->bnc = bnc;
fe->brt_status = brt_status;
cachefile_kibbutz_enq(brt->cf, flush_node_fun, fe);
}
......@@ -1511,14 +1758,13 @@ place_node_and_bnc_on_background_thread(
// The parent will be unlocked on the background thread
//
void
flush_node_on_background_thread(BRT brt, BRTNODE parent, BRT_STATUS brt_status)
flush_node_on_background_thread(BRT brt, BRTNODE parent)
{
//
// first let's see if we can detach buffer on client thread
// and pick the child we want to flush to
//
int childnum;
find_heaviest_child(parent, &childnum);
int childnum = find_heaviest_child(parent);
assert(toku_bnc_n_entries(BNC(parent, childnum))>0);
//
// see if we can pin the child
......@@ -1536,7 +1782,7 @@ flush_node_on_background_thread(BRT brt, BRTNODE parent, BRT_STATUS brt_status)
// In this case, we could not lock the child, so just place the parent on the background thread
// In the callback, we will use flush_some_child, which checks to
// see if we should blow away the old basement nodes.
place_node_and_bnc_on_background_thread(brt, parent, NULL, brt_status);
place_node_and_bnc_on_background_thread(brt, parent, NULL);
}
else {
//
......@@ -1564,7 +1810,7 @@ flush_node_on_background_thread(BRT brt, BRTNODE parent, BRT_STATUS brt_status)
// so, because we know for sure the child is not
// reactive, we can unpin the parent
//
place_node_and_bnc_on_background_thread(brt, child, bnc, brt_status);
place_node_and_bnc_on_background_thread(brt, child, bnc);
toku_unpin_brtnode(brt, parent);
}
else {
......@@ -1574,7 +1820,7 @@ flush_node_on_background_thread(BRT brt, BRTNODE parent, BRT_STATUS brt_status)
toku_unpin_brtnode(brt, child);
// Again, we'll have the parent on the background thread, so
// we don't need to destroy the basement nodes yet.
place_node_and_bnc_on_background_thread(brt, parent, NULL, brt_status);
place_node_and_bnc_on_background_thread(brt, parent, NULL);
}
}
}
......@@ -11,6 +11,39 @@
C_BEGIN
// Counters describing flusher-thread and cleaner-thread activity; these
// are mirrored into the engine status struct for reporting.
typedef struct brt_flusher_status {
    uint64_t cleaner_total_nodes;           // total number of nodes whose buffers are potentially flushed by cleaner thread
    uint64_t cleaner_h1_nodes;              // number of nodes of height one whose message buffers are flushed by cleaner thread
    uint64_t cleaner_hgt1_nodes;            // number of nodes of height > 1 whose message buffers are flushed by cleaner thread
    uint64_t cleaner_empty_nodes;           // number of nodes that are selected by cleaner, but whose buffers are empty
    uint64_t cleaner_nodes_dirtied;         // number of nodes that are made dirty by the cleaner thread
    uint64_t cleaner_max_buffer_size;       // max number of bytes in message buffer flushed by cleaner thread
    uint64_t cleaner_min_buffer_size;       // min number of bytes in a (nonempty) message buffer flushed by cleaner thread
    uint64_t cleaner_total_buffer_size;     // total number of bytes in message buffers flushed by cleaner thread
    uint64_t cleaner_max_buffer_workdone;   // max workdone value of any message buffer flushed by cleaner thread
    uint64_t cleaner_min_buffer_workdone;   // min workdone value of any message buffer flushed by cleaner thread
    uint64_t cleaner_total_buffer_workdone; // total workdone value of message buffers flushed by cleaner thread
    uint64_t cleaner_num_dirtied_for_leaf_merge;  // nodes dirtied by the "flush from root" process to merge a leaf node
    uint64_t flush_total;                   // total number of flushes done by flusher threads or cleaner threads
    uint64_t flush_in_memory;               // number of in memory flushes
    uint64_t flush_needed_io;               // number of flushes that had to read a child (or part) off disk
    uint64_t flush_cascades;                // number of flushes that triggered another flush in the child
    uint64_t flush_cascades_1;              // number of flushes that triggered 1 cascading flush
    uint64_t flush_cascades_2;              // number of flushes that triggered 2 cascading flushes
    uint64_t flush_cascades_3;              // number of flushes that triggered 3 cascading flushes
    uint64_t flush_cascades_4;              // number of flushes that triggered 4 cascading flushes
    uint64_t flush_cascades_5;              // number of flushes that triggered 5 cascading flushes
    uint64_t flush_cascades_gt_5;           // number of flushes that triggered more than 5 cascading flushes
    uint64_t split_leaf;                    // number of leaf nodes split
    uint64_t split_nonleaf;                 // number of nonleaf nodes split
    uint64_t merge_leaf;                    // number of times leaf nodes are merged
    uint64_t merge_nonleaf;                 // number of times nonleaf nodes are merged
    uint64_t balance_leaf;                  // number of times a leaf node is balanced inside brt
} BRT_FLUSHER_STATUS_S, *BRT_FLUSHER_STATUS;
// Initialize the global flusher status counters.
// NOTE(review): definition not visible here — presumably zeroes/resets the
// counters; confirm in the .c file.
void toku_brt_flusher_status_init(void);
// Copy the current flusher/cleaner counters into the caller's struct.
void toku_brt_flusher_get_status(BRT_FLUSHER_STATUS);
/**
* Only for testing, not for production.
*
......@@ -32,12 +65,11 @@ toku_flusher_thread_set_callback(
* brt_status which currently just lives in brt.c.
*/
int
toku_brtnode_cleaner_callback_internal(
toku_brtnode_cleaner_callback(
void *brtnode_pv,
BLOCKNUM blocknum,
u_int32_t fullhash,
void *extraargs,
BRT_STATUS brt_status
void *extraargs
);
/**
......@@ -47,8 +79,7 @@ toku_brtnode_cleaner_callback_internal(
void
flush_node_on_background_thread(
BRT brt,
BRTNODE parent,
BRT_STATUS brt_status
BRTNODE parent
);
/**
......@@ -68,8 +99,7 @@ brtleaf_split(
DBT *splitk,
BOOL create_new_node,
u_int32_t num_dependent_nodes,
BRTNODE* dependent_nodes,
BRT_STATUS brt_status
BRTNODE* dependent_nodes
);
/**
......@@ -89,10 +119,33 @@ brt_nonleaf_split(
BRTNODE *nodeb,
DBT *splitk,
u_int32_t num_dependent_nodes,
BRTNODE* dependent_nodes,
BRT_STATUS brt_status
BRTNODE* dependent_nodes
);
/************************************************************************
* HOT optimize, should perhaps be factored out to its own header file *
************************************************************************
*/
// Counters describing HOT (online optimize) activity; mirrored into the
// engine status struct's hot_* fields for reporting.
typedef struct brt_hot_status {
    uint64_t num_started;          // number of HOT operations that have begun
    uint64_t num_completed;        // number of HOT operations that have successfully completed
    uint64_t num_aborted;          // number of HOT operations that have been aborted
    uint64_t max_root_flush_count; // max number of flushes from root ever required to optimize a tree
} BRT_HOT_STATUS_S, *BRT_HOT_STATUS;
void toku_brt_hot_get_status(BRT_HOT_STATUS);
/**
* Takes given BRT and pushes all pending messages to the leaf nodes.
*/
int
toku_brt_hot_optimize(BRT brt,
int (*progress_callback)(void *extra, float progress),
void *progress_extra);
C_END
#endif // End of header guardian.
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <brt-flusher.h>
#include <brt-flusher-internal.h>
#include <brt-cachetable-wrappers.h>
#include <brt-internal.h>
// Member Description:
// 1. highest_pivot_key - this is the key that corresponds to the
//    most recently flushed leaf entry.
// 2. max_current_key - this is the pivot/key that we inherit as
//    we descend down the tree.  We use this to set the highest_pivot_key.
// 3. sub_tree_size - this is the percentage of the entire tree that our
//    current position (in a sub-tree) encompasses.
// 4. percentage_done - this is the percentage of leaf nodes that have
//    been flushed into.
// 5. rightmost_leaf_seen - this is a boolean we use to determine if
//    we have flushed to every leaf node.
struct hot_flusher_extra {
    DBT highest_pivot_key;
    DBT max_current_key;
    float sub_tree_size;
    float percentage_done;
    bool rightmost_leaf_seen;
};
static BRT_HOT_STATUS_S hot_status;
// Copies a snapshot of the HOT status counters into *s.
void
toku_brt_hot_get_status(BRT_HOT_STATUS s) {
    *s = hot_status;
}
// Records the key below which everything has been flushed, by copying
// max_current_key into highest_pivot_key.
static void
hot_set_highest_key(struct hot_flusher_extra *flusher)
{
    // A NULL max_current_key means the last descent went down the
    // rightmost subtree of the parent, so the "highest key seen" is
    // effectively positive infinity; represent that with NULL and
    // release any buffer we were holding.
    if (flusher->max_current_key.data == NULL) {
        if (flusher->highest_pivot_key.data) {
            toku_free(flusher->highest_pivot_key.data);
        }
        flusher->highest_pivot_key.data = NULL;
        return;
    }
    // Copy the max current key, reusing (resizing) the destination
    // buffer rather than allocating a fresh one each pass.
    u_int32_t keylen = flusher->max_current_key.size;
    void *buf = toku_xrealloc(flusher->highest_pivot_key.data, keylen);
    memcpy(buf, flusher->max_current_key.data, keylen);
    toku_fill_dbt(&(flusher->highest_pivot_key), buf, keylen);
}
// Copies the pivot key in the parent to the given DBT key, using the
// pivot corresponding to the given child.
static void
hot_set_key(DBT *key, BRTNODE parent, int childnum)
{
    // Only children 0 .. n_children-2 have an upper-bound pivot; the
    // rightmost child does not, so it must never be passed here.
    // (Implements the bounds check the original comment asked for.)
    assert(childnum < (parent->n_children - 1));
    DBT pivot;
    struct kv_pair *pair;
    pair = parent->childkeys[childnum];
    pivot = kv_pair_key_to_dbt(pair);
    // Reuse (resize) the DBT's existing buffer rather than leaking it
    // and allocating anew.
    void *data = key->data;
    u_int32_t size = pivot.size;
    data = toku_xrealloc(data, size);
    memcpy(data, pivot.data, size);
    toku_fill_dbt(key, data, size);
}
// Chooses which child of parent the next root-to-leaf pass should
// descend into, based on the highest pivot key flushed so far.
static int
hot_just_pick_child(struct brt_header *h,
                    BRTNODE parent,
                    struct hot_flusher_extra *flusher)
{
    // A NULL highest_pivot_key represents negative infinity (nothing
    // flushed yet), so start with the leftmost child.
    if (flusher->highest_pivot_key.data == NULL) {
        return 0;
    }
    // Otherwise binary-search the parent's pivots for the first child
    // strictly to the right of everything already flushed.
    return toku_brtnode_hot_next_child(parent,
                                       &flusher->highest_pivot_key,
                                       &h->descriptor,
                                       h->compare_fun);
}
// Remembers the pivot bounding the chosen child so the next pass knows
// where to resume.
static void
hot_update_flusher_keys(BRTNODE parent,
                        int childnum,
                        struct hot_flusher_extra *flusher)
{
    // The rightmost child has no upper-bound pivot, so there is
    // nothing to record for it.
    if (childnum >= (parent->n_children - 1)) {
        return;
    }
    hot_set_key(&flusher->max_current_key, parent, childnum);
}
// Picks which child flush_some_child will use for flushing and
// recursion.
static int
hot_pick_child(struct brt_header *h,
               BRTNODE parent,
               void *extra)
{
    struct hot_flusher_extra *hf = extra;
    int which = hot_just_pick_child(h, parent, hf);
    // Track the pivot bounding the chosen subtree.
    hot_update_flusher_keys(parent, which, hf);
    // Progress accounting: whichever subtree we recurse into is an
    // equal fraction of the parent's span, and all subtrees to its
    // left have already been flushed.
    hf->sub_tree_size /= parent->n_children;
    hf->percentage_done += hf->sub_tree_size * which;
    return which;
}
// Flusher-advice status callback.  Intentionally a no-op: HOT does not
// (yet) record per-flush statistics here.
static void
hot_update_status(BRTNODE UU(child),
                  int UU(dirtied),
                  void *UU(extra))
{
}
// Pick-child callback used after flush_some_child has split the child
// it was flushing into.  Re-runs the pivot search (the split changed
// the parent's pivots); the result must be one of the two halves.
static int
hot_pick_child_after_split(struct brt_header *h,
    BRTNODE parent,
    int childnuma,
    int childnumb,
    void *extra)
{
    struct hot_flusher_extra *flusher = extra;
    int childnum = hot_just_pick_child(h, parent, flusher);
    assert(childnum == childnuma || childnum == childnumb);
    hot_update_flusher_keys(parent, childnum, flusher);
    if (parent->height == 1) {
        // NOTE(review): returning -1 when the children are leaves —
        // presumably this tells the flusher machinery not to recurse
        // further; confirm against the pick_child_after_split contract
        // in brt-flusher-internal.h.
        childnum = -1;
    }
    return childnum;
}
// Basic constructor/initializer for the hot flusher struct and its
// flusher advice.
static void
hot_flusher_init(struct flusher_advice *advice,
                 struct hot_flusher_extra *flusher)
{
    // A NULL highest_pivot_key stands for NEGATIVE INFINITY, which is
    // exactly what the very first traversal of the tree needs.
    toku_init_dbt(&flusher->highest_pivot_key);
    toku_init_dbt(&flusher->max_current_key);
    flusher->sub_tree_size = 1.0;
    flusher->percentage_done = 0.0;
    flusher->rightmost_leaf_seen = false;
    flusher_advice_init(advice,
                        hot_pick_child,
                        dont_destroy_basement_nodes,
                        always_recursively_flush,
                        default_merge_child,
                        hot_update_status,
                        hot_pick_child_after_split,
                        flusher);
}
// Erases any DBT keys we have copied from a traversal.
static void
hot_flusher_destroy(struct hot_flusher_extra *flusher)
{
    if (flusher->highest_pivot_key.data) {
        toku_free(flusher->highest_pivot_key.data);
    }
    if (flusher->max_current_key.data) {
        toku_free(flusher->max_current_key.data);
    }
    // Re-initialize the DBTs so the struct holds no dangling pointers;
    // guards against a double free or use-after-free if the flusher is
    // ever touched again after destruction.
    toku_init_dbt(&flusher->highest_pivot_key);
    toku_init_dbt(&flusher->max_current_key);
}
// Entry point for Hot Optimize Table (HOT).  Note, this function is
// not recursive.  It iterates over root-to-leaf paths: each iteration
// of the do/while loop below flushes one such path, and the loop ends
// once the rightmost leaf has been reached (or the progress callback
// asks us to stop).
int
toku_brt_hot_optimize(BRT brt,
    int (*progress_callback)(void *extra, float progress),
    void *progress_extra)
{
    int r = 0;
    struct hot_flusher_extra flusher;
    struct flusher_advice advice;
    hot_flusher_init(&advice, &flusher);
    uint64_t loop_count = 0;
    MSN msn_at_start_of_hot = ZERO_MSN; // capture msn from root at start of HOT operation
    (void) __sync_fetch_and_add(&hot_status.num_started, 1);
    // Higher level logic prevents a dictionary from being deleted or truncated
    // during a hot optimize operation.  Doing so would violate the hot optimize contract.
    do {
        BRTNODE root;
        CACHEKEY *rootp;
        u_int32_t fullhash;
        // Grab YDB Lock.
        toku_cachetable_call_ydb_lock(brt->h->cf);
        // Get root node (the first parent of each successive HOT
        // call.)
        rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
        struct brtnode_fetch_extra bfe;
        // Full read: HOT needs the entire root (all partitions) in memory.
        fill_bfe_for_full_read(&bfe, brt->h);
        toku_pin_brtnode_off_client_thread(brt->h,
            (BLOCKNUM) *rootp,
            fullhash,
            &bfe,
            0,
            NULL,
            &root);
        toku_assert_entire_node_in_memory(root);
        // Prepare HOT diagnostics.
        if (loop_count == 0) {
            // The first time through, capture msn from root and set
            // info in header while holding ydb lock.
            msn_at_start_of_hot = root->max_msn_applied_to_node_on_disk;
            toku_brt_header_note_hot_begin(brt);
        }
        loop_count++;
        if (loop_count > hot_status.max_root_flush_count) {
            // This is threadsafe, since we're holding the ydb lock.
            hot_status.max_root_flush_count = loop_count;
        }
        // Release YDB Lock.
        toku_cachetable_call_ydb_unlock(brt->h->cf);
        // Initialize the maximum current key.  We need to do this for
        // every traversal.
        if (flusher.max_current_key.data) {
            toku_free(flusher.max_current_key.data);
        }
        flusher.max_current_key.data = NULL;
        flusher.sub_tree_size = 1.0;
        flusher.percentage_done = 0.0;
        // This should recurse to the bottom of the tree and then
        // return.
        if (root->height > 0) {
            flush_some_child(brt->h, root, &advice);
        } else {
            // The root is a leaf: there are no children to flush, so
            // this single pass covers the whole tree and we are done.
            flusher.rightmost_leaf_seen = 1;
        }
        // Set the highest pivot key seen here, since the parent may
        // be unlocked and NULL'd later in our caller:
        // flush_some_child().
        hot_set_highest_key(&flusher);
        // This is where we determine if the traversal is finished or
        // not: a NULL max_current_key means the pass went down the
        // rightmost edge of the tree.
        if (flusher.max_current_key.data == NULL) {
            flusher.rightmost_leaf_seen = 1;
        }
        // Update HOT's progress.
        if (progress_callback != NULL) {
            r = progress_callback(progress_extra, flusher.percentage_done);
            // Check if the callback wants us to stop running HOT
            // (nonzero return aborts the optimize).
            if (r != 0) {
                flusher.rightmost_leaf_seen = 1;
            }
        }
        // Loop until the max key has been updated to positive
        // infinity.
    } while (!flusher.rightmost_leaf_seen);
    // Cleanup.
    hot_flusher_destroy(&flusher);
    // More diagnostics: record completion (or abort) in the header and
    // in the process-wide counters.
    {
        BOOL success = false;
        if (r == 0) success = true;
        toku_cachetable_call_ydb_lock(brt->h->cf);
        toku_brt_header_note_hot_complete(brt, success, msn_at_start_of_hot);
        toku_cachetable_call_ydb_unlock(brt->h->cf);
        if (success)
            (void) __sync_fetch_and_add(&hot_status.num_completed, 1);
        else
            (void) __sync_fetch_and_add(&hot_status.num_aborted, 1);
    }
    return r;
}
......@@ -341,13 +341,6 @@ enum {
u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnum);
struct remembered_hash {
BOOL valid; // set to FALSE if the fullhash is invalid
FILENUM fnum;
BLOCKNUM root;
u_int32_t fullhash; // fullhash is the hashed value of fnum and root.
};
// The brt_header is not managed by the cachetable. Instead, it hangs off the cachefile as userdata.
enum brtheader_type {BRTHEADER_CURRENT=1, BRTHEADER_CHECKPOINT_INPROGRESS};
......@@ -380,7 +373,6 @@ struct brt_header {
unsigned int nodesize;
unsigned int basementnodesize;
BLOCKNUM root; // roots of the dictionary
struct remembered_hash root_hash; // hash of the root offset.
unsigned int flags;
DESCRIPTOR_S descriptor;
......@@ -404,6 +396,11 @@ struct brt_header {
STAT64INFO_S in_memory_stats;
STAT64INFO_S on_disk_stats;
STAT64INFO_S checkpoint_staging_stats;
uint64_t time_of_last_optimize_begin; // last time that a hot optimize operation was begun
uint64_t time_of_last_optimize_end; // last time that a hot optimize operation was successfully completed
uint32_t count_of_optimize_in_progress; // the number of hot optimize operations currently in progress on this tree
uint32_t count_of_optimize_in_progress_read_from_disk; // the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
MSN msn_at_start_of_last_completed_optimize; // all messages before this msn have been applied to leaf nodes
};
struct brt {
......@@ -526,10 +523,9 @@ extern void toku_brtnode_pe_est_callback(void* brtnode_pv, long* bytes_freed_est
extern int toku_brtnode_pe_callback (void *brtnode_pv, PAIR_ATTR old_attr, PAIR_ATTR* new_attr, void *extraargs);
extern BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs);
int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, PAIR_ATTR* sizep);
extern int toku_brtnode_cleaner_callback (void* brtnode_pv, BLOCKNUM blocknum, u_int32_t fullhash, void* extraargs);
extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn);
extern int toku_read_brt_header_and_store_in_cachefile (BRT brt, CACHEFILE cf, LSN max_acceptable_lsn, struct brt_header **header, BOOL* was_open);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash);
extern CACHEKEY* toku_calculate_root_offset_pointer (struct brt_header* h, u_int32_t *root_hash);
static const BRTNODE null_brtnode=0;
......@@ -716,15 +712,31 @@ unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
DESCRIPTOR desc, brt_compare_func cmp)
__attribute__((__warn_unused_result__));
/**
* Finds the next child for HOT to flush to, given that everything up to
* and including k has been flattened.
*
* If k falls between pivots in node, then we return the childnum where k
* lies.
*
* If k is equal to some pivot, then we return the next (to the right)
* childnum.
*/
unsigned int toku_brtnode_hot_next_child(BRTNODE node,
const DBT *k,
DESCRIPTOR desc,
brt_compare_func cmp);
/* Stuff for testing */
// toku_testsetup_initialize() must be called before any other test_setup_xxx() functions are called.
void toku_testsetup_initialize(void);
int toku_testsetup_leaf(BRT brt, BLOCKNUM *);
int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens);
int toku_testsetup_nonleaf (BRT brt, int height, BLOCKNUM *diskoff, int n_children, BLOCKNUM *children, char **keys, int *keylens);
int toku_testsetup_root(BRT brt, BLOCKNUM);
int toku_testsetup_get_sersize(BRT brt, BLOCKNUM); // Return the size on disk.
int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM, char *key, int keylen, char *val, int vallen);
int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM, enum brt_msg_type, char *key, int keylen, char *val, int vallen);
void toku_pin_node_with_min_bfe(BRTNODE* node, BLOCKNUM b, BRT t);
// These two go together to do lookups in a brtnode using the keys in a command.
struct cmd_leafval_heaviside_extra {
......@@ -799,28 +811,6 @@ struct brt_status {
uint64_t search_root_retries; // number of searches that required the root node to be fetched more than once
uint64_t search_tries_gt_height; // number of searches that required more tries than the height of the tree
uint64_t search_tries_gt_heightplus3; // number of searches that required more tries than the height of the tree plus three
uint64_t cleaner_total_nodes; // total number of nodes whose buffers are potentially flushed by cleaner thread
uint64_t cleaner_h1_nodes; // number of nodes of height one whose message buffers are flushed by cleaner thread
uint64_t cleaner_hgt1_nodes; // number of nodes of height > 1 whose message buffers are flushed by cleaner thread
uint64_t cleaner_empty_nodes; // number of nodes that are selected by cleaner, but whose buffers are empty
uint64_t cleaner_nodes_dirtied; // number of nodes that are made dirty by the cleaner thread
uint64_t cleaner_max_buffer_size; // max number of bytes in message buffer flushed by cleaner thread
uint64_t cleaner_min_buffer_size;
uint64_t cleaner_total_buffer_size;
uint64_t cleaner_max_buffer_workdone; // max workdone value of any message buffer flushed by cleaner thread
uint64_t cleaner_min_buffer_workdone;
uint64_t cleaner_total_buffer_workdone;
uint64_t cleaner_num_leaves_unmerged; // number of leaves left unmerged by the cleaner thread
uint64_t flush_total; // total number of flushes done by flusher threads or cleaner threads
uint64_t flush_in_memory; // number of in memory flushes
uint64_t flush_needed_io; // number of flushes that had to read a child (or part) off disk
uint64_t flush_cascades; // number of flushes that triggered another flush in the child
uint64_t flush_cascades_1; // number of flushes that triggered 1 cascading flush
uint64_t flush_cascades_2; // number of flushes that triggered 2 cascading flushes
uint64_t flush_cascades_3; // number of flushes that triggered 3 cascading flushes
uint64_t flush_cascades_4; // number of flushes that triggered 4 cascading flushes
uint64_t flush_cascades_5; // number of flushes that triggered 5 cascading flushes
uint64_t flush_cascades_gt_5; // number of flushes that triggered more than 5 cascading flushes
uint64_t disk_flush_leaf; // number of leaf nodes flushed to disk, not for checkpoint
uint64_t disk_flush_nonleaf; // number of nonleaf nodes flushed to disk, not for checkpoint
uint64_t disk_flush_leaf_for_checkpoint; // number of leaf nodes flushed to disk for checkpoint
......@@ -829,13 +819,8 @@ struct brt_status {
uint64_t create_nonleaf; // number of nonleaf nodes created
uint64_t destroy_leaf; // number of leaf nodes destroyed
uint64_t destroy_nonleaf; // number of nonleaf nodes destroyed
uint64_t split_leaf; // number of leaf nodes split
uint64_t split_nonleaf; // number of nonleaf nodes split
uint64_t merge_leaf; // number of times leaf nodes are merged
uint64_t merge_nonleaf; // number of times nonleaf nodes are merged
uint64_t dirty_leaf; // number of times leaf nodes are dirtied when previously clean
uint64_t dirty_nonleaf; // number of times nonleaf nodes are dirtied when previously clean
uint64_t balance_leaf; // number of times a leaf node is balanced inside brt
uint64_t msg_bytes_in; // how many bytes of messages injected at root (for all trees)
uint64_t msg_bytes_out; // how many bytes of messages flushed from h1 nodes to leaves
uint64_t msg_bytes_curr; // how many bytes of messages currently in trees (estimate)
......@@ -865,9 +850,6 @@ struct brt_status {
void toku_brt_get_status(BRT_STATUS);
void
brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, BOOL create_new_node, u_int32_t num_dependent_nodes, BRTNODE* dependent_nodes, BRT_STATUS brt_status);
void
brt_leaf_apply_cmd_once (
BRTNODE leafnode,
......@@ -906,6 +888,18 @@ void toku_apply_cmd_to_leaf(
OMT live_list_reverse
);
void brtnode_put_cmd (
brt_compare_func compare_fun,
brt_update_func update_fun,
DESCRIPTOR desc,
BRTNODE node,
BRT_MSG cmd,
bool is_fresh,
OMT snapshot_txnids,
OMT live_list_reverse
);
void toku_reset_root_xid_that_created(BRT brt, TXNID new_root_xid_that_created);
// Reset the root_xid_that_created field to the given value.
// This redefines which xid created the dictionary.
......@@ -913,6 +907,10 @@ void toku_reset_root_xid_that_created(BRT brt, TXNID new_root_xid_that_created);
void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);
void toku_brt_header_note_hot_begin(BRT brt);
void toku_brt_header_note_hot_complete(BRT brt, BOOL success, MSN msn_at_start_of_hot);
C_END
#endif
......@@ -1815,6 +1815,11 @@ serialize_brt_header_min_size (u_int32_t version) {
switch(version) {
case BRT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
case BRT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
......@@ -1891,6 +1896,10 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h,
wbuf_ulonglong(wbuf, h->time_of_last_verification);
wbuf_ulonglong(wbuf, h->checkpoint_staging_stats.numrows);
wbuf_ulonglong(wbuf, h->checkpoint_staging_stats.numbytes);
wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin);
wbuf_ulonglong(wbuf, h->time_of_last_optimize_end);
wbuf_int(wbuf, h->count_of_optimize_in_progress);
wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
u_int32_t checksum = x1764_finish(&wbuf->checksum);
wbuf_int(wbuf, checksum);
lazy_assert(wbuf->ndone == wbuf->size);
......@@ -2143,7 +2152,6 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
}
h->root = rbuf_blocknum(&rc);
h->root_hash.valid = FALSE;
h->flags = rbuf_int(&rc);
h->layout_version_original = rbuf_int(&rc);
h->build_id_original = rbuf_int(&rc);
......@@ -2161,10 +2169,15 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
h->basementnodesize = rbuf_int(&rc);
h->time_of_last_verification = rbuf_ulonglong(&rc);
}
if (h->layout_version >= BRT_LAYOUT_VERSION_17) {
if (h->layout_version >= BRT_LAYOUT_VERSION_18) {
h->on_disk_stats.numrows = rbuf_ulonglong(&rc);
h->on_disk_stats.numbytes = rbuf_ulonglong(&rc);
h->in_memory_stats = h->on_disk_stats;
h->time_of_last_optimize_begin = rbuf_ulonglong(&rc);
h->time_of_last_optimize_end = rbuf_ulonglong(&rc);
h->count_of_optimize_in_progress = rbuf_int(&rc);
h->count_of_optimize_in_progress_read_from_disk = h->count_of_optimize_in_progress;
h->msn_at_start_of_last_completed_optimize = rbuf_msn(&rc);
}
(void)rbuf_int(&rc); //Read in checksum and ignore (already verified).
......@@ -2219,7 +2232,8 @@ deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **br
case BRT_LAYOUT_VERSION_14:
h->basementnodesize = 128*1024; // basement nodes added in v15
//fall through on purpose
case BRT_LAYOUT_VERSION_17:
case BRT_LAYOUT_VERSION_18:
case BRT_LAYOUT_VERSION_17: // version 17 never released to customers
case BRT_LAYOUT_VERSION_16: // version 16 never released to customers
case BRT_LAYOUT_VERSION_15: // this will not properly support version 15, we'll fix that on upgrade.
invariant(h->layout_version == BRT_LAYOUT_VERSION);
......
......@@ -6,10 +6,11 @@
#include "includes.h"
#include "ule.h"
#include <brt-cachetable-wrappers.h>
#include <brt-flusher.h>
// dummymsn needed to simulate msn because messages are injected at a lower level than toku_brt_root_put_cmd()
#define MIN_DUMMYMSN ((MSN) {(uint64_t)1<<48})
#define MIN_DUMMYMSN ((MSN) {(uint64_t)100000000000})
static MSN dummymsn;
static int testsetup_initialized = 0;
......@@ -31,13 +32,21 @@ next_dummymsn(void) {
BOOL ignore_if_was_already_open;
int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum) {
int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens) {
BRTNODE node;
assert(testsetup_initialized);
int r = toku_read_brt_header_and_store_in_cachefile(brt, brt->cf, MAX_LSN, &brt->h, &ignore_if_was_already_open);
if (r!=0) return r;
toku_create_new_brtnode(brt, &node, 0, 1);
BP_STATE(node,0) = PT_AVAIL;
toku_create_new_brtnode(brt, &node, 0, n_children);
int i;
for (i=0; i<n_children; i++) {
BP_STATE(node,i) = PT_AVAIL;
}
for (i=0; i+1<n_children; i++) {
node->childkeys[i] = kv_pair_malloc(keys[i], keylens[i], 0, 0);
node->totalchildkeylens += keylens[i];
}
*blocknum = node->thisnodename;
toku_unpin_brtnode(brt, node);
......@@ -71,7 +80,6 @@ int toku_testsetup_root(BRT brt, BLOCKNUM blocknum) {
int r = toku_read_brt_header_and_store_in_cachefile(brt, brt->cf, MAX_LSN, &brt->h, &ignore_if_was_already_open);
if (r!=0) return r;
brt->h->root = blocknum;
brt->h->root_hash.valid = FALSE;
return 0;
}
......@@ -131,55 +139,22 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke
toku_verify_or_set_counts(node);
assert(node->height==0);
size_t newlesize;
LEAFENTRY leafentry;
OMTVALUE storeddatav;
u_int32_t idx;
DBT keydbt,valdbt;
MSN msn = next_dummymsn();
BRT_MSG_S cmd = {BRT_INSERT, msn, xids_get_root_xids(),
.u.id={toku_fill_dbt(&keydbt, key, keylen),
toku_fill_dbt(&valdbt, val, vallen)}};
//Generate a leafentry (committed insert key,val)
uint childnum = toku_brtnode_which_child(node,
&keydbt,
&brt->h->descriptor, brt->compare_fun);
BASEMENTNODE bn = BLB(node, childnum);
void * maybe_free = 0;
{
int64_t ignoreme;
r = apply_msg_to_leafentry(&cmd, NULL, //No old leafentry
&newlesize, &leafentry,
bn->buffer, &bn->buffer_mempool, &maybe_free,
NULL, NULL, &ignoreme);
assert(r==0);
}
struct cmd_leafval_heaviside_extra be = {brt->compare_fun, &brt->h->descriptor, &keydbt};
r = toku_omt_find_zero(BLB_BUFFER(node, 0), toku_cmd_leafval_heaviside, &be, &storeddatav, &idx);
if (r==0) {
LEAFENTRY storeddata=storeddatav;
// It's already there. So now we have to remove it and put the new one back in.
BLB_NBYTESINBUF(node, 0) -= leafentry_disksize(storeddata);
toku_free(storeddata);
// Now put the new kv in.
toku_omt_set_at(BLB_BUFFER(node, 0), leafentry, idx);
} else {
r = toku_omt_insert(BLB_BUFFER(node, 0), leafentry, toku_cmd_leafval_heaviside, &be, 0);
assert(r==0);
}
// hack to get tests passing. These tests should not be directly inserting into buffers
BLB(node, 0)->max_msn_applied = msn;
BLB_NBYTESINBUF(node, 0) += newlesize;
node->dirty=1;
brtnode_put_cmd (
brt->h->compare_fun,
brt->h->update_fun,
&brt->h->descriptor,
node,
&cmd,
true,
NULL,
NULL
);
toku_verify_or_set_counts(node);
......@@ -194,6 +169,23 @@ testhelper_string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
return strcmp(s, t);
}
// Test helper: pin the node at blocknum b using a minimal fetch
// (fill_bfe_for_min_read), returning it through *node.
void
toku_pin_node_with_min_bfe(BRTNODE* node, BLOCKNUM b, BRT t)
{
    u_int32_t fullhash = toku_cachetable_hash(t->h->cf, b);
    struct brtnode_fetch_extra bfe;
    fill_bfe_for_min_read(&bfe, t->h);
    // No dependent nodes; this is an off-client-thread pin.
    toku_pin_brtnode_off_client_thread(t->h, b, fullhash, &bfe, 0, NULL, node);
}
int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_type cmdtype, char *key, int keylen, char *val, int vallen) {
void *node_v;
int r;
......
......@@ -12,6 +12,7 @@
*/
#include "includes.h"
#include <brt-flusher.h>
static int
compare_pairs (BRT brt, struct kv_pair *a, struct kv_pair *b) {
......@@ -388,7 +389,7 @@ int
toku_verify_brt_with_progress (BRT brt, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_on_going) {
assert(brt->h);
u_int32_t root_hash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &root_hash);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &root_hash);
int r = toku_verify_brtnode(brt, ZERO_MSN, ZERO_MSN, *rootp, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
if (r == 0) {
toku_brtheader_lock(brt->h);
......
......@@ -173,8 +173,8 @@ get_leaf_num_entries(BRTNODE node) {
static enum reactivity
get_leaf_reactivity (BRTNODE node) {
enum reactivity re = RE_STABLE;
toku_assert_entire_node_in_memory(node);
assert(node->height==0);
if (node->dirty) {
unsigned int size = toku_serialize_brtnode_size(node);
if (size > node->nodesize && get_leaf_num_entries(node) > 1) {
re = RE_FISSIBLE;
......@@ -182,7 +182,6 @@ get_leaf_reactivity (BRTNODE node) {
else if ((size*4) < node->nodesize && !BLB_SEQINSERT(node, node->n_children-1)) {
re = RE_FUSIBLE;
}
}
return re;
}
......@@ -243,17 +242,6 @@ static inline void add_to_brt_status(u_int64_t* val, u_int64_t data) {
(*val) += data;
}
static void brtnode_put_cmd (
brt_compare_func compare_fun,
brt_update_func update_fun,
DESCRIPTOR desc,
BRTNODE node,
BRT_MSG cmd,
bool is_fresh,
OMT snapshot_txnids,
OMT live_list_reverse
);
static void brt_verify_flags(BRT brt, BRTNODE node) {
assert(brt->flags == node->flags);
}
......@@ -766,16 +754,6 @@ int toku_brtnode_pe_callback (void *brtnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATT
return 0;
}
int
toku_brtnode_cleaner_callback(
void *brtnode_pv,
BLOCKNUM blocknum,
u_int32_t fullhash,
void *extraargs)
{
return toku_brtnode_cleaner_callback_internal(brtnode_pv, blocknum, fullhash, extraargs, &brt_status);
}
static inline void
brt_status_update_partial_fetch(u_int8_t state)
{
......@@ -1852,6 +1830,32 @@ unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
#endif
}
// Used for HOT.  Binary-searches the node's pivots for the leftmost
// child whose subtree can hold keys strictly greater than k: if k lies
// between pivots, that child is returned; if k equals a pivot, the
// child to the pivot's right is returned.
unsigned int
toku_brtnode_hot_next_child(BRTNODE node,
                            const DBT *k,
                            DESCRIPTOR desc,
                            brt_compare_func cmp) {
    int lo = 0;
    int hi = node->n_children - 1;
    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;  // overflow-safe midpoint
        int c = brt_compare_pivot(desc, cmp, k, node->childkeys[mid]);
        if (c == 0) {
            // k is exactly this pivot: everything up to and including
            // k is flattened, so take the subtree under the next pivot.
            return mid + 1;
        } else if (c > 0) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    invariant(lo == hi);
    return lo;
}
// TODO Use this function to clean up other places where bits of messages are passed around
// such as toku_bnc_insert_msg() and the call stack above it.
static size_t
......@@ -1969,9 +1973,9 @@ brt_handle_maybe_reactive_root (BRT brt, CACHEKEY *rootp, BRTNODE *nodep) {
// in just node. That would be correct.
//
if (node->height==0) {
brtleaf_split(brt->h, node, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &brt_status);
brtleaf_split(brt->h, node, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
} else {
brt_nonleaf_split(brt->h, node, &nodea, &nodeb, &splitk, 0, NULL, &brt_status);
brt_nonleaf_split(brt->h, node, &nodea, &nodeb, &splitk, 0, NULL);
}
brt_init_new_root(brt, nodea, nodeb, splitk, rootp, nodep);
return;
......@@ -1993,6 +1997,7 @@ toku_bnc_flush_to_child(
)
{
assert(toku_fifo_n_entries(bnc->buffer)>0);
assert(bnc);
OMT snapshot_txnids, live_list_reverse;
TOKULOGGER logger = toku_cachefile_logger(cf);
if (child->height == 0 && logger) {
......@@ -2050,7 +2055,7 @@ void bring_node_fully_into_memory(BRTNODE node, struct brt_header* h)
}
}
static void
void
brtnode_put_cmd (
brt_compare_func compare_fun,
brt_update_func update_fun,
......@@ -2229,35 +2234,9 @@ static void push_something_at_root (BRT brt, BRTNODE *nodep, BRT_MSG cmd)
}
}
static void compute_and_fill_remembered_hash (BRT brt) {
struct remembered_hash *rh = &brt->h->root_hash;
assert(brt->cf); // if cf is null, we'll be hosed.
rh->valid = TRUE;
rh->fnum=toku_cachefile_filenum(brt->cf);
rh->root=brt->h->root;
rh->fullhash = toku_cachetable_hash(brt->cf, rh->root);
}
static u_int32_t get_roothash (BRT brt) {
struct remembered_hash *rh = &brt->h->root_hash;
BLOCKNUM root = brt->h->root;
// compare cf first, since cf is NULL for invalid entries.
assert(rh);
//printf("v=%d\n", rh->valid);
if (rh->valid) {
//printf("f=%d\n", rh->fnum.fileid);
//printf("cf=%d\n", toku_cachefile_filenum(brt->cf).fileid);
if (rh->fnum.fileid == toku_cachefile_filenum(brt->cf).fileid)
if (rh->root.b == root.b)
return rh->fullhash;
}
compute_and_fill_remembered_hash(brt);
return rh->fullhash;
}
CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *roothash) {
*roothash = get_roothash(brt);
return &brt->h->root;
CACHEKEY* toku_calculate_root_offset_pointer (struct brt_header* h, u_int32_t *roothash) {
*roothash = toku_cachetable_hash(h->cf, h->root);
return &h->root;
}
int
......@@ -2272,7 +2251,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
//assert(0==toku_cachetable_assert_all_unpinned(brt->cachetable));
assert(brt->h);
u_int32_t fullhash;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// get the root node
struct brtnode_fetch_extra bfe;
......@@ -2300,7 +2279,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// if we call flush_some_child, then that function unpins the root
// otherwise, we unpin ourselves
if (node->height > 0 && toku_brt_nonleaf_is_gorged(node)) {
flush_node_on_background_thread(brt, node, &brt_status);
flush_node_on_background_thread(brt, node);
}
else {
toku_unpin_brtnode(brt, node); // unpin root
......@@ -2406,7 +2385,6 @@ brt_optimize (BRT brt, BOOL upgrade) {
return r;
}
int
toku_brt_load(BRT brt, TOKUTXN txn, char const * new_iname, int do_fsync, LSN *load_lsn) {
int r = 0;
......@@ -2892,7 +2870,6 @@ brt_init_header_partial (BRT t, TOKUTXN txn) {
t->h->in_memory_stats = ZEROSTATS;
t->h->on_disk_stats = ZEROSTATS;
t->h->checkpoint_staging_stats = ZEROSTATS;
compute_and_fill_remembered_hash(t);
BLOCKNUM root = t->h->root;
if ((r=setup_initial_brt_root_node(t, root))!=0) { return r; }
......@@ -5055,7 +5032,7 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf,
assert(brt->h);
u_int32_t fullhash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
BRTNODE node;
......@@ -5670,7 +5647,7 @@ toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less_p, u_int64_t *equal_p, u_i
{
u_int64_t less = 0, equal = 0, greater = 0;
u_int32_t fullhash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h); // read pivot keys but not message buffers
......@@ -5831,7 +5808,7 @@ int toku_dump_brt (FILE *f, BRT brt) {
assert(brt->h);
u_int32_t fullhash = 0;
toku_dump_translation_table(f, brt->h->blocktable);
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
return toku_dump_brtnode(f, brt, *rootp, 0, 0, 0);
}
......@@ -5891,8 +5868,7 @@ int toku_brt_init(void (*ydb_lock_callback)(void),
r = toku_brt_serialize_init();
if (r==0)
callback_db_set_brt = db_set_brt;
brt_status.cleaner_min_buffer_size = UINT64_MAX;
brt_status.cleaner_min_buffer_workdone = UINT64_MAX;
toku_brt_flusher_status_init();
return r;
}
......@@ -6106,7 +6082,7 @@ BOOL toku_brt_is_empty_fast (BRT brt)
// messages and leafentries would all optimize away and that the tree is empty, but we'll say it is nonempty.
{
u_int32_t fullhash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
BRTNODE node;
//assert(fullhash == toku_cachetable_hash(brt->cf, *rootp));
{
......@@ -6173,6 +6149,51 @@ toku_reset_root_xid_that_created(BRT brt, TXNID new_root_xid_that_created) {
toku_brtheader_unlock (h);
}
// Purpose: set fields in brt_header to capture accountability info for start of HOT optimize.
// Requires: ydb lock is held.
// Note: HOT accountability variables in header are modified only while holding header lock.
//       (Header lock is really needed for touching the dirty bit, but it's useful and
//       convenient here for keeping the HOT variables threadsafe.)
void
toku_brt_header_note_hot_begin(BRT brt) {
    struct brt_header *h = brt->h;
    time_t now = time(NULL);

    // hold lock around setting and clearing of dirty bit
    // (see cooperative use of dirty bit in toku_brtheader_begin_checkpoint())
    toku_brtheader_lock(h);
    h->time_of_last_optimize_begin = now;
    h->count_of_optimize_in_progress++;  // decremented in toku_brt_header_note_hot_complete()
    h->dirty = 1;
    toku_brtheader_unlock(h);
}
// Purpose: set fields in brt_header to capture accountability info for end of HOT optimize.
// Requires: ydb lock is held.
// Note: See note for toku_brt_header_note_hot_begin().
void
toku_brt_header_note_hot_complete(BRT brt, BOOL success, MSN msn_at_start_of_hot) {
    struct brt_header *h = brt->h;
    time_t now = time(NULL);

    toku_brtheader_lock(h);
    h->count_of_optimize_in_progress--;
    if (success) {
        h->time_of_last_optimize_end = now;
        h->msn_at_start_of_last_completed_optimize = msn_at_start_of_hot;
        // If we just successfully completed an optimization and no other thread is performing
        // an optimization, then the number of optimizations in progress is zero.
        // If there was a crash during a HOT optimization, this is how count_of_optimize_in_progress
        // would be reset to zero on the disk after recovery from that crash.
        if (h->count_of_optimize_in_progress == h->count_of_optimize_in_progress_read_from_disk)
            h->count_of_optimize_in_progress = 0;
    }
    // Mark the header dirty in either case (see the dirty-bit protocol
    // noted in toku_brtheader_begin_checkpoint()).
    h->dirty = 1;
    toku_brtheader_unlock(h);
}
void
toku_brt_header_init(struct brt_header *h,
BLOCKNUM root_blocknum_on_disk, LSN checkpoint_lsn, TXNID root_xid_that_created, uint32_t target_nodesize, uint32_t target_basementnodesize) {
......
......@@ -22,6 +22,7 @@ enum brt_layout_version_e {
BRT_LAYOUT_VERSION_16 = 16, // Dr. No: No subtree estimates, partition layout information represented more transparently.
// ALERT ALERT ALERT: version 16 never released to customers, internal and beta use only
BRT_LAYOUT_VERSION_17 = 17, // Dr. No: Add STAT64INFO_S to brt_header
BRT_LAYOUT_VERSION_18 = 18, // Dr. No: Add HOT info to brt_header
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_13, // Minimum version supported
......
......@@ -466,6 +466,19 @@ toku_cachetable_set_lock_unlock_for_io (CACHETABLE ct, void (*ydb_lock_callback)
ct->ydb_unlock_callback = ydb_unlock_callback;
}
// Acquire the ydb lock on behalf of this cachefile, if a lock callback was
// registered via toku_cachetable_set_lock_unlock_for_io(); otherwise do nothing.
void
toku_cachetable_call_ydb_lock(CACHEFILE cf) {
    CACHETABLE ct = cf->cachetable;
    if (ct->ydb_lock_callback == NULL)
        return;
    // A lock callback without a matching unlock callback is a setup error.
    assert(ct->ydb_unlock_callback);
    ct->ydb_lock_callback();
}
// Release the ydb lock on behalf of this cachefile, if an unlock callback
// was registered; otherwise do nothing.
void
toku_cachetable_call_ydb_unlock(CACHEFILE cf) {
    CACHETABLE ct = cf->cachetable;
    if (ct->ydb_unlock_callback != NULL) {
        ct->ydb_unlock_callback();
    }
}
//
// Increment the reference count
// MUST HOLD cachetable lock
......
......@@ -523,6 +523,8 @@ char * toku_cachetable_get_fname_in_cwd(CACHETABLE ct, const char * fname_in_env
void toku_cachetable_set_lock_unlock_for_io (CACHETABLE ct, void (*ydb_lock_callback)(void), void (*ydb_unlock_callback)(void));
// Effect: When we do I/O we may need to release locks (e.g., the ydb lock). These functions release the lock acquire the lock.
void toku_cachetable_call_ydb_lock(CACHEFILE cf);
void toku_cachetable_call_ydb_unlock(CACHEFILE cf);
void cachefile_kibbutz_enq (CACHEFILE cf, void (*f)(void*), void *extra);
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef TOKU_DBUFIO_H
#define TOKU_DBUFIO_H
#ident "$Id: queue.c 20104 2010-05-12 17:22:40Z bkuszmaul $"
#ident "$Id$"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#include <toku_portability.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id: mempool.c 19902 2010-05-06 20:41:32Z bkuszmaul $"
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
#ifndef _TOKU_MEMPOOL_H
#define _TOKU_MEMPOOL_H
#ident "$Id: mempool.h 19902 2010-05-06 20:41:32Z bkuszmaul $"
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef TOKU_NBMUTEX_H
#define TOKU_NBMUTEX_H
#ident "$Id: rwlock.h 32279 2011-06-29 13:51:57Z bkuszmaul $"
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id: brt-serialize-test.c 36450 2011-11-02 20:10:18Z bperlman $"
#ident "$Id$"
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
#include "test.h"
......
#ident "$Id: cachetable-simple-verify.c 36579 2011-11-04 20:02:04Z zardosht $"
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#include "includes.h"
#include "test.h"
......
......@@ -136,7 +136,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -126,7 +126,7 @@ test_msnfilter(int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
BRTNODE newroot = make_node(brt, 0);
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Test an overflow condition on the leaf. See #632. */
......
......@@ -34,7 +34,7 @@ doit (void) {
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
r = toku_testsetup_leaf(t, &nodea);
r = toku_testsetup_leaf(t, &nodea, 1, NULL, NULL);
assert(r==0);
r = toku_testsetup_nonleaf(t, 1, &nodeb, 1, &nodea, 0, 0);
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "$Id$"
/* The goal of this test: verify that the cleaner callback flushes an internal node's buffer without merging its children, and that lookups on the leaf still succeed afterwards. */
#include "test.h"
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include "brt-flusher.h"
#include "checkpoint.h"
// No transaction / no DB handle: this test runs against the brt layer directly.
static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
// Small node size so the hand-built tree stays small and predictable.
enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
// Test-wide state shared between doit() and the helpers.
CACHETABLE ct;
BRT brt;
int fnamelen;
char *fname;
// Update callback installed on the test BRT: asserts that an old value is
// present, then replaces it with an empty DBT via set_val.
static int update_func(
    DB* UU(db),
    const DBT* key,
    const DBT* old_val,
    const DBT* UU(extra),
    void (*set_val)(const DBT *new_val, void *set_extra),
    void *set_extra)
{
    // Broadcast updates in this test are only applied to rows that exist.
    assert(old_val->size > 0);
    if (verbose)
        printf("applying update to %s\n", (char *)key->data);
    DBT empty_val;
    toku_init_dbt(&empty_val);
    set_val(&empty_val, set_extra);
    return 0;
}
// Test scenario: hand-build a 3-level tree (root -> internal -> one leaf with
// two basement nodes split at pivot "kkkkk"), load it with point messages and
// a broadcast update, then verify that the cleaner callback on the internal
// node flushes its buffer without merging, and that lookups still work.
// Unlike the sibling test, no checkpoint is taken before pinning, so the
// nodes are expected to be dirty.
static void
doit (void) {
BLOCKNUM node_leaf;
BLOCKNUM node_internal, node_root;
int r;
// Build a filename derived from this source file's name.
fnamelen = strlen(__FILE__) + 20;
fname = toku_malloc(fnamelen);
assert(fname!=0);
snprintf(fname, fnamelen, "%s.brt", __FILE__);
r = toku_brt_create_cachetable(&ct, 500*1024*1024, ZERO_LSN, NULL_LOGGER); assert(r==0);
unlink(fname);
r = toku_open_brt(fname, 1, &brt, NODESIZE, NODESIZE/2, ct, null_txn, toku_builtin_compare_fun, null_db);
assert(r==0);
toku_free(fname);
// Install the update callback both on the handle and the header so the
// broadcast update below is applied with update_func.
brt->update_fun = update_func;
brt->h->update_fun = update_func;
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
char* pivots[1];
pivots[0] = toku_strdup("kkkkk");
int pivot_len = 6;
// One leaf with 2 basement nodes, split at pivot "kkkkk".
r = toku_testsetup_leaf(brt, &node_leaf, 2, pivots, &pivot_len);
assert(r==0);
r = toku_testsetup_nonleaf(brt, 1, &node_internal, 1, &node_leaf, 0, 0);
assert(r==0);
r = toku_testsetup_nonleaf(brt, 2, &node_root, 1, &node_internal, 0, 0);
assert(r==0);
r = toku_testsetup_root(brt, node_root);
assert(r==0);
//
// at this point we have created a tree with a root, an internal node,
// and two leaf nodes, the pivot being "kkkkk"
//
// now we insert a row into each basement node of the leaf
// ("a" sorts before the pivot, "z" after it)
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"a", // key
2, // keylen
"aa",
3
);
assert(r==0);
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"z", // key
2, // keylen
"zz",
3
);
assert(r==0);
char filler[400];
memset(filler, 0, sizeof(filler));
// now we insert filler data so that the rebalance
// keeps it at two nodes
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"b", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"y", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
//
// now insert a bunch of dummy delete messages
// into the internal node, to get its cachepressure size up
//
for (int i = 0; i < 100000; i++) {
r = toku_testsetup_insert_to_nonleaf (
brt,
node_internal,
BRT_DELETE_ANY,
"jj", // this key does not exist, so its message application should be a no-op
3,
NULL,
0
);
assert(r==0);
}
//
// now insert a broadcast message into the root
//
r = toku_testsetup_insert_to_nonleaf (
brt,
node_root,
BRT_UPDATE_BROADCAST_ALL,
NULL,
0,
NULL,
0
);
assert(r==0);
// now lock and release the leaf node to make sure it is what we expect it to be.
BRTNODE node = NULL;
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_leaf,
toku_cachetable_hash(brt->h->cf, node_leaf),
&bfe,
0,
NULL,
&node
);
// No checkpoint has run yet, so the testsetup inserts left the leaf dirty
// and both basement nodes in memory.
assert(node->dirty);
assert(node->n_children == 2);
assert(BP_STATE(node,0) == PT_AVAIL);
assert(BP_STATE(node,1) == PT_AVAIL);
toku_unpin_brtnode_off_client_thread(brt->h, node);
// now do a lookup on one of the keys, this should bring a leaf node up to date
DBT k;
struct check_pair pair = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair);
assert(r==0);
//
// pin the leaf one more time
// and make sure that one basement
// node is in memory and another is
// on disk
//
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_leaf,
toku_cachetable_hash(brt->h->cf, node_leaf),
&bfe,
0,
NULL,
&node
);
// NOTE(review): the comment above says one basement node should be on disk,
// but both are asserted PT_AVAIL here (the node was never written out) —
// the dirty-path variant of this test keeps everything in memory.
assert(node->dirty);
assert(node->n_children == 2);
assert(BP_STATE(node,0) == PT_AVAIL);
assert(BP_STATE(node,1) == PT_AVAIL);
toku_unpin_brtnode_off_client_thread(brt->h, node);
//
// now let us induce a clean on the internal node
//
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
&bfe,
0,
NULL,
&node
);
assert(node->dirty);
// we expect that this flushes its buffer, that
// a merge is not done, and that the lookup
// of values "a" and "z" still works
// NOTE(review): r is assigned but not asserted here; consider checking it.
r = toku_brtnode_cleaner_callback(
node,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
brt->h
);
// verify that node_internal's buffer is empty
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
&bfe,
0,
NULL,
&node
);
// check that buffers are empty
assert(toku_bnc_nbytesinbuf(BNC(node, 0)) == 0);
toku_unpin_brtnode_off_client_thread(brt->h, node);
//
// now run a checkpoint to get everything clean,
// and to get the rebalancing to happen
//
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// check that lookups on the two keys is still good
struct check_pair pair1 = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair1);
assert(r==0);
struct check_pair pair2 = {2, "z", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "z", 2), lookup_checkf, &pair2);
assert(r==0);
r = toku_close_brt(brt, 0); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_free(pivots[0]);
}
// Test entry point: parse the standard test arguments (e.g. -v/-q for
// verbosity) and run the test body.
// Fix: argc/argv were marked __attribute__((__unused__)) even though both are
// passed to default_parse_args(); the misleading attributes are removed.
int
test_main (int argc, const char *argv[]) {
    default_parse_args(argc, argv);
    doit();
    return 0;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "$Id$"
/* The goal of this test: after a checkpoint and basement-node eviction, verify that the cleaner callback flushes an internal node's buffer without merging, and that lookups still succeed. */
#include "test.h"
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include "brt-flusher.h"
#include "checkpoint.h"
// No transaction / no DB handle: this test runs against the brt layer directly.
static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
// Small node size so the hand-built tree stays small and predictable.
enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
// Test-wide state shared between doit() and the helpers.
CACHETABLE ct;
BRT brt;
int fnamelen;
char *fname;
// Update callback installed on the test BRT.  It requires that the row being
// updated already has a value, then hands set_val an empty replacement DBT.
static int update_func(
    DB* UU(db),
    const DBT* key,
    const DBT* old_val,
    const DBT* UU(extra),
    void (*set_val)(const DBT *new_val, void *set_extra),
    void *set_extra)
{
    DBT replacement;
    // The broadcast update in this test must only touch existing rows.
    assert(old_val->size > 0);
    if (verbose) {
        printf("applying update to %s\n", (char *)key->data);
    }
    toku_init_dbt(&replacement);
    set_val(&replacement, set_extra);
    return 0;
}
// Test scenario: hand-build a 3-level tree (root -> internal -> one leaf with
// two basement nodes split at pivot "kkkkk"), load it with point messages and
// a broadcast update, checkpoint to get everything clean, evict the leaf's
// basement nodes, then verify that the cleaner callback on the internal node
// flushes its buffer without merging, partially reading the leaf back from
// disk, and that lookups still work.
static void
doit (void) {
BLOCKNUM node_leaf;
BLOCKNUM node_internal, node_root;
int r;
// Build a filename derived from this source file's name.
fnamelen = strlen(__FILE__) + 20;
fname = toku_malloc(fnamelen);
assert(fname!=0);
snprintf(fname, fnamelen, "%s.brt", __FILE__);
r = toku_brt_create_cachetable(&ct, 500*1024*1024, ZERO_LSN, NULL_LOGGER); assert(r==0);
unlink(fname);
r = toku_open_brt(fname, 1, &brt, NODESIZE, NODESIZE/2, ct, null_txn, toku_builtin_compare_fun, null_db);
assert(r==0);
toku_free(fname);
// Install the update callback both on the handle and the header so the
// broadcast update below is applied with update_func.
brt->update_fun = update_func;
brt->h->update_fun = update_func;
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
char* pivots[1];
pivots[0] = toku_strdup("kkkkk");
int pivot_len = 6;
// One leaf with 2 basement nodes, split at pivot "kkkkk".
r = toku_testsetup_leaf(brt, &node_leaf, 2, pivots, &pivot_len);
assert(r==0);
r = toku_testsetup_nonleaf(brt, 1, &node_internal, 1, &node_leaf, 0, 0);
assert(r==0);
r = toku_testsetup_nonleaf(brt, 2, &node_root, 1, &node_internal, 0, 0);
assert(r==0);
r = toku_testsetup_root(brt, node_root);
assert(r==0);
//
// at this point we have created a tree with a root, an internal node,
// and two leaf nodes, the pivot being "kkkkk"
//
// now we insert a row into each basement node of the leaf
// ("a" sorts before the pivot, "z" after it)
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"a", // key
2, // keylen
"aa",
3
);
assert(r==0);
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"z", // key
2, // keylen
"zz",
3
);
assert(r==0);
char filler[400];
memset(filler, 0, sizeof(filler));
// now we insert filler data so that the rebalance
// keeps it at two nodes
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"b", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
r = toku_testsetup_insert_to_leaf (
brt,
node_leaf,
"y", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
//
// now insert a bunch of dummy delete messages
// into the internal node, to get its cachepressure size up
//
for (int i = 0; i < 100000; i++) {
r = toku_testsetup_insert_to_nonleaf (
brt,
node_internal,
BRT_DELETE_ANY,
"jj", // this key does not exist, so its message application should be a no-op
3,
NULL,
0
);
assert(r==0);
}
//
// now insert a broadcast message into the root
//
r = toku_testsetup_insert_to_nonleaf (
brt,
node_root,
BRT_UPDATE_BROADCAST_ALL,
NULL,
0,
NULL,
0
);
assert(r==0);
//
// now run a checkpoint to get everything clean
//
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// now lock and release the leaf node to make sure it is what we expect it to be.
BRTNODE node = NULL;
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_leaf,
toku_cachetable_hash(brt->h->cf, node_leaf),
&bfe,
0,
NULL,
&node
);
// The checkpoint above wrote the node out, so it must be clean now.
assert(!node->dirty);
assert(node->n_children == 2);
// a hack to get the basement nodes evicted
// (repeatedly invoke the partial-eviction callback with a huge attr)
for (int i = 0; i < 20; i++) {
PAIR_ATTR attr;
toku_brtnode_pe_callback(node, make_pair_attr(0xffffffff), &attr, NULL);
}
// this ensures that when we do the lookups below,
// that the data is read off disk
assert(BP_STATE(node,0) == PT_ON_DISK);
assert(BP_STATE(node,1) == PT_ON_DISK);
toku_unpin_brtnode_off_client_thread(brt->h, node);
// now do a lookup on one of the keys, this should bring a leaf node up to date
DBT k;
struct check_pair pair = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair);
assert(r==0);
//
// pin the leaf one more time
// and make sure that one basement
// node is in memory and another is
// on disk
//
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_leaf,
toku_cachetable_hash(brt->h->cf, node_leaf),
&bfe,
0,
NULL,
&node
);
// Only the basement node containing "a" was fetched by the lookup.
assert(!node->dirty);
assert(node->n_children == 2);
assert(BP_STATE(node,0) == PT_AVAIL);
assert(BP_STATE(node,1) == PT_ON_DISK);
toku_unpin_brtnode_off_client_thread(brt->h, node);
//
// now let us induce a clean on the internal node
//
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
&bfe,
0,
NULL,
&node
);
assert(!node->dirty);
// we expect that this flushes its buffer, that
// a merge is not done, and that the lookup
// of values "a" and "z" still works
// NOTE(review): r is assigned but not asserted here; consider checking it.
r = toku_brtnode_cleaner_callback(
node,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
brt->h
);
// verify that node_internal's buffer is empty
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
brt->h,
node_internal,
toku_cachetable_hash(brt->h->cf, node_internal),
&bfe,
0,
NULL,
&node
);
// check that buffers are empty
assert(toku_bnc_nbytesinbuf(BNC(node, 0)) == 0);
toku_unpin_brtnode_off_client_thread(brt->h, node);
//
// now run a checkpoint to get everything clean,
// and to get the rebalancing to happen
//
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// check that lookups on the two keys is still good
struct check_pair pair1 = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair1);
assert(r==0);
struct check_pair pair2 = {2, "z", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "z", 2), lookup_checkf, &pair2);
assert(r==0);
r = toku_close_brt(brt, 0); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_free(pivots[0]);
}
// Test entry point: parse the standard test arguments (e.g. -v/-q for
// verbosity) and run the test body.
// Fix: argc/argv were marked __attribute__((__unused__)) even though both are
// passed to default_parse_args(); the misleading attributes are removed.
int
test_main (int argc, const char *argv[]) {
    default_parse_args(argc, argv);
    doit();
    return 0;
}
......@@ -65,7 +65,7 @@ doit (int ksize __attribute__((__unused__))) {
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
for (i=0; i<BRT_FANOUT; i++) {
r=toku_testsetup_leaf(t, &cnodes[i]);
r=toku_testsetup_leaf(t, &cnodes[i], 1, NULL, NULL);
assert(r==0);
char key[KSIZE+10];
int keylen = 1+snprintf(key, KSIZE, "%08d%0*d", i*10000+1, KSIZE-9, 0);
......
#ident "$Id: test-del-inorder.c 32975 2011-07-11 23:42:51Z leifwalsh $"
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "$Id: test-merges-on-cleaner.c 38542 2012-01-06 14:06:23Z christianrober $"
/* The goal of this test. Make sure that inserts stay behind deletes. */
#include "test.h"
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include "brt-flusher.h"
#include "checkpoint.h"
static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
......@@ -58,9 +62,9 @@ doit (void) {
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
r = toku_testsetup_leaf(brt, &node_leaf[0]);
r = toku_testsetup_leaf(brt, &node_leaf[0], 1, NULL, NULL);
assert(r==0);
r = toku_testsetup_leaf(brt, &node_leaf[1]);
r = toku_testsetup_leaf(brt, &node_leaf[1], 1, NULL, NULL);
assert(r==0);
char* pivots[1];
......@@ -70,7 +74,7 @@ doit (void) {
r = toku_testsetup_nonleaf(brt, 1, &node_internal, 2, node_leaf, pivots, &pivot_len);
assert(r==0);
r = toku_testsetup_nonleaf(brt, 1, &node_root, 1, &node_internal, 0, 0);
r = toku_testsetup_nonleaf(brt, 2, &node_root, 1, &node_internal, 0, 0);
assert(r==0);
r = toku_testsetup_root(brt, node_root);
......@@ -132,16 +136,21 @@ doit (void) {
);
assert(r==0);
//
// now let us induce a clean on the internal node
//
BRTNODE node;
toku_pin_node_with_min_bfe(&node, node_leaf[1], brt);
// hack to get merge going
BLB_SEQINSERT(node, node->n_children-1) = FALSE;
toku_unpin_brtnode(brt, node);
// now do a lookup on one of the keys, this should bring a leaf node up to date
DBT k;
struct check_pair pair = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair);
assert(r==0);
//
// now let us induce a clean on the internal node
//
BRTNODE node;
struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h);
toku_pin_brtnode_off_client_thread(
......@@ -175,12 +184,19 @@ doit (void) {
NULL,
&node
);
// check that no merge happened
assert(node->n_children == 2);
// check that merge happened
assert(node->n_children == 1);
// check that buffers are empty
assert(toku_bnc_nbytesinbuf(BNC(node, 0)) == 0);
assert(toku_bnc_nbytesinbuf(BNC(node, 1)) == 0);
toku_unpin_brtnode_off_client_thread(brt->h, node);
//
// now run a checkpoint to get everything clean,
// and to get the rebalancing to happen
//
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// check that lookups on the two keys is still good
struct check_pair pair1 = {2, "a", 0, NULL, 0};
r = toku_brt_lookup(brt, toku_fill_dbt(&k, "a", 2), lookup_checkf, &pair1);
......@@ -198,6 +214,7 @@ doit (void) {
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
default_parse_args(argc, argv);
doit();
return 0;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* The goal of this test: verify that flush_some_child flushes exactly the child chosen by the flusher advice, including recursive flushes from the root and flushes into a partially-compressed child. */
#include "test.h"
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include "brt-flusher.h"
#include "brt-flusher-internal.h"
#include "checkpoint.h"
// No transaction / no DB handle: this test runs against the brt layer directly.
static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
// Small node size so the hand-built tree stays small and predictable.
enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
// Test-wide state shared between doit() and the flusher-advice callbacks.
CACHETABLE ct;
BRT t;
int fnamelen;
char *fname;
// Which child of the internal node the child_to_flush callback should pick.
int curr_child_to_flush;
// Counts calls to the update_status callback, i.e. how many flushes ran.
int num_flushes_called;
// Flusher-advice callback: pick which child of `parent` to flush.
// The test tree has a root (height 2) with one child and an internal node
// (height 1) with two children; the test selects the internal node's child
// through the global curr_child_to_flush.
static int child_to_flush(struct brt_header* UU(h), BRTNODE parent, void* UU(extra)) {
    switch (parent->height) {
    case 1:
        // internal node: two children, flush the one the test selected
        assert(parent->n_children == 2);
        return curr_child_to_flush;
    case 2:
        // root: exactly one child
        assert(parent->n_children == 1);
        return 0;
    default:
        // no other heights exist in this tree
        assert(FALSE);
    }
    return curr_child_to_flush;
}
// Flusher-advice callback invoked once per flush; counts invocations so the
// test can verify exactly how many flushes happened.
static void update_status(BRTNODE UU(child), int UU(dirtied), void* UU(extra)) {
    ++num_flushes_called;
}
// Flusher-advice callback: never allow the flusher to destroy basement nodes.
static bool dont_destroy_bn(void* UU(extra)) { return false; }
// Flusher-advice callback: this test never expects a child merge, so reaching
// this function at all is a test failure.
static void merge_should_not_happen(struct flusher_advice* UU(fa), struct brt_header* UU(h),
                                    BRTNODE UU(parent), int UU(childnum),
                                    BRTNODE UU(child), void* UU(extra)) {
    assert(FALSE);
}
// Flusher-advice callback: the first phase of this test never expects a
// recursive flush, so reaching this function is a test failure.
static bool recursively_flush_should_not_happen(BRTNODE UU(child), void* UU(extra)) {
    assert(FALSE);
    // Fix: the function previously fell off the end of a non-void function.
    // That is undefined behavior if the caller uses the result when asserts
    // are compiled out (NDEBUG); return an explicit value on that path.
    return false;
}
// Flusher-advice callback: always recurse into and flush the child.
static bool always_flush(BRTNODE UU(child), void* UU(extra)) { return true; }
// Test scenario: hand-build a tree (root -> internal -> two leaves, pivot
// "kkkkk"), then drive flush_some_child with custom flusher advice and verify
// that it flushes exactly the child the advice picks, that empty-buffer
// flushes are handled, that flushes recurse from the root down both paths,
// and that flushing into a partially-compressed child does not crash.
static void
doit (void) {
BLOCKNUM node_internal, node_root;
BLOCKNUM node_leaf[2];
int r;
// Build a filename derived from this source file's name.
fnamelen = strlen(__FILE__) + 20;
fname = toku_malloc(fnamelen);
assert(fname!=0);
snprintf(fname, fnamelen, "%s.brt", __FILE__);
r = toku_brt_create_cachetable(&ct, 500*1024*1024, ZERO_LSN, NULL_LOGGER); assert(r==0);
unlink(fname);
r = toku_open_brt(fname, 1, &t, NODESIZE, NODESIZE/2, ct, null_txn, toku_builtin_compare_fun, null_db);
assert(r==0);
toku_free(fname);
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
r = toku_testsetup_leaf(t, &node_leaf[0], 1, NULL, NULL);
assert(r==0);
r = toku_testsetup_leaf(t, &node_leaf[1], 1, NULL, NULL);
assert(r==0);
char* pivots[1];
pivots[0] = toku_strdup("kkkkk");
int pivot_len = 6;
// internal node (height 1) with the two leaves as children, split at "kkkkk"
r = toku_testsetup_nonleaf(t, 1, &node_internal, 2, node_leaf, pivots, &pivot_len);
assert(r==0);
// root (height 2) with the internal node as its only child
r = toku_testsetup_nonleaf(t, 2, &node_root, 1, &node_internal, 0, 0);
assert(r==0);
r = toku_testsetup_root(t, node_root);
assert(r==0);
char filler[900];
memset(filler, 0, sizeof(filler));
// now we insert filler data so that a merge does not happen
r = toku_testsetup_insert_to_leaf (
t,
node_leaf[0],
"b", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
r = toku_testsetup_insert_to_leaf (
t,
node_leaf[1],
"y", // key
2, // keylen
filler,
sizeof(filler)
);
assert(r==0);
// make buffers in internal node non-empty
// ("a" targets leaf 0, "z" targets leaf 1)
r = toku_testsetup_insert_to_nonleaf(
t,
node_internal,
BRT_INSERT,
"a",
2,
NULL,
0
);
assert_zero(r);
r = toku_testsetup_insert_to_nonleaf(
t,
node_internal,
BRT_INSERT,
"z",
2,
NULL,
0
);
assert_zero(r);
//
// now run a checkpoint to get everything clean
//
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// now with setup done, start the test
// test that if flush_some_child properly honors
// what we say and flushes the child we pick
BRTNODE node = NULL;
toku_pin_node_with_min_bfe(&node, node_internal, t);
toku_assert_entire_node_in_memory(node);
assert(node->n_children == 2);
assert(!node->dirty);
assert(toku_bnc_n_entries(node->bp[0].ptr.u.nonleaf) > 0);
assert(toku_bnc_n_entries(node->bp[1].ptr.u.nonleaf) > 0);
// advice: flush only the chosen child, never recurse, never merge
struct flusher_advice fa;
flusher_advice_init(
&fa,
child_to_flush,
dont_destroy_bn,
recursively_flush_should_not_happen,
merge_should_not_happen,
update_status,
default_pick_child_after_split,
NULL
);
curr_child_to_flush = 0;
num_flushes_called = 0;
// NOTE(review): flush_some_child appears to unpin `node`; it is re-pinned below.
flush_some_child(t->h, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
toku_assert_entire_node_in_memory(node);
assert(node->dirty);
assert(node->n_children == 2);
// child 0 should have empty buffer because it flushed
// child 1 should still have message in buffer
assert(toku_bnc_n_entries(node->bp[0].ptr.u.nonleaf) == 0);
assert(toku_bnc_n_entries(node->bp[1].ptr.u.nonleaf) > 0);
toku_unpin_brtnode(t, node);
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
// repeat for the other child
toku_pin_node_with_min_bfe(&node, node_internal, t);
assert(!node->dirty);
curr_child_to_flush = 1;
num_flushes_called = 0;
flush_some_child(t->h, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
assert(node->dirty);
toku_assert_entire_node_in_memory(node);
assert(node->n_children == 2);
// both buffers should be empty now
assert(toku_bnc_n_entries(node->bp[0].ptr.u.nonleaf) == 0);
assert(toku_bnc_n_entries(node->bp[1].ptr.u.nonleaf) == 0);
// now let's do a flush with an empty buffer, make sure it is ok
toku_unpin_brtnode(t, node);
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
toku_pin_node_with_min_bfe(&node, node_internal, t);
assert(!node->dirty);
curr_child_to_flush = 0;
num_flushes_called = 0;
flush_some_child(t->h, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
assert(!node->dirty); // nothing was flushed, so node better not be dirty
toku_assert_entire_node_in_memory(node);
assert(node->n_children == 2);
// both buffers should be empty now
assert(toku_bnc_n_entries(node->bp[0].ptr.u.nonleaf) == 0);
assert(toku_bnc_n_entries(node->bp[1].ptr.u.nonleaf) == 0);
toku_unpin_brtnode(t, node);
// now let's start a flush from the root, that always recursively flushes
flusher_advice_init(
&fa,
child_to_flush,
dont_destroy_bn,
always_flush,
merge_should_not_happen,
update_status,
default_pick_child_after_split,
NULL
);
// use a for loop so to get us down both paths
for (int i = 0; i < 2; i++) {
toku_pin_node_with_min_bfe(&node, node_root, t);
toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = i;
num_flushes_called = 0;
flush_some_child(t->h, node, &fa);
// two flushes: root -> internal, then internal -> leaf i
assert(num_flushes_called == 2);
toku_pin_node_with_min_bfe(&node, node_internal, t);
assert(!node->dirty); // nothing was flushed, so node better not be dirty
toku_unpin_brtnode(t, node);
toku_pin_node_with_min_bfe(&node, node_leaf[0], t);
assert(!node->dirty); // nothing was flushed, so node better not be dirty
toku_unpin_brtnode(t, node);
toku_pin_node_with_min_bfe(&node, node_leaf[1], t);
assert(!node->dirty); // nothing was flushed, so node better not be dirty
toku_unpin_brtnode(t, node);
}
// now one more test to show a bug was fixed
// if there is nothing to flush from parent to child,
// and child is not fully in memory, we used to crash
// so, to make sure that is fixed, let's get internal to not
// be fully in memory, and make sure the above test works
// a hack to get internal compressed
r = toku_testsetup_insert_to_nonleaf(
t,
node_internal,
BRT_INSERT,
"c",
2,
NULL,
0
);
assert_zero(r);
r = toku_checkpoint(ct, NULL, NULL, NULL, NULL, NULL, CLIENT_CHECKPOINT);
assert_zero(r);
toku_pin_node_with_min_bfe(&node, node_internal, t);
// repeatedly invoke the partial-eviction callback to compress partitions
for (int i = 0; i < 20; i++) {
PAIR_ATTR attr;
toku_brtnode_pe_callback(node, make_pair_attr(0xffffffff), &attr, NULL);
}
assert(BP_STATE(node,0) == PT_COMPRESSED);
toku_unpin_brtnode(t, node);
//now let's do the same test as above
toku_pin_node_with_min_bfe(&node, node_root, t);
toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = 0;
num_flushes_called = 0;
flush_some_child(t->h, node, &fa);
assert(num_flushes_called == 2);
r = toku_close_brt(t, 0); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_free(pivots[0]);
}
// Test entry point.  Command-line arguments are ignored by this test, hence
// the __unused__ attributes on both parameters.
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
doit();
return 0;
}
......@@ -10,6 +10,7 @@
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include <brt-flusher.h>
// Some constants to be used in calculations below
static const int nodesize = 1024; // Target max node size
......@@ -27,8 +28,6 @@ static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
static const char fname[]= __FILE__ ".brt";
static BRT_STATUS_S my_brt_status;
static int omt_long_cmp(OMTVALUE p, void *q)
{
LEAFENTRY a = p, b = q;
......@@ -172,7 +171,7 @@ test_split_on_boundary(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
verify_basement_node_msns(nodea, dummy_msn_3884);
verify_basement_node_msns(nodeb, dummy_msn_3884);
......@@ -245,7 +244,7 @@ test_split_with_everything_on_the_left(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
toku_unpin_brtnode(brt, nodeb);
r = toku_close_brt(brt, NULL); assert(r == 0);
......@@ -320,7 +319,7 @@ test_split_on_boundary_of_last_node(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
toku_unpin_brtnode(brt, nodeb);
r = toku_close_brt(brt, NULL); assert(r == 0);
......@@ -388,7 +387,7 @@ test_split_at_begin(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
toku_unpin_brtnode(brt, nodeb);
r = toku_close_brt(brt, NULL); assert(r == 0);
......@@ -452,7 +451,7 @@ test_split_at_end(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
toku_unpin_brtnode(brt, nodeb);
r = toku_close_brt(brt, NULL); assert(r == 0);
......@@ -506,7 +505,7 @@ test_split_odd_nodes(void)
BRTNODE nodea, nodeb;
DBT splitk;
// if we haven't done it right, we should hit the assert in the top of move_leafentries
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL, &my_brt_status);
brtleaf_split(brt->h, &sn, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
verify_basement_node_msns(nodea, dummy_msn_3884);
verify_basement_node_msns(nodeb, dummy_msn_3884);
......
......@@ -35,7 +35,7 @@ doit (void) {
toku_testsetup_initialize(); // must precede any other toku_testsetup calls
r = toku_testsetup_leaf(t, &node_leaf);
r = toku_testsetup_leaf(t, &node_leaf, 1, NULL, NULL);
assert(r==0);
r = toku_testsetup_nonleaf(t, 1, &node_internal, 1, &node_leaf, 0, 0);
......
......@@ -142,7 +142,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -112,7 +112,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -66,7 +66,7 @@ test_dup_in_leaf(int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
BRTNODE newroot = make_node(brt, 0);
populate_leaf(newroot, htonl(2), 1);
......
......@@ -112,7 +112,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -127,7 +127,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -66,7 +66,7 @@ test_dup_in_leaf(int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
BRTNODE newroot = make_node(brt, 0);
populate_leaf(newroot, htonl(2), 1);
......
......@@ -112,7 +112,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// discard the old root block
u_int32_t fullhash = 0;
CACHEKEY *rootp;
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash);
// set the new root to point to the new tree
*rootp = newroot->thisnodename;
......
......@@ -7,7 +7,7 @@
#ifndef TOKU_ULE_INTERNAL_H
#define TOKU_ULE_INTERNAL_H
#ident "$Id: ule.h 24600 2010-10-15 15:22:18Z dwells $"
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......
......@@ -136,6 +136,7 @@ BDB_DONTRUN_TESTS = \
hotindexer-simple-abort \
hotindexer-undo-do-test \
hotindexer-with-queries \
hot-optimize-table-tests \
insert-dup-prelock \
isolation \
isolation-read-committed \
......@@ -278,6 +279,7 @@ BDB_DONTRUN_TESTS = \
test_stress4 \
test_stress5 \
test_stress6 \
test_stress7 \
test_stress_with_verify \
test_transactional_descriptor \
test_trans_desc_during_chkpt \
......@@ -355,6 +357,7 @@ DEPENDS_ON_STRESS_HELPERS = \
test_stress4 \
test_stress5 \
test_stress6 \
test_stress7 \
#blank
$(patsubst %,%.tdb,$(DEPENDS_ON_STRESS_HELPERS)): threaded_stress_test_helpers.h
......@@ -682,6 +685,7 @@ test_update_broadcast_stress.tdbrun: VGRIND=
test_update_stress.tdbrun: VGRIND=
stress-test.tdbrun: VGRIND=
stress-test.bdbrun: VGRIND=
hot-optimize-table-tests.tdbrun: VGRIND=
libs:
......
/* -*- mode: C; c-basic-offset: 4 -*- */
// hot-optimize-table-tests.c
#include "test.h"
#include "includes.h"
#include <brt-cachetable-wrappers.h>
#include "db.h"
#include "ydb.h"
const int envflags = DB_INIT_MPOOL |
DB_CREATE |
DB_THREAD |
DB_INIT_LOCK |
DB_INIT_LOG |
DB_INIT_TXN |
DB_PRIVATE;
DB_ENV* env;
unsigned int leaf_hits;
// Custom Update Function for our test BRT.
// Update callback installed on the test environment (see hot_test_setup).
// It never calls set_val(), so stored values are left unchanged; instead it
// records each visit in the caller's x_results[] array (address passed via
// `extra`) and asserts that the broadcast update touches every key exactly
// once.  `leaf_hits` counts total invocations across the whole run.
static int
update_func(DB* UU(db),
            const DBT* key,
            const DBT* old_val,
            const DBT* extra,
            void (*set_val)(const DBT* new_val, void* set_extra) __attribute__((unused)),
            void* UU(set_extra))
{
    unsigned int *x_results;
    // `extra` carries a pointer to the results array, by address.
    assert(extra->size == sizeof x_results);
    x_results = *(unsigned int **) extra->data;
    assert(x_results);
    assert(old_val->size > 0);
    unsigned int* indexptr;
    // Keys are unsigned ints; the key value indexes x_results directly.
    assert(key->size == (sizeof *indexptr));
    indexptr = (unsigned int*)key->data;
    ++leaf_hits;
    if (verbose && x_results[*indexptr] != 0) {
        printf("x_results = %p, indexptr = %p, *indexptr = %u, x_results[*indexptr] = %u\n", x_results, indexptr, *indexptr, x_results[*indexptr]);
    }
    // Each key must be visited exactly once; a repeat visit trips this.
    assert(x_results[*indexptr] == 0);
    x_results[*indexptr]++;
    unsigned int i = *indexptr;
    if (verbose && ((i + 1) % 50000 == 0)) {
        printf("applying update to %u\n", i);
    }
    return 0;
}
///
// Build a fresh TokuDB environment in ENVDIR and open it into the global
// `env`: wipes any previous directory, installs the numeric key comparator
// and the update_func broadcast-update callback, then opens with envflags.
static void
hot_test_setup(void)
{
int r = 0;
// Remove any previous environment.
CHK(system("rm -rf " ENVDIR));
// Set up a new TokuDB.
CHK(toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO));
CHK(db_env_create(&env, 0));
env->set_errfile(env, stderr);
// Keys are unsigned ints; compare them numerically, not bytewise.
r = env->set_default_bt_compare(env, uint_dbt_cmp);CKERR(r);
// Install the callback exercised by db->update_broadcast in hot_test().
env->set_update(env, update_func);
CHK(env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO));
}
///
// Insert `key_count` sequential unsigned-int keys into `db`, each carrying
// the same fixed-size zero-filled payload, all inside a single transaction.
// Progress is printed every `limit` rows when verbose.
static void
hot_insert_keys(DB* db, unsigned int key_count)
{
    int r = 0;
    DB_TXN * xact;
    // Report rarely for large runs, on every key for tiny runs.
    unsigned int limit = (key_count > 10) ? 100000 : 1;
    // Dummy data: one zeroed buffer shared by every row's value.
    const unsigned int DUMMY_SIZE = 100;
    size_t size = DUMMY_SIZE;
    char* dummy = toku_xmalloc(size);
    memset(dummy, 0, size);
    // Start the transaction for insertions.
    r = env->txn_begin(env, 0, &xact, 0); CKERR(r);
    unsigned int key;
    DBT key_thing;
    DBT *keyptr = dbt_init(&key_thing, &key, sizeof(key));
    DBT value_thing;
    DBT *valueptr = dbt_init(&value_thing, dummy, size);
    for (key = 0; key < key_count; ++key)
    {
        // keyptr aliases `key`, so each iteration puts the current value.
        CHK(db->put(db, xact, keyptr, valueptr, 0));
        if (verbose && (key + 1) % limit == 0) {
            printf("%u Elements inserted.\n", key + 1);  // %u: key is unsigned
        }
    }
    // Commit the insert transaction.
    r = xact->commit(xact, 0); CKERR(r);
    toku_free(dummy);
}
///
// Create (or open with DB_CREATE) the dictionary named `c` in the global
// environment inside its own short transaction; the handle is returned
// through *db.  Note: does not close any handle *db previously pointed at.
static void
hot_create_db(DB** db, const char* c)
{
    int r = 0;
    DB_TXN* xact;
    if (verbose) printf("Creating DB.\n");
    r = env->txn_begin(env, 0, &xact, 0); CKERR(r);
    CHK(db_create(db, env, 0));
    CHK((*db)->open((*db), xact, c, NULL, DB_BTREE, DB_CREATE, 0666));
    r = xact->commit(xact, 0); CKERR(r);
    if (verbose) printf("DB Created.\n");
}
///
// Core HOT test: load `size` rows, broadcast an update (which counts a
// visit per key in x_results via update_func), run hot_optimize to flatten
// the tree, then verify every key was updated exactly once.
static void
hot_test(DB* db, unsigned int size)
{
    int r = 0;
    leaf_hits = 0;
    if (verbose) printf("Insert some data.\n");
    // Insert our keys to assemble the tree.
    hot_insert_keys(db, size);
    // Insert Broadcast Message.
    if (verbose) printf("Insert Broadcast Message.\n");
    unsigned int *XMALLOC_N(size, x_results);
    memset(x_results, 0, (sizeof x_results[0]) * size);
    DBT extra;
    // Pass the ADDRESS of x_results; update_func dereferences it.
    DBT *extrap = dbt_init(&extra, &x_results, sizeof x_results);
    DB_TXN * xact;
    r = env->txn_begin(env, 0, &xact, 0); CKERR(r);
    CHK(db->update_broadcast(db, xact, extrap, 0));
    r = xact->commit(xact, 0); CKERR(r);
    // Flatten the tree.
    if (verbose) printf("Calling hot optimize...\n");
    r = db->hot_optimize(db, NULL, NULL);
    assert(r == 0);
    if (verbose) printf("HOT Finished!\n");
    // Every key must have been visited exactly once by the broadcast.
    for (unsigned int i = 0; i < size; ++i) {
        assert(x_results[i] == 1);
    }
    if (verbose) printf("Leaves hit = %u\n", leaf_hits);
    toku_free(x_results);
}
///
int
test_main(int argc, char * const argv[])
{
int r = 0;
default_parse_args(argc, argv);
hot_test_setup();
// Create and Open the Database/BRT
DB *db = NULL;
const unsigned int BIG = 4000000;
const unsigned int SMALL = 10;
const unsigned int NONE = 0;
hot_create_db(&db, "none.db");
hot_test(db, NONE);
hot_create_db(&db, "small.db");
hot_test(db, SMALL);
hot_create_db(&db, "big.db");
hot_test(db, BIG);
verbose ? printf("Exiting Test.\n") : 0;
return r;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35324 2011-10-04 01:48:45Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2009 Tokutek Inc. All rights reserved."
#ident "$Id: env_startup.c 20778 2010-05-28 20:38:42Z yfogel $"
#ident "$Id$"
/* Purpose of this test is to verify that a failed assert will
* cause a panic, which should be visible via engine status.
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress1.c 35109 2011-09-27 18:41:25Z leifwalsh $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress2.c 35151 2011-09-29 01:32:27Z zardosht $"
#ident "$Id$"
#include "test.h"
#include <stdio.h>
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#ident "$Id: test_stress7.c 38515 2012-01-05 20:48:10Z leifwalsh $"
#include "test.h"
#include <stdio.h>
#include <stdlib.h>
#include <toku_pthread.h>
#include <unistd.h>
#include <memory.h>
#include <sys/stat.h>
#include <db.h>
#include "threaded_stress_test_helpers.h"
// Stress workload: two table scanners (one fast forward, one slow
// backward), a background HOT-optimize thread, a keyrange prober, plus
// the configured numbers of update threads and point-query threads,
// all running concurrently for the duration of the test.
static void
stress_table(DB_ENV *env, DB **dbp, struct cli_args *cli_args) {
    int num_elements = cli_args->num_elements;
    //
    // do insertions and queries with a loader lying around doing stuff
    //
    if (verbose) printf("starting creation of pthreads\n");
    // Thread layout: slots 0-3 are fixed roles, then updaters, then queriers.
    const int update_base = 4;
    const int ptquery_base = update_base + cli_args->num_update_threads;
    const int num_threads = ptquery_base + cli_args->num_ptquery_threads;
    struct arg myargs[num_threads];
    for (int t = 0; t < num_threads; t++) {
        arg_init(&myargs[t], num_elements, dbp, env, cli_args);
    }
    // Slot 0: the forward fast scanner.
    myargs[0].fast = TRUE;
    myargs[0].fwd = TRUE;
    myargs[0].operation = scan_op;
    // Slot 1: the backward slow scanner.
    myargs[1].fast = FALSE;
    myargs[1].fwd = FALSE;
    myargs[1].operation = scan_op;
    // Slot 2: runs HOT in the background; slot 3: keyrange queries.
    myargs[2].operation = hot_op;
    myargs[3].operation = keyrange_op;
    for (int t = update_base; t < ptquery_base; ++t) {
        myargs[t].operation = update_op;
    }
    // Remaining slots do point queries.
    for (int t = ptquery_base; t < num_threads; t++) {
        myargs[t].operation = ptquery_op;
    }
    run_workers(myargs, num_threads, cli_args->time_of_test, false);
}
// Stress-test entry point: configure defaults, apply command-line
// overrides, and run the threaded stress harness.
int
test_main(int argc, char *const argv[]) {
struct cli_args args = DEFAULT_ARGS;
// Checkpoint every 1 second.  NOTE(review): the original comment said
// "really slow", but a period of 1 is a very *frequent* checkpoint
// interval; presumably "fast" was intended -- confirm intent.
args.checkpointing_period = 1;
parse_stress_test_args(argc, argv, &args);
stress_test_main(&args);
return 0;
}
......@@ -555,6 +555,14 @@ static int UU() update_broadcast_op(DB_ENV *UU(env), DB **dbp, DB_TXN *txn, ARG
return r;
}
// Stress-test operation: run one full HOT optimize pass over the table.
// No transaction is needed; hot_optimize manages its own locking.
static int UU() hot_op(DB_ENV *UU(env), DB **dbp, DB_TXN *UU(txn), ARG UU(arg)) {
    DB *const table = *dbp;
    int rc = table->hot_optimize(table, NULL, NULL);
    CKERR(rc);
    return rc;
}
static int UU() remove_and_recreate_me(DB_ENV *env, DB **dbp, DB_TXN *UU(txn), ARG UU(arg)) {
int r;
r = (*dbp)->close(*dbp, 0); CKERR(r);
......
......@@ -23,6 +23,7 @@ const char *toku_copyright_string = "Copyright (c) 2007-2009 Tokutek Inc. All r
#include "ydb.h"
#include "ydb-internal.h"
#include "brt-internal.h"
#include "brt-flusher.h"
#include "cachetable.h"
#include "log.h"
#include "memory.h"
......@@ -2065,43 +2066,18 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat, char * env_panic_st
engstat->search_root_retries = brt_stat.search_root_retries;
engstat->search_tries_gt_height = brt_stat.search_tries_gt_height;
engstat->search_tries_gt_heightplus3 = brt_stat.search_tries_gt_heightplus3;
engstat->cleaner_total_nodes = brt_stat.cleaner_total_nodes;
engstat->cleaner_h1_nodes = brt_stat.cleaner_h1_nodes;
engstat->cleaner_hgt1_nodes = brt_stat.cleaner_hgt1_nodes;
engstat->cleaner_empty_nodes = brt_stat.cleaner_empty_nodes;
engstat->cleaner_nodes_dirtied = brt_stat.cleaner_nodes_dirtied;
engstat->cleaner_max_buffer_size = brt_stat.cleaner_max_buffer_size;
engstat->cleaner_min_buffer_size = brt_stat.cleaner_min_buffer_size;
engstat->cleaner_total_buffer_size = brt_stat.cleaner_total_buffer_size;
engstat->cleaner_max_buffer_workdone = brt_stat.cleaner_max_buffer_workdone;
engstat->cleaner_min_buffer_workdone = brt_stat.cleaner_min_buffer_workdone;
engstat->cleaner_total_buffer_workdone = brt_stat.cleaner_total_buffer_workdone;
engstat->cleaner_num_leaves_unmerged = brt_stat.cleaner_num_leaves_unmerged;
engstat->flush_total = brt_stat.flush_total;
engstat->flush_in_memory = brt_stat.flush_in_memory;
engstat->flush_needed_io = brt_stat.flush_needed_io;
engstat->flush_cascades = brt_stat.flush_cascades;
engstat->flush_cascades_1 = brt_stat.flush_cascades_1;
engstat->flush_cascades_2 = brt_stat.flush_cascades_2;
engstat->flush_cascades_3 = brt_stat.flush_cascades_3;
engstat->flush_cascades_4 = brt_stat.flush_cascades_4;
engstat->flush_cascades_5 = brt_stat.flush_cascades_5;
engstat->flush_cascades_gt_5 = brt_stat.flush_cascades_gt_5;
engstat->disk_flush_leaf = brt_stat.disk_flush_leaf;
engstat->disk_flush_nonleaf = brt_stat.disk_flush_nonleaf;
engstat->disk_flush_leaf_for_checkpoint = brt_stat.disk_flush_leaf_for_checkpoint;
engstat->disk_flush_nonleaf_for_checkpoint = brt_stat.disk_flush_nonleaf_for_checkpoint;
engstat->create_leaf = brt_stat.create_leaf;
engstat->create_nonleaf = brt_stat.create_nonleaf;
engstat->create_leaf = brt_stat.create_leaf;
engstat->create_nonleaf = brt_stat.create_nonleaf;
engstat->destroy_leaf = brt_stat.destroy_leaf;
engstat->destroy_nonleaf = brt_stat.destroy_nonleaf;
engstat->split_leaf = brt_stat.split_leaf;
engstat->split_nonleaf = brt_stat.split_nonleaf;
engstat->merge_leaf = brt_stat.merge_leaf;
engstat->merge_nonleaf = brt_stat.merge_nonleaf;
engstat->dirty_leaf = brt_stat.dirty_leaf;
engstat->dirty_nonleaf = brt_stat.dirty_nonleaf;
engstat->balance_leaf = brt_stat.balance_leaf;
engstat->msg_bytes_in = brt_stat.msg_bytes_in;
engstat->msg_bytes_out = brt_stat.msg_bytes_out;
engstat->msg_bytes_curr = brt_stat.msg_bytes_curr;
......@@ -2128,6 +2104,45 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat, char * env_panic_st
engstat->num_msg_buffer_fetched_prefetch = brt_stat.num_msg_buffer_fetched_prefetch;
engstat->num_msg_buffer_fetched_write = brt_stat.num_msg_buffer_fetched_write;
}
{
BRT_FLUSHER_STATUS_S brt_flusher_stat;
toku_brt_flusher_get_status(&brt_flusher_stat);
engstat->cleaner_total_nodes = brt_flusher_stat.cleaner_total_nodes;
engstat->cleaner_h1_nodes = brt_flusher_stat.cleaner_h1_nodes;
engstat->cleaner_hgt1_nodes = brt_flusher_stat.cleaner_hgt1_nodes;
engstat->cleaner_empty_nodes = brt_flusher_stat.cleaner_empty_nodes;
engstat->cleaner_nodes_dirtied = brt_flusher_stat.cleaner_nodes_dirtied;
engstat->cleaner_max_buffer_size = brt_flusher_stat.cleaner_max_buffer_size;
engstat->cleaner_min_buffer_size = brt_flusher_stat.cleaner_min_buffer_size;
engstat->cleaner_total_buffer_size = brt_flusher_stat.cleaner_total_buffer_size;
engstat->cleaner_max_buffer_workdone = brt_flusher_stat.cleaner_max_buffer_workdone;
engstat->cleaner_min_buffer_workdone = brt_flusher_stat.cleaner_min_buffer_workdone;
engstat->cleaner_total_buffer_workdone = brt_flusher_stat.cleaner_total_buffer_workdone;
engstat->cleaner_num_dirtied_for_leaf_merge = brt_flusher_stat.cleaner_num_dirtied_for_leaf_merge;
engstat->flush_total = brt_flusher_stat.flush_total;
engstat->flush_in_memory = brt_flusher_stat.flush_in_memory;
engstat->flush_needed_io = brt_flusher_stat.flush_needed_io;
engstat->flush_cascades = brt_flusher_stat.flush_cascades;
engstat->flush_cascades_1 = brt_flusher_stat.flush_cascades_1;
engstat->flush_cascades_2 = brt_flusher_stat.flush_cascades_2;
engstat->flush_cascades_3 = brt_flusher_stat.flush_cascades_3;
engstat->flush_cascades_4 = brt_flusher_stat.flush_cascades_4;
engstat->flush_cascades_5 = brt_flusher_stat.flush_cascades_5;
engstat->flush_cascades_gt_5 = brt_flusher_stat.flush_cascades_gt_5;
engstat->split_leaf = brt_flusher_stat.split_leaf;
engstat->split_nonleaf = brt_flusher_stat.split_nonleaf;
engstat->merge_leaf = brt_flusher_stat.merge_leaf;
engstat->merge_nonleaf = brt_flusher_stat.merge_nonleaf;
engstat->balance_leaf = brt_flusher_stat.balance_leaf;
}
{
BRT_HOT_STATUS_S hot_stat;
toku_brt_hot_get_status(&hot_stat);
engstat->hot_num_started = hot_stat.num_started;
engstat->hot_num_completed = hot_stat.num_completed;
engstat->hot_num_aborted = hot_stat.num_aborted;
engstat->hot_max_root_flush_count = hot_stat.max_root_flush_count;
}
{
u_int64_t fsync_count, fsync_time;
toku_get_fsync_times(&fsync_count, &fsync_time);
......@@ -2373,7 +2388,7 @@ env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
n += snprintf(buff + n, bufsiz - n, "cleaner_max_buffer_workdone %"PRIu64"\n", engstat.cleaner_max_buffer_workdone);
n += snprintf(buff + n, bufsiz - n, "cleaner_min_buffer_workdone %"PRIu64"\n", engstat.cleaner_min_buffer_workdone);
n += snprintf(buff + n, bufsiz - n, "cleaner_total_buffer_workdone %"PRIu64"\n", engstat.cleaner_total_buffer_workdone);
n += snprintf(buff + n, bufsiz - n, "cleaner_num_leaves_unmerged %"PRIu64"\n", engstat.cleaner_num_leaves_unmerged);
n += snprintf(buff + n, bufsiz - n, "cleaner_num_dirtied_for_leaf_merge %"PRIu64"\n", engstat.cleaner_num_dirtied_for_leaf_merge);
n += snprintf(buff + n, bufsiz - n, "flush_total %"PRIu64"\n", engstat.flush_total);
n += snprintf(buff + n, bufsiz - n, "flush_in_memory %"PRIu64"\n", engstat.flush_in_memory);
n += snprintf(buff + n, bufsiz - n, "flush_needed_io %"PRIu64"\n", engstat.flush_needed_io);
......@@ -2399,6 +2414,10 @@ env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
n += snprintf(buff + n, bufsiz - n, "dirty_leaf %"PRIu64"\n", engstat.dirty_leaf);
n += snprintf(buff + n, bufsiz - n, "dirty_nonleaf %"PRIu64"\n", engstat.dirty_nonleaf);
n += snprintf(buff + n, bufsiz - n, "balance_leaf %"PRIu64"\n", engstat.balance_leaf);
n += snprintf(buff + n, bufsiz - n, "hot_num_started %"PRIu64"\n", engstat.hot_num_started);
n += snprintf(buff + n, bufsiz - n, "hot_num_completed %"PRIu64"\n", engstat.hot_num_completed);
n += snprintf(buff + n, bufsiz - n, "hot_num_aborted %"PRIu64"\n", engstat.hot_num_aborted);
n += snprintf(buff + n, bufsiz - n, "hot_max_root_flush_count %"PRIu64"\n", engstat.hot_max_root_flush_count);
n += snprintf(buff + n, bufsiz - n, "msg_bytes_in %"PRIu64"\n", engstat.msg_bytes_in);
n += snprintf(buff + n, bufsiz - n, "msg_bytes_out %"PRIu64"\n", engstat.msg_bytes_out);
n += snprintf(buff + n, bufsiz - n, "msg_bytes_curr %"PRIu64"\n", engstat.msg_bytes_curr);
......@@ -6311,6 +6330,44 @@ toku_db_optimize(DB *db) {
return r;
}
// Run HOT (hot optimize table) on `db`, reporting progress through
// `progress_callback` (may be NULL).  Returns 0 on success, or the error
// from taking the directory read lock / from the optimize itself.
static int
toku_db_hot_optimize(DB *db,
int (*progress_callback)(void *extra, float progress),
void *progress_extra)
{
HANDLE_PANICKED_DB(db);
int r = 0;
// #4356 Take directory read lock around hot optimize to prevent
// race condition of another thread deleting the dictionary during
// the hot optimize. Create a long-lived transaction to hold the
// lock, but the transaction does nothing else so the rollback log
// is tiny and the txnid does not appear in any dictionary.
int using_txns = db->dbenv->i->open_flags & DB_INIT_TXN;
DB_TXN *txn;
if (using_txns) {
// ydb lock is held only while beginning the txn and grabbing the
// directory lock, not for the (long) optimize itself.
toku_ydb_lock();
int rx = toku_txn_begin(db->dbenv, NULL, &txn, DB_TXN_NOSYNC, 1);
invariant_zero(rx);
r = toku_grab_read_lock_on_directory(db, txn);
toku_ydb_unlock();
}
// If we are unable to get a directory read lock, do nothing.
if (r == 0) {
r = toku_brt_hot_optimize(db->i->brt,
progress_callback,
progress_extra);
}
// Commit the lock-holding txn regardless of the optimize result, so
// the directory read lock is always released.
if (using_txns) {
int rx = locked_txn_commit(txn, 0);
invariant_zero(rx);
}
return r;
}
static int
toku_db_flatten(DB *db, DB_TXN *txn) {
HANDLE_PANICKED_DB(db);
......@@ -6328,7 +6385,6 @@ autotxn_db_flatten(DB* db, DB_TXN* txn) {
return toku_db_destruct_autotxn(txn, r, changed);
}
static int
locked_db_flatten(DB *db, DB_TXN *txn) {
toku_ydb_lock(); int r = autotxn_db_flatten(db, txn); toku_ydb_unlock(); return r;
......@@ -6342,6 +6398,15 @@ locked_db_optimize(DB *db) {
return r;
}
// Thin pass-through for the DB vtable: toku_db_hot_optimize takes the
// ydb lock itself where needed, so none is taken here.
static int
locked_db_hot_optimize(DB *db,
                       int (*progress_callback)(void *extra, float progress),
                       void *progress_extra)
{
    return toku_db_hot_optimize(db, progress_callback, progress_extra);
}
static int
db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) {
HANDLE_PANICKED_DB(db);
......@@ -6461,6 +6526,7 @@ toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) {
SDB(getf_set);
SDB(flatten);
SDB(optimize);
SDB(hot_optimize);
SDB(get_fragmentation);
SDB(set_indexer);
SDB(get_indexer);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment