ft-internal.h 52.2 KB
Newer Older
1 2
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 4
#ifndef FT_INTERNAL_H
#define FT_INTERNAL_H
5

Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
6
#ident "$Id$"
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
/*
COPYING CONDITIONS NOTICE:

  This program is free software; you can redistribute it and/or modify
  it under the terms of version 2 of the GNU General Public License as
  published by the Free Software Foundation, and provided that the
  following conditions are met:

      * Redistributions of source code must retain this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below).

      * Redistributions in binary form must reproduce this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below) in the documentation and/or other materials
        provided with the distribution.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  02110-1301, USA.

COPYRIGHT NOTICE:

  TokuDB, Tokutek Fractal Tree Indexing Library.
  Copyright (C) 2007-2013 Tokutek, Inc.

DISCLAIMER:

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

UNIVERSITY PATENT NOTICE:

  The technology is licensed by the Massachusetts Institute of
  Technology, Rutgers State University of New Jersey, and the Research
  Foundation of State University of New York at Stony Brook under
  United States of America Serial No. 11/760379 and to the patents
  and/or patent applications resulting from it.

PATENT MARKING NOTICE:

  This software is covered by US Patent No. 8,185,551.
56
  This software is covered by US Patent No. 8,489,638.
57 58 59

PATENT RIGHTS GRANT:

60
  "THIS IMPLEMENTATION" means the copyrightable works distributed by
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
  Tokutek as part of the Fractal Tree project.

  "PATENT CLAIMS" means the claims of patents that are owned or
  licensable by Tokutek, both currently or in the future; and that in
  the absence of this license would be infringed by THIS
  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.

  "PATENT CHALLENGE" shall mean a challenge to the validity,
  patentability, enforceability and/or non-infringement of any of the
  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.

  Tokutek hereby grants to you, for the term and geographical scope of
  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
  irrevocable (except as stated in this section) patent license to
  make, have made, use, offer to sell, sell, import, transfer, and
  otherwise run, modify, and propagate the contents of THIS
  IMPLEMENTATION, where such license applies only to the PATENT
  CLAIMS.  This grant does not include claims that would be infringed
  only as a consequence of further modifications of THIS
  IMPLEMENTATION.  If you or your agent or licensee institute or order
  or agree to the institution of patent litigation against any entity
  (including a cross-claim or counterclaim in a lawsuit) alleging that
  THIS IMPLEMENTATION constitutes direct or contributory patent
  infringement, or inducement of patent infringement, then any rights
  granted to you under this License shall terminate as of the date
  such litigation is filed.  If you or your agent or exclusive
  licensee institute or order or agree to the institution of a PATENT
  CHALLENGE, then Tokutek may terminate any rights granted to you
  under this License.
*/

92
#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
93
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
94

95
#include "toku_config.h"
96
#include <toku_race_tools.h>
97

98 99 100 101 102
// Symbol TOKUDB_REVISION is not defined by fractal-tree makefiles, so
// BUILD_ID of 1000 indicates development build of main, not a release build.  
#if defined(TOKUDB_REVISION)
#define BUILD_ID TOKUDB_REVISION
#else
103
#error
104 105
#endif

106
#include "ft_layout_version.h"
107
#include "block_allocator.h"
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
108
#include "cachetable.h"
109
#include "fifo.h"
110
#include "ft-ops.h"
111
#include "toku_list.h"
112
#include <util/omt.h>
113
#include "leafentry.h"
Yoni Fogel's avatar
Yoni Fogel committed
114
#include "block_table.h"
115
#include "compress.h"
116 117
#include <util/mempool.h>
#include <util/omt.h>
118
#include "bndata.h"
119

Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
120
enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
121 122 123
enum { FT_CMD_OVERHEAD = (2 + sizeof(MSN)) };   // the type plus freshness plus MSN
enum { FT_DEFAULT_FANOUT = 16 };
enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 };
124
enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };
125

126
//
127
// Field in ftnode_fetch_extra that tells the 
128 129 130
// partial fetch callback what piece of the node
// is needed by the ydb
//
131 132 133 134
enum ftnode_fetch_type {
    ftnode_fetch_none=1, // no partitions needed.  
    ftnode_fetch_subset, // some subset of partitions needed
    ftnode_fetch_prefetch, // this is part of a prefetch call
135 136
    ftnode_fetch_all, // every partition is needed
    ftnode_fetch_keymatch, // one child is needed if it holds both keys
137 138
};

139 140 141 142 143 144 145 146 147 148 149 150 151 152
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU();
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) {
    switch (type) {
        case ftnode_fetch_none:
        case ftnode_fetch_subset:
        case ftnode_fetch_prefetch:
        case ftnode_fetch_all:
        case ftnode_fetch_keymatch:
            return true;
        default:
            return false;
    }
}

153 154 155 156 157 158 159
//
// An extra parameter passed to cachetable functions 
// That is used in all types of fetch callbacks.
// The contents help the partial fetch and fetch
// callbacks retrieve the pieces of a node necessary
// for the ensuing operation (flush, query, ...)
//
160 161
struct ftnode_fetch_extra {
    enum ftnode_fetch_type type;
162
    // needed for reading a node off disk
163 164
    FT h;
    // used in the case where type == ftnode_fetch_subset
165
    // parameters needed to find out which child needs to be decompressed (so it can be read)
166
    ft_search_t* search;
167
    DBT range_lock_left_key, range_lock_right_key;
Yoni Fogel's avatar
Yoni Fogel committed
168
    bool left_is_neg_infty, right_is_pos_infty;
169 170 171 172 173 174 175
    // states if we should try to aggressively fetch basement nodes 
    // that are not specifically needed for current query, 
    // but may be needed for other cursor operations user is doing
    // For example, if we have not disabled prefetching,
    // and the user is doing a dictionary wide scan, then
    // even though a query may only want one basement node,
    // we fetch all basement nodes in a leaf node.
176
    bool disable_prefetching;
177
    // this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
178 179
    // thi callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
    int child_to_read;
180 181
    // when we read internal nodes, we want to read all the data off disk in one I/O
    // then we'll treat it as normal and only decompress the needed partitions etc.
182

183
    bool read_all_partitions;
184 185 186 187 188
    // Accounting: How many bytes were read, and how much time did we spend doing I/O?
    uint64_t bytes_read;
    tokutime_t io_time;
    tokutime_t decompress_time;
    tokutime_t deserialize_time;
189 190
};

191
struct toku_fifo_entry_key_msn_heaviside_extra {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
192
    DESCRIPTOR desc;
193
    ft_compare_func cmp;
194
    FIFO fifo;
195
    const DBT *key;
196 197 198 199
    MSN msn;
};

// comparison function for inserting messages into a
200
// ftnode_nonleaf_childinfo's message_tree
201
int
202
toku_fifo_entry_key_msn_heaviside(const int32_t &v, const struct toku_fifo_entry_key_msn_heaviside_extra &extra);
203 204

struct toku_fifo_entry_key_msn_cmp_extra {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
205
    DESCRIPTOR desc;
206
    ft_compare_func cmp;
207 208 209 210 211
    FIFO fifo;
};

// same thing for qsort_r
int
212
toku_fifo_entry_key_msn_cmp(const struct toku_fifo_entry_key_msn_cmp_extra &extrap, const int &a, const int &b);
213

214 215
typedef toku::omt<int32_t> off_omt_t;
typedef toku::omt<int32_t, int32_t, true> marked_off_omt_t;
216

217 218
// data of an available partition of a nonleaf ftnode
struct ftnode_nonleaf_childinfo {
219
    FIFO buffer;
220
    off_omt_t broadcast_list;
221
    marked_off_omt_t fresh_message_tree;
222
    off_omt_t stale_message_tree;
223
    uint64_t flow[2];  // current and last checkpoint
224 225
};

226 227 228
unsigned int toku_bnc_nbytesinbuf(NONLEAF_CHILDINFO bnc);
int toku_bnc_n_entries(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc);
229
long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
230
void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp);
231
void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
232
void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID parent_oldest_referenced_xid_known);
233
bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull));
234
bool toku_ft_nonleaf_is_gorged(FTNODE node, uint32_t nodesize);
235

236 237
enum reactivity get_nonleaf_reactivity(FTNODE node, unsigned int fanout);
enum reactivity get_node_reactivity(FT ft, FTNODE node);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
238
uint32_t get_leaf_num_entries(FTNODE node);
239

240 241
// data of an available partition of a leaf ftnode
struct ftnode_leaf_basement_node {
242
    bn_data data_buffer;
243 244
    unsigned int seqinsert;         // number of sequential inserts to this leaf 
    MSN max_msn_applied;            // max message sequence number applied
245
    bool stale_ancestor_messages_applied;
246
    STAT64INFO_S stat64_delta;      // change in stat64 counters since basement was last written to disk
247 248
};

249
enum   pt_state {  // declare this to be packed so that when used below it will only take 1 byte.
250 251 252 253
    PT_INVALID = 0,
    PT_ON_DISK = 1,
    PT_COMPRESSED = 2,
    PT_AVAIL = 3};
254

255
enum  ftnode_child_tag {
256 257 258 259 260 261 262
    BCT_INVALID = 0,
    BCT_NULL,
    BCT_SUBBLOCK,
    BCT_LEAF,
    BCT_NONLEAF
};
    
263
typedef struct  ftnode_child_pointer {
264 265
    union {
	struct sub_block *subblock;
266 267
	struct ftnode_nonleaf_childinfo *nonleaf;
	struct ftnode_leaf_basement_node *leaf;
268
    } u;
269
    enum ftnode_child_tag tag;
270
} FTNODE_CHILD_POINTER;
271

Zardosht Kasheff's avatar
Zardosht Kasheff committed
272

273
struct ftnode_disk_data {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
274
    //
275
    // stores the offset to the beginning of the partition on disk from the ftnode, and the length, needed to read a partition off of disk
Zardosht Kasheff's avatar
Zardosht Kasheff committed
276 277 278 279 280
    // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
    //  The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
    //  The SIZE is the size of the compressed partition.
    // Rationale:  We cannot store the size from the beginning of the node since we don't know how big the header will be.
    //  However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
Yoni Fogel's avatar
Yoni Fogel committed
281 282
    uint32_t start;
    uint32_t size;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
283 284 285 286 287
};
#define BP_START(node_dd,i) ((node_dd)[i].start)
#define BP_SIZE(node_dd,i) ((node_dd)[i].size)


288
// a ftnode partition, associated with a child of a node
289
struct ftnode_partition {
290 291 292
    // the following three variables are used for nonleaf nodes
    // for leaf nodes, they are meaningless
    BLOCKNUM     blocknum; // blocknum of child 
293

294 295 296
    // How many bytes worth of work was performed by messages in each buffer.
    uint64_t     workdone;

297 298 299 300 301 302
    //
    // pointer to the partition. Depending on the state, they may be different things
    // if state == PT_INVALID, then the node was just initialized and ptr == NULL
    // if state == PT_ON_DISK, then ptr == NULL
    // if state == PT_COMPRESSED, then ptr points to a struct sub_block*
    // if state == PT_AVAIL, then ptr is:
303 304
    //         a struct ftnode_nonleaf_childinfo for internal nodes, 
    //         a struct ftnode_leaf_basement_node for leaf nodes
305
    //
306
    struct ftnode_child_pointer ptr;
307 308 309 310 311 312 313 314
    //
    // at any time, the partitions may be in one of the following three states (stored in pt_state):
    //   PT_INVALID - means that the partition was just initialized
    //   PT_ON_DISK - means that the partition is not in memory and needs to be read from disk. To use, must read off disk and decompress
    //   PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
    //   PT_AVAIL - means the partition is decompressed and in memory
    //
    enum pt_state state; // make this an enum to make debugging easier.  
Zardosht Kasheff's avatar
Zardosht Kasheff committed
315 316 317

    // clock count used to for pe_callback to determine if a node should be evicted or not
    // for now, saturating the count at 1
Yoni Fogel's avatar
Yoni Fogel committed
318
    uint8_t clock_count;
319
};
320

321
struct ftnode {
322
    MSN      max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
323
    unsigned int flags;
324
    BLOCKNUM thisnodename;   // Which block number is this node?
325
    int    layout_version; // What version of the data structure?
326 327
    int    layout_version_original;	// different (<) from layout_version if upgraded from a previous version (useful for debugging)
    int    layout_version_read_from_disk;  // transient, not serialized to disk, (useful for debugging)
328
    uint32_t build_id;       // build_id (svn rev number) of software that wrote this node to disk
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
329
    int    height; /* height is always >= 0.  0 for leaf, >0 for nonleaf. */
Bradley C. Kuszmaul's avatar
Up  
Bradley C. Kuszmaul committed
330
    int    dirty;
Yoni Fogel's avatar
Yoni Fogel committed
331
    uint32_t fullhash;
332
    int n_children; //for internal nodes, if n_children==fanout+1 then the tree needs to be rebalanced.
333 334
                    // for leaf nodes, represents number of basement nodes
    unsigned int    totalchildkeylens;
335
    DBT *childkeys;   /* Pivot keys.  Child 0's keys are <= childkeys[0].  Child 1's keys are <= childkeys[1].
336
                                                                        Child 1's keys are > childkeys[0]. */
John Esmet's avatar
John Esmet committed
337 338 339 340 341 342 343 344 345

    // What's the oldest referenced xid that this node knows about? The real oldest
    // referenced xid might be younger, but this is our best estimate. We use it
    // as a heuristic to transition provisional mvcc entries from provisional to
    // committed (from implicity committed to really committed).
    //
    // A better heuristic would be the oldest live txnid, but we use this since it
    // still works well most of the time, and its readily available on the inject
    // code path.
346
    TXNID oldest_referenced_xid_known;
John Esmet's avatar
John Esmet committed
347

348
    // array of size n_children, consisting of ftnode partitions
349 350 351
    // each one is associated with a child
    // for internal nodes, the ith partition corresponds to the ith message buffer
    // for leaf nodes, the ith partition corresponds to the ith basement node
352
    struct ftnode_partition *bp;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
353
    PAIR ct_pair;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
354 355
};

356 357
// ftnode partition macros
// BP stands for ftnode_partition
358 359 360 361 362 363
#define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone)

//
// macros for managing a node's clock
364
// Should be managed by ft-ops.c, NOT by serialize/deserialize
365
//
Zardosht Kasheff's avatar
Zardosht Kasheff committed
366 367 368 369 370 371

//
// BP_TOUCH_CLOCK uses a compare and swap because multiple threads
// that have a read lock on an internal node may try to touch the clock
// simultaneously
//
372
#define BP_TOUCH_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
373 374 375
#define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
#define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0)
// not crazy about having these two here, one is for the case where we create new
376
// nodes, such as in splits and creating new roots, and the other is for when
377 378 379 380 381
// we are deserializing a node and not all bp's are touched
#define BP_INIT_TOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
#define BP_INIT_UNTOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 0)

// internal node macros
382
static inline void set_BNULL(FTNODE node, int i) {
383 384
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
385 386
    node->bp[i].ptr.tag = BCT_NULL;
}
387
static inline bool is_BNULL (FTNODE node, int i) {
388 389
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
390 391
    return node->bp[i].ptr.tag == BCT_NULL;
}
392
static inline NONLEAF_CHILDINFO BNC(FTNODE node, int i) {
393 394
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
395
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
396
    paranoid_invariant(p.tag==BCT_NONLEAF);
397 398
    return p.u.nonleaf;
}
399
static inline void set_BNC(FTNODE node, int i, NONLEAF_CHILDINFO nl) {
400 401
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
402
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
403 404 405
    p->tag = BCT_NONLEAF;
    p->u.nonleaf = nl;
}
406

407
static inline BASEMENTNODE BLB(FTNODE node, int i) {
408 409 410 411 412 413 414
    paranoid_invariant(i >= 0);
    // The optimizer really doesn't like it when we compare
    // i to n_children as signed integers. So we assert that
    // n_children is in fact positive before doing a comparison
    // on the values forcibly cast to unsigned ints.
    paranoid_invariant(node->n_children > 0);
    paranoid_invariant((unsigned) i < (unsigned) node->n_children);
415
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
416
    paranoid_invariant(p.tag==BCT_LEAF);
417 418
    return p.u.leaf;
}
419
static inline void set_BLB(FTNODE node, int i, BASEMENTNODE bn) {
420 421
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
422
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
423 424 425 426
    p->tag = BCT_LEAF;
    p->u.leaf = bn;
}

427
static inline SUB_BLOCK BSB(FTNODE node, int i) {
428 429
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
430
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
431
    paranoid_invariant(p.tag==BCT_SUBBLOCK);
432 433
    return p.u.subblock;
}
434
static inline void set_BSB(FTNODE node, int i, SUB_BLOCK sb) {
435 436
    paranoid_invariant(i >= 0);
    paranoid_invariant(i < node->n_children);
437
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
438 439 440 441
    p->tag = BCT_SUBBLOCK;
    p->u.subblock = sb;
}

442
// ftnode leaf basementnode macros, 
443 444
#define BLB_MAX_MSN_APPLIED(node,i) (BLB(node,i)->max_msn_applied)
#define BLB_MAX_DSN_APPLIED(node,i) (BLB(node,i)->max_dsn_applied)
445 446
#define BLB_DATA(node,i) (&(BLB(node,i)->data_buffer))
#define BLB_NBYTESINDATA(node,i) (BLB_DATA(node,i)->get_disk_size())
447 448
#define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert)

449
/* pivot flags  (must fit in 8 bits) */
450
enum {
451 452
    FT_PIVOT_TRUNC = 4,
    FT_PIVOT_FRONT_COMPRESS = 8,
453 454
};

Yoni Fogel's avatar
Yoni Fogel committed
455
uint32_t compute_child_fullhash (CACHEFILE cf, FTNODE node, int childnum);
456

457 458
// The brt_header is not managed by the cachetable.  Instead, it hangs off the cachefile as userdata.

459
enum ft_type {FT_CURRENT=1, FT_CHECKPOINT_INPROGRESS};
460

461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
struct ft_header {
    enum ft_type type;

    int dirty;

    // Free-running counter incremented once per checkpoint (toggling LSB).
    // LSB indicates which header location is used on disk so this
    // counter is effectively a boolean which alternates with each checkpoint.
    uint64_t checkpoint_count;
    // LSN of creation of "checkpoint-begin" record in log.
    LSN checkpoint_lsn;

    // see brt_layout_version.h.  maybe don't need this if we assume
    // it's always the current version after deserializing
    const int layout_version;
    // different (<) from layout_version if upgraded from a previous
    // version (useful for debugging)
    const int layout_version_original;
    // build_id (svn rev number) of software that wrote this node to
    // disk. (read from disk, overwritten when written to disk, I
    // think).
    const uint32_t build_id;
    // build_id of software that created this tree
    const uint32_t build_id_original;

    // time this tree was created
    const uint64_t time_of_creation;
    // and the root transaction id that created it
    TXNID root_xid_that_created;
    // last time this header was serialized to disk (read from disk,
    // overwritten when written to disk)
    uint64_t time_of_last_modification;
    // last time that this tree was verified
    uint64_t time_of_last_verification;

Zardosht Kasheff's avatar
Zardosht Kasheff committed
496
    // this field is essentially a const
497 498 499
    BLOCKNUM root_blocknum;

    const unsigned int flags;
500 501 502 503 504

    //protected by toku_ft_lock
    unsigned int nodesize; 
    unsigned int basementnodesize;
    enum toku_compression_method compression_method;
505
    unsigned int fanout;
506 507 508 509 510

    // Current Minimum MSN to be used when upgrading pre-MSN BRT's.
    // This is decremented from our currnt MIN_MSN so as not to clash
    // with any existing 'normal' MSN's.
    MSN highest_unused_msn_for_upgrade;
511 512 513
    // Largest MSN ever injected into the tree.  Used to set the MSN for
    // messages as they get injected.
    MSN max_msn_in_ft;
514 515 516 517 518 519 520 521 522 523 524 525

    // last time that a hot optimize operation was begun
    uint64_t time_of_last_optimize_begin;
    // last time that a hot optimize operation was successfully completed
    uint64_t time_of_last_optimize_end;
    // the number of hot optimize operations currently in progress on this tree
    uint32_t count_of_optimize_in_progress;
    // the number of hot optimize operations in progress on this tree at the time of the last crash  (this field is in-memory only)
    uint32_t count_of_optimize_in_progress_read_from_disk;
    // all messages before this msn have been applied to leaf nodes
    MSN msn_at_start_of_last_completed_optimize;

526
    STAT64INFO_S on_disk_stats;
527 528
};

529
// brt_header is always the current version.
530
struct ft {
531 532 533 534 535
    FT_HEADER h;
    FT_HEADER checkpoint_header;

    // These are (mostly) read-only.

536
    CACHEFILE cf;
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
    // unique id for dictionary
    DICTIONARY_ID dict_id;
    ft_compare_func compare_fun;
    ft_update_func update_fun;

    // protected by locktree
    DESCRIPTOR_S descriptor;
    // protected by locktree and user. User 
    // makes sure this is only changed
    // when no activity on tree
    DESCRIPTOR_S cmp_descriptor;

    // These are not read-only:

    // protected by blocktable lock
Yoni Fogel's avatar
Yoni Fogel committed
552
    BLOCK_TABLE blocktable;
553 554 555 556 557 558 559 560 561

    // protected by atomic builtins
    STAT64INFO_S in_memory_stats;

    // transient, not serialized to disk.  updated when we do write to
    // disk.  tells us whether we can do partial eviction (we can't if
    // the on-disk layout version is from before basement nodes)
    int layout_version_read_from_disk;

562
    // Logically the reference count is zero if live_ft_handles is empty, txns is 0, and pinned_by_checkpoint is false.
563 564

    // ft_ref_lock protects modifying live_ft_handles, txns, and pinned_by_checkpoint.
565
    toku_mutex_t ft_ref_lock;
566
    struct toku_list live_ft_handles;
567 568 569
    // Number of transactions that are using this FT.  you should only be able
    // to modify this if you have a valid handle in live_ft_handles
    uint32_t num_txns;
570
    // A checkpoint is running.  If true, then keep this header around for checkpoint, like a transaction
571 572
    bool pinned_by_checkpoint;

573 574
    // is this ft a blackhole? if so, all messages are dropped.
    bool blackhole;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
575 576
};

577 578 579 580
// Allocate a DB struct off the stack and only set its comparison
// descriptor. We don't bother setting any other fields because
// the comparison function doesn't need it, and we would like to
// reduce the CPU work done per comparison.
Leif Walsh's avatar
Leif Walsh committed
581
#define FAKE_DB(db, desc) struct __toku_db db; do { db.cmp_descriptor = desc; } while (0)
582

583
struct ft_options {
Bradley C. Kuszmaul's avatar
Up  
Bradley C. Kuszmaul committed
584
    unsigned int nodesize;
585
    unsigned int basementnodesize;
586
    enum toku_compression_method compression_method;
587
    unsigned int fanout;
Bradley C. Kuszmaul's avatar
Up  
Bradley C. Kuszmaul committed
588
    unsigned int flags;
589 590
    ft_compare_func compare_fun;
    ft_update_func update_fun;
591
};
592

593
struct ft_handle {
594 595
    // The fractal tree.
    FT ft;
596 597 598

    on_redirect_callback redirect_callback;
    void *redirect_callback_extra;
599
    struct toku_list live_ft_handle_link;
Yoni Fogel's avatar
Yoni Fogel committed
600
    bool did_set_flags;
601 602

    struct ft_options options;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
603 604
};

605
PAIR_ATTR make_ftnode_pair_attr(FTNODE node);
606
PAIR_ATTR make_invalid_pair_attr(void);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
607

Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
608
/* serialization code */
Zardosht Kasheff's avatar
Zardosht Kasheff committed
609 610
void
toku_create_compressed_partition_from_available(
611
    FTNODE node,
612 613
    int childnum,
    enum toku_compression_method compression_method,
Zardosht Kasheff's avatar
Zardosht Kasheff committed
614 615
    SUB_BLOCK sb
    );
616 617 618
void rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize);
int toku_serialize_ftnode_to_memory (FTNODE node,
                                      FTNODE_DISK_DATA* ndd,
619
                                      unsigned int basementnodesize,
620
                                      enum toku_compression_method compression_method,
Yoni Fogel's avatar
Yoni Fogel committed
621 622
                                      bool do_rebalancing,
                                      bool in_parallel,
623
                              /*out*/ size_t *n_bytes_to_write,
John Esmet's avatar
John Esmet committed
624
                              /*out*/ size_t *n_uncompressed_bytes,
625
                              /*out*/ char  **bytes_to_write);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
626
int toku_serialize_ftnode_to(int fd, BLOCKNUM, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT h, bool for_checkpoint);
627 628 629
int toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
                                    FT h, bool for_checkpoint);
void toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized);
630
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE *logp, FT h);
631
int toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, struct ftnode_fetch_extra* bfe);
632
int toku_deserialize_bp_from_compressed(FTNODE node, int childnum, struct ftnode_fetch_extra *bfe);
Yoni Fogel's avatar
Yoni Fogel committed
633
int toku_deserialize_ftnode_from (int fd, BLOCKNUM off, uint32_t /*fullhash*/, FTNODE *ftnode, FTNODE_DISK_DATA* ndd, struct ftnode_fetch_extra* bfe);
634

635 636
// <CER> For verifying old, non-upgraded nodes (versions 13 and 14).
int
Yoni Fogel's avatar
Yoni Fogel committed
637
decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum);
638 639
// 
    
640 641 642 643 644
//////////////// <CER> TODO: Move these function declarations
int
deserialize_ft_from_fd_into_rbuf(int fd,
                                 toku_off_t offset_of_header,
                                 struct rbuf *rb,
Yoni Fogel's avatar
Yoni Fogel committed
645
                                 uint64_t *checkpoint_count,
646
                                 LSN *checkpoint_lsn,
Yoni Fogel's avatar
Yoni Fogel committed
647
                                 uint32_t * version_p);
648

649
int
650 651
deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version);

652
void read_block_from_fd_into_rbuf(
653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
    int fd, 
    BLOCKNUM blocknum,
    FT h,
    struct rbuf *rb
    );

int
read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb);

int
verify_ftnode_sub_block (struct sub_block *sb);

void
just_decompress_sub_block(struct sub_block *sb);

/* Beginning of ft-node-deserialize.c helper functions. */
669 670 671 672 673 674 675 676 677
void initialize_ftnode(FTNODE node, BLOCKNUM blocknum);
int read_and_check_magic(struct rbuf *rb);
int read_and_check_version(FTNODE node, struct rbuf *rb);
void read_node_info(FTNODE node, struct rbuf *rb, int version);
void allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd);
int check_node_info_checksum(struct rbuf *rb);
void read_legacy_node_info(FTNODE node, struct rbuf *rb, int version);
int check_legacy_end_checksum(struct rbuf *rb);
/* End of ft-node-deserialization.c helper functions. */
678

679
unsigned int toku_serialize_ftnode_size(FTNODE node); /* How much space will it take? */
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
680

681
void toku_verify_or_set_counts(FTNODE);
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
682

683 684 685
size_t toku_serialize_ft_size (FT_HEADER h);
void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFILE cf);
void toku_serialize_ft_to_wbuf (
686 687 688 689 690
    struct wbuf *wbuf, 
    FT_HEADER h, 
    DISKOFF translation_location_on_disk, 
    DISKOFF translation_size_on_disk
    );
691
int toku_deserialize_ft_from (int fd, LSN max_acceptable_lsn, FT *ft);
692
void toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset);
693
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);
694 695
BASEMENTNODE toku_create_empty_bn(void);
BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer.
Zardosht Kasheff's avatar
Zardosht Kasheff committed
696 697
NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo);
BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn);
698
NONLEAF_CHILDINFO toku_create_empty_nl(void);
699
// FIXME needs toku prefix
700
void destroy_basement_node (BASEMENTNODE bn);
701
// FIXME needs toku prefix
702
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl);
703 704 705 706
void toku_destroy_ftnode_internals(FTNODE node);
void toku_ftnode_free (FTNODE *node);
bool is_entire_node_in_memory(FTNODE node);
void toku_assert_entire_node_in_memory(FTNODE node);
707

708
// append a child node to a parent node
709
void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey);
710 711

// append a cmd to a nonleaf node child buffer
712
void toku_ft_append_to_child_buffer(ft_compare_func compare_fun, DESCRIPTOR desc, FTNODE node, int childnum, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, const DBT *key, const DBT *val);
713

714
STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
715

716 717 718 719 720 721 722
//#define SLOW
#ifdef SLOW
#define VERIFY_NODE(t,n) (toku_verify_or_set_counts(n), toku_verify_estimates(t,n))
#else
#define VERIFY_NODE(t,n) ((void)0)
#endif

723
void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
724
void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
725 726
void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
void toku_ft_status_update_deserialize_times(FTNODE node, tokutime_t deserialize_time, tokutime_t decompress_time);
727

Zardosht Kasheff's avatar
Zardosht Kasheff committed
728
void toku_ftnode_clone_callback(void* value_data, void** cloned_value_data, long* clone_size, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);
729 730 731 732
void toku_ftnode_checkpoint_complete_callback(void *value_data);
void toku_ftnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *ftnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, bool write_me, bool keep_me, bool for_checkpoint, bool is_clone);
int toku_ftnode_fetch_callback (CACHEFILE cachefile, PAIR p, int fd, BLOCKNUM nodename, uint32_t fullhash, void **ftnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int*dirty, void*extraargs);
void toku_ftnode_pe_est_callback(void* ftnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
733 734
int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *extraargs,
                            void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra);
735
bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs);
736
int toku_ftnode_pf_callback(void* ftnode_pv, void* UU(disk_data), void* read_extraargs, int fd, PAIR_ATTR* sizep);
737 738
int toku_ftnode_cleaner_callback( void *ftnode_pv, BLOCKNUM blocknum, uint32_t fullhash, void *extraargs);
void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h);
739
BASEMENTNODE toku_detach_bn(FTNODE node, int childnum);
740

Zardosht Kasheff's avatar
Zardosht Kasheff committed
741 742 743 744 745 746
// Given pinned node and pinned child, split child into two
// and update node with information about its new child.
void toku_ft_split_child(
    FT h,
    FTNODE node,
    int childnum,
747 748 749 750 751 752 753 754 755
    FTNODE child,
    enum split_mode split_mode
    );
// Given pinned node, merge childnum with a neighbor and update node with
// information about the change
void toku_ft_merge_child(
    FT ft,
    FTNODE node,
    int childnum
Zardosht Kasheff's avatar
Zardosht Kasheff committed
756
    );
757
static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
758
    CACHETABLE_WRITE_CALLBACK wc;
759 760 761 762 763
    wc.flush_callback = toku_ftnode_flush_callback;
    wc.pe_est_callback = toku_ftnode_pe_est_callback;
    wc.pe_callback = toku_ftnode_pe_callback;
    wc.cleaner_callback = toku_ftnode_cleaner_callback;
    wc.clone_callback = toku_ftnode_clone_callback;
764
    wc.checkpoint_complete_callback = toku_ftnode_checkpoint_complete_callback;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
765 766 767 768
    wc.write_extraargs = h;
    return wc;
}

769
static const FTNODE null_ftnode=0;
Bradley C. Kuszmaul's avatar
Up  
Bradley C. Kuszmaul committed
770

771
/* a brt cursor is represented as a kv pair in a tree */
772
struct ft_cursor {
773
    struct toku_list cursors_link;
774
    FT_HANDLE ft_handle;
775
    DBT key, val;             // The key-value pair that the cursor currently points to
776
    DBT range_lock_left_key, range_lock_right_key;
777
    bool prefetching;
Yoni Fogel's avatar
Yoni Fogel committed
778 779 780 781 782
    bool left_is_neg_infty, right_is_pos_infty;
    bool is_snapshot_read; // true if query is read_committed, false otherwise
    bool is_leaf_mode;
    bool disable_prefetching;
    bool is_temporary;
783
    int out_of_range_error;
784
    int direction;
785
    TOKUTXN ttxn;
786 787
    FT_CHECK_INTERRUPT_CALLBACK interrupt_cb;
    void *interrupt_cb_extra;
788 789
};

Zardosht Kasheff's avatar
Zardosht Kasheff committed
790
//
791
// Helper function to fill a ftnode_fetch_extra with data
Zardosht Kasheff's avatar
Zardosht Kasheff committed
792 793 794 795
// that will tell the fetch callback that the entire node is
// necessary. Used in cases where the entire node
// is required, such as for flushes.
//
796 797
static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) {
    bfe->type = ftnode_fetch_all;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
798 799
    bfe->h = h;
    bfe->search = NULL;
800 801
    toku_init_dbt(&bfe->range_lock_left_key);
    toku_init_dbt(&bfe->range_lock_right_key);
Yoni Fogel's avatar
Yoni Fogel committed
802 803
    bfe->left_is_neg_infty = false;
    bfe->right_is_pos_infty = false;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
804
    bfe->child_to_read = -1;
Yoni Fogel's avatar
Yoni Fogel committed
805
    bfe->disable_prefetching = false;
806
    bfe->read_all_partitions = false;
807
    bfe->bytes_read = 0;
808 809
    bfe->io_time = 0;
    bfe->deserialize_time = 0;
810
    bfe->decompress_time = 0;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
811 812
}

813 814 815 816 817 818 819 820 821 822
//
// Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that an explicit range of children is
// necessary. Used in cases where the portion of the node that is required
// is known in advance, e.g. for keysrange when the left and right key
// are in the same basement node.
//
static inline void fill_bfe_for_keymatch(
    struct ftnode_fetch_extra *bfe,
    FT h,
823 824
    const DBT *left,
    const DBT *right,
825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852
    bool disable_prefetching,
    bool read_all_partitions
    )
{
    paranoid_invariant(h->h->type == FT_CURRENT);
    bfe->type = ftnode_fetch_keymatch;
    bfe->h = h;
    bfe->search = nullptr;
    toku_init_dbt(&bfe->range_lock_left_key);
    toku_init_dbt(&bfe->range_lock_right_key);
    if (left) {
        toku_copyref_dbt(&bfe->range_lock_left_key, *left);
    }

    if (right) {
        toku_copyref_dbt(&bfe->range_lock_right_key, *right);
    }
    bfe->left_is_neg_infty = left == nullptr;
    bfe->right_is_pos_infty = right == nullptr;
    bfe->child_to_read = -1;
    bfe->disable_prefetching = disable_prefetching;
    bfe->read_all_partitions = read_all_partitions;
    bfe->bytes_read = 0;
    bfe->io_time = 0;
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
853
//
854
// Helper function to fill a ftnode_fetch_extra with data
Zardosht Kasheff's avatar
Zardosht Kasheff committed
855 856 857 858 859
// that will tell the fetch callback that some subset of the node
// necessary. Used in cases where some of the node is required
// such as for a point query.
//
static inline void fill_bfe_for_subset_read(
860 861 862
    struct ftnode_fetch_extra *bfe,
    FT h,
    ft_search_t* search,
863 864
    const DBT *left,
    const DBT *right,
Yoni Fogel's avatar
Yoni Fogel committed
865 866
    bool left_is_neg_infty,
    bool right_is_pos_infty,
867 868
    bool disable_prefetching,
    bool read_all_partitions
Zardosht Kasheff's avatar
Zardosht Kasheff committed
869 870
    )
{
871
    paranoid_invariant(h->h->type == FT_CURRENT);
872
    bfe->type = ftnode_fetch_subset;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
873 874
    bfe->h = h;
    bfe->search = search;
875 876 877 878 879 880 881 882
    toku_init_dbt(&bfe->range_lock_left_key);
    toku_init_dbt(&bfe->range_lock_right_key);
    if (left) {
        toku_copyref_dbt(&bfe->range_lock_left_key, *left);
    }
    if (right) {
        toku_copyref_dbt(&bfe->range_lock_right_key, *right);
    }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
883 884 885
    bfe->left_is_neg_infty = left_is_neg_infty;
    bfe->right_is_pos_infty = right_is_pos_infty;
    bfe->child_to_read = -1;
886
    bfe->disable_prefetching = disable_prefetching;
887
    bfe->read_all_partitions = read_all_partitions;
888
    bfe->bytes_read = 0;
889
    bfe->io_time = 0;
890 891
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
892 893 894
}

//
895
// Helper function to fill a ftnode_fetch_extra with data
Zardosht Kasheff's avatar
Zardosht Kasheff committed
896 897 898 899
// that will tell the fetch callback that no partitions are
// necessary, only the pivots and/or subtree estimates.
// Currently used for stat64.
//
900
static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
901
    paranoid_invariant(h->h->type == FT_CURRENT);
902
    bfe->type = ftnode_fetch_none;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
903 904
    bfe->h = h;
    bfe->search = NULL;
905 906
    toku_init_dbt(&bfe->range_lock_left_key);
    toku_init_dbt(&bfe->range_lock_right_key);
Yoni Fogel's avatar
Yoni Fogel committed
907 908
    bfe->left_is_neg_infty = false;
    bfe->right_is_pos_infty = false;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
909
    bfe->child_to_read = -1;
Yoni Fogel's avatar
Yoni Fogel committed
910
    bfe->disable_prefetching = false;
911
    bfe->read_all_partitions = false;
912
    bfe->bytes_read = 0;
913
    bfe->io_time = 0;
914 915
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
916 917
}

918
static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
919
    paranoid_invariant(bfe->type == ftnode_fetch_prefetch);
920 921
    toku_destroy_dbt(&bfe->range_lock_left_key);
    toku_destroy_dbt(&bfe->range_lock_right_key);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
922 923
}

924
// this is in a strange place because it needs the cursor struct to be defined
925 926 927
static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
                                         FT h,
                                         FT_CURSOR c) {
928
    paranoid_invariant(h->h->type == FT_CURRENT);
929
    bfe->type = ftnode_fetch_prefetch;
930 931
    bfe->h = h;
    bfe->search = NULL;
932 933 934 935 936 937 938 939 940
    toku_init_dbt(&bfe->range_lock_left_key);
    toku_init_dbt(&bfe->range_lock_right_key);
    const DBT *left = &c->range_lock_left_key;
    if (left->data) {
        toku_clone_dbt(&bfe->range_lock_left_key, *left);
    }
    const DBT *right = &c->range_lock_right_key;
    if (right->data) {
        toku_clone_dbt(&bfe->range_lock_right_key, *right);
941 942 943 944
    }
    bfe->left_is_neg_infty = c->left_is_neg_infty;
    bfe->right_is_pos_infty = c->right_is_pos_infty;
    bfe->child_to_read = -1;
945
    bfe->disable_prefetching = c->disable_prefetching;
946
    bfe->read_all_partitions = false;
947
    bfe->bytes_read = 0;
948
    bfe->io_time = 0;
949 950
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
951 952
}

953
struct ancestors {
954
    FTNODE   node;     // This is the root node if next is NULL.
955 956
    int       childnum; // which buffer holds messages destined to the node whose ancestors this list represents.
    ANCESTORS next;     // Parent of this node (so next->node.(next->childnum) refers to this node).
957 958
};
struct pivot_bounds {
959 960
    const DBT * const lower_bound_exclusive;
    const DBT * const upper_bound_inclusive; // NULL to indicate negative or positive infinity (which are in practice exclusive since there are now transfinite keys in messages).
961 962
};

963 964
__attribute__((nonnull))
void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
965
void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
Leif Walsh's avatar
Leif Walsh committed
966
__attribute__((nonnull))
967
bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
Leif Walsh's avatar
Leif Walsh committed
968
__attribute__((nonnull))
969
void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
970

971 972 973
__attribute__((const,nonnull))
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);

974
int
975
toku_ft_search_which_child(
Zardosht Kasheff's avatar
Zardosht Kasheff committed
976
    DESCRIPTOR desc,
977 978 979
    ft_compare_func cmp,
    FTNODE node,
    ft_search_t *search
980
    );
981

982
bool
983
toku_bfe_wants_child_available (struct ftnode_fetch_extra* bfe, int childnum);
984

985
int
986
toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node);
987
int
988
toku_bfe_rightmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node);
989

990
// allocate a block number
991 992 993
// allocate and initialize a ftnode
// put the ftnode into the cache table
void toku_create_new_ftnode (FT_HANDLE t, FTNODE *result, int height, int n_children);
994

995 996
// Effect: Fill in N as an empty ftnode.
void toku_initialize_empty_ftnode (FTNODE n, BLOCKNUM nodename, int height, int num_children, 
Zardosht Kasheff's avatar
Zardosht Kasheff committed
997
                                    int layout_version, unsigned int flags);
998

999 1000
int toku_ftnode_which_child(FTNODE node, const DBT *k,
                            DESCRIPTOR desc, ft_compare_func cmp)
1001
    __attribute__((__warn_unused_result__));
1002

Leif Walsh's avatar
Leif Walsh committed
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
/**
 * Finds the next child for HOT to flush to, given that everything up to
 * and including k has been flattened.
 *
 * If k falls between pivots in node, then we return the childnum where k
 * lies.
 *
 * If k is equal to some pivot, then we return the next (to the right)
 * childnum.
 */
1013 1014 1015 1016
int toku_ftnode_hot_next_child(FTNODE node,
                               const DBT *k,
                               DESCRIPTOR desc,
                               ft_compare_func cmp);
Leif Walsh's avatar
Leif Walsh committed
1017

Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
1018
/* Stuff for testing */
1019 1020
// toku_testsetup_initialize() must be called before any other test_setup_xxx() functions are called.
void toku_testsetup_initialize(void);
1021 1022 1023 1024
int toku_testsetup_leaf(FT_HANDLE brt, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens);
int toku_testsetup_nonleaf (FT_HANDLE brt, int height, BLOCKNUM *diskoff, int n_children, BLOCKNUM *children, char **keys, int *keylens);
int toku_testsetup_root(FT_HANDLE brt, BLOCKNUM);
int toku_testsetup_get_sersize(FT_HANDLE brt, BLOCKNUM); // Return the size on disk.
1025 1026
int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM, const char *key, int keylen, const char *val, int vallen);
int toku_testsetup_insert_to_nonleaf (FT_HANDLE brt, BLOCKNUM, enum ft_msg_type, const char *key, int keylen, const char *val, int vallen);
1027 1028 1029
void toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t);

// toku_ft_root_put_cmd() accepts non-constant cmd because this is where we set the msn
1030
void toku_ft_root_put_cmd(FT h, FT_MSG_S * cmd, txn_gc_info *gc_info);
1031

1032 1033 1034
void
toku_get_node_for_verify(
    BLOCKNUM blocknum,
1035 1036
    FT_HANDLE brt,
    FTNODE* nodep
1037
    );
1038

1039
int
1040
toku_verify_ftnode (FT_HANDLE brt,
1041
                    MSN rootmsn, MSN parentmsn, bool messages_exist_above,
1042
                     FTNODE node, int height,
1043 1044
                     const DBT *lesser_pivot,               // Everything in the subtree should be > lesser_pivot.  (lesser_pivot==NULL if there is no lesser pivot.)
                     const DBT *greatereq_pivot,            // Everything in the subtree should be <= lesser_pivot.  (lesser_pivot==NULL if there is no lesser pivot.)
1045 1046
                     int (*progress_callback)(void *extra, float progress), void *progress_extra,
                     int recurse, int verbose, int keep_going_on_failure)
1047
    __attribute__ ((warn_unused_result));
1048

1049
int toku_db_badformat(void) __attribute__((__warn_unused_result__));
1050

1051
typedef enum {
1052 1053 1054
    FT_UPGRADE_FOOTPRINT = 0,
    FT_UPGRADE_STATUS_NUM_ROWS
} ft_upgrade_status_entry;
1055 1056

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
1057
    bool initialized;
1058 1059
    TOKU_ENGINE_STATUS_ROW_S status[FT_UPGRADE_STATUS_NUM_ROWS];
} FT_UPGRADE_STATUS_S, *FT_UPGRADE_STATUS;
1060

1061
void toku_ft_upgrade_get_status(FT_UPGRADE_STATUS);
1062

1063 1064 1065 1066 1067
typedef enum {
    LE_MAX_COMMITTED_XR = 0,
    LE_MAX_PROVISIONAL_XR,
    LE_EXPANDED,
    LE_MAX_MEMSIZE,
1068 1069 1070 1071
    LE_APPLY_GC_BYTES_IN,
    LE_APPLY_GC_BYTES_OUT,
    LE_NORMAL_GC_BYTES_IN,
    LE_NORMAL_GC_BYTES_OUT,
1072 1073 1074 1075
    LE_STATUS_NUM_ROWS
} le_status_entry;

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
1076
    bool initialized;
1077
    TOKU_ENGINE_STATUS_ROW_S status[LE_STATUS_NUM_ROWS];
1078 1079 1080 1081
} LE_STATUS_S, *LE_STATUS;

void toku_le_get_status(LE_STATUS);

1082
typedef enum {
1083 1084 1085 1086 1087 1088 1089 1090
    FT_UPDATES = 0,
    FT_UPDATES_BROADCAST,
    FT_DESCRIPTOR_SET,
    FT_MSN_DISCARDS,                           // how many messages were ignored by leaf because of msn
    FT_TOTAL_RETRIES,                          // total number of search retries due to TRY_AGAIN
    FT_SEARCH_TRIES_GT_HEIGHT,                 // number of searches that required more tries than the height of the tree
    FT_SEARCH_TRIES_GT_HEIGHTPLUS3,            // number of searches that required more tries than the height of the tree plus three
    FT_DISK_FLUSH_LEAF,                        // number of leaf nodes flushed to disk,    not for checkpoint
1091 1092 1093
    FT_DISK_FLUSH_LEAF_BYTES,                  // number of leaf nodes flushed to disk,    not for checkpoint
    FT_DISK_FLUSH_LEAF_UNCOMPRESSED_BYTES,                  // number of leaf nodes flushed to disk,    not for checkpoint
    FT_DISK_FLUSH_LEAF_TOKUTIME,               // number of leaf nodes flushed to disk,    not for checkpoint
1094
    FT_DISK_FLUSH_NONLEAF,                     // number of nonleaf nodes flushed to disk, not for checkpoint
1095 1096 1097
    FT_DISK_FLUSH_NONLEAF_BYTES,               // number of nonleaf nodes flushed to disk, not for checkpoint
    FT_DISK_FLUSH_NONLEAF_UNCOMPRESSED_BYTES,               // number of nonleaf nodes flushed to disk, not for checkpoint
    FT_DISK_FLUSH_NONLEAF_TOKUTIME,            // number of nonleaf nodes flushed to disk, not for checkpoint
1098
    FT_DISK_FLUSH_LEAF_FOR_CHECKPOINT,         // number of leaf nodes flushed to disk for checkpoint
1099 1100 1101
    FT_DISK_FLUSH_LEAF_BYTES_FOR_CHECKPOINT,   // number of leaf nodes flushed to disk for checkpoint
    FT_DISK_FLUSH_LEAF_UNCOMPRESSED_BYTES_FOR_CHECKPOINT,// number of leaf nodes flushed to disk for checkpoint
    FT_DISK_FLUSH_LEAF_TOKUTIME_FOR_CHECKPOINT,// number of leaf nodes flushed to disk for checkpoint
1102
    FT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT,      // number of nonleaf nodes flushed to disk for checkpoint
1103 1104 1105
    FT_DISK_FLUSH_NONLEAF_BYTES_FOR_CHECKPOINT,// number of nonleaf nodes flushed to disk for checkpoint
    FT_DISK_FLUSH_NONLEAF_UNCOMPRESSED_BYTES_FOR_CHECKPOINT,// number of nonleaf nodes flushed to disk for checkpoint
    FT_DISK_FLUSH_NONLEAF_TOKUTIME_FOR_CHECKPOINT,// number of nonleaf nodes flushed to disk for checkpoint
1106 1107
    FT_DISK_FLUSH_LEAF_COMPRESSION_RATIO,      // effective compression ratio for leaf bytes flushed to disk
    FT_DISK_FLUSH_NONLEAF_COMPRESSION_RATIO,   // effective compression ratio for nonleaf bytes flushed to disk
1108
    FT_DISK_FLUSH_OVERALL_COMPRESSION_RATIO,   // effective compression ratio for all bytes flushed to disk
1109 1110 1111 1112 1113 1114 1115 1116
    FT_PARTIAL_EVICTIONS_NONLEAF,              // number of nonleaf node partial evictions
    FT_PARTIAL_EVICTIONS_NONLEAF_BYTES,        // number of nonleaf node partial evictions
    FT_PARTIAL_EVICTIONS_LEAF,                 // number of leaf node partial evictions
    FT_PARTIAL_EVICTIONS_LEAF_BYTES,           // number of leaf node partial evictions
    FT_FULL_EVICTIONS_LEAF,                    // number of full cachetable evictions on leaf nodes
    FT_FULL_EVICTIONS_LEAF_BYTES,              // number of full cachetable evictions on leaf nodes (bytes)
    FT_FULL_EVICTIONS_NONLEAF,                 // number of full cachetable evictions on nonleaf nodes
    FT_FULL_EVICTIONS_NONLEAF_BYTES,           // number of full cachetable evictions on nonleaf nodes (bytes)
1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
    FT_CREATE_LEAF,                            // number of leaf nodes created
    FT_CREATE_NONLEAF,                         // number of nonleaf nodes created
    FT_DESTROY_LEAF,                           // number of leaf nodes destroyed
    FT_DESTROY_NONLEAF,                        // number of nonleaf nodes destroyed
    FT_MSG_BYTES_IN,                           // how many bytes of messages injected at root (for all trees)
    FT_MSG_BYTES_OUT,                          // how many bytes of messages flushed from h1 nodes to leaves
    FT_MSG_BYTES_CURR,                         // how many bytes of messages currently in trees (estimate)
    FT_MSG_NUM,                                // how many messages injected at root
    FT_MSG_NUM_BROADCAST,                      // how many broadcast messages injected at root
    FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL,      // how many basement nodes were decompressed because they were the target of a query
    FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE,  // ... because they were between lc and rc
    FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH,
    FT_NUM_BASEMENTS_DECOMPRESSED_WRITE,
    FT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL,     // how many msg buffers were decompressed because they were the target of a query
    FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, // ... because they were between lc and rc
    FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH,
    FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE,
    FT_NUM_PIVOTS_FETCHED_QUERY,               // how many pivots were fetched for a query
1135
    FT_BYTES_PIVOTS_FETCHED_QUERY,               // how many pivots were fetched for a query
1136
    FT_TOKUTIME_PIVOTS_FETCHED_QUERY,               // how many pivots were fetched for a query
1137
    FT_NUM_PIVOTS_FETCHED_PREFETCH,            // ... for a prefetch
1138
    FT_BYTES_PIVOTS_FETCHED_PREFETCH,            // ... for a prefetch
1139
    FT_TOKUTIME_PIVOTS_FETCHED_PREFETCH,            // ... for a prefetch
1140
    FT_NUM_PIVOTS_FETCHED_WRITE,               // ... for a write
1141
    FT_BYTES_PIVOTS_FETCHED_WRITE,               // ... for a write
1142
    FT_TOKUTIME_PIVOTS_FETCHED_WRITE,               // ... for a write
1143
    FT_NUM_BASEMENTS_FETCHED_NORMAL,           // how many basement nodes were fetched because they were the target of a query
1144
    FT_BYTES_BASEMENTS_FETCHED_NORMAL,           // how many basement nodes were fetched because they were the target of a query
1145
    FT_TOKUTIME_BASEMENTS_FETCHED_NORMAL,           // how many basement nodes were fetched because they were the target of a query
1146
    FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE,       // ... because they were between lc and rc
1147
    FT_BYTES_BASEMENTS_FETCHED_AGGRESSIVE,       // ... because they were between lc and rc
1148
    FT_TOKUTIME_BASEMENTS_FETCHED_AGGRESSIVE,       // ... because they were between lc and rc
1149
    FT_NUM_BASEMENTS_FETCHED_PREFETCH,
1150
    FT_BYTES_BASEMENTS_FETCHED_PREFETCH,
1151
    FT_TOKUTIME_BASEMENTS_FETCHED_PREFETCH,
1152
    FT_NUM_BASEMENTS_FETCHED_WRITE,
1153
    FT_BYTES_BASEMENTS_FETCHED_WRITE,
1154
    FT_TOKUTIME_BASEMENTS_FETCHED_WRITE,
1155
    FT_NUM_MSG_BUFFER_FETCHED_NORMAL,          // how many msg buffers were fetched because they were the target of a query
1156
    FT_BYTES_MSG_BUFFER_FETCHED_NORMAL,          // how many msg buffers were fetched because they were the target of a query
1157
    FT_TOKUTIME_MSG_BUFFER_FETCHED_NORMAL,          // how many msg buffers were fetched because they were the target of a query
1158
    FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE,      // ... because they were between lc and rc
1159
    FT_BYTES_MSG_BUFFER_FETCHED_AGGRESSIVE,      // ... because they were between lc and rc
1160
    FT_TOKUTIME_MSG_BUFFER_FETCHED_AGGRESSIVE,      // ... because they were between lc and rc
1161
    FT_NUM_MSG_BUFFER_FETCHED_PREFETCH,
1162
    FT_BYTES_MSG_BUFFER_FETCHED_PREFETCH,
1163
    FT_TOKUTIME_MSG_BUFFER_FETCHED_PREFETCH,
1164
    FT_NUM_MSG_BUFFER_FETCHED_WRITE,
1165
    FT_BYTES_MSG_BUFFER_FETCHED_WRITE,
1166
    FT_TOKUTIME_MSG_BUFFER_FETCHED_WRITE,
1167 1168 1169 1170 1171 1172 1173 1174
    FT_LEAF_COMPRESS_TOKUTIME, // seconds spent compressing leaf leaf nodes to memory
    FT_LEAF_SERIALIZE_TOKUTIME, // seconds spent serializing leaf node to memory
    FT_LEAF_DECOMPRESS_TOKUTIME, // seconds spent decompressing leaf nodes to memory
    FT_LEAF_DESERIALIZE_TOKUTIME, // seconds spent deserializing leaf nodes to memory
    FT_NONLEAF_COMPRESS_TOKUTIME, // seconds spent compressing nonleaf nodes to memory
    FT_NONLEAF_SERIALIZE_TOKUTIME, // seconds spent serializing nonleaf nodes to memory
    FT_NONLEAF_DECOMPRESS_TOKUTIME, // seconds spent decompressing nonleaf nodes to memory
    FT_NONLEAF_DESERIALIZE_TOKUTIME, // seconds spent deserializing nonleaf nodes to memory
1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
    FT_PRO_NUM_ROOT_SPLIT,
    FT_PRO_NUM_ROOT_H0_INJECT,
    FT_PRO_NUM_ROOT_H1_INJECT,
    FT_PRO_NUM_INJECT_DEPTH_0,
    FT_PRO_NUM_INJECT_DEPTH_1,
    FT_PRO_NUM_INJECT_DEPTH_2,
    FT_PRO_NUM_INJECT_DEPTH_3,
    FT_PRO_NUM_INJECT_DEPTH_GT3,
    FT_PRO_NUM_STOP_NONEMPTY_BUF,
    FT_PRO_NUM_STOP_H1,
    FT_PRO_NUM_STOP_LOCK_CHILD,
    FT_PRO_NUM_STOP_CHILD_INMEM,
    FT_PRO_NUM_DIDNT_WANT_PROMOTE,
1188 1189
    FT_STATUS_NUM_ROWS
} ft_status_entry;
1190 1191 1192

typedef struct {
    bool initialized;
1193 1194
    TOKU_ENGINE_STATUS_ROW_S status[FT_STATUS_NUM_ROWS];
} FT_STATUS_S, *FT_STATUS;
1195

1196
void toku_ft_get_status(FT_STATUS);
1197

1198
void
1199
toku_ft_bn_apply_cmd_once (
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1200
    BASEMENTNODE bn,
1201
    const FT_MSG cmd,
Yoni Fogel's avatar
Yoni Fogel committed
1202
    uint32_t idx,
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1203
    LEAFENTRY le,
1204
    txn_gc_info *gc_info,
1205 1206
    uint64_t *workdonep,
    STAT64INFO stats_to_update
1207
    );
1208

1209
void
1210 1211 1212
toku_ft_bn_apply_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1213
    DESCRIPTOR desc,
1214
    BASEMENTNODE bn,
1215
    FT_MSG cmd,
1216
    txn_gc_info *gc_info,
1217 1218
    uint64_t *workdone,
    STAT64INFO stats_to_update
1219 1220
    );

1221
void
1222 1223 1224
toku_ft_leaf_apply_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
1225
    DESCRIPTOR desc,
1226
    FTNODE node,
1227
    int target_childnum,
1228
    FT_MSG cmd,
1229
    txn_gc_info *gc_info,
1230 1231
    uint64_t *workdone,
    STAT64INFO stats_to_update
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1232
    );
1233

1234
void
1235 1236 1237
toku_ft_node_put_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
Leif Walsh's avatar
Leif Walsh committed
1238
    DESCRIPTOR desc,
1239 1240 1241
    FTNODE node,
    int target_childnum,
    FT_MSG cmd,
1242
    bool is_fresh,
1243
    txn_gc_info *gc_info,
1244
    size_t flow_deltas[],
1245
    STAT64INFO stats_to_update
Leif Walsh's avatar
Leif Walsh committed
1246 1247
    );

Zardosht Kasheff's avatar
Zardosht Kasheff committed
1248 1249
void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);

1250 1251
int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h) __attribute__((nonnull));
int toku_upgrade_msn_from_root_to_header(int fd, FT h) __attribute__((nonnull));
1252

1253
#endif