Commit 2b4da5c0 authored by Leif Walsh's avatar Leif Walsh Committed by Yoni Fogel

[t:3315] merge indexed buffers work into mainline

git-svn-id: file:///svn/toku/tokudb@33979 c7de825b-a66e-492c-adef-691d508d4ae1
parent b246fcf4
...@@ -67,6 +67,7 @@ BRT_SOURCES = \ ...@@ -67,6 +67,7 @@ BRT_SOURCES = \
recover \ recover \
roll \ roll \
rollback \ rollback \
sort \
sub_block \ sub_block \
ule \ ule \
threadpool \ threadpool \
......
...@@ -107,7 +107,8 @@ struct brtnode_fetch_extra { ...@@ -107,7 +107,8 @@ struct brtnode_fetch_extra {
// used in the case where type == brtnode_fetch_subset // used in the case where type == brtnode_fetch_subset
// parameters needed to find out which child needs to be decompressed (so it can be read) // parameters needed to find out which child needs to be decompressed (so it can be read)
brt_search_t* search; brt_search_t* search;
BRT brt; DB *cmp_extra;
brt_compare_func cmp;
DBT *range_lock_left_key, *range_lock_right_key; DBT *range_lock_left_key, *range_lock_right_key;
BOOL left_is_neg_infty, right_is_pos_infty; BOOL left_is_neg_infty, right_is_pos_infty;
// this value will be set during the fetch_callback call by toku_brtnode_fetch_callback or toku_brtnode_pf_req_callback // this value will be set during the fetch_callback call by toku_brtnode_fetch_callback or toku_brtnode_pf_req_callback
...@@ -121,11 +122,12 @@ struct brtnode_fetch_extra { ...@@ -121,11 +122,12 @@ struct brtnode_fetch_extra {
// necessary. Used in cases where the entire node // necessary. Used in cases where the entire node
// is required, such as for flushes. // is required, such as for flushes.
// //
static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struct brt_header *h) { static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp) {
bfe->type = brtnode_fetch_all; bfe->type = brtnode_fetch_all;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;
bfe->brt = NULL; bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = NULL; bfe->range_lock_left_key = NULL;
bfe->range_lock_right_key = NULL; bfe->range_lock_right_key = NULL;
bfe->left_is_neg_infty = FALSE; bfe->left_is_neg_infty = FALSE;
...@@ -133,7 +135,7 @@ static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struc ...@@ -133,7 +135,7 @@ static inline void fill_bfe_for_full_read(struct brtnode_fetch_extra *bfe, struc
bfe->child_to_read = -1; bfe->child_to_read = -1;
} }
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, BRT brt, BRT_CURSOR c); static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp, BRT_CURSOR c);
// //
// Helper function to fill a brtnode_fetch_extra with data // Helper function to fill a brtnode_fetch_extra with data
...@@ -142,20 +144,22 @@ static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct ...@@ -142,20 +144,22 @@ static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct
// such as for a point query. // such as for a point query.
// //
static inline void fill_bfe_for_subset_read( static inline void fill_bfe_for_subset_read(
struct brtnode_fetch_extra *bfe, struct brtnode_fetch_extra *bfe,
struct brt_header *h, struct brt_header *h,
BRT brt, DB *cmp_extra,
brt_compare_func cmp,
brt_search_t* search, brt_search_t* search,
DBT *left, DBT *left,
DBT *right, DBT *right,
BOOL left_is_neg_infty, BOOL left_is_neg_infty,
BOOL right_is_pos_infty BOOL right_is_pos_infty
) )
{ {
bfe->type = brtnode_fetch_subset; bfe->type = brtnode_fetch_subset;
bfe->h = h; bfe->h = h;
bfe->search = search; bfe->search = search;
bfe->brt = brt; bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = (left->data ? left : NULL); bfe->range_lock_left_key = (left->data ? left : NULL);
bfe->range_lock_right_key = (right->data ? right : NULL); bfe->range_lock_right_key = (right->data ? right : NULL);
bfe->left_is_neg_infty = left_is_neg_infty; bfe->left_is_neg_infty = left_is_neg_infty;
...@@ -169,11 +173,12 @@ static inline void fill_bfe_for_subset_read( ...@@ -169,11 +173,12 @@ static inline void fill_bfe_for_subset_read(
// necessary, only the pivots and/or subtree estimates. // necessary, only the pivots and/or subtree estimates.
// Currently used for stat64. // Currently used for stat64.
// //
static inline void fill_bfe_for_min_read(struct brtnode_fetch_extra *bfe, struct brt_header *h) { static inline void fill_bfe_for_min_read(struct brtnode_fetch_extra *bfe, struct brt_header *h, DB *cmp_extra, brt_compare_func cmp) {
bfe->type = brtnode_fetch_none; bfe->type = brtnode_fetch_none;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;
bfe->brt = NULL; bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
bfe->range_lock_left_key = NULL; bfe->range_lock_left_key = NULL;
bfe->range_lock_right_key = NULL; bfe->range_lock_right_key = NULL;
bfe->left_is_neg_infty = FALSE; bfe->left_is_neg_infty = FALSE;
...@@ -197,9 +202,35 @@ static inline void destroy_bfe_for_prefetch(struct brtnode_fetch_extra *bfe) { ...@@ -197,9 +202,35 @@ static inline void destroy_bfe_for_prefetch(struct brtnode_fetch_extra *bfe) {
} }
} }
// Bundle of a comparator (cmp plus the DB handle that travels with it), a
// FIFO, and one (key, keylen, msn) triple.
// NOTE(review): judging by the name, this is what gets passed as the
// `void *extrap` argument of toku_fifo_entry_key_msn_heaviside; the
// function body is not visible here -- confirm before relying on it.
struct toku_fifo_entry_key_msn_heaviside_extra {
DB *cmp_extra;
brt_compare_func cmp;
FIFO fifo;
bytevec key;
ITEMLEN keylen;
MSN msn;
};
// comparison function for inserting messages into a
// brtnode_nonleaf_childinfo's message_tree
// NOTE(review): extrap is presumably a
// struct toku_fifo_entry_key_msn_heaviside_extra * and v an element of the
// tree being searched -- confirm against the definition.
int
toku_fifo_entry_key_msn_heaviside(OMTVALUE v, void *extrap);
// Context handed to toku_fifo_entry_key_msn_cmp as its extrap argument.
// deserialize_child_buffer fills it with the comparator, the comparator's
// DB handle, and the child's FIFO before sorting the message offsets.
struct toku_fifo_entry_key_msn_cmp_extra {
DB *cmp_extra;
brt_compare_func cmp;
FIFO fifo;
};
// same thing for qsort_r
// Takes the extra pointer first, then the two elements being compared.
// deserialize_child_buffer passes this to mergesort_r with a
// struct toku_fifo_entry_key_msn_cmp_extra as extrap; ap and bp then point
// at elements of the offsets array being sorted.
int
toku_fifo_entry_key_msn_cmp(void *extrap, const void *ap, const void *bp);
// data of an available partition of a nonleaf brtnode // data of an available partition of a nonleaf brtnode
struct brtnode_nonleaf_childinfo { struct brtnode_nonleaf_childinfo {
FIFO buffer; FIFO buffer;
// OMT over the entries of `buffer` whose message type satisfies
// brt_msg_type_applies_all or brt_msg_type_does_nothing; on deserialization
// it is built from those entries in the order they were read.
OMT broadcast_buffer;
// OMT over the entries of `buffer` whose message type satisfies
// brt_msg_type_applies_once; on deserialization it is built from those
// entries after sorting them with toku_fifo_entry_key_msn_cmp.
// NOTE(review): the stored values look like offsets into the FIFO written
// by toku_fifo_enq -- confirm against fifo.c.
OMT message_tree;
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */ unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
}; };
...@@ -210,7 +241,6 @@ struct brtnode_leaf_basement_node { ...@@ -210,7 +241,6 @@ struct brtnode_leaf_basement_node {
unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */ unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */
unsigned int seqinsert; /* number of sequential inserts to this leaf */ unsigned int seqinsert; /* number of sequential inserts to this leaf */
MSN max_msn_applied; // max message sequence number applied MSN max_msn_applied; // max message sequence number applied
DSN max_dsn_applied; // max deserialization sequence number applied
}; };
#define PT_INVALID 0 #define PT_INVALID 0
...@@ -277,7 +307,6 @@ struct __attribute__((__packed__)) brtnode_partition { ...@@ -277,7 +307,6 @@ struct __attribute__((__packed__)) brtnode_partition {
struct brtnode { struct brtnode {
MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
DSN dsn; // deserialization sequence number
unsigned int nodesize; unsigned int nodesize;
unsigned int flags; unsigned int flags;
BLOCKNUM thisnodename; // Which block number is this node? BLOCKNUM thisnodename; // Which block number is this node?
...@@ -374,6 +403,8 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) { ...@@ -374,6 +403,8 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) {
// macros for brtnode_nonleaf_childinfo // macros for brtnode_nonleaf_childinfo
#define BNC_BUFFER(node,i) (BNC(node,i)->buffer) #define BNC_BUFFER(node,i) (BNC(node,i)->buffer)
#define BNC_BROADCAST_BUFFER(node,i) (BNC(node,i)->broadcast_buffer)
#define BNC_MESSAGE_TREE(node, i) (BNC(node,i)->message_tree)
#define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer) #define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer)
// brtnode leaf basementnode macros, // brtnode leaf basementnode macros,
...@@ -443,8 +474,6 @@ struct brt_header { ...@@ -443,8 +474,6 @@ struct brt_header {
struct toku_list live_brts; struct toku_list live_brts;
struct toku_list zombie_brts; struct toku_list zombie_brts;
struct toku_list checkpoint_before_commit_link; struct toku_list checkpoint_before_commit_link;
DSN curr_dsn;
}; };
struct brt { struct brt {
...@@ -488,7 +517,7 @@ int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE ...@@ -488,7 +517,7 @@ int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE
BOOL for_checkpoint); BOOL for_checkpoint);
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, struct brt_header *h); int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, struct brt_header *h);
void toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe); void toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe);
void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum); void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum, DB *cmp_extra, brt_compare_func cmp);
int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brtnode_fetch_extra* bfe); int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brtnode_fetch_extra* bfe);
unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */ unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
...@@ -514,7 +543,7 @@ void toku_assert_entire_node_in_memory(BRTNODE node); ...@@ -514,7 +543,7 @@ void toku_assert_entire_node_in_memory(BRTNODE node);
void toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize); void toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize);
// append a cmd to a nonleaf node child buffer // append a cmd to a nonleaf node child buffer
void toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val); void toku_brt_append_to_child_buffer(BRT brt, BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val);
#if 1 #if 1
#define DEADBEEF ((void*)0xDEADBEEF) #define DEADBEEF ((void*)0xDEADBEEF)
...@@ -568,15 +597,20 @@ struct brt_cursor { ...@@ -568,15 +597,20 @@ struct brt_cursor {
}; };
// this is in a strange place because it needs the cursor struct to be defined // this is in a strange place because it needs the cursor struct to be defined
static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe, struct brt_header *h, BRT brt, BRT_CURSOR c) { static inline void fill_bfe_for_prefetch(struct brtnode_fetch_extra *bfe,
struct brt_header *h,
DB *cmp_extra,
brt_compare_func cmp,
BRT_CURSOR c) {
bfe->type = brtnode_fetch_prefetch; bfe->type = brtnode_fetch_prefetch;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;
bfe->brt = brt; bfe->cmp_extra = cmp_extra;
bfe->cmp = cmp;
{ {
const DBT *left = &c->range_lock_left_key; const DBT *left = &c->range_lock_left_key;
const DBT *right = &c->range_lock_right_key; const DBT *right = &c->range_lock_right_key;
if (left->data) { if (left->data) {
MALLOC(bfe->range_lock_left_key); resource_assert(bfe->range_lock_left_key); MALLOC(bfe->range_lock_left_key); resource_assert(bfe->range_lock_left_key);
toku_fill_dbt(bfe->range_lock_left_key, toku_xmemdup(left->data, left->size), left->size); toku_fill_dbt(bfe->range_lock_left_key, toku_xmemdup(left->data, left->size), left->size);
} else { } else {
...@@ -607,12 +641,13 @@ struct pivot_bounds { ...@@ -607,12 +641,13 @@ struct pivot_bounds {
int int
toku_brt_search_which_child( toku_brt_search_which_child(
BRT brt, DB *cmp_extra,
BRTNODE node, brt_compare_func cmp,
BRTNODE node,
brt_search_t *search brt_search_t *search
); );
bool bool
toku_bfe_wants_child_available (struct brtnode_fetch_extra* bfe, int childnum); toku_bfe_wants_child_available (struct brtnode_fetch_extra* bfe, int childnum);
int int
...@@ -645,7 +680,8 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha ...@@ -645,7 +680,8 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
struct brtnode_fetch_extra *bfe, struct brtnode_fetch_extra *bfe,
BRTNODE *node_p); BRTNODE *node_p);
void toku_unpin_brtnode (BRT brt, BRTNODE node); void toku_unpin_brtnode (BRT brt, BRTNODE node);
unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
DB *cmp_extra, brt_compare_func cmp)
__attribute__((__warn_unused_result__)); __attribute__((__warn_unused_result__));
/* Stuff for testing */ /* Stuff for testing */
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "includes.h" #include "includes.h"
#include "sort.h"
#include "toku_atomic.h" #include "toku_atomic.h"
#include "threadpool.h" #include "threadpool.h"
#include <compress.h> #include <compress.h>
...@@ -485,7 +486,6 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) { ...@@ -485,7 +486,6 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) {
// Because all messages above have been applied, setting msn of all new basements // Because all messages above have been applied, setting msn of all new basements
// to max msn of existing basements is correct. (There cannot be any messages in // to max msn of existing basements is correct. (There cannot be any messages in
// buffers above that still need to be applied.) // buffers above that still need to be applied.)
// TODO: assert that all basement DSNs are the same.
static void static void
rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
{ {
...@@ -539,11 +539,8 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) ...@@ -539,11 +539,8 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1); u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1);
MSN max_msn = MIN_MSN; MSN max_msn = MIN_MSN;
DSN min_dsn = MAX_DSN;
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
DSN curr_dsn = BLB_MAX_DSN_APPLIED(node,i);
MSN curr_msn = BLB_MAX_MSN_APPLIED(node,i); MSN curr_msn = BLB_MAX_MSN_APPLIED(node,i);
min_dsn = (curr_dsn.dsn < min_dsn.dsn) ? curr_dsn : min_dsn;
max_msn = (curr_msn.msn > max_msn.msn) ? curr_msn : max_msn; max_msn = (curr_msn.msn > max_msn.msn) ? curr_msn : max_msn;
} }
...@@ -604,7 +601,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) ...@@ -604,7 +601,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
BP_STATE(node,i) = PT_AVAIL; BP_STATE(node,i) = PT_AVAIL;
BP_TOUCH_CLOCK(node,i); BP_TOUCH_CLOCK(node,i);
BLB_MAX_DSN_APPLIED(node,i) = min_dsn;
BLB_MAX_MSN_APPLIED(node,i) = max_msn; BLB_MAX_MSN_APPLIED(node,i) = max_msn;
} }
node->max_msn_applied_to_node_on_disk = max_msn; node->max_msn_applied_to_node_on_disk = max_msn;
...@@ -826,20 +822,46 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h ...@@ -826,20 +822,46 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
} }
static void static void
deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf) { deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf,
DB *cmp_extra, brt_compare_func cmp) {
int r;
int n_bytes_in_buffer = 0; int n_bytes_in_buffer = 0;
int n_in_this_buffer = rbuf_int(rbuf); int n_in_this_buffer = rbuf_int(rbuf);
void **offsets;
void **broadcast_offsets;
int noffsets = 0;
int nbroadcast_offsets = 0;
if (cmp) {
MALLOC_N(n_in_this_buffer, offsets);
MALLOC_N(n_in_this_buffer, broadcast_offsets);
}
for (int i = 0; i < n_in_this_buffer; i++) { for (int i = 0; i < n_in_this_buffer; i++) {
bytevec key; ITEMLEN keylen; bytevec key; ITEMLEN keylen;
bytevec val; ITEMLEN vallen; bytevec val; ITEMLEN vallen;
int type = rbuf_char(rbuf); // this is weird but it's necessary to pass icc and gcc together
unsigned char ctype = rbuf_char(rbuf);
enum brt_msg_type type = (enum brt_msg_type) ctype;
MSN msn = rbuf_msn(rbuf); MSN msn = rbuf_msn(rbuf);
XIDS xids; XIDS xids;
xids_create_from_buffer(rbuf, &xids); xids_create_from_buffer(rbuf, &xids);
rbuf_bytes(rbuf, &key, &keylen); /* Returns a pointer into the rbuf. */ rbuf_bytes(rbuf, &key, &keylen); /* Returns a pointer into the rbuf. */
rbuf_bytes(rbuf, &val, &vallen); rbuf_bytes(rbuf, &val, &vallen);
//printf("Found %s,%s\n", (char*)key, (char*)val); //printf("Found %s,%s\n", (char*)key, (char*)val);
int r = toku_fifo_enq(BNC_BUFFER(node, cnum), key, keylen, val, vallen, type, msn, xids); /* Copies the data into the fifo */ long *dest;
if (cmp) {
if (brt_msg_type_applies_once(type)) {
dest = (long *) &offsets[noffsets];
noffsets++;
} else if (brt_msg_type_applies_all(type) || brt_msg_type_does_nothing(type)) {
dest = (long *) &broadcast_offsets[nbroadcast_offsets];
nbroadcast_offsets++;
} else {
assert(FALSE);
}
} else {
dest = NULL;
}
r = toku_fifo_enq(BNC_BUFFER(node, cnum), key, keylen, val, vallen, type, msn, xids, dest); /* Copies the data into the fifo */
lazy_assert_zero(r); lazy_assert_zero(r);
n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids); n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
//printf("Inserted\n"); //printf("Inserted\n");
...@@ -847,6 +869,17 @@ deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf) { ...@@ -847,6 +869,17 @@ deserialize_child_buffer(BRTNODE node, int cnum, struct rbuf *rbuf) {
} }
invariant(rbuf->ndone == rbuf->size); invariant(rbuf->ndone == rbuf->size);
if (cmp) {
struct toku_fifo_entry_key_msn_cmp_extra extra = { .cmp_extra = cmp_extra, .cmp = cmp, .fifo = BNC_BUFFER(node, cnum) };
r = mergesort_r(offsets, noffsets, sizeof offsets[0], &extra, toku_fifo_entry_key_msn_cmp);
assert_zero(r);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, cnum));
r = toku_omt_create_steal_sorted_array(&BNC_MESSAGE_TREE(node, cnum), &offsets, noffsets, n_in_this_buffer);
assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, cnum));
r = toku_omt_create_steal_sorted_array(&BNC_BROADCAST_BUFFER(node, cnum), &broadcast_offsets, nbroadcast_offsets, n_in_this_buffer);
assert_zero(r);
}
BNC_NBYTESINBUF(node, cnum) = n_bytes_in_buffer; BNC_NBYTESINBUF(node, cnum) = n_bytes_in_buffer;
BP_WORKDONE(node, cnum) = 0; BP_WORKDONE(node, cnum) = 0;
} }
...@@ -897,7 +930,6 @@ BASEMENTNODE toku_create_empty_bn(void) { ...@@ -897,7 +930,6 @@ BASEMENTNODE toku_create_empty_bn(void) {
BASEMENTNODE toku_create_empty_bn_no_buffer(void) { BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
BASEMENTNODE XMALLOC(bn); BASEMENTNODE XMALLOC(bn);
bn->max_dsn_applied = MIN_DSN;
bn->max_msn_applied.msn = 0; bn->max_msn_applied.msn = 0;
bn->buffer = NULL; bn->buffer = NULL;
bn->n_bytes_in_buffer = 0; bn->n_bytes_in_buffer = 0;
...@@ -910,7 +942,11 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) { ...@@ -910,7 +942,11 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) {
NONLEAF_CHILDINFO XMALLOC(cn); NONLEAF_CHILDINFO XMALLOC(cn);
cn->n_bytes_in_buffer = 0; cn->n_bytes_in_buffer = 0;
int r = toku_fifo_create(&cn->buffer); int r = toku_fifo_create(&cn->buffer);
assert(r==0); assert_zero(r);
r = toku_omt_create(&cn->message_tree);
assert_zero(r);
r = toku_omt_create(&cn->broadcast_buffer);
assert_zero(r);
return cn; return cn;
} }
...@@ -926,6 +962,8 @@ void destroy_basement_node (BASEMENTNODE bn) ...@@ -926,6 +962,8 @@ void destroy_basement_node (BASEMENTNODE bn)
// Tears down a nonleaf child's in-memory state: frees the FIFO buffer,
// destroys the message_tree and broadcast_buffer OMTs, and finally frees
// the childinfo struct itself. nl must not be used afterwards.
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl) void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl)
{ {
toku_fifo_free(&nl->buffer); toku_fifo_free(&nl->buffer);
toku_omt_destroy(&nl->message_tree);
toku_omt_destroy(&nl->broadcast_buffer);
toku_free(nl); toku_free(nl);
} }
...@@ -1023,8 +1061,6 @@ deserialize_brtnode_info( ...@@ -1023,8 +1061,6 @@ deserialize_brtnode_info(
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size); rbuf_init(&rb, sb->uncompressed_ptr, data_size);
node->dsn = INVALID_DSN;
node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb); node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb);
node->nodesize = rbuf_int(&rb); node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb); node->flags = rbuf_int(&rb);
...@@ -1087,7 +1123,6 @@ setup_available_brtnode_partition(BRTNODE node, int i) { ...@@ -1087,7 +1123,6 @@ setup_available_brtnode_partition(BRTNODE node, int i) {
if (node->height == 0) { if (node->height == 0) {
set_BLB(node, i, toku_create_empty_bn()); set_BLB(node, i, toku_create_empty_bn());
BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk; BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk;
BLB_MAX_DSN_APPLIED(node,i).dsn = 0;
} }
else { else {
set_BNC(node, i, toku_create_empty_nl()); set_BNC(node, i, toku_create_empty_nl());
...@@ -1102,10 +1137,11 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) { ...@@ -1102,10 +1137,11 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) {
// we can possibly require is a single basement node // we can possibly require is a single basement node
// we find out what basement node the query cares about // we find out what basement node the query cares about
// and check if it is available // and check if it is available
assert(bfe->brt); assert(bfe->cmp);
assert(bfe->search); assert(bfe->search);
bfe->child_to_read = toku_brt_search_which_child( bfe->child_to_read = toku_brt_search_which_child(
bfe->brt, bfe->cmp_extra,
bfe->cmp,
node, node,
bfe->search bfe->search
); );
...@@ -1142,31 +1178,32 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) { ...@@ -1142,31 +1178,32 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) {
} }
} }
static void static void
deserialize_brtnode_partition( deserialize_brtnode_partition(
struct sub_block *sb, struct sub_block *sb,
BRTNODE node, BRTNODE node,
int index int index,
DB *cmp_extra,
brt_compare_func cmp
) )
{ {
verify_brtnode_sub_block(sb); verify_brtnode_sub_block(sb);
u_int32_t data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end u_int32_t data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
// now with the data verified, we can read the information into the node // now with the data verified, we can read the information into the node
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size); rbuf_init(&rb, sb->uncompressed_ptr, data_size);
u_int32_t start_of_data; u_int32_t start_of_data;
if (node->height > 0) { if (node->height > 0) {
unsigned char ch = rbuf_char(&rb); unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_FIFO_MSG); assert(ch == BRTNODE_PARTITION_FIFO_MSG);
deserialize_child_buffer(node, index, &rb); deserialize_child_buffer(node, index, &rb, cmp_extra, cmp);
} }
else { else {
unsigned char ch = rbuf_char(&rb); unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_OMT_LEAVES); assert(ch == BRTNODE_PARTITION_OMT_LEAVES);
BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb); BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb);
// dont need to set max_dsn_applied because creation of basement node set it to correct value
BLB_SEQINSERT(node, index) = 0; BLB_SEQINSERT(node, index) = 0;
u_int32_t num_entries = rbuf_int(&rb); u_int32_t num_entries = rbuf_int(&rb);
OMTVALUE *XMALLOC_N(num_entries, array); OMTVALUE *XMALLOC_N(num_entries, array);
...@@ -1191,11 +1228,11 @@ deserialize_brtnode_partition( ...@@ -1191,11 +1228,11 @@ deserialize_brtnode_partition(
} }
// Decompresses one sub block and deserializes it into partition `child` of
// `node`, then frees the uncompressed buffer. curr_rbuf and curr_sb are
// received by value, so the caller's copies are not modified by this call.
// cmp_extra and cmp are forwarded unchanged to deserialize_brtnode_partition.
// deserialize_brtnode_from_rbuf launches this with cilk_spawn, once per
// partition whose state is PT_AVAIL.
static void static void
decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, BRTNODE node, int child) decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, BRTNODE node, int child, DB *cmp_extra, brt_compare_func cmp)
{ {
read_and_decompress_sub_block(&curr_rbuf, &curr_sb); read_and_decompress_sub_block(&curr_rbuf, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition // at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, child); deserialize_brtnode_partition(&curr_sb, node, child, cmp_extra, cmp);
toku_free(curr_sb.uncompressed_ptr); toku_free(curr_sb.uncompressed_ptr);
} }
...@@ -1306,7 +1343,7 @@ deserialize_brtnode_from_rbuf( ...@@ -1306,7 +1343,7 @@ deserialize_brtnode_from_rbuf(
// deserialize_brtnode_info figures out what the state // deserialize_brtnode_info figures out what the state
// should be and sets up the memory so that we are ready to use it // should be and sets up the memory so that we are ready to use it
if (BP_STATE(node,i) == PT_AVAIL) { if (BP_STATE(node,i) == PT_AVAIL) {
cilk_spawn decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i); cilk_spawn decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i, bfe->cmp_extra, bfe->cmp);
} }
// case where we leave the partition in the compressed state // case where we leave the partition in the compressed state
else if (BP_STATE(node,i) == PT_COMPRESSED) { else if (BP_STATE(node,i) == PT_COMPRESSED) {
...@@ -1358,13 +1395,13 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode ...@@ -1358,13 +1395,13 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset); ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset);
lazy_assert((DISKOFF)rlen == curr_size); lazy_assert((DISKOFF)rlen == curr_size);
} }
struct sub_block curr_sb; struct sub_block curr_sb;
sub_block_init(&curr_sb); sub_block_init(&curr_sb);
read_and_decompress_sub_block(&rb, &curr_sb); read_and_decompress_sub_block(&rb, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition // at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, childnum); deserialize_brtnode_partition(&curr_sb, node, childnum, bfe->cmp_extra, bfe->cmp);
if (node->height == 0) { if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum); toku_brt_bn_reset_stats(node, childnum);
} }
...@@ -1374,13 +1411,14 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode ...@@ -1374,13 +1411,14 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
// Take a brtnode partition that is in the compressed state, and make it avail // Take a brtnode partition that is in the compressed state, and make it avail
void void
toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) { toku_deserialize_bp_from_compressed(BRTNODE node, int childnum,
DB *cmp_extra, brt_compare_func cmp) {
assert(BP_STATE(node, childnum) == PT_COMPRESSED); assert(BP_STATE(node, childnum) == PT_COMPRESSED);
SUB_BLOCK curr_sb = BSB(node, childnum); SUB_BLOCK curr_sb = BSB(node, childnum);
assert(curr_sb->uncompressed_ptr == NULL); assert(curr_sb->uncompressed_ptr == NULL);
curr_sb->uncompressed_ptr = toku_xmalloc(curr_sb->uncompressed_size); curr_sb->uncompressed_ptr = toku_xmalloc(curr_sb->uncompressed_size);
setup_available_brtnode_partition(node, childnum); setup_available_brtnode_partition(node, childnum);
BP_STATE(node,childnum) = PT_AVAIL; BP_STATE(node,childnum) = PT_AVAIL;
// decompress the sub_block // decompress the sub_block
...@@ -1390,7 +1428,7 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) { ...@@ -1390,7 +1428,7 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) {
curr_sb->compressed_ptr, curr_sb->compressed_ptr,
curr_sb->compressed_size curr_sb->compressed_size
); );
deserialize_brtnode_partition(curr_sb, node, childnum); deserialize_brtnode_partition(curr_sb, node, childnum, cmp_extra, cmp);
if (node->height == 0) { if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum); toku_brt_bn_reset_stats(node, childnum);
} }
...@@ -1784,7 +1822,6 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) { ...@@ -1784,7 +1822,6 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
h->dirty=0; h->dirty=0;
h->panic = 0; h->panic = 0;
h->panic_string = 0; h->panic_string = 0;
h->curr_dsn.dsn = MIN_DSN.dsn+1;
toku_list_init(&h->live_brts); toku_list_init(&h->live_brts);
toku_list_init(&h->zombie_brts); toku_list_init(&h->zombie_brts);
toku_list_init(&h->checkpoint_before_commit_link); toku_list_init(&h->checkpoint_before_commit_link);
......
...@@ -78,18 +78,18 @@ int toku_testsetup_get_sersize(BRT brt, BLOCKNUM diskoff) // Return the size on ...@@ -78,18 +78,18 @@ int toku_testsetup_get_sersize(BRT brt, BLOCKNUM diskoff) // Return the size on
assert(testsetup_initialized); assert(testsetup_initialized);
void *node_v; void *node_v;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin( int r = toku_cachetable_get_and_pin(
brt->cf, diskoff, brt->cf, diskoff,
toku_cachetable_hash(brt->cf, diskoff), toku_cachetable_hash(brt->cf, diskoff),
&node_v, &node_v,
NULL, NULL,
toku_brtnode_flush_callback, toku_brtnode_flush_callback,
toku_brtnode_fetch_callback, toku_brtnode_fetch_callback,
toku_brtnode_pe_callback, toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback, toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback, toku_brtnode_pf_callback,
&bfe, &bfe,
brt->h brt->h
); );
assert(r==0); assert(r==0);
...@@ -103,21 +103,21 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke ...@@ -103,21 +103,21 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke
int r; int r;
assert(testsetup_initialized); assert(testsetup_initialized);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
r = toku_cachetable_get_and_pin( r = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
blocknum, blocknum,
toku_cachetable_hash(brt->cf, blocknum), toku_cachetable_hash(brt->cf, blocknum),
&node_v, &node_v,
NULL, NULL,
toku_brtnode_flush_callback, toku_brtnode_flush_callback,
toku_brtnode_fetch_callback, toku_brtnode_fetch_callback,
toku_brtnode_pe_callback, toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback, toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback, toku_brtnode_pf_callback,
&bfe, &bfe,
brt->h brt->h
); );
if (r!=0) return r; if (r!=0) return r;
...@@ -176,19 +176,19 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t ...@@ -176,19 +176,19 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t
assert(testsetup_initialized); assert(testsetup_initialized);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
r = toku_cachetable_get_and_pin( r = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
blocknum, blocknum,
toku_cachetable_hash(brt->cf, blocknum), toku_cachetable_hash(brt->cf, blocknum),
&node_v, &node_v,
NULL, NULL,
toku_brtnode_flush_callback, toku_brtnode_flush_callback,
toku_brtnode_fetch_callback, toku_brtnode_fetch_callback,
toku_brtnode_pe_callback, toku_brtnode_pe_callback,
toku_brtnode_pf_req_callback, toku_brtnode_pf_req_callback,
toku_brtnode_pf_callback, toku_brtnode_pf_callback,
&bfe, &bfe,
brt->h brt->h
); );
if (r!=0) return r; if (r!=0) return r;
...@@ -197,12 +197,12 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t ...@@ -197,12 +197,12 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t
DBT k; DBT k;
int childnum = toku_brtnode_which_child(node, int childnum = toku_brtnode_which_child(node,
toku_fill_dbt(&k, key, keylen), toku_fill_dbt(&k, key, keylen),
brt); brt->db, brt->compare_fun);
XIDS xids_0 = xids_get_root_xids(); XIDS xids_0 = xids_get_root_xids();
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0); r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0, NULL);
assert(r==0); assert(r==0);
// Hack to get the test working. The problem is that this test // Hack to get the test working. The problem is that this test
// is directly queueing something in a FIFO instead of // is directly queueing something in a FIFO instead of
......
...@@ -114,7 +114,7 @@ toku_verify_brtnode (BRT brt, ...@@ -114,7 +114,7 @@ toku_verify_brtnode (BRT brt,
u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum);
{ {
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin( int r = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
blocknum, blocknum,
......
...@@ -109,6 +109,7 @@ Split_or_merge (node, childnum) { ...@@ -109,6 +109,7 @@ Split_or_merge (node, childnum) {
#include "roll.h" #include "roll.h"
#include "toku_atomic.h" #include "toku_atomic.h"
#include "sub_block.h" #include "sub_block.h"
#include "sort.h"
#if defined(HAVE_CILK) #if defined(HAVE_CILK)
#include <cilk/cilk.h> #include <cilk/cilk.h>
...@@ -153,16 +154,6 @@ toku_assert_entire_node_in_memory(BRTNODE node) { ...@@ -153,16 +154,6 @@ toku_assert_entire_node_in_memory(BRTNODE node) {
} }
} }
//
// Stamp `node` with the header's current DSN, then advance the header's
// DSN counter by one.
// MUST be called with the ydb lock held
//
static void
set_new_DSN_for_node(BRTNODE node, BRT t) {
    struct brt_header *hdr = t->h;
    // The header's counter is expected to already be above MIN_DSN.
    assert(hdr->curr_dsn.dsn > MIN_DSN.dsn);
    node->dsn = hdr->curr_dsn;
    hdr->curr_dsn.dsn += 1;
}
static u_int32_t static u_int32_t
get_leaf_num_entries(BRTNODE node) { get_leaf_num_entries(BRTNODE node) {
u_int32_t result = 0; u_int32_t result = 0;
...@@ -275,8 +266,8 @@ static long brtnode_memory_size (BRTNODE node); ...@@ -275,8 +266,8 @@ static long brtnode_memory_size (BRTNODE node);
// //
// The intent of toku_pin_brtnode(_holding_lock) is to abstract the process of retrieving a node from // The intent of toku_pin_brtnode(_holding_lock) is to abstract the process of retrieving a node from
// the rest of brt.c, so that there is only one place where we need to worry about setting // the rest of brt.c, so that there is only one place where we need to worry about applying ancestor
// the DSN and applying ancestor messages to a leaf node. The idea is for all of brt.c (search, splits, merges, flushes, etc) // messages to a leaf node. The idea is for all of brt.c (search, splits, merges, flushes, etc)
// to access a node via toku_pin_brtnode(_holding_lock) // to access a node via toku_pin_brtnode(_holding_lock)
// //
int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
...@@ -301,9 +292,6 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, ...@@ -301,9 +292,6 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
unlockers); unlockers);
if (r==0) { if (r==0) {
BRTNODE node = node_v; BRTNODE node = node_v;
if (node->dsn.dsn == INVALID_DSN.dsn) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node; *node_p = node;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b); // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
...@@ -336,9 +324,6 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha ...@@ -336,9 +324,6 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
); );
assert(r==0); assert(r==0);
BRTNODE node = node_v; BRTNODE node = node_v;
if (node->dsn.dsn == INVALID_DSN.dsn) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node; *node_p = node;
} }
...@@ -427,19 +412,19 @@ fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL d ...@@ -427,19 +412,19 @@ fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL d
estimates.exact = TRUE; estimates.exact = TRUE;
int i; int i;
for (i=0; i<child->n_children; i++) { for (i=0; i<child->n_children; i++) {
SUBTREE_EST child_se = &BP_SUBTREE_EST(child,i); SUBTREE_EST child_se = &BP_SUBTREE_EST(child,i);
estimates.nkeys += child_se->nkeys; estimates.nkeys += child_se->nkeys;
estimates.ndata += child_se->ndata; estimates.ndata += child_se->ndata;
estimates.dsize += child_se->dsize; estimates.dsize += child_se->dsize;
if (!child_se->exact) estimates.exact = FALSE; if (!child_se->exact) estimates.exact = FALSE;
if (child->height>0) { if (child->height>0) {
if (BP_STATE(child,i) != PT_AVAIL || if (BP_STATE(child,i) != PT_AVAIL ||
toku_fifo_n_entries(BNC_BUFFER(child,i))!=0) toku_fifo_n_entries(BNC_BUFFER(child,i))!=0)
{ {
estimates.exact=FALSE; estimates.exact=FALSE;
} }
} }
} }
// We only call this function if we have reason to believe that the child changed. // We only call this function if we have reason to believe that the child changed.
BP_SUBTREE_EST(node,childnum_of_node) = estimates; BP_SUBTREE_EST(node,childnum_of_node) = estimates;
if (dirty_it) { if (dirty_it) {
...@@ -483,7 +468,7 @@ toku_verify_estimates (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bo ...@@ -483,7 +468,7 @@ toku_verify_estimates (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bo
u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum);
BRTNODE childnode; BRTNODE childnode;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h); fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, childblocknum, fullhash, &next_ancestors, &next_bounds, &bfe, &childnode); toku_pin_brtnode_holding_lock(t, childblocknum, fullhash, &next_ancestors, &next_bounds, &bfe, &childnode);
for (int i=0; i<childnode->n_children; i++) { for (int i=0; i<childnode->n_children; i++) {
child_estimate += BP_SUBTREE_EST(childnode, i).ndata; child_estimate += BP_SUBTREE_EST(childnode, i).ndata;
...@@ -531,6 +516,8 @@ brtnode_memory_size (BRTNODE node) ...@@ -531,6 +516,8 @@ brtnode_memory_size (BRTNODE node)
NONLEAF_CHILDINFO childinfo = BNC(node, i); NONLEAF_CHILDINFO childinfo = BNC(node, i);
retval += sizeof(*childinfo); retval += sizeof(*childinfo);
retval += toku_fifo_memory_size(BNC_BUFFER(node, i)); retval += toku_fifo_memory_size(BNC_BUFFER(node, i));
retval += toku_omt_memory_size(BNC_BROADCAST_BUFFER(node, i));
retval += toku_omt_memory_size(BNC_MESSAGE_TREE(node, i));
} }
else { else {
BASEMENTNODE bn = BLB(node, i); BASEMENTNODE bn = BLB(node, i);
...@@ -584,7 +571,7 @@ toku_bfe_leftmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node) ...@@ -584,7 +571,7 @@ toku_bfe_leftmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node)
} else if (bfe->range_lock_left_key == NULL) { } else if (bfe->range_lock_left_key == NULL) {
return -1; return -1;
} else { } else {
return toku_brtnode_which_child(node, bfe->range_lock_left_key, bfe->brt); return toku_brtnode_which_child(node, bfe->range_lock_left_key, bfe->cmp_extra, bfe->cmp);
} }
} }
...@@ -597,7 +584,7 @@ toku_bfe_rightmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node) ...@@ -597,7 +584,7 @@ toku_bfe_rightmost_child_wanted(struct brtnode_fetch_extra *bfe, BRTNODE node)
} else if (bfe->range_lock_right_key == NULL) { } else if (bfe->range_lock_right_key == NULL) {
return -1; return -1;
} else { } else {
return toku_brtnode_which_child(node, bfe->range_lock_right_key, bfe->brt); return toku_brtnode_which_child(node, bfe->range_lock_right_key, bfe->cmp_extra, bfe->cmp);
} }
} }
...@@ -609,7 +596,7 @@ brt_cursor_rightmost_child_wanted(BRT_CURSOR cursor, BRT brt, BRTNODE node) ...@@ -609,7 +596,7 @@ brt_cursor_rightmost_child_wanted(BRT_CURSOR cursor, BRT brt, BRTNODE node)
} else if (cursor->range_lock_right_key.data == NULL) { } else if (cursor->range_lock_right_key.data == NULL) {
return -1; return -1;
} else { } else {
return toku_brtnode_which_child(node, &cursor->range_lock_right_key, brt); return toku_brtnode_which_child(node, &cursor->range_lock_right_key, brt->db, brt->compare_fun);
} }
} }
...@@ -792,10 +779,11 @@ BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs) { ...@@ -792,10 +779,11 @@ BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs) {
// we can possibly require is a single basement node // we can possibly require is a single basement node
// we find out what basement node the query cares about // we find out what basement node the query cares about
// and check if it is available // and check if it is available
assert(bfe->brt); assert(bfe->cmp);
assert(bfe->search); assert(bfe->search);
bfe->child_to_read = toku_brt_search_which_child( bfe->child_to_read = toku_brt_search_which_child(
bfe->brt, bfe->cmp_extra,
bfe->cmp,
node, node,
bfe->search bfe->search
); );
...@@ -847,7 +835,7 @@ int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, lon ...@@ -847,7 +835,7 @@ int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, lon
} }
if ((lc <= i && i <= rc) || toku_bfe_wants_child_available(bfe, i)) { if ((lc <= i && i <= rc) || toku_bfe_wants_child_available(bfe, i)) {
if (BP_STATE(node,i) == PT_COMPRESSED) { if (BP_STATE(node,i) == PT_COMPRESSED) {
cilk_spawn toku_deserialize_bp_from_compressed(node, i); cilk_spawn toku_deserialize_bp_from_compressed(node, i, bfe->cmp_extra, bfe->cmp);
} }
else if (BP_STATE(node,i) == PT_ON_DISK) { else if (BP_STATE(node,i) == PT_ON_DISK) {
cilk_spawn toku_deserialize_bp_from_disk(node, i, fd, bfe); cilk_spawn toku_deserialize_bp_from_disk(node, i, fd, bfe);
...@@ -897,17 +885,17 @@ toku_cmd_leafval_heaviside (OMTVALUE lev, void *extra) { ...@@ -897,17 +885,17 @@ toku_cmd_leafval_heaviside (OMTVALUE lev, void *extra) {
} }
static int static int
brt_compare_pivot(BRT brt, const DBT *key, bytevec ck) brt_compare_pivot(DB *cmp_extra, brt_compare_func cmp, const DBT *key, bytevec ck)
__attribute__((__warn_unused_result__)); __attribute__((__warn_unused_result__));
static int static int
brt_compare_pivot(BRT brt, const DBT *key, bytevec ck) brt_compare_pivot(DB *cmp_extra, brt_compare_func cmp, const DBT *key, bytevec ck)
{ {
int cmp; int r;
DBT mydbt; DBT mydbt;
struct kv_pair *kv = (struct kv_pair *) ck; struct kv_pair *kv = (struct kv_pair *) ck;
cmp = brt->compare_fun(brt->db, key, toku_fill_dbt(&mydbt, kv_pair_key(kv), kv_pair_keylen(kv))); r = cmp(cmp_extra, key, toku_fill_dbt(&mydbt, kv_pair_key(kv), kv_pair_keylen(kv)));
return cmp; return r;
} }
// destroys the internals of the brtnode, but it does not free the values // destroys the internals of the brtnode, but it does not free the values
...@@ -1025,7 +1013,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num ...@@ -1025,7 +1013,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
assert(height >= 0); assert(height >= 0);
n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others
n->dsn = INVALID_DSN; // the owner of the node should take responsibility for properly setting this
n->nodesize = nodesize; n->nodesize = nodesize;
n->flags = flags; n->flags = flags;
n->thisnodename = nodename; n->thisnodename = nodename;
...@@ -1089,12 +1076,6 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r ...@@ -1089,12 +1076,6 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
invariant(msna.msn == msnb.msn); invariant(msna.msn == msnb.msn);
newroot->max_msn_applied_to_node_on_disk = msna; newroot->max_msn_applied_to_node_on_disk = msna;
} }
{
DSN dsna = nodea->dsn;
DSN dsnb = nodeb->dsn;
invariant(dsna.dsn == dsnb.dsn);
newroot->dsn = dsna;
}
BP_STATE(newroot,0) = PT_AVAIL; BP_STATE(newroot,0) = PT_AVAIL;
BP_STATE(newroot,1) = PT_AVAIL; BP_STATE(newroot,1) = PT_AVAIL;
newroot->dirty = 1; newroot->dirty = 1;
...@@ -1121,7 +1102,6 @@ toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) { ...@@ -1121,7 +1102,6 @@ toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) {
BRTNODE XMALLOC(n); BRTNODE XMALLOC(n);
toku_initialize_empty_brtnode(n, name, height, n_children, t->h->layout_version, t->h->nodesize, t->flags); toku_initialize_empty_brtnode(n, name, height, n_children, t->h->layout_version, t->h->nodesize, t->flags);
assert(n->nodesize > 0); assert(n->nodesize > 0);
set_new_DSN_for_node(n, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, n->thisnodename); u_int32_t fullhash = toku_cachetable_hash(t->cf, n->thisnodename);
n->fullhash = fullhash; n->fullhash = fullhash;
...@@ -1268,7 +1248,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1268,7 +1248,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
// Effect: Split a leaf node. // Effect: Split a leaf node.
{ {
BRTNODE B; BRTNODE B;
DSN dsn = node->dsn;
//printf("%s:%d splitting leaf %" PRIu64 " which is size %u (targetsize = %u)\n", __FILE__, __LINE__, node->thisnodename.b, toku_serialize_brtnode_size(node), node->nodesize); //printf("%s:%d splitting leaf %" PRIu64 " which is size %u (targetsize = %u)\n", __FILE__, __LINE__, node->thisnodename.b, toku_serialize_brtnode_size(node), node->nodesize);
assert(node->height==0); assert(node->height==0);
...@@ -1354,8 +1333,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1354,8 +1333,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
); );
BLB_NBYTESINBUF(node, split_node) -= diff_size; BLB_NBYTESINBUF(node, split_node) -= diff_size;
BLB_NBYTESINBUF(B, 0) += diff_size; BLB_NBYTESINBUF(B, 0) += diff_size;
BLB_MAX_DSN_APPLIED(B,0) = BLB_MAX_DSN_APPLIED(node, split_node);
BLB_MAX_MSN_APPLIED(B,0) = BLB_MAX_MSN_APPLIED(node, split_node);
subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff); subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff);
add_estimates(&BP_SUBTREE_EST(B,0), &se_diff); add_estimates(&BP_SUBTREE_EST(B,0), &se_diff);
...@@ -1399,9 +1376,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1399,9 +1376,6 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
node->max_msn_applied_to_node_on_disk= max_msn_applied_to_node; node->max_msn_applied_to_node_on_disk= max_msn_applied_to_node;
B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node; B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1; node->dirty = 1;
B->dirty = 1; B->dirty = 1;
...@@ -1430,7 +1404,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1430,7 +1404,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
int n_children_in_a = old_n_children/2; int n_children_in_a = old_n_children/2;
int n_children_in_b = old_n_children-n_children_in_a; int n_children_in_b = old_n_children-n_children_in_a;
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk; MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
DSN dsn = node->dsn;
BRTNODE B; BRTNODE B;
assert(node->height>0); assert(node->height>0);
assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
...@@ -1481,9 +1454,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1481,9 +1454,6 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
node->max_msn_applied_to_node_on_disk = max_msn_applied_to_node; node->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node; B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1; node->dirty = 1;
B ->dirty = 1; B ->dirty = 1;
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
...@@ -1603,7 +1573,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react, ANCESTORS a ...@@ -1603,7 +1573,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react, ANCESTORS a
struct ancestors next_ancestors = {node, childnum, ancestors}; struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h); fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, toku_pin_brtnode_holding_lock(t,
BP_BLOCKNUM(node, childnum), BP_BLOCKNUM(node, childnum),
compute_child_fullhash(t->cf, node, childnum), compute_child_fullhash(t->cf, node, childnum),
...@@ -1892,13 +1862,6 @@ brt_leaf_put_cmd ( ...@@ -1892,13 +1862,6 @@ brt_leaf_put_cmd (
LEAFENTRY storeddata; LEAFENTRY storeddata;
OMTVALUE storeddatav=NULL; OMTVALUE storeddatav=NULL;
if (cmd->msn.msn <= bn->max_msn_applied.msn) {
brt_status.msn_discards++;
return;
}
else {
bn->max_msn_applied = cmd->msn;
}
u_int32_t omt_size; u_int32_t omt_size;
int r; int r;
...@@ -2094,34 +2057,87 @@ brt_leaf_put_cmd ( ...@@ -2094,34 +2057,87 @@ brt_leaf_put_cmd (
return; return;
} }
// Compare two (key, MSN) pairs.
// Keys are compared first with key_cmp(key_cmp_extra, a, b); a nonzero result
// is returned unchanged.  When the keys compare equal, the MSNs break the tie:
// -1 if amsn < bmsn, +1 if amsn > bmsn, 0 if they are equal.
static inline int
key_msn_cmp(const DBT *a, const DBT *b, const MSN amsn, const MSN bmsn,
            DB *key_cmp_extra, brt_compare_func key_cmp)
{
    const int key_order = key_cmp(key_cmp_extra, a, b);
    if (key_order != 0) {
        return key_order;
    }
    if (amsn.msn < bmsn.msn) {
        return -1;
    }
    return (amsn.msn > bmsn.msn) ? 1 : 0;
}
// Heaviside-style callback over OMT values that hold offsets into a FIFO.
//   v      - the stored value, interpreted as a long offset into extra->fifo
//   extrap - a struct toku_fifo_entry_key_msn_heaviside_extra carrying the
//            FIFO, the target key/keylen/msn, and the key comparator
// Returns key_msn_cmp() of the stored entry's (key, MSN) against the target's.
int
toku_fifo_entry_key_msn_heaviside(OMTVALUE v, void *extrap)
{
    const struct toku_fifo_entry_key_msn_heaviside_extra *const search = extrap;
    const struct fifo_entry *const entry = toku_fifo_get_entry(search->fifo, (long) v);

    DBT entry_dbt;
    DBT wanted_dbt;
    const DBT *const entry_key = fill_dbt_for_fifo_entry(&entry_dbt, entry);
    const DBT *const wanted_key = toku_fill_dbt(&wanted_dbt, search->key, search->keylen);

    return key_msn_cmp(entry_key, wanted_key,
                       entry->msn, search->msn,
                       search->cmp_extra, search->cmp);
}
// Three-argument comparator over FIFO offsets.
//   extrap - a struct toku_fifo_entry_key_msn_cmp_extra carrying the FIFO and
//            the key comparator
//   ap, bp - each points to a long holding an offset into extra->fifo
// Looks up both entries and returns key_msn_cmp() of their (key, MSN) pairs.
int
toku_fifo_entry_key_msn_cmp(void *extrap, const void *ap, const void *bp)
{
    const struct toku_fifo_entry_key_msn_cmp_extra *const ctx = extrap;
    const long *const lhs_offset = ap;
    const long *const rhs_offset = bp;

    const struct fifo_entry *const lhs = toku_fifo_get_entry(ctx->fifo, *lhs_offset);
    const struct fifo_entry *const rhs = toku_fifo_get_entry(ctx->fifo, *rhs_offset);

    DBT lhs_dbt;
    DBT rhs_dbt;
    const DBT *const lhs_key = fill_dbt_for_fifo_entry(&lhs_dbt, lhs);
    const DBT *const rhs_key = fill_dbt_for_fifo_entry(&rhs_dbt, rhs);

    return key_msn_cmp(lhs_key, rhs_key, lhs->msn, rhs->msn,
                       ctx->cmp_extra, ctx->cmp);
}
// append a cmd to a nonleaf node's child buffer // append a cmd to a nonleaf node's child buffer
// should be static, but used by test programs // should be static, but used by test programs
void void
toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) { toku_brt_append_to_child_buffer(BRT brt, BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) {
assert(BP_STATE(node,childnum) == PT_AVAIL); assert(BP_STATE(node,childnum) == PT_AVAIL);
int diff = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids); int diff = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
int r = toku_fifo_enq(BNC_BUFFER(node,childnum), key->data, key->size, val->data, val->size, type, msn, xids); long offset;
int r = toku_fifo_enq(BNC_BUFFER(node, childnum), key->data, key->size, val->data, val->size, type, msn, xids, &offset);
assert_zero(r); assert_zero(r);
enum brt_msg_type etype = (enum brt_msg_type) type;
if (brt_msg_type_applies_once(etype)) {
struct toku_fifo_entry_key_msn_heaviside_extra extra = { .cmp_extra = brt->db, .cmp = brt->compare_fun, .fifo = BNC_BUFFER(node, childnum), .key = key->data, .keylen = key->size, .msn = msn };
r = toku_omt_insert(BNC_MESSAGE_TREE(node, childnum), (OMTVALUE) offset, toku_fifo_entry_key_msn_heaviside, &extra, NULL);
assert_zero(r);
} else if (brt_msg_type_applies_all(etype) || brt_msg_type_does_nothing(etype)) {
u_int32_t idx = toku_omt_size(BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_insert_at(BNC_BROADCAST_BUFFER(node, childnum), (OMTVALUE) offset, idx);
assert_zero(r);
} else {
assert(FALSE);
}
BNC_NBYTESINBUF(node, childnum) += diff; BNC_NBYTESINBUF(node, childnum) += diff;
node->dirty = 1; node->dirty = 1;
} }
static void brt_nonleaf_cmd_once_to_child (BRTNODE node, unsigned int childnum, BRT_MSG cmd) static void brt_nonleaf_cmd_once_to_child (BRT brt, BRTNODE node, unsigned int childnum, BRT_MSG cmd)
// Previously we had passive aggressive promotion, but that causes a lot of I/O at the checkpoint. So now we are just putting it in the buffer here. // Previously we had passive aggressive promotion, but that causes a lot of I/O at the checkpoint. So now we are just putting it in the buffer here.
// Also we don't worry about the node getting overfull here. It's the caller's problem. // Also we don't worry about the node getting overfull here. It's the caller's problem.
{ {
toku_brt_append_to_child_buffer(node, childnum, cmd->type, cmd->msn, cmd->xids, cmd->u.id.key, cmd->u.id.val); toku_brt_append_to_child_buffer(brt, node, childnum, cmd->type, cmd->msn, cmd->xids, cmd->u.id.key, cmd->u.id.val);
} }
/* find the leftmost child that may contain the key */ /* find the leftmost child that may contain the key */
unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) { unsigned int toku_brtnode_which_child(BRTNODE node, const DBT *k,
DB *cmp_extra, brt_compare_func cmp) {
#define DO_PIVOT_SEARCH_LR 0 #define DO_PIVOT_SEARCH_LR 0
#if DO_PIVOT_SEARCH_LR #if DO_PIVOT_SEARCH_LR
int i; int i;
for (i=0; i<node->n_children-1; i++) { for (i=0; i<node->n_children-1; i++) {
int cmp = brt_compare_pivot(t, k, d, node->childkeys[i]); int c = brt_compare_pivot(cmp_extra, cmp, k, d, node->childkeys[i]);
if (cmp > 0) continue; if (c > 0) continue;
if (cmp < 0) return i; if (c < 0) return i;
return i; return i;
} }
return node->n_children-1; return node->n_children-1;
...@@ -2133,8 +2149,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) { ...@@ -2133,8 +2149,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
// random keys // random keys
int i; int i;
for (i = node->n_children-2; i >= 0; i--) { for (i = node->n_children-2; i >= 0; i--) {
int cmp = brt_compare_pivot(t, k, d, node->childkeys[i]); int c = brt_compare_pivot(cmp_extra, cmp, k, d, node->childkeys[i]);
if (cmp > 0) return i+1; if (c > 0) return i+1;
} }
return 0; return 0;
#endif #endif
...@@ -2145,8 +2161,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) { ...@@ -2145,8 +2161,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
// check the last key to optimize seq insertions // check the last key to optimize seq insertions
int n = node->n_children-1; int n = node->n_children-1;
int cmp = brt_compare_pivot(t, k, node->childkeys[n-1]); int c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[n-1]);
if (cmp > 0) return n; if (c > 0) return n;
// binary search the pivots // binary search the pivots
int lo = 0; int lo = 0;
...@@ -2154,12 +2170,12 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) { ...@@ -2154,12 +2170,12 @@ unsigned int toku_brtnode_which_child (BRTNODE node , const DBT *k, BRT t) {
int mi; int mi;
while (lo < hi) { while (lo < hi) {
mi = (lo + hi) / 2; mi = (lo + hi) / 2;
cmp = brt_compare_pivot(t, k, node->childkeys[mi]); c = brt_compare_pivot(cmp_extra, cmp, k, node->childkeys[mi]);
if (cmp > 0) { if (c > 0) {
lo = mi+1; lo = mi+1;
continue; continue;
} }
if (cmp < 0) { if (c < 0) {
hi = mi; hi = mi;
continue; continue;
} }
...@@ -2177,87 +2193,39 @@ static void brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2177,87 +2193,39 @@ static void brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_MSG cmd)
/* find the right subtree */ /* find the right subtree */
//TODO: accesses key, val directly //TODO: accesses key, val directly
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t->db, t->compare_fun);
brt_nonleaf_cmd_once_to_child (node, childnum, cmd); brt_nonleaf_cmd_once_to_child (t, node, childnum, cmd);
} }
static void static void
brt_nonleaf_cmd_all (BRTNODE node, BRT_MSG cmd) brt_nonleaf_cmd_all (BRT t, BRTNODE node, BRT_MSG cmd)
// Effect: Put the cmd into a nonleaf node. We put it into all children, possibly causing the children to become reactive. // Effect: Put the cmd into a nonleaf node. We put it into all children, possibly causing the children to become reactive.
// We don't do the splitting and merging. That's up to the caller after doing all the puts it wants to do. // We don't do the splitting and merging. That's up to the caller after doing all the puts it wants to do.
// The re_array[i] gets set to the reactivity of any modified child i. (And there may be several such children.) // The re_array[i] gets set to the reactivity of any modified child i. (And there may be several such children.)
{ {
int i; int i;
for (i = 0; i < node->n_children; i++) { for (i = 0; i < node->n_children; i++) {
brt_nonleaf_cmd_once_to_child(node, i, cmd); brt_nonleaf_cmd_once_to_child(t, node, i, cmd);
} }
} }
static BOOL static BOOL
brt_msg_applies_once(BRT_MSG cmd) brt_msg_applies_once(BRT_MSG cmd)
{ {
BOOL ret_val; return brt_msg_type_applies_once(cmd->type);
//TODO: Accessing type directly
switch (cmd->type) {
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = TRUE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
case BRT_NONE:
ret_val = FALSE;
break;
default:
assert(FALSE);
}
return ret_val;
} }
static BOOL static BOOL
brt_msg_applies_all(BRT_MSG cmd) brt_msg_applies_all(BRT_MSG cmd)
{ {
BOOL ret_val; return brt_msg_type_applies_all(cmd->type);
//TODO: Accessing type directly
switch (cmd->type) {
case BRT_NONE:
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
case BRT_UPDATE:
ret_val = FALSE;
break;
case BRT_COMMIT_BROADCAST_ALL:
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL:
ret_val = TRUE;
break;
default:
assert(FALSE);
}
return ret_val;
} }
static BOOL static BOOL
brt_msg_does_nothing(BRT_MSG cmd) brt_msg_does_nothing(BRT_MSG cmd)
{ {
return (cmd->type == BRT_NONE); return brt_msg_type_does_nothing(cmd->type);
} }
static void static void
...@@ -2287,7 +2255,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2287,7 +2255,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
case BRT_OPTIMIZE: case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE: case BRT_OPTIMIZE_FOR_UPGRADE:
case BRT_UPDATE_BROADCAST_ALL: case BRT_UPDATE_BROADCAST_ALL:
brt_nonleaf_cmd_all (node, cmd); // send message to all children brt_nonleaf_cmd_all (t, node, cmd); // send message to all children
return; return;
case BRT_NONE: case BRT_NONE:
return; return;
...@@ -2469,7 +2437,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2469,7 +2437,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes. // splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes.
{ {
MSN msn_max; MSN msn_max;
DSN dsn_max;
assert(a->height == b->height); assert(a->height == b->height);
toku_assert_entire_node_in_memory(parent); toku_assert_entire_node_in_memory(parent);
toku_assert_entire_node_in_memory(a); toku_assert_entire_node_in_memory(a);
...@@ -2483,7 +2450,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2483,7 +2450,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
invariant(msn_max.msn <= parent->max_msn_applied_to_node_on_disk.msn); // parent msn must be >= children's msn invariant(msn_max.msn <= parent->max_msn_applied_to_node_on_disk.msn); // parent msn must be >= children's msn
} }
} }
dsn_max = (a->dsn.dsn > b->dsn.dsn) ? a->dsn : b->dsn; // this value is ignored for leafnodes, only basement dsn is use for leafnodes
if (a->height == 0) { if (a->height == 0) {
maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk); maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk);
} else { } else {
...@@ -2494,8 +2460,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2494,8 +2460,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// accurate for non-leaf nodes because buffer immediately above each node has been flushed // accurate for non-leaf nodes because buffer immediately above each node has been flushed
a->max_msn_applied_to_node_on_disk = msn_max; a->max_msn_applied_to_node_on_disk = msn_max;
b->max_msn_applied_to_node_on_disk = msn_max; b->max_msn_applied_to_node_on_disk = msn_max;
a->dsn = dsn_max;
b->dsn = dsn_max;
} }
} }
...@@ -2540,7 +2504,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2540,7 +2504,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
struct ancestors next_ancestors = {node, childnuma, ancestors}; struct ancestors next_ancestors = {node, childnuma, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnuma, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, childnuma, bounds);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h); fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnuma), childfullhash, &next_ancestors, &next_bounds, &bfe, &childa); toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnuma), childfullhash, &next_ancestors, &next_bounds, &bfe, &childa);
} }
{ {
...@@ -2548,7 +2512,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2548,7 +2512,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
struct ancestors next_ancestors = {node, childnumb, ancestors}; struct ancestors next_ancestors = {node, childnumb, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnumb, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, childnumb, bounds);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h); fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnumb), childfullhash, &next_ancestors, &next_bounds, &bfe, &childb); toku_pin_brtnode_holding_lock(t, BP_BLOCKNUM(node, childnumb), childfullhash, &next_ancestors, &next_bounds, &bfe, &childb);
} }
...@@ -2698,14 +2662,11 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive ...@@ -2698,14 +2662,11 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive
static void assert_leaf_up_to_date(BRTNODE node) { static void assert_leaf_up_to_date(BRTNODE node) {
assert(node->height == 0); assert(node->height == 0);
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
for (int i=0; i < node->n_children; i++) {
assert(BLB_MAX_DSN_APPLIED(node, i).dsn >= MIN_DSN.dsn);
}
} }
static void static void
flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, BOOL is_first_flush, BOOL flush_recursively, flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, BOOL is_first_flush, BOOL flush_recursively,
ANCESTORS ancestors, struct pivot_bounds const * const bounds) ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Effect: Push everything in the CHILDNUMth buffer of node down into the child. // Effect: Push everything in the CHILDNUMth buffer of node down into the child.
// The child may split or merge as a result of the activity. // The child may split or merge as a result of the activity.
// The IS_FIRST_FLUSH variable is a way to prevent the flushing from walking the entire tree. If IS_FIRST_FLUSH==TRUE then we are allowed to flush more than one child, otherwise // The IS_FIRST_FLUSH variable is a way to prevent the flushing from walking the entire tree. If IS_FIRST_FLUSH==TRUE then we are allowed to flush more than one child, otherwise
...@@ -2721,7 +2682,7 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, ...@@ -2721,7 +2682,7 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum);
BRTNODE child; BRTNODE child;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, t->h); fill_bfe_for_full_read(&bfe, t->h, t->db, t->compare_fun);
toku_pin_brtnode_holding_lock(t, targetchild, childfullhash, &next_ancestors, &next_bounds, &bfe, &child); // get that child node in, and apply the ancestor messages if it's a leaf. toku_pin_brtnode_holding_lock(t, targetchild, childfullhash, &next_ancestors, &next_bounds, &bfe, &child); // get that child node in, and apply the ancestor messages if it's a leaf.
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
...@@ -2729,90 +2690,99 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, ...@@ -2729,90 +2690,99 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
VERIFY_NODE(t, child); VERIFY_NODE(t, child);
FIFO fifo = BNC_BUFFER(node,childnum); FIFO fifo = BNC_BUFFER(node,childnum);
int r;
if (child->height==0) { if (child->height==0) {
// The child is a leaf node. // The child is a leaf node.
assert_leaf_up_to_date(child); // The child has all the messages applied to it. assert_leaf_up_to_date(child); // The child has all the messages applied to it.
// We've arranged that the path from the root to this child is empty, except for the childnum fifo in node. // We've arranged that the path from the root to this child is empty, except for the childnum fifo in node.
// We must empty the fifo, and arrange for the child to be written to disk, and then mark it as clean and up-to-date. // We must empty the fifo, and arrange for the child to be written to disk, and then mark it as clean and up-to-date.
bytevec key, val; bytevec key, val;
ITEMLEN keylen, vallen; ITEMLEN keylen, vallen;
u_int32_t type; u_int32_t type;
MSN msn; MSN msn;
XIDS xids; XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) { while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids)); int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
int r = toku_fifo_deq(fifo); r = toku_fifo_deq(fifo);
assert(r==0); assert(r==0);
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed; BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
} }
toku_fifo_size_is_stabilized(fifo); toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0); invariant(BNC_NBYTESINBUF(node, childnum) == 0);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
node->dirty=TRUE; toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
child->dirty=TRUE; r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
fixup_child_estimates(node, childnum, child, TRUE); BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child); node->dirty=TRUE;
child->dirty=TRUE;
fixup_child_estimates(node, childnum, child, TRUE);
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
} else { } else {
bytevec key,val; bytevec key,val;
ITEMLEN keylen, vallen; ITEMLEN keylen, vallen;
//printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum)); //printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum));
assert(toku_fifo_n_entries(fifo)>0); assert(toku_fifo_n_entries(fifo)>0);
u_int32_t type; u_int32_t type;
MSN msn; MSN msn;
XIDS xids; XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) { while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
DBT hk,hv; DBT hk,hv;
//TODO: Factor out (into a function) conversion of fifo_entry to message //TODO: Factor out (into a function) conversion of fifo_entry to message
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id= {toku_fill_dbt(&hk, key, keylen), BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id= {toku_fill_dbt(&hk, key, keylen),
toku_fill_dbt(&hv, val, vallen)} }; toku_fill_dbt(&hv, val, vallen)} };
int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids)); int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
//printf("%s:%d random_picked\n", __FILE__, __LINE__); //printf("%s:%d random_picked\n", __FILE__, __LINE__);
brtnode_put_cmd (t, child, &brtcmd); brtnode_put_cmd (t, child, &brtcmd);
//printf("%s:%d %d=push_a_brt_cmd_down=(); child_did_split=%d (weight=%d)\n", __FILE__, __LINE__, r, child_did_split, BNC_NBYTESINBUF(node, childnum)); //printf("%s:%d %d=push_a_brt_cmd_down=(); child_did_split=%d (weight=%d)\n", __FILE__, __LINE__, r, child_did_split, BNC_NBYTESINBUF(node, childnum));
{ {
int r = toku_fifo_deq(fifo); r = toku_fifo_deq(fifo);
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r); //printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
assert(r==0); assert(r==0);
} }
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed; BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
node->dirty = 1; node->dirty = 1;
} }
toku_fifo_size_is_stabilized(fifo); toku_fifo_size_is_stabilized(fifo);
invariant(BNC_NBYTESINBUF(node, childnum) == 0); invariant(BNC_NBYTESINBUF(node, childnum) == 0);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__); toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
// Having pushed all that stuff to a child, do we need to flush the child? We may have to flush it many times if there were lots of messages that just got pushed down. BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
// If we were to only flush one child, we could possibly end up with a very big node after a while.
// This repeated flushing can cause some inserts to take a long time (possibly walking all over the tree). if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__);
// When we get the background flushing working, it may be OK if that happens, but for now, we just flush a little.
if (flush_recursively) { // Having pushed all that stuff to a child, do we need to flush the child? We may have to flush it many times if there were lots of messages that just got pushed down.
int n_flushed = 0; // If we were to only flush one child, we could possibly end up with a very big node after a while.
while (nonleaf_node_is_gorged(child) && (is_first_flush || n_flushed==0)) { // This repeated flushing can cause some inserts to take a long time (possibly walking all over the tree).
// don't do more than one child unless this is the first flush. // When we get the background flushing working, it may be OK if that happens, but for now, we just flush a little.
flush_some_child(t, child, is_first_flush && n_flushed==0, flush_recursively, if (flush_recursively) {
&next_ancestors, &next_bounds); int n_flushed = 0;
n_flushed++; while (nonleaf_node_is_gorged(child) && (is_first_flush || n_flushed==0)) {
} // don't do more than one child unless this is the first flush.
} flush_some_child(t, child, is_first_flush && n_flushed==0, flush_recursively,
fixup_child_estimates(node, childnum, child, TRUE); &next_ancestors, &next_bounds);
// Now it's possible that the child needs to be merged or split. n_flushed++;
*child_re = get_node_reactivity(child); }
toku_unpin_brtnode(t, child); }
fixup_child_estimates(node, childnum, child, TRUE);
// Now it's possible that the child needs to be merged or split.
*child_re = get_node_reactivity(child);
toku_unpin_brtnode(t, child);
} }
} }
...@@ -2863,6 +2833,10 @@ flush_this_height1_child (BRT t, BRTNODE node, int childnum, BRTNODE child) ...@@ -2863,6 +2833,10 @@ flush_this_height1_child (BRT t, BRTNODE node, int childnum, BRTNODE child)
} }
invariant(BNC_NBYTESINBUF(node, childnum) == 0); invariant(BNC_NBYTESINBUF(node, childnum) == 0);
toku_omt_destroy(&BNC_MESSAGE_TREE(node, childnum));
r = toku_omt_create(&BNC_MESSAGE_TREE(node, childnum)); resource_assert_zero(r);
toku_omt_destroy(&BNC_BROADCAST_BUFFER(node, childnum));
r = toku_omt_create(&BNC_BROADCAST_BUFFER(node, childnum)); resource_assert_zero(r);
BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents BP_WORKDONE(node, childnum) = 0; // this buffer is drained, no work has been done by its contents
node->dirty=TRUE; node->dirty=TRUE;
...@@ -2911,73 +2885,56 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2911,73 +2885,56 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd)
brt_nonleaf_put_cmd(t, node, cmd); brt_nonleaf_put_cmd(t, node, cmd);
} }
// Effect: Report whether basement partition CHILDNUM of LEAF lags behind one of its ancestors.
//  Returns FALSE immediately if the partition is not in state PT_AVAIL.
//  Otherwise walks the ANCESTORS list from its head; at the first ancestor whose node dsn is
//  greater than the partition's BLB_MAX_DSN_APPLIED, increments brt_status.dsn_gap and returns TRUE.
//  Returns FALSE if no ancestor on the list has a greater dsn.
// Requires: LEAF is a leaf node (height 0).
static BOOL
partition_requires_msg_application(BRTNODE leaf, int childnum, ANCESTORS ancestors) {
    invariant(leaf->height == 0);
    if (BP_STATE(leaf,childnum) != PT_AVAIL) {
        return FALSE;
    }
    ANCESTORS anc = ancestors;
    while (anc) {
        // Compare the DSN of each nonleaf ancestor against the DSN recorded for this basement.
        if (anc->node->dsn.dsn > BLB_MAX_DSN_APPLIED(leaf,childnum).dsn) {
            brt_status.dsn_gap++;
            return TRUE;
        }
        anc = anc->next;
    }
    return FALSE;
}
// Effect: applies the cmd to the leaf if the appropriate basement node is in memory. // Effect: applies the cmd to the leaf if the appropriate basement node is in memory.
// If the appropriate basement node is not in memory, then nothing gets applied // If the appropriate basement node is not in memory, then nothing gets applied
// If the appropriate basement node must be in memory, it is the caller's responsibility to ensure // If the appropriate basement node must be in memory, it is the caller's responsibility to ensure
// that it is // that it is
void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, ANCESTORS ancestors, uint64_t *workdone) { void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, ANCESTORS UU(ancestors), uint64_t *workdone) {
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
// ignore messages that have already been applied to this leaf // ignore messages that have already been applied to this leaf
if (brt_msg_applies_once(cmd)) { if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t->db, t->compare_fun);
BOOL req_msg_app = partition_requires_msg_application(node, childnum, ancestors);
// only apply the message if we have an available basement node that is up to date // only apply the message if we have an available basement node that is up to date
// we know it is up to date if partition_requires_msg_application returns FALSE // we know it is up to date if partition_requires_msg_application returns FALSE
if (BP_STATE(node,childnum) == PT_AVAIL && !req_msg_app) { if (BP_STATE(node,childnum) == PT_AVAIL) {
brt_leaf_put_cmd(t, if (cmd->msn.msn > BLB(node, childnum)->max_msn_applied.msn) {
BLB(node, childnum), BLB(node, childnum)->max_msn_applied = cmd->msn;
&BP_SUBTREE_EST(node, childnum), brt_leaf_put_cmd(t,
cmd, BLB(node, childnum),
made_change, &BP_SUBTREE_EST(node, childnum),
workdone cmd,
); made_change,
workdone);
} else {
brt_status.msn_discards++;
}
} }
} }
else if (brt_msg_applies_all(cmd)) { else if (brt_msg_applies_all(cmd)) {
bool bn_made_change = false; bool bn_made_change = false;
for (int childnum=0; childnum<node->n_children; childnum++) { for (int childnum=0; childnum<node->n_children; childnum++) {
BOOL req_msg_app = partition_requires_msg_application(node, childnum, ancestors);
// only apply the message if we have an available basement node that is up to date // only apply the message if we have an available basement node that is up to date
// we know it is up to date if partition_requires_msg_application returns FALSE // we know it is up to date if partition_requires_msg_application returns FALSE
if (BP_STATE(node,childnum) == PT_AVAIL && !req_msg_app) { if (BP_STATE(node,childnum) == PT_AVAIL) {
brt_leaf_put_cmd( if (cmd->msn.msn > BLB(node, childnum)->max_msn_applied.msn) {
t, BLB(node, childnum)->max_msn_applied = cmd->msn;
BLB(node, childnum), brt_leaf_put_cmd(t,
&BP_SUBTREE_EST(node,childnum), BLB(node, childnum),
cmd, &BP_SUBTREE_EST(node,childnum),
&bn_made_change, cmd,
workdone &bn_made_change,
); workdone);
if (bn_made_change) *made_change = 1; if (bn_made_change) *made_change = 1;
} else {
brt_status.msn_discards++;
}
} }
} }
} }
else if (!brt_msg_does_nothing(cmd)) { else if (!brt_msg_does_nothing(cmd)) {
assert(FALSE); assert(FALSE);
} }
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
} }
...@@ -3051,105 +3008,6 @@ static u_int32_t get_roothash (BRT brt) { ...@@ -3051,105 +3008,6 @@ static u_int32_t get_roothash (BRT brt) {
return rh->fullhash; return rh->fullhash;
} }
// Forward declaration: this function and
// apply_cmd_to_in_memory_non_root_leaves_starting_at_node() call each other,
// so one of them has to be declared before the other is defined.
static void apply_cmd_to_in_memory_non_root_leaves (
BRT t,
CACHEKEY nodenum,
u_int32_t fullhash,
BRT_MSG cmd,
BRTNODE parent,
int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone,
bool *made_change_p
);
// Effect: Route CMD from NODE down toward the leaves.
//  If NODE is a nonleaf (height > 0):
//    - for an applies-once message, forward it to the single child chosen by toku_brtnode_which_child();
//    - for an applies-all message, forward it to every child;
//    - for any other message, forward nothing.
//    Forwarding is done through apply_cmd_to_in_memory_non_root_leaves().
//  Otherwise (leaf): apply CMD with toku_apply_cmd_to_leaf().
//  Afterwards, if *made_change_p is set and PARENT is non-NULL, call
//  fixup_child_estimates(parent, parents_childnum, node, FALSE).
// Parameters:
//  is_root         when TRUE, WORKDONE is replaced by the address of NODE's BP_WORKDONE slot for the child being visited.
//  parent          NODE's parent, or NULL; the code asserts that only the root has no parent.
//  workdone        passed down unchanged on non-root calls (see is_root).
//  made_change_p   may be NULL, in which case a local flag is used instead.
static void apply_cmd_to_in_memory_non_root_leaves_starting_at_node (BRT t,
BRTNODE node,
BRT_MSG cmd,
BOOL is_root,
BRTNODE parent,
int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone,
bool *made_change_p) {
bool made_change = false;
// Callers that do not care about the result pass NULL; give the code below something to write to.
if (made_change_p == NULL) {
made_change_p = &made_change;
}
// internal node
if (node->height>0) {
if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
// Extend the ancestor chain and pivot bounds by one level for the child we descend into.
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
if (is_root) // record workdone in root only, if not root then this is a recursive call so just pass along pointer
workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone, made_change_p);
}
else if (brt_msg_applies_all(cmd)) {
for (int childnum=0; childnum<node->n_children; childnum++) {
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
// At the root, workdone is re-pointed at this child's slot on every iteration.
if (is_root)
workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone, made_change_p);
}
}
}
// leaf node
else {
// This walk starts at a nonleaf root, so a leaf reached here must not be the root.
invariant(!is_root);
toku_apply_cmd_to_leaf(t, node, cmd, made_change_p, ancestors, workdone);
}
// Something below changed: refresh the parent's estimates for this node.
if (*made_change_p) {
if (parent) {
fixup_child_estimates(parent, parents_childnum, node, FALSE);
} else {
invariant(is_root); // only root has no parent
}
}
}
// Effect: Apply a single message, stored in the root's buffer(s), to all relevant leaves that are in memory.
//  Tries to pin node NODENUM with toku_cachetable_get_and_pin_if_in_memory().
//  If that call returns nonzero, nothing is done.
//  Otherwise the pinned node is handed to apply_cmd_to_in_memory_non_root_leaves_starting_at_node()
//  with is_root == FALSE, and is unpinned afterwards.
static void apply_cmd_to_in_memory_non_root_leaves (
    BRT t, CACHEKEY nodenum, u_int32_t fullhash, BRT_MSG cmd,
    BRTNODE parent, int parents_childnum,
    ANCESTORS ancestors, struct pivot_bounds const * const bounds,
    uint64_t * workdone, bool *made_change_p
    )
{
    void *node_v;
    if (toku_cachetable_get_and_pin_if_in_memory(t->cf, nodenum, fullhash, &node_v) != 0) {
        // Could not pin the node: skip it.
        return;
    }
    BRTNODE node = node_v;
    apply_cmd_to_in_memory_non_root_leaves_starting_at_node(t, node, cmd, FALSE,
                                                            parent, parents_childnum,
                                                            ancestors, bounds,
                                                            workdone, made_change_p);
    toku_unpin_brtnode(t, node);
}
CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *roothash) { CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *roothash) {
*roothash = get_roothash(brt); *roothash = get_roothash(brt);
return &brt->h->root; return &brt->h->root;
...@@ -3173,7 +3031,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -3173,7 +3031,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// get the root node // get the root node
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &bfe, &node); toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &bfe, &node);
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
cmd->msn.msn = node->max_msn_applied_to_node_on_disk.msn + 1; cmd->msn.msn = node->max_msn_applied_to_node_on_disk.msn + 1;
...@@ -3190,7 +3048,6 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -3190,7 +3048,6 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock) // verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock)
invariant(cmd->msn.msn == node->max_msn_applied_to_node_on_disk.msn); invariant(cmd->msn.msn == node->max_msn_applied_to_node_on_disk.msn);
if (node->height > 0) { if (node->height > 0) {
apply_cmd_to_in_memory_non_root_leaves_starting_at_node(brt, node, cmd, TRUE, NULL, -1, (ANCESTORS)NULL, &infinite_bounds, NULL, NULL);
if (nonleaf_node_is_gorged(node)) { if (nonleaf_node_is_gorged(node)) {
// No need for a loop here. We only inserted one message, so flushing a single child suffices. // No need for a loop here. We only inserted one message, so flushing a single child suffices.
flush_some_child(brt, node, TRUE, TRUE, flush_some_child(brt, node, TRUE, TRUE,
...@@ -3634,7 +3491,6 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) { ...@@ -3634,7 +3491,6 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) {
BRTNODE XMALLOC(node); BRTNODE XMALLOC(node);
toku_initialize_empty_brtnode(node, blocknum, 0, 1, t->h->layout_version, t->h->nodesize, t->flags); toku_initialize_empty_brtnode(node, blocknum, 0, 1, t->h->layout_version, t->h->nodesize, t->flags);
BP_STATE(node,0) = PT_AVAIL; BP_STATE(node,0) = PT_AVAIL;
set_new_DSN_for_node(node, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum);
node->fullhash = fullhash; node->fullhash = fullhash;
...@@ -3794,8 +3650,6 @@ brt_alloc_init_header(BRT t, TOKUTXN txn) { ...@@ -3794,8 +3650,6 @@ brt_alloc_init_header(BRT t, TOKUTXN txn) {
memset(&t->h->descriptor, 0, sizeof(t->h->descriptor)); memset(&t->h->descriptor, 0, sizeof(t->h->descriptor));
t->h->curr_dsn.dsn = MIN_DSN.dsn + 1; // start at MIN_DSN + 1, as MIN_DSN is reserved for basement nodes
r = brt_init_header(t, txn); r = brt_init_header(t, txn);
if (r != 0) goto died2; if (r != 0) goto died2;
return r; return r;
...@@ -5038,13 +4892,13 @@ static void search_save_bound (brt_search_t *search, DBT *pivot) { ...@@ -5038,13 +4892,13 @@ static void search_save_bound (brt_search_t *search, DBT *pivot) {
search->have_pivot_bound = TRUE; search->have_pivot_bound = TRUE;
} }
static BOOL search_pivot_is_bounded (brt_search_t *search, BRT brt, DBT *pivot) static BOOL search_pivot_is_bounded (brt_search_t *search, DB *cmp_extra, brt_compare_func cmp, DBT *pivot)
// Effect: Return TRUE iff the pivot has already been searched (for fixing #3522.) // Effect: Return TRUE iff the pivot has already been searched (for fixing #3522.)
// If searching from left to right, if we have already searched all the values less than pivot, we don't want to search again. // If searching from left to right, if we have already searched all the values less than pivot, we don't want to search again.
// If searching from right to left, if we have already searched all the values greater than pivot, we don't want to search again. // If searching from right to left, if we have already searched all the values greater than pivot, we don't want to search again.
{ {
if (!search->have_pivot_bound) return TRUE; // isn't bounded. if (!search->have_pivot_bound) return TRUE; // isn't bounded.
int comp = brt->compare_fun(brt->db, pivot, &search->pivot_bound); int comp = cmp(cmp_extra, pivot, &search->pivot_bound);
if (search->direction == BRT_SEARCH_LEFT) { if (search->direction == BRT_SEARCH_LEFT) {
// searching from left to right. If the comparison function says the pivot is <= something we already compared, don't do it again. // searching from left to right. If the comparison function says the pivot is <= something we already compared, don't do it again.
return comp>0; return comp>0;
...@@ -5074,13 +4928,90 @@ static BOOL msg_type_has_key (enum brt_msg_type m) { ...@@ -5074,13 +4928,90 @@ static BOOL msg_type_has_key (enum brt_msg_type m) {
assert(0); assert(0);
} }
// Accumulator handed to store_fifo_offset() through the OMT iteration "extra" pointer.
struct store_fifo_offset_extra {
long *offsets; // destination array, allocated and sized by the caller before iterating
int i; // index of the next free slot in offsets; advanced by one per stored value
};
// Effect: OMT iteration callback.  Interprets V as a fifo offset (a long) and appends it to
//  the offsets array in the struct store_fifo_offset_extra pointed to by EXTRAP.
//  The index argument is unused.  No bounds check is done here; the caller sizes the array.
// Returns: 0 always.
static int
store_fifo_offset(OMTVALUE v, u_int32_t UU(idx), void *extrap)
{
    struct store_fifo_offset_extra *const sink = extrap;
    sink->offsets[sink->i++] = (long) v;
    return 0;
}
// Effect: Comparator over fifo offsets that orders them by the msn of the fifo entry each one refers to.
//  EXTRAP is the FIFO the offsets index into; VA and VB point at the two offsets (longs) to compare.
// Returns: -1, 0 or +1 when a's msn is less than, equal to, or greater than b's msn.
static int
fifo_offset_msn_cmp(void *extrap, const void *va, const void *vb)
{
    FIFO fifo = extrap;
    const struct fifo_entry *a = toku_fifo_get_entry(fifo, *(const long *) va);
    const struct fifo_entry *b = toku_fifo_get_entry(fifo, *(const long *) vb);
    if (a->msn.msn < b->msn.msn) {
        return -1;
    }
    if (a->msn.msn > b->msn.msn) {
        return 1;
    }
    return 0;
}
// Effect: Unpack one buffered fifo ENTRY into a BRT_MSG_S and apply it to basement node BN.
//  The message is applied (via brt_leaf_put_cmd) only if its msn is greater than bn->max_msn_applied;
//  otherwise it is dropped and brt_status.msn_discards is incremented.
//  When a message is applied, *MAX_MSN_APPLIED is raised to its msn if that msn is larger.
//  bn->max_msn_applied itself is NOT modified here.
//  Work done is accounted to BP_WORKDONE(ancestor, childnum).
// Requires: if the message type carries a key, the key lies within the leaf range (LBE_PTR, UBI_PTR]
//  as judged by key_is_in_leaf_range (asserted).
static void
do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, int childnum, DBT *lbe_ptr, DBT *ubi_ptr, MSN *max_msn_applied, const struct fifo_entry *entry)
{
    const ITEMLEN klen = entry->keylen;
    const ITEMLEN vlen = entry->vallen;
    const enum brt_msg_type msg_type = (enum brt_msg_type)entry->type;
    const XIDS xids = (XIDS) &entry->xids_s;

    // The key bytes start where xids_get_end_of_array() points; the value bytes follow the key.
    bytevec key_bytes = xids_get_end_of_array(xids);
    bytevec val_bytes = (u_int8_t*)key_bytes + entry->keylen;

    DBT keydbt;
    toku_fill_dbt(&keydbt, key_bytes, klen);
    assert(!msg_type_has_key(msg_type) || key_is_in_leaf_range(t, &keydbt, lbe_ptr, ubi_ptr));

    DBT valdbt;
    BRT_MSG_S brtcmd = { msg_type, entry->msn, xids, .u.id = { &keydbt, toku_fill_dbt(&valdbt, val_bytes, vlen) } };
    bool made_change;

    // Messages arrive in (key,msn) order, so within one buffer all the messages for a given key
    // come in ascending msn order.  That is why it is fine to leave the basement node's own msn
    // untouched here and let the caller update it once at the end.
    if (brtcmd.msn.msn <= bn->max_msn_applied.msn) {
        brt_status.msn_discards++;
        return;
    }
    if (max_msn_applied->msn < brtcmd.msn.msn) {
        *max_msn_applied = brtcmd.msn;
    }
    brt_leaf_put_cmd(t, bn, se, &brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum));
}
// Arguments bundled for iterate_do_brt_leaf_put_cmd(), which receives them through the OMT
// iteration "extra" pointer and forwards every field to do_brt_leaf_put_cmd().
struct iterate_do_brt_leaf_put_cmd_extra {
BRT t;
BASEMENTNODE bn; // basement node the messages are applied to
SUBTREE_EST se;
BRTNODE ancestor; // together with childnum, also selects the fifo (BNC_BUFFER) the entries are read from
int childnum;
DBT *lbe_ptr; // lower bound (exclusive) of the leaf's key range, or NULL
DBT *ubi_ptr; // upper bound (inclusive) of the leaf's key range, or NULL
MSN *max_msn_applied; // running maximum msn of applied messages, updated by do_brt_leaf_put_cmd()
};
// Effect: OMT iteration callback.  Interprets V as an offset into the fifo
//  BNC_BUFFER(ancestor, childnum) named by the struct iterate_do_brt_leaf_put_cmd_extra at EXTRAP,
//  fetches that fifo entry, and passes it to do_brt_leaf_put_cmd() with the bundled arguments.
//  The index argument is unused.
// Returns: 0 always.
static int
iterate_do_brt_leaf_put_cmd(OMTVALUE v, u_int32_t UU(idx), void *extrap)
{
    struct iterate_do_brt_leaf_put_cmd_extra *args = extrap;
    FIFO buffer = BNC_BUFFER(args->ancestor, args->childnum);
    const struct fifo_entry *msg = toku_fifo_get_entry(buffer, (long) v);
    do_brt_leaf_put_cmd(args->t, args->bn, args->se,
                        args->ancestor, args->childnum,
                        args->lbe_ptr, args->ubi_ptr,
                        args->max_msn_applied, msg);
    return 0;
}
static int static int
apply_buffer_messages_to_basement_node ( apply_buffer_messages_to_basement_node (
BRT t, BRT t,
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se, SUBTREE_EST se,
BRTNODE ancestor, BRTNODE ancestor,
int childnum, int childnum,
struct pivot_bounds const * const bounds struct pivot_bounds const * const bounds
) )
// Effect: For each message in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node.
...@@ -5090,40 +5021,106 @@ apply_buffer_messages_to_basement_node ( ...@@ -5090,40 +5021,106 @@ apply_buffer_messages_to_basement_node (
{ {
assert(0 <= childnum && childnum < ancestor->n_children); assert(0 <= childnum && childnum < ancestor->n_children);
int r = 0; int r = 0;
DBT lbe, ubi; // lbe is lower bound exclusive, ubi is upper bound inclusive
MSN max_msn_applied = MIN_MSN;
u_int32_t lbe, ubi;
DBT lbedbt, ubidbt; // lbe is lower bound exclusive, ubi is upper bound inclusive
DBT *lbe_ptr, *ubi_ptr; DBT *lbe_ptr, *ubi_ptr;
if (bounds->lower_bound_exclusive==NULL) { if (bounds->lower_bound_exclusive) {
lbe_ptr = NULL; struct toku_fifo_entry_key_msn_heaviside_extra lbe_extra = {
.cmp_extra = t->db, .cmp = t->compare_fun,
.fifo = BNC_BUFFER(ancestor, childnum),
.key = kv_pair_key((struct kv_pair *) bounds->lower_bound_exclusive),
.keylen = kv_pair_keylen((struct kv_pair *) bounds->lower_bound_exclusive),
.msn = MAX_MSN };
// TODO: get this value and compare it with ubi to see if we even
// need to continue
OMTVALUE found_lb;
r = toku_omt_find(BNC_MESSAGE_TREE(ancestor, childnum),
toku_fifo_entry_key_msn_heaviside, &lbe_extra,
+1, &found_lb, &lbe);
if (r == DB_NOTFOUND) {
// no relevant data, we're done
if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) == 0) {
return 0;
} else {
lbe = 0;
lbe_ptr = NULL;
ubi = 0;
ubi_ptr = NULL;
goto just_apply_broadcast_messages;
}
}
if (bounds->upper_bound_inclusive) {
DBT ubidbt_tmp = kv_pair_key_to_dbt((struct kv_pair *) bounds->upper_bound_inclusive);
const long offset = (long) found_lb;
DBT found_lbedbt;
fill_dbt_for_fifo_entry(&found_lbedbt, toku_fifo_get_entry(BNC_BUFFER(ancestor, childnum), offset));
int c = t->compare_fun(t->db, &found_lbedbt, &ubidbt_tmp);
if (c > 0) {
if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) == 0) {
return 0;
} else {
lbe = 0;
lbe_ptr = NULL;
ubi = 0;
ubi_ptr = NULL;
goto just_apply_broadcast_messages;
}
}
}
lbedbt = kv_pair_key_to_dbt((struct kv_pair *) bounds->lower_bound_exclusive);
lbe_ptr = &lbedbt;
} else { } else {
lbe = kv_pair_key_to_dbt(bounds->lower_bound_exclusive); lbe = 0;
lbe_ptr = &lbe; lbe_ptr = NULL;
} }
if (bounds->upper_bound_inclusive==NULL) { if (bounds->upper_bound_inclusive) {
ubi_ptr = NULL; struct toku_fifo_entry_key_msn_heaviside_extra ubi_extra = {
.cmp_extra = t->db, .cmp = t->compare_fun,
.fifo = BNC_BUFFER(ancestor, childnum),
.key = kv_pair_key((struct kv_pair *) bounds->upper_bound_inclusive),
.keylen = kv_pair_keylen((struct kv_pair *) bounds->upper_bound_inclusive),
.msn = MAX_MSN };
r = toku_omt_find(BNC_MESSAGE_TREE(ancestor, childnum),
toku_fifo_entry_key_msn_heaviside, &ubi_extra,
+1, NULL, &ubi);
if (r == DB_NOTFOUND) {
ubi = toku_omt_size(BNC_MESSAGE_TREE(ancestor, childnum));
}
ubidbt = kv_pair_key_to_dbt((struct kv_pair *) bounds->upper_bound_inclusive);
ubi_ptr = &ubidbt;
} else { } else {
ubi = kv_pair_key_to_dbt(bounds->upper_bound_inclusive); ubi = toku_omt_size(BNC_MESSAGE_TREE(ancestor, childnum));
ubi_ptr = &ubi; ubi_ptr = NULL;
} }
assert(BP_STATE(ancestor,childnum) == PT_AVAIL);
FIFO_ITERATE(BNC_BUFFER(ancestor, childnum), key, keylen, val, vallen, type, msn, xids, just_apply_broadcast_messages:
({ if (toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum)) > 0) {
DBT hk; const int buffer_size = ubi - lbe + toku_omt_size(BNC_BROADCAST_BUFFER(ancestor, childnum));
toku_fill_dbt(&hk, key, keylen); long *MALLOC_N(buffer_size, offsets);
if ((!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) {
DBT hv; struct store_fifo_offset_extra sfo_extra = { .offsets = offsets, .i = 0 };
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk, r = toku_omt_iterate_on_range(BNC_MESSAGE_TREE(ancestor, childnum), lbe, ubi, store_fifo_offset, &sfo_extra); assert_zero(r);
toku_fill_dbt(&hv, val, vallen)} }; r = toku_omt_iterate(BNC_BROADCAST_BUFFER(ancestor, childnum), store_fifo_offset, &sfo_extra); assert_zero(r);
bool made_change; invariant(sfo_extra.i == buffer_size);
brt_leaf_put_cmd(t, r = mergesort_r(offsets, buffer_size, sizeof offsets[0], BNC_BUFFER(ancestor, childnum), fifo_offset_msn_cmp); assert_zero(r);
bn, se, assert(BP_STATE(ancestor, childnum) == PT_AVAIL);
&brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum)); for (int i = 0; i < buffer_size; ++i) {
} const struct fifo_entry *entry = toku_fifo_get_entry(BNC_BUFFER(ancestor, childnum), offsets[i]);
})); do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, lbe_ptr, ubi_ptr, &max_msn_applied, entry);
}
//F uint64_t end_workdone = BP_WORKDONE(ancestor, childnum);
// printf(" workdone = %"PRIu64", msndiff = 0x%"PRIx64", ancestorworkdone start, end = %"PRIu64", %"PRIu64"\n",
// workdone_this_leaf_total, node->max_msn_applied_to_node.msn - start_msn.msn, start_workdone, end_workdone);
toku_free(offsets);
} else {
assert(BP_STATE(ancestor, childnum) == PT_AVAIL);
struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .se = se, .ancestor = ancestor, .childnum = childnum, .lbe_ptr = lbe_ptr, .ubi_ptr = ubi_ptr, .max_msn_applied = &max_msn_applied };
r = toku_omt_iterate_on_range(BNC_MESSAGE_TREE(ancestor, childnum), lbe, ubi, iterate_do_brt_leaf_put_cmd, &iter_extra);
assert_zero(r);
}
if (max_msn_applied.msn > bn->max_msn_applied.msn) {
bn->max_msn_applied = max_msn_applied;
}
return r; return r;
} }
...@@ -5264,17 +5261,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5264,17 +5261,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// need to apply messages to each basement node // need to apply messages to each basement node
// TODO: (Zardosht) cilkify this // TODO: (Zardosht) cilkify this
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
BOOL requires_msg_application = partition_requires_msg_application(
node,
i,
ancestors
);
if (!requires_msg_application) {
continue;
}
update_stats = TRUE;
int height = 0; int height = 0;
if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE curr_bn = BLB(node, i); BASEMENTNODE curr_bn = BLB(node, i);
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i); SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds); struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
...@@ -5292,10 +5280,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5292,10 +5280,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// we don't want to check this node again if the next time // we don't want to check this node again if the next time
// we query it, the msn hasn't changed. // we query it, the msn hasn't changed.
curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk; curr_bn->max_msn_applied = curr_ancestors->node->max_msn_applied_to_node_on_disk;
update_stats = TRUE;
} }
curr_bn->max_dsn_applied = (curr_ancestors->node->dsn.dsn > curr_bn->max_dsn_applied.dsn)
? curr_ancestors->node->dsn
: curr_bn->max_dsn_applied;
} }
} }
// Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are // Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are
...@@ -5343,8 +5329,6 @@ brt_search_basement_node( ...@@ -5343,8 +5329,6 @@ brt_search_basement_node(
BOOL can_bulk_fetch BOOL can_bulk_fetch
) )
{ {
assert(bn->max_dsn_applied.dsn >= MIN_DSN.dsn);
// Now we have to convert from brt_search_t to the heaviside function with a direction. What a pain... // Now we have to convert from brt_search_t to the heaviside function with a direction. What a pain...
int direction; int direction;
...@@ -5483,7 +5467,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso ...@@ -5483,7 +5467,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso
BLOCKNUM nextchildblocknum = BP_BLOCKNUM(node, i); BLOCKNUM nextchildblocknum = BP_BLOCKNUM(node, i);
u_int32_t nextfullhash = compute_child_fullhash(brt->cf, node, i); u_int32_t nextfullhash = compute_child_fullhash(brt->cf, node, i);
struct brtnode_fetch_extra *MALLOC(bfe); struct brtnode_fetch_extra *MALLOC(bfe);
fill_bfe_for_prefetch(bfe, brt->h, brt, brtcursor); fill_bfe_for_prefetch(bfe, brt->h, brt->db, brt->compare_fun, brtcursor);
BOOL doing_prefetch = FALSE; BOOL doing_prefetch = FALSE;
toku_cachefile_prefetch( toku_cachefile_prefetch(
brt->cf, brt->cf,
...@@ -5540,7 +5524,8 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ ...@@ -5540,7 +5524,8 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
fill_bfe_for_subset_read( fill_bfe_for_subset_read(
&bfe, &bfe,
brt->h, brt->h,
brt, brt->db,
brt->compare_fun,
search, search,
&brtcursor->range_lock_left_key, &brtcursor->range_lock_left_key,
&brtcursor->range_lock_right_key, &brtcursor->range_lock_right_key,
...@@ -5592,18 +5577,19 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ ...@@ -5592,18 +5577,19 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
int int
toku_brt_search_which_child( toku_brt_search_which_child(
BRT brt, DB *cmp_extra,
BRTNODE node, brt_compare_func cmp,
BRTNODE node,
brt_search_t *search brt_search_t *search
) )
{ {
int c; int c;
DBT pivotkey; DBT pivotkey;
toku_init_dbt(&pivotkey); toku_init_dbt(&pivotkey);
/* binary search is overkill for a small array */ /* binary search is overkill for a small array */
int child[node->n_children]; int child[node->n_children];
/* scan left to right or right to left depending on the search direction */ /* scan left to right or right to left depending on the search direction */
for (c = 0; c < node->n_children; c++) { for (c = 0; c < node->n_children; c++) {
child[c] = (search->direction == BRT_SEARCH_LEFT) ? c : node->n_children - 1 - c; child[c] = (search->direction == BRT_SEARCH_LEFT) ? c : node->n_children - 1 - c;
...@@ -5612,7 +5598,7 @@ toku_brt_search_which_child( ...@@ -5612,7 +5598,7 @@ toku_brt_search_which_child(
int p = (search->direction == BRT_SEARCH_LEFT) ? child[c] : child[c] - 1; int p = (search->direction == BRT_SEARCH_LEFT) ? child[c] : child[c] - 1;
struct kv_pair *pivot = node->childkeys[p]; struct kv_pair *pivot = node->childkeys[p];
toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)); toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
if (search_pivot_is_bounded(search, brt, &pivotkey) && search->compare(search, &pivotkey)) { if (search_pivot_is_bounded(search, cmp_extra, cmp, &pivotkey) && search->compare(search, &pivotkey)) {
return child[c]; return child[c];
} }
} }
...@@ -5781,7 +5767,8 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf, ...@@ -5781,7 +5767,8 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf,
fill_bfe_for_subset_read( fill_bfe_for_subset_read(
&bfe, &bfe,
brt->h, brt->h,
brt, brt->db,
brt->compare_fun,
search, search,
&brtcursor->range_lock_left_key, &brtcursor->range_lock_left_key,
&brtcursor->range_lock_right_key, &brtcursor->range_lock_right_key,
...@@ -6230,7 +6217,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, ...@@ -6230,7 +6217,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename,
{ {
//assert(fullhash == toku_cachetable_hash(brt->cf, nodename)); //assert(fullhash == toku_cachetable_hash(brt->cf, nodename));
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h); fill_bfe_for_min_read(&bfe, brt->h, brt->db, brt->compare_fun);
toku_pin_brtnode_holding_lock(brt, nodename, fullhash, toku_pin_brtnode_holding_lock(brt, nodename, fullhash,
ancestors, bounds, &bfe, ancestors, bounds, &bfe,
&node); &node);
...@@ -6318,7 +6305,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) { ...@@ -6318,7 +6305,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash); CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY root = *rootp; CACHEKEY root = *rootp;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt->h); fill_bfe_for_min_read(&bfe, brt->h, brt->db, brt->compare_fun);
BRTNODE node; BRTNODE node;
toku_pin_brtnode_holding_lock(brt, root, fullhash, (ANCESTORS)NULL, &infinite_bounds, &bfe, &node); toku_pin_brtnode_holding_lock(brt, root, fullhash, (ANCESTORS)NULL, &infinite_bounds, &bfe, &node);
...@@ -6344,7 +6331,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -6344,7 +6331,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
void *node_v; void *node_v;
u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(brt->cf, blocknum);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int r = toku_cachetable_get_and_pin( int r = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
blocknum, blocknum,
...@@ -6661,7 +6648,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) { ...@@ -6661,7 +6648,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) {
BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum); BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum); u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum);
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int rr = toku_cachetable_get_and_pin( int rr = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
childblocknum, childblocknum,
...@@ -6706,7 +6693,7 @@ BOOL toku_brt_is_empty_fast (BRT brt) ...@@ -6706,7 +6693,7 @@ BOOL toku_brt_is_empty_fast (BRT brt)
{ {
void *node_v; void *node_v;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h, brt->db, brt->compare_fun);
int rr = toku_cachetable_get_and_pin( int rr = toku_cachetable_get_and_pin(
brt->cf, brt->cf,
*rootp, *rootp,
......
...@@ -120,7 +120,7 @@ static void ...@@ -120,7 +120,7 @@ static void
dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
BRTNODE n; BRTNODE n;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, h); fill_bfe_for_full_read(&bfe, h, NULL, NULL);
int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe); int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe);
assert(r==0); assert(r==0);
assert(n!=0); assert(n!=0);
...@@ -230,7 +230,7 @@ fragmentation_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) ...@@ -230,7 +230,7 @@ fragmentation_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra)
frag_help_extra *info = extra; frag_help_extra *info = extra;
BRTNODE n; BRTNODE n;
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->h); fill_bfe_for_full_read(&bfe, info->h, NULL, NULL);
int r = toku_deserialize_brtnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe); int r = toku_deserialize_brtnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe);
if (r==0) { if (r==0) {
info->blocksizes += size; info->blocksizes += size;
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#endif #endif
#define _FILE_OFFSET_BITS 64 #define _FILE_OFFSET_BITS 64
#include "toku_assert.h"
#include <db.h> #include <db.h>
#include <inttypes.h> #include <inttypes.h>
...@@ -60,11 +61,6 @@ typedef struct __toku_msn { u_int64_t msn; } MSN; ...@@ -60,11 +61,6 @@ typedef struct __toku_msn { u_int64_t msn; } MSN;
#define MIN_MSN ((MSN){(u_int64_t)1000*1000*1000}) // first 1B values reserved for messages created before Dr. No (for upgrade) #define MIN_MSN ((MSN){(u_int64_t)1000*1000*1000}) // first 1B values reserved for messages created before Dr. No (for upgrade)
#define MAX_MSN ((MSN){UINT64_MAX}) #define MAX_MSN ((MSN){UINT64_MAX})
typedef struct __toku_dsn { int64_t dsn; } DSN; // DESERIALIZATION sequence number
#define INVALID_DSN ((DSN){-1})
#define MIN_DSN ((DSN){0})
#define MAX_DSN ((DSN){INT64_MAX})
/* At the brt layer, a FILENUM uniquely identifies an open file. /* At the brt layer, a FILENUM uniquely identifies an open file.
* At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary. * At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary.
* With the introduction of the loader (ticket 2216), it is possible for the file that holds * With the introduction of the loader (ticket 2216), it is possible for the file that holds
...@@ -123,6 +119,68 @@ enum brt_msg_type { ...@@ -123,6 +119,68 @@ enum brt_msg_type {
BRT_UPDATE_BROADCAST_ALL = 15 BRT_UPDATE_BROADCAST_ALL = 15
}; };
/*
 * Classify a message type as one of the "applies once" types.
 *
 * Returns TRUE for BRT_INSERT_NO_OVERWRITE, BRT_INSERT, BRT_DELETE_ANY,
 * BRT_ABORT_ANY, BRT_COMMIT_ANY and BRT_UPDATE.
 * Returns FALSE for the broadcast types, the optimize types and BRT_NONE.
 * Any other value trips the assertion in the default branch.
 */
static inline BOOL
brt_msg_type_applies_once(enum brt_msg_type type)
{
    // Start from a defined value so the final return never reads an
    // indeterminate variable, even if the assertion in the default branch
    // is compiled out or does not abort.
    BOOL ret_val = FALSE;
    switch (type) {
    case BRT_INSERT_NO_OVERWRITE:
    case BRT_INSERT:
    case BRT_DELETE_ANY:
    case BRT_ABORT_ANY:
    case BRT_COMMIT_ANY:
    case BRT_UPDATE:
        ret_val = TRUE;
        break;
    case BRT_COMMIT_BROADCAST_ALL:
    case BRT_COMMIT_BROADCAST_TXN:
    case BRT_ABORT_BROADCAST_TXN:
    case BRT_OPTIMIZE:
    case BRT_OPTIMIZE_FOR_UPGRADE:
    case BRT_UPDATE_BROADCAST_ALL:
    case BRT_NONE:
        ret_val = FALSE;
        break;
    default:
        // Unknown message type: this is a programming error.
        assert(FALSE);
    }
    return ret_val;
}
/*
 * Classify a message type as one of the "applies to all" types.
 *
 * Returns TRUE for BRT_COMMIT_BROADCAST_ALL, BRT_COMMIT_BROADCAST_TXN,
 * BRT_ABORT_BROADCAST_TXN, BRT_OPTIMIZE, BRT_OPTIMIZE_FOR_UPGRADE and
 * BRT_UPDATE_BROADCAST_ALL.
 * Returns FALSE for BRT_NONE and for the insert/delete/abort/commit/update
 * types that carry no broadcast suffix.
 * Any other value trips the assertion in the default branch.
 */
static inline BOOL
brt_msg_type_applies_all(enum brt_msg_type type)
{
    // Start from a defined value so the final return never reads an
    // indeterminate variable, even if the assertion in the default branch
    // is compiled out or does not abort.
    BOOL ret_val = FALSE;
    switch (type) {
    case BRT_NONE:
    case BRT_INSERT_NO_OVERWRITE:
    case BRT_INSERT:
    case BRT_DELETE_ANY:
    case BRT_ABORT_ANY:
    case BRT_COMMIT_ANY:
    case BRT_UPDATE:
        ret_val = FALSE;
        break;
    case BRT_COMMIT_BROADCAST_ALL:
    case BRT_COMMIT_BROADCAST_TXN:
    case BRT_ABORT_BROADCAST_TXN:
    case BRT_OPTIMIZE:
    case BRT_OPTIMIZE_FOR_UPGRADE:
    case BRT_UPDATE_BROADCAST_ALL:
        ret_val = TRUE;
        break;
    default:
        // Unknown message type: this is a programming error.
        assert(FALSE);
    }
    return ret_val;
}
/*
 * Returns TRUE exactly when the message type is BRT_NONE, and FALSE for
 * every other value.
 */
static inline BOOL
brt_msg_type_does_nothing(enum brt_msg_type type)
{
    const BOOL is_none = (type == BRT_NONE);
    return is_none;
}
typedef struct xids_t *XIDS; typedef struct xids_t *XIDS;
typedef struct fifo_msg_t *FIFO_MSG; typedef struct fifo_msg_t *FIFO_MSG;
/* tree commands */ /* tree commands */
......
...@@ -42,7 +42,6 @@ static u_int64_t cachetable_puts; // how many times has a newly created ...@@ -42,7 +42,6 @@ static u_int64_t cachetable_puts; // how many times has a newly created
static u_int64_t cachetable_prefetches; // how many times has a block been prefetched into the cachetable? static u_int64_t cachetable_prefetches; // how many times has a block been prefetched into the cachetable?
static u_int64_t cachetable_maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called? static u_int64_t cachetable_maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called?
static u_int64_t cachetable_maybe_get_and_pin_hits; // how many times has get_and_pin(_clean) returned with a node? static u_int64_t cachetable_maybe_get_and_pin_hits; // how many times has get_and_pin(_clean) returned with a node?
static u_int64_t cachetable_get_and_pin_if_in_memorys; // how many times has get_and_pin_if_in_memorys been called?
static u_int64_t cachetable_wait_checkpoint; // number of times get_and_pin waits for a node to be written for a checkpoint static u_int64_t cachetable_wait_checkpoint; // number of times get_and_pin waits for a node to be written for a checkpoint
static u_int64_t cachetable_misstime; // time spent waiting for disk read static u_int64_t cachetable_misstime; // time spent waiting for disk read
static u_int64_t cachetable_waittime; // time spent waiting for another thread to release lock (e.g. prefetch, writing) static u_int64_t cachetable_waittime; // time spent waiting for another thread to release lock (e.g. prefetch, writing)
...@@ -1733,42 +1732,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3 ...@@ -1733,42 +1732,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3
return r; return r;
} }
int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value)
// Effect: Look up a key in the cachetable. If it is found, acquire a read lock on the pair, don't update the LRU list, and return success.
// Unlike toku_cachetable_maybe_get_and_pin, which gives up if there is any blocking (e.g., the node is waiting to be checkpointed), this
// version waits.
// Rationale: orthodox pushing needs to get the in-memory state right.
// Don't update the LRU list because we don't want this operation to cause something to stick in memory longer.
// Returns: 0 and stores the pair's value in *value if a matching pair is on the hash chain; -1 (with *value untouched) otherwise.
{
CACHETABLE ct = cachefile->cachetable;
PAIR p;
// count: number of hash-chain entries visited, handed to note_hash_count() below.
int count = 0;
// r stays -1 unless a matching pair is found.
int r = -1;
cachetable_lock(ct);
cachetable_get_and_pin_if_in_memorys++;
// Walk the hash chain of the bucket selected by the low bits of fullhash.
for (p=ct->table[fullhash&(ct->table_size-1)]; p; p=p->hash_chain) {
count++;
if (p->key.b==key.b && p->cachefile==cachefile) {
// It's the right block. Now we must wait.
// A pair with a pending checkpoint is written out before the read lock is taken.
if (p->checkpoint_pending) {
write_pair_for_checkpoint(ct, p, FALSE);
}
// NOTE(review): the cachetable mutex is passed in, presumably so it can be released while waiting for the rwlock -- confirm in rwlock_read_lock.
rwlock_read_lock(&p->rwlock, ct->mutex);
if (p->state == CTPAIR_INVALID) {
assert(0); // This is the branch that returns ENODEV in the get_and_pin code in the 5.0 branch. Let's just crash now.
}
// do not increment PAIR's clock count.
*value = p->value;
cachetable_hit++;
r = 0;
break;
}
}
note_hash_count(count);
cachetable_unlock(ct);
return r;
}
//Used by shortcut query path. //Used by shortcut query path.
//Same as toku_cachetable_maybe_get_and_pin except that we don't care if the node is clean or dirty (return the node regardless). //Same as toku_cachetable_maybe_get_and_pin except that we don't care if the node is clean or dirty (return the node regardless).
//All other conditions remain the same. //All other conditions remain the same.
...@@ -2955,7 +2918,6 @@ void toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS s) { ...@@ -2955,7 +2918,6 @@ void toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS s) {
s->prefetches = cachetable_prefetches; s->prefetches = cachetable_prefetches;
s->maybe_get_and_pins = cachetable_maybe_get_and_pins; s->maybe_get_and_pins = cachetable_maybe_get_and_pins;
s->maybe_get_and_pin_hits = cachetable_maybe_get_and_pin_hits; s->maybe_get_and_pin_hits = cachetable_maybe_get_and_pin_hits;
s->get_and_pin_if_in_memorys = cachetable_get_and_pin_if_in_memorys;
s->size_current = ct->size_current; s->size_current = ct->size_current;
s->size_limit = ct->size_limit; s->size_limit = ct->size_limit;
s->size_max = ct->size_max; s->size_max = ct->size_max;
......
...@@ -232,12 +232,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, u_int32_t /*fullhash ...@@ -232,12 +232,6 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, u_int32_t /*fullhash
// Returns: If the the item is already in memory, then return 0 and store it in the // Returns: If the the item is already in memory, then return 0 and store it in the
// void**. If the item is not in memory, then return a nonzero error number. // void**. If the item is not in memory, then return a nonzero error number.
int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE /*cachefile*/, CACHEKEY /*key*/, u_int32_t /*fullhash*/, void**/*value*/);
// Effect: Get and pin an object if it is in memory, (even if doing so would require blocking, e.g., to wait on a checkpoint).
// This is similar to maybe_get_and_pin except that maybe_get_and_pin won't block waiting on a checkpoint.
// Returns: 0 iff the item is in memory (otherwise return a error)
// Modifies: *value (if returning 0, then the pointer to the value is stored in *value.
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**); int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**);
// Effect: Like maybe get and pin, but may pin a clean pair. // Effect: Like maybe get and pin, but may pin a clean pair.
...@@ -403,7 +397,6 @@ typedef struct cachetable_status { ...@@ -403,7 +397,6 @@ typedef struct cachetable_status {
u_int64_t prefetches; // how many times has a block been prefetched into the cachetable? u_int64_t prefetches; // how many times has a block been prefetched into the cachetable?
u_int64_t maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called? u_int64_t maybe_get_and_pins; // how many times has maybe_get_and_pin(_clean) been called?
u_int64_t maybe_get_and_pin_hits; // how many times has maybe_get_and_pin(_clean) returned with a node? u_int64_t maybe_get_and_pin_hits; // how many times has maybe_get_and_pin(_clean) returned with a node?
u_int64_t get_and_pin_if_in_memorys; // how many times has get_and_pin_if_in_memory been called?
int64_t size_current; // the sum of the sizes of the nodes represented in the cachetable int64_t size_current; // the sum of the sizes of the nodes represented in the cachetable
int64_t size_limit; // the limit to the sum of the node sizes int64_t size_limit; // the limit to the sum of the node sizes
int64_t size_max; // high water mark of size_current (max value size_current ever had) int64_t size_max; // high water mark of size_current (max value size_current ever had)
......
...@@ -69,7 +69,7 @@ void toku_fifo_size_hint(FIFO fifo, size_t size) { ...@@ -69,7 +69,7 @@ void toku_fifo_size_hint(FIFO fifo, size_t size) {
} }
} }
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, MSN msn, XIDS xids) { int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, MSN msn, XIDS xids, long *dest) {
int need_space_here = sizeof(struct fifo_entry) int need_space_here = sizeof(struct fifo_entry)
+ keylen + datalen + keylen + datalen
+ xids_get_size(xids) + xids_get_size(xids)
...@@ -80,24 +80,26 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d ...@@ -80,24 +80,26 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d
fifo->memory = toku_malloc(fifo->memory_size); fifo->memory = toku_malloc(fifo->memory_size);
} }
if (fifo->memory_start+need_space_total > fifo->memory_size) { if (fifo->memory_start+need_space_total > fifo->memory_size) {
// Out of memory at the end. // Out of memory at the end.
int next_2 = next_power_of_two(need_space_total); int next_2 = next_power_of_two(need_space_total);
if ((2*next_2 > fifo->memory_size) if ((2*next_2 > fifo->memory_size)
|| (8*next_2 < fifo->memory_size)) { || (8*next_2 < fifo->memory_size)) {
// resize the fifo // resize the fifo
char *newmem = toku_malloc(next_2); char *newmem = toku_malloc(next_2);
char *oldmem = fifo->memory; char *oldmem = fifo->memory;
if (newmem==0) return ENOMEM; if (newmem==0) return ENOMEM;
memcpy(newmem, oldmem+fifo->memory_start, fifo->memory_used); memcpy(newmem, oldmem+fifo->memory_start, fifo->memory_used);
fifo->memory_size = next_2; fifo->memory_size = next_2;
fifo->memory_start = 0; assert(fifo->memory_start == 0);
fifo->memory = newmem; fifo->memory_start = 0;
toku_free(oldmem); fifo->memory = newmem;
} else { toku_free(oldmem);
// slide things over } else {
memmove(fifo->memory, fifo->memory+fifo->memory_start, fifo->memory_used); // slide things over
fifo->memory_start = 0; memmove(fifo->memory, fifo->memory+fifo->memory_start, fifo->memory_used);
} assert(fifo->memory_start == 0);
fifo->memory_start = 0;
}
} }
struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_start + fifo->memory_used); struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_start + fifo->memory_used);
entry->type = (unsigned char)type; entry->type = (unsigned char)type;
...@@ -108,13 +110,17 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d ...@@ -108,13 +110,17 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d
memcpy(e_key, key, keylen); memcpy(e_key, key, keylen);
entry->vallen = datalen; entry->vallen = datalen;
memcpy(e_key + keylen, data, datalen); memcpy(e_key + keylen, data, datalen);
if (dest) {
assert(fifo->memory_start == 0);
*dest = fifo->memory_used;
}
fifo->n_items_in_fifo++; fifo->n_items_in_fifo++;
fifo->memory_used += need_space_here; fifo->memory_used += need_space_here;
return 0; return 0;
} }
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd) { int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd, long *dest) {
return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->msn, cmd->xids); return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->msn, cmd->xids, dest);
} }
/* peek at the head (the oldest entry) of the fifo */ /* peek at the head (the oldest entry) of the fifo */
...@@ -193,3 +199,10 @@ unsigned long toku_fifo_memory_size(FIFO fifo) { ...@@ -193,3 +199,10 @@ unsigned long toku_fifo_memory_size(FIFO fifo) {
return sizeof(*fifo)+fifo->memory_size; return sizeof(*fifo)+fifo->memory_size;
} }
/*
 * Fill dbt so that it refers to the key of a fifo entry.
 *
 * The key's start address is whatever xids_get_end_of_array() returns for
 * the entry's embedded xids, and its length is entry->keylen.
 * Returns the result of toku_fill_dbt().
 */
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry) {
    // The cast drops const because xids_get_end_of_array() takes an XIDS.
    XIDS entry_xids = (XIDS) &entry->xids_s;
    return toku_fill_dbt(dbt,
                         xids_get_end_of_array(entry_xids),
                         entry->keylen);
}
/*
 * Return a read-only pointer to the fifo entry at offset off.
 *
 * This is a thin wrapper over toku_fifo_iterate_internal_get_entry().
 * NOTE(review): that function is declared with an int offset, so the long
 * passed here is narrowed to int at the call.
 */
const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off) {
    const struct fifo_entry *found = toku_fifo_iterate_internal_get_entry(fifo, off);
    return found;
}
...@@ -44,9 +44,9 @@ void toku_fifo_size_is_stabilized(FIFO); ...@@ -44,9 +44,9 @@ void toku_fifo_size_is_stabilized(FIFO);
int toku_fifo_n_entries(FIFO); int toku_fifo_n_entries(FIFO);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd); int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd, long *dest);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, MSN msn, XIDS xids); int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, MSN msn, XIDS xids, long *dest);
int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, MSN *msn, XIDS *xids); int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, MSN *msn, XIDS *xids);
...@@ -81,6 +81,9 @@ int toku_fifo_iterate_internal_has_more(FIFO fifo, int off); ...@@ -81,6 +81,9 @@ int toku_fifo_iterate_internal_has_more(FIFO fifo, int off);
int toku_fifo_iterate_internal_next(FIFO fifo, int off); int toku_fifo_iterate_internal_next(FIFO fifo, int off);
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off); struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off);
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry);
const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off);
#if defined(__cplusplus) || defined(__cilkplusplus) #if defined(__cplusplus) || defined(__cilkplusplus)
}; };
#endif #endif
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "includes.h"
#include "sort.h"
// Cilk support is optional and selected by HAVE_CILK.
#if defined(HAVE_CILK)
#include <cilk/cilk.h>
#define cilk_worker_count (__cilkrts_get_nworkers())
#else
// Without Cilk, cilk_spawn and cilk_sync expand to nothing and cilk_for
// becomes a plain for, so code written with these keywords compiles as
// ordinary serial C. The worker count is fixed at 1.
#define cilk_spawn
#define cilk_sync
#define cilk_for for
#define cilk_worker_count 1
#endif
/*
 * Serial merge of two sorted runs into vdest.
 *
 *   vdest  output buffer with room for (an + bn) elements
 *   va/an  first run and its element count
 *   vb/bn  second run and its element count
 *   width  size of one element in bytes
 *   extra  passed as the first argument to cmp
 *   cmp    comparison function, called as cmp(extra, x, y)
 *
 * An element is taken from the first run only when cmp reports it strictly
 * less than the head of the second run; otherwise the second run's head is
 * taken. Always returns 0.
 */
static int
merge_c(void *vdest, void *va, int an, void *vb, int bn, int width,
        void *extra, int (*cmp)(void *, const void *, const void *))
{
    char *out = vdest;
    char *left = va;
    char *right = vb;
    int left_remaining = an;
    int right_remaining = bn;

    // Emit one element per iteration while both runs are non-empty.
    for (; left_remaining > 0 && right_remaining > 0; out += width) {
        if (cmp(extra, left, right) < 0) {
            memcpy(out, left, width);
            left += width;
            left_remaining--;
        } else {
            memcpy(out, right, width);
            right += width;
            right_remaining--;
        }
    }

    // At most one run still has elements; copy its tail in one shot.
    if (left_remaining > 0) {
        memcpy(out, left, left_remaining * width);
    }
    if (right_remaining > 0) {
        memcpy(out, right, right_remaining * width);
    }
    return 0;
}
/*
 * Binary search for key in the sorted array va of n elements.
 *
 *   key      element to look for
 *   va, n    array to search and its element count
 *   abefore  number of elements logically preceding va; added to the result
 *   width    size of one element in bytes
 *   extra    passed as the first argument to cmp
 *   cmp      comparison function, called as cmp(extra, key, element)
 *
 * Returns abefore plus an index into va: the index of an element that
 * compares equal to key if the probe sequence hits one, otherwise the
 * position at which the search narrowed down (abefore itself when n == 0).
 */
static int
binsearch(void *key, void *va, int n, int abefore, int width,
          void *extra, int (*cmp)(void *, const void *, const void *))
{
    char *base = va;
    int offset = abefore;

    while (n != 0) {
        const int mid = n / 2;
        const int c = cmp(extra, key, base + mid * width);
        if (c == 0) {
            // this won't happen because msns are unique, but is here for completeness
            return offset + mid;
        }
        if (c < 0) {
            // key sorts before the probe: continue in the lower part.
            if (n == 1) {
                return offset;
            }
            n = mid;
        } else {
            // key sorts after the probe: continue from the probe upward.
            if (n == 1) {
                return offset + 1;
            }
            base += mid * width;
            offset += mid;
            n -= mid;
        }
    }
    return offset;
}
/*
 * Merge two sorted runs into vdest, splitting the work recursively.
 *
 * When fewer than 10000 elements are involved in total, the work is handed
 * straight to merge_c(). Otherwise the longer run is cut in half, the
 * matching cut point in the shorter run is found with binsearch(), and the
 * two resulting pairs of sub-runs are merged independently (the first one
 * under cilk_spawn) into adjacent regions of vdest.
 *
 * Parameters are as for merge_c(). Returns the first non-zero result of the
 * two sub-merges, or 0.
 */
static int
merge(void *vdest, void *va, int an, void *vb, int bn, int width,
      void *extra, int (*cmp)(void *, const void *, const void *))
{
    if (an + bn < 10000) {
        return merge_c(vdest, va, an, vb, bn, width, extra, cmp);
    }

    char *out = vdest;
    char *longer = va;
    char *shorter = vb;
    int longer_n = an;
    int shorter_n = bn;
    if (longer_n < shorter_n) {
        // Make sure the run being halved is the longer one.
        char *swap_ptr = longer;
        longer = shorter;
        shorter = swap_ptr;
        int swap_n = longer_n;
        longer_n = shorter_n;
        shorter_n = swap_n;
    }

    const int longer_cut = longer_n / 2;
    void *pivot = longer + longer_cut * width;
    const int shorter_cut = binsearch(pivot, shorter, shorter_n, 0, width, extra, cmp);

    int r_low, r_high;
    r_low = cilk_spawn merge(out, longer, longer_cut, shorter, shorter_cut, width, extra, cmp);
    r_high = merge(out + (longer_cut + shorter_cut) * width,
                   longer + longer_cut * width, longer_n - longer_cut,
                   shorter + shorter_cut * width, shorter_n - shorter_cut,
                   width, extra, cmp);
    cilk_sync;
    return (r_low != 0) ? r_low : r_high;
}
/*
 * Sort the n elements of va in place with a top-down merge sort.
 *
 *   va     array of elements
 *   n      number of elements
 *   width  size of each element in bytes
 *   extra  passed as the first argument to cmp
 *   cmp    comparison function, called as cmp(extra, x, y)
 *
 * Arrays of more than 10000 elements sort their first half under cilk_spawn
 * and merge with merge(); smaller arrays recurse serially and merge with
 * merge_c(). Each merge goes through a scratch buffer of n * width bytes
 * obtained from toku_xmalloc() and is then copied back over va.
 *
 * Returns 0 on success, or the first non-zero result from a sub-step.
 */
int
mergesort_r(void *va, int n, int width,
            void *extra, int (*cmp)(void *, const void *, const void *))
{
    if (n <= 1) {
        return 0;
    }
    const BOOL parallel = (n > 10000);
    unsigned char *base = va;
    const int left_n = n / 2;
    const int right_n = n - left_n;
    unsigned char *right = base + left_n * width;

    // Sort both halves; only the first is spawned, and only for big inputs.
    int r_left, r_right;
    if (parallel) {
        r_left = cilk_spawn mergesort_r(base, left_n, width, extra, cmp);
    } else {
        r_left = mergesort_r(base, left_n, width, extra, cmp);
    }
    r_right = mergesort_r(right, right_n, width, extra, cmp);
    cilk_sync;
    if (r_left != 0) {
        return r_left;
    }
    if (r_right != 0) {
        return r_right;
    }

    // Merge into scratch space, then copy back only if the merge succeeded.
    void *scratch = toku_xmalloc(n * width);
    int r;
    if (parallel) {
        r = merge(scratch, base, left_n, right, right_n, width, extra, cmp);
    } else {
        r = merge_c(scratch, base, left_n, right, right_n, width, extra, cmp);
    }
    if (r == 0) {
        memcpy(base, scratch, n * width);
    }
    toku_free(scratch);
    return r;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef SORT_H
#define SORT_H
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#if defined(__cplusplus) || defined(__cilkplusplus)
extern "C" {
#endif
// Imitates qsort_r, which is not available in CentOS 5's version of libc.
// Parallelized with cilk, and therefore probably faster than qsort_r on large arrays.
// TODO: switch to qsort_r for small arrays (at the bottom of the recursion)
// this requires figuring out what to do about libc
//
// a: array of elements
// n: number of elements
// width: size of each element in bytes
// extra: extra data for comparison function (passed in as first parameter)
// cmp: comparison function, called as cmp(extra, x, y). The extra pointer
//      comes FIRST, as in the BSD qsort_r comparator; glibc's qsort_r
//      passes its context argument last, so a glibc-style comparator
//      cannot be used here unchanged.
//
// Returns 0 on success.
int
mergesort_r(void *a, int n, int width,
void *extra, int (*cmp)(void *, const void *, const void *));
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif
#endif
...@@ -84,18 +84,25 @@ enum brtnode_verify_type { ...@@ -84,18 +84,25 @@ enum brtnode_verify_type {
read_none read_none
}; };
// Test comparator: orders two DBTs by running strcmp() on their data
// pointers. The DB argument is unused.
// assumes both keys hold NUL-terminated strings -- TODO confirm against the tests that build the keys
static int
string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
    const char *lhs = a->data;
    const char *rhs = b->data;
    return strcmp(lhs, rhs);
}
static void static void
setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE *dn) { setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE *dn) {
int r; int r;
if (bft == read_all) { if (bft == read_all) {
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt_h); fill_bfe_for_full_read(&bfe, brt_h, NULL, string_key_cmp);
r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe); r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe);
assert(r==0); assert(r==0);
} }
else if (bft == read_compressed || bft == read_none) { else if (bft == read_compressed || bft == read_none) {
struct brtnode_fetch_extra bfe; struct brtnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, brt_h); fill_bfe_for_min_read(&bfe, brt_h, NULL, string_key_cmp);
r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe); r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe);
assert(r==0); assert(r==0);
// assert all bp's are compressed // assert all bp's are compressed
...@@ -118,7 +125,7 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE ...@@ -118,7 +125,7 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE
} }
} }
// now decompress them // now decompress them
fill_bfe_for_full_read(&bfe, brt_h); fill_bfe_for_full_read(&bfe, brt_h, NULL, string_key_cmp);
assert(toku_brtnode_pf_req_callback(*dn, &bfe)); assert(toku_brtnode_pf_req_callback(*dn, &bfe));
long size; long size;
r = toku_brtnode_pf_callback(*dn, &bfe, fd, &size); r = toku_brtnode_pf_callback(*dn, &bfe, fd, &size);
...@@ -1067,9 +1074,9 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { ...@@ -1067,9 +1074,9 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
r = xids_create_child(xids_123, &xids_234, (TXNID)234); r = xids_create_child(xids_123, &xids_234, (TXNID)234);
CKERR(r); CKERR(r);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, next_dummymsn(), xids_0); assert(r==0); r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, next_dummymsn(), xids_0, NULL); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, next_dummymsn(), xids_123); assert(r==0); r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, next_dummymsn(), xids_123, NULL); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234); assert(r==0); r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234, NULL); assert(r==0);
BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123); BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123);
BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234); BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234);
//Cleanup: //Cleanup:
......
...@@ -154,7 +154,7 @@ static void verify_cachetable_against_present (void) { ...@@ -154,7 +154,7 @@ static void verify_cachetable_against_present (void) {
for (i=0; i<my_n_present; i++) { for (i=0; i<my_n_present; i++) {
void *v; void *v;
u_int32_t fullhash = toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key); u_int32_t fullhash = toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key);
int r=toku_cachetable_get_and_pin_if_in_memory(my_present_items[i].cf, int r=toku_cachetable_maybe_get_and_pin_clean(my_present_items[i].cf,
my_present_items[i].key, my_present_items[i].key,
toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key), toku_cachetable_hash(my_present_items[i].cf, my_present_items[i].key),
&v); &v);
......
...@@ -57,7 +57,7 @@ test_fifo_enq (int n) { ...@@ -57,7 +57,7 @@ test_fifo_enq (int n) {
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
if (startmsn.msn == ZERO_MSN.msn) if (startmsn.msn == ZERO_MSN.msn)
startmsn = msn; startmsn = msn;
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids); assert(r == 0); r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids, NULL); assert(r == 0);
xids_destroy(&xids); xids_destroy(&xids);
} }
......
...@@ -58,13 +58,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -58,13 +58,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static void static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
unsigned int key = htonl(val); unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
node->max_msn_applied_to_node_on_disk = msn; node->max_msn_applied_to_node_on_disk = msn;
} }
} }
...@@ -89,7 +89,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey, ...@@ -89,7 +89,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k); toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
} }
toku_unpin_brtnode(brt, child); toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]); insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
} }
*minkey = minkeys[0]; *minkey = minkeys[0];
*maxkey = maxkeys[0]; *maxkey = maxkeys[0];
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "test.h"
#include <stdlib.h>
#include "sort.h"
// Exclusive upper bound on the element values used by these tests: int_cmp and
// check_int_array assert that every element they see is below it, and
// random_array_test reduces rand() modulo this value.
const int MAX_NUM = 0x0fffffffL;
// Sentinel handed to mergesort_r as the `extra` argument; int_cmp asserts that
// the pointer it receives refers to this value.
// NOTE(review): 0xd3adb00f does not fit in a 32-bit int, so this initializer
// relies on an implementation-defined conversion -- confirm that is acceptable.
int MAGIC_EXTRA = 0xd3adb00f;
// Three-way comparison of two ints for mergesort_r.
// ve: must point at an int equal to MAGIC_EXTRA (asserted)
// va, vb: pointers to the two ints being compared; each must be below MAX_NUM (asserted)
// Returns -1, 0 or 1 when *va is less than, equal to, or greater than *vb.
static int
int_cmp(void *ve, const void *va, const void *vb)
{
    // The sort must hand the comparator exactly the extra pointer it was given.
    const int *magic = ve;
    assert(magic != NULL);
    assert(*magic == MAGIC_EXTRA);

    const int lhs = *(const int *) va;
    assert(lhs < MAX_NUM);
    const int rhs = *(const int *) vb;
    assert(rhs < MAX_NUM);

    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
// Assert that a[0..nelts-1] is in non-decreasing order and that every element
// is below MAX_NUM.
// a: array to verify
// nelts: number of elements in a; zero or negative means there is nothing to
//        check, and a is not read at all in that case
static void
check_int_array(int a[], int nelts)
{
    for (int i = 0; i < nelts; ++i) {
        assert(a[i] < MAX_NUM);
        // The first element has no predecessor to be ordered against.
        if (i > 0) {
            assert(a[i-1] <= a[i]);
        }
    }
}
// Sorting an empty array must report success.
// Both the array and the extra pointer are NULL here; int_cmp asserts that its
// extra pointer is non-NULL, so any comparator call would abort the test.
static void
zero_array_test(void)
{
    int r = mergesort_r(NULL, 0, sizeof(int), NULL, int_cmp);
    // The header documents 0 as the success value; do not let a failure pass silently.
    assert(r == 0);
}
// Sort an array that is already in ascending order (a[i] == i) and verify that
// the sort succeeds and the result is still ordered.
// nelts: number of elements to allocate and sort
static void
already_sorted_test(int nelts)
{
    int *MALLOC_N(nelts, a);
    // Fail loudly on allocation failure instead of writing through a NULL pointer.
    assert(a);
    for (int i = 0; i < nelts; ++i) {
        a[i] = i;
    }
    int r = mergesort_r(a, nelts, sizeof a[0], &MAGIC_EXTRA, int_cmp);
    // A failed sort leaves the array contents unspecified, so check before verifying order.
    assert(r == 0);
    check_int_array(a, nelts);
    toku_free(a);
}
// Sort an array of pseudo-random values in [0, MAX_NUM) and verify that the
// sort succeeds and the result is in non-decreasing order.
// nelts: number of elements to allocate and sort
static void
random_array_test(int nelts)
{
    int *MALLOC_N(nelts, a);
    // Fail loudly on allocation failure instead of writing through a NULL pointer.
    assert(a);
    for (int i = 0; i < nelts; ++i) {
        // Keep every value below MAX_NUM, which int_cmp and check_int_array assert.
        a[i] = rand() % MAX_NUM;
    }
    int r = mergesort_r(a, nelts, sizeof a[0], &MAGIC_EXTRA, int_cmp);
    // A failed sort leaves the array contents unspecified, so check before verifying order.
    assert(r == 0);
    check_int_array(a, nelts);
    toku_free(a);
}
// Test driver: runs the empty-array case, then the already-sorted cases, then
// the random cases, each over the same list of array sizes. Always returns 0;
// failures surface through the asserts in the individual tests.
int
test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__)))
{
    static const int sizes[] = { 10, 1000, 10001, 10000000 };
    const int nsizes = (int) (sizeof sizes / sizeof sizes[0]);

    zero_array_test();

    for (int i = 0; i < nsizes; ++i) {
        already_sorted_test(sizes[i]);
    }
    for (int i = 0; i < nsizes; ++i) {
        random_array_test(sizes[i]);
    }
    return 0;
}
...@@ -44,7 +44,7 @@ static void test_3748 (void) { ...@@ -44,7 +44,7 @@ static void test_3748 (void) {
if (startmsn.msn == ZERO_MSN.msn) if (startmsn.msn == ZERO_MSN.msn)
startmsn = msn; startmsn = msn;
r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids); assert(r == 0); r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, msn, xids, NULL); assert(r == 0);
xids_destroy(&xids); xids_destroy(&xids);
} }
for (int i=N/10; i<N; i++) { for (int i=N/10; i<N; i++) {
......
...@@ -62,13 +62,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -62,13 +62,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static void static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
unsigned int key = htonl(val); unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
// Create bad tree (don't do following): // Create bad tree (don't do following):
// node->max_msn_applied_to_node = msn; // node->max_msn_applied_to_node = msn;
...@@ -95,7 +95,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey, ...@@ -95,7 +95,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k); toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
} }
toku_unpin_brtnode(brt, child); toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]); insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
} }
*minkey = minkeys[0]; *minkey = minkeys[0];
*maxkey = maxkeys[0]; *maxkey = maxkeys[0];
......
...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static UU() void static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val); unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
} }
} }
......
...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static UU() void static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val); unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
} }
} }
......
...@@ -48,7 +48,7 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -48,7 +48,7 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static void static void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
int k = htonl(maxkey); int k = htonl(maxkey);
maxkey = htonl(k+1); maxkey = htonl(k+1);
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
...@@ -56,7 +56,7 @@ insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { ...@@ -56,7 +56,7 @@ insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
} }
} }
...@@ -80,7 +80,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey, ...@@ -80,7 +80,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k); toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
} }
toku_unpin_brtnode(brt, child); toku_unpin_brtnode(brt, child);
insert_into_child_buffer(node, childnum, minkeys[childnum], maxkeys[childnum]); insert_into_child_buffer(brt, node, childnum, minkeys[childnum], maxkeys[childnum]);
} }
*minkey = minkeys[0]; *minkey = minkeys[0];
*maxkey = maxkeys[0]; *maxkey = maxkeys[0];
......
...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) { ...@@ -47,13 +47,13 @@ populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
} }
static UU() void static UU() void
insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { insert_into_child_buffer(BRT brt, BRTNODE node, int childnum, int minkey, int maxkey) {
for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) { for (unsigned int val = htonl(minkey); val <= htonl(maxkey); val++) {
unsigned int key = htonl(val); unsigned int key = htonl(val);
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(brt, node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
} }
} }
......
...@@ -265,6 +265,7 @@ garbage_collection(ULE ule, OMT snapshot_xids, OMT live_list_reverse) { ...@@ -265,6 +265,7 @@ garbage_collection(ULE ule, OMT snapshot_xids, OMT live_list_reverse) {
done:; done:;
} }
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// This is the big enchilada. (Bring Tums.) Note that this level of abstraction // This is the big enchilada. (Bring Tums.) Note that this level of abstraction
// has no knowledge of the inner structure of either leafentry or msg. It makes // has no knowledge of the inner structure of either leafentry or msg. It makes
......
...@@ -43,6 +43,13 @@ TXNID uxr_get_txnid(UXRHANDLE uxr); ...@@ -43,6 +43,13 @@ TXNID uxr_get_txnid(UXRHANDLE uxr);
//1 does much slower debugging //1 does much slower debugging
#define GARBAGE_COLLECTION_DEBUG 0 #define GARBAGE_COLLECTION_DEBUG 0
void fast_msg_to_leafentry(
BRT_MSG msg, // message to apply to leafentry
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p) ;
int apply_msg_to_leafentry(BRT_MSG msg, int apply_msg_to_leafentry(BRT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data. LEAFENTRY old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize, size_t *new_leafentry_memorysize,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment