Commit dacd2abf authored by Zardosht Kasheff, committed by Yoni Fogel

[t:3724], [t:3717], [t:3727], merge fixes to main

git-svn-id: file:///svn/toku/tokudb@32829 c7de825b-a66e-492c-adef-691d508d4ae1
parent bacc1c7e
...@@ -136,10 +136,11 @@ struct brtnode_nonleaf_childinfo { ...@@ -136,10 +136,11 @@ struct brtnode_nonleaf_childinfo {
struct brtnode_leaf_basement_node { struct brtnode_leaf_basement_node {
uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade
BOOL soft_copy_is_up_to_date; // the data in the OMT reflects the softcopy state.
OMT buffer; OMT buffer;
unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */ unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */
unsigned int seqinsert; /* number of sequential inserts to this leaf */ unsigned int seqinsert; /* number of sequential inserts to this leaf */
MSN max_msn_applied;
DSN max_dsn_applied; // max deserialization sequence number applied
}; };
#define PT_INVALID 0 #define PT_INVALID 0
...@@ -204,8 +205,8 @@ struct __attribute__((__packed__)) brtnode_partition { ...@@ -204,8 +205,8 @@ struct __attribute__((__packed__)) brtnode_partition {
}; };
struct brtnode { struct brtnode {
MSN max_msn_applied_to_node_in_memory; // max msn that has been applied to this node (for root node, this is max msn for the tree) MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
MSN max_msn_applied_to_node_on_disk; // same as above, but for data on disk, only meaningful if node is clean DSN dsn; // deserialization sequence number
unsigned int nodesize; unsigned int nodesize;
unsigned int flags; unsigned int flags;
BLOCKNUM thisnodename; // Which block number is this node? BLOCKNUM thisnodename; // Which block number is this node?
...@@ -303,7 +304,8 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) { ...@@ -303,7 +304,8 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) {
// leaf node macros // leaf node macros
#define BLB_OPTIMIZEDFORUPGRADE(node,i) (BLB(node,i)->optimized_for_upgrade) #define BLB_OPTIMIZEDFORUPGRADE(node,i) (BLB(node,i)->optimized_for_upgrade)
#define BLB_SOFTCOPYISUPTODATE(node,i) (BLB(node,i)->soft_copy_is_up_to_date) #define BLB_MAX_MSN_APPLIED(node,i) (BLB(node,i)->max_msn_applied)
#define BLB_MAX_DSN_APPLIED(node,i) (BLB(node,i)->max_dsn_applied)
#define BLB_BUFFER(node,i) (BLB(node,i)->buffer) #define BLB_BUFFER(node,i) (BLB(node,i)->buffer)
#define BLB_NBYTESINBUF(node,i) (BLB(node,i)->n_bytes_in_buffer) #define BLB_NBYTESINBUF(node,i) (BLB(node,i)->n_bytes_in_buffer)
#define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert) #define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert)
...@@ -394,6 +396,8 @@ struct brt { ...@@ -394,6 +396,8 @@ struct brt {
int (*close_db)(DB*, u_int32_t); int (*close_db)(DB*, u_int32_t);
u_int32_t close_flags; u_int32_t close_flags;
DSN curr_dsn;
struct toku_list live_brt_link; struct toku_list live_brt_link;
struct toku_list zombie_brt_link; struct toku_list zombie_brt_link;
}; };
...@@ -521,6 +525,11 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children ...@@ -521,6 +525,11 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children
void toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num_children, void toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num_children,
int layout_version, unsigned int nodesize, unsigned int flags); int layout_version, unsigned int nodesize, unsigned int flags);
// Try to pin the node identified by (blocknum, fullhash) without reading it in:
// the definition calls toku_cachetable_get_and_pin_if_in_memory.
// On success (return value 0) the node is given a DSN if it has none, the
// ancestors' messages are offered to it via
// maybe_apply_ancestors_messages_to_node, and the node is stored in *node_p.
// On a nonzero return, *node_p is not written.
int toku_pin_brtnode_if_clean(
BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
ANCESTORS ancestors, struct pivot_bounds const * const bounds,
BRTNODE *node_p
);
int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
UNLOCKERS unlockers, UNLOCKERS unlockers,
ANCESTORS ancestors, struct pivot_bounds const * const pbounds, ANCESTORS ancestors, struct pivot_bounds const * const pbounds,
......
...@@ -384,7 +384,7 @@ serialize_brtnode_info( ...@@ -384,7 +384,7 @@ serialize_brtnode_info(
struct wbuf wb; struct wbuf wb;
wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size); wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
wbuf_MSN(&wb, node->max_msn_applied_to_node_in_memory); wbuf_MSN(&wb, node->max_msn_applied_to_node_on_disk);
wbuf_nocrc_uint(&wb, node->nodesize); wbuf_nocrc_uint(&wb, node->nodesize);
wbuf_nocrc_uint(&wb, node->flags); wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height); wbuf_nocrc_int (&wb, node->height);
...@@ -478,6 +478,7 @@ static void ...@@ -478,6 +478,7 @@ static void
rebalance_brtnode_leaf(BRTNODE node) rebalance_brtnode_leaf(BRTNODE node)
{ {
assert(node->height == 0); assert(node->height == 0);
assert(node->dirty);
// first create an array of OMTVALUE's that store all the data // first create an array of OMTVALUE's that store all the data
u_int32_t num_le = 0; u_int32_t num_le = 0;
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
...@@ -525,6 +526,16 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -525,6 +526,16 @@ rebalance_brtnode_leaf(BRTNODE node)
u_int32_t tmp_optimized_for_upgrade = BLB_OPTIMIZEDFORUPGRADE(node, node->n_children-1); u_int32_t tmp_optimized_for_upgrade = BLB_OPTIMIZEDFORUPGRADE(node, node->n_children-1);
u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1); u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1);
MSN max_msn = MIN_MSN;
DSN min_dsn = MAX_DSN;
for (int i = 0; i < node->n_children; i++) {
DSN curr_dsn = BLB_MAX_DSN_APPLIED(node,i);
MSN curr_msn = BLB_MAX_MSN_APPLIED(node,i);
min_dsn = (curr_dsn < min_dsn) ? curr_dsn : min_dsn;
max_msn = (curr_msn.msn > max_msn.msn) ? curr_msn : max_msn;
}
// Now destroy the old stuff; // Now destroy the old stuff;
toku_destroy_brtnode_internals(node); toku_destroy_brtnode_internals(node);
...@@ -537,7 +548,7 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -537,7 +548,7 @@ rebalance_brtnode_leaf(BRTNODE node)
node->n_children = num_children; node->n_children = num_children;
XMALLOC_N(num_children, node->bp); XMALLOC_N(num_children, node->bp);
for (int i = 0; i < num_children; i++) { for (int i = 0; i < num_children; i++) {
set_BLB(node, i, toku_create_empty_bn()); set_BLB(node, i, toku_create_empty_bn());
} }
// now we start to fill in the data // now we start to fill in the data
...@@ -582,7 +593,11 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -582,7 +593,11 @@ rebalance_brtnode_leaf(BRTNODE node)
BP_STATE(node,i) = PT_AVAIL; BP_STATE(node,i) = PT_AVAIL;
BP_TOUCH_CLOCK(node,i); BP_TOUCH_CLOCK(node,i);
BLB_MAX_DSN_APPLIED(node,i) = min_dsn;
BLB_MAX_MSN_APPLIED(node,i) = max_msn;
} }
node->max_msn_applied_to_node_on_disk = max_msn;
// now the subtree estimates // now the subtree estimates
toku_brt_leaf_reset_calc_leaf_stats(node); toku_brt_leaf_reset_calc_leaf_stats(node);
...@@ -723,7 +738,6 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h ...@@ -723,7 +738,6 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
//printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size); //printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size);
toku_free(compressed_buf); toku_free(compressed_buf);
node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_in_memory;
return 0; return 0;
} }
...@@ -799,7 +813,8 @@ BASEMENTNODE toku_create_empty_bn(void) { ...@@ -799,7 +813,8 @@ BASEMENTNODE toku_create_empty_bn(void) {
BASEMENTNODE toku_create_empty_bn_no_buffer(void) { BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
BASEMENTNODE XMALLOC(bn); BASEMENTNODE XMALLOC(bn);
bn->soft_copy_is_up_to_date = TRUE; bn->max_dsn_applied = 0;
bn->max_msn_applied.msn = 0;
bn->buffer = NULL; bn->buffer = NULL;
bn->n_bytes_in_buffer = 0; bn->n_bytes_in_buffer = 0;
bn->seqinsert = 0; bn->seqinsert = 0;
...@@ -924,8 +939,9 @@ deserialize_brtnode_info( ...@@ -924,8 +939,9 @@ deserialize_brtnode_info(
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size); rbuf_init(&rb, sb->uncompressed_ptr, data_size);
node->dsn = INVALID_DSN;
node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb); node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb);
node->max_msn_applied_to_node_in_memory = node->max_msn_applied_to_node_on_disk;
node->nodesize = rbuf_int(&rb); node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb); node->flags = rbuf_int(&rb);
node->height = rbuf_int(&rb); node->height = rbuf_int(&rb);
...@@ -988,6 +1004,8 @@ static void ...@@ -988,6 +1004,8 @@ static void
setup_available_brtnode_partition(BRTNODE node, int i) { setup_available_brtnode_partition(BRTNODE node, int i) {
if (node->height == 0) { if (node->height == 0) {
set_BLB(node, i, toku_create_empty_bn()); set_BLB(node, i, toku_create_empty_bn());
BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk;
BLB_MAX_DSN_APPLIED(node,i) = 0;
} }
else { else {
set_BNC(node, i, toku_create_empty_nl()); set_BNC(node, i, toku_create_empty_nl());
...@@ -1054,7 +1072,7 @@ deserialize_brtnode_partition( ...@@ -1054,7 +1072,7 @@ deserialize_brtnode_partition(
unsigned char ch = rbuf_char(&rb); unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_OMT_LEAVES); assert(ch == BRTNODE_PARTITION_OMT_LEAVES);
BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb); BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb);
BLB_SOFTCOPYISUPTODATE(node, index) = FALSE; // dont need to set max_dsn_applied because creation of basement node set it to correct value
BLB_SEQINSERT(node, index) = 0; BLB_SEQINSERT(node, index) = 0;
u_int32_t num_entries = rbuf_int(&rb); u_int32_t num_entries = rbuf_int(&rb);
OMTVALUE *XMALLOC_N(num_entries, array); OMTVALUE *XMALLOC_N(num_entries, array);
...@@ -1246,6 +1264,9 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode ...@@ -1246,6 +1264,9 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
read_and_decompress_sub_block(&rb, &curr_sb); read_and_decompress_sub_block(&rb, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition // at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, childnum); deserialize_brtnode_partition(&curr_sb, node, childnum);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
toku_free(curr_sb.uncompressed_ptr); toku_free(curr_sb.uncompressed_ptr);
toku_free(raw_block); toku_free(raw_block);
} }
...@@ -1269,6 +1290,9 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) { ...@@ -1269,6 +1290,9 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum) {
curr_sb->compressed_size curr_sb->compressed_size
); );
deserialize_brtnode_partition(curr_sb, node, childnum); deserialize_brtnode_partition(curr_sb, node, childnum);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
toku_free(curr_sb->uncompressed_ptr); toku_free(curr_sb->uncompressed_ptr);
toku_free(curr_sb->compressed_ptr); toku_free(curr_sb->compressed_ptr);
toku_free(curr_sb); toku_free(curr_sb);
......
...@@ -134,7 +134,7 @@ toku_verify_brtnode (BRT brt, ...@@ -134,7 +134,7 @@ toku_verify_brtnode (BRT brt,
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v); //printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
node = node_v; node = node_v;
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
thismsn = node->max_msn_applied_to_node_in_memory; thismsn = node->max_msn_applied_to_node_on_disk;
if (rootmsn.msn == ZERO_MSN.msn) { if (rootmsn.msn == ZERO_MSN.msn) {
assert(parentmsn.msn == ZERO_MSN.msn); assert(parentmsn.msn == ZERO_MSN.msn);
rootmsn = thismsn; rootmsn = thismsn;
......
...@@ -144,6 +144,15 @@ toku_assert_entire_node_in_memory(BRTNODE node) { ...@@ -144,6 +144,15 @@ toku_assert_entire_node_in_memory(BRTNODE node) {
} }
} }
//
// MUST be called with the ydb lock held
//
static void
set_new_DSN_for_node(BRTNODE node, BRT t) {
    // Stamp the node with the tree's current DSN and advance the tree's
    // counter in one step; the node receives the pre-increment value.
    node->dsn = t->curr_dsn++;
}
static u_int32_t static u_int32_t
get_leaf_num_entries(BRTNODE node) { get_leaf_num_entries(BRTNODE node) {
u_int32_t result = 0; u_int32_t result = 0;
...@@ -268,6 +277,30 @@ static void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTO ...@@ -268,6 +277,30 @@ static void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTO
static long brtnode_memory_size (BRTNODE node); static long brtnode_memory_size (BRTNODE node);
// Pin a node only if the cachetable already has it in memory.
//
// Returns whatever toku_cachetable_get_and_pin_if_in_memory returns.
// When that is 0:
//   - the node gets a fresh DSN if its dsn is still INVALID_DSN,
//   - maybe_apply_ancestors_messages_to_node is run against it,
//   - the pinned node is handed back through *node_p.
// When it is nonzero, *node_p is left untouched.
int toku_pin_brtnode_if_clean(
    BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
    ANCESTORS ancestors, struct pivot_bounds const * const bounds,
    BRTNODE *node_p
    )
{
    void *pinned_v;
    // This cachetable call does not bring anything in from disk; it only
    // succeeds for a node that is already in memory.
    int r = toku_cachetable_get_and_pin_if_in_memory(brt->cf, blocknum, fullhash, &pinned_v);
    if (r != 0) {
        return r;
    }

    BRTNODE pinned = pinned_v;
    if (pinned->dsn == INVALID_DSN) {
        set_new_DSN_for_node(pinned, brt);
    }
    maybe_apply_ancestors_messages_to_node(brt, pinned, ancestors, bounds);
    *node_p = pinned;
    return r;
}
int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
UNLOCKERS unlockers, UNLOCKERS unlockers,
ANCESTORS ancestors, struct pivot_bounds const * const bounds, ANCESTORS ancestors, struct pivot_bounds const * const bounds,
...@@ -290,6 +323,9 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, ...@@ -290,6 +323,9 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
unlockers); unlockers);
if (r==0) { if (r==0) {
BRTNODE node = node_v; BRTNODE node = node_v;
if (node->dsn == INVALID_DSN) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node; *node_p = node;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b); // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
...@@ -321,6 +357,9 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha ...@@ -321,6 +357,9 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
); );
assert(r==0); assert(r==0);
BRTNODE node = node_v; BRTNODE node = node_v;
if (node->dsn == INVALID_DSN) {
set_new_DSN_for_node(node, brt);
}
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
*node_p = node; *node_p = node;
} }
...@@ -360,17 +399,23 @@ calc_leaf_stats (OMT buffer) { ...@@ -360,17 +399,23 @@ calc_leaf_stats (OMT buffer) {
return e; return e;
} }
// Recompute the subtree estimates of one partition of a node from the
// contents of its basement-node buffer.
void
toku_brt_bn_reset_stats(BRTNODE node, int childnum)
{
    // The basement node may have been evicted; there is nothing to
    // recompute from unless the partition is fully in memory.
    // TODO: (Zardosht) for row cache, figure out a better way to do this
    if (BP_STATE(node,childnum) != PT_AVAIL) {
        return;
    }
    BP_SUBTREE_EST(node,childnum) = calc_leaf_stats(BLB_BUFFER(node, childnum));
}
void void
toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) { toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
invariant(node->height==0); invariant(node->height==0);
int i = 0; int i = 0;
for (i = 0; i < node->n_children; i++) { for (i = 0; i < node->n_children; i++) {
// basement node may be evicted, so only update stats if the basement node toku_brt_bn_reset_stats(node,i);
// is fully in memory
// TODO: (Zardosht) for row cache, figure out a better way to do this
if (BP_STATE(node,i) == PT_AVAIL) {
node->bp[i].subtree_estimates = calc_leaf_stats(BLB_BUFFER(node, i));
}
} }
} }
...@@ -890,7 +935,7 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num ...@@ -890,7 +935,7 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
assert(height >= 0); assert(height >= 0);
n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others
n->max_msn_applied_to_node_in_memory = MIN_MSN; // correct value for root node, harmless for others n->dsn = INVALID_DSN; // the owner of the node should take responsibility for properly setting this
n->nodesize = nodesize; n->nodesize = nodesize;
n->flags = flags; n->flags = flags;
n->thisnodename = nodename; n->thisnodename = nodename;
...@@ -951,11 +996,12 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r ...@@ -951,11 +996,12 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
fixup_child_estimates(newroot, 0, nodea, TRUE); fixup_child_estimates(newroot, 0, nodea, TRUE);
fixup_child_estimates(newroot, 1, nodeb, TRUE); fixup_child_estimates(newroot, 1, nodeb, TRUE);
{ {
MSN msna = nodea->max_msn_applied_to_node_in_memory; MSN msna = nodea->max_msn_applied_to_node_on_disk;
MSN msnb = nodeb->max_msn_applied_to_node_in_memory; MSN msnb = nodeb->max_msn_applied_to_node_on_disk;
invariant(msna.msn == msnb.msn); invariant(msna.msn == msnb.msn);
newroot->max_msn_applied_to_node_in_memory = msna; newroot->max_msn_applied_to_node_on_disk = msna;
} }
newroot->dsn = (nodea->dsn > nodeb->dsn) ? nodea->dsn : nodeb->dsn;
BP_STATE(newroot,0) = PT_AVAIL; BP_STATE(newroot,0) = PT_AVAIL;
BP_STATE(newroot,1) = PT_AVAIL; BP_STATE(newroot,1) = PT_AVAIL;
newroot->dirty = 1; newroot->dirty = 1;
...@@ -981,6 +1027,7 @@ toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) { ...@@ -981,6 +1027,7 @@ toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) {
BRTNODE XMALLOC(n); BRTNODE XMALLOC(n);
toku_initialize_empty_brtnode(n, name, height, n_children, t->h->layout_version, t->h->nodesize, t->flags); toku_initialize_empty_brtnode(n, name, height, n_children, t->h->layout_version, t->h->nodesize, t->flags);
assert(n->nodesize > 0); assert(n->nodesize > 0);
set_new_DSN_for_node(n, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, n->thisnodename); u_int32_t fullhash = toku_cachetable_hash(t->cf, n->thisnodename);
n->fullhash = fullhash; n->fullhash = fullhash;
...@@ -1128,12 +1175,13 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1128,12 +1175,13 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
// Effect: Split a leaf node. // Effect: Split a leaf node.
{ {
BRTNODE B; BRTNODE B;
DSN dsn = node->dsn;
//printf("%s:%d splitting leaf %" PRIu64 " which is size %u (targetsize = %u)\n", __FILE__, __LINE__, node->thisnodename.b, toku_serialize_brtnode_size(node), node->nodesize); //printf("%s:%d splitting leaf %" PRIu64 " which is size %u (targetsize = %u)\n", __FILE__, __LINE__, node->thisnodename.b, toku_serialize_brtnode_size(node), node->nodesize);
assert(node->height==0); assert(node->height==0);
assert(node->nodesize>0); assert(node->nodesize>0);
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_in_memory; MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
//printf("%s:%d B is at %lld nodesize=%d\n", __FILE__, __LINE__, B->thisnodename, B->nodesize); //printf("%s:%d B is at %lld nodesize=%d\n", __FILE__, __LINE__, B->thisnodename, B->nodesize);
...@@ -1215,6 +1263,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1215,6 +1263,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
); );
BLB_NBYTESINBUF(node, split_node) -= diff_size; BLB_NBYTESINBUF(node, split_node) -= diff_size;
BLB_NBYTESINBUF(B, 0) += diff_size; BLB_NBYTESINBUF(B, 0) += diff_size;
BLB_MAX_DSN_APPLIED(B,0) = BLB_MAX_DSN_APPLIED(node, split_node);
BLB_MAX_MSN_APPLIED(B,0) = BLB_MAX_MSN_APPLIED(node, split_node);
subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff); subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff);
add_estimates(&BP_SUBTREE_EST(B,0), &se_diff); add_estimates(&BP_SUBTREE_EST(B,0), &se_diff);
...@@ -1255,8 +1305,11 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1255,8 +1305,11 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
splitk->flags=0; splitk->flags=0;
} }
node->max_msn_applied_to_node_in_memory = max_msn_applied_to_node; node->max_msn_applied_to_node_on_disk= max_msn_applied_to_node;
B ->max_msn_applied_to_node_in_memory = max_msn_applied_to_node; B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1; node->dirty = 1;
B->dirty = 1; B->dirty = 1;
...@@ -1285,7 +1338,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1285,7 +1338,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
int old_n_children = node->n_children; int old_n_children = node->n_children;
int n_children_in_a = old_n_children/2; int n_children_in_a = old_n_children/2;
int n_children_in_b = old_n_children-n_children_in_a; int n_children_in_b = old_n_children-n_children_in_a;
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_in_memory; MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
DSN dsn = node->dsn;
BRTNODE B; BRTNODE B;
assert(node->height>0); assert(node->height>0);
assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
...@@ -1333,8 +1387,11 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1333,8 +1387,11 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
} }
node->max_msn_applied_to_node_in_memory = max_msn_applied_to_node; node->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
B ->max_msn_applied_to_node_in_memory = max_msn_applied_to_node; B ->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
node->dsn = dsn;
B->dsn = dsn;
node->dirty = 1; node->dirty = 1;
B ->dirty = 1; B ->dirty = 1;
...@@ -1750,6 +1807,13 @@ brt_leaf_put_cmd ( ...@@ -1750,6 +1807,13 @@ brt_leaf_put_cmd (
LEAFENTRY storeddata; LEAFENTRY storeddata;
OMTVALUE storeddatav=NULL; OMTVALUE storeddatav=NULL;
if (cmd->msn.msn <= bn->max_msn_applied.msn) {
// TODO3514 add accountability counter here
return;
}
else {
bn->max_msn_applied = cmd->msn;
}
u_int32_t omt_size; u_int32_t omt_size;
int r; int r;
...@@ -2119,8 +2183,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2119,8 +2183,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
// //
{ {
MSN cmd_msn = cmd->msn; MSN cmd_msn = cmd->msn;
invariant(cmd_msn.msn > node->max_msn_applied_to_node_in_memory.msn); invariant(cmd_msn.msn > node->max_msn_applied_to_node_on_disk.msn);
node->max_msn_applied_to_node_in_memory = cmd_msn; node->max_msn_applied_to_node_on_disk = cmd_msn;
//TODO: Accessing type directly //TODO: Accessing type directly
switch (cmd->type) { switch (cmd->type) {
...@@ -2320,19 +2384,21 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2320,19 +2384,21 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes. // splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes.
{ {
MSN msn_max; MSN msn_max;
DSN dsn_max;
assert(a->height == b->height); assert(a->height == b->height);
toku_assert_entire_node_in_memory(parent); toku_assert_entire_node_in_memory(parent);
toku_assert_entire_node_in_memory(a); toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b); toku_assert_entire_node_in_memory(b);
parent->dirty = 1; // just to make sure parent->dirty = 1; // just to make sure
{ {
MSN msna = a->max_msn_applied_to_node_in_memory; MSN msna = a->max_msn_applied_to_node_on_disk;
MSN msnb = b->max_msn_applied_to_node_in_memory; MSN msnb = b->max_msn_applied_to_node_on_disk;
msn_max = (msna.msn > msnb.msn) ? msna : msnb; msn_max = (msna.msn > msnb.msn) ? msna : msnb;
if (a->height > 0) { if (a->height > 0) {
invariant(msn_max.msn <= parent->max_msn_applied_to_node_in_memory.msn); // parent msn must be >= children's msn invariant(msn_max.msn <= parent->max_msn_applied_to_node_on_disk.msn); // parent msn must be >= children's msn
} }
} }
dsn_max = (a->dsn > b->dsn) ? a->dsn : b->dsn;
if (a->height == 0) { if (a->height == 0) {
maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk); maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk);
} else { } else {
...@@ -2341,8 +2407,10 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2341,8 +2407,10 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
if (*did_merge || *did_rebalance) { if (*did_merge || *did_rebalance) {
// accurate for leaf nodes because all msgs above have been applied, // accurate for leaf nodes because all msgs above have been applied,
// accurate for non-leaf nodes because buffer immediately above each node has been flushed // accurate for non-leaf nodes because buffer immediately above each node has been flushed
a->max_msn_applied_to_node_in_memory = msn_max; a->max_msn_applied_to_node_on_disk = msn_max;
b->max_msn_applied_to_node_in_memory = msn_max; b->max_msn_applied_to_node_on_disk = msn_max;
a->dsn = dsn_max;
b->dsn = dsn_max;
} }
} }
...@@ -2546,7 +2614,7 @@ static void assert_leaf_up_to_date(BRTNODE node) { ...@@ -2546,7 +2614,7 @@ static void assert_leaf_up_to_date(BRTNODE node) {
assert(node->height == 0); assert(node->height == 0);
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
for (int i=0; i < node->n_children; i++) { for (int i=0; i < node->n_children; i++) {
assert(BLB_SOFTCOPYISUPTODATE(node, i)); assert(BLB_MAX_DSN_APPLIED(node, i) >= MIN_DSN);
} }
} }
...@@ -2763,13 +2831,6 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2763,13 +2831,6 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd)
void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, uint64_t *workdone) { void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, uint64_t *workdone) {
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
// ignore messages that have already been applied to this leaf // ignore messages that have already been applied to this leaf
if (cmd->msn.msn <= node->max_msn_applied_to_node_in_memory.msn) {
// TODO3514 add accountability counter here
return;
}
else {
node->max_msn_applied_to_node_in_memory = cmd->msn;
}
if (brt_msg_applies_once(cmd)) { if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
...@@ -2834,7 +2895,10 @@ static void push_something_at_root (BRT brt, BRTNODE *nodep, BRT_MSG cmd) ...@@ -2834,7 +2895,10 @@ static void push_something_at_root (BRT brt, BRTNODE *nodep, BRT_MSG cmd)
uint64_t workdone_ignore = 0; // ignore workdone for root-leaf node uint64_t workdone_ignore = 0; // ignore workdone for root-leaf node
// not up to date, which means the get_and_pin actually fetched it into memory. // not up to date, which means the get_and_pin actually fetched it into memory.
toku_apply_cmd_to_leaf(brt, node, cmd, &made_dirty, &workdone_ignore); toku_apply_cmd_to_leaf(brt, node, cmd, &made_dirty, &workdone_ignore);
if (made_dirty) node->dirty = 1; node->dirty = 1;
MSN cmd_msn = cmd->msn;
invariant(cmd_msn.msn > node->max_msn_applied_to_node_on_disk.msn);
node->max_msn_applied_to_node_on_disk = cmd_msn;
} else { } else {
brtnode_nonleaf_put_cmd_at_root(brt, node, cmd); brtnode_nonleaf_put_cmd_at_root(brt, node, cmd);
//if (should_split) printf("%s:%d Pushed something simple, should_split=1\n", __FILE__, __LINE__); //if (should_split) printf("%s:%d Pushed something simple, should_split=1\n", __FILE__, __LINE__);
...@@ -2876,6 +2940,8 @@ static void apply_cmd_to_in_memory_non_root_leaves ( ...@@ -2876,6 +2940,8 @@ static void apply_cmd_to_in_memory_non_root_leaves (
BRT_MSG cmd, BRT_MSG cmd,
BRTNODE parent, BRTNODE parent,
int parents_childnum, int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone uint64_t * workdone
); );
...@@ -2885,22 +2951,28 @@ static void apply_cmd_to_in_memory_non_root_leaves_starting_at_node (BRT t, ...@@ -2885,22 +2951,28 @@ static void apply_cmd_to_in_memory_non_root_leaves_starting_at_node (BRT t,
BOOL is_root, BOOL is_root,
BRTNODE parent, BRTNODE parent,
int parents_childnum, int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone) { uint64_t * workdone) {
// internal node // internal node
if (node->height>0) { if (node->height>0) {
if (brt_msg_applies_once(cmd)) { if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
if (is_root) // record workdone in root only, if not root then this is a recursive call so just pass along pointer if (is_root) // record workdone in root only, if not root then this is a recursive call so just pass along pointer
workdone = &(BP_WORKDONE(node,childnum)); workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, workdone); apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone);
} }
else if (brt_msg_applies_all(cmd)) { else if (brt_msg_applies_all(cmd)) {
for (int childnum=0; childnum<node->n_children; childnum++) { for (int childnum=0; childnum<node->n_children; childnum++) {
struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
if (is_root) if (is_root)
workdone = &(BP_WORKDONE(node,childnum)); workdone = &(BP_WORKDONE(node,childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, workdone); apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, node, childnum, &next_ancestors, &next_bounds, workdone);
} }
} }
} }
...@@ -2926,15 +2998,24 @@ static void apply_cmd_to_in_memory_non_root_leaves ( ...@@ -2926,15 +2998,24 @@ static void apply_cmd_to_in_memory_non_root_leaves (
BRT_MSG cmd, BRT_MSG cmd,
BRTNODE parent, BRTNODE parent,
int parents_childnum, int parents_childnum,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds,
uint64_t * workdone uint64_t * workdone
) )
{ {
void *node_v; BRTNODE node = NULL;
int r = toku_cachetable_get_and_pin_if_in_memory(t->cf, nodenum, fullhash, &node_v); // this one doesn't need to use the toku_pin_brtnode function because it doesn't bring anything in, so it cannot create a non-up-to-date leaf node. int r = toku_pin_brtnode_if_clean(
t,
nodenum,
fullhash,
ancestors,
bounds,
&node
);
if (r) { goto exit; } if (r) { goto exit; }
BRTNODE node = node_v; apply_cmd_to_in_memory_non_root_leaves_starting_at_node(t, node, cmd, FALSE, parent, parents_childnum, ancestors, bounds, workdone);
apply_cmd_to_in_memory_non_root_leaves_starting_at_node(t, node, cmd, FALSE, parent, parents_childnum, workdone);
toku_unpin_brtnode(t, node); toku_unpin_brtnode(t, node);
exit: exit:
...@@ -2968,7 +3049,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -2968,7 +3049,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
fill_bfe_for_full_read(&bfe, brt->h); fill_bfe_for_full_read(&bfe, brt->h);
toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &bfe, &node); toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &bfe, &node);
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
cmd->msn.msn = node->max_msn_applied_to_node_in_memory.msn + 1; cmd->msn.msn = node->max_msn_applied_to_node_on_disk.msn + 1;
// Note, the lower level function that filters messages based on msn, // Note, the lower level function that filters messages based on msn,
// (brt_leaf_put_cmd() or brt_nonleaf_put_cmd()) will capture the msn and // (brt_leaf_put_cmd() or brt_nonleaf_put_cmd()) will capture the msn and
// store it in the relevant node, including the root node. This is how the // store it in the relevant node, including the root node. This is how the
...@@ -2980,10 +3061,9 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -2980,10 +3061,9 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
push_something_at_root(brt, &node, cmd); push_something_at_root(brt, &node, cmd);
// verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock) // verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock)
invariant(cmd->msn.msn == node->max_msn_applied_to_node_in_memory.msn); invariant(cmd->msn.msn == node->max_msn_applied_to_node_on_disk.msn);
if (node->height > 0) { if (node->height > 0) {
apply_cmd_to_in_memory_non_root_leaves_starting_at_node(brt, node, cmd, TRUE, NULL, -1, NULL); apply_cmd_to_in_memory_non_root_leaves_starting_at_node(brt, node, cmd, TRUE, NULL, -1, (ANCESTORS)NULL, &infinite_bounds, NULL);
if (nonleaf_node_is_gorged(node)) { if (nonleaf_node_is_gorged(node)) {
// No need for a loop here. We only inserted one message, so flushing a single child suffices. // No need for a loop here. We only inserted one message, so flushing a single child suffices.
flush_some_child(brt, node, TRUE, TRUE, flush_some_child(brt, node, TRUE, TRUE,
...@@ -3426,6 +3506,7 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) { ...@@ -3426,6 +3506,7 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) {
BRTNODE XMALLOC(node); BRTNODE XMALLOC(node);
toku_initialize_empty_brtnode(node, blocknum, 0, 1, t->h->layout_version, t->h->nodesize, t->flags); toku_initialize_empty_brtnode(node, blocknum, 0, 1, t->h->layout_version, t->h->nodesize, t->flags);
BP_STATE(node,0) = PT_AVAIL; BP_STATE(node,0) = PT_AVAIL;
set_new_DSN_for_node(node, t);
u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum);
node->fullhash = fullhash; node->fullhash = fullhash;
...@@ -4569,6 +4650,7 @@ int toku_brt_create(BRT *brt_ptr) { ...@@ -4569,6 +4650,7 @@ int toku_brt_create(BRT *brt_ptr) {
brt->nodesize = BRT_DEFAULT_NODE_SIZE; brt->nodesize = BRT_DEFAULT_NODE_SIZE;
brt->compare_fun = toku_builtin_compare_fun; brt->compare_fun = toku_builtin_compare_fun;
brt->update_fun = NULL; brt->update_fun = NULL;
brt->curr_dsn = 1; // start at 1, as 0 is reserved for basement nodes
int r = toku_omt_create(&brt->txns); int r = toku_omt_create(&brt->txns);
if (r!=0) { toku_free(brt); return r; } if (r!=0) { toku_free(brt); return r; }
*brt_ptr = brt; *brt_ptr = brt;
...@@ -4912,7 +4994,6 @@ apply_buffer_messages_to_basement_node ( ...@@ -4912,7 +4994,6 @@ apply_buffer_messages_to_basement_node (
SUBTREE_EST se, SUBTREE_EST se,
BRTNODE ancestor, BRTNODE ancestor,
int childnum, int childnum,
MSN min_applied_msn,
struct pivot_bounds const * const bounds struct pivot_bounds const * const bounds
) )
// Effect: For each message in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node. // Effect: For each message in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node.
...@@ -4946,7 +5027,7 @@ apply_buffer_messages_to_basement_node ( ...@@ -4946,7 +5027,7 @@ apply_buffer_messages_to_basement_node (
({ ({
DBT hk; DBT hk;
toku_fill_dbt(&hk, key, keylen); toku_fill_dbt(&hk, key, keylen);
if (msn.msn > min_applied_msn.msn && (!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) { if ((!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) {
DBT hv; DBT hv;
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk, BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk,
toku_fill_dbt(&hv, val, vallen)} }; toku_fill_dbt(&hv, val, vallen)} };
...@@ -5086,6 +5167,24 @@ apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SU ...@@ -5086,6 +5167,24 @@ apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SU
} }
*/ */
// Effect: decide whether partition CHILDNUM of NODE needs ancestor messages applied.
//  Returns FALSE immediately when the partition's state is not PT_AVAIL.
//  Otherwise walks the ANCESTORS list via ->next and returns TRUE as soon as one
//  ancestor's dsn is greater than BLB_MAX_DSN_APPLIED(node, childnum).
//  Returns FALSE when the list is exhausted (or empty) without finding such an ancestor.
static BOOL
partition_requires_msg_application(BRTNODE node, int childnum, ANCESTORS ancestors) {
    if (BP_STATE(node,childnum) != PT_AVAIL) {
        return FALSE;
    }
    ANCESTORS walker = ancestors;
    while (walker) {
        // an ancestor with a larger dsn than the one recorded for this partition was found
        if (walker->node->dsn > BLB_MAX_DSN_APPLIED(node,childnum)) {
            return TRUE;
        }
        walker = walker->next;
    }
    return FALSE;
}
static void static void
maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds) maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Effect: // Effect:
...@@ -5101,7 +5200,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5101,7 +5200,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// need to apply messages to each basement node // need to apply messages to each basement node
// TODO: (Zardosht) cilkify this, watch out for setting of max_msn_applied_to_node // TODO: (Zardosht) cilkify this, watch out for setting of max_msn_applied_to_node
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
if (BP_STATE(node,i) != PT_AVAIL || BLB_SOFTCOPYISUPTODATE(node, i)) { BOOL requires_msg_application = partition_requires_msg_application(
node,
i,
ancestors
);
if (!requires_msg_application) {
continue; continue;
} }
update_stats = TRUE; update_stats = TRUE;
...@@ -5118,15 +5223,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5118,15 +5223,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
curr_se, curr_se,
curr_ancestors->node, curr_ancestors->node,
curr_ancestors->childnum, curr_ancestors->childnum,
node->max_msn_applied_to_node_on_disk,
&curr_bounds &curr_bounds
); );
if (curr_ancestors->node->max_msn_applied_to_node_in_memory.msn > node->max_msn_applied_to_node_in_memory.msn) { curr_bn->max_dsn_applied = (curr_ancestors->node->dsn > curr_bn->max_dsn_applied)
node->max_msn_applied_to_node_in_memory = curr_ancestors->node->max_msn_applied_to_node_in_memory; ? curr_ancestors->node->dsn
} : curr_bn->max_dsn_applied;
curr_ancestors= curr_ancestors->next; curr_ancestors= curr_ancestors->next;
} }
BLB_SOFTCOPYISUPTODATE(node, i) = TRUE;
} }
// Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are // Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are
...@@ -5161,7 +5264,7 @@ brt_search_basement_node( ...@@ -5161,7 +5264,7 @@ brt_search_basement_node(
BRT_CURSOR brtcursor BRT_CURSOR brtcursor
) )
{ {
assert(bn->soft_copy_is_up_to_date); assert(bn->max_dsn_applied >= MIN_DSN);
// Now we have to convert from brt_search_t to the heaviside function with a direction. What a pain... // Now we have to convert from brt_search_t to the heaviside function with a direction. What a pain...
......
...@@ -292,6 +292,7 @@ BOOL toku_brt_is_empty_fast (BRT brt) __attribute__ ((warn_unused_result)); ...@@ -292,6 +292,7 @@ BOOL toku_brt_is_empty_fast (BRT brt) __attribute__ ((warn_unused_result));
BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_result)); BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_result));
void toku_brt_bn_reset_stats(BRTNODE node, int childnum);
void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node); void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node);
int toku_brt_strerror_r(int error, char *buf, size_t buflen); int toku_brt_strerror_r(int error, char *buf, size_t buflen);
......
...@@ -57,9 +57,14 @@ typedef struct __toku_lsn { u_int64_t lsn; } LSN; ...@@ -57,9 +57,14 @@ typedef struct __toku_lsn { u_int64_t lsn; } LSN;
* Make the MSN be a struct instead of an integer so that we get better type checking. */ * Make the MSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_msn { u_int64_t msn; } MSN; typedef struct __toku_msn { u_int64_t msn; } MSN;
#define ZERO_MSN ((MSN){0}) // dummy used for message construction, to be filled in when msg is applied to tree #define ZERO_MSN ((MSN){0}) // dummy used for message construction, to be filled in when msg is applied to tree
#define MIN_MSN ((MSN){(u_int64_t)1<<32}) // first 2**32 values reserved for messages created before Dr. No (for upgrade) #define MIN_MSN ((MSN){(u_int64_t)1000*1000*1000}) // first 1B values reserved for messages created before Dr. No (for upgrade)
#define MAX_MSN ((MSN){UINT64_MAX}) #define MAX_MSN ((MSN){UINT64_MAX})
/* DSN: DESERIALIZATION sequence number, a signed 64-bit integer. */
typedef int64_t DSN;
/* Sentinel below MIN_DSN. Parenthesized so the negative literal expands safely inside any expression. */
#define INVALID_DSN (-1)
/* Smallest DSN value. */
#define MIN_DSN 0
/* Largest DSN value. */
#define MAX_DSN INT64_MAX
/* At the brt layer, a FILENUM uniquely identifies an open file. /* At the brt layer, a FILENUM uniquely identifies an open file.
* At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary. * At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary.
* With the introduction of the loader (ticket 2216), it is possible for the file that holds * With the introduction of the loader (ticket 2216), it is possible for the file that holds
......
...@@ -146,7 +146,6 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { ...@@ -146,7 +146,6 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) {
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -154,6 +153,7 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { ...@@ -154,6 +153,7 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = nrows; sn.n_children = nrows;
sn.dirty = 1;
LEAFENTRY les[nrows]; LEAFENTRY les[nrows];
{ {
...@@ -267,7 +267,6 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { ...@@ -267,7 +267,6 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) {
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -275,6 +274,7 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { ...@@ -275,6 +274,7 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 1; sn.n_children = 1;
sn.dirty = 1;
LEAFENTRY les[nrows]; LEAFENTRY les[nrows];
{ {
...@@ -382,7 +382,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { ...@@ -382,7 +382,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -390,6 +389,7 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { ...@@ -390,6 +389,7 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 1; sn.n_children = 1;
sn.dirty = 1;
LEAFENTRY les[7]; LEAFENTRY les[7];
{ {
...@@ -503,7 +503,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { ...@@ -503,7 +503,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) {
int r; int r;
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -511,6 +510,7 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { ...@@ -511,6 +510,7 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 7; sn.n_children = 7;
sn.dirty = 1;
LEAFENTRY elts[3]; LEAFENTRY elts[3];
elts[0] = le_malloc("a", "aval"); elts[0] = le_malloc("a", "aval");
elts[1] = le_malloc("b", "bval"); elts[1] = le_malloc("b", "bval");
...@@ -628,7 +628,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type ...@@ -628,7 +628,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
int r; int r;
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -636,6 +635,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type ...@@ -636,6 +635,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 4; sn.n_children = 4;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("A", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("A", 2, 0, 0);
...@@ -737,7 +737,6 @@ test_serialize_leaf(enum brtnode_verify_type bft) { ...@@ -737,7 +737,6 @@ test_serialize_leaf(enum brtnode_verify_type bft) {
int r; int r;
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -745,6 +744,7 @@ test_serialize_leaf(enum brtnode_verify_type bft) { ...@@ -745,6 +744,7 @@ test_serialize_leaf(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 2; sn.n_children = 2;
sn.dirty = 1;
LEAFENTRY elts[3]; LEAFENTRY elts[3];
elts[0] = le_malloc("a", "aval"); elts[0] = le_malloc("a", "aval");
elts[1] = le_malloc("b", "bval"); elts[1] = le_malloc("b", "bval");
...@@ -862,10 +862,9 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { ...@@ -862,10 +862,9 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
// source_brt.fd=fd; // source_brt.fd=fd;
sn.max_msn_applied_to_node_on_disk.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
char *hello_string; char *hello_string;
sn.max_msn_applied_to_node_on_disk.msn = TESTMSNDSKVAL; sn.max_msn_applied_to_node_on_disk.msn = TESTMSNDSKVAL;
sn.max_msn_applied_to_node_in_memory.msn = TESTMSNMEMVAL; //sn.max_msn_applied_to_node_in_memory.msn = TESTMSNMEMVAL;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -873,6 +872,7 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { ...@@ -873,6 +872,7 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 1; sn.height = 1;
sn.n_children = 2; sn.n_children = 2;
sn.dirty = 1;
hello_string = toku_strdup("hello"); hello_string = toku_strdup("hello");
MALLOC_N(2, sn.bp); MALLOC_N(2, sn.bp);
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
...@@ -938,14 +938,14 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { ...@@ -938,14 +938,14 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE);
assert(r==0); assert(r==0);
assert(sn.max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL); //assert(sn.max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL);
assert(sn.max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL); //assert(sn.max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL);
setup_dn(bft, fd, brt_h, &dn); setup_dn(bft, fd, brt_h, &dn);
assert(dn->thisnodename.b==20); assert(dn->thisnodename.b==20);
assert(dn->max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL); //assert(dn->max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL);
assert(dn->max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL); //assert(dn->max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL);
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION); assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment