Commit 1164029f authored by Yoni Fogel's avatar Yoni Fogel

fixes #6086 Merge 6086 to main. We now read in basement nodes if a full...

fixes #6086 Merge 6086 to main.  We now read in a basement node when an entire key range (both keys) resides in it, so that keysrange queries can be answered more accurately.


git-svn-id: file:///svn/toku/tokudb@54342 c7de825b-a66e-492c-adef-691d508d4ae1
parent 76fe5767
...@@ -420,7 +420,9 @@ static void print_db_struct (void) { ...@@ -420,7 +420,9 @@ static void print_db_struct (void) {
STRUCT_SETUP(DB, set_pagesize, "int (*%s) (DB *, uint32_t)"); STRUCT_SETUP(DB, set_pagesize, "int (*%s) (DB *, uint32_t)");
STRUCT_SETUP(DB, stat, "int (*%s) (DB *, void *, uint32_t)"); STRUCT_SETUP(DB, stat, "int (*%s) (DB *, void *, uint32_t)");
STRUCT_SETUP(DB, verify, "int (*%s) (DB *, const char *, const char *, FILE *, uint32_t)"); STRUCT_SETUP(DB, verify, "int (*%s) (DB *, const char *, const char *, FILE *, uint32_t)");
const char *extra[]={"int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)", const char *extra[]={
"int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)",
"int (*keys_range64)(DB*, DB_TXN *, DBT *keyleft, DBT *keyright, uint64_t *less, uint64_t *left, uint64_t *between, uint64_t *right, uint64_t *greater, bool *middle_3_exact)",
"int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)", "int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)",
"int (*pre_acquire_table_lock)(DB*, DB_TXN*)", "int (*pre_acquire_table_lock)(DB*, DB_TXN*)",
"int (*pre_acquire_fileops_lock)(DB*, DB_TXN*)", "int (*pre_acquire_fileops_lock)(DB*, DB_TXN*)",
......
...@@ -51,9 +51,24 @@ enum ftnode_fetch_type { ...@@ -51,9 +51,24 @@ enum ftnode_fetch_type {
ftnode_fetch_none=1, // no partitions needed. ftnode_fetch_none=1, // no partitions needed.
ftnode_fetch_subset, // some subset of partitions needed ftnode_fetch_subset, // some subset of partitions needed
ftnode_fetch_prefetch, // this is part of a prefetch call ftnode_fetch_prefetch, // this is part of a prefetch call
ftnode_fetch_all // every partition is needed ftnode_fetch_all, // every partition is needed
ftnode_fetch_keymatch, // one child is needed if it holds both keys
}; };
// Returns true iff `type` is one of the five known fetch types
// (none, subset, prefetch, all, keymatch); any other value yields false.
// NOTE(review): UU() presumably marks the function as possibly-unused so that
// translation units which never call it do not warn -- confirm against the macro.
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU();
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) {
    return type == ftnode_fetch_none
        || type == ftnode_fetch_subset
        || type == ftnode_fetch_prefetch
        || type == ftnode_fetch_all
        || type == ftnode_fetch_keymatch;
}
// //
// An extra parameter passed to cachetable functions // An extra parameter passed to cachetable functions
// That is used in all types of fetch callbacks. // That is used in all types of fetch callbacks.
...@@ -730,6 +745,46 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) ...@@ -730,6 +745,46 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
bfe->decompress_time = 0; bfe->decompress_time = 0;
} }
//
// Helper function to fill a ftnode_fetch_extra for a "keymatch" fetch:
// the fetch callback is told about a left and a right key, and (per the
// ftnode_fetch_keymatch enumerator) one child is needed only if it holds
// both keys.  Used e.g. for keysrange when the left and right key
// are in the same basement node.
//
//  left  == nullptr  is recorded as negative infinity
//  right == nullptr  is recorded as positive infinity
//
// The keys are stored by reference (toku_copyref_dbt), so presumably the
// caller's DBTs must outlive the bfe -- confirm against toku_copyref_dbt.
//
static inline void fill_bfe_for_keymatch(
    struct ftnode_fetch_extra *bfe,
    FT h,
    DBT *left,
    DBT *right,
    bool disable_prefetching,
    bool read_all_partitions
    )
{
    paranoid_invariant(h->h->type == FT_CURRENT);

    // What kind of fetch this is, and on which tree.
    bfe->type = ftnode_fetch_keymatch;
    bfe->h = h;
    bfe->search = nullptr;

    // Left bound: start from an initialized DBT, then reference the caller's key if any.
    const bool has_left = (left != nullptr);
    toku_init_dbt(&bfe->range_lock_left_key);
    if (has_left) {
        toku_copyref_dbt(&bfe->range_lock_left_key, *left);
    }
    bfe->left_is_neg_infty = !has_left;

    // Right bound: same treatment.
    const bool has_right = (right != nullptr);
    toku_init_dbt(&bfe->range_lock_right_key);
    if (has_right) {
        toku_copyref_dbt(&bfe->range_lock_right_key, *right);
    }
    bfe->right_is_pos_infty = !has_right;

    // No child chosen yet.
    bfe->child_to_read = -1;

    // Caller-supplied policy flags.
    bfe->disable_prefetching = disable_prefetching;
    bfe->read_all_partitions = read_all_partitions;

    // Reset the I/O accounting counters.
    bfe->bytes_read = 0;
    bfe->io_time = 0;
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
}
// //
// Helper function to fill a ftnode_fetch_extra with data // Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that some subset of the node // that will tell the fetch callback that some subset of the node
......
...@@ -596,20 +596,15 @@ next_dict_id(void) { ...@@ -596,20 +596,15 @@ next_dict_id(void) {
bool bool
toku_bfe_wants_child_available (struct ftnode_fetch_extra* bfe, int childnum) toku_bfe_wants_child_available (struct ftnode_fetch_extra* bfe, int childnum)
{ {
if (bfe->type == ftnode_fetch_all || return bfe->type == ftnode_fetch_all ||
(bfe->type == ftnode_fetch_subset && bfe->child_to_read == childnum)) (bfe->child_to_read == childnum &&
{ (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_keymatch));
return true;
}
else {
return false;
}
} }
int int
toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node) toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
{ {
paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch || bfe->type == ftnode_fetch_keymatch);
if (bfe->left_is_neg_infty) { if (bfe->left_is_neg_infty) {
return 0; return 0;
} else if (bfe->range_lock_left_key.data == nullptr) { } else if (bfe->range_lock_left_key.data == nullptr) {
...@@ -622,7 +617,7 @@ toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node) ...@@ -622,7 +617,7 @@ toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
int int
toku_bfe_rightmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node) toku_bfe_rightmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
{ {
paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch || bfe->type == ftnode_fetch_keymatch);
if (bfe->right_is_pos_infty) { if (bfe->right_is_pos_infty) {
return node->n_children - 1; return node->n_children - 1;
} else if (bfe->range_lock_right_key.data == nullptr) { } else if (bfe->range_lock_right_key.data == nullptr) {
...@@ -875,7 +870,7 @@ toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe) ...@@ -875,7 +870,7 @@ toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe)
STATUS_INC(FT_NUM_PIVOTS_FETCHED_WRITE, 1); STATUS_INC(FT_NUM_PIVOTS_FETCHED_WRITE, 1);
STATUS_INC(FT_BYTES_PIVOTS_FETCHED_WRITE, bfe->bytes_read); STATUS_INC(FT_BYTES_PIVOTS_FETCHED_WRITE, bfe->bytes_read);
STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_WRITE, bfe->io_time); STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_WRITE, bfe->io_time);
} else if (bfe->type == ftnode_fetch_subset) { } else if (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_keymatch) {
STATUS_INC(FT_NUM_PIVOTS_FETCHED_QUERY, 1); STATUS_INC(FT_NUM_PIVOTS_FETCHED_QUERY, 1);
STATUS_INC(FT_BYTES_PIVOTS_FETCHED_QUERY, bfe->bytes_read); STATUS_INC(FT_BYTES_PIVOTS_FETCHED_QUERY, bfe->bytes_read);
STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_QUERY, bfe->io_time); STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_QUERY, bfe->io_time);
...@@ -1146,8 +1141,24 @@ bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs) { ...@@ -1146,8 +1141,24 @@ bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs) {
retval = true; retval = true;
} }
} }
} else if (bfe->type == ftnode_fetch_keymatch) {
// we do not take into account prefetching yet
// as of now, if we need a subset, the only thing
// we can possibly require is a single basement node
// we find out what basement node the query cares about
// and check if it is available
paranoid_invariant(bfe->h->compare_fun);
if (node->height == 0) {
int left_child = toku_bfe_leftmost_child_wanted(bfe, node);
int right_child = toku_bfe_rightmost_child_wanted(bfe, node);
if (left_child == right_child) {
bfe->child_to_read = left_child;
unsafe_touch_clock(node,bfe->child_to_read);
// child we want to read is not available, must set retval to true
retval = (BP_STATE(node, bfe->child_to_read) != PT_AVAIL);
} }
else { }
} else {
// we have a bug. The type should be known // we have a bug. The type should be known
abort(); abort();
} }
...@@ -1264,7 +1275,7 @@ int toku_ftnode_pf_callback(void* ftnode_pv, void* disk_data, void* read_extraar ...@@ -1264,7 +1275,7 @@ int toku_ftnode_pf_callback(void* ftnode_pv, void* disk_data, void* read_extraar
struct ftnode_fetch_extra *bfe = (struct ftnode_fetch_extra *) read_extraargs; struct ftnode_fetch_extra *bfe = (struct ftnode_fetch_extra *) read_extraargs;
// there must be a reason this is being called. If we get a garbage type or the type is ftnode_fetch_none, // there must be a reason this is being called. If we get a garbage type or the type is ftnode_fetch_none,
// then something went wrong // then something went wrong
assert((bfe->type == ftnode_fetch_subset) || (bfe->type == ftnode_fetch_all) || (bfe->type == ftnode_fetch_prefetch)); assert((bfe->type == ftnode_fetch_subset) || (bfe->type == ftnode_fetch_all) || (bfe->type == ftnode_fetch_prefetch) || (bfe->type == ftnode_fetch_keymatch));
// determine the range to prefetch // determine the range to prefetch
int lc, rc; int lc, rc;
if (!bfe->disable_prefetching && if (!bfe->disable_prefetching &&
...@@ -5583,63 +5594,99 @@ keyrange_compare (OMTVALUE lev, void *extra) { ...@@ -5583,63 +5594,99 @@ keyrange_compare (OMTVALUE lev, void *extra) {
} }
static void static void
keyrange_in_leaf_partition (FT_HANDLE brt, FTNODE node, DBT *key, int child_number, uint64_t estimated_num_rows, keysrange_in_leaf_partition (FT_HANDLE brt, FTNODE node,
uint64_t *less, uint64_t *equal, uint64_t *greater) DBT* key_left, DBT* key_right,
int left_child_number, int right_child_number, uint64_t estimated_num_rows,
uint64_t *less, uint64_t* equal_left, uint64_t* middle,
uint64_t* equal_right, uint64_t* greater, bool* single_basement_node)
// If the partition is in main memory then estimate the number // If the partition is in main memory then estimate the number
// If KEY==NULL then use an arbitrary key (leftmost or zero) // Treat key_left == NULL as negative infinity
// Treat key_right == NULL as positive infinity
{ {
paranoid_invariant(node->height == 0); // we are in a leaf paranoid_invariant(node->height == 0); // we are in a leaf
if (BP_STATE(node, child_number) == PT_AVAIL) { paranoid_invariant(!(key_left == NULL && key_right != NULL));
// If the partition is in main memory then get an exact count. paranoid_invariant(left_child_number <= right_child_number);
struct keyrange_compare_s s = {brt,key}; bool single_basement = left_child_number == right_child_number;
BASEMENTNODE bn = BLB(node, child_number); paranoid_invariant(!single_basement || (BP_STATE(node, left_child_number) == PT_AVAIL));
if (BP_STATE(node, left_child_number) == PT_AVAIL) {
int r;
// The partition is in main memory then get an exact count.
struct keyrange_compare_s s_left = {brt, key_left};
BASEMENTNODE bn = BLB(node, left_child_number);
OMTVALUE datav; OMTVALUE datav;
uint32_t idx = 0; uint32_t idx_left = 0;
// if key is NULL then set r==-1 and idx==0. // if key_left is NULL then set r==-1 and idx==0.
int r = key ? toku_omt_find_zero(bn->buffer, keyrange_compare, &s, &datav, &idx) : -1; r = key_left ? toku_omt_find_zero(bn->buffer, keyrange_compare, &s_left, &datav, &idx_left) : -1;
if (r==0) { *less = idx_left;
*less = idx; *equal_left = (r==0) ? 1 : 0;
*equal = 1;
*greater = toku_omt_size(bn->buffer)-idx-1; uint32_t size = toku_omt_size(bn->buffer);
} else { printf("Estimated vs Actual: %" PRIu64 " vs %" PRIu32 "\n", estimated_num_rows, size);
// If not found, then the idx says where it's between. uint32_t idx_right = size;
*less = idx; r = -1;
*equal = 0; if (single_basement && key_right) {
*greater = toku_omt_size(bn->buffer)-idx; struct keyrange_compare_s s_right = {brt, key_right};
r = toku_omt_find_zero(bn->buffer, keyrange_compare, &s_right, &datav, &idx_right);
} }
*middle = idx_right - idx_left - *equal_left;
*equal_right = (r==0) ? 1 : 0;
*greater = size - idx_right - *equal_right;
} else { } else {
*less = estimated_num_rows / 2; paranoid_invariant(!single_basement);
*equal = 0; uint32_t idx_left = estimated_num_rows / 2;
*greater = *less; if (!key_left) {
//Both nullptr, assume key_left belongs before leftmost entry, key_right belongs after rightmost entry
idx_left = 0;
paranoid_invariant(!key_right);
} }
// Assume idx_left and idx_right point to where key_left and key_right belong, (but are not there).
*less = idx_left;
*equal_left = 0;
*middle = estimated_num_rows - idx_left;
*equal_right = 0;
*greater = 0;
}
*single_basement_node = single_basement;
} }
static int static int
toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node, toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater, DBT* key_left, DBT* key_right, bool may_find_right,
uint64_t* less, uint64_t* equal_left, uint64_t* middle,
uint64_t* equal_right, uint64_t* greater, bool* single_basement_node,
uint64_t estimated_num_rows, uint64_t estimated_num_rows,
struct ftnode_fetch_extra *bfe, // set up to read a minimal read. struct ftnode_fetch_extra *min_bfe, // set up to read a minimal read.
struct ftnode_fetch_extra *match_bfe, // set up to read a basement node iff both keys in it
struct unlockers *unlockers, ANCESTORS ancestors, struct pivot_bounds const * const bounds) struct unlockers *unlockers, ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Implementation note: Assign values to less, equal, and greater, and then on the way out (returning up the stack) we add more values in. // Implementation note: Assign values to less, equal, and greater, and then on the way out (returning up the stack) we add more values in.
{ {
int r = 0; int r = 0;
// if KEY is NULL then use the leftmost key. // if KEY is NULL then use the leftmost key.
int child_number = key ? toku_ftnode_which_child (node, key, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : 0; int left_child_number = key_left ? toku_ftnode_which_child (node, key_left, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : 0;
int right_child_number = node->n_children; // Sentinel that does not equal left_child_number.
if (may_find_right) {
right_child_number = key_right ? toku_ftnode_which_child (node, key_right, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : node->n_children - 1;
}
uint64_t rows_per_child = estimated_num_rows / node->n_children; uint64_t rows_per_child = estimated_num_rows / node->n_children;
if (node->height == 0) { if (node->height == 0) {
keysrange_in_leaf_partition(brt, node, key_left, key_right, left_child_number, right_child_number,
rows_per_child, less, equal_left, middle, equal_right, greater, single_basement_node);
keyrange_in_leaf_partition(brt, node, key, child_number, rows_per_child, less, equal, greater); *less += rows_per_child * left_child_number;
if (*single_basement_node) {
*less += rows_per_child * child_number; *greater += rows_per_child * (node->n_children - left_child_number - 1);
*greater += rows_per_child * (node->n_children - child_number - 1); } else {
*middle += rows_per_child * (node->n_children - left_child_number - 1);
}
} else { } else {
// do the child. // do the child.
struct ancestors next_ancestors = {node, child_number, ancestors}; struct ancestors next_ancestors = {node, left_child_number, ancestors};
BLOCKNUM childblocknum = BP_BLOCKNUM(node, child_number); BLOCKNUM childblocknum = BP_BLOCKNUM(node, left_child_number);
uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, child_number); uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, left_child_number);
FTNODE childnode; FTNODE childnode;
bool msgs_applied = false; bool msgs_applied = false;
bool child_may_find_right = may_find_right && left_child_number == right_child_number;
r = toku_pin_ftnode_batched( r = toku_pin_ftnode_batched(
brt, brt,
childblocknum, childblocknum,
...@@ -5647,7 +5694,7 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node, ...@@ -5647,7 +5694,7 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
unlockers, unlockers,
&next_ancestors, &next_ancestors,
bounds, bounds,
bfe, child_may_find_right ? match_bfe : min_bfe,
PL_READ, // may_modify_node is false, because node guaranteed to not change PL_READ, // may_modify_node is false, because node guaranteed to not change
false, false,
&childnode, &childnode,
...@@ -5659,15 +5706,20 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node, ...@@ -5659,15 +5706,20 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
struct unlock_ftnode_extra unlock_extra = {brt,childnode,false}; struct unlock_ftnode_extra unlock_extra = {brt,childnode,false};
struct unlockers next_unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, unlockers}; struct unlockers next_unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, unlockers};
const struct pivot_bounds next_bounds = next_pivot_keys(node, child_number, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, left_child_number, bounds);
r = toku_ft_keyrange_internal(brt, childnode, key, less, equal, greater, rows_per_child, r = toku_ft_keysrange_internal(brt, childnode, key_left, key_right, child_may_find_right,
bfe, &next_unlockers, &next_ancestors, &next_bounds); less, equal_left, middle, equal_right, greater, single_basement_node,
rows_per_child, min_bfe, match_bfe, &next_unlockers, &next_ancestors, &next_bounds);
if (r != TOKUDB_TRY_AGAIN) { if (r != TOKUDB_TRY_AGAIN) {
assert_zero(r); assert_zero(r);
*less += rows_per_child * child_number; *less += rows_per_child * left_child_number;
*greater += rows_per_child * (node->n_children - child_number - 1); if (*single_basement_node) {
*greater += rows_per_child * (node->n_children - left_child_number - 1);
} else {
*middle += rows_per_child * (node->n_children - left_child_number - 1);
}
assert(unlockers->locked); assert(unlockers->locked);
toku_unpin_ftnode_read_only(brt->ft, childnode); toku_unpin_ftnode_read_only(brt->ft, childnode);
...@@ -5677,20 +5729,39 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node, ...@@ -5677,20 +5729,39 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
return r; return r;
} }
void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal_p, uint64_t *greater_p) void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p)
// Effect: Return an estimate of the number of keys to the left, the number equal, and the number to the right of the key. // Effect: Return an estimate of the number of keys to the left, the number equal (to left key), number between keys, number equal to right key, and the number to the right of both keys.
// The values are an estimate. // The values are an estimate.
// If you perform a keyrange on two keys that are in the same in-memory and uncompressed basement, // If you perform a keyrange on two keys that are in the same basement, equal_less, middle, and equal_right will be exact.
// you can use the keys_right numbers (or the keys_left) numbers to get an exact number keys in the range, // 4184: What to do with a NULL key?
// if the basement does not change between the keyrange queries. // key_left==NULL is treated as -infinity
// TODO 4184: What to do with a NULL key? // key_right==NULL is treated as +infinity
// If KEY is NULL then the system picks an arbitrary key and returns it. // If KEY is NULL then the system picks an arbitrary key and returns it.
// key_right can be non-null only if key_left is non-null;
{ {
struct ftnode_fetch_extra bfe; if (!key_left && key_right) {
fill_bfe_for_min_read(&bfe, brt->ft); // read pivot keys but not message buffers // Simplify internals by only supporting key_right != null when key_left != null
// If key_right != null and key_left == null, then swap them and fix up numbers.
uint64_t less = 0, equal_left = 0, middle = 0, equal_right = 0, greater = 0;
toku_ft_keysrange(brt, key_right, nullptr, &less, &equal_left, &middle, &equal_right, &greater, middle_3_exact_p);
*less_p = 0;
*equal_left_p = 0;
*middle_p = less;
*equal_right_p = equal_left;
*greater_p = middle;
invariant_zero(equal_right);
invariant_zero(greater);
return;
}
paranoid_invariant(!(!key_left && key_right));
struct ftnode_fetch_extra min_bfe;
struct ftnode_fetch_extra match_bfe;
fill_bfe_for_min_read(&min_bfe, brt->ft); // read pivot keys but not message buffers
fill_bfe_for_keymatch(&match_bfe, brt->ft, key_left, key_right, false, false); // read basement node only if both keys in it.
try_again: try_again:
{ {
uint64_t less = 0, equal = 0, greater = 0; uint64_t less = 0, equal_left = 0, middle = 0, equal_right = 0, greater = 0;
bool single_basement_node = false;
FTNODE node = NULL; FTNODE node = NULL;
{ {
uint32_t fullhash; uint32_t fullhash;
...@@ -5700,7 +5771,7 @@ try_again: ...@@ -5700,7 +5771,7 @@ try_again:
brt->ft, brt->ft,
root_key, root_key,
fullhash, fullhash,
&bfe, &match_bfe,
PL_READ, // may_modify_node, cannot change root during keyrange PL_READ, // may_modify_node, cannot change root during keyrange
0, 0,
NULL, NULL,
...@@ -5712,27 +5783,83 @@ try_again: ...@@ -5712,27 +5783,83 @@ try_again:
struct unlockers unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL}; struct unlockers unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL};
{ {
int r;
int64_t numrows = brt->ft->in_memory_stats.numrows; int64_t numrows = brt->ft->in_memory_stats.numrows;
if (numrows < 0) if (numrows < 0)
numrows = 0; // prevent appearance of a negative number numrows = 0; // prevent appearance of a negative number
int r = toku_ft_keyrange_internal (brt, node, key, r = toku_ft_keysrange_internal (brt, node, key_left, key_right, true,
&less, &equal, &greater, &less, &equal_left, &middle, &equal_right, &greater,
numrows, &single_basement_node, numrows,
&bfe, &unlockers, (ANCESTORS)NULL, &infinite_bounds); &min_bfe, &match_bfe, &unlockers, (ANCESTORS)NULL, &infinite_bounds);
assert(r == 0 || r == TOKUDB_TRY_AGAIN);
if (r == TOKUDB_TRY_AGAIN) {
assert(!unlockers.locked);
goto try_again;
}
// May need to do a second query.
if (!single_basement_node && key_right != nullptr) {
// "greater" is stored in "middle"
invariant_zero(equal_right);
invariant_zero(greater);
uint64_t less2 = 0, equal_left2 = 0, middle2 = 0, equal_right2 = 0, greater2 = 0;
bool ignore;
r = toku_ft_keysrange_internal (brt, node, key_right, nullptr, false,
&less2, &equal_left2, &middle2, &equal_right2, &greater2,
&ignore, numrows,
&min_bfe, &match_bfe, &unlockers, (ANCESTORS)nullptr, &infinite_bounds);
assert(r == 0 || r == TOKUDB_TRY_AGAIN); assert(r == 0 || r == TOKUDB_TRY_AGAIN);
if (r == TOKUDB_TRY_AGAIN) { if (r == TOKUDB_TRY_AGAIN) {
assert(!unlockers.locked); assert(!unlockers.locked);
goto try_again; goto try_again;
} }
invariant_zero(equal_right2);
invariant_zero(greater2);
// Update numbers.
// less is already correct.
// equal_left is already correct.
// "middle" currently holds everything greater than left_key in first query
// 'middle2' currently holds everything greater than right_key in second query
// 'equal_left2' is how many match right_key
// Prevent underflow.
if (middle >= equal_left2 + middle2) {
middle -= equal_left2 + middle2;
} else {
middle = 0;
}
equal_right = equal_left2;
greater = middle2;
}
} }
assert(unlockers.locked); assert(unlockers.locked);
toku_unpin_ftnode_read_only(brt->ft, node); toku_unpin_ftnode_read_only(brt->ft, node);
if (!key_right) {
paranoid_invariant_zero(equal_right);
paranoid_invariant_zero(greater);
}
if (!key_left) {
paranoid_invariant_zero(less);
paranoid_invariant_zero(equal_left);
}
*less_p = less; *less_p = less;
*equal_p = equal; *equal_left_p = equal_left;
*middle_p = middle;
*equal_right_p = equal_right;
*greater_p = greater; *greater_p = greater;
*middle_3_exact_p = single_basement_node;
} }
} }
// Test-only wrapper that keeps the old one-key range interface.
// It forwards to toku_ft_keysrange() with no right key, so the callee's
// "middle" output lands in *greater, and the callee's right-hand outputs
// (equal_right, greater) are required to come back as zero.
void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater) {
    uint64_t zero_equal_right;
    uint64_t zero_greater;
    bool middle_3_exact_unused;  // exactness flag is not part of the old interface

    toku_ft_keysrange(brt, key, nullptr,
                      less, equal, greater,
                      &zero_equal_right, &zero_greater,
                      &middle_3_exact_unused);

    invariant_zero(zero_equal_right);
    invariant_zero(zero_greater);
}
void toku_ft_handle_stat64 (FT_HANDLE brt, TOKUTXN UU(txn), struct ftstat64_s *s) { void toku_ft_handle_stat64 (FT_HANDLE brt, TOKUTXN UU(txn), struct ftstat64_s *s) {
toku_ft_stat64(brt->ft, s); toku_ft_stat64(brt->ft, s);
} }
......
...@@ -205,6 +205,7 @@ enum ft_flags { ...@@ -205,6 +205,7 @@ enum ft_flags {
}; };
void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater); void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater);
void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p);
struct ftstat64_s { struct ftstat64_s {
uint64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate) */ uint64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate) */
......
...@@ -1372,6 +1372,20 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe) ...@@ -1372,6 +1372,20 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe)
node, node,
bfe->search bfe->search
); );
} else if (bfe->type == ftnode_fetch_keymatch) {
// we do not take into account prefetching yet
// as of now, if we need a subset, the only thing
// we can possibly require is a single basement node
// we find out what basement node the query cares about
// and check if it is available
paranoid_invariant(bfe->h->compare_fun);
if (node->height == 0) {
int left_child = toku_bfe_leftmost_child_wanted(bfe, node);
int right_child = toku_bfe_rightmost_child_wanted(bfe, node);
if (left_child == right_child) {
bfe->child_to_read = left_child;
}
}
} }
} }
...@@ -1688,7 +1702,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode, ...@@ -1688,7 +1702,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
// rbuf, so we might be able to store the compressed data for some // rbuf, so we might be able to store the compressed data for some
// objects. // objects.
// We can proceed to deserialize the individual subblocks. // We can proceed to deserialize the individual subblocks.
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));
// setup the memory of the partitions // setup the memory of the partitions
// for partitions being decompressed, create either FIFO or basement node // for partitions being decompressed, create either FIFO or basement node
...@@ -2323,7 +2337,7 @@ deserialize_ftnode_from_rbuf( ...@@ -2323,7 +2337,7 @@ deserialize_ftnode_from_rbuf(
// now that the node info has been deserialized, we can proceed to deserialize // now that the node info has been deserialized, we can proceed to deserialize
// the individual sub blocks // the individual sub blocks
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));
// setup the memory of the partitions // setup the memory of the partitions
// for partitions being decompressed, create either FIFO or basement node // for partitions being decompressed, create either FIFO or basement node
......
...@@ -66,6 +66,49 @@ static void maybe_reopen (enum memory_state ms, uint64_t limit) { ...@@ -66,6 +66,49 @@ static void maybe_reopen (enum memory_state ms, uint64_t limit) {
assert(0); assert(0);
} }
// Sanity-check one toku_ft_keysrange() answer for the key pair (intkey1, intkey2).
//
// The "perfect" counts are computed purely from limit, intkey1 and intkey2.
// NOTE(review): the formulas presumably assume the tree holds the `limit` odd
// keys 1, 3, ..., 2*limit-1 -- confirm against the insertion loop of the test.
//
// Checks performed:
//  - the five reported counts sum to something within (total/2, 2*total) of limit
//  - equal1/equal2 are either perfect, or zero when the result is not exact
//  - less, middle and greater are each within limit/10 of the perfect value
//  - when middle3exact is set, middle is exactly the perfect value
static void verify_keysrange(enum memory_state UU(ms), uint64_t limit,
        uint64_t intkey1,
        uint64_t intkey2,
        uint64_t less,
        uint64_t equal1,
        uint64_t middle,
        uint64_t equal2,
        uint64_t greater,
        bool middle3exact) {
    const uint64_t max_item = 2 * limit - 1;
    const uint64_t perfect_total = limit;
    const uint64_t perfect_less = intkey1 / 2;

    // An odd left key is counted as present.
    uint64_t perfect_equal1 = 0;
    if (intkey1 % 2 == 1) {
        perfect_equal1 = 1;
    }

    // An odd right key is counted as present only if it does not exceed max_item.
    uint64_t perfect_equal2 = 0;
    if (intkey2 % 2 == 1 && intkey2 <= max_item) {
        perfect_equal2 = 1;
    }

    // Nothing lies to the right of a key at or beyond max_item.
    uint64_t perfect_greater = 0;
    if (intkey2 < max_item) {
        perfect_greater = (max_item + 1 - intkey2) / 2;
    }

    const uint64_t perfect_middle =
        perfect_total - perfect_less - perfect_equal1 - perfect_equal2 - perfect_greater;

    const uint64_t total = less + equal1 + middle + equal2 + greater;
    assert(total > 0);
    assert(total < 2 * perfect_total);
    assert(total > perfect_total / 2);

    assert(equal1 == perfect_equal1 || (equal1 == 0 && !middle3exact));
    assert(equal2 == perfect_equal2 || (equal2 == 0 && !middle3exact));

    // As of 2013-02-25 this is accurate with fiddle ~= total/50.
    // Set to 1/10th to prevent flakiness.
    const uint64_t fiddle = perfect_total / 10;

    // Each comparison adds fiddle to the smaller side rather than subtracting,
    // so the unsigned arithmetic cannot wrap below zero.
    assert(less + fiddle > perfect_less);
    assert(less < perfect_less + fiddle);

    assert(middle + fiddle > perfect_middle);
    assert(middle < perfect_middle + fiddle);

    assert(greater + fiddle > perfect_greater);
    assert(greater < perfect_greater + fiddle);

    if (middle3exact) {
        assert(middle == perfect_middle);
    }
}
static void test_keyrange (enum memory_state ms, uint64_t limit) { static void test_keyrange (enum memory_state ms, uint64_t limit) {
open_ft_and_ct(true); open_ft_and_ct(true);
...@@ -123,7 +166,9 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) { ...@@ -123,7 +166,9 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
#endif #endif
} else { } else {
// after reopen, none of the basements are in memory // after reopen, none of the basements are in memory
assert(equal == 0); // However, "both" keys can be in the same basement (specifically the last basement node in the tree)
// Without trying to figure out how many are in the last basement node, we expect at least the first half not to be in the last basement node.
assert(i > limit / 2 || equal == 0);
#if 0 #if 0
if (i<10) { if (i<10) {
assert(less==0); assert(less==0);
...@@ -189,6 +234,80 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) { ...@@ -189,6 +234,80 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
#endif #endif
} }
maybe_reopen(ms, limit);
{
uint64_t totalqueries = 0;
uint64_t num_middle3_exact = 0;
for (uint64_t i=0; i < 2*limit; i++) {
char key[100];
char keyplus4[100];
char keyplus5[100];
uint64_t intkey = i;
snprintf(key, 100, "%08" PRIu64 "", intkey);
snprintf(keyplus4, 100, "%08" PRIu64 "", intkey+4);
snprintf(keyplus5, 100, "%08" PRIu64 "", intkey+5);
DBT k;
DBT k2;
DBT k3;
toku_fill_dbt(&k, key, 1+strlen(key));
toku_fill_dbt(&k2, keyplus4, 1+strlen(keyplus4));
toku_fill_dbt(&k3, keyplus5, 1+strlen(keyplus5));
uint64_t less,equal1,middle,equal2,greater;
bool middle3exact;
toku_ft_keysrange(t, &k, &k2, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
middle3exact = false;
}
totalqueries++;
num_middle3_exact += middle3exact;
if (verbose > 1) {
printf("Rkey2 %" PRIu64 "/%" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %s\n",
intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
}
verify_keysrange(ms, limit, intkey, intkey+4,
less, equal1, middle, equal2, greater, middle3exact);
toku_ft_keysrange(t, &k, &k3, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
middle3exact = false;
}
totalqueries++;
num_middle3_exact += middle3exact;
if (verbose > 1) {
printf("Rkey3 %" PRIu64 "/%" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %s\n",
intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
}
verify_keysrange(ms, limit, intkey, intkey+5,
less, equal1, middle, equal2, greater, middle3exact);
}
assert(num_middle3_exact <= totalqueries);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
assert(num_middle3_exact == 0);
} else {
// About 85% of the time, the key for an int (and +4 or +5) is in the
// same basement node. Check >= 70% so this isn't very flaky.
assert(num_middle3_exact > totalqueries * 7 / 10);
}
}
close_ft_and_ct(); close_ft_and_ct();
} }
......
...@@ -695,12 +695,8 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS) ...@@ -695,12 +695,8 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
declare_custom_tests(keyrange.tdb) declare_custom_tests(keyrange.tdb)
add_ydb_test_aux(keyrange-get0.tdb keyrange.tdb --get 0) add_ydb_test_aux(keyrange-get0.tdb keyrange.tdb --get 0)
add_ydb_test_aux(keyrange-get1.tdb keyrange.tdb --get 1) add_ydb_test_aux(keyrange-get1.tdb keyrange.tdb --get 1)
if (0)
add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1) add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1)
add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1) add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1)
else ()
message(WARNING "TODO(leif): re-enable keyrange tests, see #5666")
endif ()
add_ydb_test_aux(keyrange-loader-get0.tdb keyrange.tdb --get 0 --loader 1) add_ydb_test_aux(keyrange-loader-get0.tdb keyrange.tdb --get 0 --loader 1)
add_ydb_test_aux(keyrange-loader-get1.tdb keyrange.tdb --get 1 --loader 1) add_ydb_test_aux(keyrange-loader-get1.tdb keyrange.tdb --get 1 --loader 1)
......
...@@ -60,7 +60,7 @@ run_test(void) { ...@@ -60,7 +60,7 @@ run_test(void) {
size_t key_size = 9; size_t key_size = 9;
size_t val_size = 9; size_t val_size = 9;
size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4; // xid + key + key_len + val + val)len size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4 + 5; // xid + key + key_len + val + val_len + mvcc overhead
size_t rows_per_basement = db_basement_size / est_row_size_with_overhead; size_t rows_per_basement = db_basement_size / est_row_size_with_overhead;
int r; int r;
...@@ -72,7 +72,8 @@ run_test(void) { ...@@ -72,7 +72,8 @@ run_test(void) {
r = env->open(env, envdir, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, envdir, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r = db_create(&db, env, 0); CKERR(r); r = db_create(&db, env, 0); CKERR(r);
r = db->set_pagesize(db, db_page_size); r = db->set_pagesize(db, db_page_size); CKERR(r);
r = db->set_readpagesize(db, db_basement_size); CKERR(r);
r = env->txn_begin(env, 0, &txn, 0); CKERR(r); r = env->txn_begin(env, 0, &txn, 0); CKERR(r);
r = db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r = txn->commit(txn, 0); CKERR(r); r = txn->commit(txn, 0); CKERR(r);
...@@ -145,7 +146,11 @@ run_test(void) { ...@@ -145,7 +146,11 @@ run_test(void) {
if (0) goto skipit; // debug: just write the tree if (0) goto skipit; // debug: just write the tree
bool last_basement;
last_basement = false;
// verify key_range for keys that exist in the tree // verify key_range for keys that exist in the tree
uint64_t random_fudge;
random_fudge = random_keys ? rows_per_basement + nrows / 10 : 0;
for (uint64_t i=0; i<nrows; i++) { for (uint64_t i=0; i<nrows; i++) {
char key[100]; char key[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1); snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
...@@ -160,15 +165,31 @@ run_test(void) { ...@@ -160,15 +165,31 @@ run_test(void) {
assert(0 < less + equal + greater); assert(0 < less + equal + greater);
if (use_loader) { if (use_loader) {
assert(less + equal + greater <= nrows); assert(less + equal + greater <= nrows);
assert(get_all ? equal == 1 : equal == 0); if (get_all || last_basement) {
assert(equal == 1);
} else if (i < nrows - rows_per_basement * 2) {
assert(equal == 0);
} else if (i == nrows - 1) {
assert(equal == 1);
} else if (equal == 1) {
last_basement = true;
}
assert(less <= max64(i, i + rows_per_basement/2)); assert(less <= max64(i, i + rows_per_basement/2));
assert(greater <= nrows - less); assert(greater <= nrows - less);
} else { } else {
assert(less + equal + greater <= nrows + nrows / 8); assert(less + equal + greater <= nrows + nrows / 8);
assert(get_all ? equal == 1 : equal == 0); if (get_all || last_basement) {
uint64_t est_i = max64(i, i + rows_per_basement/2); assert(equal == 1);
assert(less <= est_i + est_i / 1); } else if (i < nrows - rows_per_basement * 2) {
assert(greater <= nrows - i + rows_per_basement/2); assert(equal == 0);
} else if (i == nrows - 1) {
assert(equal == 1);
} else if (equal == 1) {
last_basement = true;
}
uint64_t est_i = i * 2 + rows_per_basement;
assert(less <= est_i + random_fudge);
assert(greater <= nrows - i + rows_per_basement + random_fudge);
} }
} }
...@@ -193,9 +214,9 @@ run_test(void) { ...@@ -193,9 +214,9 @@ run_test(void) {
} else { } else {
assert(less + equal + greater <= nrows + nrows / 8); assert(less + equal + greater <= nrows + nrows / 8);
assert(equal == 0); assert(equal == 0);
uint64_t est_i = max64(i, i + rows_per_basement/2); uint64_t est_i = i * 2 + rows_per_basement;
assert(less <= est_i + est_i / 1); assert(less <= est_i + random_fudge);
assert(greater <= nrows - i + rows_per_basement/2); assert(greater <= nrows - i + rows_per_basement + random_fudge);
} }
} }
......
...@@ -642,17 +642,30 @@ toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { ...@@ -642,17 +642,30 @@ toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
} }
/*
 * Two-key range estimate: forwards keyleft/keyright and every output pointer
 * straight to toku_ft_keysrange() on this DB's ft handle.
 *
 * less, left, between, right, greater and middle_3_exact are filled in by
 * toku_ft_keysrange(); this wrapper does not touch them itself.
 *
 * Returns 0 after the estimate has been computed. The two HANDLE_* macros run
 * first; presumably they return early with an error code when the DB is
 * panicked or the txn is not usable -- confirm against their definitions.
 */
static int
toku_db_keys_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* keyleft, DBT* keyright, uint64_t* less, uint64_t* left, uint64_t* between, uint64_t *right, uint64_t *greater, bool* middle_3_exact) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    // note that we ignore the txn param.  It would be more complicated to support it.
    // TODO(yoni): Maybe add support for txns later?  How would we do this?  ydb lock comment about db_keyrange64 is obsolete.
    toku_ft_keysrange(db->i->ft_handle, keyleft, keyright, less, left, between, right, greater, middle_3_exact);
    return 0;
}

/*
 * Single-key range estimate, implemented on top of toku_db_keys_range64()
 * by passing NULL as the right key (treated as positive infinity).
 *
 * On success:
 *   *less_p    = estimate of keys less than `key`
 *   *equal_p   = estimate of keys equal to `key`
 *   *greater_p = estimate of keys between `key` and positive infinity,
 *                i.e. keys greater than `key`
 *   *is_exact  = 0 (the estimate is never reported as exact)
 *
 * Returns 0 on success. On failure, returns the error from
 * toku_db_keys_range64() and leaves every output parameter untouched.
 */
static int
toku_db_key_range64(DB* db, DB_TXN* txn, DBT* key, uint64_t* less_p, uint64_t* equal_p, uint64_t* greater_p, int* is_exact) {
    uint64_t less, equal_left, middle, equal_right, greater;
    bool ignore;
    int r = toku_db_keys_range64(db, txn, key, NULL, &less, &equal_left, &middle, &equal_right, &greater, &ignore);
    if (r == 0) {
        *less_p = less;
        *equal_p = equal_left;
        *greater_p = middle;
        paranoid_invariant_zero(greater);     // no keys are greater than positive infinity
        paranoid_invariant_zero(equal_right); // no keys are equal to positive infinity
        // toku_ft_keysrange does not know when all 3 are exact, so set is_exact to false
        *is_exact = false;
    }
    // Propagate the error: returning a literal 0 here would report success
    // while the output parameters were never written.
    return r;
}
...@@ -928,6 +941,7 @@ toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) { ...@@ -928,6 +941,7 @@ toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) {
USDB(pre_acquire_table_lock); USDB(pre_acquire_table_lock);
USDB(pre_acquire_fileops_lock); USDB(pre_acquire_fileops_lock);
USDB(key_range64); USDB(key_range64);
USDB(keys_range64);
USDB(hot_optimize); USDB(hot_optimize);
USDB(stat64); USDB(stat64);
USDB(get_fractal_tree_info64); USDB(get_fractal_tree_info64);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment