Commit 3b853c9a authored by Zardosht Kasheff, committed by Yoni Fogel

closes #5804, merge 51139, work to read in entire internal nodes into memory,...

Closes #5804: merge 51139 to main — work to read entire internal nodes into memory, compressed, in a single I/O as opposed to using two I/Os.

git-svn-id: file:///svn/toku/tokudb@51185 c7de825b-a66e-492c-adef-691d508d4ae1
parent 856ab364
...@@ -91,6 +91,9 @@ struct ftnode_fetch_extra { ...@@ -91,6 +91,9 @@ struct ftnode_fetch_extra {
// this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback // this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
// these callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it // these callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
int child_to_read; int child_to_read;
// when we read internal nodes, we want to read all the data off disk in one I/O
// then we'll treat it as normal and only decompress the needed partitions etc.
bool read_all_partitions;
// Accounting: How many bytes were fetched, and how much time did it take? // Accounting: How many bytes were fetched, and how much time did it take?
tokutime_t bytes_read; tokutime_t bytes_read;
uint64_t read_time; uint64_t read_time;
...@@ -724,6 +727,7 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) ...@@ -724,6 +727,7 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
bfe->right_is_pos_infty = false; bfe->right_is_pos_infty = false;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = false; bfe->disable_prefetching = false;
bfe->read_all_partitions = false;
bfe->bytes_read = 0; bfe->bytes_read = 0;
bfe->read_time = 0; bfe->read_time = 0;
} }
...@@ -742,7 +746,8 @@ static inline void fill_bfe_for_subset_read( ...@@ -742,7 +746,8 @@ static inline void fill_bfe_for_subset_read(
DBT *right, DBT *right,
bool left_is_neg_infty, bool left_is_neg_infty,
bool right_is_pos_infty, bool right_is_pos_infty,
bool disable_prefetching bool disable_prefetching,
bool read_all_partitions
) )
{ {
paranoid_invariant(h->h->type == FT_CURRENT); paranoid_invariant(h->h->type == FT_CURRENT);
...@@ -755,6 +760,7 @@ static inline void fill_bfe_for_subset_read( ...@@ -755,6 +760,7 @@ static inline void fill_bfe_for_subset_read(
bfe->right_is_pos_infty = right_is_pos_infty; bfe->right_is_pos_infty = right_is_pos_infty;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = disable_prefetching; bfe->disable_prefetching = disable_prefetching;
bfe->read_all_partitions = read_all_partitions;
bfe->bytes_read = 0; bfe->bytes_read = 0;
bfe->read_time = 0; bfe->read_time = 0;
} }
...@@ -776,6 +782,7 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) { ...@@ -776,6 +782,7 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
bfe->right_is_pos_infty = false; bfe->right_is_pos_infty = false;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = false; bfe->disable_prefetching = false;
bfe->read_all_partitions = false;
bfe->bytes_read = 0; bfe->bytes_read = 0;
bfe->read_time = 0; bfe->read_time = 0;
} }
...@@ -824,6 +831,7 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, ...@@ -824,6 +831,7 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
bfe->right_is_pos_infty = c->right_is_pos_infty; bfe->right_is_pos_infty = c->right_is_pos_infty;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = c->disable_prefetching; bfe->disable_prefetching = c->disable_prefetching;
bfe->read_all_partitions = false;
bfe->bytes_read = 0; bfe->bytes_read = 0;
bfe->read_time = 0; bfe->read_time = 0;
} }
......
...@@ -4860,6 +4860,9 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F ...@@ -4860,6 +4860,9 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum); uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum);
FTNODE childnode; FTNODE childnode;
// If the current node's height is greater than 1, then its child is an internal node.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
bool read_all_partitions = node->height > 1;
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_subset_read( fill_bfe_for_subset_read(
&bfe, &bfe,
...@@ -4869,7 +4872,8 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F ...@@ -4869,7 +4872,8 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
&ftcursor->range_lock_right_key, &ftcursor->range_lock_right_key,
ftcursor->left_is_neg_infty, ftcursor->left_is_neg_infty,
ftcursor->right_is_pos_infty, ftcursor->right_is_pos_infty,
ftcursor->disable_prefetching ftcursor->disable_prefetching,
read_all_partitions
); );
bool msgs_applied = false; bool msgs_applied = false;
{ {
...@@ -5195,7 +5199,8 @@ try_again: ...@@ -5195,7 +5199,8 @@ try_again:
&ftcursor->range_lock_right_key, &ftcursor->range_lock_right_key,
ftcursor->left_is_neg_infty, ftcursor->left_is_neg_infty,
ftcursor->right_is_pos_infty, ftcursor->right_is_pos_infty,
ftcursor->disable_prefetching ftcursor->disable_prefetching,
true // We may as well always read the whole root into memory, if it's a leaf node it's a tiny tree anyway.
); );
FTNODE node = NULL; FTNODE node = NULL;
{ {
......
...@@ -2445,9 +2445,15 @@ toku_deserialize_ftnode_from (int fd, ...@@ -2445,9 +2445,15 @@ toku_deserialize_ftnode_from (int fd,
toku_trace("deserial start"); toku_trace("deserial start");
int r = 0; int r = 0;
struct rbuf rb = RBUF_INITIALIZER; struct rbuf rb = RBUF_INITIALIZER;
if (!bfe->read_all_partitions) {
read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb, bfe); read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb, bfe);
r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd); r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd);
} else {
// force us to do it the old way
r = -1;
}
if (r != 0) { if (r != 0) {
// Something went wrong, go back to doing it the old way. // Something went wrong, go back to doing it the old way.
r = deserialize_ftnode_from_fd(fd, blocknum, fullhash, ftnode, ndd, bfe, NULL); r = deserialize_ftnode_from_fd(fd, blocknum, fullhash, ftnode, ndd, bfe, NULL);
......
...@@ -192,6 +192,7 @@ test_subset_read(int fd, FT_HANDLE UU(brt), FT brt_h) { ...@@ -192,6 +192,7 @@ test_subset_read(int fd, FT_HANDLE UU(brt), FT brt_h) {
&right, &right,
false, false,
false, false,
false,
false false
); );
......
...@@ -164,6 +164,7 @@ test2(int fd, FT brt_h, FTNODE *dn) { ...@@ -164,6 +164,7 @@ test2(int fd, FT brt_h, FTNODE *dn) {
&right, &right,
true, true,
true, true,
false,
false false
); );
FTNODE_DISK_DATA ndd = NULL; FTNODE_DISK_DATA ndd = NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment