Commit 3ebbf0cd authored by John Esmet's avatar John Esmet Committed by Yoni Fogel

refs #5710 add counters for the number of bytes read and time spent doing IO,...

refs #5710 Add counters for the number of bytes read and the time spent doing I/O for each of the ways a fetch can be performed. Rename get_tokutime() to toku_time_now() and remove the now-defunct toku_current_time_nanosec().


git-svn-id: file:///svn/toku/tokudb@50481 c7de825b-a66e-492c-adef-691d508d4ae1
parent 772d2ef8
...@@ -91,6 +91,9 @@ struct ftnode_fetch_extra { ...@@ -91,6 +91,9 @@ struct ftnode_fetch_extra {
// this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback // this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
// the callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it // the callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
int child_to_read; int child_to_read;
// Accounting for the read performed on behalf of this fetch: how many bytes were
// fetched, and how long did it take? read_time is a difference of toku_time_now()
// values, so it is a tokutime_t interval; bytes_read is a plain byte count.
uint64_t bytes_read;
tokutime_t read_time;
}; };
struct toku_fifo_entry_key_msn_heaviside_extra { struct toku_fifo_entry_key_msn_heaviside_extra {
...@@ -718,6 +721,8 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) ...@@ -718,6 +721,8 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
bfe->right_is_pos_infty = false; bfe->right_is_pos_infty = false;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = false; bfe->disable_prefetching = false;
bfe->bytes_read = 0;
bfe->read_time = 0;
} }
// //
...@@ -747,6 +752,8 @@ static inline void fill_bfe_for_subset_read( ...@@ -747,6 +752,8 @@ static inline void fill_bfe_for_subset_read(
bfe->right_is_pos_infty = right_is_pos_infty; bfe->right_is_pos_infty = right_is_pos_infty;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = disable_prefetching; bfe->disable_prefetching = disable_prefetching;
bfe->bytes_read = 0;
bfe->read_time = 0;
} }
// //
...@@ -766,6 +773,8 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) { ...@@ -766,6 +773,8 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
bfe->right_is_pos_infty = false; bfe->right_is_pos_infty = false;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = false; bfe->disable_prefetching = false;
bfe->bytes_read = 0;
bfe->read_time = 0;
} }
static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) { static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
...@@ -812,6 +821,8 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, ...@@ -812,6 +821,8 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
bfe->right_is_pos_infty = c->right_is_pos_infty; bfe->right_is_pos_infty = c->right_is_pos_infty;
bfe->child_to_read = -1; bfe->child_to_read = -1;
bfe->disable_prefetching = c->disable_prefetching; bfe->disable_prefetching = c->disable_prefetching;
bfe->bytes_read = 0;
bfe->read_time = 0;
} }
struct ancestors { struct ancestors {
...@@ -973,11 +984,9 @@ typedef enum { ...@@ -973,11 +984,9 @@ typedef enum {
FT_CREATE_NONLEAF, // number of nonleaf nodes created FT_CREATE_NONLEAF, // number of nonleaf nodes created
FT_DESTROY_LEAF, // number of leaf nodes destroyed FT_DESTROY_LEAF, // number of leaf nodes destroyed
FT_DESTROY_NONLEAF, // number of nonleaf nodes destroyed FT_DESTROY_NONLEAF, // number of nonleaf nodes destroyed
FT_MSG_KEYVAL_BYTES_IN, // how many bytes of keyval data ingested by the tree (all tree, no overhead counted)
FT_MSG_BYTES_IN, // how many bytes of messages injected at root (for all trees) FT_MSG_BYTES_IN, // how many bytes of messages injected at root (for all trees)
FT_MSG_BYTES_OUT, // how many bytes of messages flushed from h1 nodes to leaves FT_MSG_BYTES_OUT, // how many bytes of messages flushed from h1 nodes to leaves
FT_MSG_BYTES_CURR, // how many bytes of messages currently in trees (estimate) FT_MSG_BYTES_CURR, // how many bytes of messages currently in trees (estimate)
//FT_MSG_BYTES_MAX, // how many bytes of messages currently in trees (estimate)
FT_MSG_NUM, // how many messages injected at root FT_MSG_NUM, // how many messages injected at root
FT_MSG_NUM_BROADCAST, // how many broadcast messages injected at root FT_MSG_NUM_BROADCAST, // how many broadcast messages injected at root
FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, // how many basement nodes were decompressed because they were the target of a query FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, // how many basement nodes were decompressed because they were the target of a query
...@@ -989,16 +998,38 @@ typedef enum { ...@@ -989,16 +998,38 @@ typedef enum {
FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH, FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH,
FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE, FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE,
FT_NUM_PIVOTS_FETCHED_QUERY, // how many pivots were fetched for a query FT_NUM_PIVOTS_FETCHED_QUERY, // how many pivots were fetched for a query
FT_BYTES_PIVOTS_FETCHED_QUERY, // how many bytes of pivots were fetched for a query
FT_NANOTIME_PIVOTS_FETCHED_QUERY, // how much time was spent fetching pivots for a query
FT_NUM_PIVOTS_FETCHED_PREFETCH, // ... for a prefetch FT_NUM_PIVOTS_FETCHED_PREFETCH, // ... for a prefetch
FT_BYTES_PIVOTS_FETCHED_PREFETCH, // ... for a prefetch
FT_NANOTIME_PIVOTS_FETCHED_PREFETCH, // ... for a prefetch
FT_NUM_PIVOTS_FETCHED_WRITE, // ... for a write FT_NUM_PIVOTS_FETCHED_WRITE, // ... for a write
FT_BYTES_PIVOTS_FETCHED_WRITE, // ... for a write
FT_NANOTIME_PIVOTS_FETCHED_WRITE, // ... for a write
FT_NUM_BASEMENTS_FETCHED_NORMAL, // how many basement nodes were fetched because they were the target of a query FT_NUM_BASEMENTS_FETCHED_NORMAL, // how many basement nodes were fetched because they were the target of a query
FT_BYTES_BASEMENTS_FETCHED_NORMAL, // how many bytes of basement nodes were fetched because they were the target of a query
FT_NANOTIME_BASEMENTS_FETCHED_NORMAL, // how much time was spent fetching basement nodes because they were the target of a query
FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, // ... because they were between lc and rc FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_BYTES_BASEMENTS_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_NANOTIME_BASEMENTS_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_NUM_BASEMENTS_FETCHED_PREFETCH, FT_NUM_BASEMENTS_FETCHED_PREFETCH,
FT_BYTES_BASEMENTS_FETCHED_PREFETCH,
FT_NANOTIME_BASEMENTS_FETCHED_PREFETCH,
FT_NUM_BASEMENTS_FETCHED_WRITE, FT_NUM_BASEMENTS_FETCHED_WRITE,
FT_BYTES_BASEMENTS_FETCHED_WRITE,
FT_NANOTIME_BASEMENTS_FETCHED_WRITE,
FT_NUM_MSG_BUFFER_FETCHED_NORMAL, // how many msg buffers were fetched because they were the target of a query FT_NUM_MSG_BUFFER_FETCHED_NORMAL, // how many msg buffers were fetched because they were the target of a query
FT_BYTES_MSG_BUFFER_FETCHED_NORMAL, // how many bytes of msg buffers were fetched because they were the target of a query
FT_NANOTIME_MSG_BUFFER_FETCHED_NORMAL, // how much time was spent fetching msg buffers because they were the target of a query
FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_BYTES_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_NANOTIME_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
FT_NUM_MSG_BUFFER_FETCHED_PREFETCH, FT_NUM_MSG_BUFFER_FETCHED_PREFETCH,
FT_BYTES_MSG_BUFFER_FETCHED_PREFETCH,
FT_NANOTIME_MSG_BUFFER_FETCHED_PREFETCH,
FT_NUM_MSG_BUFFER_FETCHED_WRITE, FT_NUM_MSG_BUFFER_FETCHED_WRITE,
FT_BYTES_MSG_BUFFER_FETCHED_WRITE,
FT_NANOTIME_MSG_BUFFER_FETCHED_WRITE,
FT_PRO_NUM_ROOT_SPLIT, FT_PRO_NUM_ROOT_SPLIT,
FT_PRO_NUM_ROOT_H0_INJECT, FT_PRO_NUM_ROOT_H0_INJECT,
FT_PRO_NUM_ROOT_H1_INJECT, FT_PRO_NUM_ROOT_H1_INJECT,
......
...@@ -179,10 +179,6 @@ status_init(void) ...@@ -179,10 +179,6 @@ status_init(void)
STATUS_INIT(FT_TOTAL_RETRIES, PARCOUNT, "total search retries due to TRY_AGAIN"); STATUS_INIT(FT_TOTAL_RETRIES, PARCOUNT, "total search retries due to TRY_AGAIN");
STATUS_INIT(FT_SEARCH_TRIES_GT_HEIGHT, PARCOUNT, "searches requiring more tries than the height of the tree"); STATUS_INIT(FT_SEARCH_TRIES_GT_HEIGHT, PARCOUNT, "searches requiring more tries than the height of the tree");
STATUS_INIT(FT_SEARCH_TRIES_GT_HEIGHTPLUS3, PARCOUNT, "searches requiring more tries than the height of the tree plus three"); STATUS_INIT(FT_SEARCH_TRIES_GT_HEIGHTPLUS3, PARCOUNT, "searches requiring more tries than the height of the tree plus three");
STATUS_INIT(FT_DISK_FLUSH_LEAF, PARCOUNT, "leaf nodes flushed to disk (not for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_NONLEAF, PARCOUNT, "nonleaf nodes flushed to disk (not for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_LEAF_FOR_CHECKPOINT, PARCOUNT, "leaf nodes flushed to disk (for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT, PARCOUNT, "nonleaf nodes flushed to disk (for checkpoint)");
STATUS_INIT(FT_CREATE_LEAF, PARCOUNT, "leaf nodes created"); STATUS_INIT(FT_CREATE_LEAF, PARCOUNT, "leaf nodes created");
STATUS_INIT(FT_CREATE_NONLEAF, PARCOUNT, "nonleaf nodes created"); STATUS_INIT(FT_CREATE_NONLEAF, PARCOUNT, "nonleaf nodes created");
STATUS_INIT(FT_DESTROY_LEAF, PARCOUNT, "leaf nodes destroyed"); STATUS_INIT(FT_DESTROY_LEAF, PARCOUNT, "leaf nodes destroyed");
...@@ -192,6 +188,7 @@ status_init(void) ...@@ -192,6 +188,7 @@ status_init(void)
STATUS_INIT(FT_MSG_BYTES_CURR, PARCOUNT, "bytes of messages currently in trees (estimate)"); STATUS_INIT(FT_MSG_BYTES_CURR, PARCOUNT, "bytes of messages currently in trees (estimate)");
STATUS_INIT(FT_MSG_NUM, PARCOUNT, "messages injected at root"); STATUS_INIT(FT_MSG_NUM, PARCOUNT, "messages injected at root");
STATUS_INIT(FT_MSG_NUM_BROADCAST, PARCOUNT, "broadcast messages injected at root"); STATUS_INIT(FT_MSG_NUM_BROADCAST, PARCOUNT, "broadcast messages injected at root");
STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, PARCOUNT, "basements decompressed as a target of a query"); STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, PARCOUNT, "basements decompressed as a target of a query");
STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE, PARCOUNT, "basements decompressed for prelocked range"); STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE, PARCOUNT, "basements decompressed for prelocked range");
STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH, PARCOUNT, "basements decompressed for prefetch"); STATUS_INIT(FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH, PARCOUNT, "basements decompressed for prefetch");
...@@ -200,18 +197,49 @@ status_init(void) ...@@ -200,18 +197,49 @@ status_init(void)
STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, PARCOUNT, "buffers decompressed for prelocked range"); STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, PARCOUNT, "buffers decompressed for prelocked range");
STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH, PARCOUNT, "buffers decompressed for prefetch"); STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH, PARCOUNT, "buffers decompressed for prefetch");
STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE, PARCOUNT, "buffers decompressed for write"); STATUS_INIT(FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE, PARCOUNT, "buffers decompressed for write");
// Disk read statistics.
STATUS_INIT(FT_NUM_PIVOTS_FETCHED_QUERY, PARCOUNT, "pivots fetched for query"); STATUS_INIT(FT_NUM_PIVOTS_FETCHED_QUERY, PARCOUNT, "pivots fetched for query");
STATUS_INIT(FT_BYTES_PIVOTS_FETCHED_QUERY, PARCOUNT, "pivots fetched for query (bytes)");
STATUS_INIT(FT_NANOTIME_PIVOTS_FETCHED_QUERY, PARCOUNT, "pivots fetched for query (seconds)");
STATUS_INIT(FT_NUM_PIVOTS_FETCHED_PREFETCH, PARCOUNT, "pivots fetched for prefetch"); STATUS_INIT(FT_NUM_PIVOTS_FETCHED_PREFETCH, PARCOUNT, "pivots fetched for prefetch");
STATUS_INIT(FT_BYTES_PIVOTS_FETCHED_PREFETCH, PARCOUNT, "pivots fetched for prefetch (bytes)");
STATUS_INIT(FT_NANOTIME_PIVOTS_FETCHED_PREFETCH, PARCOUNT, "pivots fetched for prefetch (seconds)");
STATUS_INIT(FT_NUM_PIVOTS_FETCHED_WRITE, PARCOUNT, "pivots fetched for write"); STATUS_INIT(FT_NUM_PIVOTS_FETCHED_WRITE, PARCOUNT, "pivots fetched for write");
STATUS_INIT(FT_BYTES_PIVOTS_FETCHED_WRITE, PARCOUNT, "pivots fetched for write (bytes)");
STATUS_INIT(FT_NANOTIME_PIVOTS_FETCHED_WRITE, PARCOUNT, "pivots fetched for write (seconds)");
STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_NORMAL, PARCOUNT, "basements fetched as a target of a query"); STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_NORMAL, PARCOUNT, "basements fetched as a target of a query");
STATUS_INIT(FT_BYTES_BASEMENTS_FETCHED_NORMAL, PARCOUNT, "basements fetched as a target of a query (bytes)");
STATUS_INIT(FT_NANOTIME_BASEMENTS_FETCHED_NORMAL, PARCOUNT, "basements fetched as a target of a query (seconds)");
STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, PARCOUNT, "basements fetched for prelocked range"); STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, PARCOUNT, "basements fetched for prelocked range");
STATUS_INIT(FT_BYTES_BASEMENTS_FETCHED_AGGRESSIVE, PARCOUNT, "basements fetched for prelocked range (bytes)");
STATUS_INIT(FT_NANOTIME_BASEMENTS_FETCHED_AGGRESSIVE, PARCOUNT, "basements fetched for prelocked range (seconds)");
STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_PREFETCH, PARCOUNT, "basements fetched for prefetch"); STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_PREFETCH, PARCOUNT, "basements fetched for prefetch");
STATUS_INIT(FT_BYTES_BASEMENTS_FETCHED_PREFETCH, PARCOUNT, "basements fetched for prefetch (bytes)");
STATUS_INIT(FT_NANOTIME_BASEMENTS_FETCHED_PREFETCH, PARCOUNT, "basements fetched for prefetch (seconds)");
STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_WRITE, PARCOUNT, "basements fetched for write"); STATUS_INIT(FT_NUM_BASEMENTS_FETCHED_WRITE, PARCOUNT, "basements fetched for write");
STATUS_INIT(FT_BYTES_BASEMENTS_FETCHED_WRITE, PARCOUNT, "basements fetched for write (bytes)");
STATUS_INIT(FT_NANOTIME_BASEMENTS_FETCHED_WRITE, PARCOUNT, "basements fetched for write (seconds)");
STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_NORMAL, PARCOUNT, "buffers fetched as a target of a query"); STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_NORMAL, PARCOUNT, "buffers fetched as a target of a query");
STATUS_INIT(FT_BYTES_MSG_BUFFER_FETCHED_NORMAL, PARCOUNT, "buffers fetched as a target of a query (bytes)");
STATUS_INIT(FT_NANOTIME_MSG_BUFFER_FETCHED_NORMAL, PARCOUNT, "buffers fetched as a target of a query (seconds)");
STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, PARCOUNT, "buffers fetched for prelocked range"); STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, PARCOUNT, "buffers fetched for prelocked range");
STATUS_INIT(FT_BYTES_MSG_BUFFER_FETCHED_AGGRESSIVE, PARCOUNT, "buffers fetched for prelocked range (bytes)");
STATUS_INIT(FT_NANOTIME_MSG_BUFFER_FETCHED_AGGRESSIVE, PARCOUNT, "buffers fetched for prelocked range (seconds)");
STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_PREFETCH, PARCOUNT, "buffers fetched for prefetch"); STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_PREFETCH, PARCOUNT, "buffers fetched for prefetch");
STATUS_INIT(FT_BYTES_MSG_BUFFER_FETCHED_PREFETCH, PARCOUNT, "buffers fetched for prefetch (bytes)");
STATUS_INIT(FT_NANOTIME_MSG_BUFFER_FETCHED_PREFETCH, PARCOUNT, "buffers fetched for prefetch (seconds)");
STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_WRITE, PARCOUNT, "buffers fetched for write"); STATUS_INIT(FT_NUM_MSG_BUFFER_FETCHED_WRITE, PARCOUNT, "buffers fetched for write");
STATUS_INIT(FT_BYTES_MSG_BUFFER_FETCHED_WRITE, PARCOUNT, "buffers fetched for write (bytes)");
STATUS_INIT(FT_NANOTIME_MSG_BUFFER_FETCHED_WRITE, PARCOUNT, "buffers fetched for write (seconds)");
// Disk write statistics.
STATUS_INIT(FT_DISK_FLUSH_LEAF, PARCOUNT, "leaf nodes flushed to disk (not for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_NONLEAF, PARCOUNT, "nonleaf nodes flushed to disk (not for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_LEAF_FOR_CHECKPOINT, PARCOUNT, "leaf nodes flushed to disk (for checkpoint)");
STATUS_INIT(FT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT, PARCOUNT, "nonleaf nodes flushed to disk (for checkpoint)");
// Promotion statistics.
STATUS_INIT(FT_PRO_NUM_ROOT_SPLIT, PARCOUNT, "promotion: roots split"); STATUS_INIT(FT_PRO_NUM_ROOT_SPLIT, PARCOUNT, "promotion: roots split");
STATUS_INIT(FT_PRO_NUM_ROOT_H0_INJECT, PARCOUNT, "promotion: leaf roots injected into"); STATUS_INIT(FT_PRO_NUM_ROOT_H0_INJECT, PARCOUNT, "promotion: leaf roots injected into");
STATUS_INIT(FT_PRO_NUM_ROOT_H1_INJECT, PARCOUNT, "promotion: h1 roots injected into"); STATUS_INIT(FT_PRO_NUM_ROOT_H1_INJECT, PARCOUNT, "promotion: h1 roots injected into");
...@@ -777,10 +805,16 @@ toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe) ...@@ -777,10 +805,16 @@ toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe)
{ {
if (bfe->type == ftnode_fetch_prefetch) { if (bfe->type == ftnode_fetch_prefetch) {
STATUS_INC(FT_NUM_PIVOTS_FETCHED_PREFETCH, 1); STATUS_INC(FT_NUM_PIVOTS_FETCHED_PREFETCH, 1);
STATUS_INC(FT_BYTES_PIVOTS_FETCHED_PREFETCH, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_PIVOTS_FETCHED_PREFETCH, bfe->read_time);
} else if (bfe->type == ftnode_fetch_all) { } else if (bfe->type == ftnode_fetch_all) {
STATUS_INC(FT_NUM_PIVOTS_FETCHED_WRITE, 1); STATUS_INC(FT_NUM_PIVOTS_FETCHED_WRITE, 1);
STATUS_INC(FT_BYTES_PIVOTS_FETCHED_WRITE, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_PIVOTS_FETCHED_WRITE, bfe->read_time);
} else if (bfe->type == ftnode_fetch_subset) { } else if (bfe->type == ftnode_fetch_subset) {
STATUS_INC(FT_NUM_PIVOTS_FETCHED_QUERY, 1); STATUS_INC(FT_NUM_PIVOTS_FETCHED_QUERY, 1);
STATUS_INC(FT_BYTES_PIVOTS_FETCHED_QUERY, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_PIVOTS_FETCHED_QUERY, bfe->read_time);
} }
} }
...@@ -1055,24 +1089,32 @@ ft_status_update_partial_fetch_reason( ...@@ -1055,24 +1089,32 @@ ft_status_update_partial_fetch_reason(
STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH, 1); STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH, 1);
} else { } else {
STATUS_INC(FT_NUM_BASEMENTS_FETCHED_PREFETCH, 1); STATUS_INC(FT_NUM_BASEMENTS_FETCHED_PREFETCH, 1);
STATUS_INC(FT_BYTES_BASEMENTS_FETCHED_PREFETCH, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_BASEMENTS_FETCHED_PREFETCH, bfe->read_time);
} }
} else if (bfe->type == ftnode_fetch_all) { } else if (bfe->type == ftnode_fetch_all) {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_WRITE, 1); STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_WRITE, 1);
} else { } else {
STATUS_INC(FT_NUM_BASEMENTS_FETCHED_WRITE, 1); STATUS_INC(FT_NUM_BASEMENTS_FETCHED_WRITE, 1);
STATUS_INC(FT_BYTES_BASEMENTS_FETCHED_WRITE, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_BASEMENTS_FETCHED_WRITE, bfe->read_time);
} }
} else if (childnum == bfe->child_to_read) { } else if (childnum == bfe->child_to_read) {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, 1); STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, 1);
} else { } else {
STATUS_INC(FT_NUM_BASEMENTS_FETCHED_NORMAL, 1); STATUS_INC(FT_NUM_BASEMENTS_FETCHED_NORMAL, 1);
STATUS_INC(FT_BYTES_BASEMENTS_FETCHED_NORMAL, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_BASEMENTS_FETCHED_NORMAL, bfe->read_time);
} }
} else { } else {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE, 1); STATUS_INC(FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE, 1);
} else { } else {
STATUS_INC(FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, 1); STATUS_INC(FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, 1);
STATUS_INC(FT_BYTES_BASEMENTS_FETCHED_AGGRESSIVE, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_BASEMENTS_FETCHED_AGGRESSIVE, bfe->read_time);
} }
} }
} }
...@@ -1082,24 +1124,32 @@ ft_status_update_partial_fetch_reason( ...@@ -1082,24 +1124,32 @@ ft_status_update_partial_fetch_reason(
STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH, 1); STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH, 1);
} else { } else {
STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_PREFETCH, 1); STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_PREFETCH, 1);
STATUS_INC(FT_BYTES_MSG_BUFFER_FETCHED_PREFETCH, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_MSG_BUFFER_FETCHED_PREFETCH, bfe->read_time);
} }
} else if (bfe->type == ftnode_fetch_all) { } else if (bfe->type == ftnode_fetch_all) {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE, 1); STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE, 1);
} else { } else {
STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_WRITE, 1); STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_WRITE, 1);
STATUS_INC(FT_BYTES_MSG_BUFFER_FETCHED_WRITE, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_MSG_BUFFER_FETCHED_WRITE, bfe->read_time);
} }
} else if (childnum == bfe->child_to_read) { } else if (childnum == bfe->child_to_read) {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL, 1); STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL, 1);
} else { } else {
STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_NORMAL, 1); STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_NORMAL, 1);
STATUS_INC(FT_BYTES_MSG_BUFFER_FETCHED_NORMAL, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_MSG_BUFFER_FETCHED_NORMAL, bfe->read_time);
} }
} else { } else {
if (state == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, 1); STATUS_INC(FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, 1);
} else { } else {
STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, 1); STATUS_INC(FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, 1);
STATUS_INC(FT_BYTES_MSG_BUFFER_FETCHED_AGGRESSIVE, bfe->bytes_read);
STATUS_INC(FT_NANOTIME_MSG_BUFFER_FETCHED_AGGRESSIVE, bfe->read_time);
} }
} }
} }
...@@ -1127,23 +1177,23 @@ int toku_ftnode_pf_callback(void* ftnode_pv, void* disk_data, void* read_extraar ...@@ -1127,23 +1177,23 @@ int toku_ftnode_pf_callback(void* ftnode_pv, void* disk_data, void* read_extraar
lc = -1; lc = -1;
rc = -1; rc = -1;
} }
// TODO: possibly cilkify expensive operations in this loop
// TODO: review this with others to see if it can be made faster
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
if (BP_STATE(node,i) == PT_AVAIL) { if (BP_STATE(node,i) == PT_AVAIL) {
continue; continue;
} }
if ((lc <= i && i <= rc) || toku_bfe_wants_child_available(bfe, i)) { if ((lc <= i && i <= rc) || toku_bfe_wants_child_available(bfe, i)) {
ft_status_update_partial_fetch_reason(bfe, i, BP_STATE(node, i), (node->height == 0)); enum pt_state state = BP_STATE(node, i);
if (BP_STATE(node,i) == PT_COMPRESSED) { if (state == PT_COMPRESSED) {
r = toku_deserialize_bp_from_compressed(node, i, &bfe->h->cmp_descriptor, bfe->h->compare_fun); r = toku_deserialize_bp_from_compressed(node, i, &bfe->h->cmp_descriptor, bfe->h->compare_fun);
} } else {
else if (BP_STATE(node,i) == PT_ON_DISK) { invariant(state == PT_ON_DISK);
tokutime_t io_t0 = toku_time_now();
r = toku_deserialize_bp_from_disk(node, ndd, i, fd, bfe); r = toku_deserialize_bp_from_disk(node, ndd, i, fd, bfe);
tokutime_t io_t1 = toku_time_now();
bfe->bytes_read = BP_SIZE(ndd, i);
bfe->read_time = io_t1 - io_t0;
} }
else { ft_status_update_partial_fetch_reason(bfe, i, state, (node->height == 0));
abort();
}
} }
if (r != 0) { if (r != 0) {
......
...@@ -1132,7 +1132,7 @@ static const int read_header_heuristic_max = 32*1024; ...@@ -1132,7 +1132,7 @@ static const int read_header_heuristic_max = 32*1024;
#define MIN(a,b) (((a)>(b)) ? (b) : (a)) #define MIN(a,b) (((a)>(b)) ? (b) : (a))
#endif #endif
static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKNUM blocknum, FT h, struct rbuf *rb) static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKNUM blocknum, FT h, struct rbuf *rb, struct ftnode_fetch_extra *bfe)
// Effect: If the header part of the node is small enough, then read it into the rbuf. The rbuf will be allocated to be big enough in any case. // Effect: If the header part of the node is small enough, then read it into the rbuf. The rbuf will be allocated to be big enough in any case.
{ {
DISKOFF offset, size; DISKOFF offset, size;
...@@ -1142,11 +1142,15 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN ...@@ -1142,11 +1142,15 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN
rbuf_init(rb, raw_block, read_size); rbuf_init(rb, raw_block, read_size);
{ {
// read the block // read the block
tokutime_t io_t0 = toku_time_now();
ssize_t rlen = toku_os_pread(fd, raw_block, read_size, offset); ssize_t rlen = toku_os_pread(fd, raw_block, read_size, offset);
tokutime_t io_t1 = toku_time_now();
assert(rlen>=0); assert(rlen>=0);
rbuf_init(rb, raw_block, rlen); rbuf_init(rb, raw_block, rlen);
bfe->bytes_read = rlen;
bfe->read_time = io_t1 - io_t0;
toku_ft_status_update_pivot_fetch_reason(bfe);
} }
} }
// //
...@@ -1590,9 +1594,6 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode, ...@@ -1590,9 +1594,6 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
goto cleanup; goto cleanup;
} }
// We got the entire header and node info!
toku_ft_status_update_pivot_fetch_reason(bfe);
// Finish reading compressed the sub_block // Finish reading compressed the sub_block
bytevec* cp; bytevec* cp;
cp = (bytevec*)&sb_node_info.compressed_ptr; cp = (bytevec*)&sb_node_info.compressed_ptr;
...@@ -2409,8 +2410,7 @@ deserialize_ftnode_from_fd(int fd, ...@@ -2409,8 +2410,7 @@ deserialize_ftnode_from_fd(int fd,
STAT64INFO info) STAT64INFO info)
{ {
struct rbuf rb = RBUF_INITIALIZER; struct rbuf rb = RBUF_INITIALIZER;
read_block_from_fd_into_rbuf(fd, blocknum, bfe->h, &rb); read_block_from_fd_into_rbuf(fd, blocknum, bfe->h, &rb);
int r = deserialize_ftnode_from_rbuf(ftnode, ndd, blocknum, fullhash, bfe, info, &rb, fd); int r = deserialize_ftnode_from_rbuf(ftnode, ndd, blocknum, fullhash, bfe, info, &rb, fd);
if (r != 0) { if (r != 0) {
dump_bad_block(rb.buf,rb.size); dump_bad_block(rb.buf,rb.size);
...@@ -2433,7 +2433,7 @@ toku_deserialize_ftnode_from (int fd, ...@@ -2433,7 +2433,7 @@ toku_deserialize_ftnode_from (int fd,
toku_trace("deserial start"); toku_trace("deserial start");
int r = 0; int r = 0;
struct rbuf rb = RBUF_INITIALIZER; struct rbuf rb = RBUF_INITIALIZER;
read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb); read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb, bfe);
r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd); r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd);
if (r != 0) { if (r != 0) {
......
...@@ -15,7 +15,7 @@ add_library(${LIBTOKUPORTABILITY}_static STATIC ${tokuportability_srcs}) ...@@ -15,7 +15,7 @@ add_library(${LIBTOKUPORTABILITY}_static STATIC ${tokuportability_srcs})
maybe_add_gcov_to_libraries(${LIBTOKUPORTABILITY} ${LIBTOKUPORTABILITY}_static) maybe_add_gcov_to_libraries(${LIBTOKUPORTABILITY} ${LIBTOKUPORTABILITY}_static)
set_property(TARGET ${LIBTOKUPORTABILITY} ${LIBTOKUPORTABILITY}_static APPEND PROPERTY COMPILE_DEFINITIONS _GNU_SOURCE) set_property(TARGET ${LIBTOKUPORTABILITY} ${LIBTOKUPORTABILITY}_static APPEND PROPERTY COMPILE_DEFINITIONS _GNU_SOURCE)
set_target_properties(${LIBTOKUPORTABILITY}_static PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${LIBTOKUPORTABILITY}_static PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS}) target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC rt ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
set_property(SOURCE file memory os_malloc portability toku_assert toku_rwlock APPEND PROPERTY set_property(SOURCE file memory os_malloc portability toku_assert toku_rwlock APPEND PROPERTY
COMPILE_DEFINITIONS TOKU_ALLOW_DEPRECATED=1) COMPILE_DEFINITIONS TOKU_ALLOW_DEPRECATED=1)
......
...@@ -82,8 +82,8 @@ typedef uint64_t tokutime_t; // Time type used by tokutek timers. ...@@ -82,8 +82,8 @@ typedef uint64_t tokutime_t; // Time type used by tokutek timers.
// //
double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds. double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds.
// Get tokutime. We want this to be fast, so we expose the implementation as RDTSC. // Get the value of tokutime for right now. We want this to be fast, so we expose the implementation as RDTSC.
static inline tokutime_t get_tokutime (void) { static inline tokutime_t toku_time_now(void) {
uint32_t lo, hi; uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo; return (uint64_t)hi << 32 | lo;
...@@ -95,11 +95,4 @@ static inline uint64_t toku_current_time_microsec(void) { ...@@ -95,11 +95,4 @@ static inline uint64_t toku_current_time_microsec(void) {
return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec; return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec;
} }
// Effect: Return the current CLOCK_REALTIME reading expressed in nanoseconds.
//  Asserts that toku_clock_gettime() succeeds.
static inline uint64_t toku_current_time_nanosec(void) {
    struct timespec t;
    int r = toku_clock_gettime(CLOCK_REALTIME, &t);
    assert(r == 0);
    // Widen to 64 bits before multiplying: with a 32-bit time_t and a 32-bit
    // unsigned long, the product seconds * 10^9 would wrap.
    return (uint64_t) t.tv_sec * UINT64_C(1000000000) + (uint64_t) t.tv_nsec;
}
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment