Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mariadb
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
mariadb
Commits
9a6ba1aa
Commit
9a6ba1aa
authored
Jun 28, 2013
by
John Esmet
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refs #5770 Only check one basement node on pin, remove the assumption that adjacent
available nodes are query-able.
parent
06d56d51
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
349 additions
and
235 deletions
+349
-235
ft/ft-cachetable-wrappers.cc
ft/ft-cachetable-wrappers.cc
+85
-54
ft/ft-cachetable-wrappers.h
ft/ft-cachetable-wrappers.h
+0
-1
ft/ft-internal.h
ft/ft-internal.h
+3
-10
ft/ft-ops.cc
ft/ft-ops.cc
+258
-167
ft/tests/orthopush-flush.cc
ft/tests/orthopush-flush.cc
+3
-3
No files found.
ft/ft-cachetable-wrappers.cc
View file @
9a6ba1aa
...
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
...
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
NULL
);
NULL
);
}
}
//
// On success, this function assumes that the caller is trying to pin the node
// with a PL_READ lock. If message application is needed,
// then a PL_WRITE_CHEAP lock is grabbed
//
int
int
toku_pin_ftnode_batched
(
toku_pin_ftnode_batched
(
FT_HANDLE
brt
,
FT_HANDLE
brt
,
...
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
...
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
ANCESTORS
ancestors
,
ANCESTORS
ancestors
,
const
PIVOT_BOUNDS
bounds
,
const
PIVOT_BOUNDS
bounds
,
FTNODE_FETCH_EXTRA
bfe
,
FTNODE_FETCH_EXTRA
bfe
,
pair_lock_type
lock_type
,
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE
*
node_p
,
FTNODE
*
node_p
,
bool
*
msgs_applied
)
bool
*
msgs_applied
)
{
{
void
*
node_v
;
void
*
node_v
;
*
msgs_applied
=
false
;
*
msgs_applied
=
false
;
pair_lock_type
needed_lock_type
=
lock_type
;
FTNODE
node
=
nullptr
;
try_again_for_write_lock:
MSN
max_msn_in_path
=
ZERO_MSN
;
bool
needs_ancestors_messages
=
false
;
// this function assumes that if you want ancestor messages applied,
// you are doing a read for a query. This is so we can make some optimizations
// below.
if
(
apply_ancestor_messages
)
{
paranoid_invariant
(
bfe
->
type
==
ftnode_fetch_subset
);
}
int
r
=
toku_cachetable_get_and_pin_nonblocking_batched
(
int
r
=
toku_cachetable_get_and_pin_nonblocking_batched
(
brt
->
ft
->
cf
,
brt
->
ft
->
cf
,
blocknum
,
blocknum
,
...
@@ -221,25 +233,52 @@ try_again_for_write_lock:
...
@@ -221,25 +233,52 @@ try_again_for_write_lock:
toku_ftnode_fetch_callback
,
toku_ftnode_fetch_callback
,
toku_ftnode_pf_req_callback
,
toku_ftnode_pf_req_callback
,
toku_ftnode_pf_callback
,
toku_ftnode_pf_callback
,
needed_lock_type
,
PL_READ
,
bfe
,
//read_extraargs
bfe
,
//read_extraargs
unlockers
);
unlockers
);
if
(
r
==
0
)
{
if
(
r
!=
0
)
{
FTNODE
node
=
static_cast
<
FTNODE
>
(
node_v
);
assert
(
r
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
MSN
max_msn_in_path
;
goto
exit
;
bool
needs_ancestors_messages
=
false
;
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
needs_ancestors_messages
=
toku_ft_leaf_needs_ancestors_messages
(
brt
->
ft
,
node
,
ancestors
,
bounds
,
&
max_msn_in_path
);
if
(
needs_ancestors_messages
&&
needed_lock_type
==
PL_READ
)
{
toku_unpin_ftnode_read_only
(
brt
->
ft
,
node
);
needed_lock_type
=
PL_WRITE_CHEAP
;
goto
try_again_for_write_lock
;
}
}
}
node
=
static_cast
<
FTNODE
>
(
node_v
);
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
needs_ancestors_messages
=
toku_ft_leaf_needs_ancestors_messages
(
brt
->
ft
,
node
,
ancestors
,
bounds
,
&
max_msn_in_path
,
bfe
->
child_to_read
);
if
(
needs_ancestors_messages
)
{
if
(
needs_ancestors_messages
)
{
invariant
(
needed_lock_type
!=
PL_READ
);
toku_unpin_ftnode_read_only
(
brt
->
ft
,
node
);
toku_apply_ancestors_messages_to_node
(
brt
,
node
,
ancestors
,
bounds
,
msgs_applied
);
int
rr
=
toku_cachetable_get_and_pin_nonblocking_batched
(
brt
->
ft
->
cf
,
blocknum
,
fullhash
,
&
node_v
,
NULL
,
get_write_callbacks_for_node
(
brt
->
ft
),
toku_ftnode_fetch_callback
,
toku_ftnode_pf_req_callback
,
toku_ftnode_pf_callback
,
PL_WRITE_CHEAP
,
bfe
,
//read_extraargs
unlockers
);
if
(
rr
!=
0
)
{
assert
(
rr
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
r
=
TOKUDB_TRY_AGAIN
;
goto
exit
;
}
node
=
static_cast
<
FTNODE
>
(
node_v
);
toku_apply_ancestors_messages_to_node
(
brt
,
node
,
ancestors
,
bounds
,
msgs_applied
,
bfe
->
child_to_read
);
}
else
{
}
else
{
// At this point, we aren't going to run
// At this point, we aren't going to run
// toku_apply_ancestors_messages_to_node but that doesn't
// toku_apply_ancestors_messages_to_node but that doesn't
...
@@ -264,20 +303,12 @@ try_again_for_write_lock:
...
@@ -264,20 +303,12 @@ try_again_for_write_lock:
// requires a write lock, and a write lock requires you to
// requires a write lock, and a write lock requires you to
// resolve checkpointing.
// resolve checkpointing.
if
(
!
node
->
dirty
)
{
if
(
!
node
->
dirty
)
{
toku_ft_bn_update_max_msn
(
node
,
max_msn_in_path
);
toku_ft_bn_update_max_msn
(
node
,
max_msn_in_path
,
bfe
->
child_to_read
);
}
}
}
}
invariant
(
needed_lock_type
!=
PL_READ
||
!*
msgs_applied
);
}
if
((
lock_type
!=
PL_READ
)
&&
node
->
height
>
0
)
{
toku_move_ftnode_messages_to_stale
(
brt
->
ft
,
node
);
}
}
*
node_p
=
node
;
*
node_p
=
node
;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
exit:
}
else
{
assert
(
r
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
// printf("%*sPin %ld try again\n", 8, "", blocknum.b);
}
return
r
;
return
r
;
}
}
...
...
ft/ft-cachetable-wrappers.h
View file @
9a6ba1aa
...
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
...
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
ANCESTORS
ancestors
,
ANCESTORS
ancestors
,
const
PIVOT_BOUNDS
pbounds
,
const
PIVOT_BOUNDS
pbounds
,
FTNODE_FETCH_EXTRA
bfe
,
FTNODE_FETCH_EXTRA
bfe
,
pair_lock_type
lock_type
,
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE
*
node_p
,
FTNODE
*
node_p
,
bool
*
msgs_applied
bool
*
msgs_applied
...
...
ft/ft-internal.h
View file @
9a6ba1aa
...
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
...
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
#define VERIFY_NODE(t,n) ((void)0)
#define VERIFY_NODE(t,n) ((void)0)
#endif
#endif
//#define FT_TRACE
#ifdef FT_TRACE
#define WHEN_FTTRACE(x) x
#else
#define WHEN_FTTRACE(x) ((void)0)
#endif
void
toku_ft_status_update_pivot_fetch_reason
(
struct
ftnode_fetch_extra
*
bfe
);
void
toku_ft_status_update_pivot_fetch_reason
(
struct
ftnode_fetch_extra
*
bfe
);
void
toku_ft_status_update_flush_reason
(
FTNODE
node
,
uint64_t
uncompressed_bytes_flushed
,
uint64_t
bytes_written
,
tokutime_t
write_time
,
bool
for_checkpoint
);
void
toku_ft_status_update_flush_reason
(
FTNODE
node
,
uint64_t
uncompressed_bytes_flushed
,
uint64_t
bytes_written
,
tokutime_t
write_time
,
bool
for_checkpoint
);
void
toku_ft_status_update_serialize_times
(
FTNODE
node
,
tokutime_t
serialize_time
,
tokutime_t
compress_time
);
void
toku_ft_status_update_serialize_times
(
FTNODE
node
,
tokutime_t
serialize_time
,
tokutime_t
compress_time
);
...
@@ -982,11 +975,11 @@ struct pivot_bounds {
...
@@ -982,11 +975,11 @@ struct pivot_bounds {
__attribute__
((
nonnull
))
__attribute__
((
nonnull
))
void
toku_move_ftnode_messages_to_stale
(
FT
ft
,
FTNODE
node
);
void
toku_move_ftnode_messages_to_stale
(
FT
ft
,
FTNODE
node
);
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
);
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
,
int
child_to_read
);
__attribute__
((
nonnull
))
__attribute__
((
nonnull
))
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
);
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
,
int
child_to_read
);
__attribute__
((
nonnull
))
__attribute__
((
nonnull
))
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
);
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
,
int
child_to_read
);
__attribute__
((
const
,
nonnull
))
__attribute__
((
const
,
nonnull
))
size_t
toku_ft_msg_memsize_in_fifo
(
FT_MSG
cmd
);
size_t
toku_ft_msg_memsize_in_fifo
(
FT_MSG
cmd
);
...
...
ft/ft-ops.cc
View file @
9a6ba1aa
...
@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
...
@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
}
}
}
}
static
void
apply_ancestors_messages_to_bn
(
FT_HANDLE
t
,
FTNODE
node
,
int
childnum
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
TXNID
oldest_referenced_xid
,
bool
*
msgs_applied
)
{
BASEMENTNODE
curr_bn
=
BLB
(
node
,
childnum
);
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
childnum
,
bounds
);
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
curr_bn
->
max_msn_applied
.
msn
)
{
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
bnc_apply_messages_to_basement_node
(
t
,
curr_bn
,
curr_ancestors
->
node
,
curr_ancestors
->
childnum
,
&
curr_bounds
,
oldest_referenced_xid
,
msgs_applied
);
// We don't want to check this ancestor node again if the
// next time we query it, the msn hasn't changed.
curr_bn
->
max_msn_applied
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
;
}
}
// At this point, we know all the stale messages above this
// basement node have been applied, and any new messages will be
// fresh, so we don't need to look at stale messages for this
// basement node, unless it gets evicted (and this field becomes
// false when it's read in again).
curr_bn
->
stale_ancestor_messages_applied
=
true
;
}
void
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
)
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
,
int
child_to_read
)
// Effect:
// Effect:
// Bring a leaf node up-to-date according to all the messages in the ancestors.
// Bring a leaf node up-to-date according to all the messages in the ancestors.
// If the leaf node is already up-to-date then do nothing.
// If the leaf node is already up-to-date then do nothing.
...
@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
...
@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
// The entire root-to-leaf path is pinned and appears in the ancestors list.
// The entire root-to-leaf path is pinned and appears in the ancestors list.
{
{
VERIFY_NODE
(
t
,
node
);
VERIFY_NODE
(
t
,
node
);
invariant
(
node
->
height
==
0
);
paranoid_
invariant
(
node
->
height
==
0
);
TXNID
oldest_referenced_xid
=
ancestors
->
node
->
oldest_referenced_xid_known
;
TXNID
oldest_referenced_xid
=
ancestors
->
node
->
oldest_referenced_xid_known
;
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
...
@@ -4530,69 +4575,53 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
...
@@ -4530,69 +4575,53 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
}
}
}
}
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
apply_ancestors_messages_to_bn
(
t
,
node
,
child_to_read
,
ancestors
,
bounds
,
oldest_referenced_xid
,
msgs_applied
);
}
else
{
// know we are a leaf node
// know we are a leaf node
// An important invariant:
// An important invariant:
// We MUST bring every available basement
node up to date.
// We MUST bring every available basement node for a dirty
node up to date.
// flushing on the cleaner thread depends on this. This invariant
// flushing on the cleaner thread depends on this. This invariant
// allows the cleaner thread to just pick an internal node and flush it
// allows the cleaner thread to just pick an internal node and flush it
// as opposed to being forced to start from the root.
// as opposed to being forced to start from the root.
for
(
int
i
=
0
;
i
<
node
->
n_children
;
i
++
)
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
i
++
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
curr_bn
=
BLB
(
node
,
i
);
apply_ancestors_messages_to_bn
(
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
i
,
bounds
);
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
curr_bn
->
max_msn_applied
.
msn
)
{
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
bnc_apply_messages_to_basement_node
(
t
,
t
,
curr_bn
,
node
,
curr_ancestors
->
node
,
i
,
curr_ancestors
->
childnum
,
ancestors
,
&
curr_
bounds
,
bounds
,
oldest_referenced_xid
,
oldest_referenced_xid
,
msgs_applied
msgs_applied
);
);
// We don't want to check this ancestor node again if the
// next time we query it, the msn hasn't changed.
curr_bn
->
max_msn_applied
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
;
}
}
}
}
// At this point, we know all the stale messages above this
// basement node have been applied, and any new messages will be
// fresh, so we don't need to look at stale messages for this
// basement node, unless it gets evicted (and this field becomes
// false when it's read in again).
curr_bn
->
stale_ancestor_messages_applied
=
true
;
}
VERIFY_NODE
(
t
,
node
);
VERIFY_NODE
(
t
,
node
);
}
}
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
)
static
bool
bn_needs_ancestors_messages
(
// Effect: Determine whether there are messages in a node's ancestors
FT
ft
,
// which must be applied to it. These messages are in the correct
FTNODE
node
,
// keyrange for any available basement nodes, and are in nodes with the
int
childnum
,
// correct max_msn_applied_to_node_on_disk.
struct
pivot_bounds
const
*
const
bounds
,
// Notes:
ANCESTORS
ancestors
,
// This is an approximate query.
MSN
*
max_msn_applied
// Output:
)
// max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
// ancestors. This is used later to update basement nodes'
// max_msn_applied values in case we don't do the full algorithm.
// Returns:
// true if there may be some such messages
// false only if there are definitely no such messages
// Rationale:
// When we pin a node with a read lock, we want to quickly determine if
// we should exchange it for a write lock in preparation for applying
// messages. If there are no messages, we don't need the write lock.
{
{
invariant
(
node
->
height
==
0
);
BASEMENTNODE
bn
=
BLB
(
node
,
childnum
);
MSN
max_msn_applied
=
ZERO_MSN
;
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
childnum
,
bounds
)
;
bool
needs_ancestors_messages
=
false
;
bool
needs_ancestors_messages
=
false
;
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
i
,
bounds
);
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
...
@@ -4627,19 +4656,86 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
...
@@ -4627,19 +4656,86 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
needs_ancestors_messages
=
true
;
needs_ancestors_messages
=
true
;
goto
cleanup
;
goto
cleanup
;
}
}
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
max_msn_applied
.
msn
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
max_msn_applied
->
msn
)
{
max_msn_applied
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
;
max_msn_applied
->
msn
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
;
}
}
}
cleanup:
return
needs_ancestors_messages
;
}
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
,
int
child_to_read
)
// Effect: Determine whether there are messages in a node's ancestors
// which must be applied to it. These messages are in the correct
// keyrange for any available basement nodes, and are in nodes with the
// correct max_msn_applied_to_node_on_disk.
// Notes:
// This is an approximate query.
// Output:
// max_msn_in_path: max of "max_msn_applied_to_node_on_disk" over
// ancestors. This is used later to update basement nodes'
// max_msn_applied values in case we don't do the full algorithm.
// Returns:
// true if there may be some such messages
// false only if there are definitely no such messages
// Rationale:
// When we pin a node with a read lock, we want to quickly determine if
// we should exchange it for a write lock in preparation for applying
// messages. If there are no messages, we don't need the write lock.
{
paranoid_invariant
(
node
->
height
==
0
);
bool
needs_ancestors_messages
=
false
;
// child_to_read may be -1 in test cases
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
needs_ancestors_messages
=
bn_needs_ancestors_messages
(
ft
,
node
,
child_to_read
,
bounds
,
ancestors
,
max_msn_in_path
);
}
}
else
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
needs_ancestors_messages
=
bn_needs_ancestors_messages
(
ft
,
node
,
i
,
bounds
,
ancestors
,
max_msn_in_path
);
if
(
needs_ancestors_messages
)
{
goto
cleanup
;
}
}
}
}
}
}
*
max_msn_in_path
=
max_msn_applied
;
cleanup:
cleanup:
return
needs_ancestors_messages
;
return
needs_ancestors_messages
;
}
}
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
)
{
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
,
int
child_to_read
)
{
invariant
(
node
->
height
==
0
);
invariant
(
node
->
height
==
0
);
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
BASEMENTNODE
bn
=
BLB
(
node
,
child_to_read
);
if
(
max_msn_applied
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
// see comment below
(
void
)
toku_sync_val_compare_and_swap
(
&
bn
->
max_msn_applied
.
msn
,
bn
->
max_msn_applied
.
msn
,
max_msn_applied
.
msn
);
}
}
else
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
...
@@ -4652,6 +4748,7 @@ void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
...
@@ -4652,6 +4748,7 @@ void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied) {
(
void
)
toku_sync_val_compare_and_swap
(
&
bn
->
max_msn_applied
.
msn
,
bn
->
max_msn_applied
.
msn
,
max_msn_applied
.
msn
);
(
void
)
toku_sync_val_compare_and_swap
(
&
bn
->
max_msn_applied
.
msn
,
bn
->
max_msn_applied
.
msn
,
max_msn_applied
.
msn
);
}
}
}
}
}
}
}
struct
copy_to_stale_extra
{
struct
copy_to_stale_extra
{
...
@@ -4779,6 +4876,11 @@ got_a_good_value:
...
@@ -4779,6 +4876,11 @@ got_a_good_value:
ftcursor
->
leaf_info
.
to_be
.
omt
=
bn
->
buffer
;
ftcursor
->
leaf_info
.
to_be
.
omt
=
bn
->
buffer
;
ftcursor
->
leaf_info
.
to_be
.
index
=
idx
;
ftcursor
->
leaf_info
.
to_be
.
index
=
idx
;
//
// IMPORTANT: bulk fetch CANNOT go past the current basement node,
// because there is no guarantee that messages have been applied
// to other basement nodes, as part of #5770
//
if
(
r
==
TOKUDB_CURSOR_CONTINUE
&&
can_bulk_fetch
)
{
if
(
r
==
TOKUDB_CURSOR_CONTINUE
&&
can_bulk_fetch
)
{
r
=
ft_cursor_shortcut
(
r
=
ft_cursor_shortcut
(
ftcursor
,
ftcursor
,
...
@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
...
@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
BLOCKNUM
childblocknum
=
BP_BLOCKNUM
(
node
,
childnum
);
BLOCKNUM
childblocknum
=
BP_BLOCKNUM
(
node
,
childnum
);
uint32_t
fullhash
=
compute_child_fullhash
(
brt
->
ft
->
cf
,
node
,
childnum
);
uint32_t
fullhash
=
compute_child_fullhash
(
brt
->
ft
->
cf
,
node
,
childnum
);
FTNODE
childnode
;
FTNODE
childnode
=
nullptr
;
// If the current node's height is greater than 1, then its child is an internal node.
// If the current node's height is greater than 1, then its child is an internal node.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
...
@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
...
@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
unlockers
,
unlockers
,
&
next_ancestors
,
bounds
,
&
next_ancestors
,
bounds
,
&
bfe
,
&
bfe
,
PL_READ
,
// we try to get a read lock, but we may upgrade to a write lock on a leaf for message application.
true
,
true
,
&
childnode
,
&
childnode
,
&
msgs_applied
);
&
msgs_applied
);
...
@@ -5090,19 +5191,6 @@ ft_search_node(
...
@@ -5090,19 +5191,6 @@ ft_search_node(
// At this point, we must have the necessary partition available to continue the search
// At this point, we must have the necessary partition available to continue the search
//
//
assert
(
BP_STATE
(
node
,
child_to_search
)
==
PT_AVAIL
);
assert
(
BP_STATE
(
node
,
child_to_search
)
==
PT_AVAIL
);
while
(
child_to_search
>=
0
&&
child_to_search
<
node
->
n_children
)
{
//
// Normally, the child we want to use is available, as we checked
// before entering this while loop. However, if we pass through
// the loop once, getting DB_NOTFOUND for this first value
// of child_to_search, we enter the while loop again with a
// child_to_search that may not be in memory. If it is not,
// we need to return TOKUDB_TRY_AGAIN so the query can
// read the appropriate partition into memory
//
if
(
BP_STATE
(
node
,
child_to_search
)
!=
PT_AVAIL
)
{
return
TOKUDB_TRY_AGAIN
;
}
const
struct
pivot_bounds
next_bounds
=
next_pivot_keys
(
node
,
child_to_search
,
bounds
);
const
struct
pivot_bounds
next_bounds
=
next_pivot_keys
(
node
,
child_to_search
,
bounds
);
if
(
node
->
height
>
0
)
{
if
(
node
->
height
>
0
)
{
r
=
ft_search_child
(
r
=
ft_search_child
(
...
@@ -5131,7 +5219,9 @@ ft_search_node(
...
@@ -5131,7 +5219,9 @@ ft_search_node(
can_bulk_fetch
can_bulk_fetch
);
);
}
}
if
(
r
==
0
)
return
r
;
//Success
if
(
r
==
0
)
{
return
r
;
//Success
}
if
(
r
!=
DB_NOTFOUND
)
{
if
(
r
!=
DB_NOTFOUND
)
{
return
r
;
//Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
return
r
;
//Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
...
@@ -5143,17 +5233,19 @@ ft_search_node(
...
@@ -5143,17 +5233,19 @@ ft_search_node(
// we have a new pivotkey
// we have a new pivotkey
if
(
node
->
height
==
0
)
{
if
(
node
->
height
==
0
)
{
// when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
// when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
const
DBT
*
pivot
=
NULL
;
const
DBT
*
pivot
=
nullptr
;
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
{
pivot
=
next_bounds
.
upper_bound_inclusive
;
// left -> right
pivot
=
next_bounds
.
upper_bound_inclusive
;
// left -> right
else
}
else
{
pivot
=
next_bounds
.
lower_bound_exclusive
;
// right -> left
pivot
=
next_bounds
.
lower_bound_exclusive
;
// right -> left
if
(
pivot
)
{
}
int
rr
=
getf
(
pivot
->
size
,
pivot
->
data
,
0
,
NULL
,
getf_v
,
true
);
if
(
pivot
!=
nullptr
)
{
if
(
rr
!=
0
)
int
rr
=
getf
(
pivot
->
size
,
pivot
->
data
,
0
,
nullptr
,
getf_v
,
true
);
if
(
rr
!=
0
)
{
return
rr
;
// lock was not granted
return
rr
;
// lock was not granted
}
}
}
}
}
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
...
@@ -5162,15 +5254,15 @@ ft_search_node(
...
@@ -5162,15 +5254,15 @@ ft_search_node(
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object.
// So save the pivot key in the search object.
maybe_search_save_bound
(
node
,
child_to_search
,
search
);
maybe_search_save_bound
(
node
,
child_to_search
,
search
);
// as part of #5770, if we can continue searching,
// We're about to pin some more nodes, but we thought we were done before.
// we MUST return TOKUDB_TRY_AGAIN,
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
{
// because there is no guarantee that messages have been applied
child_to_search
++
;
// on any other path.
}
if
((
search
->
direction
==
FT_SEARCH_LEFT
&&
child_to_search
<
node
->
n_children
-
1
)
||
else
{
(
search
->
direction
==
FT_SEARCH_RIGHT
&&
child_to_search
>
0
))
{
child_to_search
--
;
r
=
TOKUDB_TRY_AGAIN
;
}
}
}
return
r
;
return
r
;
}
}
...
@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
...
@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
&
next_ancestors
,
&
next_ancestors
,
bounds
,
bounds
,
child_may_find_right
?
match_bfe
:
min_bfe
,
child_may_find_right
?
match_bfe
:
min_bfe
,
PL_READ
,
// may_modify_node is false, because node guaranteed to not change
false
,
false
,
&
childnode
,
&
childnode
,
&
msgs_applied
&
msgs_applied
...
@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
...
@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
uint32_t
fullhash
=
compute_child_fullhash
(
ft
->
cf
,
node
,
childnum
);
uint32_t
fullhash
=
compute_child_fullhash
(
ft
->
cf
,
node
,
childnum
);
FTNODE
child
;
FTNODE
child
;
bool
msgs_applied
=
false
;
bool
msgs_applied
=
false
;
r
=
toku_pin_ftnode_batched
(
ft_h
,
childblocknum
,
fullhash
,
unlockers
,
&
next_ancestors
,
bounds
,
bfe
,
PL_READ
,
false
,
&
child
,
&
msgs_applied
);
r
=
toku_pin_ftnode_batched
(
ft_h
,
childblocknum
,
fullhash
,
unlockers
,
&
next_ancestors
,
bounds
,
bfe
,
false
,
&
child
,
&
msgs_applied
);
paranoid_invariant
(
!
msgs_applied
);
paranoid_invariant
(
!
msgs_applied
);
if
(
r
==
TOKUDB_TRY_AGAIN
)
{
if
(
r
==
TOKUDB_TRY_AGAIN
)
{
return
r
;
return
r
;
...
...
ft/tests/orthopush-flush.cc
View file @
9a6ba1aa
...
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
...
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
bool
msgs_applied
;
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
{
...
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
...
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
.
upper_bound_inclusive
=
toku_clone_dbt
(
&
ubi
,
childkeys
[
7
])
.
upper_bound_inclusive
=
toku_clone_dbt
(
&
ubi
,
childkeys
[
7
])
};
};
bool
msgs_applied
;
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
{
...
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
...
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
bool
msgs_applied
;
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child2
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child2
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment