Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
MariaDB
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nexedi
MariaDB
Commits
9a6ba1aa
Commit
9a6ba1aa
authored
Jun 28, 2013
by
John Esmet
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refs #5770 Only check one basement node on pin, and remove the assumption that adjacent
available nodes are queryable.
parent
06d56d51
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
349 additions
and
235 deletions
+349
-235
ft/ft-cachetable-wrappers.cc
ft/ft-cachetable-wrappers.cc
+85
-54
ft/ft-cachetable-wrappers.h
ft/ft-cachetable-wrappers.h
+0
-1
ft/ft-internal.h
ft/ft-internal.h
+3
-10
ft/ft-ops.cc
ft/ft-ops.cc
+258
-167
ft/tests/orthopush-flush.cc
ft/tests/orthopush-flush.cc
+3
-3
No files found.
ft/ft-cachetable-wrappers.cc
View file @
9a6ba1aa
...
...
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
NULL
);
}
//
// On success, this function assumes that the caller is trying to pin the node
// with a PL_READ lock. If message application is needed,
// then a PL_WRITE_CHEAP lock is grabbed
//
int
toku_pin_ftnode_batched
(
FT_HANDLE
brt
,
...
...
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
ANCESTORS
ancestors
,
const
PIVOT_BOUNDS
bounds
,
FTNODE_FETCH_EXTRA
bfe
,
pair_lock_type
lock_type
,
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE
*
node_p
,
bool
*
msgs_applied
)
{
void
*
node_v
;
*
msgs_applied
=
false
;
pair_lock_type
needed_lock_type
=
lock_type
;
try_again_for_write_lock:
FTNODE
node
=
nullptr
;
MSN
max_msn_in_path
=
ZERO_MSN
;
bool
needs_ancestors_messages
=
false
;
// this function assumes that if you want ancestor messages applied,
// you are doing a read for a query. This is so we can make some optimizations
// below.
if
(
apply_ancestor_messages
)
{
paranoid_invariant
(
bfe
->
type
==
ftnode_fetch_subset
);
}
int
r
=
toku_cachetable_get_and_pin_nonblocking_batched
(
brt
->
ft
->
cf
,
blocknum
,
...
...
@@ -221,63 +233,82 @@ toku_pin_ftnode_batched(
toku_ftnode_fetch_callback
,
toku_ftnode_pf_req_callback
,
toku_ftnode_pf_callback
,
needed_lock_type
,
PL_READ
,
bfe
,
//read_extraargs
unlockers
);
if
(
r
==
0
)
{
FTNODE
node
=
static_cast
<
FTNODE
>
(
node_v
);
MSN
max_msn_in_path
;
bool
needs_ancestors_messages
=
false
;
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
needs_ancestors_messages
=
toku_ft_leaf_needs_ancestors_messages
(
brt
->
ft
,
node
,
ancestors
,
bounds
,
&
max_msn_in_path
);
if
(
needs_ancestors_messages
&&
needed_lock_type
==
PL_READ
)
{
toku_unpin_ftnode_read_only
(
brt
->
ft
,
node
);
needed_lock_type
=
PL_WRITE_CHEAP
;
goto
try_again_for_write_lock
;
if
(
r
!=
0
)
{
assert
(
r
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
goto
exit
;
}
node
=
static_cast
<
FTNODE
>
(
node_v
);
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
needs_ancestors_messages
=
toku_ft_leaf_needs_ancestors_messages
(
brt
->
ft
,
node
,
ancestors
,
bounds
,
&
max_msn_in_path
,
bfe
->
child_to_read
);
if
(
needs_ancestors_messages
)
{
toku_unpin_ftnode_read_only
(
brt
->
ft
,
node
);
int
rr
=
toku_cachetable_get_and_pin_nonblocking_batched
(
brt
->
ft
->
cf
,
blocknum
,
fullhash
,
&
node_v
,
NULL
,
get_write_callbacks_for_node
(
brt
->
ft
),
toku_ftnode_fetch_callback
,
toku_ftnode_pf_req_callback
,
toku_ftnode_pf_callback
,
PL_WRITE_CHEAP
,
bfe
,
//read_extraargs
unlockers
);
if
(
rr
!=
0
)
{
assert
(
rr
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
r
=
TOKUDB_TRY_AGAIN
;
goto
exit
;
}
}
if
(
apply_ancestor_messages
&&
node
->
height
==
0
)
{
if
(
needs_ancestors_messages
)
{
invariant
(
needed_lock_type
!=
PL_READ
);
toku_apply_ancestors_messages_to_node
(
brt
,
node
,
ancestors
,
bounds
,
msgs_applied
);
}
else
{
// At this point, we aren't going to run
// toku_apply_ancestors_messages_to_node but that doesn't
// mean max_msn_applied shouldn't be updated if possible
// (this saves the CPU work involved in
// toku_ft_leaf_needs_ancestors_messages).
//
// We still have a read lock, so we have not resolved
// checkpointing. If the node is pending and dirty, we
// can't modify anything, including max_msn, until we
// resolve checkpointing. If we do, the node might get
// written out that way as part of a checkpoint with a
// root that was already written out with a smaller
// max_msn. During recovery, we would then inject a
// message based on the root's max_msn, and that message
// would get filtered by the leaf because it had too high
// a max_msn value. (see #5407)
//
// So for simplicity we only update the max_msn if the
// node is clean. That way, in order for the node to get
// written out, it would have to be dirtied. That
// requires a write lock, and a write lock requires you to
// resolve checkpointing.
if
(
!
node
->
dirty
)
{
toku_ft_bn_update_max_msn
(
node
,
max_msn_in_path
);
}
node
=
static_cast
<
FTNODE
>
(
node_v
);
toku_apply_ancestors_messages_to_node
(
brt
,
node
,
ancestors
,
bounds
,
msgs_applied
,
bfe
->
child_to_read
);
}
else
{
// At this point, we aren't going to run
// toku_apply_ancestors_messages_to_node but that doesn't
// mean max_msn_applied shouldn't be updated if possible
// (this saves the CPU work involved in
// toku_ft_leaf_needs_ancestors_messages).
//
// We still have a read lock, so we have not resolved
// checkpointing. If the node is pending and dirty, we
// can't modify anything, including max_msn, until we
// resolve checkpointing. If we do, the node might get
// written out that way as part of a checkpoint with a
// root that was already written out with a smaller
// max_msn. During recovery, we would then inject a
// message based on the root's max_msn, and that message
// would get filtered by the leaf because it had too high
// a max_msn value. (see #5407)
//
// So for simplicity we only update the max_msn if the
// node is clean. That way, in order for the node to get
// written out, it would have to be dirtied. That
// requires a write lock, and a write lock requires you to
// resolve checkpointing.
if
(
!
node
->
dirty
)
{
toku_ft_bn_update_max_msn
(
node
,
max_msn_in_path
,
bfe
->
child_to_read
);
}
invariant
(
needed_lock_type
!=
PL_READ
||
!*
msgs_applied
);
}
if
((
lock_type
!=
PL_READ
)
&&
node
->
height
>
0
)
{
toku_move_ftnode_messages_to_stale
(
brt
->
ft
,
node
);
}
*
node_p
=
node
;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
}
else
{
assert
(
r
==
TOKUDB_TRY_AGAIN
);
// Any other error and we should bomb out ASAP.
// printf("%*sPin %ld try again\n", 8, "", blocknum.b);
}
*
node_p
=
node
;
exit:
return
r
;
}
...
...
ft/ft-cachetable-wrappers.h
View file @
9a6ba1aa
...
...
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
ANCESTORS
ancestors
,
const
PIVOT_BOUNDS
pbounds
,
FTNODE_FETCH_EXTRA
bfe
,
pair_lock_type
lock_type
,
bool
apply_ancestor_messages
,
// this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE
*
node_p
,
bool
*
msgs_applied
...
...
ft/ft-internal.h
View file @
9a6ba1aa
...
...
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
#define VERIFY_NODE(t,n) ((void)0)
#endif
//#define FT_TRACE
#ifdef FT_TRACE
#define WHEN_FTTRACE(x) x
#else
#define WHEN_FTTRACE(x) ((void)0)
#endif
void
toku_ft_status_update_pivot_fetch_reason
(
struct
ftnode_fetch_extra
*
bfe
);
void
toku_ft_status_update_flush_reason
(
FTNODE
node
,
uint64_t
uncompressed_bytes_flushed
,
uint64_t
bytes_written
,
tokutime_t
write_time
,
bool
for_checkpoint
);
void
toku_ft_status_update_serialize_times
(
FTNODE
node
,
tokutime_t
serialize_time
,
tokutime_t
compress_time
);
...
...
@@ -982,11 +975,11 @@ struct pivot_bounds {
__attribute__
((
nonnull
))
void
toku_move_ftnode_messages_to_stale
(
FT
ft
,
FTNODE
node
);
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
);
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
,
int
child_to_read
);
__attribute__
((
nonnull
))
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
);
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
,
int
child_to_read
);
__attribute__
((
nonnull
))
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
);
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
,
int
child_to_read
);
__attribute__
((
const
,
nonnull
))
size_t
toku_ft_msg_memsize_in_fifo
(
FT_MSG
cmd
);
...
...
ft/ft-ops.cc
View file @
9a6ba1aa
...
...
@@ -4509,8 +4509,53 @@ bnc_apply_messages_to_basement_node(
}
}
// Effect:
//  Bring a single basement node (child `childnum` of leaf `node`) up to date
//  with respect to the messages buffered in the nodes on the `ancestors` list.
// Parameters:
//  t                     - handle forwarded to bnc_apply_messages_to_basement_node
//  node, childnum        - identify the basement node, BLB(node, childnum)
//  ancestors             - list of ancestors, walked through ->next until it ends
//  bounds                - pivot bounds of `node`; narrowed to this child with
//                          next_pivot_keys() before being handed to the applier
//  oldest_referenced_xid - forwarded unchanged to the applier
//  msgs_applied          - out-parameter forwarded to the applier
//                          (presumably set when something was applied -- confirm
//                          against bnc_apply_messages_to_basement_node)
static void
apply_ancestors_messages_to_bn(
    FT_HANDLE t,
    FTNODE node,
    int childnum,
    ANCESTORS ancestors,
    struct pivot_bounds const * const bounds,
    TXNID oldest_referenced_xid,
    bool *msgs_applied
    )
{
    BASEMENTNODE target_bn = BLB(node, childnum);
    struct pivot_bounds child_bounds = next_pivot_keys(node, childnum, bounds);

    ANCESTORS anc = ancestors;
    while (anc) {
        // Only an ancestor whose on-disk max MSN is newer than what this
        // basement node has already seen needs to be looked at.
        if (anc->node->max_msn_applied_to_node_on_disk.msn > target_bn->max_msn_applied.msn) {
            paranoid_invariant(BP_STATE(anc->node, anc->childnum) == PT_AVAIL);
            bnc_apply_messages_to_basement_node(
                t,
                target_bn,
                anc->node,
                anc->childnum,
                &child_bounds,
                oldest_referenced_xid,
                msgs_applied
                );
            // Remember this ancestor's MSN so that the next query skips the
            // ancestor unless its MSN has moved in the meantime.
            target_bn->max_msn_applied = anc->node->max_msn_applied_to_node_on_disk;
        }
        anc = anc->next;
    }

    // Every stale message above this basement node has now been applied and
    // anything arriving later will be fresh, so stale messages need not be
    // examined for this basement node again -- unless it gets evicted, in
    // which case this field becomes false when it is read back in.
    target_bn->stale_ancestor_messages_applied = true;
}
void
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
)
toku_apply_ancestors_messages_to_node
(
FT_HANDLE
t
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
bool
*
msgs_applied
,
int
child_to_read
)
// Effect:
// Bring a leaf node up-to-date according to all the messages in the ancestors.
// If the leaf node is already up-to-date then do nothing.
...
...
@@ -4521,7 +4566,7 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
// The entire root-to-leaf path is pinned and appears in the ancestors list.
{
VERIFY_NODE
(
t
,
node
);
invariant
(
node
->
height
==
0
);
paranoid_
invariant
(
node
->
height
==
0
);
TXNID
oldest_referenced_xid
=
ancestors
->
node
->
oldest_referenced_xid_known
;
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
...
...
@@ -4530,44 +4575,104 @@ toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ances
}
}
// know we are a leaf node
// An important invariant:
// We MUST bring every available basement node up to date.
// flushing on the cleaner thread depends on this. This invariant
// allows the cleaner thread to just pick an internal node and flush it
// as opposed to being forced to start from the root.
for
(
int
i
=
0
;
i
<
node
->
n_children
;
i
++
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
curr_bn
=
BLB
(
node
,
i
);
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
i
,
bounds
);
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
curr_bn
->
max_msn_applied
.
msn
)
{
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
bnc_apply_messages_to_basement_node
(
t
,
curr_bn
,
curr_ancestors
->
node
,
curr_ancestors
->
childnum
,
&
curr_bounds
,
oldest_referenced_xid
,
msgs_applied
);
// We don't want to check this ancestor node again if the
// next time we query it, the msn hasn't changed.
curr_bn
->
max_msn_applied
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
;
}
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
apply_ancestors_messages_to_bn
(
t
,
node
,
child_to_read
,
ancestors
,
bounds
,
oldest_referenced_xid
,
msgs_applied
);
}
else
{
// know we are a leaf node
// An important invariant:
// We MUST bring every available basement node for a dirty node up to date.
// flushing on the cleaner thread depends on this. This invariant
// allows the cleaner thread to just pick an internal node and flush it
// as opposed to being forced to start from the root.
for
(
int
i
=
0
;
i
<
node
->
n_children
;
i
++
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
apply_ancestors_messages_to_bn
(
t
,
node
,
i
,
ancestors
,
bounds
,
oldest_referenced_xid
,
msgs_applied
);
}
// At this point, we know all the stale messages above this
// basement node have been applied, and any new messages will be
// fresh, so we don't need to look at stale messages for this
// basement node, unless it gets evicted (and this field becomes
// false when it's read in again).
curr_bn
->
stale_ancestor_messages_applied
=
true
;
}
VERIFY_NODE
(
t
,
node
);
}
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
)
// Effect:
//  Decide whether basement node BLB(node, childnum) still has messages
//  waiting for it in the buffers of the nodes on the `ancestors` list.
// Returns:
//  true as soon as one ancestor with a newer on-disk MSN than the basement
//  node has either a non-empty broadcast list, or a non-empty index range
//  (lbi < ube) reported by find_bounds_within_message_tree() for its stale
//  tree (consulted only while stale_ancestor_messages_applied is false) or
//  its fresh tree.  false if no ancestor qualifies.
// Side effect:
//  For each newer ancestor that turned out to hold nothing for this basement
//  node, *max_msn_applied is raised to that ancestor's
//  max_msn_applied_to_node_on_disk when the latter is larger.  Ancestors
//  visited before a `true` result may already have raised it.
static bool
bn_needs_ancestors_messages(
    FT ft,
    FTNODE node,
    int childnum,
    struct pivot_bounds const * const bounds,
    ANCESTORS ancestors,
    MSN *max_msn_applied
    )
{
    BASEMENTNODE bn = BLB(node, childnum);
    struct pivot_bounds child_bounds = next_pivot_keys(node, childnum, bounds);

    for (ANCESTORS anc = ancestors; anc; anc = anc->next) {
        // This ancestor has nothing newer than what the basement node has seen.
        if (!(anc->node->max_msn_applied_to_node_on_disk.msn > bn->max_msn_applied.msn)) {
            continue;
        }

        paranoid_invariant(BP_STATE(anc->node, anc->childnum) == PT_AVAIL);
        NONLEAF_CHILDINFO child_buffer = BNC(anc->node, anc->childnum);

        // Any entry in the broadcast list is treated as pending for this
        // basement node (no key-range check is done for it here).
        if (child_buffer->broadcast_list.size() > 0) {
            return true;
        }

        // Stale messages only matter until they have been applied once.
        if (!bn->stale_ancestor_messages_applied) {
            uint32_t stale_lbi;
            uint32_t stale_ube;
            find_bounds_within_message_tree(
                &ft->cmp_descriptor,
                ft->compare_fun,
                child_buffer->stale_message_tree,
                child_buffer->buffer,
                &child_bounds,
                &stale_lbi,
                &stale_ube
                );
            if (stale_lbi < stale_ube) {
                return true;
            }
        }

        uint32_t fresh_lbi;
        uint32_t fresh_ube;
        find_bounds_within_message_tree(
            &ft->cmp_descriptor,
            ft->compare_fun,
            child_buffer->fresh_message_tree,
            child_buffer->buffer,
            &child_bounds,
            &fresh_lbi,
            &fresh_ube
            );
        if (fresh_lbi < fresh_ube) {
            return true;
        }

        // Nothing to apply from this ancestor: track the largest MSN seen so
        // the caller can use it.
        if (anc->node->max_msn_applied_to_node_on_disk.msn > max_msn_applied->msn) {
            max_msn_applied->msn = anc->node->max_msn_applied_to_node_on_disk.msn;
        }
    }

    return false;
}
bool
toku_ft_leaf_needs_ancestors_messages
(
FT
ft
,
FTNODE
node
,
ANCESTORS
ancestors
,
struct
pivot_bounds
const
*
const
bounds
,
MSN
*
const
max_msn_in_path
,
int
child_to_read
)
// Effect: Determine whether there are messages in a node's ancestors
// which must be applied to it. These messages are in the correct
// keyrange for any available basement nodes, and are in nodes with the
...
...
@@ -4586,72 +4691,64 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
// we should exchange it for a write lock in preparation for applying
// messages. If there are no messages, we don't need the write lock.
{
invariant
(
node
->
height
==
0
);
MSN
max_msn_applied
=
ZERO_MSN
;
paranoid_invariant
(
node
->
height
==
0
);
bool
needs_ancestors_messages
=
false
;
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
struct
pivot_bounds
curr_bounds
=
next_pivot_keys
(
node
,
i
,
bounds
);
for
(
ANCESTORS
curr_ancestors
=
ancestors
;
curr_ancestors
;
curr_ancestors
=
curr_ancestors
->
next
)
{
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
paranoid_invariant
(
BP_STATE
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
)
==
PT_AVAIL
);
NONLEAF_CHILDINFO
bnc
=
BNC
(
curr_ancestors
->
node
,
curr_ancestors
->
childnum
);
if
(
bnc
->
broadcast_list
.
size
()
>
0
)
{
needs_ancestors_messages
=
true
;
goto
cleanup
;
}
if
(
!
bn
->
stale_ancestor_messages_applied
)
{
uint32_t
stale_lbi
,
stale_ube
;
find_bounds_within_message_tree
(
&
ft
->
cmp_descriptor
,
ft
->
compare_fun
,
bnc
->
stale_message_tree
,
bnc
->
buffer
,
&
curr_bounds
,
&
stale_lbi
,
&
stale_ube
);
if
(
stale_lbi
<
stale_ube
)
{
needs_ancestors_messages
=
true
;
goto
cleanup
;
}
}
uint32_t
fresh_lbi
,
fresh_ube
;
find_bounds_within_message_tree
(
&
ft
->
cmp_descriptor
,
ft
->
compare_fun
,
bnc
->
fresh_message_tree
,
bnc
->
buffer
,
&
curr_bounds
,
&
fresh_lbi
,
&
fresh_ube
);
if
(
fresh_lbi
<
fresh_ube
)
{
needs_ancestors_messages
=
true
;
goto
cleanup
;
}
if
(
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
.
msn
>
max_msn_applied
.
msn
)
{
max_msn_applied
=
curr_ancestors
->
node
->
max_msn_applied_to_node_on_disk
;
}
// child_to_read may be -1 in test cases
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
needs_ancestors_messages
=
bn_needs_ancestors_messages
(
ft
,
node
,
child_to_read
,
bounds
,
ancestors
,
max_msn_in_path
);
}
else
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
needs_ancestors_messages
=
bn_needs_ancestors_messages
(
ft
,
node
,
i
,
bounds
,
ancestors
,
max_msn_in_path
);
if
(
needs_ancestors_messages
)
{
goto
cleanup
;
}
}
}
*
max_msn_in_path
=
max_msn_applied
;
cleanup:
return
needs_ancestors_messages
;
}
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
)
{
void
toku_ft_bn_update_max_msn
(
FTNODE
node
,
MSN
max_msn_applied
,
int
child_to_read
)
{
invariant
(
node
->
height
==
0
);
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
if
(
!
node
->
dirty
&&
child_to_read
>=
0
)
{
paranoid_invariant
(
BP_STATE
(
node
,
child_to_read
)
==
PT_AVAIL
);
BASEMENTNODE
bn
=
BLB
(
node
,
child_to_read
);
if
(
max_msn_applied
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
// This function runs in a shared access context, so to silence tools
// like DRD, we use a CAS and ignore the result.
// Any threads trying to update these basement nodes should be
// updating them to the same thing (since they all have a read lock on
// the same root-to-leaf path) so this is safe.
// see comment below
(
void
)
toku_sync_val_compare_and_swap
(
&
bn
->
max_msn_applied
.
msn
,
bn
->
max_msn_applied
.
msn
,
max_msn_applied
.
msn
);
}
}
else
{
for
(
int
i
=
0
;
i
<
node
->
n_children
;
++
i
)
{
if
(
BP_STATE
(
node
,
i
)
!=
PT_AVAIL
)
{
continue
;
}
BASEMENTNODE
bn
=
BLB
(
node
,
i
);
if
(
max_msn_applied
.
msn
>
bn
->
max_msn_applied
.
msn
)
{
// This function runs in a shared access context, so to silence tools
// like DRD, we use a CAS and ignore the result.
// Any threads trying to update these basement nodes should be
// updating them to the same thing (since they all have a read lock on
// the same root-to-leaf path) so this is safe.
(
void
)
toku_sync_val_compare_and_swap
(
&
bn
->
max_msn_applied
.
msn
,
bn
->
max_msn_applied
.
msn
,
max_msn_applied
.
msn
);
}
}
}
}
struct
copy_to_stale_extra
{
...
...
@@ -4779,6 +4876,11 @@ ok: ;
ftcursor
->
leaf_info
.
to_be
.
omt
=
bn
->
buffer
;
ftcursor
->
leaf_info
.
to_be
.
index
=
idx
;
//
// IMPORTANT: bulk fetch CANNOT go past the current basement node,
// because there is no guarantee that messages have been applied
// to other basement nodes, as part of #5770
//
if
(
r
==
TOKUDB_CURSOR_CONTINUE
&&
can_bulk_fetch
)
{
r
=
ft_cursor_shortcut
(
ftcursor
,
...
...
@@ -4908,7 +5010,7 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
BLOCKNUM
childblocknum
=
BP_BLOCKNUM
(
node
,
childnum
);
uint32_t
fullhash
=
compute_child_fullhash
(
brt
->
ft
->
cf
,
node
,
childnum
);
FTNODE
childnode
;
FTNODE
childnode
=
nullptr
;
// If the current node's height is greater than 1, then its child is an internal node.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
...
...
@@ -4931,7 +5033,6 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
unlockers
,
&
next_ancestors
,
bounds
,
&
bfe
,
PL_READ
,
// we try to get a read lock, but we may upgrade to a write lock on a leaf for message application.
true
,
&
childnode
,
&
msgs_applied
);
...
...
@@ -5090,87 +5191,78 @@ ft_search_node(
// At this point, we must have the necessary partition available to continue the search
//
assert
(
BP_STATE
(
node
,
child_to_search
)
==
PT_AVAIL
);
while
(
child_to_search
>=
0
&&
child_to_search
<
node
->
n_children
)
{
//
// Normally, the child we want to use is available, as we checked
// before entering this while loop. However, if we pass through
// the loop once, getting DB_NOTFOUND for this first value
// of child_to_search, we enter the while loop again with a
// child_to_search that may not be in memory. If it is not,
// we need to return TOKUDB_TRY_AGAIN so the query can
// read the appropriate partition into memory
//
if
(
BP_STATE
(
node
,
child_to_search
)
!=
PT_AVAIL
)
{
return
TOKUDB_TRY_AGAIN
;
}
const
struct
pivot_bounds
next_bounds
=
next_pivot_keys
(
node
,
child_to_search
,
bounds
);
if
(
node
->
height
>
0
)
{
r
=
ft_search_child
(
brt
,
node
,
child_to_search
,
search
,
getf
,
getf_v
,
doprefetch
,
ftcursor
,
unlockers
,
ancestors
,
&
next_bounds
,
can_bulk_fetch
);
}
else
{
r
=
ft_search_basement_node
(
BLB
(
node
,
child_to_search
),
search
,
getf
,
getf_v
,
doprefetch
,
ftcursor
,
can_bulk_fetch
);
}
if
(
r
==
0
)
return
r
;
//Success
const
struct
pivot_bounds
next_bounds
=
next_pivot_keys
(
node
,
child_to_search
,
bounds
);
if
(
node
->
height
>
0
)
{
r
=
ft_search_child
(
brt
,
node
,
child_to_search
,
search
,
getf
,
getf_v
,
doprefetch
,
ftcursor
,
unlockers
,
ancestors
,
&
next_bounds
,
can_bulk_fetch
);
}
else
{
r
=
ft_search_basement_node
(
BLB
(
node
,
child_to_search
),
search
,
getf
,
getf_v
,
doprefetch
,
ftcursor
,
can_bulk_fetch
);
}
if
(
r
==
0
)
{
return
r
;
//Success
}
if
(
r
!=
DB_NOTFOUND
)
{
return
r
;
//Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
if
(
r
!=
DB_NOTFOUND
)
{
return
r
;
//Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
}
// not really necessary, just put this here so that reading the
// code becomes simpler. The point is at this point in the code,
// we know that we got DB_NOTFOUND and we have to continue
assert
(
r
==
DB_NOTFOUND
);
// we have a new pivotkey
if
(
node
->
height
==
0
)
{
// when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
const
DBT
*
pivot
=
nullptr
;
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
{
pivot
=
next_bounds
.
upper_bound_inclusive
;
// left -> right
}
else
{
pivot
=
next_bounds
.
lower_bound_exclusive
;
// right -> left
}
// not really necessary, just put this here so that reading the
// code becomes simpler. The point is at this point in the code,
// we know that we got DB_NOTFOUND and we have to continue
assert
(
r
==
DB_NOTFOUND
);
// we have a new pivotkey
if
(
node
->
height
==
0
)
{
// when we run off the end of a basement, try to lock the range up to the pivot. solves #3529
const
DBT
*
pivot
=
NULL
;
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
pivot
=
next_bounds
.
upper_bound_inclusive
;
// left -> right
else
pivot
=
next_bounds
.
lower_bound_exclusive
;
// right -> left
if
(
pivot
)
{
int
rr
=
getf
(
pivot
->
size
,
pivot
->
data
,
0
,
NULL
,
getf_v
,
true
);
if
(
rr
!=
0
)
return
rr
;
// lock was not granted
if
(
pivot
!=
nullptr
)
{
int
rr
=
getf
(
pivot
->
size
,
pivot
->
data
,
0
,
nullptr
,
getf_v
,
true
);
if
(
rr
!=
0
)
{
return
rr
;
// lock was not granted
}
}
}
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
// for point queries, it's overkill. If we got a DB_NOTFOUND on a point query then we should just stop looking.
// When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object.
maybe_search_save_bound
(
node
,
child_to_search
,
search
);
// We're about to pin some more nodes, but we thought we were done before.
if
(
search
->
direction
==
FT_SEARCH_LEFT
)
{
child_to_search
++
;
}
else
{
child_to_search
--
;
}
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
// for point queries, it's overkill. If we got a DB_NOTFOUND on a point query then we should just stop looking.
// When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object.
maybe_search_save_bound
(
node
,
child_to_search
,
search
);
// as part of #5770, if we can continue searching,
// we MUST return TOKUDB_TRY_AGAIN,
// because there is no guarantee that messages have been applied
// on any other path.
if
((
search
->
direction
==
FT_SEARCH_LEFT
&&
child_to_search
<
node
->
n_children
-
1
)
||
(
search
->
direction
==
FT_SEARCH_RIGHT
&&
child_to_search
>
0
))
{
r
=
TOKUDB_TRY_AGAIN
;
}
return
r
;
}
...
...
@@ -5775,7 +5867,6 @@ toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
&
next_ancestors
,
bounds
,
child_may_find_right
?
match_bfe
:
min_bfe
,
PL_READ
,
// may_modify_node is false, because node guaranteed to not change
false
,
&
childnode
,
&
msgs_applied
...
...
@@ -5986,7 +6077,7 @@ static int get_key_after_bytes_in_child(FT_HANDLE ft_h, FT ft, FTNODE node, UNLO
uint32_t
fullhash
=
compute_child_fullhash
(
ft
->
cf
,
node
,
childnum
);
FTNODE
child
;
bool
msgs_applied
=
false
;
r
=
toku_pin_ftnode_batched
(
ft_h
,
childblocknum
,
fullhash
,
unlockers
,
&
next_ancestors
,
bounds
,
bfe
,
PL_READ
,
false
,
&
child
,
&
msgs_applied
);
r
=
toku_pin_ftnode_batched
(
ft_h
,
childblocknum
,
fullhash
,
unlockers
,
&
next_ancestors
,
bounds
,
bfe
,
false
,
&
child
,
&
msgs_applied
);
paranoid_invariant
(
!
msgs_applied
);
if
(
r
==
TOKUDB_TRY_AGAIN
)
{
return
r
;
...
...
ft/tests/orthopush-flush.cc
View file @
9a6ba1aa
...
...
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
...
...
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
.
upper_bound_inclusive
=
toku_clone_dbt
(
&
ubi
,
childkeys
[
7
])
};
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child
,
&
ancestors
,
&
bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
...
...
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
struct
ancestors
ancestors
=
{
.
node
=
parentnode
,
.
childnum
=
0
,
.
next
=
NULL
};
const
struct
pivot_bounds
infinite_bounds
=
{
.
lower_bound_exclusive
=
NULL
,
.
upper_bound_inclusive
=
NULL
};
bool
msgs_applied
;
toku_apply_ancestors_messages_to_node
(
t
,
child2
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
);
toku_apply_ancestors_messages_to_node
(
t
,
child2
,
&
ancestors
,
&
infinite_bounds
,
&
msgs_applied
,
-
1
);
FIFO_ITERATE
(
parent_bnc
->
buffer
,
key
,
keylen
,
val
,
vallen
,
type
,
msn
,
xids
,
is_fresh
,
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment