Commit 25ac82a5 authored by John Esmet's avatar John Esmet Committed by Yoni Fogel

[t:4372] marked off some possible errors, fixing comment on top of brt.c


git-svn-id: file:///svn/toku/tokudb@39202 c7de825b-a66e-492c-adef-691d508d4ae1
parent a2dacff9
......@@ -111,6 +111,16 @@ struct flusher_advice {
void* extra; // parameter passed into callbacks
};
// FIXME all of these need the toku prefix
//
// how about:
//
// toku_brtnode_flush_some_child()
// toku_fa_flusher_advice_init()
// toku_fa_always_recursively_flush()
// toku_fa_dont_destroy_basement_nodes()
// toku_fa_default_merge_child()
// toku_fa_default_pick_child_after_split()
void
flusher_advice_init(
......
......@@ -1414,6 +1414,8 @@ brt_merge_child(
}
}
// FIXME started_at_root does not exist anymore. this is out of date.
//
// The parameter "started_at_root" is needed to resolve #4147 and #4160,
// which are subtle interactions of background flushing (cleaner and
// flusher threads) and MSN logic.
......
......@@ -434,6 +434,7 @@ struct brt {
struct toku_list zombie_brt_link;
};
// FIXME needs toku prefix
long brtnode_memory_size (BRTNODE node);
PAIR_ATTR make_brtnode_pair_attr(BRTNODE node);
......@@ -474,11 +475,14 @@ void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTO
BASEMENTNODE toku_create_empty_bn(void);
BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer.
NONLEAF_CHILDINFO toku_create_empty_nl(void);
// FIXME needs toku prefix
void destroy_basement_node (BASEMENTNODE bn);
// FIXME needs toku prefix
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl);
void toku_destroy_brtnode_internals(BRTNODE node);
void toku_brtnode_free (BRTNODE *node);
void toku_assert_entire_node_in_memory(BRTNODE node);
// FIXME needs toku prefix
void bring_node_fully_into_memory(BRTNODE node, struct brt_header* h);
// append a child node to a parent node
......@@ -686,6 +690,7 @@ struct pivot_bounds {
struct kv_pair const * const upper_bound_inclusive; // NULL to indicate negative or positive infinity (which are in practice exclusive since there are now transfinite keys in messages).
};
// FIXME needs toku prefix
void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds);
int
......@@ -793,7 +798,6 @@ int toku_db_badformat(void) __attribute__((__warn_unused_result__));
int toku_brt_remove_on_commit(TOKUTXN child, DBT* iname_dbt_p) __attribute__((__warn_unused_result__));
int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p) __attribute__((__warn_unused_result__));
typedef struct brt_upgrade_status {
u_int64_t header_13; // how many headers were upgraded from version 13
u_int64_t nonleaf_13;
......@@ -869,6 +873,7 @@ struct brt_status {
void toku_brt_get_status(BRT_STATUS);
// FIXME needs toku prefix
void
brt_leaf_apply_cmd_once (
BRTNODE leafnode,
......@@ -881,6 +886,7 @@ brt_leaf_apply_cmd_once (
uint64_t *workdonep
);
// FIXME needs toku prefix
void
brt_leaf_put_cmd (
brt_compare_func compare_fun,
......@@ -907,6 +913,7 @@ void toku_apply_cmd_to_leaf(
OMT live_list_reverse
);
// FIXME needs toku prefix
void brtnode_put_cmd (
brt_compare_func compare_fun,
brt_update_func update_fun,
......@@ -918,18 +925,14 @@ void brtnode_put_cmd (
OMT live_list_reverse
);
void toku_reset_root_xid_that_created(BRT brt, TXNID new_root_xid_that_created);
// Reset the root_xid_that_created field to the given value.
// This redefines which xid created the dictionary.
void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);
void toku_brt_header_note_hot_begin(BRT brt);
void toku_brt_header_note_hot_complete(BRT brt, BOOL success, MSN msn_at_start_of_hot);
C_END
#endif
......@@ -3,38 +3,71 @@
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/*
Managing the tree shape: How insertion, deletion, and querying work
When we insert a message into the BRT, here's what happens.
insert_a_message_at_root(msg):
insert_a_message_at_root(msg) {
root = find the root node
if root needs to be split (has fissible reactivity):
if root needs to be split {
split the root
root = find the new root node
}
insert_msg_into_node_buffer(root, msg)
if root has too many messages in its buffer and is a nonleaf node:
flush the buffer on a background thread
flush_nonleaf_node(node, height, key):
To process a nonleaf node (height, key)
Note: Height is always > 0.
Note: This process occurs asynchronously, but we get the YDB lock at the beginning.
Descend the tree following KEY until a node of HEIGHT is found.
While the node is too full then
pick the heaviest child
bring that child into memory (use nonblocking get_and_pin, which means that if we get a try-again, we go back up and restart the process_a_node job).
move all messages for that child from the node to the child.
If the child needs to be split or merged, then split or merge the child.
If the resulting child's (or children's) buffers are too full then create a work item for each such child to process the child. (This can only happen
for nonleaf children, since otherwise there are no buffers to be too full).
We also have a background thread that traverses the tree (relatively slowly) to flatten the tree.
Background_flattener:
if root has too many messages in its buffer and is a nonleaf node {
child = heaviest child of root
if that child is non reactive and pinnable {
target = child
buffer = child's buffer
} else {
target = root
buffer = null
}
post a flusher thread work item to the cachetable kibbutz = {
flush_nonleaf_node(target, buffer)
}
}
}
flush_nonleaf_node(node, buffer) {
if we have a specific target node and non null buffer to flush {
flush_buffer_to_node(node, buffer)
if that node is now gorged and needs flushing {
flush_some_child(node, advice)
}
} else {
// the buffer is null, so the node given is not going to
// be the TARGET of a flush, but instead the source of one.
// we should find some child of the node and flush it.
flush_some_child(node, advice)
}
}
flush_some_child(parent, advice) {
child = advice->pick_child()
buffer = remove_child_buffer_from_parent(parent, child)
if the buffer is non null {
flush_buffer_to_node(child, buffer)
}
if child is stable and the advice says to recursively flush {
flush_some_child(child, advice)
} else if child needs to be split {
split the child
} else if the child _could_ be merged {
maybe_merge_child(child, parent)
}
}
We also have a background cleaner thread that traverses and flattens the tree:
cleaner_thread() {
}
Its state is a height and a key and a child number
Repeat:
sleep (say 1s)
......@@ -54,7 +87,6 @@ We also have a background thread that traverses the tree (relatively slowly) to
The background flattener should also garbage collect MVCC versions. The flattener should remember the MVCC versions it has encountered
so that if any of those are no longer live, it can run again.
To shrink a file: Let X be the size of the reachable data.
We define an acceptable bloat constant of C. For example we set C=2 if we are willing to allow the file to be as much as 2X in size.
The goal is to find the smallest amount of stuff we can move to get the file down to size CX.
......
......@@ -307,6 +307,7 @@ struct cachefile {
// when things finish.
};
// FIXME global with no toku prefix
void add_background_job(CACHEFILE cf, bool already_locked)
{
if (!already_locked) {
......@@ -318,6 +319,7 @@ void add_background_job(CACHEFILE cf, bool already_locked)
}
}
// FIXME global with no toku prefix
void remove_background_job(CACHEFILE cf, bool already_locked)
{
if (!already_locked) {
......@@ -331,12 +333,14 @@ void remove_background_job(CACHEFILE cf, bool already_locked)
}
}
// FIXME global with no toku prefix
void cachefile_kibbutz_enq (CACHEFILE cf, void (*f)(void*), void *extra)
// Effect: Calls add_background_job on cf (with already_locked == false), then
//  passes f and extra to toku_kibbutz_enq on the kibbutz of cf's cachetable.
// Requires: The function f must call remove_background_job when it completes.
{
    // Forwarded as add_background_job's already_locked argument.
    const bool already_locked = false;
    add_background_job(cf, already_locked);
    toku_kibbutz_enq(cf->cachetable->kibbutz, f, extra);
}
static void wait_on_background_jobs_to_finish (CACHEFILE cf) {
cachetable_lock(cf->cachetable);
while (cf->n_background_jobs>0) {
......@@ -3866,6 +3870,7 @@ cleaner_thread_rate_pair(PAIR p)
static int const CLEANER_N_TO_CHECK = 8;
// FIXME this is global but no one uses it except cachetable.c
int
toku_cleaner_thread (void *cachetable_v)
// Effect: runs a cleaner.
......
......@@ -5891,6 +5891,7 @@ locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
return r;
}
static int
toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
HANDLE_PANICKED_DB(db);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment