Commit db1065e7 authored by John Esmet's avatar John Esmet

Merge pull request #180 from Tokutek/rightmost_leaf

Optimize rightmost inserts and unique checks using a constant rightmost leaf and a heuristic stored in the FT
parents fa28ddaa ce63b1dd
...@@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node) ...@@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node)
); );
assert(r==0); assert(r==0);
} }
void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b)
// Effect: Exchange the blocknum, fullhash, and PAIR of a and b, then
//         swap the values stored in the two cachetable pairs.
// Requires: Both nodes are pinned
{
    // Stash b's identity, hand a's identity to b, then give a what b had.
    BLOCKNUM b_blocknum = b->thisnodename;
    uint32_t b_fullhash = b->fullhash;
    PAIR b_pair = b->ct_pair;

    b->thisnodename = a->thisnodename;
    b->fullhash = a->fullhash;
    b->ct_pair = a->ct_pair;

    a->thisnodename = b_blocknum;
    a->fullhash = b_fullhash;
    a->ct_pair = b_pair;

    // The nodes now reference each other's pairs, but the pairs themselves
    // still hold their old values (ie: the FTNODEs they represent), so the
    // cachetable has to swap those as well.
    toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair);
}
...@@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai ...@@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai
void toku_unpin_ftnode(FT h, FTNODE node); void toku_unpin_ftnode(FT h, FTNODE node);
void toku_unpin_ftnode_read_only(FT ft, FTNODE node); void toku_unpin_ftnode_read_only(FT ft, FTNODE node);
// Effect: Swaps pair values of two pinned nodes
void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb);
#endif #endif
...@@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node) ...@@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node)
*/ */
static void static void
handle_split_of_child( handle_split_of_child(
FT ft,
FTNODE node, FTNODE node,
int childnum, int childnum,
FTNODE childa, FTNODE childa,
...@@ -607,8 +608,20 @@ handle_split_of_child( ...@@ -607,8 +608,20 @@ handle_split_of_child(
paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
// We never set the rightmost blocknum to be the root.
// Instead, we wait for the root to split and let promotion initialize the rightmost
// blocknum to be the first non-root leaf node on the right extreme to receive an insert.
invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
if (childa->thisnodename.b == ft->rightmost_blocknum.b) {
// The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values
// with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum
// constant, the same way we keep the root blocknum constant.
toku_ftnode_swap_pair_values(childa, childb);
BP_BLOCKNUM(node, childnum) = childa->thisnodename;
}
BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
BP_WORKDONE(node, childnum+1) = 0; BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL; BP_STATE(node,childnum+1) = PT_AVAIL;
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl(); NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
...@@ -1071,7 +1084,7 @@ ft_split_child( ...@@ -1071,7 +1084,7 @@ ft_split_child(
ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes); ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
} }
// printf("%s:%d child did split\n", __FILE__, __LINE__); // printf("%s:%d child did split\n", __FILE__, __LINE__);
handle_split_of_child (node, childnum, nodea, nodeb, &splitk); handle_split_of_child (h, node, childnum, nodea, nodeb, &splitk);
// for test // for test
call_flusher_thread_callback(flt_flush_during_split); call_flusher_thread_callback(flt_flush_during_split);
...@@ -1489,6 +1502,14 @@ ft_merge_child( ...@@ -1489,6 +1502,14 @@ ft_merge_child(
&node->childkeys[childnuma+1], &node->childkeys[childnuma+1],
(node->n_children-childnumb)*sizeof(node->childkeys[0])); (node->n_children-childnumb)*sizeof(node->childkeys[0]));
REALLOC_N(node->n_children-1, node->childkeys); REALLOC_N(node->n_children-1, node->childkeys);
// Handle a merge of the rightmost leaf node.
if (did_merge && childb->thisnodename.b == h->rightmost_blocknum.b) {
invariant(childb->thisnodename.b != h->h->root_blocknum.b);
toku_ftnode_swap_pair_values(childa, childb);
BP_BLOCKNUM(node, childnuma) = childa->thisnodename;
}
paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b); paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
childa->dirty = 1; // just to make sure childa->dirty = 1; // just to make sure
childb->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure
......
...@@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 }; ...@@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 };
enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 }; enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 };
enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 }; enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };
// We optimize for a sequential insert pattern if 100 consecutive injections
// happen into the rightmost leaf node due to promotion.
enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 };
// //
// Field in ftnode_fetch_extra that tells the // Field in ftnode_fetch_extra that tells the
// partial fetch callback what piece of the node // partial fetch callback what piece of the node
...@@ -572,6 +576,22 @@ struct ft { ...@@ -572,6 +576,22 @@ struct ft {
// is this ft a blackhole? if so, all messages are dropped. // is this ft a blackhole? if so, all messages are dropped.
bool blackhole; bool blackhole;
// The blocknum of the rightmost leaf node in the tree. Stays constant through splits
// and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values())
//
// This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back.
// We initialize it when promotion inserts into a non-root leaf node on the right extreme.
// We use the blocktable lock to protect the initialize transition, though it's not really
// necessary since all threads should be setting it to the same value. We maintain that invariant
// on first initialization, see ft_set_or_verify_rightmost_blocknum()
BLOCKNUM rightmost_blocknum;
// sequential access pattern heuristic
// - when promotion pushes a message directly into the rightmost leaf, the score goes up.
// - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf
// - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0
uint32_t seqinsert_score;
}; };
// Allocate a DB struct off the stack and only set its comparison // Allocate a DB struct off the stack and only set its comparison
...@@ -1186,6 +1206,9 @@ typedef enum { ...@@ -1186,6 +1206,9 @@ typedef enum {
FT_PRO_NUM_DIDNT_WANT_PROMOTE, FT_PRO_NUM_DIDNT_WANT_PROMOTE,
FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS,
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS,
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,
FT_STATUS_NUM_ROWS FT_STATUS_NUM_ROWS
} ft_status_entry; } ft_status_entry;
......
This diff is collapsed.
...@@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void ...@@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void
// Effect: Insert a key and data pair into an ft // Effect: Insert a key and data pair into an ft
void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn); void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn);
// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists
int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging);
// Effect: Optimize the ft // Effect: Optimize the ft
void toku_ft_optimize (FT_HANDLE ft_h); void toku_ft_optimize (FT_HANDLE ft_h);
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
#include "test.h"
#include <ft/ybt.h>
#include <ft/ft-cachetable-wrappers.h>
// Each FT maintains a sequential insert heuristic to determine if it's
// worth trying to insert directly into a well-known rightmost leaf node.
//
// The heuristic is only maintained when a rightmost leaf node is known.
//
// This test verifies that sequential inserts increase the seqinsert score
// and that a single non-sequential insert resets the score.
static void test_seqinsert_heuristic(void) {
int r = 0;
char name[TOKU_PATH_MAX + 1];
toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
toku_os_recursive_delete(TOKU_TEST_FILENAME);
r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
FT_HANDLE ft_handle;
CACHETABLE ct;
toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
r = toku_open_ft_handle(name, 1, &ft_handle,
4*1024*1024, 64*1024,
TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
toku_builtin_compare_fun); CKERR(r);
FT ft = ft_handle->ft;
int k;
DBT key, val;
const int val_size = 1024 * 1024;
char *XMALLOC_N(val_size, val_buf);
memset(val_buf, 'x', val_size);
toku_fill_dbt(&val, val_buf, val_size);
// Insert many rows sequentially. This is enough data to:
// - force the root to split (the righmost leaf will then be known)
// - raise the seqinsert score high enough to enable direct rightmost injections
const int rows_to_insert = 200;
for (int i = 0; i < rows_to_insert; i++) {
k = toku_htonl(i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
}
invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
invariant(ft->seqinsert_score == FT_SEQINSERT_SCORE_THRESHOLD);
// Insert on the left extreme. The seq insert score is high enough
// that we will attempt to insert into the rightmost leaf. We won't
// be successful because key 0 won't be in the bounds of the rightmost leaf.
// This failure should reset the seqinsert score back to 0.
k = toku_htonl(0);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
// Insert in the middle. The score should not go up.
k = toku_htonl(rows_to_insert / 2);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
// Insert on the right extreme. The score should go up.
k = toku_htonl(rows_to_insert);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 1);
// Insert again on the right extreme again, the score should go up.
k = toku_htonl(rows_to_insert + 1);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 2);
// Insert close to, but not at, the right extreme. The score should reset.
// -- the magic number 4 derives from the fact that vals are 1mb and nodes are 4mb
k = toku_htonl(rows_to_insert - 4);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
toku_free(val_buf);
toku_ft_handle_close(ft_handle);
toku_cachetable_close(&ct);
toku_os_recursive_delete(TOKU_TEST_FILENAME);
}
// Test entry point: forwards the command-line arguments to
// default_parse_args(), runs test_seqinsert_heuristic(), and returns 0.
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_seqinsert_heuristic();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
#include "test.h"
#include <ft/ybt.h>
#include <ft/ft-cachetable-wrappers.h>
// Promotion tracks the rightmost blocknum in the FT when a message
// is successfully promoted to a non-root leaf node on the right extreme.
//
// This test verifies that a split or merge of the rightmost leaf properly
// maintains the rightmost blocknum (which stays constant because the nodes swap
// pair values, as is done for the root blocknum).
static void test_split_merge(void) {
int r = 0;
char name[TOKU_PATH_MAX + 1];
toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
toku_os_recursive_delete(TOKU_TEST_FILENAME);
r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
FT_HANDLE ft_handle;
CACHETABLE ct;
toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
r = toku_open_ft_handle(name, 1, &ft_handle,
4*1024*1024, 64*1024,
TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
toku_builtin_compare_fun); CKERR(r);
// We have a root blocknum, but no rightmost blocknum yet.
FT ft = ft_handle->ft;
invariant(ft->h->root_blocknum.b != RESERVED_BLOCKNUM_NULL);
invariant(ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL);
int k;
DBT key, val;
const int val_size = 1 * 1024 * 1024;
char *XMALLOC_N(val_size, val_buf);
memset(val_buf, 'x', val_size);
toku_fill_dbt(&val, val_buf, val_size);
// Insert 16 rows (should induce a few splits)
const int rows_to_insert = 16;
for (int i = 0; i < rows_to_insert; i++) {
k = toku_htonl(i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
}
// rightmost blocknum should be set, because the root split and promotion
// did a rightmost insertion directly into the rightmost leaf, lazily
// initializing the rightmost blocknum.
invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
BLOCKNUM root_blocknum = ft->h->root_blocknum;
FTNODE root_node;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, ft);
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, ft->h->root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// root blocknum should be consistent
invariant(root_node->thisnodename.b == ft->h->root_blocknum.b);
// root should have split at least once, and it should now be at height 1
invariant(root_node->n_children > 1);
invariant(root_node->height == 1);
// rightmost blocknum should no longer be the root, since the root split
invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
// the right child should have the rightmost blocknum
invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
BLOCKNUM rightmost_blocknum_before_merge = ft->rightmost_blocknum;
const int num_children_before_merge = root_node->n_children;
// delete the last 6 rows.
// - 1mb each, so 6mb deleted
// - should be enough to delete the entire rightmost leaf + some of its neighbor
const int rows_to_delete = 6;
toku_unpin_ftnode(ft, root_node);
for (int i = 0; i < rows_to_delete; i++) {
k = toku_htonl(rows_to_insert - i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_delete(ft_handle, &key, NULL);
}
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// - rightmost leaf should be fusible after those deletes (which were promoted directly to the leaf)
FTNODE rightmost_leaf;
toku_pin_ftnode(ft, rightmost_blocknum_before_merge,
toku_cachetable_hash(ft->cf, rightmost_blocknum_before_merge),
&bfe, PL_WRITE_EXPENSIVE, &rightmost_leaf, true);
invariant(get_node_reactivity(ft, rightmost_leaf) == RE_FUSIBLE);
toku_unpin_ftnode(ft, rightmost_leaf);
// - merge the rightmost child now that it's fusible
toku_ft_merge_child(ft, root_node, root_node->n_children - 1);
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// the merge should have worked, and the root should still be at height 1
invariant(root_node->n_children < num_children_before_merge);
invariant(root_node->height == 1);
// the rightmost child of the root has the rightmost blocknum
invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
// the value for rightmost blocknum itself should not have changed
// (we keep it constant, like the root blocknum)
invariant(rightmost_blocknum_before_merge.b == ft->rightmost_blocknum.b);
toku_unpin_ftnode(ft, root_node);
toku_free(val_buf);
toku_ft_handle_close(ft_handle);
toku_cachetable_close(&ct);
toku_os_recursive_delete(TOKU_TEST_FILENAME);
}
// Test entry point: forwards the command-line arguments to
// default_parse_args(), runs test_split_merge(), and returns 0.
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_split_merge();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/**
* Test that unique inserts work correctly. This exercises the rightmost leaf inject optimization.
*/
#include <portability/toku_random.h>
#include "test.h"
static char random_buf[8];
static struct random_data random_data;
// Inserts three small keys with DB_NOOVERWRITE, checking that the first put of
// each key succeeds and that every repeated put reports DB_KEYEXIST.
static void test_simple_unique_insert(DB_ENV *env) {
    DB *db;
    int r = db_create(&db, env, 0);
    CKERR(r);
    r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644);
    CKERR(r);

    DBT key_a, key_b, key_c;
    dbt_init(&key_a, "a", sizeof("a"));
    dbt_init(&key_b, "b", sizeof("b"));
    dbt_init(&key_c, "c", sizeof("c"));

    // Insert in the order a, c, b. Each key is used as its own value; the
    // first put must succeed and an immediate second put must be rejected.
    DBT *insert_order[] = { &key_a, &key_c, &key_b };
    const int num_keys = (int) (sizeof(insert_order) / sizeof(insert_order[0]));
    for (int i = 0; i < num_keys; i++) {
        DBT *kv = insert_order[i];
        r = db->put(db, NULL, kv, kv, DB_NOOVERWRITE);
        CKERR(r);
        r = db->put(db, NULL, kv, kv, DB_NOOVERWRITE);
        CKERR2(r, DB_KEYEXIST);
    }

    // sanity check: an existing key is rejected whether the value offered
    // is the same one or a different one
    r = db->put(db, NULL, &key_a, &key_a, DB_NOOVERWRITE);
    CKERR2(r, DB_KEYEXIST);
    r = db->put(db, NULL, &key_a, &key_c, DB_NOOVERWRITE);
    CKERR2(r, DB_KEYEXIST);

    r = db->close(db, 0);
    CKERR(r);
    r = env->dbremove(env, NULL, "db", NULL, 0);
    CKERR(r);
}
// Sequentially inserts 64mb worth of 1kb rows with DB_NOOVERWRITE into a DB
// configured with very small nodes, periodically checking that:
// - re-inserting the key that was just inserted reports DB_KEYEXIST
// - a key deleted inside a transaction can be re-inserted by that transaction
// - unique inserts of randomly chosen, already-inserted keys report DB_KEYEXIST
static void test_large_sequential_insert_unique(DB_ENV *env) {
    int r;
    DB *db;
    r = db_create(&db, env, 0); CKERR(r);

    // very small nodes/basements to make a taller tree
    r = db->set_pagesize(db, 8 * 1024); CKERR(r);
    r = db->set_readpagesize(db, 2 * 1024); CKERR(r);
    r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r);

    const int val_size = 1024;
    char *XMALLOC_N(val_size, val_buf);
    memset(val_buf, 'k', val_size);
    DBT val;
    dbt_init(&val, val_buf, val_size);

    // grow a tree to about depth 3, taking sanity checks along the way
    const int start_num_rows = (64 * 1024 * 1024) / val_size;
    for (int i = 0; i < start_num_rows; i++) {
        DBT key;
        int k = toku_htonl(i);
        dbt_init(&key, &k, sizeof(k));
        r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r);
        if (i % 50 == 0) {
            // sanity check - should not be able to insert this key twice in a row
            r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);

            // .. but re-inserting is okay, if we provisionally deleted the row.
            // The delete and the re-insert must both run under txn: with a
            // NULL txn the delete is not provisional and the transaction
            // begun here would be committed without having done anything.
            DB_TXN *txn;
            r = env->txn_begin(env, NULL, &txn, 0); CKERR(r);
            r = db->del(db, txn, &key, DB_DELETE_ANY); CKERR(r);
            r = db->put(db, txn, &key, &val, DB_NOOVERWRITE); CKERR(r);
            r = txn->commit(txn, 0); CKERR(r);
        }
        if (i > 0 && i % 250 == 0) {
            // sanity check - unique checks on random keys we already inserted should
            // fail (exercises middle-of-the-tree checks)
            for (int check_i = 0; check_i < 4; check_i++) {
                DBT rand_key;
                // i > 0 here, so the modulus is well defined and picks a key in [0, i)
                int rand_k = toku_htonl(myrandom_r(&random_data) % i);
                dbt_init(&rand_key, &rand_k, sizeof(rand_k));
                r = db->put(db, NULL, &rand_key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
            }
        }
    }

    toku_free(val_buf);
    r = db->close(db, 0); CKERR(r);
    r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r);
}
// Test entry point: creates a fresh transactional environment in
// TOKU_TEST_FILENAME, seeds the random state used by the tests, runs both
// unique-insert tests against that environment, and closes it.
int test_main(int argc, char * const argv[]) {
    default_parse_args(argc, argv);

    int r;
    const int envflags = DB_INIT_MPOOL | DB_CREATE | DB_THREAD |
                         DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN | DB_PRIVATE;

    // startup
    DB_ENV *env;
    toku_os_recursive_delete(TOKU_TEST_FILENAME);
    r = toku_os_mkdir(TOKU_TEST_FILENAME, 0755); CKERR(r);
    r = db_env_create(&env, 0); CKERR(r);
    // Check the result of opening the environment; previously it was
    // overwritten by the next assignment without ever being inspected.
    r = env->open(env, TOKU_TEST_FILENAME, envflags, 0755); CKERR(r);

    r = myinitstate_r(random(), random_buf, 8, &random_data); CKERR(r);

    test_simple_unique_insert(env);
    test_large_sequential_insert_unique(env);

    // cleanup
    r = env->close(env, 0); CKERR(r);

    return 0;
}
...@@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) { ...@@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) {
return r; return r;
} }
// Effect: Insert key/val into db's ft according to flags:
//   0                       - plain insert (FT_INSERT)
//   DB_NOOVERWRITE          - unique insert through toku_ft_insert_unique()
//   DB_NOOVERWRITE_NO_ERROR - insert sent as FT_INSERT_NO_OVERWRITE
// Returns: EINVAL for any other flags value, otherwise 0 or, for a unique
//          insert, whatever toku_ft_insert_unique() returned (asserted to be
//          0 or DB_KEYEXIST).
static int
db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, int flags, bool do_log) {
    // All other non-zero flags are unsupported
    if (flags != 0 && flags != DB_NOOVERWRITE && flags != DB_NOOVERWRITE_NO_ERROR) {
        return EINVAL;
    }

    TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : nullptr;

    if (flags == DB_NOOVERWRITE) {
        // Unique insert: the only path that can report DB_KEYEXIST.
        int r = toku_ft_insert_unique(db->i->ft_handle, key, val, ttxn, do_log);
        invariant(r == DB_KEYEXIST || r == 0);
        return r;
    }

    enum ft_msg_type type = FT_INSERT;
    if (flags == DB_NOOVERWRITE_NO_ERROR) {
        type = FT_INSERT_NO_OVERWRITE;
    }
    toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, do_log, type);
    return 0;
}
int int
toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) { toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) {
...@@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_ ...@@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_
flags &= ~lock_flags; flags &= ~lock_flags;
r = db_put_check_size_constraints(db, key, val); r = db_put_check_size_constraints(db, key, val);
if (r == 0) {
//Do any checking required by the flags. //Do locking if necessary.
r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags);
}
//Do locking if necessary. Do not grab the lock again if this DB had a unique
//check performed because the lock was already grabbed by its cursor callback.
bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));
if (r == 0 && do_locking && !(flags & DB_NOOVERWRITE)) { if (r == 0 && do_locking) {
r = toku_db_get_point_write_lock(db, txn, key); r = toku_db_get_point_write_lock(db, txn, key);
} }
if (r == 0) { if (r == 0) {
//Insert into the ft. //Insert into the ft.
TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
enum ft_msg_type type = FT_INSERT;
if (flags==DB_NOOVERWRITE_NO_ERROR) {
type = FT_INSERT_NO_OVERWRITE;
}
if (!holds_mo_lock) toku_multi_operation_client_lock(); if (!holds_mo_lock) toku_multi_operation_client_lock();
toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, true, type); r = db_put(db, txn, key, val, flags, true);
if (!holds_mo_lock) toku_multi_operation_client_unlock(); if (!holds_mo_lock) toku_multi_operation_client_unlock();
} }
...@@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val ...@@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val
} }
} }
// Effect: If remaining_flags is non-null, this function performs any required uniqueness checks.
// Otherwise, the caller is responsible for performing them.
static int static int
do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], DB *src_db, const DBT *src_key, bool indexer_shortcut) { do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], uint32_t *remaining_flags, DB *src_db, const DBT *src_key, bool indexer_shortcut) {
TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; int r = 0;
for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { for (uint32_t which_db = 0; which_db < num_dbs; which_db++) {
DB *db = db_array[which_db]; DB *db = db_array[which_db];
...@@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], ...@@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[],
} }
if (do_put) { if (do_put) {
for (uint32_t i = 0; i < keys[which_db].size; i++) { for (uint32_t i = 0; i < keys[which_db].size; i++) {
// if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the int flags = 0;
// indexers cursor. we have to get the src_db from the indexer and find it in the db_array. if (remaining_flags != nullptr) {
toku_ft_maybe_insert(db->i->ft_handle, flags = remaining_flags[which_db];
&keys[which_db].dbts[i], &vals[which_db].dbts[i], invariant(!(flags & DB_NOOVERWRITE_NO_ERROR));
ttxn, false, ZERO_LSN, false, FT_INSERT); }
r = db_put(db, txn, &keys[which_db].dbts[i], &vals[which_db].dbts[i], flags, false);
if (r != 0) {
goto done;
}
} }
} }
} }
} }
return 0; done:
return r;
} }
static int static int
...@@ -754,20 +776,14 @@ env_put_multiple_internal( ...@@ -754,20 +776,14 @@ env_put_multiple_internal(
r = db_put_check_size_constraints(db, &put_key, &put_val); r = db_put_check_size_constraints(db, &put_key, &put_val);
if (r != 0) goto cleanup; if (r != 0) goto cleanup;
//Check overwrite constraints
r = db_put_check_overwrite_constraint(db, txn,
&put_key,
lock_flags[which_db], remaining_flags[which_db]);
if (r != 0) goto cleanup;
if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) {
//put_multiple does not support delaying the no error, since we would //put_multiple does not support delaying the no error, since we would
//have to log the flag in the put_multiple. //have to log the flag in the put_multiple.
r = EINVAL; goto cleanup; r = EINVAL; goto cleanup;
} }
//Do locking if necessary. Do not grab the lock again if this DB had a unique //Do locking if necessary.
//check performed because the lock was already grabbed by its cursor callback. if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE) && !(remaining_flags[which_db] & DB_NOOVERWRITE)) {
//Needs locking //Needs locking
r = toku_db_get_point_write_lock(db, txn, &put_key); r = toku_db_get_point_write_lock(db, txn, &put_key);
if (r != 0) goto cleanup; if (r != 0) goto cleanup;
...@@ -790,8 +806,10 @@ env_put_multiple_internal( ...@@ -790,8 +806,10 @@ env_put_multiple_internal(
} }
} }
toku_multi_operation_client_lock(); toku_multi_operation_client_lock();
log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, remaining_flags, src_db, src_key, indexer_shortcut);
r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key, indexer_shortcut); if (r == 0) {
log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts);
}
toku_multi_operation_client_unlock(); toku_multi_operation_client_unlock();
if (indexer_lock_taken) { if (indexer_lock_taken) {
toku_indexer_unlock(indexer); toku_indexer_unlock(indexer);
...@@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, ...@@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn,
// recovery so we don't end up losing data. // recovery so we don't end up losing data.
// So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry. // So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry.
log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts); log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts);
r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, src_db, new_src_key, indexer_shortcut); r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, nullptr, src_db, new_src_key, indexer_shortcut);
} }
toku_multi_operation_client_unlock(); toku_multi_operation_client_unlock();
if (indexer_lock_taken) { if (indexer_lock_taken) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment