Commit ab996be6, authored by Barry Perlman, committed by Yoni Fogel

[t:2892] Merge upgrade logic to main. Merge command was svn merge...

[t:2892] Merge upgrade logic to main.  Merge command was svn merge --accept=postpone -r25293:HEAD ../tokudb.main+2892 .

git-svn-id: file:///svn/toku/tokudb@25303 c7de825b-a66e-492c-adef-691d508d4ae1
parent ee289ba5
...@@ -232,6 +232,7 @@ typedef enum { ...@@ -232,6 +232,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -234,6 +234,7 @@ typedef enum { ...@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -234,6 +234,7 @@ typedef enum { ...@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -234,6 +234,7 @@ typedef enum { ...@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -235,6 +235,7 @@ typedef enum { ...@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -72,6 +72,7 @@ enum { ...@@ -72,6 +72,7 @@ enum {
TOKUDB_NO_DATA = -100008, TOKUDB_NO_DATA = -100008,
TOKUDB_ACCEPT = -100009, TOKUDB_ACCEPT = -100009,
TOKUDB_MVCC_DICTIONARY_TOO_NEW = -100010, TOKUDB_MVCC_DICTIONARY_TOO_NEW = -100010,
TOKUDB_UPGRADE_FAILURE = -100011,
}; };
static void print_defines (void) { static void print_defines (void) {
...@@ -218,6 +219,7 @@ static void print_defines (void) { ...@@ -218,6 +219,7 @@ static void print_defines (void) {
dodefine(TOKUDB_NO_DATA); dodefine(TOKUDB_NO_DATA);
dodefine(TOKUDB_ACCEPT); dodefine(TOKUDB_ACCEPT);
dodefine(TOKUDB_MVCC_DICTIONARY_TOO_NEW); dodefine(TOKUDB_MVCC_DICTIONARY_TOO_NEW);
dodefine(TOKUDB_UPGRADE_FAILURE);
/* LOADER flags */ /* LOADER flags */
printf("/* LOADER flags */\n"); printf("/* LOADER flags */\n");
......
...@@ -235,6 +235,7 @@ typedef enum { ...@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -235,6 +235,7 @@ typedef enum { ...@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008 #define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009 #define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 #define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */ /* LOADER flags */
#define LOADER_USE_PUTS 1 #define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ /* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
...@@ -748,8 +748,7 @@ static void ...@@ -748,8 +748,7 @@ static void
translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, //Location of translation_buffer DISKOFF location_on_disk, //Location of translation_buffer
u_int64_t size_on_disk, u_int64_t size_on_disk,
unsigned char * translation_buffer, unsigned char * translation_buffer) { // buffer with serialized translation
BOOL invert_checksum) { // buffer with serialized translation
assert(location_on_disk!=0); assert(location_on_disk!=0);
t->type = TRANSLATION_CHECKPOINTED; t->type = TRANSLATION_CHECKPOINTED;
{ {
...@@ -758,9 +757,6 @@ translation_deserialize_from_buffer(struct translation *t, // destination int ...@@ -758,9 +757,6 @@ translation_deserialize_from_buffer(struct translation *t, // destination int
u_int64_t offset = size_on_disk - 4; u_int64_t offset = size_on_disk - 4;
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset)); u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
if (invert_checksum) {
x1764 = ~x1764;
}
assert(x1764 == stored_x1764); assert(x1764 == stored_x1764);
} }
struct rbuf rt; struct rbuf rt;
...@@ -808,10 +804,9 @@ void ...@@ -808,10 +804,9 @@ void
toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, toku_blocktable_create_from_buffer(BLOCK_TABLE *btp,
DISKOFF location_on_disk, //Location of translation_buffer DISKOFF location_on_disk, //Location of translation_buffer
DISKOFF size_on_disk, DISKOFF size_on_disk,
unsigned char *translation_buffer, unsigned char *translation_buffer) {
BOOL invert_checksum) {
BLOCK_TABLE bt = blocktable_create_internal(); BLOCK_TABLE bt = blocktable_create_internal();
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer, invert_checksum); translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer);
blocktable_note_translation(bt->block_allocator, &bt->checkpointed); blocktable_note_translation(bt->block_allocator, &bt->checkpointed);
// we just filled in checkpointed, now copy it to current. // we just filled in checkpointed, now copy it to current.
copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT); copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT);
......
...@@ -21,7 +21,7 @@ struct block_translation_pair { ...@@ -21,7 +21,7 @@ struct block_translation_pair {
}; };
void toku_blocktable_create_new(BLOCK_TABLE *btp); void toku_blocktable_create_new(BLOCK_TABLE *btp);
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer, BOOL invert_checksum); void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
void toku_blocktable_destroy(BLOCK_TABLE *btp); void toku_blocktable_destroy(BLOCK_TABLE *btp);
void toku_brtheader_lock(struct brt_header *h); void toku_brtheader_lock(struct brt_header *h);
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "brt_layout_version.h"
#include "toku_assert.h" #include "toku_assert.h"
#include "block_allocator.h" #include "block_allocator.h"
#include "cachetable.h" #include "cachetable.h"
...@@ -44,7 +45,7 @@ enum { BUFFER_HEADER_SIZE = (4 // height// ...@@ -44,7 +45,7 @@ enum { BUFFER_HEADER_SIZE = (4 // height//
struct subtree_estimates { struct subtree_estimates {
// estimate number of rows in the tree by counting the number of rows // estimate number of rows in the tree by counting the number of rows
// in the leaves. The stuff in the internal nodes is likely to be off O(1). // in the leaves. The stuff in the internal nodes is likely to be off O(1).
u_int64_t nkeys; // number of distinct keys. u_int64_t nkeys; // number of distinct keys (obsolete with removal of dupsort, but not worth removing)
u_int64_t ndata; // number of key-data pairs (previously leafentry_estimate) u_int64_t ndata; // number of key-data pairs (previously leafentry_estimate)
u_int64_t dsize; // total size of leafentries u_int64_t dsize; // total size of leafentries
BOOL exact; // are the estimates exact? BOOL exact; // are the estimates exact?
...@@ -82,7 +83,6 @@ struct brtnode_nonleaf_childinfo { ...@@ -82,7 +83,6 @@ struct brtnode_nonleaf_childinfo {
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */ unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
}; };
typedef struct brtnode *BRTNODE;
/* Internal nodes. */ /* Internal nodes. */
struct brtnode { struct brtnode {
unsigned int nodesize; unsigned int nodesize;
...@@ -121,6 +121,7 @@ struct brtnode { ...@@ -121,6 +121,7 @@ struct brtnode {
} n; } n;
struct leaf { struct leaf {
struct subtree_estimates leaf_stats; // actually it is exact. struct subtree_estimates leaf_stats; // actually it is exact.
uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade
OMT buffer; OMT buffer;
LEAFLOCK_POOL leaflock_pool; LEAFLOCK_POOL leaflock_pool;
LEAFLOCK leaflock; LEAFLOCK leaflock;
...@@ -166,7 +167,7 @@ struct brt_header { ...@@ -166,7 +167,7 @@ struct brt_header {
int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging) int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging)
int layout_version_read_from_disk; // transient, not serialized to disk int layout_version_read_from_disk; // transient, not serialized to disk
BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been) BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been)
uint64_t num_blocks_to_upgrade; // Number of blocks still not newest version. When we release layout 13 we may need to turn this to an array. int64_t num_blocks_to_upgrade; // Number of v12 blocks still not newest version. When we release layout 14 we may need to turn this to an array or add more variables.
unsigned int nodesize; unsigned int nodesize;
BLOCKNUM root; // roots of the dictionary BLOCKNUM root; // roots of the dictionary
struct remembered_hash root_hash; // hash of the root offset. struct remembered_hash root_hash; // hash of the root offset.
...@@ -269,7 +270,7 @@ struct brtenv { ...@@ -269,7 +270,7 @@ struct brtenv {
}; };
extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, long size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint); extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, long size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint);
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs); extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, int*dirty, void*extraargs);
extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn); extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn);
extern int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header **header, BOOL* was_open); extern int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header **header, BOOL* was_open);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash); extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash);
...@@ -352,21 +353,6 @@ void toku_verify_all_in_mempool(BRTNODE node); ...@@ -352,21 +353,6 @@ void toku_verify_all_in_mempool(BRTNODE node);
int toku_verify_brtnode (BRT brt, BLOCKNUM blocknum, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) ; int toku_verify_brtnode (BRT brt, BLOCKNUM blocknum, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) ;
enum brt_layout_version_e {
BRT_LAYOUT_VERSION_5 = 5,
BRT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatase flags #333
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
};
void toku_brtheader_free (struct brt_header *h); void toku_brtheader_free (struct brt_header *h);
int toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **error_string, BOOL oplsn_valid, LSN oplsn); int toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **error_string, BOOL oplsn_valid, LSN oplsn);
int toku_brtheader_begin_checkpoint (CACHEFILE cachefile, int fd, LSN checkpoint_lsn, void *header_v); int toku_brtheader_begin_checkpoint (CACHEFILE cachefile, int fd, LSN checkpoint_lsn, void *header_v);
...@@ -380,9 +366,10 @@ int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p); ...@@ -380,9 +366,10 @@ int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p);
typedef struct brt_upgrade_status { typedef struct brt_upgrade_status {
u_int64_t header; u_int64_t header_12; // how many headers upgrade from version 12
u_int64_t nonleaf; u_int64_t nonleaf_12;
u_int64_t leaf; u_int64_t leaf_12;
u_int64_t optimized_for_upgrade_12; // how many optimize_for_upgrade messages sent
} BRT_UPGRADE_STATUS_S, *BRT_UPGRADE_STATUS; } BRT_UPGRADE_STATUS_S, *BRT_UPGRADE_STATUS;
void toku_brt_get_upgrade_status(BRT_UPGRADE_STATUS); void toku_brt_get_upgrade_status(BRT_UPGRADE_STATUS);
......
This diff is collapsed.
...@@ -174,6 +174,10 @@ message are not gorged. (But they may be hungry or too fat or too thin.) ...@@ -174,6 +174,10 @@ message are not gorged. (But they may be hungry or too fat or too thin.)
#include "roll.h" #include "roll.h"
#include "toku_atomic.h" #include "toku_atomic.h"
static const uint32_t this_version = BRT_LAYOUT_VERSION;
void void
toku_brt_header_suppress_rollbacks(struct brt_header *h, TOKUTXN txn) { toku_brt_header_suppress_rollbacks(struct brt_header *h, TOKUTXN txn) {
TXNID txnid = toku_txn_get_txnid(txn); TXNID txnid = toku_txn_get_txnid(txn);
...@@ -296,6 +300,12 @@ calc_leaf_stats (BRTNODE node) { ...@@ -296,6 +300,12 @@ calc_leaf_stats (BRTNODE node) {
return e; return e;
} }
// Effect: Overwrite the leaf statistics stored in node with the value returned by calc_leaf_stats(node).
// Requires: node is a leaf, i.e. node->height == 0 (checked with invariant()).
void
toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
// leaf_stats is reached through u.l, so the node must be at height 0 before it is touched
invariant(node->height==0);
node->u.l.leaf_stats = calc_leaf_stats(node);
}
static void __attribute__((__unused__)) static void __attribute__((__unused__))
brt_leaf_check_leaf_stats (BRTNODE node) brt_leaf_check_leaf_stats (BRTNODE node)
{ {
...@@ -483,13 +493,16 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename ...@@ -483,13 +493,16 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename
} }
//fd is protected (must be holding fdlock) //fd is protected (must be holding fdlock)
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs) { int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash,
void **brtnode_pv, long *sizep, int *dirtyp, void *extraargs) {
lazy_assert(extraargs); lazy_assert(extraargs);
struct brt_header *h = extraargs; struct brt_header *h = extraargs;
BRTNODE *result=(BRTNODE*)brtnode_pv; BRTNODE *result=(BRTNODE*)brtnode_pv;
int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h); int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h);
if (r == 0) if (r == 0) {
*sizep = brtnode_memory_size(*result); *sizep = brtnode_memory_size(*result);
*dirtyp = (*result)->dirty;
}
//(*result)->parent_brtnode = 0; /* Don't know it right now. */ //(*result)->parent_brtnode = 0; /* Don't know it right now. */
//printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename); //printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
return r; return r;
...@@ -656,6 +669,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, size_ ...@@ -656,6 +669,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, size_
n->u.n.childkeys=0; n->u.n.childkeys=0;
} else { } else {
n->u.l.leaf_stats = zero_estimates; n->u.l.leaf_stats = zero_estimates;
n->u.l.optimized_for_upgrade = 0;
int r; int r;
r = toku_omt_create(&n->u.l.buffer); r = toku_omt_create(&n->u.l.buffer);
lazy_assert_zero(r); lazy_assert_zero(r);
...@@ -1646,6 +1660,9 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd, ...@@ -1646,6 +1660,9 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
lazy_assert(toku_omt_size(node->u.l.buffer) == omt_size); lazy_assert(toku_omt_size(node->u.l.buffer) == omt_size);
break; break;
case BRT_OPTIMIZE_FOR_UPGRADE:
node->dirty = 1;
node->u.l.optimized_for_upgrade = *((uint32_t*)(cmd->u.id.val->data)); // record version of software that sent the optimize_for_upgrade message
case BRT_OPTIMIZE: case BRT_OPTIMIZE:
// Apply to all leafentries // Apply to all leafentries
idx = 0; idx = 0;
...@@ -1893,6 +1910,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd, ...@@ -1893,6 +1910,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
case BRT_COMMIT_BROADCAST_TXN: case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN: case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE: case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
return brt_nonleaf_cmd_all (t, node, cmd, re_array, did_io); // send message to all children return brt_nonleaf_cmd_all (t, node, cmd, re_array, did_io); // send message to all children
case BRT_NONE: case BRT_NONE:
break; break;
...@@ -2601,14 +2619,33 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina ...@@ -2601,14 +2619,33 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina
return r; return r;
} }
static int brt_optimize (BRT brt, BOOL upgrade);
// Effect: Optimize the brt. // Effect: Optimize the brt.
int int
toku_brt_optimize (BRT brt) { toku_brt_optimize (BRT brt) {
int r = brt_optimize(brt, FALSE);
return r;
}
// Effect: Optimize the brt in upgrade mode.
//  Thin wrapper: delegates to brt_optimize() with upgrade==TRUE.
// Returns: whatever brt_optimize() returns.
int
toku_brt_optimize_for_upgrade (BRT brt) {
    return brt_optimize(brt, TRUE);
}
static int
brt_optimize (BRT brt, BOOL upgrade) {
int r = 0; int r = 0;
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
TXNID oldest = toku_logger_get_oldest_living_xid(logger);
XIDS root_xids = xids_get_root_xids(); TXNID oldest = TXNID_NONE_LIVING;
if (!upgrade) {
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
oldest = toku_logger_get_oldest_living_xid(logger);
}
XIDS root_xids = xids_get_root_xids();
XIDS message_xids; XIDS message_xids;
if (oldest == TXNID_NONE_LIVING) { if (oldest == TXNID_NONE_LIVING) {
message_xids = root_xids; message_xids = root_xids;
...@@ -2622,8 +2659,16 @@ toku_brt_optimize (BRT brt) { ...@@ -2622,8 +2659,16 @@ toku_brt_optimize (BRT brt) {
DBT val; DBT val;
toku_init_dbt(&key); toku_init_dbt(&key);
toku_init_dbt(&val); toku_init_dbt(&val);
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}}; if (upgrade) {
r = toku_brt_root_put_cmd(brt, &brtcmd); // maybe there's a better place than the val dbt to put the version, but it seems harmless and is convenient
toku_fill_dbt(&val, &this_version, sizeof(this_version));
BRT_MSG_S brtcmd = { BRT_OPTIMIZE_FOR_UPGRADE, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
else {
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
xids_destroy(&message_xids); xids_destroy(&message_xids);
return r; return r;
} }
......
...@@ -68,6 +68,8 @@ int toku_brt_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn) __attribute__ ((warn ...@@ -68,6 +68,8 @@ int toku_brt_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn) __attribute__ ((warn
int toku_brt_optimize (BRT brt) __attribute__ ((warn_unused_result)); int toku_brt_optimize (BRT brt) __attribute__ ((warn_unused_result));
int toku_brt_optimize_for_upgrade (BRT brt) __attribute__ ((warn_unused_result));
// Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery. // Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
// Returns 0 if successful // Returns 0 if successful
int toku_brt_maybe_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn, BOOL oplsn_valid, LSN oplsn, int do_logging, enum brt_msg_type type) __attribute__ ((warn_unused_result)); int toku_brt_maybe_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn, BOOL oplsn_valid, LSN oplsn, int do_logging, enum brt_msg_type type) __attribute__ ((warn_unused_result));
...@@ -176,7 +178,9 @@ enum brt_header_flags { ...@@ -176,7 +178,9 @@ enum brt_header_flags {
//TOKU_DB_DUP = (1<<0), //Obsolete #2862 //TOKU_DB_DUP = (1<<0), //Obsolete #2862
//TOKU_DB_DUPSORT = (1<<1), //Obsolete #2862 //TOKU_DB_DUPSORT = (1<<1), //Obsolete #2862
TOKU_DB_KEYCMP_BUILTIN = (1<<2), TOKU_DB_KEYCMP_BUILTIN = (1<<2),
//TOKU_DB_VALCMP_BUILTIN = (1<<3), #if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
TOKU_DB_VALCMP_BUILTIN_12 = (1<<3),
#endif
}; };
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) __attribute__ ((warn_unused_result)); int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) __attribute__ ((warn_unused_result));
...@@ -238,6 +242,8 @@ BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_r ...@@ -238,6 +242,8 @@ BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_r
#define TOKU_MULTIPLE_MAIN_THREADS 0 #define TOKU_MULTIPLE_MAIN_THREADS 0
#endif #endif
void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node);
int toku_brt_strerror_r(int error, char *buf, size_t buflen); int toku_brt_strerror_r(int error, char *buf, size_t buflen);
// Effect: Like the XSI-compliant strerror_r, extended to db_strerror(). // Effect: Like the XSI-compliant strerror_r, extended to db_strerror().
// If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message. // If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message.
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef BRT_LAYOUT_VERSION_H
#define BRT_LAYOUT_VERSION_H
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
//Must be defined before other recursive headers could include logger.h
// Layout version numbers for the BRT format.
// Each BRT_LAYOUT_VERSION_<n> enumerator has the value <n>; its trailing comment
// summarizes what changed relative to the previous version.
// To add a version, insert a new BRT_LAYOUT_VERSION_<n> line immediately before
// BRT_NEXT_VERSION; BRT_LAYOUT_VERSION then follows automatically.
enum brt_layout_version_e {
BRT_LAYOUT_VERSION_5 = 5,
BRT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatabase flags #333
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current as of Beta 1.0.6
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC, deprecated TOKU_DB_VALCMP_BUILTIN(_12)
BRT_NEXT_VERSION, // the version after the current version (no initializer, so it is the preceding enumerator + 1, i.e. 14 here)
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line: always equals the last explicitly numbered version above (13 here).
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
};
#endif
...@@ -120,6 +120,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -120,6 +120,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
case BRT_COMMIT_BROADCAST_TXN: printf("COMMIT_BROADCAST_TXN"); goto ok; case BRT_COMMIT_BROADCAST_TXN: printf("COMMIT_BROADCAST_TXN"); goto ok;
case BRT_ABORT_BROADCAST_TXN: printf("ABORT_BROADCAST_TXN"); goto ok; case BRT_ABORT_BROADCAST_TXN: printf("ABORT_BROADCAST_TXN"); goto ok;
case BRT_OPTIMIZE: printf("OPTIMIZE"); goto ok; case BRT_OPTIMIZE: printf("OPTIMIZE"); goto ok;
case BRT_OPTIMIZE_FOR_UPGRADE: printf("OPTIMIZE_FOR_UPGRADE"); goto ok;
} }
printf("HUH?"); printf("HUH?");
ok: ok:
...@@ -139,6 +140,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -139,6 +140,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
} else { } else {
struct subtree_estimates *est = &n->u.l.leaf_stats; struct subtree_estimates *est = &n->u.l.leaf_stats;
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }\n", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F"); printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }\n", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F");
printf(" optimized_for_upgrade=%u\n", n->u.l.optimized_for_upgrade);
printf(" n_bytes_in_buffer=%u\n", n->u.l.n_bytes_in_buffer); printf(" n_bytes_in_buffer=%u\n", n->u.l.n_bytes_in_buffer);
printf(" items_in_buffer =%u\n", toku_omt_size(n->u.l.buffer)); printf(" items_in_buffer =%u\n", toku_omt_size(n->u.l.buffer));
if (dump_data) toku_omt_iterate(n->u.l.buffer, print_le, 0); if (dump_data) toku_omt_iterate(n->u.l.buffer, print_le, 0);
......
...@@ -2227,6 +2227,9 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc) ...@@ -2227,6 +2227,9 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc)
lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
putbuf_int32(&lbuf->dbuf, 0); // optimized_for_upgrade
lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map
lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4;
......
...@@ -19,6 +19,7 @@ extern "C" { ...@@ -19,6 +19,7 @@ extern "C" {
#endif #endif
typedef struct brt *BRT; typedef struct brt *BRT;
typedef struct brtnode *BRTNODE;
struct brt_header; struct brt_header;
struct wbuf; struct wbuf;
struct dbuf; struct dbuf;
...@@ -96,7 +97,8 @@ enum brt_msg_type { ...@@ -96,7 +97,8 @@ enum brt_msg_type {
BRT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction). BRT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction).
BRT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction). BRT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction).
BRT_INSERT_NO_OVERWRITE = 11, BRT_INSERT_NO_OVERWRITE = 11,
BRT_OPTIMIZE = 12, BRT_OPTIMIZE = 12, // Broadcast
BRT_OPTIMIZE_FOR_UPGRADE = 13, // same as BRT_OPTIMIZE, but record version number in leafnode
}; };
typedef struct xids_t *XIDS; typedef struct xids_t *XIDS;
......
...@@ -1092,6 +1092,8 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) { ...@@ -1092,6 +1092,8 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
void *toku_value = 0; void *toku_value = 0;
long size = 0; long size = 0;
int dirty = 0;
WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key)); WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key));
...@@ -1100,7 +1102,9 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) { ...@@ -1100,7 +1102,9 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
int r; int r;
if (toku_cachefile_is_dev_null_unlocked(cf)) r = -1; if (toku_cachefile_is_dev_null_unlocked(cf)) r = -1;
else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, extraargs); else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, &dirty, extraargs);
if (dirty)
p->dirty = CACHETABLE_DIRTY;
cachetable_lock(ct); cachetable_lock(ct);
rwlock_read_unlock(&cf->fdlock); rwlock_read_unlock(&cf->fdlock);
......
...@@ -122,7 +122,7 @@ typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void ...@@ -122,7 +122,7 @@ typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void
// Returns: 0 if success, otherwise an error number. The address and size of the object // Returns: 0 if success, otherwise an error number. The address and size of the object
// associated with the key are returned. // associated with the key are returned.
// Can access fd (fd is protected by a readlock during call) // Can access fd (fd is protected by a readlock during call)
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs); typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs);
void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata, void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata,
int (*log_fassociate_during_checkpoint)(CACHEFILE, void*), int (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
......
...@@ -111,7 +111,7 @@ struct __attribute__ ((__packed__)) leafentry { ...@@ -111,7 +111,7 @@ struct __attribute__ ((__packed__)) leafentry {
typedef struct leafentry *LEAFENTRY; typedef struct leafentry *LEAFENTRY;
typedef struct leafentry_12 *LEAFENTRY_12;
u_int32_t toku_le_crc(LEAFENTRY v); u_int32_t toku_le_crc(LEAFENTRY v);
...@@ -173,7 +173,6 @@ le_clean(uint8_t *key, uint32_t keylen, ...@@ -173,7 +173,6 @@ le_clean(uint8_t *key, uint32_t keylen,
struct dbuf *d); struct dbuf *d);
//Callback contract: //Callback contract:
// Function checks to see if id is accepted by context. // Function checks to see if id is accepted by context.
// Returns: // Returns:
...@@ -187,6 +186,15 @@ int le_iterate_is_empty(LEAFENTRY le, LE_ITERATE_CALLBACK f, BOOL *is_empty, TOK ...@@ -187,6 +186,15 @@ int le_iterate_is_empty(LEAFENTRY le, LE_ITERATE_CALLBACK f, BOOL *is_empty, TOK
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, u_int32_t *vallenp, TOKUTXN context); int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, u_int32_t *vallenp, TOKUTXN context);
size_t
leafentry_disksize_12(LEAFENTRY_12 le);
int
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p);
#if defined(__cplusplus) || defined(__cilkplusplus) #if defined(__cplusplus) || defined(__cilkplusplus)
}; };
#endif #endif
......
...@@ -38,7 +38,7 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); } ...@@ -38,7 +38,7 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); }
void toku_set_lsn_increment (uint64_t incr) __attribute__((__visibility__("default"))); void toku_set_lsn_increment (uint64_t incr) __attribute__((__visibility__("default")));
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir); int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, BOOL * upgrade_in_progress);
uint64_t toku_log_upgrade_get_footprint(void); uint64_t toku_log_upgrade_get_footprint(void);
......
This diff is collapsed.
...@@ -89,7 +89,7 @@ static int lc_open_logfile(TOKULOGCURSOR lc, int index) { ...@@ -89,7 +89,7 @@ static int lc_open_logfile(TOKULOGCURSOR lc, int index) {
r = toku_read_logmagic(lc->cur_fp, &version); r = toku_read_logmagic(lc->cur_fp, &version);
if (r!=0) if (r!=0)
return DB_BADFORMAT; return DB_BADFORMAT;
if (version != TOKU_LOG_VERSION) if (version < TOKU_LOG_MIN_SUPPORTED_VERSION || version > TOKU_LOG_VERSION)
return DB_BADFORMAT; return DB_BADFORMAT;
} }
// mark as open // mark as open
...@@ -379,6 +379,7 @@ int toku_logcursor_first(TOKULOGCURSOR lc, struct log_entry **le) { ...@@ -379,6 +379,7 @@ int toku_logcursor_first(TOKULOGCURSOR lc, struct log_entry **le) {
return r; return r;
} }
//get last entry in the logfile specified by logcursor
int toku_logcursor_last(TOKULOGCURSOR lc, struct log_entry **le) { int toku_logcursor_last(TOKULOGCURSOR lc, struct log_entry **le) {
int r=0; int r=0;
if ( lc->entry_valid ) { if ( lc->entry_valid ) {
...@@ -462,6 +463,7 @@ static int lc_fix_bad_logfile(TOKULOGCURSOR lc) { ...@@ -462,6 +463,7 @@ static int lc_fix_bad_logfile(TOKULOGCURSOR lc) {
r = fseek(lc->cur_fp, 0, SEEK_SET); if ( r!=0 ) return r; r = fseek(lc->cur_fp, 0, SEEK_SET); if ( r!=0 ) return r;
r = toku_read_logmagic(lc->cur_fp, &version); if ( r!=0 ) return r; r = toku_read_logmagic(lc->cur_fp, &version); if ( r!=0 ) return r;
if (version != TOKU_LOG_VERSION) return -1;
toku_off_t last_good_pos; toku_off_t last_good_pos;
last_good_pos = ftello(lc->cur_fp); last_good_pos = ftello(lc->cur_fp);
......
...@@ -79,17 +79,20 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir) { ...@@ -79,17 +79,20 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir) {
return ENOMEM; return ENOMEM;
} }
// find the index // find the index
// basename is the filename of the i-th logfile
basename = strrchr(logfiles[i], '/') + 1; basename = strrchr(logfiles[i], '/') + 1;
int version; int version;
r = sscanf(basename, "log%lld.tokulog%d", &index, &version); r = sscanf(basename, "log%lld.tokulog%d", &index, &version);
assert(r==2); // found index and version assert(r==2); // found index and version
assert(version==TOKU_LOG_VERSION); assert(version>=TOKU_LOG_MIN_SUPPORTED_VERSION);
assert(version<=TOKU_LOG_VERSION);
lf_info->index = index; lf_info->index = index;
// find last LSN lf_info->version = version;
// find last LSN in logfile
r = toku_logcursor_create_for_file(&cursor, log_dir, basename); r = toku_logcursor_create_for_file(&cursor, log_dir, basename);
if (r!=0) if (r!=0)
return r; return r;
r = toku_logcursor_last(cursor, &entry); r = toku_logcursor_last(cursor, &entry); // set "entry" to last log entry in logfile
if ( r == 0 ) { if ( r == 0 ) {
lf_info->maxlsn = toku_log_entry_get_lsn(entry); lf_info->maxlsn = toku_log_entry_get_lsn(entry);
tmp_lsn = lf_info->maxlsn; tmp_lsn = lf_info->maxlsn;
......
...@@ -15,6 +15,7 @@ extern "C" { ...@@ -15,6 +15,7 @@ extern "C" {
struct toku_logfile_info { struct toku_logfile_info {
int64_t index; int64_t index;
LSN maxlsn; LSN maxlsn;
uint32_t version;
}; };
typedef struct toku_logfile_info *TOKULOGFILEINFO; typedef struct toku_logfile_info *TOKULOGFILEINFO;
......
...@@ -10,7 +10,7 @@ static const int log_format_version=TOKU_LOG_VERSION; ...@@ -10,7 +10,7 @@ static const int log_format_version=TOKU_LOG_VERSION;
static int open_logfile (TOKULOGGER logger); static int open_logfile (TOKULOGGER logger);
static int toku_logger_write_buffer (TOKULOGGER logger, LSN *fsynced_lsn); static int toku_logger_write_buffer (TOKULOGGER logger, LSN *fsynced_lsn);
static int delete_logfile(TOKULOGGER logger, long long index); static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version);
static void grab_output(TOKULOGGER logger, LSN *fsynced_lsn); static void grab_output(TOKULOGGER logger, LSN *fsynced_lsn);
static void release_output(TOKULOGGER logger, LSN fsynced_lsn); static void release_output(TOKULOGGER logger, LSN fsynced_lsn);
...@@ -573,10 +573,40 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu ...@@ -573,10 +573,40 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu
return r; return r;
} }
// TODO: Put this in portability layer when ready
// Effect: Locate the last component of a '/'-separated path.
// in: pathname, which may or may not carry a directory prefix
// return: pointer into pathname just past its last '/', or pathname itself
//         when it contains no '/' (so a pathname ending in '/' yields "").
static char * fileleafname(char *pathname) {
    char *last_slash = strrchr(pathname, '/');
    if (last_slash == NULL)
        return pathname;       // no directory prefix: the whole string is the leaf
    return last_slash + 1;     // leaf starts right after the final delimiter
}
static int logfilenamecompare (const void *ap, const void *bp) { static int logfilenamecompare (const void *ap, const void *bp) {
char *a=*(char**)ap; char *a=*(char**)ap;
char *a_leafname = fileleafname(a);
char *b=*(char**)bp; char *b=*(char**)bp;
return strcmp(a,b); char * b_leafname = fileleafname(b);
int rval;
BOOL valid;
uint64_t num_a = 0; // placate compiler
uint64_t num_b = 0;
uint32_t ver_a = 0;
uint32_t ver_b = 0;
valid = is_a_logfile_any_version(a_leafname, &num_a, &ver_a);
invariant(valid);
valid = is_a_logfile_any_version(b_leafname, &num_b, &ver_b);
invariant(valid);
if (ver_a < ver_b) rval = -1;
else if (ver_a > ver_b) rval = +1;
else if (num_a < num_b) rval = -1;
else if (num_a > num_b) rval = +1;
else rval = 0;
return rval;
} }
// Return the log files in sorted order // Return the log files in sorted order
...@@ -596,8 +626,9 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo ...@@ -596,8 +626,9 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
} }
int dirnamelen = strlen(directory); int dirnamelen = strlen(directory);
while ((de=readdir(d))) { while ((de=readdir(d))) {
long long thisl; uint64_t thisl;
if ( !(is_a_logfile(de->d_name, &thisl)) ) continue; //#2424: Skip over files that don't match the exact logfile template uint32_t version_ignore;
if ( !(is_a_logfile_any_version(de->d_name, &thisl, &version_ignore)) ) continue; //#2424: Skip over files that don't match the exact logfile template
if (n_results+1>=result_limit) { if (n_results+1>=result_limit) {
result_limit*=2; result_limit*=2;
result = toku_realloc(result, result_limit*sizeof(*result)); result = toku_realloc(result, result_limit*sizeof(*result));
...@@ -610,8 +641,12 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo ...@@ -610,8 +641,12 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
snprintf(fname, fnamelen, "%s/%s", directory, de->d_name); snprintf(fname, fnamelen, "%s/%s", directory, de->d_name);
result[n_results++] = fname; result[n_results++] = fname;
} }
// Return them in increasing order, as defined by logfilenamecompare.
// The array being sorted holds char* slots, so qsort's element width is the size of one
// pointer. It is unrelated to the length of the file names the slots point to, so names of
// different lengths ("xxx.tokulog2" vs "xxx.tokulog13") need no special allowance here.
qsort(result, n_results, sizeof(result[0]), logfilenamecompare);
*resultp = result; *resultp = result;
*n_logfiles = n_results; *n_logfiles = n_results;
result[n_results]=0; // make a trailing null result[n_results]=0; // make a trailing null
...@@ -644,6 +679,7 @@ static int open_logfile (TOKULOGGER logger) ...@@ -644,6 +679,7 @@ static int open_logfile (TOKULOGGER logger)
return ENOMEM; return ENOMEM;
lf_info->index = index; lf_info->index = index;
lf_info->maxlsn = logger->written_lsn; lf_info->maxlsn = logger->written_lsn;
lf_info->version = TOKU_LOG_VERSION;
toku_logfilemgr_add_logfile_info(logger->logfilemgr, lf_info); toku_logfilemgr_add_logfile_info(logger->logfilemgr, lf_info);
} }
logger->fsynced_lsn = logger->written_lsn; logger->fsynced_lsn = logger->written_lsn;
...@@ -651,12 +687,12 @@ static int open_logfile (TOKULOGGER logger) ...@@ -651,12 +687,12 @@ static int open_logfile (TOKULOGGER logger)
return 0; return 0;
} }
static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version)
// Effect: Remove the file "<logger->directory>/log<index>.tokulog<version>",
//   where index is zero-padded to 12 digits.
// Returns: whatever remove() returns (0 on success).
// Entry and Exit: This thread has permission to modify the output.
{
    // 50 extra bytes cover "/log", the index digits, ".tokulog", the version digits and the NUL.
    int fnamelen = strlen(logger->directory)+50;
    char fname[fnamelen];
    // version is unsigned, so it is formatted with %u.
    snprintf(fname, fnamelen, "%s/log%012lld.tokulog%u", logger->directory, index, (unsigned) version);
    int r = remove(fname);
    return r;
}
...@@ -675,7 +711,9 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn) ...@@ -675,7 +711,9 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
if ( logger->write_log_files && logger->trim_log_files) { if ( logger->write_log_files && logger->trim_log_files) {
while ( n_logfiles > 1 ) { // don't delete current logfile while ( n_logfiles > 1 ) { // don't delete current logfile
uint32_t log_version;
lf_info = toku_logfilemgr_get_oldest_logfile_info(lfm); lf_info = toku_logfilemgr_get_oldest_logfile_info(lfm);
log_version = lf_info->version;
if ( lf_info->maxlsn.lsn > trim_lsn.lsn ) { if ( lf_info->maxlsn.lsn > trim_lsn.lsn ) {
// file contains an open LSN, can't delete this or any newer log files // file contains an open LSN, can't delete this or any newer log files
break; break;
...@@ -684,7 +722,7 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn) ...@@ -684,7 +722,7 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
long index = lf_info->index; long index = lf_info->index;
toku_logfilemgr_delete_oldest_logfile_info(lfm); toku_logfilemgr_delete_oldest_logfile_info(lfm);
n_logfiles--; n_logfiles--;
r = delete_logfile(logger, index); r = delete_logfile(logger, index, log_version);
if (r!=0) { if (r!=0) {
break; break;
} }
...@@ -1329,7 +1367,7 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s) { ...@@ -1329,7 +1367,7 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s) {
int int
toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found) { toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found) {
BOOL found = FALSE; BOOL found = FALSE;
uint32_t single_version = 0; uint32_t highest_version = 0;
int r = 0; int r = 0;
struct dirent *de; struct dirent *de;
...@@ -1338,16 +1376,17 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint ...@@ -1338,16 +1376,17 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
r = errno; r = errno;
} }
else { else {
// Examine every file in the directory and assert that all log files are of the same version (single_version). // Examine every file in the directory and find highest version
while ((de=readdir(d))) { while ((de=readdir(d))) {
uint32_t this_log_version; uint32_t this_log_version;
uint64_t this_log_number; uint64_t this_log_number;
BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version); BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version);
if (is_log) {
    // Keep a running maximum: the first log file found seeds it, and later
    // ones replace it only when their version is strictly higher.
    if (!found || this_log_version > highest_version)
        highest_version = this_log_version;
    found = TRUE;
}
} }
} }
...@@ -1358,7 +1397,7 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint ...@@ -1358,7 +1397,7 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
if (r==0) { if (r==0) {
*found_any_logs = found; *found_any_logs = found;
if (found) if (found)
*version_found = single_version; *version_found = highest_version;
} }
return r; return r;
} }
......
...@@ -9,12 +9,19 @@ ...@@ -9,12 +9,19 @@
extern "C" { extern "C" {
#endif #endif
#include "brt_layout_version.h"
enum { enum {
TOKU_LOG_VERSION_1 = 1, TOKU_LOG_VERSION_1 = 1,
TOKU_LOG_VERSION_2 = 2, TOKU_LOG_VERSION_2 = 2,
TOKU_LOG_NEXT_VERSION, // the version after the current version //After 2 we linked the log version to the BRT_LAYOUT VERSION.
TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line. //So it went from 2 to 13 (3-12 do not exist)
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2 TOKU_LOG_VERSION = BRT_LAYOUT_VERSION, //Linked
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION > BRT_LAYOUT_VERSION_12 //linked once we remove support for 12
TOKU_LOG_MIN_SUPPORTED_VERSION = BRT_LAYOUT_MIN_SUPPORTED_VERSION,
#else
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2,
#endif
}; };
#define ROLLBACK_CACHEFILE_NAME "tokudb.rollback" #define ROLLBACK_CACHEFILE_NAME "tokudb.rollback"
......
...@@ -474,7 +474,7 @@ static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM ...@@ -474,7 +474,7 @@ static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM
} }
static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash, static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash,
void **rollback_pv, long *sizep, void *extraargs) { void **rollback_pv, long *sizep, int * UU(dirtyp), void *extraargs) {
int r; int r;
struct brt_header *h = extraargs; struct brt_header *h = extraargs;
assert(h->cf == cachefile); assert(h->cf == cachefile);
......
...@@ -46,7 +46,7 @@ flush (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), void *value, void *UU ...@@ -46,7 +46,7 @@ flush (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), void *value, void *UU
} }
static int static int
fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), void *UU(extraargs)) fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), int *UU(dirtyp), void *UU(extraargs))
{ {
assert(0); // should not be called assert(0); // should not be called
return 0; return 0;
......
...@@ -19,12 +19,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext ...@@ -19,12 +19,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
if (keep_me) n_keep_me++; if (keep_me) n_keep_me++;
} }
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) { static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs; cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
assert(0); // should not be called assert(0); // should not be called
n_fetch++; n_fetch++;
*value = 0; *value = 0;
*sizep = item_size; *sizep = item_size;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
return 0; return 0;
......
...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
return 0; return 0;
......
...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
return 0; return 0;
......
...@@ -17,10 +17,11 @@ flush (CACHEFILE cf __attribute__((__unused__)), ...@@ -17,10 +17,11 @@ flush (CACHEFILE cf __attribute__((__unused__)),
} }
static int static int
fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, void *extra) { fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, int *dirtyp, void *extra) {
cf = cf; hash = hash; extra = extra; cf = cf; hash = hash; extra = extra;
*sizep = (long) key.b; *sizep = (long) key.b;
*vptr = toku_malloc(*sizep); *vptr = toku_malloc(*sizep);
*dirtyp = 0;
return 0; return 0;
} }
...@@ -31,6 +32,7 @@ fetch_error (CACHEFILE cf __attribute__((__unused__)), ...@@ -31,6 +32,7 @@ fetch_error (CACHEFILE cf __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void*extraargs __attribute__((__unused__)) void*extraargs __attribute__((__unused__))
) { ) {
return -1; return -1;
......
...@@ -22,12 +22,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext ...@@ -22,12 +22,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
if (keep_me) n_keep_me++; if (keep_me) n_keep_me++;
} }
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) { static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs; cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
n_fetch++; n_fetch++;
sleep(10); sleep(10);
*value = 0; *value = 0;
*sizep = item_size; *sizep = item_size;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return -42; return -42;
} }
......
...@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = toku_malloc(1); *value = toku_malloc(1);
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -39,6 +39,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -39,6 +39,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value, void **value,
long *sizep, long *sizep,
int *dirtyp,
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -47,6 +48,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -47,6 +48,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -32,7 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -32,7 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return -42; return -42;
} }
......
...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
...@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0; *value = 0;
*sizep = 1; *sizep = 1;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -75,6 +75,7 @@ static int r_fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -75,6 +75,7 @@ static int r_fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void**value __attribute__((__unused__)), void**value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void*extraargs __attribute__((__unused__))) { void*extraargs __attribute__((__unused__))) {
// fprintf(stderr, "Whoops, this should never be called"); // fprintf(stderr, "Whoops, this should never be called");
return -42; return -42;
......
...@@ -33,6 +33,7 @@ static int f_fetch (CACHEFILE f, ...@@ -33,6 +33,7 @@ static int f_fetch (CACHEFILE f,
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void**value, void**value,
long *sizep, long *sizep,
int *dirtyp,
void*extraargs __attribute__((__unused__))) { void*extraargs __attribute__((__unused__))) {
void *buf = toku_malloc(BLOCKSIZE); void *buf = toku_malloc(BLOCKSIZE);
int r = pread(toku_cachefile_get_and_pin_fd(f), buf, BLOCKSIZE, key.b); int r = pread(toku_cachefile_get_and_pin_fd(f), buf, BLOCKSIZE, key.b);
...@@ -40,6 +41,7 @@ static int f_fetch (CACHEFILE f, ...@@ -40,6 +41,7 @@ static int f_fetch (CACHEFILE f,
assert(r==BLOCKSIZE); assert(r==BLOCKSIZE);
*value = buf; *value = buf;
*sizep = BLOCKSIZE; *sizep = BLOCKSIZE;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -144,12 +144,13 @@ static struct item *make_item (u_int64_t key) { ...@@ -144,12 +144,13 @@ static struct item *make_item (u_int64_t key) {
} }
static CACHEKEY did_fetch={-1}; static CACHEKEY did_fetch={-1};
static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), void*extraargs) { static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) {
if (verbose) printf("Fetch %" PRId64 "\n", key.b); if (verbose) printf("Fetch %" PRId64 "\n", key.b);
assert (expect_f==f); assert (expect_f==f);
assert((long)extraargs==23); assert((long)extraargs==23);
*value = make_item(key.b); *value = make_item(key.b);
*sizep = test_object_size; *sizep = test_object_size;
*dirtyp = 0;
did_fetch=key; did_fetch=key;
return 0; return 0;
} }
...@@ -308,9 +309,11 @@ static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEK ...@@ -308,9 +309,11 @@ static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEK
} }
static int fetch_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)), static int fetch_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void**value, long *sizep __attribute__((__unused__)), void*extraargs) { void**value, long *sizep __attribute__((__unused__)),
int * dirtyp, void*extraargs) {
assert((long)extraargs==42); assert((long)extraargs==42);
*value=0; *value=0;
*dirtyp = 0;
return 0; return 0;
} }
...@@ -369,17 +372,19 @@ static void null_flush (CACHEFILE cf __attribute__((__unused__)), ...@@ -369,17 +372,19 @@ static void null_flush (CACHEFILE cf __attribute__((__unused__)),
BOOL for_checkpoint __attribute__((__unused__))) { BOOL for_checkpoint __attribute__((__unused__))) {
} }
static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) { static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(fullhash==toku_cachetable_hash(cf,key)); assert(fullhash==toku_cachetable_hash(cf,key));
assert((long)extraargs==123); assert((long)extraargs==123);
*value = (void*)((unsigned long)key.b+123L); *value = (void*)((unsigned long)key.b+123L);
*dirtyp = 0;
return 0; return 0;
} }
static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) { static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(fullhash==toku_cachetable_hash(cf,key)); assert(fullhash==toku_cachetable_hash(cf,key));
assert((long)extraargs==222); assert((long)extraargs==222);
*value = (void*)((unsigned long)key.b+222L); *value = (void*)((unsigned long)key.b+222L);
*dirtyp = 0;
return 0; return 0;
} }
...@@ -443,8 +448,9 @@ static void test_dirty_flush(CACHEFILE f, ...@@ -443,8 +448,9 @@ static void test_dirty_flush(CACHEFILE f,
if (verbose) printf("test_dirty_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size, (unsigned)do_write, (unsigned)keep); if (verbose) printf("test_dirty_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size, (unsigned)do_write, (unsigned)keep);
} }
static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, void *arg) { static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, int * dirtyp, void *arg) {
*value_ptr = arg; *value_ptr = arg;
*dirtyp = 0;
assert(fullhash==toku_cachetable_hash(f,key)); assert(fullhash==toku_cachetable_hash(f,key));
if (verbose) printf("test_dirty_fetch %p %" PRId64 " %p %ld %p\n", f, key.b, *value_ptr, *size_ptr, arg); if (verbose) printf("test_dirty_fetch %p %" PRId64 " %p %ld %p\n", f, key.b, *value_ptr, *size_ptr, arg);
return 0; return 0;
......
...@@ -112,10 +112,11 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), ...@@ -112,10 +112,11 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)),
//print_ints(); //print_ints();
} }
static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), void*extraargs) { static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(toku_cachetable_hash(f, key)==fullhash); assert(toku_cachetable_hash(f, key)==fullhash);
assert((long)extraargs==(long)key.b); assert((long)extraargs==(long)key.b);
*value = (void*)(long)key.b; *value = (void*)(long)key.b;
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)), ...@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)), void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)), long *sizep __attribute__((__unused__)),
int *dirtyp,
void *extraargs __attribute__((__unused__)) void *extraargs __attribute__((__unused__))
) { ) {
*dirtyp = 0;
return 0; return 0;
} }
......
...@@ -313,7 +313,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) { ...@@ -313,7 +313,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
XIDS xids = brt_msg_get_xids(msg); XIDS xids = brt_msg_get_xids(msg);
invariant(xids_get_num_xids(xids) < MAX_TRANSACTION_RECORDS); invariant(xids_get_num_xids(xids) < MAX_TRANSACTION_RECORDS);
enum brt_msg_type type = brt_msg_get_type(msg); enum brt_msg_type type = brt_msg_get_type(msg);
if (type != BRT_OPTIMIZE) { if (type != BRT_OPTIMIZE && type != BRT_OPTIMIZE_FOR_UPGRADE) {
ule_do_implicit_promotions(ule, xids); ule_do_implicit_promotions(ule, xids);
} }
switch (type) { switch (type) {
...@@ -342,6 +342,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) { ...@@ -342,6 +342,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
ule_apply_commit(ule, xids); ule_apply_commit(ule, xids);
break; break;
case BRT_OPTIMIZE: case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
ule_optimize(ule, xids); ule_optimize(ule, xids);
break; break;
default: default:
...@@ -358,7 +359,7 @@ test_msg_modify_ule(ULE ule, BRT_MSG msg){ ...@@ -358,7 +359,7 @@ test_msg_modify_ule(ULE ule, BRT_MSG msg){
static void ule_optimize(ULE ule, XIDS xids) { static void ule_optimize(ULE ule, XIDS xids) {
if (ule->num_puxrs) { if (ule->num_puxrs) {
TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid; TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid; // outermost uncommitted
TXNID oldest_living_xid = TXNID_NONE; TXNID oldest_living_xid = TXNID_NONE;
uint32_t num_xids = xids_get_num_xids(xids); uint32_t num_xids = xids_get_num_xids(xids);
if (num_xids > 0) { if (num_xids > 0) {
...@@ -2018,3 +2019,202 @@ bool transaction_open(TXNID xid) { ...@@ -2018,3 +2019,202 @@ bool transaction_open(TXNID xid) {
#endif #endif
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
// Packed layout of a version 12 leafentry, as decoded by le_unpack_12().
// le_unpack_12() passes keylen and innermost_inserted_vallen through
// toku_dtoh32(), and xid_outermost_uncommitted through toku_dtoh64(), before use.
// le_unpack_12() selects the union member from num_xrs:
//   num_xrs == 1 -> u.comm (key bytes followed by value bytes)
//   otherwise    -> u.prov (extra header fields, then key, value and records)
struct __attribute__ ((__packed__)) leafentry_12 {
u_int8_t num_xrs; // number of transaction records; le_unpack_12() asserts it is > 0
u_int32_t keylen; // length of the key in bytes
u_int32_t innermost_inserted_vallen; // length of the innermost inserted value in bytes
union {
struct __attribute__ ((__packed__)) leafentry_committed_12 {
u_int8_t key_val[0]; //Actual key, then actual val
} comm;
struct __attribute__ ((__packed__)) leafentry_provisional_12 {
u_int8_t innermost_type; // record type of the innermost transaction record
TXNID xid_outermost_uncommitted; // xid assigned to uxrs[1] by le_unpack_12()
u_int8_t key_val_xrs[]; //Actual key,
//then actual innermost inserted val,
//then transaction records.
} prov;
} u;
};
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
// Computes the number of bytes that the version 12 packed form of an unpacked
// leafentry occupies.
//   ule: unpacked leafentry; the record count is num_cuxrs + num_puxrs.
//   returns: total byte count, as itemized term by term below.
//Requires:
//  Leafentry that ule represents should not be destroyed (is not just all deletes)
static size_t
le_memsize_from_ule_12 (ULE ule) {
    uint32_t num_uxrs = ule->num_cuxrs + ule->num_puxrs;
    assert(num_uxrs);
    size_t rval;
    if (num_uxrs == 1) {
        // Single record: it must be an insert, and only key and value follow the header.
        assert(uxr_is_insert(&ule->uxrs[0]));
        rval = 1                      //num_uxrs
              +4                      //keylen
              +4                      //vallen
              +ule->keylen            //actual key
              +ule->uxrs[0].vallen;   //actual val
    }
    else {
        rval = 1                      //num_uxrs
              +4                      //keylen
              +ule->keylen            //actual key
              +1*num_uxrs             //types
              +8*(num_uxrs-1);        //txnids
        // The index has the same width as num_uxrs: a u_int8_t counter would
        // wrap at 255 and the loop would never terminate for a larger count.
        uint32_t i;
        for (i = 0; i < num_uxrs; i++) {
            UXR uxr = &ule->uxrs[i];
            if (uxr_is_insert(uxr)) {
                rval += 4;            //vallen
                rval += uxr->vallen;  //actual val
            }
        }
    }
    return rval;
}
// Unpacks a version 12 packed leafentry (le) into an unpacked leafentry (ule).
//   ule: output; uxrs is pointed at ule->uxrs_static and filled in below.
//   le:  input; ule->keyp and every uxrs[].valp set here point into le's own
//        bytes (no key or value data is copied by this function).
// vallen/valp are assigned only for records for which uxr_is_insert() is true.
//This function is mostly copied from 4.1.1
// Note, number of transaction records in version 12 has been replaced by separate counters in version 13 (MVCC),
// one counter for committed transaction records and one counter for provisional transaction records. When
// upgrading a version 12 le to version 13, the number of committed transaction records is always set to one (1)
// and the number of provisional transaction records is set to the original number of transaction records
// minus one. The bottom transaction record is assumed to be a committed value. (If there is no committed
// value then the bottom transaction record of version 12 is a committed delete.)
// This is the only change from the 4.1.1 code. The rest of the leafentry is read as is.
static void
le_unpack_12(ULE ule, LEAFENTRY_12 le) {
//Read num_uxrs
uint8_t num_xrs = le->num_xrs;
assert(num_xrs > 0);
ule->uxrs = ule->uxrs_static; //Static version is always enough.
// Version 12 has a single record count; split it as 1 committed + the rest provisional.
ule->num_cuxrs = 1;
ule->num_puxrs = num_xrs - 1;
//Read the keylen
ule->keylen = toku_dtoh32(le->keylen);
//Read the vallen of innermost insert
u_int32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen);
// p is a read cursor into le; after unpacking it ends one byte past the leafentry.
u_int8_t *p;
if (num_xrs == 1) {
//Unpack a 'committed leafentry' (No uncommitted transactions exist)
ule->keyp = le->u.comm.key_val;
ule->uxrs[0].type = XR_INSERT; //Must be or the leafentry would not exist
ule->uxrs[0].vallen = vallen_of_innermost_insert;
ule->uxrs[0].valp = &le->u.comm.key_val[ule->keylen];
ule->uxrs[0].xid = 0; //Required.
//Set p to immediately after leafentry
p = &le->u.comm.key_val[ule->keylen + vallen_of_innermost_insert];
}
else {
//Unpack a 'provisional leafentry' (Uncommitted transactions exist)
//Read in type.
u_int8_t innermost_type = le->u.prov.innermost_type;
assert(!uxr_type_is_placeholder(innermost_type));
//Read in xid
TXNID xid_outermost_uncommitted = toku_dtoh64(le->u.prov.xid_outermost_uncommitted);
//Read pointer to key
ule->keyp = le->u.prov.key_val_xrs;
//Read pointer to innermost inserted val (immediately after key)
u_int8_t *valp_of_innermost_insert = &le->u.prov.key_val_xrs[ule->keylen];
//Point p to immediately after 'header'
p = &le->u.prov.key_val_xrs[ule->keylen + vallen_of_innermost_insert];
// The first insert met while walking inner to outer takes its value from the header.
BOOL found_innermost_insert = FALSE;
int i; //Index in ULE.uxrs[]
//Loop inner to outer
for (i = num_xrs - 1; i >= 0; i--) {
UXR uxr = &ule->uxrs[i];
//Innermost's type is in header.
if (i < num_xrs - 1) {
//Not innermost, so load the type.
uxr->type = *p;
p += 1;
}
else {
//Innermost, load the type previously read from header
uxr->type = innermost_type;
}
//Committed txn id is implicit (0). (i==0)
//Outermost uncommitted txnid is stored in header. (i==1)
if (i > 1) {
//Not committed nor outermost uncommitted, so load the xid.
// NOTE(review): this cast reads a TXNID at whatever byte offset p has reached in
// the packed buffer -- presumably relies on unaligned loads being tolerated; confirm.
uxr->xid = toku_dtoh64(*(TXNID*)p);
p += 8;
}
else if (i == 1) {
//Outermost uncommitted, load the xid previously read from header
uxr->xid = xid_outermost_uncommitted;
}
else {
// i == 0, committed entry
uxr->xid = 0;
}
if (uxr_is_insert(uxr)) {
if (found_innermost_insert) {
//Not the innermost insert. Load vallen/valp
// NOTE(review): same unaligned-read caveat as the TXNID load above -- confirm.
uxr->vallen = toku_dtoh32(*(u_int32_t*)p);
p += 4;
uxr->valp = p;
p += uxr->vallen;
}
else {
//Innermost insert, load the vallen/valp previously read from header
uxr->vallen = vallen_of_innermost_insert;
uxr->valp = valp_of_innermost_insert;
found_innermost_insert = TRUE;
}
}
}
assert(found_innermost_insert);
}
#if ULE_DEBUG
// Debug-only cross-check: the cursor must have consumed exactly the number of
// bytes that le_memsize_from_ule_12() computes for the unpacked result.
size_t memsize = le_memsize_from_ule_12(ule);
assert(p == ((u_int8_t*)le) + memsize);
#endif
}
// Returns the size in bytes of a version 12 packed leafentry: the entry is
// unpacked with le_unpack_12() and the result is measured with
// le_memsize_from_ule_12().
size_t
leafentry_disksize_12(LEAFENTRY_12 le) {
    ULE_S unpacked;
    size_t nbytes;

    le_unpack_12(&unpacked, le);
    nbytes = le_memsize_from_ule_12(&unpacked);
    ule_cleanup(&unpacked);

    return nbytes;
}
// Converts a version 12 packed leafentry into a newly packed leafentry by
// unpacking it with le_unpack_12() and repacking the result with le_pack().
//   old_leafentry:            version 12 leafentry; must be non-NULL (invariant).
//   new_leafentry_memorysize,
//   new_leafentry_disksize,
//   new_leafentry_p:          out-parameters handed straight to le_pack().
//   returns: the value returned by le_pack().
int
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry,
                      size_t *new_leafentry_memorysize,
                      size_t *new_leafentry_disksize,
                      LEAFENTRY *new_leafentry_p) {
    invariant(old_leafentry);

    ULE_S unpacked;
    le_unpack_12(&unpacked, old_leafentry);

    //NULL for omt means that we use malloc instead of mempool
    int r = le_pack(&unpacked,
                    new_leafentry_memorysize,
                    new_leafentry_disksize,
                    new_leafentry_p,
                    NULL, NULL, NULL);

    ule_cleanup(&unpacked);
    return r;
}
#endif
The essential idea of auto-upgrade from BRT_LAYOUT_VERSION 12 to 13 is to
take advantage of the similarities between the two versions, and not to
try to create an infrastructure for all future upgrades.
As future layouts are created, upgrade paths, if any, will be crafted to
each particular change.
On startup, the version number of the recovery log is checked. If an
upgrade is needed, then the log is tested for a clean shutdown. If
there is no clean shutdown, then an error is returned. If the log does
end in a clean shutdown, then a new log file is created with the current
version number, starting with an LSN that is one greater than the clean
shutdown.
Once the new log is in place, the persistent environment dictionary is
upgraded, and then normal operation begins.
The startup of a new version of the storage engine might not be crash
safe.
Dictionaries, including the persistent environment and the fileops
directory, are upgraded as they are read into memory from disk.
The brt header is upgraded by
- removing an unused flag
- setting the transaction id to the xid of the clean shutdown
- marking the header as dirty
Each non-leaf node is upgraded by:
- removing an unused flag
- upgrading the version numbers in the node
- marking the node as dirty.
This works because all of the version 12 messages are unchanged
in version 13. The version 12 messages will be applied to the
leafentries using version 13 code.
Each leaf node is upgraded by
- removing an unused flag
- using modified version 12 code to unpack the version 12 packed
leaf entries into version 13 unpacked leaf entries
- repacking the leafentries into a new mempool
- destroying the original mempool (that holds the version 12
node read from disk)
The node is marked as dirty.
Once the brt is open, a BRT_OPTIMIZE broadcast message is inserted to
optimize the dictionary.
A schematic overview of how a brt node is deserialized:
toku_deserialize_brtnode_from() { // accepts fd, fills in BRTNODE, brt_header
deserialize_brtnode_from_rbuf_versioned() {
deserialize_brtnode_from_rbuf() // accepts rbuf fills in BRTNODE
if nonleaf deserialize_brtnode_nonleaf_from_rbuf(){ // rbuf -> BRTNODE (no version sensitivity)
if leaf deserialize_brtnode_leaf_from_rbuf() { // calculates node size from leafentry sizes
// leafentry sizes vary with version
if version 12 {
if leaf {
unpack each leafentry into a version 13 ule
pack each version 13 ule into version 13 le
allocate new mempool for version 13 les
destroy old mempool
}
remove unused flag
increment version number
mark dirty
}
}
}
Open issues:
- The brt layer makes some callbacks to the handlerton layer. If
any of the functions change from one version to another, then
the result may not be correct. A version number could be
included in all the function signatures so the callback function
could be aware of what version the caller is expecting.
The callbacks are:
- comparator
- hot index generator
- hot column mutator
Note, brt-internal.h defines struct subtree_estimates which contains field nkeys.
This field is obsolete with the removal of dupsort databases (since it will always
be the same as ndata), but removing it is not worth the trouble.
==========
The changes from version 12 to 13 include (may not be complete list):
- Persistent environment dictionary
- version number
- timestamp of environment creation (database installation)
- history of previous versions
- timestamps for upgrades
- Recovery log
- version number
- new log entries (hotindex, maybe others)
- brt header
- version number
- added field (root_xid_that_created), set to last checkpoint lsn
- deleted flag (built-in comparison function for values)
- brt internal node
- version number
- additional message(s) possible, no upgrade needed beyond changing version number
- brt leafnode
- version number
- new leafentry format
- version 12 leafentry unpack code is preserved
- rollback log
- version number is only change, no upgrade is needed because
rollback logs are not preserved through clean shutdown
Because version 12 and version 13 leafentries are significantly
different, leafentries are handled as follows:
- deserialize_brtnode_leaf_from_rbuf()
- sets up array of pointers to leafentries (to be unpacked later),
these pointers are put into an OMT
- calculates checksum (x1764)
- adjusts ndone byte counter to verify that entire rbuf is read
- deserialize_brtnode_from_rbuf_versioned() calls
deserialize_brtnode_leaf_from_rbuf()
- loop through all leafentries, one at a time:
- unpack version 12 le and repack as version 13 le, each in its own malloc'ed memory
- calculate new fingerprint
- create new block
- allocate new mempool
- copy individual les into new mempool
- destroy individual les
- destroy original mempool
Open issues:
- We need to verify clean shutdown before upgrade.
If shutdown was not clean then we would run recovery, and the
code does not support recovering from an old format version.
- One way to do this is to increase the log version number (either
increment or synchronize with BRT_LAYOUT_VERSION).
- Can we just look at the log? needs_recovery(env);
If this mechanism is specific
to the version 12 to 13 upgrade, then that is adequate.
Once the recovery log format changes, then we need a
different mechanism, similar to the 3.x->4.x upgrade
logic in log_upgrade.c.
- How to decide that an upgrade is necessary?
Needed for logic that says:
- If upgrade is necessary, then verify clean shutdown:
If upgrade is necessary (recorded version is old)
and clean shutdown was not done, then exit with
error code.
- tokudb_needs_recovery() is not separate from verification of
clean shutdown. This function indicates if a recovery is
necessary, but it does not verify simple clean shutdown
with just the shutdown log entry. Instead, it looks for
checkpoint begin/checkpoint end. (Also, comment at end
is permitted.)
Proposed solution:
- Decision on whether to perform upgrade is done by examining log version.
- If we need an upgrade:
- If not clean shutdown, then exit with error message, change nothing
on disk.
- If clean shutdown, then create new log by simply creating new log file
(empty, or perhaps with initial comment that says "start of new log").
- Normal log-trimming code will delete old logs. (None of the
locking logic in log_upgrade.c is needed.)
- Log-opening logic needs to be modified to do this. See log file
manager initialization function (and maybe functions it calls),
maybe the log cursor:
- logfilemgr.c: toku_logfilemgr_init()
- Log-trimming logic loops over pairs of file names and LSNs,
deleting old files based on LSN.
- Question: would it help any if the "clean shutdown" log entry
was required to be in a new log file of its own? It would
prevent the creation of an empty log file after "clean shutdown."
It might, but it's probably not worth doing.
Issue of optimize message (to be sent into each dictionary on upgrade)
- BRT_COMMIT_BROADCAST_ALL (should be faster executing, always commits everything, was needed for an earlier upgrade attempt)
- BRT_OPTIMIZE (better tested, has been used, tests to see if transactions are still live)
After upgrade (after clean shutdown, no running transactions, trees
fully flattened), there is no difference in what these two messages do.
Note, BRT_OPTIMIZE requires a clean shutdown if used on upgrade. If used before recovery (which an upgrade
without clean shutdown would do), then it would be wrong because it would appear that all transactions were
completed.
TODO:
- update brt header fields
- original layout version
- version read from disk
- add accountability counters
- capture LSN of clean shutdown, use instead of checkpoint lsn
...@@ -182,9 +182,14 @@ xids_get_serialize_size(XIDS xids){ ...@@ -182,9 +182,14 @@ xids_get_serialize_size(XIDS xids){
return rval; return rval;
} }
// Include TXNID zero in checksum to maintain compatibility
// with previously released version.
void void
toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids) { toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids) {
x1764_add(mm, &xids->num_xids, 1); x1764_add(mm, &xids->num_xids, 1);
TXNID zero = 0;
x1764_add(mm, &zero, 8);
u_int8_t index; u_int8_t index;
u_int8_t num_xids = xids_get_num_xids(xids); u_int8_t num_xids = xids_get_num_xids(xids);
for (index = 0; index < num_xids; index++) { for (index = 0; index < num_xids; index++) {
......
...@@ -37,7 +37,6 @@ TRANSPARENT_UPGRADE_SRCS = $(wildcard upgrade-*.c) ...@@ -37,7 +37,6 @@ TRANSPARENT_UPGRADE_SRCS = $(wildcard upgrade-*.c)
NONSTANDARD_SRCS= \ NONSTANDARD_SRCS= \
$(RECOVER_SRCS) \ $(RECOVER_SRCS) \
$(LOADER_SRCS) \ $(LOADER_SRCS) \
$(TRANSPARENT_UPGRADE_SRCS) \
#end #end
#Tests that don't compile in windows. SHould #Tests that don't compile in windows. SHould
...@@ -179,6 +178,7 @@ BDB_DONTRUN_TESTS = \ ...@@ -179,6 +178,7 @@ BDB_DONTRUN_TESTS = \
update-multiple-nochange \ update-multiple-nochange \
update-multiple-key0 \ update-multiple-key0 \
update-multiple-data-diagonal \ update-multiple-data-diagonal \
upgrade_simple \
upgrade-test-1 \ upgrade-test-1 \
upgrade-test-2 \ upgrade-test-2 \
upgrade-test-3 \ upgrade-test-3 \
......
...@@ -21,6 +21,7 @@ enum {ROWS_PER_TRANSACTION=10000}; ...@@ -21,6 +21,7 @@ enum {ROWS_PER_TRANSACTION=10000};
int NUM_DBS=5; int NUM_DBS=5;
int NUM_ROWS=100000; int NUM_ROWS=100000;
int CHECK_RESULTS=0; int CHECK_RESULTS=0;
int littlenode = 0;
enum { old_default_cachesize=1024 }; // MB enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize; int CACHESIZE=old_default_cachesize;
int ALLOW_DUPS=0; int ALLOW_DUPS=0;
...@@ -112,7 +113,7 @@ static void run_test(void) ...@@ -112,7 +113,7 @@ static void run_test(void)
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r); r = env->checkpointing_set_period(env, 0); CKERR(r);
DBT desc; DBT desc;
dbt_init(&desc, "foo", sizeof("foo")); dbt_init(&desc, "foo", sizeof("foo"));
...@@ -124,6 +125,10 @@ static void run_test(void) ...@@ -124,6 +125,10 @@ static void run_test(void)
for(int i=0;i<NUM_DBS;i++) { for(int i=0;i<NUM_DBS;i++) {
idx[i] = i; idx[i] = i;
r = db_create(&dbs[i], env, 0); CKERR(r); r = db_create(&dbs[i], env, 0); CKERR(r);
if (littlenode) {
    // Request 4096-byte nodes. Check the real return code: CKERR(0) can never
    // fail, so a set_pagesize() error would go unnoticed.
    r = dbs[i]->set_pagesize(dbs[i], 4096);
    CKERR(r);
}
r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r);
dbs[i]->app_private = &idx[i]; dbs[i]->app_private = &idx[i];
snprintf(name, sizeof(name), "db_%04x", i); snprintf(name, sizeof(name), "db_%04x", i);
...@@ -176,7 +181,7 @@ static void do_args(int argc, char * const argv[]) { ...@@ -176,7 +181,7 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-h")==0) { } else if (strcmp(argv[0], "-h")==0) {
resultcode=0; resultcode=0;
do_usage: do_usage:
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd); fprintf(stderr, "Usage: -h -c -n -d <num_dbs> -r <num_rows> %s\n", cmd);
exit(resultcode); exit(resultcode);
} else if (strcmp(argv[0], "-d")==0) { } else if (strcmp(argv[0], "-d")==0) {
argc--; argv++; argc--; argv++;
...@@ -191,6 +196,8 @@ static void do_args(int argc, char * const argv[]) { ...@@ -191,6 +196,8 @@ static void do_args(int argc, char * const argv[]) {
NUM_ROWS = atoi(argv[0]); NUM_ROWS = atoi(argv[0]);
} else if (strcmp(argv[0], "-c")==0) { } else if (strcmp(argv[0], "-c")==0) {
CHECK_RESULTS = 1; CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-n")==0) {
littlenode = 1;
} else { } else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]); fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1; resultcode=1;
......
...@@ -40,7 +40,7 @@ test_main (int argc, char *const argv[]) { ...@@ -40,7 +40,7 @@ test_main (int argc, char *const argv[]) {
dbt_init(&data, there, strlen(there)+1), dbt_init(&data, there, strlen(there)+1),
0); 0);
r=txn->commit(txn, 0); CKERR(r); r=txn->commit(txn, 0); CKERR(r);
r=env->txn_checkpoint(env, 0, 0, 0); r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
} }
{ {
......
...@@ -48,7 +48,7 @@ test_main (int argc, char *const argv[]) { ...@@ -48,7 +48,7 @@ test_main (int argc, char *const argv[]) {
dbt_init(&data, there, strlen(there)+1), dbt_init(&data, there, strlen(there)+1),
0); 0);
r=txn->commit(txn, 0); CKERR(r); r=txn->commit(txn, 0); CKERR(r);
r=env->txn_checkpoint(env, 0, 0, 0); r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
} }
{ {
......
...@@ -20,15 +20,17 @@ DB_ENV *env; ...@@ -20,15 +20,17 @@ DB_ENV *env;
enum {MAX_NAME=128}; enum {MAX_NAME=128};
int NUM_DBS=5; int NUM_DBS=5;
int NUM_ROWS=100000; int NUM_ROWS=100000;
int CHECK_RESULTS=0; int SRC_VERSION = 4;
enum { old_default_cachesize=1024 }; // MB int littlenode = 0;
int CACHESIZE=old_default_cachesize; int flat = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir. char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
char *db_v4_dir_flat = "env_preload.4.1.1.flat.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_1(DB **dbs) { static void upgrade_test_1(DB **dbs) {
int r; int r;
...@@ -64,39 +66,52 @@ static void upgrade_test_1(DB **dbs) { ...@@ -64,39 +66,52 @@ static void upgrade_test_1(DB **dbs) {
} }
} }
static void run_test(void) static void setup(void) {
{
int r; int r;
int len = 256;
char *src_db_dir; char syscmd[len];
if ( SRC_VERSION == 3 ) char * src_db_dir;
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 ) if ( SRC_VERSION == 4 ) {
src_db_dir = db_v4_dir; if (flat)
src_db_dir = db_v4_dir_flat;
else if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else { else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0); assert(0);
} }
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables(); generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r); r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r); r = env->checkpointing_set_period(env, 1); CKERR(r);
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
assert(dbs != NULL); assert(dbs != NULL);
...@@ -117,7 +132,12 @@ static void do_args(int argc, char * const argv[]); ...@@ -117,7 +132,12 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) { int test_main(int argc, char * const *argv) {
do_args(argc, argv); do_args(argc, argv);
run_test(); if (SRC_VERSION == 4) {
littlenode = 1; // 4k nodes, small cache
}
setup();
run_test(); // read, upgrade, write back to disk
run_test(); // read and verify
return 0; return 0;
} }
...@@ -135,7 +155,7 @@ static void do_args(int argc, char * const argv[]) { ...@@ -135,7 +155,7 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-h")==0) { } else if (strcmp(argv[0], "-h")==0) {
resultcode=0; resultcode=0;
do_usage: do_usage:
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd); fprintf(stderr, "Usage: -h -d <num_dbs> -r <num_rows> %s\n", cmd);
exit(resultcode); exit(resultcode);
} else if (strcmp(argv[0], "-d")==0) { } else if (strcmp(argv[0], "-d")==0) {
argc--; argv++; argc--; argv++;
...@@ -148,11 +168,11 @@ static void do_args(int argc, char * const argv[]) { ...@@ -148,11 +168,11 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-r")==0) { } else if (strcmp(argv[0], "-r")==0) {
argc--; argv++; argc--; argv++;
NUM_ROWS = atoi(argv[0]); NUM_ROWS = atoi(argv[0]);
} else if (strcmp(argv[0], "-c")==0) {
CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-V")==0) { } else if (strcmp(argv[0], "-V")==0) {
argc--; argv++; argc--; argv++;
SRC_VERSION = atoi(argv[0]); SRC_VERSION = atoi(argv[0]);
} else if (strcmp(argv[0], "-f")==0) {
flat = 1;
} else { } else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]); fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1; resultcode=1;
......
...@@ -19,14 +19,15 @@ enum {MAX_NAME=128}; ...@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
int NUM_DBS=5; int NUM_DBS=5;
int NUM_ROWS=100000; int NUM_ROWS=100000;
int CHECK_RESULTS=0; int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB int SRC_VERSION = 4;
int CACHESIZE=old_default_cachesize; int littlenode = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir. char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_2(DB **dbs) { static void upgrade_test_2(DB **dbs) {
int r = 0; int r = 0;
...@@ -85,39 +86,52 @@ static void upgrade_test_2(DB **dbs) { ...@@ -85,39 +86,52 @@ static void upgrade_test_2(DB **dbs) {
} }
} }
static void run_test(void)
{
int r;
char *src_db_dir; static void setup(void) {
if ( SRC_VERSION == 3 ) int r;
src_db_dir = db_v3_dir; int len = 256;
else if ( SRC_VERSION == 4 ) char syscmd[len];
src_db_dir = db_v4_dir; char * src_db_dir;
if ( SRC_VERSION == 4 ) {
if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else { else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0); assert(0);
} }
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables(); generate_permute_tables();
}
static void run_test(int checkpoint_period)
{
int r;
r = db_env_create(&env, 0); CKERR(r); r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r); r = env->checkpointing_set_period(env, checkpoint_period); CKERR(r);
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
assert(dbs != NULL); assert(dbs != NULL);
...@@ -136,9 +150,15 @@ static void run_test(void) ...@@ -136,9 +150,15 @@ static void run_test(void)
// ------------ infrastructure ---------- // ------------ infrastructure ----------
static void do_args(int argc, char * const argv[]); static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) { int test_main(int argc, char * const *argv) {
do_args(argc, argv); do_args(argc, argv);
run_test(); if (SRC_VERSION == 4) {
littlenode = 1; // 4k nodes, small cache
}
setup();
run_test(1);
return 0; return 0;
} }
......
...@@ -19,14 +19,15 @@ enum {MAX_NAME=128}; ...@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
int NUM_DBS=5; int NUM_DBS=5;
int NUM_ROWS=100000; int NUM_ROWS=100000;
int CHECK_RESULTS=0; int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB int SRC_VERSION = 4;
int CACHESIZE=old_default_cachesize; int littlenode = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir. char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_3(DB **dbs) { static void upgrade_test_3(DB **dbs) {
int r; int r;
...@@ -87,35 +88,47 @@ static void upgrade_test_3(DB **dbs) { ...@@ -87,35 +88,47 @@ static void upgrade_test_3(DB **dbs) {
} }
} }
static void run_test(void) static void setup(void) {
{
int r; int r;
int len = 256;
char *src_db_dir; char syscmd[len];
if ( SRC_VERSION == 3 ) char * src_db_dir;
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 ) if ( SRC_VERSION == 4 ) {
src_db_dir = db_v4_dir; if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else { else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0); assert(0);
} }
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables(); generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r); r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
...@@ -140,7 +153,16 @@ static void do_args(int argc, char * const argv[]); ...@@ -140,7 +153,16 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) { int test_main(int argc, char * const *argv) {
do_args(argc, argv); do_args(argc, argv);
littlenode = 0;
setup();
run_test(); run_test();
if (SRC_VERSION == 4) {
if (verbose)
printf("Now repeat test with small nodes and small cache.\n");
littlenode = 1; // 4k nodes, small cache
setup();
run_test();
}
return 0; return 0;
} }
......
...@@ -19,15 +19,18 @@ enum {MAX_NAME=128}; ...@@ -19,15 +19,18 @@ enum {MAX_NAME=128};
int NUM_DBS=5; int NUM_DBS=5;
int NUM_ROWS=100000; int NUM_ROWS=100000;
int CHECK_RESULTS=0; int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB int SRC_VERSION = 4;
int CACHESIZE=old_default_cachesize; int littlenode = 0;
enum {ROWS_PER_TRANSACTION=10000};
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir. char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
enum {ROWS_PER_TRANSACTION=10000};
int SRC_VERSION = 4;
static void upgrade_test_4(DB **dbs) { static void upgrade_test_4(DB **dbs) {
int r; int r;
...@@ -122,35 +125,47 @@ static void upgrade_test_4(DB **dbs) { ...@@ -122,35 +125,47 @@ static void upgrade_test_4(DB **dbs) {
} }
} }
static void run_test(void) static void setup(void) {
{
int r; int r;
int len = 256;
char *src_db_dir; char syscmd[len];
if ( SRC_VERSION == 3 ) char * src_db_dir;
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 ) if ( SRC_VERSION == 4 ) {
src_db_dir = db_v4_dir; if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else { else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0); assert(0);
} }
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables(); generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r); r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
...@@ -175,7 +190,17 @@ static void do_args(int argc, char * const argv[]); ...@@ -175,7 +190,17 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) { int test_main(int argc, char * const *argv) {
do_args(argc, argv); do_args(argc, argv);
do_args(argc, argv);
littlenode = 0;
setup();
run_test(); run_test();
if (SRC_VERSION == 4) {
if (verbose)
printf("Now repeat test with small nodes and small cache.\n");
littlenode = 1; // 4k nodes, small cache
setup();
run_test();
}
return 0; return 0;
} }
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2009 Tokutek Inc. All rights reserved."
#ident "$Id: env_startup.c 20778 2010-05-28 20:38:42Z yfogel $"
/* The purpose of this test is to verify the simplest part of the upgrade logic.
 * It starts from two very simple, previously saved 4.x environments,
 * one in each of two states:
 *  - after a clean shutdown
 *  - without a clean shutdown
 *
 * The two different environments are used to exercise the upgrade logic
 * for 5.x.
 *
 */
#include "test.h"
#include <db.h>
/* Environment handle shared by the helpers in this file.  It is NULL when no
 * environment is held: setup() tests it before creating a new one and
 * test_shutdown() resets it to NULL after closing. */
static DB_ENV *env;
/* Environment open flags without transaction or log support.
 * The replacement list is parenthesized so the macro behaves as a single
 * operand when it is combined with other operators (e.g. `x & FLAGS_NOLOG`). */
#define FLAGS_NOLOG (DB_INIT_LOCK|DB_INIT_MPOOL|DB_CREATE|DB_PRIVATE)
/* FLAGS_NOLOG plus transaction and log support; parenthesized for the same reason. */
#define FLAGS_LOG   (FLAGS_NOLOG|DB_INIT_TXN|DB_INIT_LOG)
/* Permission bits handed to env->open(): read/write/execute for user, group and other. */
static int mode = S_IRWXU+S_IRWXG+S_IRWXO;
/* Forward declaration: setup() calls test_shutdown() before it is defined. */
static void test_shutdown(void);
/* Rebuild ENVDIR from one of the saved 4.1.1 environments and try to open it.
 *
 * flags: flags passed through to env->open().
 * clean: when true, the contents of env_simple.4.1.1.cleanshutdown are copied
 *        and the open result is checked with CKERR; otherwise the contents of
 *        env_simple.4.1.1.dirtyshutdown are copied and the open result is
 *        checked with CKERR2 against TOKUDB_UPGRADE_FAILURE.
 *
 * The global `env` holds the newly created environment handle on return.
 */
static void
setup (u_int32_t flags, BOOL clean) {
    /* Close any environment still held from a previous call. */
    if (env) {
        test_shutdown();
    }

    /* Start from an empty environment directory. */
    int rc = system("rm -rf " ENVDIR);
    CKERR(rc);
    rc = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
    CKERR(rc);

    /* Populate it from the requested saved environment. */
    const char *copy_cmd = clean
        ? "cp env_simple.4.1.1.cleanshutdown/* " ENVDIR
        : "cp env_simple.4.1.1.dirtyshutdown/* " ENVDIR;
    rc = system(copy_cmd);
    CKERR(rc);

    rc = db_env_create(&env, 0);
    CKERR(rc);
    env->set_errfile(env, stderr);

    rc = env->open(env, ENVDIR, flags, mode);
    if (!clean) {
        CKERR2(rc, TOKUDB_UPGRADE_FAILURE);
    } else {
        CKERR(rc);
    }
}
/* Close the global environment, check the result with CKERR, and reset the
 * handle to NULL so setup() sees that no environment is held. */
static void
test_shutdown(void) {
    int close_rc = env->close(env, 0);
    CKERR(close_rc);
    env = NULL;
}
/* Run setup() with FLAGS_LOG twice: first on the cleanly shut down
 * environment, then on the one without a clean shutdown.  The engine status
 * is printed and the environment closed after each attempt. */
static void
test_env_startup(void) {
    const u_int32_t open_flags = FLAGS_LOG;

    /* Clean-shutdown environment. */
    setup(open_flags, TRUE);
    print_engine_status(env);
    test_shutdown();

    /* Environment without a clean shutdown. */
    setup(open_flags, FALSE);
    if (verbose)
        printf("\n\nEngine status after aborted env->open() will have some garbage values:\n");
    print_engine_status(env);
    test_shutdown();
}
/* Test entry point: hand the command line to parse_args(), run the
 * environment start-up test, and return 0. */
int
test_main (int argc, char * const argv[]) {
    parse_args(argc, argv);   /* helper defined outside this file */
    test_env_startup();
    return 0;
}
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment