#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "portability/toku_portability.h"
#include "portability/memory.h"
#include "portability/toku_assert.h"
#include "portability/toku_portability.h"
#include "portability/toku_pthread.h"
#include "ft/ft-internal.h" // ugly but pragmatic, need access to dirty bits while holding translation lock
// TODO: reorganize this dependency
// ugly but pragmatic, need access to dirty bits while holding translation lock
// TODO: Refactor this (possibly with FT-301)
#include "ft/ft-internal.h"
// TODO: reorganize this dependency (FT-303)
#include "ft/ft-ops.h" // for toku_maybe_truncate_file
#include "ft/serialize/block_table.h"
#include "ft/serialize/rbuf.h"
...
...
@@ -103,124 +106,164 @@ PATENT RIGHTS GRANT:
#include "ft/serialize/block_allocator.h"
#include "util/nb_mutex.h"
#include "util/scoped_malloc.h"
// indicates the end of a freelist
staticconstBLOCKNUMfreelist_null={-1};
// value of block_translation_pair.size if blocknum is unused
staticconstDISKOFFsize_is_free=(DISKOFF)-1;
// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock
staticconstDISKOFFdiskoff_unused=(DISKOFF)-2;
voidblock_table::_mutex_lock(){
toku_mutex_lock(&_mutex);
}
voidblock_table::_mutex_unlock(){
toku_mutex_unlock(&_mutex);
}
// TODO: Move lock to FT
voidtoku_ft_lock(FTft){
block_table*bt=&ft->blocktable;
bt->_mutex_lock();
}
// TODO: Move lock to FT
voidtoku_ft_unlock(FTft){
block_table*bt=&ft->blocktable;
toku_mutex_assert_locked(&bt->_mutex);
bt->_mutex_unlock();
}
// There are two headers: the reserve must fit them both and be suitably aligned.
structtranslation{//This is the BTT (block translation table)
enumtranslation_typetype;
int64_tlength_of_array;//Number of elements in array (block_translation). always >= smallest_never_used_blocknum
BLOCKNUMsmallest_never_used_blocknum;
BLOCKNUMblocknum_freelist_head;// next (previously used) unused blocknum (free list)
structblock_translation_pair*block_translation;
// Where and how big is the block translation vector stored on disk.
// size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
// location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
};
staticconstBLOCKNUMfreelist_null={-1};// in a freelist, this indicates end of list
staticconstDISKOFFsize_is_free=(DISKOFF)-1;// value of block_translation_pair.size if blocknum is unused
staticconstDISKOFFdiskoff_unused=(DISKOFF)-2;// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock
/********
* There are three copies of the translation table (btt) in the block table:
*
* checkpointed Is initialized by deserializing from disk,
* and is the only version ever read from disk.
* When read from disk it is copied to current.
* It is immutable. It can be replaced by an inprogress btt.
*
* inprogress Is only filled by copying from current,
* and is the only version ever serialized to disk.
* (It is serialized to disk on checkpoint and clean shutdown.)
* At end of checkpoint it replaces 'checkpointed'.
* During a checkpoint, any 'pending' dirty writes will update
* inprogress.
*
* current Is initialized by copying from checkpointed,
* is the only version ever modified while the database is in use,
* and is the only version ever copied to inprogress.
* It is never stored on disk.
********/
structblock_table{
structtranslationcurrent;// The current translation is the one used by client threads. It is not represented on disk.
structtranslationinprogress;// the translation used by the checkpoint currently in progress. If the checkpoint thread allocates a block, it must also update the current translation.
structtranslationcheckpointed;// the translation for the data that shall remain inviolate on disk until the next checkpoint finishes, after which any blocks used only in this translation can be freed.
// The in-memory data structure for block allocation. There is no on-disk data structure for block allocation.
// Note: This is *allocation* not *translation*. The bt_block_allocator is unaware of which blocks are used for which translation, but simply allocates and deallocates blocks.
// Effect: Fills wbuf (which starts uninitialized) with bt
// Effect: Serializes the blocktable to a wbuf (which starts uninitialized)
// A clean shutdown runs checkpoint start so that current and inprogress are copies.
// The resulting wbuf buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed)
// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
// It *is* guaranteed that we can read up to the next 512-byte boundary, however
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "Copyright (c) 2007-2014 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <db.h>
#include "portability/toku_stdint.h"
#include "portability/toku_pthread.h"
structft;
#include "ft/serialize/block_allocator.h"
#include "util/nb_mutex.h"
// Handle type: pointer to a struct block_table.
typedef struct block_table *BLOCK_TABLE;
// Forward declaration of struct ft.
struct ft;
// Block number: a struct wrapping a single int64_t (field b).
typedef struct blocknum_s {
    int64_t b;
} BLOCKNUM;

// Returns a BLOCKNUM whose b field holds the given value.
static inline BLOCKNUM make_blocknum(int64_t b) {
    BLOCKNUM result = {.b = b};
    return result;
}

// BLOCKNUM constant whose b field is 0.
static const BLOCKNUM ROLLBACK_NONE = {.b = 0};
// Offset in a disk. -1 is the 'null' pointer.
typedef int64_t DISKOFF;
// Needed by tests, ftdump
structblock_translation_pair{
union{// If in the freelist, use next_free_blocknum, otherwise diskoff.
DISKOFFdiskoff;
BLOCKNUMnext_free_blocknum;
}u;
DISKOFFsize;// set to 0xFFFFFFFFFFFFFFFF for free
// Unmovable reserved first, then reallocable.
// We reserve one blocknum for the translation table itself.