Commit d73e7c40 authored by Bradley C. Kuszmaul, committed by Yoni Fogel

The good stuff from #1884 up to changeset:14213. Refs #1884. [t:1884].

 * Block allocation is now first-fit (see the sketch below).
 * There is a better test for the block allocator.
 * brt_stat64 returns data in a struct instead of a bunch of arguments.
 * Nodes are set clean after serialization.
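
For context, a minimal sketch of the first-fit policy described above (illustrative only, not code from this commit; the names and types here are hypothetical, and alignment is omitted): scan the address-sorted block list and take the first gap large enough for the request, otherwise append after the last block.

/* Illustrative first-fit placement sketch; simplified, not the actual block_allocator code. */
#include <stdint.h>
#include <stddef.h>

struct blockpair_sketch { uint64_t offset, size; };

static uint64_t
first_fit_offset (const struct blockpair_sketch *blocks, size_t n_blocks,
                  uint64_t reserve_at_beginning, uint64_t size) {
    uint64_t candidate = reserve_at_beginning;          // first byte past the reserved header
    for (size_t i = 0; i < n_blocks; i++) {
        if (candidate + size <= blocks[i].offset)
            return candidate;                           // the gap before block i is big enough
        candidate = blocks[i].offset + blocks[i].size;  // otherwise try the gap after block i
    }
    return candidate;                                   // no gap fits: append after the last block
}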


git-svn-id: file:///svn/toku/tokudb@14214 c7de825b-a66e-492c-adef-691d508d4ae1
parent 12dc9822
......@@ -7,7 +7,7 @@
// Here's a very simple implementation.
// It's not very fast at allocating or freeing.
// The previous implementation used next_fit, but we now use first_fit, since we are moving blocks around to reduce file size.
struct blockpair {
u_int64_t offset;
......@@ -20,18 +20,21 @@ struct block_allocator {
u_int64_t n_blocks; // How many blocks
u_int64_t blocks_array_size; // How big is the blocks_array. Must be >= n_blocks.
struct blockpair *blocks_array; // These blocks are sorted by address.
u_int64_t next_fit_counter; // Used for the next_fit algorithm.
u_int64_t n_bytes_in_use; // including the reserve_at_beginning
};
void
block_allocator_validate (BLOCK_ALLOCATOR ba) {
u_int64_t i;
u_int64_t n_bytes_in_use = ba->reserve_at_beginning;
for (i=0; i<ba->n_blocks; i++) {
n_bytes_in_use += ba->blocks_array[i].size;
if (i>0) {
assert(ba->blocks_array[i].offset > ba->blocks_array[i-1].offset);
assert(ba->blocks_array[i].offset >= ba->blocks_array[i-1].offset + ba->blocks_array[i-1].size );
}
}
assert(n_bytes_in_use == ba->n_bytes_in_use);
}
#if 0
......@@ -60,7 +63,7 @@ create_block_allocator (BLOCK_ALLOCATOR *ba, u_int64_t reserve_at_beginning, u_i
result->n_blocks = 0;
result->blocks_array_size = 1;
XMALLOC_N(result->blocks_array_size, result->blocks_array);
result->next_fit_counter = 0;
result->n_bytes_in_use = reserve_at_beginning;
*ba = result;
VALIDATE(result);
}
......@@ -89,6 +92,7 @@ block_allocator_alloc_block_at (BLOCK_ALLOCATOR ba, u_int64_t size, u_int64_t of
assert(offset >= ba->reserve_at_beginning);
grow_blocks_array(ba);
// Just do a linear search for the block
ba->n_bytes_in_use += size;
for (i=0; i<ba->n_blocks; i++) {
if (ba->blocks_array[i].offset > offset) {
// allocate it in that slot
......@@ -110,27 +114,41 @@ block_allocator_alloc_block_at (BLOCK_ALLOCATOR ba, u_int64_t size, u_int64_t of
}
static inline u_int64_t
align (u_int64_t value, BLOCK_ALLOCATOR ba) {
align (u_int64_t value, BLOCK_ALLOCATOR ba)
// Effect: align a value by rounding up.
{
return ((value+ba->alignment-1)/ba->alignment)*ba->alignment;
}
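// Illustration (editor's note, not part of this commit): the round-up works because adding
// alignment-1 pushes any non-multiple past the next boundary before the integer division
// truncates.  For example, with ba->alignment == 512:
//   align(1,   ba) == ((1   + 511)/512)*512 == 512
//   align(512, ba) == ((512 + 511)/512)*512 == 512
//   align(513, ba) == ((513 + 511)/512)*512 == 1024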
void
block_allocator_alloc_block (BLOCK_ALLOCATOR ba, u_int64_t size, u_int64_t *offset) {
grow_blocks_array(ba);
ba->n_bytes_in_use += size;
if (ba->n_blocks==0) {
ba->blocks_array[0].offset = ba->reserve_at_beginning;
assert(ba->n_bytes_in_use == ba->reserve_at_beginning + size); // we know exactly how many are in use
ba->blocks_array[0].offset = align(ba->reserve_at_beginning, ba);
ba->blocks_array[0].size = size;
*offset = ba->reserve_at_beginning;
*offset = ba->blocks_array[0].offset;
ba->n_blocks++;
return;
}
u_int64_t i;
u_int64_t blocknum = ba->next_fit_counter;
// Implement next fit.
for (i=0; i<ba->n_blocks; i++, blocknum++) {
if (blocknum>=ba->n_blocks) blocknum=0;
// Implement first fit.
{
u_int64_t end_of_reserve = align(ba->reserve_at_beginning, ba);
if (end_of_reserve + size <= ba->blocks_array[0].offset ) {
// Check to see if the space immediately after the reserve is big enough to hold the new block.
struct blockpair *bp = &ba->blocks_array[0];
memmove(bp+1, bp, (ba->n_blocks)*sizeof(struct blockpair));
bp[0].offset = end_of_reserve;
bp[0].size = size;
ba->n_blocks++;
*offset = end_of_reserve;
VALIDATE(ba);
return;
}
}
for (u_int64_t blocknum = 0; blocknum +1 < ba->n_blocks; blocknum ++) {
// Consider the space after blocknum
if (blocknum+1 == ba->n_blocks) continue; // Can't use the space after the last block, since that would be new space.
struct blockpair *bp = &ba->blocks_array[blocknum];
u_int64_t this_offset = bp[0].offset;
u_int64_t this_size = bp[0].size;
......@@ -141,7 +159,6 @@ block_allocator_alloc_block (BLOCK_ALLOCATOR ba, u_int64_t size, u_int64_t *offs
bp[1].offset = answer_offset;
bp[1].size = size;
ba->n_blocks++;
ba->next_fit_counter = blocknum;
*offset = answer_offset;
VALIDATE(ba);
return;
......@@ -188,6 +205,7 @@ block_allocator_free_block (BLOCK_ALLOCATOR ba, u_int64_t offset) {
VALIDATE(ba);
int64_t bn = find_block(ba, offset);
assert(bn>=0); // we require that there is a block with that offset. Might as well abort if no such block exists.
ba->n_bytes_in_use -= ba->blocks_array[bn].size;
memmove(&ba->blocks_array[bn], &ba->blocks_array[bn+1], (ba->n_blocks-bn-1) * sizeof(struct blockpair));
ba->n_blocks--;
VALIDATE(ba);
......@@ -208,3 +226,22 @@ block_allocator_allocated_limit (BLOCK_ALLOCATOR ba) {
return last->offset + last->size;
}
}
int
block_allocator_get_nth_block_in_layout_order (BLOCK_ALLOCATOR ba, u_int64_t b, u_int64_t *offset, u_int64_t *size)
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number; return nonzero if b is too big.
{
if (b==0) {
*offset=0;
*size =ba->reserve_at_beginning;
return 0;
} else if (b > ba->n_blocks) {
return -1;
} else {
*offset=ba->blocks_array[b-1].offset;
*size =ba->blocks_array[b-1].size;
return 0;
}
}
......@@ -42,7 +42,7 @@ create_block_allocator (BLOCK_ALLOCATOR * ba, u_int64_t reserve_at_beginning, u_
// Aborts if we run out of memory.
// Parameters
// ba (OUT): Result stored here.
// reserve_at_beginning (IN) Size of reserved block at beginning.
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// alignment (IN) Block alignment.
void
......@@ -73,7 +73,7 @@ block_allocator_alloc_block (BLOCK_ALLOCATOR ba, u_int64_t size, u_int64_t *offs
// The block address will be a multiple of the alignment.
// Parameters:
// ba (IN/OUT): The block allocator. (Modifies ba.)
// size (IN): The size of the block.
// size (IN): The size of the block. (The size does not have to be aligned.)
// offset (OUT): The location of the block.
void
......@@ -109,4 +109,11 @@ block_allocator_allocated_limit (BLOCK_ALLOCATOR ba);
// So we start at the "infinite" block, write the fifo, and then
// allocate_block_at of the correct size and offset to account for the root FIFO.
int
block_allocator_get_nth_block_in_layout_order (BLOCK_ALLOCATOR ba, u_int64_t b, u_int64_t *offset, u_int64_t *size);
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number; return nonzero if b is too big.
// This is probably only useful for tests.
#endif
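
As a quick illustration of the interface declared above, here is a minimal usage sketch (editor's note, not part of this commit; the asserted offsets assume the first-fit placement introduced by this change, a 100-byte reserve, and 4096-byte alignment):

/* Usage sketch only; includes (block_allocator.h, assert.h) and error handling omitted for brevity. */
static void
block_allocator_usage_sketch (void) {
    BLOCK_ALLOCATOR ba;
    u_int64_t a, b;
    create_block_allocator(&ba, 100, 4096);        // 100 reserved bytes, 4096-byte alignment
    block_allocator_alloc_block(ba, 200, &a);      // first block lands at the first aligned offset past the reserve
    block_allocator_alloc_block(ba, 200, &b);      // next block follows at the next aligned offset
    assert(a == 4096 && b == 2*4096);
    block_allocator_free_block(ba, a);             // free the first block ...
    block_allocator_alloc_block(ba, 200, &a);      // ... and first fit reuses that hole
    assert(a == 4096);
    destroy_block_allocator(&ba);
}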
......@@ -4578,12 +4578,12 @@ int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u
return 0;
}
int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), u_int64_t *nkeys, u_int64_t *ndata, u_int64_t *dsize, u_int64_t *fsize) {
int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
{
int64_t file_size;
int r = toku_os_get_file_size(toku_cachefile_fd(brt->cf), &file_size);
assert(r==0);
*fsize = file_size + toku_cachefile_size_in_memory(brt->cf);
s->fsize = file_size + toku_cachefile_size_in_memory(brt->cf);
}
assert(brt->h);
......@@ -4598,17 +4598,17 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), u_int64_t *nkeys, u_int64_t *ndat
BRTNODE node = node_v;
if (node->height==0) {
*nkeys = node->u.l.leaf_stats.nkeys;
*ndata = node->u.l.leaf_stats.ndata;
*dsize = node->u.l.leaf_stats.dsize;
s->nkeys = node->u.l.leaf_stats.nkeys;
s->ndata = node->u.l.leaf_stats.ndata;
s->dsize = node->u.l.leaf_stats.dsize;
} else {
*nkeys = *ndata = *dsize = 0;
s->nkeys = s->ndata = s->dsize = 0;
int i;
for (i=0; i<node->u.n.n_children; i++) {
struct subtree_estimates *se = &BNC_SUBTREE_ESTIMATES(node, i);
*nkeys += se->nkeys;
*ndata += se->ndata;
*dsize += se->dsize;
s->nkeys += se->nkeys;
s->ndata += se->ndata;
s->dsize += se->dsize;
}
}
......
......@@ -137,11 +137,15 @@ enum brt_header_flags {
};
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater);
struct brtstat64_s {
u_int64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate) */
u_int64_t ndata; /* estimate the number of pairs (exact when flattened and committed) */
u_int64_t dsize; /* estimate the sum of the sizes of the pairs (exact when flattened and committed) */
u_int64_t fsize; /* the size of the underlying file */
u_int64_t ffree; /* Number of free bytes in the underlying file */
};
int toku_brt_stat64 (BRT, TOKUTXN,
u_int64_t *nkeys, /* estimate how many unique keys (even when flattened this may be an estimate) */
u_int64_t *ndata, /* estimate the number of pairs (exact when flattened and committed) */
u_int64_t *dsize, /* estimate the sum of the sizes of the pairs (exact when flattened and committed) */
u_int64_t *fsize /* the size of the underlying file */
struct brtstat64_s *stat
);
int toku_brt_init(void (*ydb_lock_callback)(void), void (*ydb_unlock_callback)(void));
......
......@@ -1196,7 +1196,9 @@ int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key,
}
int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, enum cachetable_dirty dirty, long size) {
int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, enum cachetable_dirty dirty, long size)
// size==0 means that the size didn't change.
{
CACHETABLE ct = cachefile->cachetable;
PAIR p;
WHEN_TRACE_CT(printf("%s:%d unpin(%lld)", __FILE__, __LINE__, key));
......
......@@ -22,15 +22,41 @@ static void ba_free (BLOCK_ALLOCATOR ba, u_int64_t offset) {
block_allocator_validate(ba);
}
static void
ba_check_l (BLOCK_ALLOCATOR ba, u_int64_t blocknum_in_layout_order, u_int64_t expected_offset, u_int64_t expected_size)
{
u_int64_t actual_offset, actual_size;
int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==0);
assert(expected_offset == actual_offset);
assert(expected_size == actual_size);
}
static void
ba_check_none (BLOCK_ALLOCATOR ba, u_int64_t blocknum_in_layout_order)
{
u_int64_t actual_offset, actual_size;
int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==-1);
}
// Simple block allocator test
static void
test_ba0 (void) {
BLOCK_ALLOCATOR ba;
u_int64_t b0, b1;
create_block_allocator(&ba, 100, 1);
assert(block_allocator_allocated_limit(ba)==100);
ba_alloc_at(ba, 50, 100);
assert(block_allocator_allocated_limit(ba)==150);
ba_alloc_at(ba, 25, 150);
ba_alloc (ba, 10, &b0);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, 100, 50);
ba_check_l (ba, 2, 150, 25);
ba_check_l (ba, 3, b0, 10);
ba_check_none (ba, 4);
assert(b0==175);
ba_free(ba, 150);
ba_alloc_at(ba, 10, 150);
......@@ -101,6 +127,135 @@ test_ba1 (int n_initial) {
assert(ba==0);
}
// Check which fit policy the allocator uses (first fit vs. the old next fit).
static void
test_ba2 (void)
{
BLOCK_ALLOCATOR ba;
u_int64_t b[6];
enum { BSIZE = 1024 };
create_block_allocator(&ba, 100, BSIZE);
assert(block_allocator_allocated_limit(ba)==100);
ba_check_l (ba, 0, 0, 100);
ba_check_none (ba, 1);
ba_alloc (ba, 100, &b[0]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_none (ba, 2);
ba_alloc (ba, BSIZE+100, &b[1]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_none (ba, 3);
ba_alloc (ba, 100, &b[2]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_none (ba, 4);
ba_alloc (ba, 100, &b[3]);
ba_alloc (ba, 100, &b[4]);
ba_alloc (ba, 100, &b[5]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
ba_free (ba, 4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 5*BSIZE, 100);
ba_check_l (ba, 4, 6*BSIZE, 100);
ba_check_l (ba, 5, 7*BSIZE, 100);
ba_check_none (ba, 6);
u_int64_t b2;
ba_alloc(ba, 100, &b2);
assert(b2==4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
ba_free (ba, BSIZE);
ba_free (ba, 5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 2, 4*BSIZE, 100);
ba_check_l (ba, 3, 6*BSIZE, 100);
ba_check_l (ba, 4, 7*BSIZE, 100);
ba_check_none (ba, 5);
// This alloc will allocate the first block after the reserve space in the case of first fit.
u_int64_t b3;
ba_alloc(ba, 100, &b3);
assert(b3== BSIZE); // First fit.
// if (b3==5*BSIZE) then it is next fit.
// Now 5*BSIZE is free
u_int64_t b5;
ba_alloc(ba, 100, &b5);
assert(b5==5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
// Now all blocks are busy
u_int64_t b6, b7, b8;
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_alloc(ba, 100, &b8);
assert(b6==8*BSIZE);
assert(b7==9*BSIZE);
assert(b8==10*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE+100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_l (ba, 7, 8*BSIZE, 100);
ba_check_l (ba, 8, 9*BSIZE, 100);
ba_check_l (ba, 9, 10*BSIZE, 100);
ba_check_none (ba, 10);
ba_free(ba, 9*BSIZE);
ba_free(ba, 7*BSIZE);
u_int64_t b9;
ba_alloc(ba, 100, &b9);
assert(b9==7*BSIZE);
ba_free(ba, 5*BSIZE);
ba_free(ba, 2*BSIZE);
u_int64_t b10, b11;
ba_alloc(ba, 100, &b10);
assert(b10==2*BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==3*BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==5*BSIZE);
destroy_block_allocator(&ba);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
......@@ -108,5 +263,6 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_
test_ba1(0);
test_ba1(10);
test_ba1(20);
test_ba2();
return 0;
}
......@@ -3558,14 +3558,21 @@ static int toku_db_set_pagesize(DB *db, u_int32_t pagesize) {
static int toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
HANDLE_PANICKED_DB(db);
HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
return toku_brt_stat64(db->i->brt, db_txn_struct_i(txn)->tokutxn, &s->bt_nkeys, &s->bt_ndata, &s->bt_dsize, &s->bt_fsize);
struct brtstat64_s brtstat;
int r = toku_brt_stat64(db->i->brt, db_txn_struct_i(txn)->tokutxn, &brtstat);
if (r==0) {
s->bt_nkeys = brtstat.nkeys;
s->bt_ndata = brtstat.ndata;
s->bt_dsize = brtstat.dsize;
s->bt_fsize = brtstat.fsize;
}
return r;
}
static int locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
toku_ydb_lock();
int r = toku_db_stat64(db, txn, s);
toku_ydb_unlock();
return r;
}
static int toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
......