Commit 57f6beab authored by Rich Prohaska, committed by Yoni Fogel

change block format to support leaf partitions closes[t:2351]

git-svn-id: file:///svn/toku/tokudb@19500 c7de825b-a66e-492c-adef-691d508d4ae1
parent c1a99a1b
...@@ -195,17 +195,17 @@ static unsigned int ...@@ -195,17 +195,17 @@ static unsigned int
toku_serialize_brtnode_size_slow (BRTNODE node) { toku_serialize_brtnode_size_slow (BRTNODE node) {
unsigned int size = node_header_overhead + extended_node_header_overhead; unsigned int size = node_header_overhead + extended_node_header_overhead;
size += toku_serialize_descriptor_size(node->desc); size += toku_serialize_descriptor_size(node->desc);
if (node->height>0) { if (node->height > 0) {
unsigned int hsize=0; unsigned int hsize=0;
unsigned int csize=0; unsigned int csize=0;
size+=4; /* n_children */ size += 4; /* n_children */
size+=4; /* subtree fingerprint. */ size += 4; /* subtree fingerprint. */
size+=4*(node->u.n.n_children-1); /* key lengths*/ size += 4*(node->u.n.n_children-1); /* key lengths*/
if (node->flags & TOKU_DB_DUPSORT) size += 4*(node->u.n.n_children-1); if (node->flags & TOKU_DB_DUPSORT) size += 4*(node->u.n.n_children-1);
for (int i=0; i<node->u.n.n_children-1; i++) { for (int i=0; i<node->u.n.n_children-1; i++) {
csize+=toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]); csize += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
} }
size+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the exact bit for the estimates. */ size += (8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the exact bit for the estimates. */
int n_buffers = node->u.n.n_children; int n_buffers = node->u.n.n_children;
assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1); assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1);
for (int i=0; i< n_buffers; i++) { for (int i=0; i< n_buffers; i++) {
...@@ -222,12 +222,11 @@ toku_serialize_brtnode_size_slow (BRTNODE node) { ...@@ -222,12 +222,11 @@ toku_serialize_brtnode_size_slow (BRTNODE node) {
return size+hsize+csize; return size+hsize+csize;
} else { } else {
unsigned int hsize=0; unsigned int hsize=0;
toku_omt_iterate(node->u.l.buffer, toku_omt_iterate(node->u.l.buffer, addupsize, &hsize);
addupsize,
&hsize);
assert(hsize==node->u.l.n_bytes_in_buffer); assert(hsize==node->u.l.n_bytes_in_buffer);
hsize+=4; /* add n entries in buffer table. */ hsize += 4; // add n entries in buffer table
hsize+=3*8; /* add the three leaf stats, but no exact bit. */ hsize += 3*8; // add the three leaf stats, but no exact bit
size += 4 + 1*stored_sub_block_map_size; // one partition
return size+hsize; return size+hsize;
} }
} }
...@@ -238,20 +237,21 @@ toku_serialize_brtnode_size (BRTNODE node) { ...@@ -238,20 +237,21 @@ toku_serialize_brtnode_size (BRTNODE node) {
unsigned int result = node_header_overhead + extended_node_header_overhead; unsigned int result = node_header_overhead + extended_node_header_overhead;
assert(sizeof(toku_off_t)==8); assert(sizeof(toku_off_t)==8);
result += toku_serialize_descriptor_size(node->desc); result += toku_serialize_descriptor_size(node->desc);
if (node->height>0) { if (node->height > 0) {
result+=4; /* subtree fingerpirnt */ result += 4; /* subtree fingerpirnt */
result+=4; /* n_children */ result += 4; /* n_children */
result+=4*(node->u.n.n_children-1); /* key lengths*/ result += 4*(node->u.n.n_children-1); /* key lengths*/
if (node->flags & TOKU_DB_DUPSORT) result += 4*(node->u.n.n_children-1); /* data lengths */ if (node->flags & TOKU_DB_DUPSORT) result += 4*(node->u.n.n_children-1); /* data lengths */
assert(node->u.n.totalchildkeylens < (1<<30)); assert(node->u.n.totalchildkeylens < (1<<30));
result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */ result += node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */ result += (8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */
result+=node->u.n.n_bytes_in_buffers; result += node->u.n.n_bytes_in_buffers;
result += node->u.n.n_children*stored_sub_block_map_size; result += node->u.n.n_children*stored_sub_block_map_size;
} else { } else {
result+=4; /* n_entries in buffer table. */ result += 4; // n_entries in buffer table
result+=3*8; /* the three leaf stats. */ result += 3*8; // the three leaf stats
result+=node->u.l.n_bytes_in_buffer; result += node->u.l.n_bytes_in_buffer;
result += 4 + 1*stored_sub_block_map_size; // one partition
} }
if (toku_memory_check) { if (toku_memory_check) {
unsigned int slowresult = toku_serialize_brtnode_size_slow(node); unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
...@@ -383,16 +383,16 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str ...@@ -383,16 +383,16 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.ndata); wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.ndata);
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.dsize); wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.dsize);
#if 0
// RFP partition the leaf elements. for now, 1 partition // RFP partition the leaf elements. for now, 1 partition
const int npartitions = 1; const int npartitions = 1;
wbuf_nocrc_int(wbuf, npartitions); wbuf_nocrc_int(wbuf, npartitions);
struct sub_block_map part_map[npartitions]; struct sub_block_map part_map[npartitions];
for (int i = 0; i < npartitions; i++) { for (int i = 0; i < npartitions; i++) {
size_t offset = wbuf_get_woffset(wbuf); size_t offset = wbuf_get_woffset(wbuf) - node_header_overhead;
size_t size = sizeof (u_int32_t) + node->u.l.n_bytes_in_buffer; // # in partition + size of partition
int idx = get_sub_block_index(n_sub_blocks, sub_block, offset); int idx = get_sub_block_index(n_sub_blocks, sub_block, offset);
assert(idx >= 0);
size_t size = sizeof (u_int32_t) + node->u.l.n_bytes_in_buffer; // # in partition + size of partition
sub_block_map_init(&part_map[i], idx, offset, size); sub_block_map_init(&part_map[i], idx, offset, size);
} }
...@@ -404,10 +404,6 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str ...@@ -404,10 +404,6 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
// RFP serialize the partition maps // RFP serialize the partition maps
for (int i = 0; i < npartitions; i++) for (int i = 0; i < npartitions; i++)
sub_block_map_serialize(&part_map[i], wbuf); sub_block_map_serialize(&part_map[i], wbuf);
#else
n_sub_blocks = n_sub_blocks;
sub_block = sub_block;
#endif
// serialize the leaf entries // serialize the leaf entries
wbuf_nocrc_uint(wbuf, toku_omt_size(node->u.l.buffer)); wbuf_nocrc_uint(wbuf, toku_omt_size(node->u.l.buffer));
...@@ -714,6 +710,7 @@ deserialize_brtnode_nonleaf_from_rbuf (BRTNODE result, bytevec magic, struct rbu ...@@ -714,6 +710,7 @@ deserialize_brtnode_nonleaf_from_rbuf (BRTNODE result, bytevec magic, struct rbu
for (int j=0; j<i; j++) toku_fifo_free(&BNC_BUFFER(result,j)); for (int j=0; j<i; j++) toku_fifo_free(&BNC_BUFFER(result,j));
return toku_db_badformat(); return toku_db_badformat();
} }
toku_fifo_size_hint(BNC_BUFFER(result,i), child_buffer_map[i].size);
} }
// deserialize all child buffers, like the function says // deserialize all child buffers, like the function says
...@@ -745,6 +742,31 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf * ...@@ -745,6 +742,31 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
result->u.l.leaf_stats.ndata = rbuf_ulonglong(rb); result->u.l.leaf_stats.ndata = rbuf_ulonglong(rb);
result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb); result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb);
result->u.l.leaf_stats.exact = TRUE; result->u.l.leaf_stats.exact = TRUE;
// deserialize the number of partitions
int npartitions = rbuf_int(rb);
assert(npartitions == 1);
// deserialize partition pivots
for (int p = 0; p < npartitions-1; p++) {
// just throw them away for now
if (result->flags & TOKU_DB_DUPSORT) {
bytevec keyptr, dataptr;
unsigned int keylen, datalen;
rbuf_bytes(rb, &keyptr, &keylen);
rbuf_bytes(rb, &dataptr, &datalen);
} else {
bytevec childkeyptr;
unsigned int cklen;
rbuf_bytes(rb, &childkeyptr, &cklen);
}
}
// deserialize the partition map
struct sub_block_map part_map[npartitions];
for (int p = 0; p < npartitions; p++)
sub_block_map_deserialize(&part_map[p], rb);
int n_in_buf = rbuf_int(rb); int n_in_buf = rbuf_int(rb);
result->u.l.n_bytes_in_buffer = 0; result->u.l.n_bytes_in_buffer = 0;
result->u.l.seqinsert = 0; result->u.l.seqinsert = 0;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "brtloader-internal.h" #include "brtloader-internal.h"
#include "brt-internal.h" #include "brt-internal.h"
#include "sub_block.h" #include "sub_block.h"
#include "sub_block_map.h"
static size_t (*os_fwrite_fun)(const void *,size_t,size_t,FILE*)=NULL; static size_t (*os_fwrite_fun)(const void *,size_t,size_t,FILE*)=NULL;
void brtloader_set_os_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) { void brtloader_set_os_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
...@@ -1040,7 +1041,7 @@ struct leaf_buf { ...@@ -1040,7 +1041,7 @@ struct leaf_buf {
unsigned int local_fingerprint; unsigned int local_fingerprint;
int local_fingerprint_p; int local_fingerprint_p;
int nkeys, ndata, dsize, n_in_buf; int nkeys, ndata, dsize, n_in_buf;
int nkeys_p, ndata_p, dsize_p, n_in_buf_p; int nkeys_p, ndata_p, dsize_p, partitions_p, n_in_buf_p;
}; };
const int nodesize = (SIZE_FACTOR==1) ? (1<<15) : (1<<22); const int nodesize = (SIZE_FACTOR==1) ? (1<<15) : (1<<22);
...@@ -1115,6 +1116,7 @@ static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor * ...@@ -1115,6 +1116,7 @@ static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor *
lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8; lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map
lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4;
return lbuf; return lbuf;
...@@ -1173,6 +1175,16 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr ...@@ -1173,6 +1175,16 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr
putbuf_int64_at(&lbuf->dbuf, lbuf->nkeys_p, lbuf->nkeys); putbuf_int64_at(&lbuf->dbuf, lbuf->nkeys_p, lbuf->nkeys);
putbuf_int64_at(&lbuf->dbuf, lbuf->ndata_p, lbuf->ndata); putbuf_int64_at(&lbuf->dbuf, lbuf->ndata_p, lbuf->ndata);
putbuf_int64_at(&lbuf->dbuf, lbuf->dsize_p, lbuf->dsize); putbuf_int64_at(&lbuf->dbuf, lbuf->dsize_p, lbuf->dsize);
// RFP abstract this
const int32_t n_partitions = 1;
struct sub_block_map partition_map;
sub_block_map_init(&partition_map, 0, 0, 0);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p, n_partitions);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+4, partition_map.idx);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+8, partition_map.offset);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+12, partition_map.size);
putbuf_int32_at(&lbuf->dbuf, lbuf->n_in_buf_p, lbuf->n_in_buf); putbuf_int32_at(&lbuf->dbuf, lbuf->n_in_buf_p, lbuf->n_in_buf);
//print_bytestring(lbuf->dbuf.buf, lbuf->dbuf.off, 200); //print_bytestring(lbuf->dbuf.buf, lbuf->dbuf.off, 200);
......
...@@ -62,6 +62,13 @@ static int next_power_of_two (int n) { ...@@ -62,6 +62,13 @@ static int next_power_of_two (int n) {
return r; return r;
} }
// Pre-size the fifo's entry storage from a caller-supplied hint.
//
// Only acts on a fifo that has no storage yet (fifo->memory == NULL); a fifo
// that already owns a buffer is left untouched.  The requested size is
// rounded by next_power_of_two() before allocating.
//
// NOTE(review): next_power_of_two() takes an int while the hint is a size_t,
// so very large hints are narrowed on the call -- confirm callers never pass
// values that do not fit in an int.
// NOTE(review): the toku_malloc() result is stored without a check here;
// presumably later users of the fifo cope with a NULL buffer -- verify.
void toku_fifo_size_hint(FIFO fifo, size_t size) {
    if (fifo->memory != NULL)
        return;   // storage already exists, so the hint does not apply

    fifo->memory_size = next_power_of_two(size);
    fifo->memory = toku_malloc(fifo->memory_size);
}
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, XIDS xids) { int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, XIDS xids) {
int need_space_here = sizeof(struct fifo_entry) int need_space_here = sizeof(struct fifo_entry)
+ keylen + datalen + keylen + datalen
......
...@@ -8,21 +8,41 @@ ...@@ -8,21 +8,41 @@
#include "xids-internal.h" #include "xids-internal.h"
#include "xids.h" #include "xids.h"
struct fifo_entry {
// If the fifo_entry is unpacked, the compiler aligns the xids array and we waste a lot of space
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
struct __attribute__((__packed__)) fifo_entry {
unsigned int keylen; unsigned int keylen;
unsigned int vallen; unsigned int vallen;
unsigned char type; unsigned char type;
XIDS_S xids_s; XIDS_S xids_s;
}; };
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
typedef struct fifo *FIFO; typedef struct fifo *FIFO;
int toku_fifo_create(FIFO *); int toku_fifo_create(FIFO *);
void toku_fifo_free(FIFO *); void toku_fifo_free(FIFO *);
// Use the size hint to size the storage for the fifo entries in anticipation of putting a bunch of them
// into the fifo.
void toku_fifo_size_hint(FIFO, size_t size_hint);
int toku_fifo_n_entries(FIFO); int toku_fifo_n_entries(FIFO);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd); int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, XIDS xids); int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, XIDS xids);
int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, XIDS *xids); int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, XIDS *xids);
// int toku_fifo_peek_cmdstruct (FIFO, BRT_MSG, DBT*, DBT*); // fill in the BRT_MSG, using the two DBTs for the DBT part. // int toku_fifo_peek_cmdstruct (FIFO, BRT_MSG, DBT*, DBT*); // fill in the BRT_MSG, using the two DBTs for the DBT part.
int toku_fifo_deq(FIFO); int toku_fifo_deq(FIFO);
......
...@@ -5,14 +5,25 @@ ...@@ -5,14 +5,25 @@
#include "includes.h" #include "includes.h"
static void test_serialize(void) { static void
test_serialize_leaf(void) {
int r;
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
r = close(fd); assert(r != -1);
}
static void
test_serialize_nonleaf(void) {
// struct brt source_brt; // struct brt source_brt;
int nodesize = 1024; const int nodesize = 1024;
struct brtnode sn, *dn; struct brtnode sn, *dn;
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int r; int r;
const u_int32_t randval = random(); const u_int32_t randval = random();
assert(fd>=0);
// source_brt.fd=fd; // source_brt.fd=fd;
char *hello_string; char *hello_string;
...@@ -119,28 +130,6 @@ static void test_serialize(void) { ...@@ -119,28 +130,6 @@ static void test_serialize(void) {
} }
assert(dn->local_fingerprint==sn.local_fingerprint); assert(dn->local_fingerprint==sn.local_fingerprint);
} }
#if 0
{
bytevec data; ITEMLEN datalen; int type;
r = toku_hash_find(dn->u.n.buffers[0], "a", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"aval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
r=toku_hash_find(dn->u.n.buffers[0], "b", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"bval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
r=toku_hash_find(dn->u.n.buffers[1], "x", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"xval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
}
#endif
toku_brtnode_free(&dn); toku_brtnode_free(&dn);
kv_pair_free(sn.u.n.childkeys[0]); kv_pair_free(sn.u.n.childkeys[0]);
...@@ -159,7 +148,8 @@ static void test_serialize(void) { ...@@ -159,7 +148,8 @@ static void test_serialize(void) {
int int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
toku_memory_check = 1; toku_memory_check = 1;
test_serialize(); test_serialize_leaf();
test_serialize_nonleaf();
toku_malloc_cleanup(); toku_malloc_cleanup();
return 0; return 0;
} }
...@@ -227,7 +227,7 @@ static void fill_rowset (struct rowset *rows, ...@@ -227,7 +227,7 @@ static void fill_rowset (struct rowset *rows,
DBT key = {.size=sizeof(keys[i]), DBT key = {.size=sizeof(keys[i]),
.data=&keys[i]}; .data=&keys[i]};
DBT val = {.size=strlen(vals[i]), DBT val = {.size=strlen(vals[i]),
.data=&vals[i]}; .data=(void *)vals[i]};
add_row(rows, &key, &val); add_row(rows, &key, &val);
} }
} }
...@@ -252,7 +252,9 @@ static void test_merge_files (char *template) { ...@@ -252,7 +252,9 @@ static void test_merge_files (char *template) {
struct error_callback_s cb; struct error_callback_s cb;
cb.error_callback = err_cb; cb.error_callback = err_cb;
r = sort_and_write_rows(&aset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r); r = sort_and_write_rows(&aset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r);
bl.n_rows += 6;
r = sort_and_write_rows(&bset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r); r = sort_and_write_rows(&bset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r);
bl.n_rows += 3;
assert(fs.n_temp_files==2 && fs.n_temp_files_limit >= fs.n_temp_files); assert(fs.n_temp_files==2 && fs.n_temp_files_limit >= fs.n_temp_files);
destroy_rowset(&aset); destroy_rowset(&aset);
destroy_rowset(&bset); destroy_rowset(&bset);
......
...@@ -9,10 +9,20 @@ ...@@ -9,10 +9,20 @@
// ids[0] is the outermost transaction. // ids[0] is the outermost transaction.
// ids[num_xids - 1] is the innermost transaction. // ids[num_xids - 1] is the innermost transaction.
// Should only be accessed by accessor functions xids_xxx, not directly. // Should only be accessed by accessor functions xids_xxx, not directly.
typedef struct xids_t {
// If the xids struct is unpacked, the compiler aligns the ids[] and we waste a lot of space
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
typedef struct __attribute__((__packed__)) xids_t {
u_int8_t num_stored_xids; // maximum value of MAX_TRANSACTION_RECORDS - 1 ... u_int8_t num_stored_xids; // maximum value of MAX_TRANSACTION_RECORDS - 1 ...
// ... because transaction 0 is implicit // ... because transaction 0 is implicit
TXNID ids[]; TXNID ids[];
} XIDS_S; } XIDS_S;
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment