Commit 57f6beab authored by Rich Prohaska, committed by Yoni Fogel

change block format to support leaf partitions closes[t:2351]

git-svn-id: file:///svn/toku/tokudb@19500 c7de825b-a66e-492c-adef-691d508d4ae1
parent c1a99a1b
......@@ -195,17 +195,17 @@ static unsigned int
toku_serialize_brtnode_size_slow (BRTNODE node) {
unsigned int size = node_header_overhead + extended_node_header_overhead;
size += toku_serialize_descriptor_size(node->desc);
if (node->height>0) {
if (node->height > 0) {
unsigned int hsize=0;
unsigned int csize=0;
size+=4; /* n_children */
size+=4; /* subtree fingerprint. */
size+=4*(node->u.n.n_children-1); /* key lengths*/
size += 4; /* n_children */
size += 4; /* subtree fingerprint. */
size += 4*(node->u.n.n_children-1); /* key lengths*/
if (node->flags & TOKU_DB_DUPSORT) size += 4*(node->u.n.n_children-1);
for (int i=0; i<node->u.n.n_children-1; i++) {
csize+=toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
csize += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
}
size+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the exact bit for the estimates. */
size += (8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the exact bit for the estimates. */
int n_buffers = node->u.n.n_children;
assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1);
for (int i=0; i< n_buffers; i++) {
......@@ -222,12 +222,11 @@ toku_serialize_brtnode_size_slow (BRTNODE node) {
return size+hsize+csize;
} else {
unsigned int hsize=0;
toku_omt_iterate(node->u.l.buffer,
addupsize,
&hsize);
toku_omt_iterate(node->u.l.buffer, addupsize, &hsize);
assert(hsize==node->u.l.n_bytes_in_buffer);
hsize+=4; /* add n entries in buffer table. */
hsize+=3*8; /* add the three leaf stats, but no exact bit. */
hsize += 4; // add n entries in buffer table
hsize += 3*8; // add the three leaf stats, but no exact bit
size += 4 + 1*stored_sub_block_map_size; // one partition
return size+hsize;
}
}
......@@ -238,20 +237,21 @@ toku_serialize_brtnode_size (BRTNODE node) {
unsigned int result = node_header_overhead + extended_node_header_overhead;
assert(sizeof(toku_off_t)==8);
result += toku_serialize_descriptor_size(node->desc);
if (node->height>0) {
result+=4; /* subtree fingerpirnt */
result+=4; /* n_children */
result+=4*(node->u.n.n_children-1); /* key lengths*/
if (node->height > 0) {
result += 4; /* subtree fingerpirnt */
result += 4; /* n_children */
result += 4*(node->u.n.n_children-1); /* key lengths*/
if (node->flags & TOKU_DB_DUPSORT) result += 4*(node->u.n.n_children-1); /* data lengths */
assert(node->u.n.totalchildkeylens < (1<<30));
result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */
result+=node->u.n.n_bytes_in_buffers;
result += node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result += (8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */
result += node->u.n.n_bytes_in_buffers;
result += node->u.n.n_children*stored_sub_block_map_size;
} else {
result+=4; /* n_entries in buffer table. */
result+=3*8; /* the three leaf stats. */
result+=node->u.l.n_bytes_in_buffer;
result += 4; // n_entries in buffer table
result += 3*8; // the three leaf stats
result += node->u.l.n_bytes_in_buffer;
result += 4 + 1*stored_sub_block_map_size; // one partition
}
if (toku_memory_check) {
unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
......@@ -383,16 +383,16 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.ndata);
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.dsize);
#if 0
// RFP partition the leaf elements. for now, 1 partition
const int npartitions = 1;
wbuf_nocrc_int(wbuf, npartitions);
struct sub_block_map part_map[npartitions];
for (int i = 0; i < npartitions; i++) {
size_t offset = wbuf_get_woffset(wbuf);
size_t size = sizeof (u_int32_t) + node->u.l.n_bytes_in_buffer; // # in partition + size of partition
size_t offset = wbuf_get_woffset(wbuf) - node_header_overhead;
int idx = get_sub_block_index(n_sub_blocks, sub_block, offset);
assert(idx >= 0);
size_t size = sizeof (u_int32_t) + node->u.l.n_bytes_in_buffer; // # in partition + size of partition
sub_block_map_init(&part_map[i], idx, offset, size);
}
......@@ -404,10 +404,6 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
// RFP serialize the partition maps
for (int i = 0; i < npartitions; i++)
sub_block_map_serialize(&part_map[i], wbuf);
#else
n_sub_blocks = n_sub_blocks;
sub_block = sub_block;
#endif
// serialize the leaf entries
wbuf_nocrc_uint(wbuf, toku_omt_size(node->u.l.buffer));
......@@ -714,6 +710,7 @@ deserialize_brtnode_nonleaf_from_rbuf (BRTNODE result, bytevec magic, struct rbu
for (int j=0; j<i; j++) toku_fifo_free(&BNC_BUFFER(result,j));
return toku_db_badformat();
}
toku_fifo_size_hint(BNC_BUFFER(result,i), child_buffer_map[i].size);
}
// deserialize all child buffers, like the function says
......@@ -745,6 +742,31 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
result->u.l.leaf_stats.ndata = rbuf_ulonglong(rb);
result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb);
result->u.l.leaf_stats.exact = TRUE;
// deserialize the number of partitions
int npartitions = rbuf_int(rb);
assert(npartitions == 1);
// deserialize partition pivots
for (int p = 0; p < npartitions-1; p++) {
// just throw them away for now
if (result->flags & TOKU_DB_DUPSORT) {
bytevec keyptr, dataptr;
unsigned int keylen, datalen;
rbuf_bytes(rb, &keyptr, &keylen);
rbuf_bytes(rb, &dataptr, &datalen);
} else {
bytevec childkeyptr;
unsigned int cklen;
rbuf_bytes(rb, &childkeyptr, &cklen);
}
}
// deserialize the partition map
struct sub_block_map part_map[npartitions];
for (int p = 0; p < npartitions; p++)
sub_block_map_deserialize(&part_map[p], rb);
int n_in_buf = rbuf_int(rb);
result->u.l.n_bytes_in_buffer = 0;
result->u.l.seqinsert = 0;
......
......@@ -21,6 +21,7 @@
#include "brtloader-internal.h"
#include "brt-internal.h"
#include "sub_block.h"
#include "sub_block_map.h"
static size_t (*os_fwrite_fun)(const void *,size_t,size_t,FILE*)=NULL;
void brtloader_set_os_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
......@@ -1040,7 +1041,7 @@ struct leaf_buf {
unsigned int local_fingerprint;
int local_fingerprint_p;
int nkeys, ndata, dsize, n_in_buf;
int nkeys_p, ndata_p, dsize_p, n_in_buf_p;
int nkeys_p, ndata_p, dsize_p, partitions_p, n_in_buf_p;
};
const int nodesize = (SIZE_FACTOR==1) ? (1<<15) : (1<<22);
......@@ -1115,6 +1116,7 @@ static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor *
lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map
lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4;
return lbuf;
......@@ -1173,6 +1175,16 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr
putbuf_int64_at(&lbuf->dbuf, lbuf->nkeys_p, lbuf->nkeys);
putbuf_int64_at(&lbuf->dbuf, lbuf->ndata_p, lbuf->ndata);
putbuf_int64_at(&lbuf->dbuf, lbuf->dsize_p, lbuf->dsize);
// RFP abstract this
const int32_t n_partitions = 1;
struct sub_block_map partition_map;
sub_block_map_init(&partition_map, 0, 0, 0);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p, n_partitions);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+4, partition_map.idx);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+8, partition_map.offset);
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+12, partition_map.size);
putbuf_int32_at(&lbuf->dbuf, lbuf->n_in_buf_p, lbuf->n_in_buf);
//print_bytestring(lbuf->dbuf.buf, lbuf->dbuf.off, 200);
......
......@@ -62,6 +62,13 @@ static int next_power_of_two (int n) {
return r;
}
// Pre-allocate the fifo's backing storage from a caller-supplied size hint.
// Does nothing when the fifo already has storage; otherwise the capacity is
// set to next_power_of_two(size) and that many bytes are requested from
// toku_malloc.
void toku_fifo_size_hint(FIFO fifo, size_t size) {
    if (fifo->memory != NULL) {
        return; // storage already present: the hint is ignored
    }
    fifo->memory_size = next_power_of_two(size);
    fifo->memory = toku_malloc(fifo->memory_size);
}
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, XIDS xids) {
int need_space_here = sizeof(struct fifo_entry)
+ keylen + datalen
......
......@@ -8,21 +8,41 @@
#include "xids-internal.h"
#include "xids.h"
struct fifo_entry {
// If the fifo_entry is unpacked, the compiler aligns the xids array and we waste a lot of space
// Packing is requested in two ways: #pragma pack(push, 1) when TOKU_WINDOWS is set,
// and the __packed__ attribute placed on the struct itself.
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
// Per-entry header. toku_fifo_enq starts its space computation from
// sizeof(struct fifo_entry) + keylen + datalen.
struct __attribute__((__packed__)) fifo_entry {
// presumably the key length in bytes -- TODO confirm against toku_fifo_enq
unsigned int keylen;
// presumably the value (data) length in bytes -- TODO confirm against toku_fifo_enq
unsigned int vallen;
// presumably the `type` argument given to toku_fifo_enq, narrowed to one byte -- confirm
unsigned char type;
// Transaction id record; kept as the last member (XIDS_S ends in a flexible ids[] array).
XIDS_S xids_s;
};
// Undo the pack(push, 1) above so later declarations are not affected.
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
typedef struct fifo *FIFO;
int toku_fifo_create(FIFO *);
void toku_fifo_free(FIFO *);
// Use the size hint to size the storage for the fifo entries in anticipation of putting a bunch of them
// into the fifo.
void toku_fifo_size_hint(FIFO, size_t size_hint);
int toku_fifo_n_entries(FIFO);
int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_MSG cmd);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, XIDS xids);
int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, XIDS *xids);
// int toku_fifo_peek_cmdstruct (FIFO, BRT_MSG, DBT*, DBT*); // fill in the BRT_MSG, using the two DBTs for the DBT part.
int toku_fifo_deq(FIFO);
......
......@@ -5,14 +5,25 @@
#include "includes.h"
static void test_serialize(void) {
// Placeholder leaf test: it only opens (creating if needed) the scratch file
// __FILE__ ".brt" and closes it again, asserting that both calls succeed.
static void
test_serialize_leaf(void) {
    const int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
    assert(fd >= 0);

    const int close_result = close(fd);
    assert(close_result != -1);
}
static void
test_serialize_nonleaf(void) {
// struct brt source_brt;
int nodesize = 1024;
const int nodesize = 1024;
struct brtnode sn, *dn;
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int r;
const u_int32_t randval = random();
assert(fd>=0);
// source_brt.fd=fd;
char *hello_string;
......@@ -119,28 +130,6 @@ static void test_serialize(void) {
}
assert(dn->local_fingerprint==sn.local_fingerprint);
}
#if 0
{
bytevec data; ITEMLEN datalen; int type;
r = toku_hash_find(dn->u.n.buffers[0], "a", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"aval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
r=toku_hash_find(dn->u.n.buffers[0], "b", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"bval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
r=toku_hash_find(dn->u.n.buffers[1], "x", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"xval")==0);
assert(datalen==5);
assert(type == BRT_NONE);
}
#endif
toku_brtnode_free(&dn);
kv_pair_free(sn.u.n.childkeys[0]);
......@@ -159,7 +148,8 @@ static void test_serialize(void) {
// Test entry point: runs the serialization tests and returns 0.
// argc and argv are unused.
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
// toku_serialize_brtnode_size consults this flag before calling
// toku_serialize_brtnode_size_slow, so setting it turns on that cross-check.
toku_memory_check = 1;
// NOTE(review): this view lists test_serialize() as well as the leaf/nonleaf
// tests -- confirm which calls belong in the current revision.
test_serialize();
test_serialize_leaf();
test_serialize_nonleaf();
toku_malloc_cleanup();
return 0;
}
......@@ -227,7 +227,7 @@ static void fill_rowset (struct rowset *rows,
DBT key = {.size=sizeof(keys[i]),
.data=&keys[i]};
DBT val = {.size=strlen(vals[i]),
.data=&vals[i]};
.data=(void *)vals[i]};
add_row(rows, &key, &val);
}
}
......@@ -252,7 +252,9 @@ static void test_merge_files (char *template) {
struct error_callback_s cb;
cb.error_callback = err_cb;
r = sort_and_write_rows(&aset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r);
bl.n_rows += 6;
r = sort_and_write_rows(&bset, &fs, &bl, dest_db, compare_ints, &cb, 0); CKERR(r);
bl.n_rows += 3;
assert(fs.n_temp_files==2 && fs.n_temp_files_limit >= fs.n_temp_files);
destroy_rowset(&aset);
destroy_rowset(&bset);
......
......@@ -9,10 +9,20 @@
// ids[0] is the outermost transaction.
// ids[num_xids - 1] is the innermost transaction.
// Should only be accessed by accessor functions xids_xxx, not directly.
typedef struct xids_t {
// If the xids struct is unpacked, the compiler aligns the ids[] and we waste a lot of space
// Packing is requested in two ways: #pragma pack(push, 1) when TOKU_WINDOWS is set,
// and the __packed__ attribute placed on the struct itself.
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
typedef struct __attribute__((__packed__)) xids_t {
u_int8_t num_stored_xids; // maximum value of MAX_TRANSACTION_RECORDS - 1 ...
// ... because transaction 0 is implicit
// Flexible array member: the ids are stored immediately after the one-byte count.
// Presumably num_stored_xids entries are valid -- TODO confirm against the xids_xxx accessors.
TXNID ids[];
} XIDS_S;
// Undo the pack(push, 1) above so later declarations are not affected.
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment