Commit 51a56b52 authored by Rusty Russell's avatar Rusty Russell

Merge branch 'tdb2'

parents 451d97ad a42bba8e
...@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o ...@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
CFLAGS:=-I../../.. -Wall -O3 #-g -pg CFLAGS:=-I../../.. -Wall -O3 #-g -pg
LDFLAGS:=-L../../.. LDFLAGS:=-L../../..
default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb speed
benchmark: replay_trace benchmark: replay_trace
@trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done @trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done
...@@ -30,4 +30,4 @@ check: replay_trace ...@@ -30,4 +30,4 @@ check: replay_trace
@sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@ @sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@
clean: clean:
rm -f replay_trace tdbtorture tdbdump tdbtool *.o rm -f replay_trace tdbtorture tdbdump tdbtool speed *.o
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb/tdb.h>
/* Nanoseconds per operation */
/*
 * Nanoseconds per operation over the interval [start, stop].
 *
 * The subtraction is done by hand in floating point rather than with
 * timersub(): timersub is a BSD extension (not C11/POSIX), and the old
 * integer form (tv_sec * 1000000 + tv_usec) could overflow a 32-bit
 * long for intervals beyond ~35 minutes before the cast to double.
 */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	double usec = (stop->tv_sec - start->tv_sec) * 1000000.0
		+ (stop->tv_usec - start->tv_usec);

	/* Floating point keeps sub-microsecond precision per op. */
	return usec / num * 1000;
}
/* Current on-disk size of the benchmark database, or (size_t)-1 if
 * /tmp/speed.tdb cannot be statted (e.g. --internal mode, no file). */
static size_t file_size(void)
{
	struct stat sbuf;

	if (stat("/tmp/speed.tdb", &sbuf) == 0)
		return sbuf.st_size;
	return -1;
}
/* tdb_traverse callback: add each record's integer payload into the
 * accumulator that p points at.  Always returns 0 (keep traversing). */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	/* Payload is the 4-byte int stored by main(). */
	*sum += *(int *)data.dptr;
	return 0;
}
/*
 * TDB speed benchmark.
 *
 * Usage: speed [--internal] [--transaction] [num] [stopat]
 *   --internal     operate on an in-memory database (TDB_INTERNAL)
 *   --transaction  wrap each timed stage in a transaction
 *   num            records per stage (default 1000)
 *   stopat         exit after this many stages (default: run them all)
 *
 * Stages: add, find, miss, traverse, delete, re-add, append, churn.
 * Each stage prints nanoseconds per operation and the file size.
 */
int main(int argc, char *argv[])
{
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	bool transaction = false;

	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", 100003, flags, O_RDWR|O_CREAT|O_TRUNC,
		       0600);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* Key and data both alias the counter i: each record's key and
	 * payload are the 4-byte value of i at the time of the store. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 1: add num records. */
	printf("Adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 2: fetch each record back and verify its payload. */
	printf("Finding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		TDB_DATA res = tdb_fetch(tdb, key);
		int *dptr = (int *)res.dptr;

		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
		/* tdb_fetch returns malloc'd data; the old code leaked
		 * num allocations here. */
		free(dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 3: look up num keys that are not present. */
	printf("Missing %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num*2; i++) {
		TDB_DATA res = tdb_fetch(tdb, key);

		if (res.dptr)
			errx(1, "Fetching key %u in tdb gave %u", i,
			     *(int *)res.dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 4: traverse all records, summing their payloads. */
	printf("Traversing %u records: ", num); fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	/* Expected tally is sum(0..num-1) = num*(num-1)/2.  Divide the
	 * even factor so integer division is exact: the old form
	 * (num-1)*(num/2) was wrong whenever num was odd. */
	if (i != (num % 2 ? num * ((num - 1) / 2) : (num / 2) * (num - 1)))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 5: delete the records, not in insertion order. */
	printf("Deleting %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		/* 100003 is coprime with num in the common cases, so this
		 * walks every key exactly once in a scrambled order. */
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 6: re-add the records, again out of order. */
	printf("Re-adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 7: append to every record (doubles each payload). */
	printf("Appending %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 8: churn — delete each old key, insert key+num instead. */
	printf("Churning %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	/* Release locks/fds explicitly rather than relying on exit(). */
	tdb_close(tdb);
	return 0;
}
...@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery) ...@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
hash_test = TDB_HASH_MAGIC; hash_test = TDB_HASH_MAGIC;
hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
if (hdr.hash_test != hash_test) { if (hdr.hash_test != hash_test) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"check: hash test %llu should be %llu\n", "check: hash test %llu should be %llu",
(long long)hdr.hash_test, (long long)hdr.hash_test,
(long long)hash_test); (long long)hash_test);
return false; return false;
} }
if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"check: bad magic '%.*s'\n", "check: bad magic '%.*s'",
(unsigned)sizeof(hdr.magic_food), hdr.magic_food); (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
return false; return false;
} }
*recovery = hdr.recovery; *recovery = hdr.recovery;
if (*recovery) { if (*recovery) {
if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) { if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: invalid recovery offset %zu\n", "tdb_check: invalid recovery offset %zu",
(size_t)*recovery); (size_t)*recovery);
return false; return false;
} }
...@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found); size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data);
static bool check_hash_chain(struct tdb_context *tdb,
tdb_off_t off,
uint64_t hash,
tdb_off_t used[],
size_t num_used,
size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{
struct tdb_used_record rec;
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
return false;
if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain magic %llu",
(long long)rec_magic(&rec));
return false;
}
if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain length %llu vs %zu",
(long long)rec_data_length(&rec),
sizeof(struct tdb_chain));
return false;
}
if (rec_key_length(&rec) != 0) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain key length %llu",
(long long)rec_key_length(&rec));
return false;
}
if (rec_hash(&rec) != 0) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain hash value %llu",
(long long)rec_hash(&rec));
return false;
}
off += sizeof(rec);
if (!check_hash_tree(tdb, off, 0, hash, 64,
used, num_used, num_found, check, private_data))
return false;
off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
if (off == TDB_OFF_ERR)
return false;
if (off == 0)
return true;
(*num_found)++;
return check_hash_chain(tdb, off, hash, used, num_used, num_found,
check, private_data);
}
static bool check_hash_record(struct tdb_context *tdb, static bool check_hash_record(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
...@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb, ...@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found) size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
struct tdb_used_record rec; struct tdb_used_record rec;
if (hprefix_bits >= 64)
return check_hash_chain(tdb, off, hprefix, used, num_used,
num_found, check, private_data);
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1) if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
return false; return false;
if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table magic %llu",
(long long)rec_magic(&rec));
return false;
}
if (rec_data_length(&rec) if (rec_data_length(&rec)
!= sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) { != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table length %llu vs %llu\n", "tdb_check: Bad hash table length %llu vs %llu",
(long long)rec_data_length(&rec), (long long)rec_data_length(&rec),
(long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS); (long long)sizeof(tdb_off_t)
<< TDB_SUBLEVEL_HASH_BITS);
return false; return false;
} }
if (rec_key_length(&rec) != 0) { if (rec_key_length(&rec) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table key length %llu\n", "tdb_check: Bad hash table key length %llu",
(long long)rec_key_length(&rec)); (long long)rec_key_length(&rec));
return false; return false;
} }
if (rec_hash(&rec) != 0) { if (rec_hash(&rec) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table hash value %llu\n", "tdb_check: Bad hash table hash value %llu",
(long long)rec_hash(&rec)); (long long)rec_hash(&rec));
return false; return false;
} }
...@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb, ...@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
return check_hash_tree(tdb, off, return check_hash_tree(tdb, off,
TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
hprefix, hprefix_bits, hprefix, hprefix_bits,
used, num_used, num_found); used, num_used, num_found, check, private_data);
} }
static int off_cmp(const tdb_off_t *a, const tdb_off_t *b) static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
...@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found) size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
unsigned int g, b; unsigned int g, b;
const tdb_off_t *hash; const tdb_off_t *hash;
...@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
off = group[b] & TDB_OFF_MASK; off = group[b] & TDB_OFF_MASK;
p = asearch(&off, used, num_used, off_cmp); p = asearch(&off, used, num_used, off_cmp);
if (!p) { if (!p) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: Invalid offset %llu " TDB_DEBUG_ERROR,
"in hash\n", "tdb_check: Invalid offset %llu "
(long long)off); "in hash", (long long)off);
goto fail; goto fail;
} }
/* Mark it invalid. */ /* Mark it invalid. */
*p ^= 1; *p ^= 1;
(*num_found)++; (*num_found)++;
if (hprefix_bits == 64) {
/* Chained entries are unordered. */
if (is_subhash(group[b])) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: Invalid chain"
" entry subhash");
goto fail;
}
h = hash_record(tdb, off);
if (h != hprefix) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"check: bad hash chain"
" placement"
" 0x%llx vs 0x%llx",
(long long)h,
(long long)hprefix);
goto fail;
}
if (tdb_read_convert(tdb, off, &rec,
sizeof(rec)))
goto fail;
goto check;
}
if (is_subhash(group[b])) { if (is_subhash(group[b])) {
uint64_t subprefix; uint64_t subprefix;
subprefix = (hprefix subprefix = (hprefix
...@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
hprefix_bits hprefix_bits
+ group_bits + group_bits
+ TDB_HASH_GROUP_BITS, + TDB_HASH_GROUP_BITS,
used, num_used, num_found)) used, num_used, num_found,
check, private_data))
goto fail; goto fail;
continue; continue;
} }
...@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
used_bits = 0; used_bits = 0;
if (get_bits(h, hprefix_bits, &used_bits) != hprefix if (get_bits(h, hprefix_bits, &used_bits) != hprefix
&& hprefix_bits) { && hprefix_bits) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad hash placement" TDB_DEBUG_ERROR,
" 0x%llx vs 0x%llx\n", "check: bad hash placement"
" 0x%llx vs 0x%llx",
(long long)h, (long long)hprefix); (long long)h, (long long)hprefix);
goto fail; goto fail;
} }
/* Does it belong in this group? */ /* Does it belong in this group? */
if (get_bits(h, group_bits, &used_bits) != g) { if (get_bits(h, group_bits, &used_bits) != g) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad group %llu vs %u\n", TDB_DEBUG_ERROR,
(long long)h, g); "check: bad group %llu vs %u",
(long long)h, g);
goto fail; goto fail;
} }
...@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits) if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
!= bucket) { != bucket) {
used_bits -= TDB_HASH_GROUP_BITS; used_bits -= TDB_HASH_GROUP_BITS;
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad bucket %u vs %u\n", TDB_DEBUG_ERROR,
"check: bad bucket %u vs %u",
(unsigned)get_bits(h, (unsigned)get_bits(h,
TDB_HASH_GROUP_BITS, TDB_HASH_GROUP_BITS,
&used_bits), &used_bits),
bucket); bucket);
goto fail; goto fail;
} }
...@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
i != b; i != b;
i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) { i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
if (group[i] == 0) { if (group[i] == 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"check: bad group placement" "check: bad group placement"
" %u vs %u\n", " %u vs %u",
b, bucket); b, bucket);
goto fail; goto fail;
} }
} }
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1) if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
goto fail; goto fail;
/* Bottom bits must match header. */ /* Bottom bits must match header. */
if ((h & ((1 << 11)-1)) != rec_hash(&rec)) { if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: Bad hash magic at" TDB_DEBUG_ERROR,
" offset %llu (0x%llx vs 0x%llx)\n", "tdb_check: Bad hash magic at"
(long long)off, " offset %llu (0x%llx vs 0x%llx)",
(long long)h, (long long)off,
(long long)rec_hash(&rec)); (long long)h,
(long long)rec_hash(&rec));
goto fail; goto fail;
} }
check:
if (check) {
TDB_DATA key, data;
key.dsize = rec_key_length(&rec);
data.dsize = rec_data_length(&rec);
key.dptr = (void *)tdb_access_read(tdb,
off + sizeof(rec),
key.dsize + data.dsize,
false);
if (!key.dptr)
goto fail;
data.dptr = key.dptr + key.dsize;
if (check(key, data, private_data) != 0)
goto fail;
tdb_access_release(tdb, key.dptr);
}
} }
} }
tdb_access_release(tdb, hash); tdb_access_release(tdb, hash);
...@@ -268,19 +389,22 @@ fail: ...@@ -268,19 +389,22 @@ fail:
static bool check_hash(struct tdb_context *tdb, static bool check_hash(struct tdb_context *tdb,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_flists) size_t num_used, size_t num_ftables,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
/* Free lists also show up as used. */ /* Free tables also show up as used. */
size_t num_found = num_flists; size_t num_found = num_ftables;
if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable), if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
0, 0, used, num_used, &num_found)) 0, 0, used, num_used, &num_found,
check, private_data))
return false; return false;
if (num_found != num_used) { if (num_found != num_used) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Not all entries are in hash\n"); "tdb_check: Not all entries are in hash");
return false; return false;
} }
return true; return true;
...@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb, ...@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
static bool check_free(struct tdb_context *tdb, static bool check_free(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
const struct tdb_free_record *frec, const struct tdb_free_record *frec,
tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket) tdb_off_t prev, unsigned int ftable,
unsigned int bucket)
{ {
if (frec_magic(frec) != TDB_FREE_MAGIC) { if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad magic 0x%llx\n", "tdb_check: offset %llu bad magic 0x%llx",
(long long)off, (long long)frec->magic_and_meta); (long long)off, (long long)frec->magic_and_prev);
return false; return false;
} }
if (frec_flist(frec) != flist_off) { if (frec_ftable(frec) != ftable) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad freelist 0x%llx\n", "tdb_check: offset %llu bad freetable %u",
(long long)off, (long long)frec_flist(frec)); (long long)off, frec_ftable(frec));
return false; return false;
} }
if (tdb->methods->oob(tdb, off if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record), + frec_len(frec) + sizeof(struct tdb_used_record),
false)) false))
return false; return false;
if (size_to_bucket(frec->data_len) != bucket) { if (size_to_bucket(frec_len(frec)) != bucket) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu in wrong bucket %u vs %u\n", "tdb_check: offset %llu in wrong bucket %u vs %u",
(long long)off, (long long)off,
bucket, size_to_bucket(frec->data_len)); bucket, size_to_bucket(frec_len(frec)));
return false; return false;
} }
if (prev != frec->prev) { if (prev != frec_prev(frec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad prev %llu vs %llu\n", "tdb_check: offset %llu bad prev %llu vs %llu",
(long long)off, (long long)off,
(long long)prev, (long long)frec->prev); (long long)prev, (long long)frec_len(frec));
return false; return false;
} }
return true; return true;
} }
static bool check_free_list(struct tdb_context *tdb, static bool check_free_table(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t free[], unsigned ftable_num,
size_t num_free, tdb_off_t free[],
size_t *num_found) size_t num_free,
size_t *num_found)
{ {
struct tdb_freelist flist; struct tdb_freetable ft;
tdb_off_t h; tdb_off_t h;
unsigned int i; unsigned int i;
if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1) if (tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)) == -1)
return false; return false;
if (rec_magic(&flist.hdr) != TDB_MAGIC if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
|| rec_key_length(&flist.hdr) != 0 || rec_key_length(&ft.hdr) != 0
|| rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr) || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
|| rec_hash(&flist.hdr) != 1) { || rec_hash(&ft.hdr) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
tdb->log_priv, "tdb_check: Invalid header on free table");
"tdb_check: Invalid header on free list\n");
return false; return false;
} }
...@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb, ...@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
tdb_off_t off, prev = 0, *p; tdb_off_t off, prev = 0, *p;
struct tdb_free_record f; struct tdb_free_record f;
h = bucket_off(flist_off, i); h = bucket_off(ftable_off, i);
for (off = tdb_read_off(tdb, h); off; off = f.next) { for (off = tdb_read_off(tdb, h); off; off = f.next) {
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
return false; return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f))) if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false; return false;
if (!check_free(tdb, off, &f, prev, flist_off, i)) if (!check_free(tdb, off, &f, prev, ftable_num, i))
return false; return false;
/* FIXME: Check hash bits */ /* FIXME: Check hash bits */
p = asearch(&off, free, num_free, off_cmp); p = asearch(&off, free, num_free, off_cmp);
if (!p) { if (!p) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"tdb_check: Invalid offset" "tdb_check: Invalid offset"
" %llu in free table\n", " %llu in free table",
(long long)off); (long long)off);
return false; return false;
} }
/* Mark it invalid. */ /* Mark it invalid. */
...@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb, ...@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
} }
/* Slow, but should be very rare. */ /* Slow, but should be very rare. */
static size_t dead_space(struct tdb_context *tdb, tdb_off_t off) size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
{ {
size_t len; size_t len;
...@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb, ...@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
struct tdb_used_record u; struct tdb_used_record u;
struct tdb_free_record f; struct tdb_free_record f;
struct tdb_recovery_record r; struct tdb_recovery_record r;
} pad, *p; } rec;
p = tdb_get(tdb, off, &pad, sizeof(pad)); /* r is larger: only get that if we need to. */
if (!p) if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
return false; return false;
/* If we crash after ftruncate, we can get zeroes or fill. */ /* If we crash after ftruncate, we can get zeroes or fill. */
if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
|| p->r.magic == 0x4343434343434343ULL) { || rec.r.magic == 0x4343434343434343ULL) {
if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
return false;
if (recovery == off) { if (recovery == off) {
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(rec.r) + rec.r.max_len;
} else { } else {
len = dead_space(tdb, off); len = dead_space(tdb, off);
if (len < sizeof(p->r)) { if (len < sizeof(rec.r)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"tdb_check: invalid dead space" "tdb_check: invalid dead"
" at %zu\n", (size_t)off); " space at %zu",
(size_t)off);
return false; return false;
} }
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
"Dead space at %zu-%zu (of %zu)\n", "Dead space at %zu-%zu (of %zu)",
(size_t)off, (size_t)(off + len), (size_t)off, (size_t)(off + len),
(size_t)tdb->map_size); (size_t)tdb->map_size);
} }
} else if (p->r.magic == TDB_RECOVERY_MAGIC) { } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
return false;
if (recovery != off) { if (recovery != off) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: unexpected recovery" TDB_DEBUG_ERROR,
" record at offset %zu\n", "tdb_check: unexpected recovery"
(size_t)off); " record at offset %zu",
(size_t)off);
return false;
}
if (rec.r.len > rec.r.max_len) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: invalid recovery length"
" %zu", (size_t)rec.r.len);
return false;
}
if (rec.r.eof > tdb->map_size) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: invalid old EOF"
" %zu", (size_t)rec.r.eof);
return false; return false;
} }
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(rec.r) + rec.r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
|| frec_magic(&p->f) == TDB_COALESCING_MAGIC) { len = sizeof(rec.u) + frec_len(&rec.f);
len = sizeof(p->u) + p->f.data_len;
if (off + len > tdb->map_size) { if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: free overlength %llu" TDB_DEBUG_ERROR,
" at offset %llu\n", "tdb_check: free overlength %llu"
(long long)len, (long long)off); " at offset %llu",
(long long)len, (long long)off);
return false; return false;
} }
/* This record is free! */ /* This record should be in free lists. */
if (frec_magic(&p->f) == TDB_FREE_MAGIC if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
&& !append(free, num_free, off)) && !append(free, num_free, off))
return false; return false;
} else { } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
|| rec_magic(&rec.u) == TDB_CHAIN_MAGIC
|| rec_magic(&rec.u) == TDB_HTABLE_MAGIC
|| rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
uint64_t klen, dlen, extra; uint64_t klen, dlen, extra;
/* This record is used! */ /* This record is used! */
if (rec_magic(&p->u) != TDB_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: Bad magic 0x%llx"
" at offset %llu\n",
(long long)rec_magic(&p->u),
(long long)off);
return false;
}
if (!append(used, num_used, off)) if (!append(used, num_used, off))
return false; return false;
klen = rec_key_length(&p->u); klen = rec_key_length(&rec.u);
dlen = rec_data_length(&p->u); dlen = rec_data_length(&rec.u);
extra = rec_extra_padding(&p->u); extra = rec_extra_padding(&rec.u);
len = sizeof(p->u) + klen + dlen + extra; len = sizeof(rec.u) + klen + dlen + extra;
if (off + len > tdb->map_size) { if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: used overlength %llu" TDB_DEBUG_ERROR,
" at offset %llu\n", "tdb_check: used overlength %llu"
(long long)len, (long long)off); " at offset %llu",
(long long)len, (long long)off);
return false; return false;
} }
if (len < sizeof(p->f)) { if (len < sizeof(rec.f)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: too short record %llu at" TDB_DEBUG_ERROR,
" %llu\n", "tdb_check: too short record %llu"
(long long)len, (long long)off); " at %llu",
(long long)len, (long long)off);
return false; return false;
} }
} else {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: Bad magic 0x%llx at offset %zu",
(long long)rec_magic(&rec.u), (size_t)off);
return false;
} }
} }
/* We must have found recovery area if there was one. */ /* We must have found recovery area if there was one. */
if (recovery != 0 && !found_recovery) { if (recovery != 0 && !found_recovery) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: expected a recovery area at %zu\n", "tdb_check: expected a recovery area at %zu",
(size_t)recovery); (size_t)recovery);
return false; return false;
} }
return true; return true;
} }
/* FIXME: call check() function. */
int tdb_check(struct tdb_context *tdb, int tdb_check(struct tdb_context *tdb,
int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
void *private_data) void *private_data)
{ {
tdb_off_t *free = NULL, *used = NULL, flist, recovery; tdb_off_t *free = NULL, *used = NULL, ft, recovery;
size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0; size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0) if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
return -1; return -1;
...@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb, ...@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery)) if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
goto fail; goto fail;
for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) { for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
if (flist == TDB_OFF_ERR) if (ft == TDB_OFF_ERR)
goto fail; goto fail;
if (!check_free_list(tdb, flist, free, num_free, &num_found)) if (!check_free_table(tdb, ft, num_ftables, free, num_free,
&num_found))
goto fail; goto fail;
num_flists++; num_ftables++;
} }
/* FIXME: Check key uniqueness? */ /* FIXME: Check key uniqueness? */
if (!check_hash(tdb, used, num_used, num_flists)) if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
goto fail; goto fail;
if (num_found != num_free) { if (num_found != num_free) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Not all entries are in free table\n"); "tdb_check: Not all entries are in free table");
return false; return -1;
} }
tdb_allrecord_unlock(tdb, F_RDLCK); tdb_allrecord_unlock(tdb, F_RDLCK);
......
#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ #LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345 \lyxformat 345
\begin_document \begin_document
\begin_header \begin_header
...@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation ...@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation
\end_layout \end_layout
\begin_layout Date \begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout \end_layout
\begin_layout Abstract \begin_layout Abstract
...@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional ...@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "attributes" name "attributes"
...@@ -485,8 +477,6 @@ name "attributes" ...@@ -485,8 +477,6 @@ name "attributes"
\end_inset \end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size ...@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size
of the union. of the union.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_traverse Makes Impossible Guarantees tdb_traverse Makes Impossible Guarantees
\end_layout \end_layout
...@@ -631,6 +629,16 @@ Abandon the guarantee. ...@@ -631,6 +629,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API. You can prevent changes by using a transaction or the locking API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection \begin_layout Subsection
Nesting of Transactions Is Fraught Nesting of Transactions Is Fraught
\end_layout \end_layout
...@@ -685,6 +693,14 @@ least-surprise ...@@ -685,6 +693,14 @@ least-surprise
-obscure case. -obscure case.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Incorrect Hash Function is Not Detected Incorrect Hash Function is Not Detected
\end_layout \end_layout
...@@ -706,6 +722,14 @@ The header should contain an example hash result (eg. ...@@ -706,6 +722,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call. hash function produces the same answer, or fail the tdb_open call.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout \end_layout
...@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed. ...@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op. tuning, but initially will become a no-op.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether ...@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API. an API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB API Is Not POSIX Thread-safe TDB API Is Not POSIX Thread-safe
\end_layout \end_layout
...@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap ...@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard \begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist. version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook" ...@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset \end_inset
could be used to enable pthread locking at runtime. could be used to enable pthread locking at runtime.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when ...@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed. It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_chainlock Functions Expose Implementation tdb_chainlock Functions Expose Implementation
\end_layout \end_layout
...@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having ...@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals The API Uses Gratuitous Typedefs, Capitals
\end_layout \end_layout
...@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break ...@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI. the API/ABI.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Various Callback Functions Are Not Typesafe Various Callback Functions Are Not Typesafe
\end_layout \end_layout
...@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give ...@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout \end_layout
...@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" ...@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset \end_inset
. .
\change_inserted 0 1284015637 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\change_inserted 0 1284015716 \begin_layout Subsection
Extending The Header Is Difficult Extending The Header Is Difficult
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for We have reserved (zeroed) words in the TDB header, which can be used for
future features. future features.
If the future features are compulsory, the version number must be updated If the future features are compulsory, the version number must be updated
...@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for ...@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016114
The header should contain a The header should contain a
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -1249,58 +1325,48 @@ format variant ...@@ -1249,58 +1325,48 @@ format variant
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing The lower part reflects the format variant understood by code accessing
the database. the database.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading). to the database (otherwise you can only open for reading).
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant. the variant of the code is lower than the current lowest variant.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database. if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete.
\end_layout
\change_inserted 0 1284015634 \begin_layout Subsection
Record Headers Are Not Expandible Record Headers Are Not Expandible
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid. another format change, which we'd like to avoid.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record. We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a have a way for future changes to detect code which doesn't understand a
...@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record. ...@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record.
not present on that record. not present on that record.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\change_inserted 0 1284422568 \begin_layout Subsection
TDB Does Not Use Talloc TDB Does Not Use Talloc
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently. have to wrap TDB in a talloc context to use it conveniently.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB talloc, and I am reluctant to force another (excellent) library on TDB
users. users.
...@@ -1356,15 +1422,19 @@ context ...@@ -1356,15 +1422,19 @@ context
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc heirarchy as expected, but the caller would still This would form a talloc heirarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to have to attach a destructor to the tdb context returned from tdb_open to
close it. close it.
All TDB_DATA fields would be children of the tdb_context, and the caller All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()). would still have to manage them (using talloc_free() or talloc_steal()).
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Section \begin_layout Section
...@@ -1422,6 +1492,14 @@ Remove the flag. ...@@ -1422,6 +1492,14 @@ Remove the flag.
point. point.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Files Have a 4G Limit TDB Files Have a 4G Limit
\end_layout \end_layout
...@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August ...@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!) be erased and initialized as a fresh tdb!)
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Records Have a 4G Limit TDB Records Have a 4G Limit
\end_layout \end_layout
...@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A" ...@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A"
). ).
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Hash Size Is Determined At TDB Creation Time Hash Size Is Determined At TDB Creation Time
\end_layout \end_layout
...@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified ...@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "sub:Hash-Size-Solution" name "sub:Hash-Size-Solution"
\end_inset \end_inset
\change_unchanged
Proposed Solution Proposed Solution
\end_layout \end_layout
...@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin ...@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which , it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation. doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and : the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks. others might be holding locks.
...@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert ...@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills, each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree. turning into a hash tree.
...@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with ...@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions. hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits. We can use the otherwise-unused lower 3 bits.
...@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more ...@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid. bits are valid.
This means we can choose not to re-hash all entries when we expand a hash This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid. group; simply use the next bits we need and mark them invalid.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues ...@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_deleted 0 1283336858
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention. ...@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets. we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address. for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable. Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented" ...@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where ) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size. all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or set of free is to divide the file into zones, and using a free list (or table of free
lists) for each. lists) for each.
This approximates address ordering. This approximates address ordering.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
Note that this means we need to split the free lists when we expand the Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already. create a
In the case of increasing the file size, there is an optimization we can \begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the \end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary, recovery area
\emph on \begin_inset Quotes erd
and \end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive by simply appending to the file (difficult if it would need to create a
pair of free lists. new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1818,7 +1854,7 @@ The basic algorithm is as follows. ...@@ -1818,7 +1854,7 @@ The basic algorithm is as follows.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Identify the correct zone. Identify the correct free list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1826,12 +1862,12 @@ Lock the corresponding list. ...@@ -1826,12 +1862,12 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone (we didn't have a lock, sizes could have changed): relock Re-check the list (we didn't have a lock, sizes could have changed): relock
if necessary. if necessary.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Place the freed entry in the list for that zone. Place the freed entry in the list.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing ...@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1856,16 +1884,16 @@ Lock the corresponding list. ...@@ -1856,16 +1884,16 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone: relock if necessary. If the top entry is -large enough, remove it from the list and return it.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it. Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough, If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A" ...@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
I anticipate that the number of entries in each free zone would be small, Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2165,8 +2128,6 @@ miss ...@@ -2165,8 +2128,6 @@ miss
it reduces 99.9% of false memcmp). it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here. resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see bits in the hash table itself (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution" ...@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -2214,11 +2173,11 @@ struct tdb_used_record { ...@@ -2214,11 +2173,11 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t magic : 16, uint32_t used_magic : 16,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
prev_is_free: 1,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2226,7 +2185,7 @@ struct tdb_used_record { ...@@ -2226,7 +2185,7 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
top_hash: 10; top_hash: 11;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2250,29 +2209,27 @@ struct tdb_free_record { ...@@ -2250,29 +2209,27 @@ struct tdb_free_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t free_magic; uint64_t free_magic: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t total_length; prev : 56;
\change_inserted 0 1283337133
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
\change_inserted 0 1283337139 \end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
... total_length : 56
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t tailer; uint64_t next;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2281,20 +2238,19 @@ struct tdb_free_record { ...@@ -2281,20 +2238,19 @@ struct tdb_free_record {
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337235 \change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged \change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-bit words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum ...@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set. a transaction in progress; we need only check for recovery if this is set.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support ...@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
None.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say At some point you say
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -2413,13 +2371,6 @@ use a real database ...@@ -2413,13 +2371,6 @@ use a real database
\begin_inset Quotes erd \begin_inset Quotes erd
\end_inset \end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2428,8 +2379,6 @@ reference "replay-attribute" ...@@ -2428,8 +2379,6 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack ...@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard \begin_layout Standard
We could then implement snapshots using a similar method, using multiple We could then implement snapshots using a similar method, using multiple
different hash tables/free tables. different hash tables/free tables.
\change_inserted 0 1284423495 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2473,8 +2428,6 @@ Proposed Solution ...@@ -2473,8 +2428,6 @@ Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284424201
None (but see None (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2483,15 +2436,21 @@ reference "replay-attribute" ...@@ -2483,15 +2436,21 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done. until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit. commit.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Default Hash Function Is Suboptimal Default Hash Function Is Suboptimal
\end_layout \end_layout
...@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source, ...@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing. hash bombing.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution" ...@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution"
. .
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Fcntl Locking Adds Overhead Fcntl Locking Adds Overhead
\end_layout \end_layout
...@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the ...@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled). free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum. transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External Tracing Is Fragile, Replay Is External
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default. as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay In a similar way, the ctdb code has an external wrapper which does replay
...@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks ...@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution Proposed Solution
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2703,8 +2670,6 @@ name "replay-attribute" ...@@ -2703,8 +2670,6 @@ name "replay-attribute"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2715,8 +2680,14 @@ reference "attributes" ...@@ -2715,8 +2680,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and ) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support. future parallel transactions and snapshot support.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\end_body \end_body
......
head 1.10; head 1.13;
access; access;
symbols; symbols;
locks; strict; locks; strict;
comment @# @; comment @# @;
1.13
date 2010.12.01.12.22.08; author rusty; state Exp;
branches;
next 1.12;
1.12
date 2010.12.01.12.20.49; author rusty; state Exp;
branches;
next 1.11;
1.11
date 2010.12.01.11.55.20; author rusty; state Exp;
branches;
next 1.10;
1.10 1.10
date 2010.09.14.00.33.57; author rusty; state Exp; date 2010.09.14.00.33.57; author rusty; state Exp;
branches; branches;
...@@ -61,12 +76,12 @@ desc ...@@ -61,12 +76,12 @@ desc
@ @
1.10 1.13
log log
@Tracing attribute, talloc support. @Merged changes.
@ @
text text
@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ @#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345 \lyxformat 345
\begin_document \begin_document
\begin_header \begin_header
...@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation ...@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation
\end_layout \end_layout
\begin_layout Date \begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout \end_layout
\begin_layout Abstract \begin_layout Abstract
...@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional ...@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "attributes" name "attributes"
...@@ -553,8 +560,6 @@ name "attributes" ...@@ -553,8 +560,6 @@ name "attributes"
\end_inset \end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size ...@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size
of the union. of the union.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_traverse Makes Impossible Guarantees tdb_traverse Makes Impossible Guarantees
\end_layout \end_layout
...@@ -699,6 +712,16 @@ Abandon the guarantee. ...@@ -699,6 +712,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API. You can prevent changes by using a transaction or the locking API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection \begin_layout Subsection
Nesting of Transactions Is Fraught Nesting of Transactions Is Fraught
\end_layout \end_layout
...@@ -753,6 +776,14 @@ least-surprise ...@@ -753,6 +776,14 @@ least-surprise
-obscure case. -obscure case.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Incorrect Hash Function is Not Detected Incorrect Hash Function is Not Detected
\end_layout \end_layout
...@@ -774,6 +805,14 @@ The header should contain an example hash result (eg. ...@@ -774,6 +805,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call. hash function produces the same answer, or fail the tdb_open call.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout \end_layout
...@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed. ...@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op. tuning, but initially will become a no-op.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether ...@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API. an API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB API Is Not POSIX Thread-safe TDB API Is Not POSIX Thread-safe
\end_layout \end_layout
...@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap ...@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard \begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist. version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook" ...@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset \end_inset
could be used to enable pthread locking at runtime. could be used to enable pthread locking at runtime.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when ...@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed. It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_chainlock Functions Expose Implementation tdb_chainlock Functions Expose Implementation
\end_layout \end_layout
...@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having ...@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals The API Uses Gratuitous Typedefs, Capitals
\end_layout \end_layout
...@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break ...@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI. the API/ABI.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Various Callback Functions Are Not Typesafe Various Callback Functions Are Not Typesafe
\end_layout \end_layout
...@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give ...@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout \end_layout
...@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" ...@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset \end_inset
. .
\change_inserted 0 1284015637 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\change_inserted 0 1284015716 \begin_layout Subsection
Extending The Header Is Difficult Extending The Header Is Difficult
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for We have reserved (zeroed) words in the TDB header, which can be used for
future features. future features.
If the future features are compulsory, the version number must be updated If the future features are compulsory, the version number must be updated
...@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for ...@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016114
The header should contain a The header should contain a
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -1317,58 +1408,48 @@ format variant ...@@ -1317,58 +1408,48 @@ format variant
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing The lower part reflects the format variant understood by code accessing
the database. the database.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading). to the database (otherwise you can only open for reading).
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant. the variant of the code is lower than the current lowest variant.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database. if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible Record Headers Are Not Expandible
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid. another format change, which we'd like to avoid.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record. We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a have a way for future changes to detect code which doesn't understand a
...@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record. ...@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record.
not present on that record. not present on that record.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsubsection
Status
\end_layout
\change_inserted 0 1284422568 \begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
TDB Does Not Use Talloc TDB Does Not Use Talloc
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently. have to wrap TDB in a talloc context to use it conveniently.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB talloc, and I am reluctant to force another (excellent) library on TDB
users. users.
...@@ -1424,15 +1505,19 @@ context ...@@ -1424,15 +1505,19 @@ context
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc hierarchy as expected, but the caller would still This would form a talloc hierarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to have to attach a destructor to the tdb context returned from tdb_open to
close it. close it.
All TDB_DATA fields would be children of the tdb_context, and the caller All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()). would still have to manage them (using talloc_free() or talloc_steal()).
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Section \begin_layout Section
...@@ -1490,6 +1575,14 @@ Remove the flag. ...@@ -1490,6 +1575,14 @@ Remove the flag.
point. point.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Files Have a 4G Limit TDB Files Have a 4G Limit
\end_layout \end_layout
...@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August ...@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!) be erased and initialized as a fresh tdb!)
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Records Have a 4G Limit TDB Records Have a 4G Limit
\end_layout \end_layout
...@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A" ...@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A"
). ).
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Hash Size Is Determined At TDB Creation Time Hash Size Is Determined At TDB Creation Time
\end_layout \end_layout
...@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified ...@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "sub:Hash-Size-Solution" name "sub:Hash-Size-Solution"
\end_inset \end_inset
\change_unchanged
Proposed Solution Proposed Solution
\end_layout \end_layout
...@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin ...@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which , it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation. doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and : the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks. others might be holding locks.
...@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert ...@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills, each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree. turning into a hash tree.
...@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with ...@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions. hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits. We can use the otherwise-unused lower 3 bits.
...@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more ...@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid. bits are valid.
This means we can choose not to re-hash all entries when we expand a hash This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid. group; simply use the next bits we need and mark them invalid.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues ...@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_deleted 0 1283336858
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention. ...@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets. we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address. for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable. Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented" ...@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where ) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size. all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or set of free is to divide the file into zones, and using a free list (or table of free
lists) for each. lists) for each.
This approximates address ordering. This approximates address ordering.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
Note that this means we need to split the free lists when we expand the Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already. create a
In the case of increasing the file size, there is an optimization we can \begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the \end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary, recovery area
\emph on \begin_inset Quotes erd
and \end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive by simply appending to the file (difficult if it would need to create a
pair of free lists. new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1886,7 +1937,7 @@ The basic algorithm is as follows. ...@@ -1886,7 +1937,7 @@ The basic algorithm is as follows.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Identify the correct zone. Identify the correct free list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1894,12 +1945,12 @@ Lock the corresponding list. ...@@ -1894,12 +1945,12 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone (we didn't have a lock, sizes could have changed): relock Re-check the list (we didn't have a lock, sizes could have changed): relock
if necessary. if necessary.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Place the freed entry in the list for that zone. Place the freed entry in the list.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing ...@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1924,16 +1967,16 @@ Lock the corresponding list. ...@@ -1924,16 +1967,16 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone: relock if necessary. If the top entry is -large enough, remove it from the list and return it.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it. Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough, If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A" ...@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
I anticipate that the number of entries in each free zone would be small, Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2233,8 +2211,6 @@ miss ...@@ -2233,8 +2211,6 @@ miss
it reduces 99.9% of false memcmp). it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here. resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see bits in the hash table itself (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution" ...@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -2282,11 +2256,11 @@ struct tdb_used_record { ...@@ -2282,11 +2256,11 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t magic : 16, uint32_t used_magic : 16,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
prev_is_free: 1,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2294,7 +2268,7 @@ struct tdb_used_record { ...@@ -2294,7 +2268,7 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
top_hash: 10; top_hash: 11;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2318,29 +2292,27 @@ struct tdb_free_record { ...@@ -2318,29 +2292,27 @@ struct tdb_free_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t free_magic; uint64_t free_magic: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t total_length; prev : 56;
\change_inserted 0 1283337133
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
\change_inserted 0 1283337139 \end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
... total_length : 56
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t tailer; uint64_t next;;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2349,20 +2321,19 @@ struct tdb_free_record { ...@@ -2349,20 +2321,19 @@ struct tdb_free_record {
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337235 \change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged \change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-byte words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum ...@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set. a transaction in progress; we need only check for recovery if this is set.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support ...@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed SolutionNone.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say At some point you say
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -2481,13 +2454,6 @@ use a real database ...@@ -2481,13 +2454,6 @@ use a real database
\begin_inset Quotes erd \begin_inset Quotes erd
\end_inset \end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2496,8 +2462,6 @@ reference "replay-attribute" ...@@ -2496,8 +2462,6 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack ...@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard \begin_layout Standard
We could then implement snapshots using a similar method, using multiple We could then implement snapshots using a similar method, using multiple
different hash tables/free tables. different hash tables/free tables.
\change_inserted 0 1284423495 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2541,8 +2511,6 @@ Proposed Solution ...@@ -2541,8 +2511,6 @@ Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284424201
None (but see None (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2551,15 +2519,21 @@ reference "replay-attribute" ...@@ -2551,15 +2519,21 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done. until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit. commit.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Default Hash Function Is Suboptimal Default Hash Function Is Suboptimal
\end_layout \end_layout
...@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source, ...@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing. hash bombing.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution" ...@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution"
. .
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Fcntl Locking Adds Overhead Fcntl Locking Adds Overhead
\end_layout \end_layout
...@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the ...@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled). free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum. transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External Tracing Is Fragile, Replay Is External
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default. as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay In a similar way, the ctdb code has an external wrapper which does replay
...@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks ...@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution Proposed Solution
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2771,8 +2753,6 @@ name "replay-attribute" ...@@ -2771,8 +2753,6 @@ name "replay-attribute"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2783,8 +2763,14 @@ reference "attributes" ...@@ -2783,8 +2763,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and ) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support. future parallel transactions and snapshot support.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\end_body \end_body
...@@ -2792,6 +2778,996 @@ reference "attributes" ...@@ -2792,6 +2778,996 @@ reference "attributes"
@ @
1.12
log
@Add status, some fixes, linked freelists.
@
text
@d53 1
a53 7
\change_deleted 0 1291204535
14-September
\change_inserted 0 1291204533
1-December
\change_unchanged
-2010
a580 2
\change_inserted 0 1291204563
a583 2
\change_inserted 0 1291204572
a587 2
\change_inserted 0 1291204573
a588 2
\change_unchanged
a629 2
\change_inserted 0 1291204588
a632 2
\change_inserted 0 1291204588
a636 2
\change_inserted 0 1291204631
a639 2
\change_unchanged
a693 2
\change_inserted 0 1291204639
a696 2
\change_inserted 0 1291204640
a700 2
\change_inserted 0 1291204665
a701 2
\change_unchanged
a722 2
\change_inserted 0 1291204671
a725 2
\change_inserted 0 1291204671
a729 2
\change_inserted 0 1291204673
a730 2
\change_unchanged
a774 2
\change_inserted 0 1291204731
a777 2
\change_inserted 0 1291204732
a781 2
\change_inserted 0 1291204779
a784 2
\change_unchanged
a836 2
\change_inserted 0 1291204830
a839 2
\change_inserted 0 1291204831
a843 2
\change_inserted 0 1291204834
a844 2
\change_unchanged
a898 2
\change_inserted 0 1291204847
a901 2
\change_inserted 0 1291204847
a905 2
\change_inserted 0 1291204852
a906 2
\change_unchanged
a1021 2
\change_inserted 0 1291204881
a1024 2
\change_inserted 0 1291204881
a1028 2
\change_inserted 0 1291204885
a1029 2
\change_unchanged
a1110 2
\change_inserted 0 1291204898
a1113 2
\change_inserted 0 1291204898
a1117 2
\change_inserted 0 1291204901
a1118 2
\change_unchanged
a1194 2
\change_inserted 0 1291204908
a1197 2
\change_inserted 0 1291204908
a1201 2
\change_inserted 0 1291204908
a1202 2
\change_unchanged
a1241 2
\change_inserted 0 1291204917
a1244 2
\change_inserted 0 1291204917
a1248 2
\change_inserted 0 1291204920
a1249 2
\change_unchanged
a1286 2
\change_inserted 0 1291204927
a1289 2
\change_inserted 0 1291204928
a1293 2
\change_inserted 0 1291204942
a1294 2
\change_unchanged
a1345 2
\change_inserted 0 1291205003
a1348 2
\change_inserted 0 1291205004
a1352 2
\change_inserted 0 1291205007
a1375 2
\change_inserted 0 1291205019
a1378 2
\change_inserted 0 1291205019
a1382 2
\change_inserted 0 1291205023
a1383 2
\change_unchanged
a1429 2
\change_inserted 0 1291205029
a1432 2
\change_inserted 0 1291205029
a1436 2
\change_inserted 0 1291206020
a1437 2
\change_unchanged
a1492 2
\change_inserted 0 1291205043
a1495 2
\change_inserted 0 1291205043
a1499 2
\change_inserted 0 1291205057
a1500 2
\change_unchanged
a1547 2
\change_inserted 0 1291205062
a1550 2
\change_inserted 0 1291205062
a1554 2
\change_inserted 0 1291205062
a1555 2
\change_unchanged
a1584 2
\change_inserted 0 1291205072
a1587 2
\change_inserted 0 1291205073
a1591 2
\change_inserted 0 1291205073
a1592 2
\change_unchanged
a1632 4
\change_deleted 0 1291204504
\change_unchanged
a1657 2
\change_inserted 0 1291205079
a1660 2
\change_inserted 0 1291205080
a1664 2
\change_inserted 0 1291205080
a1665 2
\change_unchanged
a1791 2
\change_inserted 0 1291205090
d1827 2
a1828 7
is to divide the file into zones, and using a free list (or
\change_inserted 0 1291205498
table
\change_deleted 0 1291205497
set
\change_unchanged
of free lists) for each.
a1829 2
\change_inserted 0 1291205203
a1832 2
\change_inserted 0 1291205358
a1848 21
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205198
Note that this means we need to split the free lists when we expand the
file; this is probably acceptable when we double the hash table size, since
that is such an expensive operation already.
In the case of increasing the file size, there is an optimization we can
use: if we use M in the formula above as the file size rounded up to the
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary,
\emph on
and
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive
pair of free lists.
\change_unchanged
d1857 1
a1857 7
Identify the correct
\change_inserted 0 1291205366
free list
\change_deleted 0 1291205364
zone
\change_unchanged
.
d1865 2
a1866 7
Re-check the
\change_inserted 0 1291205372
list
\change_deleted 0 1291205371
zone
\change_unchanged
(we didn't have a lock, sizes could have changed): relock if necessary.
d1870 1
a1870 5
Place the freed entry in the list
\change_deleted 0 1291205382
for that zone
\change_unchanged
.
d1879 1
a1879 15
Pick a
\change_deleted 0 1291205403
zone either the zone we last freed into, or based on a
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\change_inserted 0 1291205411
free table; usually the previous one.
\change_unchanged
a1883 10
\change_deleted 0 1291205432
\end_layout
\begin_layout Enumerate
\change_deleted 0 1291205428
Re-check the zone: relock if necessary.
\change_unchanged
d1892 1
a1892 7
unlock the list and try the next
\change_inserted 0 1291205455
largest list
\change_deleted 0 1291205452
zone.
\change_inserted 0 1291205457
a1895 2
\change_inserted 0 1291205476
a1896 2
\change_unchanged
a1924 2
\change_inserted 0 1291205542
a1927 2
\change_inserted 0 1291205591
a1929 70
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205539
I anticipate that the number of entries in each free zone would be small,
but it might be worth using one free entry to hold pointers to the others
for cache efficiency.
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205534
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\end_layout
\end_inset
\change_inserted 0 1291205139
d2176 1
a2176 5
uint32_t
\change_inserted 0 1291205758
used_
\change_unchanged
magic : 16,
a2180 4
\change_deleted 0 1291205693
prev_is_free: 1,
\change_unchanged
d2188 1
a2188 7
top_hash: 1
\change_inserted 0 1291205704
1
\change_deleted 0 1291205704
0
\change_unchanged
;
d2212 1
a2212 9
uint
\change_inserted 0 1291205725
64
\change_deleted 0 1291205723
32
\change_unchanged
_t
\change_inserted 0 1291205753
free_magic: 8,
a2215 2
\change_inserted 0 1291205746
a2220 24
\change_deleted 0 1291205749
free_magic;
\change_unchanged
\end_layout
\begin_layout LyX-Code
uint64_t
\change_inserted 0 1291205786
free_table: 8,
\end_layout
\begin_layout LyX-Code
\change_inserted 0 1291205788
\change_unchanged
total_length
\change_inserted 0 1291205792
: 56
\change_deleted 0 1291205790
;
\change_unchanged
d2224 1
a2224 7
uint64_t
\change_deleted 0 1291205801
prev,
\change_unchanged
next;
\change_deleted 0 1291205811
d2228 1
a2228 3
\change_deleted 0 1291205811
...
d2232 1
a2232 5
\change_deleted 0 1291205808
uint64_t tailer
\change_unchanged
;
d2241 5
a2245 16
\change_deleted 0 1291205827
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_inserted 0 1291205885
Note that by limiting valid offsets to 56 bits, we can pack everything
we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
a2248 2
\change_inserted 0 1291205886
a2252 2
\change_inserted 0 1291205886
a2253 2
\change_unchanged
a2343 2
\change_inserted 0 1291205894
a2346 2
\change_inserted 0 1291205894
a2350 2
\change_inserted 0 1291205902
a2351 2
\change_unchanged
a2373 4
\change_deleted 0 1291204504
\change_unchanged
a2403 2
\change_inserted 0 1291205910
a2406 2
\change_inserted 0 1291205910
a2410 2
\change_inserted 0 1291205914
a2411 2
\change_unchanged
a2443 2
\change_inserted 0 1291205919
a2446 2
\change_inserted 0 1291205919
a2450 2
\change_inserted 0 1291205922
a2451 2
\change_unchanged
a2491 2
\change_inserted 0 1291205929
a2494 2
\change_inserted 0 1291205929
a2498 2
\change_inserted 0 1291205929
a2499 2
\change_unchanged
a2536 2
\change_inserted 0 1291205932
a2539 2
\change_inserted 0 1291205933
a2543 2
\change_inserted 0 1291205933
a2544 2
\change_unchanged
a2682 2
\change_inserted 0 1291205944
a2685 2
\change_inserted 0 1291205945
a2689 2
\change_inserted 0 1291205948
a2690 2
\change_unchanged
@
1.11
log
@Merge changes
@
text
@d53 7
a59 1
14-September-2010
d587 16
d644 18
d716 16
d753 16
d813 18
d883 16
d953 16
d1084 16
d1181 16
d1273 16
d1328 16
d1381 16
d1447 19
a1465 2
if older code (which doesn't understand the feature) writes to the database.Reco
rd Headers Are Not Expandible
d1484 16
d1546 16
d1617 16
d1680 16
d1725 16
d1810 16
d1951 8
a1958 3
Proposed SolutionThe first step is to remove all the current heuristics,
as they obviously interact, then examine them once the lock contention
is addressed.
d1989 7
a1995 2
is to divide the file into zones, and using a free list (or set of free
lists) for each.
d1997 2
d2002 25
d2039 2
d2049 7
a2055 1
Identify the correct zone.
d2063 7
a2069 2
Re-check the zone (we didn't have a lock, sizes could have changed): relock
if necessary.
d2073 5
a2077 1
Place the freed entry in the list for that zone.
d2086 3
a2088 1
Pick a zone either the zone we last freed into, or based on a
d2097 4
d2105 2
d2110 2
d2113 2
d2123 15
a2137 1
unlock the list and try the next zone.
d2166 11
d2180 2
d2185 2
d2190 2
d2223 1
a2223 1
status open
d2243 2
d2491 5
a2495 1
uint32_t magic : 16,
d2499 2
d2502 2
d2511 7
a2517 1
top_hash: 10;
d2541 29
a2569 1
uint32_t free_magic;
d2573 11
a2583 1
uint64_t total_length;
d2587 7
a2593 1
uint64_t prev, next;
d2597 2
d2603 5
a2607 1
uint64_t tailer;
d2615 2
d2628 18
d2736 16
d2808 16
d2856 16
d2912 16
d2965 16
d3119 16
@
1.10
log
@Tracing attribute, talloc support.
@
text
@d1 1
a1 1
#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
d53 1
a53 7
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
a472 2
\change_inserted 0 1284422789
a479 2
\change_unchanged
a838 2
\change_inserted 0 1284016998
a846 2
\change_unchanged
a1194 2
\change_inserted 0 1284015637
a1197 2
\change_inserted 0 1284015716
a1201 2
\change_inserted 0 1284015906
a1210 2
\change_inserted 0 1284015637
a1214 2
\change_inserted 0 1284016114
a1227 2
\change_inserted 0 1284016149
a1232 2
\change_inserted 0 1284016639
a1237 2
\change_inserted 0 1284016821
a1243 2
\change_inserted 0 1284016803
d1245 2
a1246 9
if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101
\end_layout
\begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible
a1249 2
\change_inserted 0 1284015634
a1254 2
\change_inserted 0 1284015634
a1258 2
\change_inserted 0 1284422552
a1267 2
\change_inserted 0 1284422568
a1271 2
\change_inserted 0 1284422646
a1276 2
\change_inserted 0 1284422656
a1280 2
\change_inserted 0 1284423065
a1305 2
\change_inserted 0 1284423042
a1310 2
\change_unchanged
a1457 2
\change_inserted 0 1283336713
a1463 2
\change_unchanged
d1482 2
d1485 1
a1485 51
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
a1492 2
\change_inserted 0 1283336187
a1500 2
\change_inserted 0 1283336586
a1510 2
\change_unchanged
d1636 3
a1638 8
Proposed Solution
\change_deleted 0 1283336858
\end_layout
\begin_layout Standard
The first step is to remove all the current heuristics, as they obviously
interact, then examine them once the lock contention is addressed.
a1647 2
\change_inserted 0 1283336910
a1650 2
\change_inserted 0 1283337052
a1655 2
\change_unchanged
a1776 2
\change_inserted 0 1283309850
a1779 2
\change_inserted 0 1283337216
a1813 2
\change_inserted 0 1284424151
a1825 2
\change_unchanged
a1830 2
\change_unchanged
a2031 2
\change_inserted 0 1283336739
a2040 2
\change_unchanged
a2117 2
\change_inserted 0 1283337133
a2120 2
\change_inserted 0 1283337139
a2121 2
\change_unchanged
a2136 2
\change_inserted 0 1283337235
a2147 2
\change_unchanged
d2251 1
a2251 7
Proposed Solution
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
d2261 1
a2261 1
\change_inserted 0 1284423891
d2263 1
a2263 4
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
a2271 2
\change_unchanged
a2293 2
\change_inserted 0 1284423495
a2312 2
\change_inserted 0 1284424201
d2321 1
a2321 3
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
a2505 2
\change_inserted 0 1284423555
a2508 2
\change_inserted 0 1284423617
a2512 2
\change_inserted 0 1284423719
a2519 2
\change_inserted 0 1284423864
a2530 2
\change_inserted 0 1284423850
a2540 2
\change_unchanged
@
1.9 1.9
log log
@Extension mechanism. @Extension mechanism.
......
No preview for this file type
...@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase ...@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
Rusty Russell, IBM Corporation Rusty Russell, IBM Corporation
14-September-2010 1-December-2010
Abstract Abstract
...@@ -129,6 +129,10 @@ union tdb_attribute { ...@@ -129,6 +129,10 @@ union tdb_attribute {
This allows future attributes to be added, even if this expands This allows future attributes to be added, even if this expands
the size of the union. the size of the union.
2.1.2 Status
Complete.
2.2 tdb_traverse Makes Impossible Guarantees 2.2 tdb_traverse Makes Impossible Guarantees
tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
...@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset. ...@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
You can prevent changes by using a transaction or the locking You can prevent changes by using a transaction or the locking
API. API.
2.2.2 Status
Complete. Delete-during-traverse will still delete every record,
too (assuming no other changes).
2.3 Nesting of Transactions Is Fraught 2.3 Nesting of Transactions Is Fraught
TDB has alternated between allowing nested transactions and not TDB has alternated between allowing nested transactions and not
...@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses ...@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
tdb_add_flags() and tdb_remove_flags(), so the API should not be tdb_add_flags() and tdb_remove_flags(), so the API should not be
expanded for this relatively-obscure case. expanded for this relatively-obscure case.
2.3.2 Status
Incomplete; nesting flag is still defined as per tdb1.
2.4 Incorrect Hash Function is Not Detected 2.4 Incorrect Hash Function is Not Detected
tdb_open_ex() allows the calling code to specify a different hash tdb_open_ex() allows the calling code to specify a different hash
...@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of ...@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
0xdeadbeef), and tdb_open_ex() should check that the given hash 0xdeadbeef), and tdb_open_ex() should check that the given hash
function produces the same answer, or fail the tdb_open call. function produces the same answer, or fail the tdb_open call.
2.4.2 Status
Complete.
2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation 2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
In response to scalability issues with the free list ([TDB-Freelist-Is] In response to scalability issues with the free list ([TDB-Freelist-Is]
...@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common ...@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
as fetch in order to allow some internal tuning, but initially as fetch in order to allow some internal tuning, but initially
will become a no-op. will become a no-op.
2.5.2 Status
Incomplete. TDB_VOLATILE still defined, but implementation should
fail on unknown flags to be future-proof.
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
In The Same Process In The Same Process
...@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some ...@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
benefit to adding a call to detect when a tdb_context is shared, benefit to adding a call to detect when a tdb_context is shared,
to allow other to create such an API. to allow other to create such an API.
2.6.2 Status
Incomplete.
2.7 TDB API Is Not POSIX Thread-safe 2.7 TDB API Is Not POSIX Thread-safe
The TDB API uses an error code which can be queried after an The TDB API uses an error code which can be queried after an
...@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that ...@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
proposed for [Proposed-Solution-locking-hook] could be used to proposed for [Proposed-Solution-locking-hook] could be used to
enable pthread locking at runtime. enable pthread locking at runtime.
2.7.2 Status
Incomplete.
2.8 *_nonblock Functions And *_mark Functions Expose 2.8 *_nonblock Functions And *_mark Functions Expose
Implementation Implementation
...@@ -343,6 +373,10 @@ locks it doesn't need to obtain. ...@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
It also keeps the complexity out of the API, and in ctdbd where It also keeps the complexity out of the API, and in ctdbd where
it is needed. it is needed.
2.8.2 Status
Incomplete.
2.9 tdb_chainlock Functions Expose Implementation 2.9 tdb_chainlock Functions Expose Implementation
tdb_chainlock locks some number of records, including the record tdb_chainlock locks some number of records, including the record
...@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered, ...@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
otherwise EAGAIN. otherwise EAGAIN.
] ]
2.10.2 Status
Incomplete.
2.11 The API Uses Gratuitous Typedefs, Capitals 2.11 The API Uses Gratuitous Typedefs, Capitals
typedefs are useful for providing source compatibility when types typedefs are useful for providing source compatibility when types
...@@ -433,6 +471,10 @@ the tdb_open_ex for logging. ...@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
It should simply take an extra argument, since we are prepared to It should simply take an extra argument, since we are prepared to
break the API/ABI. break the API/ABI.
2.12.2 Status
Complete.
2.13 Various Callback Functions Are Not Typesafe 2.13 Various Callback Functions Are Not Typesafe
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
...@@ -455,6 +497,10 @@ their parameter. ...@@ -455,6 +497,10 @@ their parameter.
See CCAN's typesafe_cb module at See CCAN's typesafe_cb module at
http://ccan.ozlabs.org/info/typesafe_cb.html http://ccan.ozlabs.org/info/typesafe_cb.html
2.13.2 Status
Incomplete.
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
tdb_reopen_all Problematic tdb_reopen_all Problematic
...@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it. ...@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
2.14.2 Status
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
implemented.
2.15 Extending The Header Is Difficult 2.15 Extending The Header Is Difficult
We have reserved (zeroed) words in the TDB header, which can be We have reserved (zeroed) words in the TDB header, which can be
...@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and ...@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
detection if older code (which doesn't understand the feature) detection if older code (which doesn't understand the feature)
writes to the database. writes to the database.
2.15.2 Status
Incomplete.
2.16 Record Headers Are Not Expandible 2.16 Record Headers Are Not Expandible
If we later want to add (say) checksums on keys and data, it If we later want to add (say) checksums on keys and data, it
...@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at ...@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
the tail, and thus if there is no tail or the first byte is 0, we the tail, and thus if there is no tail or the first byte is 0, we
would know the extension is not present on that record. would know the extension is not present on that record.
2.16.2 Status
Incomplete.
2.17 TDB Does Not Use Talloc 2.17 TDB Does Not Use Talloc
Many users of TDB (particularly Samba) use the talloc allocator, Many users of TDB (particularly Samba) use the talloc allocator,
...@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be ...@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
children of the tdb_context, and the caller would still have to children of the tdb_context, and the caller would still have to
manage them (using talloc_free() or talloc_steal()). manage them (using talloc_free() or talloc_steal()).
2.17.2 Status
Deferred.
3 Performance And Scalability Issues 3 Performance And Scalability Issues
3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
...@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers ...@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
tend to know when they are initializing for the first time and tend to know when they are initializing for the first time and
can simply unlink the old tdb at that point. can simply unlink the old tdb at that point.
3.1.2 Status
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
3.2 TDB Files Have a 4G Limit 3.2 TDB Files Have a 4G Limit
This seems to be becoming an issue (so much for “trivial”!), This seems to be becoming an issue (so much for “trivial”!),
...@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28 ...@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
August 2009, commit 398d0c29290: prior to that any unrecognized August 2009, commit 398d0c29290: prior to that any unrecognized
file format would be erased and initialized as a fresh tdb!) file format would be erased and initialized as a fresh tdb!)
3.2.2 Status
Complete.
3.3 TDB Records Have a 4G Limit 3.3 TDB Records Have a 4G Limit
This has not been a reported problem, and the API uses size_t This has not been a reported problem, and the API uses size_t
...@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It ...@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
seems unlikely that 32 bit keys will be a limitation, so the seems unlikely that 32 bit keys will be a limitation, so the
implementation may not support this (see [sub:Records-Incur-A]). implementation may not support this (see [sub:Records-Incur-A]).
3.3.2 Status
Complete.
3.4 Hash Size Is Determined At TDB Creation Time 3.4 Hash Size Is Determined At TDB Creation Time
TDB contains a number of hash chains in the header; the number is TDB contains a number of hash chains in the header; the number is
...@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an ...@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
expanding tree of hashes would be very close to optimal. expanding tree of hashes would be very close to optimal.
], it became clear that it is hard to beat a straight linear hash ], it became clear that it is hard to beat a straight linear hash
table which doubles in size when it reaches saturation. table which doubles in size when it reaches saturation.
Unfortunately, altering the hash table introduces serious locking
1. complications: the entire hash table needs to be locked to
enlarge the hash table, and others might be holding locks.
2.
3.
Unfortunately, altering the hash table introduces serious
locking complications: the entire hash table needs to be locked
to enlarge the hash table, and others might be holding locks.
Particularly insidious are insertions done under tdb_chainlock. Particularly insidious are insertions done under tdb_chainlock.
Thus an expanding layered hash will be used: an array of hash Thus an expanding layered hash will be used: an array of hash
...@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a ...@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
hash group; simply use the next bits we need and mark them hash group; simply use the next bits we need and mark them
invalid. invalid.
3.4.2 Status
Complete.
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
TDB uses a single linked list for the free list. Allocation TDB uses a single linked list for the free list. Allocation
...@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr ...@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
case where all processes are allocating/freeing the same size. case where all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most Thus we almost certainly need to divide in other ways: the most
obvious is to divide the file into zones, and using a free list obvious is to divide the file into zones, and using a free list
(or set of free lists) for each. This approximates address (or table of free lists) for each. This approximates address
ordering. ordering.
Note that this means we need to split the free lists when we Unfortunately it is difficult to know what heuristics should be
expand the file; this is probably acceptable when we double the used to determine zone sizes, and our transaction code relies on
hash table size, since that is such an expensive operation being able to create a “recovery area” by simply appending to the
already. In the case of increasing the file size, there is an file (difficult if it would need to create a new zone header).
optimization we can use: if we use M in the formula above as the Thus we use a linked-list of free tables; currently we only ever
file size rounded up to the next power of 2, we only need create one, but if there is more than one we choose one at random
reshuffle free lists when the file size crosses a power of 2 to use. In future we may use heuristics to add new free tables on
boundary, and reshuffling the free lists is trivial: we simply contention. We only expand the file when all free tables are
merge every consecutive pair of free lists. exhausted.
The basic algorithm is as follows. Freeing is simple: The basic algorithm is as follows. Freeing is simple:
1. Identify the correct zone. 1. Identify the correct free list.
2. Lock the corresponding list. 2. Lock the corresponding list.
3. Re-check the zone (we didn't have a lock, sizes could have 3. Re-check the list (we didn't have a lock, sizes could have
changed): relock if necessary. changed): relock if necessary.
4. Place the freed entry in the list for that zone. 4. Place the freed entry in the list.
Allocation is a little more complicated, as we perform delayed Allocation is a little more complicated, as we perform delayed
coalescing at this point: coalescing at this point:
1. Pick a zone either the zone we last freed into, or based on a “ 1. Pick a free table; usually the previous one.
random” number.
2. Lock the corresponding list. 2. Lock the corresponding list.
3. Re-check the zone: relock if necessary. 3. If the top entry is -large enough, remove it from the list and
4. If the top entry is -large enough, remove it from the list and
return it. return it.
5. Otherwise, coalesce entries in the list.If there was no entry 4. Otherwise, coalesce entries in the list.If there was no entry
large enough, unlock the list and try the next zone. large enough, unlock the list and try the next largest list
5. If no list has an entry which meets our needs, try the next
free table.
6. If no zone satisfies, expand the file. 6. If no zone satisfies, expand the file.
...@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low ...@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
does not need a tailer to coalesce, though if we needed one we does not need a tailer to coalesce, though if we needed one we
could have one cheaply: see [sub:Records-Incur-A]. could have one cheaply: see [sub:Records-Incur-A].
I anticipate that the number of entries in each free zone would Each free entry has the free table number in the header: less
be small, but it might be worth using one free entry to hold than 255. It also contains a doubly-linked list for easy
pointers to the others for cache efficiency. deletion.
<freelist-in-zone>If we want to avoid locking complexity
(enlarging the free lists when we enlarge the file) we could
place the array of free lists at the beginning of each zone. This
means existing array lists never move, but means that a record
cannot be larger than a zone. That in turn implies that zones
should be variable sized (say, power of 2), which makes the
question “what zone is this record in?” much harder (and “pick a
random zone”, but that's less common). It could be done with as
few as 4 bits from the record header.[footnote:
Using 2^{16+N*3}means 0 gives a minimal 65536-byte zone, 15 gives
the maximal 2^{61} byte zone. Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can
determine the start of the zone.
]
3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented 3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
...@@ -944,13 +997,13 @@ This produces a 16 byte used header like this: ...@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
struct tdb_used_record { struct tdb_used_record {
uint32_t magic : 16, uint32_t used_magic : 16,
prev_is_free: 1,
key_data_divide: 5, key_data_divide: 5,
top_hash: 10; top_hash: 11;
uint32_t extra_octets; uint32_t extra_octets;
...@@ -962,21 +1015,27 @@ And a free record like this: ...@@ -962,21 +1015,27 @@ And a free record like this:
struct tdb_free_record { struct tdb_free_record {
uint32_t free_magic; uint64_t free_magic: 8,
prev : 56;
uint64_t total_length;
uint64_t prev, next;
... uint64_t free_table: 8,
uint64_t tailer; total_length : 56
uint64_t next;;
}; };
We might want to take some bits from the used record's top_hash Note that by limiting valid offsets to 56 bits, we can pack
(and the free record which has 32 bits of padding to spare everything we need into 3 64-byte words, meaning our minimum
anyway) if we use variable sized zones. See [freelist-in-zone]. record size is 8 bytes.
3.7.2 Status
Complete.
3.8 Transaction Commit Requires 4 fdatasync 3.8 Transaction Commit Requires 4 fdatasync
...@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate ...@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
header field can be used to indicate a transaction in progress; header field can be used to indicate a transaction in progress;
we need only check for recovery if this is set. we need only check for recovery if this is set.
3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support 3.8.2 Status
3.9.1 Proposed Solution Deferred.
None. At some point you say “use a real database” (but see [replay-attribute] 3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
).
3.9.1 Proposed SolutionNone. At some point you say “use a real
database” (but see [replay-attribute]).
But as a thought experiment, if we implemented transactions to But as a thought experiment, if we implemented transactions to
only overwrite free entries (this is tricky: there must not be a only overwrite free entries (this is tricky: there must not be a
...@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too. ...@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
We could then implement snapshots using a similar method, using We could then implement snapshots using a similar method, using
multiple different hash tables/free tables. multiple different hash tables/free tables.
3.9.2 Status
Deferred.
3.10 Transactions Cannot Operate in Parallel 3.10 Transactions Cannot Operate in Parallel
This would be useless for ldb, as it hits the index records with This would be useless for ldb, as it hits the index records with
...@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit ...@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
until all r/o transactions are done. This would require a new until all r/o transactions are done. This would require a new
RO_TRANSACTION_LOCK, which would be upgraded on commit. RO_TRANSACTION_LOCK, which would be upgraded on commit.
3.10.2 Status
Deferred.
3.11 Default Hash Function Is Suboptimal 3.11 Default Hash Function Is Suboptimal
The Knuth-inspired multiplicative hash used by tdb is fairly slow The Knuth-inspired multiplicative hash used by tdb is fairly slow
...@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random ...@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
source, and placed in the header. This is far from foolproof, but source, and placed in the header. This is far from foolproof, but
adds a little bit of protection against hash bombing. adds a little bit of protection against hash bombing.
3.11.2 Status
Complete.
3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity 3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
We lock a record during traversal iteration, and try to grab that We lock a record during traversal iteration, and try to grab that
...@@ -1104,6 +1177,10 @@ indefinitely. ...@@ -1104,6 +1177,10 @@ indefinitely.
Remove reliability guarantees; see [traverse-Proposed-Solution]. Remove reliability guarantees; see [traverse-Proposed-Solution].
3.12.2 Status
Complete.
3.13 Fcntl Locking Adds Overhead 3.13 Fcntl Locking Adds Overhead
Placing a fcntl lock means a system call, as does removing one. Placing a fcntl lock means a system call, as does removing one.
...@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which ...@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
could become the basis for this and future parallel transactions could become the basis for this and future parallel transactions
and snapshot support. and snapshot support.
3.15.2 Status
Deferred.
...@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len) ...@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
return bucket; return bucket;
} }
tdb_off_t first_flist(struct tdb_context *tdb) tdb_off_t first_ftable(struct tdb_context *tdb)
{ {
return tdb_read_off(tdb, offsetof(struct tdb_header, free_list)); return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
} }
tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist) tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
{ {
return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next)); return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
} }
int tdb_flist_init(struct tdb_context *tdb) int tdb_ftable_init(struct tdb_context *tdb)
{ {
/* Use reservoir sampling algorithm to select a free list at random. */ /* Use reservoir sampling algorithm to select a free list at random. */
unsigned int rnd, max = 0; unsigned int rnd, max = 0, count = 0;
tdb_off_t off; tdb_off_t off;
tdb->flist_off = off = first_flist(tdb); tdb->ftable_off = off = first_ftable(tdb);
tdb->ftable = 0;
while (off) { while (off) {
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
...@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb) ...@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
rnd = random(); rnd = random();
if (rnd >= max) { if (rnd >= max) {
tdb->flist_off = off; tdb->ftable_off = off;
tdb->ftable = count;
max = rnd; max = rnd;
} }
off = next_flist(tdb, off); off = next_ftable(tdb, off);
count++;
} }
return 0; return 0;
} }
/* Offset of a given bucket. */ /* Offset of a given bucket. */
tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket) tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
{ {
return flist_off + offsetof(struct tdb_freelist, buckets) return ftable_off + offsetof(struct tdb_freetable, buckets)
+ bucket * sizeof(tdb_off_t); + bucket * sizeof(tdb_off_t);
} }
/* Returns free_buckets + 1, or list number to search. */ /* Returns free_buckets + 1, or list number to search. */
static tdb_off_t find_free_head(struct tdb_context *tdb, static tdb_off_t find_free_head(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t bucket) tdb_off_t bucket)
{ {
/* Speculatively search for a non-zero bucket. */ /* Speculatively search for a non-zero bucket. */
return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0), return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
bucket, TDB_FREE_BUCKETS); bucket, TDB_FREE_BUCKETS);
} }
/* Remove from free bucket. */ /* Remove from free bucket. */
static int remove_from_list(struct tdb_context *tdb, static int remove_from_list(struct tdb_context *tdb,
tdb_off_t b_off, tdb_off_t r_off, tdb_off_t b_off, tdb_off_t r_off,
struct tdb_free_record *r) const struct tdb_free_record *r)
{ {
tdb_off_t off; tdb_off_t off;
/* Front of list? */ /* Front of list? */
if (r->prev == 0) { if (frec_prev(r) == 0) {
off = b_off; off = b_off;
} else { } else {
off = r->prev + offsetof(struct tdb_free_record, next); off = frec_prev(r) + offsetof(struct tdb_free_record, next);
} }
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, off) != r_off) { if (tdb_read_off(tdb, off) != r_off) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"remove_from_list: %llu bad prev in list %llu\n", "remove_from_list: %llu bad prev in list %llu",
(long long)r_off, (long long)b_off); (long long)r_off, (long long)b_off);
return -1; return -1;
} }
...@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb, ...@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
} }
if (r->next != 0) { if (r->next != 0) {
off = r->next + offsetof(struct tdb_free_record, prev); off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
/* r->next->prev = r->prev */ /* r->next->prev = r->prev */
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, off) != r_off) { if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"remove_from_list: %llu bad list %llu\n", "remove_from_list: %llu bad list %llu",
(long long)r_off, (long long)b_off); (long long)r_off, (long long)b_off);
return -1; return -1;
} }
#endif #endif
if (tdb_write_off(tdb, off, r->prev)) { if (tdb_write_off(tdb, off, r->magic_and_prev)) {
return -1; return -1;
} }
} }
...@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb, ...@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
static int enqueue_in_free(struct tdb_context *tdb, static int enqueue_in_free(struct tdb_context *tdb,
tdb_off_t b_off, tdb_off_t b_off,
tdb_off_t off, tdb_off_t off,
struct tdb_free_record *new) tdb_len_t len)
{ {
new->prev = 0; struct tdb_free_record new;
uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
| len;
/* prev = 0. */
new.magic_and_prev = magic;
/* new->next = head. */ /* new->next = head. */
new->next = tdb_read_off(tdb, b_off); new.next = tdb_read_off(tdb, b_off);
if (new->next == TDB_OFF_ERR) if (new.next == TDB_OFF_ERR)
return -1; return -1;
if (new->next) { if (new.next) {
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, if (tdb_read_off(tdb,
new->next new.next + offsetof(struct tdb_free_record,
+ offsetof(struct tdb_free_record, prev)) magic_and_prev))
!= 0) { != magic) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"enqueue_in_free: %llu bad head prev %llu\n", "enqueue_in_free: %llu bad head"
(long long)new->next, (long long)b_off); " prev %llu",
(long long)new.next, (long long)b_off);
return -1; return -1;
} }
#endif #endif
/* next->prev = new. */ /* next->prev = new. */
if (tdb_write_off(tdb, new->next if (tdb_write_off(tdb, new.next
+ offsetof(struct tdb_free_record, prev), + offsetof(struct tdb_free_record,
off) != 0) magic_and_prev),
off | magic) != 0)
return -1; return -1;
} }
/* head = new */ /* head = new */
if (tdb_write_off(tdb, b_off, off) != 0) if (tdb_write_off(tdb, b_off, off) != 0)
return -1; return -1;
return tdb_write_convert(tdb, off, new, sizeof(*new)); return tdb_write_convert(tdb, off, &new, sizeof(new));
} }
/* List need not be locked. */ /* List need not be locked. */
int add_free_record(struct tdb_context *tdb, int add_free_record(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len_with_header) tdb_off_t off, tdb_len_t len_with_header)
{ {
struct tdb_free_record new;
tdb_off_t b_off; tdb_off_t b_off;
tdb_len_t len;
int ret; int ret;
assert(len_with_header >= sizeof(new)); assert(len_with_header >= sizeof(struct tdb_free_record));
new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL) len = len_with_header - sizeof(struct tdb_used_record);
| tdb->flist_off;
new.data_len = len_with_header - sizeof(struct tdb_used_record);
b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0) if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
return -1; return -1;
ret = enqueue_in_free(tdb, b_off, off, &new); ret = enqueue_in_free(tdb, b_off, off, len);
tdb_unlock_free_bucket(tdb, b_off); tdb_unlock_free_bucket(tdb, b_off);
return ret; return ret;
} }
...@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen, ...@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
return leftover; return leftover;
} }
static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
{
tdb_off_t off;
unsigned int i;
if (likely(tdb->ftable == ftable))
return tdb->ftable_off;
off = first_ftable(tdb);
for (i = 0; i < ftable; i++)
off = next_ftable(tdb, off);
return off;
}
/* Note: we unlock the current bucket if we coalesce or fail. */ /* Note: we unlock the current bucket if we coalesce or fail. */
static int coalesce(struct tdb_context *tdb, static int coalesce(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len) tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
{ {
struct tdb_free_record pad, *r;
tdb_off_t end; tdb_off_t end;
struct tdb_free_record rec;
add_stat(tdb, alloc_coalesce_tried, 1);
end = off + sizeof(struct tdb_used_record) + data_len; end = off + sizeof(struct tdb_used_record) + data_len;
while (end < tdb->map_size) { while (end < tdb->map_size) {
const struct tdb_free_record *r;
tdb_off_t nb_off; tdb_off_t nb_off;
unsigned ftable, bucket;
/* FIXME: do tdb_get here and below really win? */ r = tdb_access_read(tdb, end, sizeof(*r), true);
r = tdb_get(tdb, end, &pad, sizeof(pad));
if (!r) if (!r)
goto err; goto err;
if (frec_magic(r) != TDB_FREE_MAGIC) if (frec_magic(r) != TDB_FREE_MAGIC
|| frec_ftable(r) == TDB_FTABLE_NONE) {
tdb_access_release(tdb, r);
break; break;
}
nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len)); ftable = frec_ftable(r);
bucket = size_to_bucket(frec_len(r));
nb_off = bucket_off(ftable_offset(tdb, ftable), bucket);
tdb_access_release(tdb, r);
/* We may be violating lock order here, so best effort. */ /* We may be violating lock order here, so best effort. */
if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) {
add_stat(tdb, alloc_coalesce_lockfail, 1);
break; break;
}
/* Now we have lock, re-check. */ /* Now we have lock, re-check. */
r = tdb_get(tdb, end, &pad, sizeof(pad)); if (tdb_read_convert(tdb, end, &rec, sizeof(rec))) {
if (!r) {
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
goto err; goto err;
} }
if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) { if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
add_stat(tdb, alloc_coalesce_race, 1);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
break; break;
} }
if (unlikely(bucket_off(frec_flist(r), if (unlikely(frec_ftable(&rec) != ftable)
size_to_bucket(r->data_len)) || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
!= nb_off)) { add_stat(tdb, alloc_coalesce_race, 1);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
break; break;
} }
if (remove_from_list(tdb, nb_off, end, r) == -1) { if (remove_from_list(tdb, nb_off, end, &rec) == -1) {
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
goto err; goto err;
} }
end += sizeof(struct tdb_used_record) + r->data_len; end += sizeof(struct tdb_used_record) + frec_len(&rec);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
add_stat(tdb, alloc_coalesce_num_merged, 1);
} }
/* Didn't find any adjacent free? */ /* Didn't find any adjacent free? */
if (end == off + sizeof(struct tdb_used_record) + data_len) if (end == off + sizeof(struct tdb_used_record) + data_len)
return 0; return 0;
/* OK, expand record */ /* OK, expand initial record */
r = tdb_get(tdb, off, &pad, sizeof(pad)); if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
if (!r)
goto err; goto err;
if (r->data_len != data_len) { if (frec_len(&rec) != data_len) {
tdb->ecode = TDB_ERR_CORRUPT; tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "coalesce: expected data len %zu not %zu",
"coalesce: expected data len %llu not %llu\n", (size_t)data_len, (size_t)frec_len(&rec));
(long long)data_len, (long long)r->data_len);
goto err; goto err;
} }
if (remove_from_list(tdb, b_off, off, r) == -1) if (remove_from_list(tdb, b_off, off, &rec) == -1)
goto err;
r = tdb_access_write(tdb, off, sizeof(*r), true);
if (!r)
goto err; goto err;
/* We have to drop this to avoid deadlocks, so make sure record /* We have to drop this to avoid deadlocks, so make sure record
* doesn't get coalesced by someone else! */ * doesn't get coalesced by someone else! */
r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL); rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL))
r->data_len = end - off - sizeof(struct tdb_used_record); | (end - off - sizeof(struct tdb_used_record));
if (tdb_access_commit(tdb, r) != 0) if (tdb_write_off(tdb, off + offsetof(struct tdb_free_record,
ftable_and_len),
rec.ftable_and_len) != 0)
goto err; goto err;
add_stat(tdb, alloc_coalesce_succeeded, 1);
tdb_unlock_free_bucket(tdb, b_off); tdb_unlock_free_bucket(tdb, b_off);
if (add_free_record(tdb, off, end - off) == -1) if (add_free_record(tdb, off, end - off) == -1)
...@@ -333,19 +366,21 @@ err: ...@@ -333,19 +366,21 @@ err:
/* We need size bytes to put our key and data in. */ /* We need size bytes to put our key and data in. */
static tdb_off_t lock_and_alloc(struct tdb_context *tdb, static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t bucket, tdb_off_t bucket,
size_t keylen, size_t datalen, size_t keylen, size_t datalen,
bool want_extra, bool want_extra,
unsigned magic,
unsigned hashlow) unsigned hashlow)
{ {
tdb_off_t off, b_off,best_off; tdb_off_t off, b_off,best_off;
struct tdb_free_record pad, best = { 0 }, *r; struct tdb_free_record best = { 0 };
double multiplier; double multiplier;
size_t size = adjust_size(keylen, datalen); size_t size = adjust_size(keylen, datalen);
add_stat(tdb, allocs, 1);
again: again:
b_off = bucket_off(flist_off, bucket); b_off = bucket_off(ftable_off, bucket);
/* FIXME: Try non-blocking wait first, to measure contention. */ /* FIXME: Try non-blocking wait first, to measure contention. */
/* Lock this bucket. */ /* Lock this bucket. */
...@@ -353,7 +388,7 @@ again: ...@@ -353,7 +388,7 @@ again:
return TDB_OFF_ERR; return TDB_OFF_ERR;
} }
best.data_len = -1ULL; best.ftable_and_len = -1ULL;
best_off = 0; best_off = 0;
/* Get slack if we're after extra. */ /* Get slack if we're after extra. */
...@@ -369,30 +404,40 @@ again: ...@@ -369,30 +404,40 @@ again:
goto unlock_err; goto unlock_err;
while (off) { while (off) {
/* FIXME: Does tdb_get win anything here? */ const struct tdb_free_record *r;
r = tdb_get(tdb, off, &pad, sizeof(*r)); tdb_len_t len;
tdb_off_t next;
r = tdb_access_read(tdb, off, sizeof(*r), true);
if (!r) if (!r)
goto unlock_err; goto unlock_err;
if (frec_magic(r) != TDB_FREE_MAGIC) { if (frec_magic(r) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_access_release(tdb, r);
"lock_and_alloc: %llu non-free 0x%llx\n", tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
(long long)off, (long long)r->magic_and_meta); "lock_and_alloc: %llu non-free 0x%llx",
(long long)off, (long long)r->magic_and_prev);
goto unlock_err; goto unlock_err;
} }
if (r->data_len >= size && r->data_len < best.data_len) { if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
best_off = off; best_off = off;
best = *r; best = *r;
} }
if (best.data_len < size * multiplier && best_off) if (frec_len(&best) < size * multiplier && best_off) {
tdb_access_release(tdb, r);
break; break;
}
multiplier *= 1.01; multiplier *= 1.01;
next = r->next;
len = frec_len(r);
tdb_access_release(tdb, r);
/* Since we're going slow anyway, try coalescing here. */ /* Since we're going slow anyway, try coalescing here. */
switch (coalesce(tdb, off, b_off, r->data_len)) { switch (coalesce(tdb, off, b_off, len)) {
case -1: case -1:
/* This has already unlocked on error. */ /* This has already unlocked on error. */
return -1; return -1;
...@@ -400,7 +445,7 @@ again: ...@@ -400,7 +445,7 @@ again:
/* This has unlocked list, restart. */ /* This has unlocked list, restart. */
goto again; goto again;
} }
off = r->next; off = next;
} }
/* If we found anything at all, use it. */ /* If we found anything at all, use it. */
...@@ -413,28 +458,30 @@ again: ...@@ -413,28 +458,30 @@ again:
goto unlock_err; goto unlock_err;
leftover = record_leftover(keylen, datalen, want_extra, leftover = record_leftover(keylen, datalen, want_extra,
best.data_len); frec_len(&best));
assert(keylen + datalen + leftover <= best.data_len); assert(keylen + datalen + leftover <= frec_len(&best));
/* We need to mark non-free before we drop lock, otherwise /* We need to mark non-free before we drop lock, otherwise
* coalesce() could try to merge it! */ * coalesce() could try to merge it! */
if (set_header(tdb, &rec, keylen, datalen, if (set_header(tdb, &rec, magic, keylen, datalen,
best.data_len - leftover, frec_len(&best) - leftover, hashlow) != 0)
hashlow) != 0)
goto unlock_err; goto unlock_err;
if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0) if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
goto unlock_err; goto unlock_err;
tdb_unlock_free_bucket(tdb, b_off); /* Bucket of leftover will be <= current bucket, so nested
* locking is allowed. */
if (leftover) { if (leftover) {
add_stat(tdb, alloc_leftover, 1);
if (add_free_record(tdb, if (add_free_record(tdb,
best_off + sizeof(rec) best_off + sizeof(rec)
+ best.data_len - leftover, + frec_len(&best) - leftover,
leftover)) leftover))
return TDB_OFF_ERR; best_off = TDB_OFF_ERR;
} }
tdb_unlock_free_bucket(tdb, b_off);
return best_off; return best_off;
} }
...@@ -449,10 +496,10 @@ unlock_err: ...@@ -449,10 +496,10 @@ unlock_err:
/* Get a free block from current free list, or 0 if none. */ /* Get a free block from current free list, or 0 if none. */
static tdb_off_t get_free(struct tdb_context *tdb, static tdb_off_t get_free(struct tdb_context *tdb,
size_t keylen, size_t datalen, bool want_extra, size_t keylen, size_t datalen, bool want_extra,
unsigned hashlow) unsigned magic, unsigned hashlow)
{ {
tdb_off_t off, flist; tdb_off_t off, ftable_off;
unsigned start_b, b; unsigned start_b, b, ftable;
bool wrapped = false; bool wrapped = false;
/* If they are growing, add 50% to get to higher bucket. */ /* If they are growing, add 50% to get to higher bucket. */
...@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb, ...@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
else else
start_b = size_to_bucket(adjust_size(keylen, datalen)); start_b = size_to_bucket(adjust_size(keylen, datalen));
flist = tdb->flist_off; ftable_off = tdb->ftable_off;
while (!wrapped || flist != tdb->flist_off) { ftable = tdb->ftable;
while (!wrapped || ftable_off != tdb->ftable_off) {
/* Start at exact size bucket, and search up... */ /* Start at exact size bucket, and search up... */
for (b = find_free_head(tdb, flist, start_b); for (b = find_free_head(tdb, ftable_off, start_b);
b < TDB_FREE_BUCKETS; b < TDB_FREE_BUCKETS;
b = find_free_head(tdb, flist, b + 1)) { b = find_free_head(tdb, ftable_off, b + 1)) {
/* Try getting one from list. */ /* Try getting one from list. */
off = lock_and_alloc(tdb, flist, off = lock_and_alloc(tdb, ftable_off,
b, keylen, datalen, want_extra, b, keylen, datalen, want_extra,
hashlow); magic, hashlow);
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
return TDB_OFF_ERR; return TDB_OFF_ERR;
if (off != 0) { if (off != 0) {
if (b == start_b)
add_stat(tdb, alloc_bucket_exact, 1);
if (b == TDB_FREE_BUCKETS - 1)
add_stat(tdb, alloc_bucket_max, 1);
/* Worked? Stay using this list. */ /* Worked? Stay using this list. */
tdb->flist_off = flist; tdb->ftable_off = ftable_off;
tdb->ftable = ftable;
return off; return off;
} }
/* Didn't work. Try next bucket. */ /* Didn't work. Try next bucket. */
} }
/* Hmm, try next list. */ /* Hmm, try next table. */
flist = next_flist(tdb, flist); ftable_off = next_ftable(tdb, ftable_off);
if (flist == 0) { ftable++;
if (ftable_off == 0) {
wrapped = true; wrapped = true;
flist = first_flist(tdb); ftable_off = first_ftable(tdb);
ftable = 0;
} }
} }
...@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb, ...@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
int set_header(struct tdb_context *tdb, int set_header(struct tdb_context *tdb,
struct tdb_used_record *rec, struct tdb_used_record *rec,
uint64_t keylen, uint64_t datalen, unsigned magic, uint64_t keylen, uint64_t datalen,
uint64_t actuallen, unsigned hashlow) uint64_t actuallen, unsigned hashlow)
{ {
uint64_t keybits = (fls64(keylen) + 1) / 2; uint64_t keybits = (fls64(keylen) + 1) / 2;
...@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb, ...@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
rec->magic_and_meta = (hashlow & ((1 << 11)-1)) rec->magic_and_meta = (hashlow & ((1 << 11)-1))
| ((actuallen - (keylen + datalen)) << 11) | ((actuallen - (keylen + datalen)) << 11)
| (keybits << 43) | (keybits << 43)
| (TDB_MAGIC << 48); | ((uint64_t)magic << 48);
rec->key_and_data_len = (keylen | (datalen << (keybits*2))); rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
/* Encoding can fail on big values. */ /* Encoding can fail on big values. */
if (rec_key_length(rec) != keylen if (rec_key_length(rec) != keylen
|| rec_data_length(rec) != datalen || rec_data_length(rec) != datalen
|| rec_extra_padding(rec) != actuallen - (keylen + datalen)) { || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "Could not encode k=%llu,d=%llu,a=%llu",
"Could not encode k=%llu,d=%llu,a=%llu\n",
(long long)keylen, (long long)datalen, (long long)keylen, (long long)datalen,
(long long)actuallen); (long long)actuallen);
return -1; return -1;
...@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* Need to hold a hash lock to expand DB: transactions rely on it. */ /* Need to hold a hash lock to expand DB: transactions rely on it. */
if (!(tdb->flags & TDB_NOLOCK) if (!(tdb->flags & TDB_NOLOCK)
&& !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) { && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_expand: must hold lock during expand\n"); "tdb_expand: must hold lock during expand");
return -1; return -1;
} }
/* always make room for at least 100 more records, and at
least 25% more space. */
if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
wanted = size * TDB_EXTENSION_FACTOR;
else
wanted = tdb->map_size / 4;
wanted = adjust_size(0, wanted);
/* Only one person can expand file at a time. */ /* Only one person can expand file at a time. */
if (tdb_lock_expand(tdb, F_WRLCK) != 0) if (tdb_lock_expand(tdb, F_WRLCK) != 0)
return -1; return -1;
...@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
return 0; return 0;
} }
if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) { if (tdb->methods->expand_file(tdb, wanted) == -1) {
tdb_unlock_expand(tdb, F_WRLCK); tdb_unlock_expand(tdb, F_WRLCK);
return -1; return -1;
} }
...@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* We need to drop this lock before adding free record. */ /* We need to drop this lock before adding free record. */
tdb_unlock_expand(tdb, F_WRLCK); tdb_unlock_expand(tdb, F_WRLCK);
return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR); add_stat(tdb, expands, 1);
return add_free_record(tdb, old_size, wanted);
} }
/* This won't fail: it will expand the database if it has to. */ /* This won't fail: it will expand the database if it has to. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing) uint64_t hash, unsigned magic, bool growing)
{ {
tdb_off_t off; tdb_off_t off;
...@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, ...@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
assert(!tdb->direct_access); assert(!tdb->direct_access);
for (;;) { for (;;) {
off = get_free(tdb, keylen, datalen, growing, hash); off = get_free(tdb, keylen, datalen, growing, magic, hash);
if (likely(off != 0)) if (likely(off != 0))
break; break;
......
...@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len) ...@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off) uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
{ {
struct tdb_used_record pad, *r; const struct tdb_used_record *r;
const void *key; const void *key;
uint64_t klen, hash; uint64_t klen, hash;
r = tdb_get(tdb, off, &pad, sizeof(pad)); r = tdb_access_read(tdb, off, sizeof(*r), true);
if (!r) if (!r)
/* FIXME */ /* FIXME */
return 0; return 0;
klen = rec_key_length(r); klen = rec_key_length(r);
key = tdb_access_read(tdb, off + sizeof(pad), klen, false); tdb_access_release(tdb, r);
key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
if (!key) if (!key)
return 0; return 0;
...@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num) ...@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
return bits(h->h, 64 - h->hash_used, num); return bits(h->h, 64 - h->hash_used, num);
} }
static bool key_matches(struct tdb_context *tdb,
const struct tdb_used_record *rec,
tdb_off_t off,
const struct tdb_data *key)
{
bool ret = false;
const char *rkey;
if (rec_key_length(rec) != key->dsize) {
add_stat(tdb, compare_wrong_keylen, 1);
return ret;
}
rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
if (!rkey)
return ret;
if (memcmp(rkey, key->dptr, key->dsize) == 0)
ret = true;
else
add_stat(tdb, compare_wrong_keycmp, 1);
tdb_access_release(tdb, rkey);
return ret;
}
/* Does entry match? */ /* Does entry match? */
static bool match(struct tdb_context *tdb, static bool match(struct tdb_context *tdb,
struct hash_info *h, struct hash_info *h,
...@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb, ...@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
tdb_off_t val, tdb_off_t val,
struct tdb_used_record *rec) struct tdb_used_record *rec)
{ {
bool ret;
const unsigned char *rkey;
tdb_off_t off; tdb_off_t off;
/* FIXME: Handle hash value truncated. */ add_stat(tdb, compares, 1);
if (bits(val, TDB_OFF_HASH_TRUNCATED_BIT, 1))
abort();
/* Desired bucket must match. */ /* Desired bucket must match. */
if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
add_stat(tdb, compare_wrong_bucket, 1);
return false; return false;
}
/* Top bits of offset == next bits of hash. */ /* Top bits of offset == next bits of hash. */
if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA) if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
!= bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, != bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
TDB_OFF_UPPER_STEAL_EXTRA)) TDB_OFF_UPPER_STEAL_EXTRA)) {
add_stat(tdb, compare_wrong_offsetbits, 1);
return false; return false;
}
off = val & TDB_OFF_MASK; off = val & TDB_OFF_MASK;
if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1) if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
return false; return false;
/* FIXME: check extra bits in header? */ if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
if (rec_key_length(rec) != key->dsize) add_stat(tdb, compare_wrong_rechash, 1);
return false; return false;
}
rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false); return key_matches(tdb, rec, off, key);
if (!rkey)
return false;
ret = (memcmp(rkey, key->dptr, key->dsize) == 0);
tdb_access_release(tdb, rkey);
return ret;
} }
static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
...@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) ...@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); + (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
} }
/* Truncated hashes can't be all 1: that's how we spot a sub-hash */
bool is_subhash(tdb_off_t val) bool is_subhash(tdb_off_t val)
{ {
return val >> (64-TDB_OFF_UPPER_STEAL) == (1<<TDB_OFF_UPPER_STEAL) - 1; return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
} }
/* FIXME: Guess the depth, don't over-lock! */ /* FIXME: Guess the depth, don't over-lock! */
...@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size) ...@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
} }
static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
struct tdb_data key,
tdb_off_t chain,
struct hash_info *h,
struct tdb_used_record *rec,
struct traverse_info *tinfo)
{
tdb_off_t off, next;
/* In case nothing is free, we set these to zero. */
h->home_bucket = h->found_bucket = 0;
for (off = chain; off; off = next) {
unsigned int i;
h->group_start = off;
if (tdb_read_convert(tdb, off, h->group, sizeof(h->group)))
return TDB_OFF_ERR;
for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
tdb_off_t recoff;
if (!h->group[i]) {
/* Remember this empty bucket. */
h->home_bucket = h->found_bucket = i;
continue;
}
/* We can insert extra bits via add_to_hash
* empty bucket logic. */
recoff = h->group[i] & TDB_OFF_MASK;
if (tdb_read_convert(tdb, recoff, rec, sizeof(*rec)))
return TDB_OFF_ERR;
if (key_matches(tdb, rec, recoff, &key)) {
h->home_bucket = h->found_bucket = i;
if (tinfo) {
tinfo->levels[tinfo->num_levels]
.hashtable = off;
tinfo->levels[tinfo->num_levels]
.total_buckets
= 1 << TDB_HASH_GROUP_BITS;
tinfo->levels[tinfo->num_levels].entry
= i;
tinfo->num_levels++;
}
return recoff;
}
}
next = tdb_read_off(tdb, off
+ offsetof(struct tdb_chain, next));
if (next == TDB_OFF_ERR)
return TDB_OFF_ERR;
if (next)
next += sizeof(struct tdb_used_record);
}
return 0;
}
/* This is the core routine which searches the hashtable for an entry. /* This is the core routine which searches the hashtable for an entry.
* On error, no locks are held and TDB_OFF_ERR is returned. * On error, no locks are held and TDB_OFF_ERR is returned.
* Otherwise, hinfo is filled in (and the optional tinfo). * Otherwise, hinfo is filled in (and the optional tinfo).
...@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb, ...@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS; tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
} }
while (likely(h->hash_used < 64)) { while (h->hash_used <= 64) {
/* Read in the hash group. */ /* Read in the hash group. */
h->group_start = hashtable h->group_start = hashtable
+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
...@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb, ...@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
return 0; return 0;
} }
/* FIXME: We hit the bottom. Chain! */ return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
abort();
fail: fail:
tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype); tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
...@@ -239,8 +317,8 @@ fail: ...@@ -239,8 +317,8 @@ fail:
/* I wrote a simple test, expanding a hash to 2GB, for the following /* I wrote a simple test, expanding a hash to 2GB, for the following
* cases: * cases:
* 1) Expanding all the buckets at once, * 1) Expanding all the buckets at once,
* 2) Expanding the most-populated bucket, * 2) Expanding the bucket we wanted to place the new entry into.
* 3) Expanding the bucket we wanted to place the new entry ito. * 3) Expanding the most-populated bucket,
* *
* I measured the worst/average/best density during this process. * I measured the worst/average/best density during this process.
* 1) 3%/16%/30% * 1) 3%/16%/30%
...@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb, ...@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
encode_offset(new_off, h)); encode_offset(new_off, h));
} }
/* We slot in anywhere that's empty in the chain. */
static int COLD add_to_chain(struct tdb_context *tdb,
tdb_off_t subhash,
tdb_off_t new_off)
{
size_t entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
if (entry == 1 << TDB_HASH_GROUP_BITS) {
tdb_off_t next;
next = tdb_read_off(tdb, subhash
+ offsetof(struct tdb_chain, next));
if (next == TDB_OFF_ERR)
return -1;
if (!next) {
next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
TDB_CHAIN_MAGIC, false);
if (next == TDB_OFF_ERR)
return -1;
if (zero_out(tdb, next+sizeof(struct tdb_used_record),
sizeof(struct tdb_chain)))
return -1;
if (tdb_write_off(tdb, subhash
+ offsetof(struct tdb_chain, next),
next) != 0)
return -1;
}
return add_to_chain(tdb, next, new_off);
}
return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
new_off);
}
/* Add into a newly created subhash. */ /* Add into a newly created subhash. */
static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
unsigned hash_used, tdb_off_t val) unsigned hash_used, tdb_off_t val)
...@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, ...@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
h.hash_used = hash_used; h.hash_used = hash_used;
/* FIXME chain if hash_used == 64 */
if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64) if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
abort(); return add_to_chain(tdb, subhash, off);
/* FIXME: Do truncated hash bits if we can! */
h.h = hash_record(tdb, off); h.h = hash_record(tdb, off);
gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS); gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
h.group_start = subhash + sizeof(struct tdb_used_record) h.group_start = subhash
+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS); h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
...@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, ...@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
static int expand_group(struct tdb_context *tdb, struct hash_info *h) static int expand_group(struct tdb_context *tdb, struct hash_info *h)
{ {
unsigned bucket, num_vals, i; unsigned bucket, num_vals, i, magic;
size_t subsize;
tdb_off_t subhash; tdb_off_t subhash;
tdb_off_t vals[1 << TDB_HASH_GROUP_BITS]; tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
/* Attach new empty subhash under fullest bucket. */ /* Attach new empty subhash under fullest bucket. */
bucket = fullest_bucket(tdb, h->group, h->home_bucket); bucket = fullest_bucket(tdb, h->group, h->home_bucket);
subhash = alloc(tdb, 0, sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS, if (h->hash_used == 64) {
0, false); add_stat(tdb, alloc_chain, 1);
subsize = sizeof(struct tdb_chain);
magic = TDB_CHAIN_MAGIC;
} else {
add_stat(tdb, alloc_subhash, 1);
subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
magic = TDB_HTABLE_MAGIC;
}
subhash = alloc(tdb, 0, subsize, 0, magic, false);
if (subhash == TDB_OFF_ERR) if (subhash == TDB_OFF_ERR)
return -1; return -1;
if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), subsize))
sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) == -1)
return -1; return -1;
/* Remove any which are destined for bucket or are in wrong place. */ /* Remove any which are destined for bucket or are in wrong place. */
...@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h) ...@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
/* assert(num_vals); */ /* assert(num_vals); */
/* Overwrite expanded bucket with subhash pointer. */ /* Overwrite expanded bucket with subhash pointer. */
h->group[bucket] = subhash | ~((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1); h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
/* Point to actual contents of record. */
subhash += sizeof(struct tdb_used_record);
/* Put values back. */ /* Put values back. */
for (i = 0; i < num_vals; i++) { for (i = 0; i < num_vals; i++) {
...@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h) ...@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off) int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
{ {
/* FIXME: chain! */
if (h->hash_used >= 64)
abort();
/* We hit an empty bucket during search? That's where it goes. */ /* We hit an empty bucket during search? That's where it goes. */
if (!h->group[h->found_bucket]) { if (!h->group[h->found_bucket]) {
h->group[h->found_bucket] = encode_offset(new_off, h); h->group[h->found_bucket] = encode_offset(new_off, h);
...@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off) ...@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
h->group, sizeof(h->group)); h->group, sizeof(h->group));
} }
if (h->hash_used > 64)
return add_to_chain(tdb, h->group_start, new_off);
/* We're full. Expand. */ /* We're full. Expand. */
if (expand_group(tdb, h) == -1) if (expand_group(tdb, h) == -1)
return -1; return -1;
...@@ -523,7 +645,11 @@ again: ...@@ -523,7 +645,11 @@ again:
tlevel++; tlevel++;
tlevel->hashtable = off + sizeof(struct tdb_used_record); tlevel->hashtable = off + sizeof(struct tdb_used_record);
tlevel->entry = 0; tlevel->entry = 0;
tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS); /* Next level is a chain? */
if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
else
tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
goto again; goto again;
} }
...@@ -531,6 +657,20 @@ again: ...@@ -531,6 +657,20 @@ again:
if (tinfo->num_levels == 1) if (tinfo->num_levels == 1)
return 0; return 0;
/* Handle chained entries. */
if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+ offsetof(struct tdb_chain,
next));
if (tlevel->hashtable == TDB_OFF_ERR)
return TDB_OFF_ERR;
if (tlevel->hashtable) {
tlevel->hashtable += sizeof(struct tdb_used_record);
tlevel->entry = 0;
goto again;
}
}
/* Go back up and keep searching. */ /* Go back up and keep searching. */
tinfo->num_levels--; tinfo->num_levels--;
tlevel--; tlevel--;
...@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype, ...@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
ltype); ltype);
return -1; return -1;
} }
if (rec_magic(&rec) != TDB_MAGIC) { if (rec_magic(&rec) != TDB_USED_MAGIC) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"next_in_hash:" TDB_DEBUG_FATAL,
" corrupt record at %llu\n", "next_in_hash:"
(long long)off); " corrupt record at %llu",
(long long)off);
return -1; return -1;
} }
......
...@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb) ...@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
*/ */
if (tdb->map_ptr == MAP_FAILED) { if (tdb->map_ptr == MAP_FAILED) {
tdb->map_ptr = NULL; tdb->map_ptr = NULL;
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
"tdb_mmap failed for size %lld (%s)\n", "tdb_mmap failed for size %lld (%s)",
(long long)tdb->map_size, strerror(errno)); (long long)tdb->map_size, strerror(errno));
} }
} }
...@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb) ...@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
{ {
struct stat st; struct stat st;
int ret;
/* We can't hold pointers during this: we could unmap! */ /* We can't hold pointers during this: we could unmap! */
assert(!tdb->direct_access assert(!tdb->direct_access
...@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
if (!probe) { if (!probe) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO;
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"tdb_oob len %lld beyond internal" "tdb_oob len %lld beyond internal"
" malloc size %lld\n", " malloc size %lld",
(long long)len, (long long)len,
(long long)tdb->map_size); (long long)tdb->map_size);
} }
...@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
if (tdb_lock_expand(tdb, F_RDLCK) != 0) if (tdb_lock_expand(tdb, F_RDLCK) != 0)
return -1; return -1;
ret = fstat(tdb->fd, &st); if (fstat(tdb->fd, &st) != 0) {
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb_unlock_expand(tdb, F_RDLCK); "Failed to fstat file: %s", strerror(errno));
tdb_unlock_expand(tdb, F_RDLCK);
if (ret == -1) {
tdb->ecode = TDB_ERR_IO;
return -1; return -1;
} }
tdb_unlock_expand(tdb, F_RDLCK);
if (st.st_size < (size_t)len) { if (st.st_size < (size_t)len) {
if (!probe) { if (!probe) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO; "tdb_oob len %zu beyond eof at %zu",
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, (size_t)len, st.st_size);
"tdb_oob len %lld beyond eof at %lld\n",
(long long)len, (long long)st.st_size);
} }
return -1; return -1;
} }
...@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
} }
/* Either make a copy into pad and return that, or return ptr into mmap. */
/* Note: pad has to be a real object, so we can't get here if len
* overflows size_t */
void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
{
if (likely(!(tdb->flags & TDB_CONVERT))) {
void *ret = tdb->methods->direct(tdb, off, len);
if (ret)
return ret;
}
return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
}
/* Endian conversion: we only ever deal with 8 byte quantities */ /* Endian conversion: we only ever deal with 8 byte quantities */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size) void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
{ {
...@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, ...@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
{ {
char buf[8192] = { 0 }; char buf[8192] = { 0 };
void *p = tdb->methods->direct(tdb, off, len); void *p = tdb->methods->direct(tdb, off, len, true);
assert(!tdb->read_only);
if (p) { if (p) {
memset(p, 0, len); memset(p, 0, len);
return 0; return 0;
...@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) ...@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off) tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
{ {
tdb_off_t pad, *ret; tdb_off_t ret;
ret = tdb_get(tdb, off, &pad, sizeof(pad)); if (likely(!(tdb->flags & TDB_CONVERT))) {
if (!ret) { tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
return TDB_OFF_ERR; false);
if (p)
return *p;
} }
return *ret;
if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
return TDB_OFF_ERR;
return ret;
} }
/* Even on files, we can get partial writes due to signals. */ /* Even on files, we can get partial writes due to signals. */
...@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len) ...@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
static int tdb_write(struct tdb_context *tdb, tdb_off_t off, static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
const void *buf, tdb_len_t len) const void *buf, tdb_len_t len)
{ {
if (len == 0) {
return 0;
}
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return -1; return -1;
} }
/* FIXME: Bogus optimization? */
if (len == 0) {
return 0;
}
if (tdb->methods->oob(tdb, off + len, 0) != 0) if (tdb->methods->oob(tdb, off + len, 0) != 0)
return -1; return -1;
...@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
memcpy(off + (char *)tdb->map_ptr, buf, len); memcpy(off + (char *)tdb->map_ptr, buf, len);
} else { } else {
if (!tdb_pwrite_all(tdb->fd, buf, len, off)) { if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_write failed at %zu len=%zu (%s)",
"tdb_write failed at %llu len=%llu (%s)\n", (size_t)off, (size_t)len, strerror(errno));
(long long)off, (long long)len,
strerror(errno));
return -1; return -1;
} }
} }
...@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, ...@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
memcpy(buf, off + (char *)tdb->map_ptr, len); memcpy(buf, off + (char *)tdb->map_ptr, len);
} else { } else {
if (!tdb_pread_all(tdb->fd, buf, len, off)) { if (!tdb_pread_all(tdb->fd, buf, len, off)) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO; "tdb_read failed at %zu "
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "len=%zu (%s) map_size=%zu",
"tdb_read failed at %lld " (size_t)off, (size_t)len,
"len=%lld (%s) map_size=%lld\n",
(long long)off, (long long)len,
strerror(errno), strerror(errno),
(long long)tdb->map_size); (size_t)tdb->map_size);
return -1; return -1;
} }
} }
...@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
if (unlikely((tdb->flags & TDB_CONVERT))) { if (unlikely((tdb->flags & TDB_CONVERT))) {
void *conv = malloc(len); void *conv = malloc(len);
if (!conv) { if (!conv) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_write: no memory converting"
"tdb_write: no memory converting %zu bytes\n", " %zu bytes", len);
len);
return -1; return -1;
} }
memcpy(conv, rec, len); memcpy(conv, rec, len);
...@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val) int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
{ {
if (tdb->read_only) {
tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return -1;
}
if (likely(!(tdb->flags & TDB_CONVERT))) {
tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
true);
if (p) {
*p = val;
return 0;
}
}
return tdb_write_convert(tdb, off, &val, sizeof(val)); return tdb_write_convert(tdb, off, &val, sizeof(val));
} }
...@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, ...@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
/* some systems don't like zero length malloc */ /* some systems don't like zero length malloc */
buf = malloc(prefix + len ? prefix + len : 1); buf = malloc(prefix + len ? prefix + len : 1);
if (unlikely(!buf)) { if (!buf) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_alloc_read malloc failed len=%zu",
"tdb_alloc_read malloc failed len=%lld\n", (size_t)(prefix + len));
(long long)prefix + len); } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) { len) == -1)) {
free(buf); free(buf);
buf = NULL; buf = NULL;
} }
...@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb, ...@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
size_t n = len > size ? size : len; size_t n = len > size ? size : len;
if (!tdb_pwrite_all(tdb->fd, buf, n, off)) { if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "fill write failed: giving up!");
"fill write failed: giving up!\n");
return -1; return -1;
} }
len -= n; len -= n;
...@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
char buf[8192]; char buf[8192];
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Expand on read-only database");
return -1; return -1;
} }
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
char *new = realloc(tdb->map_ptr, tdb->map_size + addition); char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
if (!new) { if (!new) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"No memory to expand database");
return -1; return -1;
} }
tdb->map_ptr = new; tdb->map_ptr = new;
...@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
file isn't sparse, which would be very bad if we ran out of file isn't sparse, which would be very bad if we ran out of
disk. This must be done with write, not via mmap */ disk. This must be done with write, not via mmap */
memset(buf, 0x43, sizeof(buf)); memset(buf, 0x43, sizeof(buf));
if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1) if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
return -1; return -1;
tdb->map_size += addition; tdb->map_size += addition;
tdb_mmap(tdb); tdb_mmap(tdb);
...@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
return 0; return 0;
} }
/* This is only neded for tdb_access_commit, but used everywhere to simplify. */
struct tdb_access_hdr {
tdb_off_t off;
tdb_len_t len;
bool convert;
};
const void *tdb_access_read(struct tdb_context *tdb, const void *tdb_access_read(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len, bool convert) tdb_off_t off, tdb_len_t len, bool convert)
{ {
const void *ret = NULL; const void *ret = NULL;
if (likely(!(tdb->flags & TDB_CONVERT))) if (likely(!(tdb->flags & TDB_CONVERT)))
ret = tdb->methods->direct(tdb, off, len); ret = tdb->methods->direct(tdb, off, len, false);
if (!ret) { if (!ret) {
struct tdb_access_hdr *hdr; struct tdb_access_hdr *hdr;
hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
if (hdr) { if (hdr) {
hdr->next = tdb->access;
tdb->access = hdr;
ret = hdr + 1; ret = hdr + 1;
if (convert) if (convert)
tdb_convert(tdb, (void *)ret, len); tdb_convert(tdb, (void *)ret, len);
...@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb, ...@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
{ {
void *ret = NULL; void *ret = NULL;
if (tdb->read_only) {
tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return NULL;
}
if (likely(!(tdb->flags & TDB_CONVERT))) if (likely(!(tdb->flags & TDB_CONVERT)))
ret = tdb->methods->direct(tdb, off, len); ret = tdb->methods->direct(tdb, off, len, true);
if (!ret) { if (!ret) {
struct tdb_access_hdr *hdr; struct tdb_access_hdr *hdr;
hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
if (hdr) { if (hdr) {
hdr->next = tdb->access;
tdb->access = hdr;
hdr->off = off; hdr->off = off;
hdr->len = len; hdr->len = len;
hdr->convert = convert; hdr->convert = convert;
...@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb, ...@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
return ret; return ret;
} }
static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
{
struct tdb_access_hdr **hp;
for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
if (*hp + 1 == p)
return hp;
}
return NULL;
}
void tdb_access_release(struct tdb_context *tdb, const void *p) void tdb_access_release(struct tdb_context *tdb, const void *p)
{ {
if (!tdb->map_ptr struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
|| (char *)p < (char *)tdb->map_ptr
|| (char *)p >= (char *)tdb->map_ptr + tdb->map_size) if (hp) {
free((struct tdb_access_hdr *)p - 1); hdr = *hp;
else *hp = hdr->next;
free(hdr);
} else
tdb->direct_access--; tdb->direct_access--;
} }
int tdb_access_commit(struct tdb_context *tdb, void *p) int tdb_access_commit(struct tdb_context *tdb, void *p)
{ {
struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
int ret = 0; int ret = 0;
if (!tdb->map_ptr if (hp) {
|| (char *)p < (char *)tdb->map_ptr hdr = *hp;
|| (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
struct tdb_access_hdr *hdr;
hdr = (struct tdb_access_hdr *)p - 1;
if (hdr->convert) if (hdr->convert)
ret = tdb_write_convert(tdb, hdr->off, p, hdr->len); ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
else else
ret = tdb_write(tdb, hdr->off, p, hdr->len); ret = tdb_write(tdb, hdr->off, p, hdr->len);
*hp = hdr->next;
free(hdr); free(hdr);
} else } else
tdb->direct_access--; tdb->direct_access--;
...@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p) ...@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
return ret; return ret;
} }
static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len) static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
bool write)
{ {
if (unlikely(!tdb->map_ptr)) if (unlikely(!tdb->map_ptr))
return NULL; return NULL;
...@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len) ...@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
return (char *)tdb->map_ptr + off; return (char *)tdb->map_ptr + off;
} }
void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
{
if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
*stat += val;
}
static const struct tdb_methods io_methods = { static const struct tdb_methods io_methods = {
tdb_read, tdb_read,
tdb_write, tdb_write,
......
...@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb, ...@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
fl.l_len = len; fl.l_len = len;
fl.l_pid = 0; fl.l_pid = 0;
add_stat(tdb, lock_lowlevel, 1);
if (waitflag) if (waitflag)
return fcntl(tdb->fd, F_SETLKW, &fl); return fcntl(tdb->fd, F_SETLKW, &fl);
else else {
add_stat(tdb, lock_nonblock, 1);
return fcntl(tdb->fd, F_SETLK, &fl); return fcntl(tdb->fd, F_SETLK, &fl);
}
} }
static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
...@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) ...@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
} }
if (!found) { if (!found) {
fprintf(stderr, "Unlock on %u@%u not found!\n", fprintf(stderr, "Unlock on %u@%u not found!",
(int)off, (int)len); (int)off, (int)len);
abort(); abort();
} }
...@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb, ...@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
} }
if (rw_type == F_WRLCK && tdb->read_only) { if (rw_type == F_WRLCK && tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write lock attempted on read-only database");
return -1; return -1;
} }
/* A 32 bit system cannot open a 64-bit file, but it could have /* A 32 bit system cannot open a 64-bit file, but it could have
* expanded since then: check here. */ * expanded since then: check here. */
if ((size_t)(offset + len) != offset + len) { if ((size_t)(offset + len) != offset + len) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_brlock: lock on giant offset %llu",
"tdb_brlock: lock on giant offset %llu\n",
(long long)(offset + len)); (long long)(offset + len));
return -1; return -1;
} }
...@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb, ...@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
* EAGAIN is an expected return from non-blocking * EAGAIN is an expected return from non-blocking
* locks. */ * locks. */
if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) { if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_brlock failed (fd=%d) at" "tdb_brlock failed (fd=%d) at"
" offset %llu rw_type=%d flags=%d len=%llu\n", " offset %zu rw_type=%d flags=%d len=%zu:"
tdb->fd, (long long)offset, rw_type, " %s",
flags, (long long)len); tdb->fd, (size_t)offset, rw_type,
flags, (size_t)len, strerror(errno));
} }
return -1; return -1;
} }
...@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb, ...@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
} while (ret == -1 && errno == EINTR); } while (ret == -1 && errno == EINTR);
if (ret == -1) { if (ret == -1) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_TRACE,
"tdb_brunlock failed (fd=%d) at offset %llu" "tdb_brunlock failed (fd=%d) at offset %zu"
" rw_type=%d len=%llu\n", " rw_type=%d len=%zu",
tdb->fd, (long long)offset, rw_type, (long long)len); tdb->fd, (size_t)offset, rw_type, (size_t)len);
} }
return ret; return ret;
} }
...@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) ...@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
int count = 1000; int count = 1000;
if (tdb->allrecord_lock.count != 1) { if (tdb->allrecord_lock.count != 1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_allrecord_upgrade failed: count %u too high\n", "tdb_allrecord_upgrade failed: count %u too high",
tdb->allrecord_lock.count); tdb->allrecord_lock.count);
return -1; return -1;
} }
if (tdb->allrecord_lock.off != 1) { if (tdb->allrecord_lock.off != 1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_allrecord_upgrade failed: already upgraded?\n"); "tdb_allrecord_upgrade failed: already upgraded?");
return -1; return -1;
} }
...@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) ...@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
tv.tv_usec = 1; tv.tv_usec = 1;
select(0, NULL, NULL, NULL, &tv); select(0, NULL, NULL, NULL, &tv);
} }
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_WARNING,
"tdb_allrecord_upgrade failed\n"); "tdb_allrecord_upgrade failed");
return -1; return -1;
} }
...@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
struct tdb_lock_type *new_lck; struct tdb_lock_type *new_lck;
if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) { if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: invalid offset %zu ltype=%d",
"tdb_nest_lock: invalid offset %llu ltype=%d\n", (size_t)offset, ltype);
(long long)offset, ltype);
return -1; return -1;
} }
if (tdb->flags & TDB_NOLOCK) if (tdb->flags & TDB_NOLOCK)
return 0; return 0;
add_stat(tdb, locks, 1);
new_lck = find_nestlock(tdb, offset); new_lck = find_nestlock(tdb, offset);
if (new_lck) { if (new_lck) {
if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) { if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: offset %zu has read lock",
"tdb_nest_lock: offset %llu has read lock\n", (size_t)offset);
(long long)offset);
return -1; return -1;
} }
/* Just increment the struct, posix locks don't stack. */ /* Just increment the struct, posix locks don't stack. */
...@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
if (tdb->num_lockrecs if (tdb->num_lockrecs
&& offset >= TDB_HASH_LOCK_START && offset >= TDB_HASH_LOCK_START
&& offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) { && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: already have a hash lock?");
"tdb_nest_lock: already have a hash lock?\n");
return -1; return -1;
} }
...@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
tdb->lockrecs, tdb->lockrecs,
sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1)); sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
if (new_lck == NULL) { if (new_lck == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_nest_lock: unable to allocate %zu lock struct",
"tdb_nest_lock: unable to allocate %llu lock struct", tdb->num_lockrecs + 1);
(long long)(tdb->num_lockrecs + 1));
errno = ENOMEM; errno = ENOMEM;
return -1; return -1;
} }
...@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype) ...@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
lck = find_nestlock(tdb, off); lck = find_nestlock(tdb, off);
if ((lck == NULL) || (lck->count == 0)) { if ((lck == NULL) || (lck->count == 0)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_nest_unlock: no lock for %zu", (size_t)off);
"tdb_nest_unlock: no lock for %llu\n", (long long)off);
return -1; return -1;
} }
...@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, ...@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
{ {
/* FIXME: There are no locks on read-only dbs */ /* FIXME: There are no locks on read-only dbs */
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: read-only");
"tdb_allrecord_lock: read-only\n");
return -1; return -1;
} }
...@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, ...@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
/* a global lock of a different type exists */ /* a global lock of a different type exists */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: already have %s lock",
"tdb_allrecord_lock: already have %s lock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
if (tdb_has_hash_locks(tdb)) { if (tdb_has_hash_locks(tdb)) {
/* can't combine global and chain locks */ /* can't combine global and chain locks */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: already have chain lock");
"tdb_allrecord_lock: already have chain lock\n");
return -1; return -1;
} }
if (upgradable && ltype != F_RDLCK) { if (upgradable && ltype != F_RDLCK) {
/* tdb error: you can't upgrade a write lock! */ /* tdb error: you can't upgrade a write lock! */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: can't upgrade a write lock");
"tdb_allrecord_lock: can't upgrade a write lock\n");
return -1; return -1;
} }
add_stat(tdb, locks, 1);
again: again:
/* Lock hashes, gradually. */ /* Lock hashes, gradually. */
if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START, if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
TDB_HASH_LOCK_RANGE)) { TDB_HASH_LOCK_RANGE)) {
if (!(flags & TDB_LOCK_PROBE)) { if (!(flags & TDB_LOCK_PROBE)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_allrecord_lock hashes failed (%s)\n", "tdb_allrecord_lock hashes failed");
strerror(errno));
} }
return -1; return -1;
} }
/* Lock free lists: there to end of file. */ /* Lock free tables: there to end of file. */
if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE, if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
0, flags)) { 0, flags)) {
if (!(flags & TDB_LOCK_PROBE)) { if (!(flags & TDB_LOCK_PROBE)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_allrecord_lock freelist failed (%s)\n", "tdb_allrecord_lock freetables failed");
strerror(errno));
} }
tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
TDB_HASH_LOCK_RANGE); TDB_HASH_LOCK_RANGE);
...@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype) ...@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
/* unlock entire db */ /* unlock entire db */
int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype) int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
{ {
/* FIXME: There are no locks on read-only dbs */
if (tdb->read_only) {
tdb->ecode = TDB_ERR_LOCK;
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_allrecord_unlock: read-only\n");
return -1;
}
if (tdb->allrecord_lock.count == 0) { if (tdb->allrecord_lock.count == 0) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_unlock: not locked!");
"tdb_allrecord_unlock: not locked!\n");
return -1; return -1;
} }
/* Upgradable locks are marked as write locks. */ /* Upgradable locks are marked as write locks. */
if (tdb->allrecord_lock.ltype != ltype if (tdb->allrecord_lock.ltype != ltype
&& (!tdb->allrecord_lock.off || ltype != F_RDLCK)) { && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_unlock: have %s lock",
"tdb_allrecord_unlock: have %s lock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
...@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb, ...@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
} }
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have %s allrecordlock",
"tdb_lock_hashes: have %s allrecordlock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
if (tdb_has_free_lock(tdb)) { if (tdb_has_free_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have free lock");
"tdb_lock_hashes: have free lock already\n");
return -1; return -1;
} }
if (tdb_has_expansion_lock(tdb)) { if (tdb_has_expansion_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have expansion lock");
"tdb_lock_hashes: have expansion lock already\n");
return -1; return -1;
} }
...@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb, ...@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
if (tdb->allrecord_lock.ltype == F_RDLCK if (tdb->allrecord_lock.ltype == F_RDLCK
&& ltype == F_WRLCK) { && ltype == F_WRLCK) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_unlock_hashes RO allrecord!");
"tdb_unlock_hashes RO allrecord!\n");
return -1; return -1;
} }
return 0; return 0;
...@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off, ...@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
if (tdb->allrecord_lock.ltype == F_WRLCK) if (tdb->allrecord_lock.ltype == F_WRLCK)
return 0; return 0;
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_lock_free_bucket with RO allrecordlock!");
"tdb_lock_free_bucket with RO allrecordlock!\n");
return -1; return -1;
} }
#if 0 /* FIXME */ #if 0 /* FIXME */
if (tdb_has_expansion_lock(tdb)) { if (tdb_has_expansion_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_free_bucket: already have expansion lock");
"tdb_lock_free_bucket: have expansion lock already\n");
return -1; return -1;
} }
#endif #endif
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include "config.h" #include "config.h"
#include <ccan/tdb2/tdb2.h> #include <ccan/tdb2/tdb2.h>
#include <ccan/likely/likely.h> #include <ccan/likely/likely.h>
#include <ccan/compiler/compiler.h>
#ifdef HAVE_BYTESWAP_H #ifdef HAVE_BYTESWAP_H
#include <byteswap.h> #include <byteswap.h>
#endif #endif
...@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t; ...@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
#define TDB_MAGIC_FOOD "TDB file\n" #define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7)) #define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_MAGIC ((uint64_t)0x1999) #define TDB_USED_MAGIC ((uint64_t)0x1999)
#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
#define TDB_FREE_MAGIC ((uint64_t)0xFE) #define TDB_FREE_MAGIC ((uint64_t)0xFE)
#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL) #define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL) #define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL) #define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
...@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t; ...@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
#define TDB_SUBLEVEL_HASH_BITS 6 #define TDB_SUBLEVEL_HASH_BITS 6
/* And 8 entries in each group, ie 8 groups per sublevel. */ /* And 8 entries in each group, ie 8 groups per sublevel. */
#define TDB_HASH_GROUP_BITS 3 #define TDB_HASH_GROUP_BITS 3
/* This is currently 10: beyond this we chain. */
#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
/* Extend file by least 32 times larger than needed. */ /* Extend file by least 100 times larger than needed. */
#define TDB_EXTENSION_FACTOR 32 #define TDB_EXTENSION_FACTOR 100
/* We steal bits from the offsets to store hash info. */ /* We steal bits from the offsets to store hash info. */
#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1) #define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */ /* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
#define TDB_OFF_UPPER_STEAL 8 #define TDB_OFF_UPPER_STEAL 8
#define TDB_OFF_UPPER_STEAL_EXTRA 7 #define TDB_OFF_UPPER_STEAL_EXTRA 7
#define TDB_OFF_UPPER_STEAL_TRUNCBIT 1 /* The bit number where we store extra hash bits. */
/* If this is set, hash is truncated (only 1 bit is valid). */
#define TDB_OFF_HASH_TRUNCATED_BIT 56
/* The bit number where we store next level of hash. */
#define TDB_OFF_HASH_EXTRA_BIT 57 #define TDB_OFF_HASH_EXTRA_BIT 57
#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
/* The bit number where we store the extra hash bits. */
/* Convenience mask to get actual offset. */ /* Convenience mask to get actual offset. */
#define TDB_OFF_MASK \ #define TDB_OFF_MASK \
(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK) (((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
...@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t; ...@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
#define TDB_MIN_DATA_LEN \ #define TDB_MIN_DATA_LEN \
(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record)) (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
/* Indicates this entry is not on an flist (can happen during coalescing) */
#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
#if !HAVE_BSWAP_64 #if !HAVE_BSWAP_64
static inline uint64_t bswap_64(uint64_t x) static inline uint64_t bswap_64(uint64_t x)
{ {
...@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r) ...@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
} }
struct tdb_free_record { struct tdb_free_record {
uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */ uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
uint64_t data_len; /* Not counting these two fields. */ uint64_t ftable_and_len; /* Len not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */ /* This is why the minimum record size is 8 bytes. */
uint64_t next, prev; uint64_t next;
}; };
static inline uint64_t frec_prev(const struct tdb_free_record *f)
{
return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
}
static inline uint64_t frec_magic(const struct tdb_free_record *f) static inline uint64_t frec_magic(const struct tdb_free_record *f)
{ {
return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL); return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
} }
static inline uint64_t frec_flist(const struct tdb_free_record *f) static inline uint64_t frec_len(const struct tdb_free_record *f)
{ {
return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
}
static inline unsigned frec_ftable(const struct tdb_free_record *f)
{
return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
} }
struct tdb_recovery_record { struct tdb_recovery_record {
...@@ -199,6 +217,12 @@ struct tdb_recovery_record { ...@@ -199,6 +217,12 @@ struct tdb_recovery_record {
uint64_t eof; uint64_t eof;
}; };
/* If we bottom out of the subhashes, we chain. */
struct tdb_chain {
tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
tdb_off_t next;
};
/* this is stored at the front of every database */ /* this is stored at the front of every database */
struct tdb_header { struct tdb_header {
char magic_food[64]; /* for /etc/magic */ char magic_food[64]; /* for /etc/magic */
...@@ -206,7 +230,7 @@ struct tdb_header { ...@@ -206,7 +230,7 @@ struct tdb_header {
uint64_t version; /* version of the code */ uint64_t version; /* version of the code */
uint64_t hash_test; /* result of hashing HASH_MAGIC. */ uint64_t hash_test; /* result of hashing HASH_MAGIC. */
uint64_t hash_seed; /* "random" seed written at creation time. */ uint64_t hash_seed; /* "random" seed written at creation time. */
tdb_off_t free_list; /* (First) free list. */ tdb_off_t free_table; /* (First) free table. */
tdb_off_t recovery; /* Transaction recovery area. */ tdb_off_t recovery; /* Transaction recovery area. */
tdb_off_t reserved[26]; tdb_off_t reserved[26];
...@@ -215,7 +239,7 @@ struct tdb_header { ...@@ -215,7 +239,7 @@ struct tdb_header {
tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS]; tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
}; };
struct tdb_freelist { struct tdb_freetable {
struct tdb_used_record hdr; struct tdb_used_record hdr;
tdb_off_t next; tdb_off_t next;
tdb_off_t buckets[TDB_FREE_BUCKETS]; tdb_off_t buckets[TDB_FREE_BUCKETS];
...@@ -246,7 +270,7 @@ struct traverse_info { ...@@ -246,7 +270,7 @@ struct traverse_info {
/* We ignore groups here, and treat it as a big array. */ /* We ignore groups here, and treat it as a big array. */
unsigned entry; unsigned entry;
unsigned int total_buckets; unsigned int total_buckets;
} levels[64 / TDB_SUBLEVEL_HASH_BITS]; } levels[TDB_MAX_LEVELS + 1];
unsigned int num_levels; unsigned int num_levels;
unsigned int toplevel_group; unsigned int toplevel_group;
/* This makes delete-everything-inside-traverse work as expected. */ /* This makes delete-everything-inside-traverse work as expected. */
...@@ -269,6 +293,15 @@ struct tdb_lock_type { ...@@ -269,6 +293,15 @@ struct tdb_lock_type {
uint32_t ltype; uint32_t ltype;
}; };
/* This is only needed for tdb_access_commit, but used everywhere to
* simplify. */
struct tdb_access_hdr {
struct tdb_access_hdr *next;
tdb_off_t off;
tdb_len_t len;
bool convert;
};
struct tdb_context { struct tdb_context {
/* Filename of the database. */ /* Filename of the database. */
const char *name; const char *name;
...@@ -298,8 +331,8 @@ struct tdb_context { ...@@ -298,8 +331,8 @@ struct tdb_context {
uint32_t flags; uint32_t flags;
/* Logging function */ /* Logging function */
tdb_logfn_t log; tdb_logfn_t logfn;
void *log_priv; void *log_private;
/* Hash function. */ /* Hash function. */
tdb_hashfn_t khash; tdb_hashfn_t khash;
...@@ -309,17 +342,23 @@ struct tdb_context { ...@@ -309,17 +342,23 @@ struct tdb_context {
/* Set if we are in a transaction. */ /* Set if we are in a transaction. */
struct tdb_transaction *transaction; struct tdb_transaction *transaction;
/* What freelist are we using? */ /* What free table are we using? */
uint64_t flist_off; tdb_off_t ftable_off;
unsigned int ftable;
/* IO methods: changes for transactions. */ /* IO methods: changes for transactions. */
const struct tdb_methods *methods; const struct tdb_methods *methods;
/* Lock information */ /* Lock information */
struct tdb_lock_type allrecord_lock; struct tdb_lock_type allrecord_lock;
uint64_t num_lockrecs; size_t num_lockrecs;
struct tdb_lock_type *lockrecs; struct tdb_lock_type *lockrecs;
struct tdb_attribute_stats *stats;
/* Direct access information */
struct tdb_access_hdr *access;
/* Single list of all TDBs, to avoid multiple opens. */ /* Single list of all TDBs, to avoid multiple opens. */
struct tdb_context *next; struct tdb_context *next;
dev_t device; dev_t device;
...@@ -331,7 +370,7 @@ struct tdb_methods { ...@@ -331,7 +370,7 @@ struct tdb_methods {
int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t); int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
int (*oob)(struct tdb_context *, tdb_off_t, bool); int (*oob)(struct tdb_context *, tdb_off_t, bool);
int (*expand_file)(struct tdb_context *, tdb_len_t); int (*expand_file)(struct tdb_context *, tdb_len_t);
void *(*direct)(struct tdb_context *, tdb_off_t, size_t); void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
}; };
/* /*
...@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h); ...@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
bool is_subhash(tdb_off_t val); bool is_subhash(tdb_off_t val);
/* free.c: */ /* free.c: */
int tdb_flist_init(struct tdb_context *tdb); int tdb_ftable_init(struct tdb_context *tdb);
/* check.c needs these to iterate through free lists. */ /* check.c needs these to iterate through free lists. */
tdb_off_t first_flist(struct tdb_context *tdb); tdb_off_t first_ftable(struct tdb_context *tdb);
tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist); tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
/* If this fails, try tdb_expand. */ /* This returns space or TDB_OFF_ERR. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing); uint64_t hash, unsigned magic, bool growing);
/* Put this record in a free list. */ /* Put this record in a free list. */
int add_free_record(struct tdb_context *tdb, int add_free_record(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len_with_header); tdb_off_t off, tdb_len_t len_with_header);
/* Set up header for a used record. */ /* Set up header for a used/ftable/htable/chain record. */
int set_header(struct tdb_context *tdb, int set_header(struct tdb_context *tdb,
struct tdb_used_record *rec, struct tdb_used_record *rec,
uint64_t keylen, uint64_t datalen, unsigned magic, uint64_t keylen, uint64_t datalen,
uint64_t actuallen, unsigned hashlow); uint64_t actuallen, unsigned hashlow);
/* Used by tdb_check to verify. */ /* Used by tdb_check to verify. */
unsigned int size_to_bucket(tdb_len_t data_len); unsigned int size_to_bucket(tdb_len_t data_len);
tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket); tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
/* Used by tdb_summary */
size_t dead_space(struct tdb_context *tdb, tdb_off_t off);
/* io.c: */ /* io.c: */
/* Initialize tdb->methods. */ /* Initialize tdb->methods. */
...@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size); ...@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
void tdb_munmap(struct tdb_context *tdb); void tdb_munmap(struct tdb_context *tdb);
void tdb_mmap(struct tdb_context *tdb); void tdb_mmap(struct tdb_context *tdb);
/* Either make a copy into pad and return that, or return ptr into mmap.
* Converts endian (ie. will use pad in that case). */
void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
/* Either alloc a copy, or give direct access. Release frees or noop. */ /* Either alloc a copy, or give direct access. Release frees or noop. */
const void *tdb_access_read(struct tdb_context *tdb, const void *tdb_access_read(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len, bool convert); tdb_off_t off, tdb_len_t len, bool convert);
...@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
void *rec, size_t len); void *rec, size_t len);
/* Adds a stat, if it's in range. */
void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val);
#define add_stat(tdb, statname, val) \
do { \
if (unlikely((tdb)->stats)) \
add_stat_((tdb), &(tdb)->stats->statname, (val)); \
} while (0)
/* lock.c: */ /* lock.c: */
void tdb_lock_init(struct tdb_context *tdb); void tdb_lock_init(struct tdb_context *tdb);
...@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype, ...@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
int tdb_transaction_recover(struct tdb_context *tdb); int tdb_transaction_recover(struct tdb_context *tdb);
bool tdb_needs_recovery(struct tdb_context *tdb); bool tdb_needs_recovery(struct tdb_context *tdb);
/* tdb.c: */
void COLD tdb_logerr(struct tdb_context *tdb,
enum TDB_ERROR ecode,
enum tdb_debug_level level,
const char *fmt, ...);
#ifdef TDB_TRACE #ifdef TDB_TRACE
void tdb_trace(struct tdb_context *tdb, const char *op); void tdb_trace(struct tdb_context *tdb, const char *op);
void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op); void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
......
...@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb, ...@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
static bool summarize(struct tdb_context *tdb, static bool summarize(struct tdb_context *tdb,
struct tally *hashes, struct tally *hashes,
struct tally *flists, struct tally *ftables,
struct tally *free, struct tally *free,
struct tally *keys, struct tally *keys,
struct tally *data, struct tally *data,
struct tally *extra, struct tally *extra,
struct tally *uncoal, struct tally *uncoal,
struct tally *buckets) struct tally *buckets,
struct tally *chains)
{ {
tdb_off_t off; tdb_off_t off;
tdb_len_t len; tdb_len_t len;
tdb_len_t unc = 0; tdb_len_t unc = 0;
for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) { for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
union { const union {
struct tdb_used_record u; struct tdb_used_record u;
struct tdb_free_record f; struct tdb_free_record f;
} pad, *p; struct tdb_recovery_record r;
p = tdb_get(tdb, off, &pad, sizeof(pad)); } *p;
/* We might not be able to get the whole thing. */
p = tdb_access_read(tdb, off, sizeof(p->f), true);
if (!p) if (!p)
return false; return false;
if (rec_magic(&p->u) != TDB_MAGIC) { if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
len = p->f.data_len; || p->r.magic == TDB_RECOVERY_MAGIC) {
if (unc) {
tally_add(uncoal, unc);
unc = 0;
}
len = sizeof(p->r) + p->r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
len = frec_len(&p->f);
tally_add(free, len); tally_add(free, len);
tally_add(buckets, size_to_bucket(len)); tally_add(buckets, size_to_bucket(len));
len += sizeof(p->u); len += sizeof(p->u);
unc++; unc++;
} else { } else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
if (unc) { if (unc) {
tally_add(uncoal, unc); tally_add(uncoal, unc);
unc = 0; unc = 0;
...@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb, ...@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
+ rec_data_length(&p->u) + rec_data_length(&p->u)
+ rec_extra_padding(&p->u); + rec_extra_padding(&p->u);
/* FIXME: Use different magic for hashes, flists. */ tally_add(keys, rec_key_length(&p->u));
if (!rec_key_length(&p->u) && rec_hash(&p->u) < 2) { tally_add(data, rec_data_length(&p->u));
if (rec_hash(&p->u) == 0) { tally_add(extra, rec_extra_padding(&p->u));
int count = count_hash(tdb, } else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
off + sizeof(p->u), int count = count_hash(tdb,
TDB_SUBLEVEL_HASH_BITS); off + sizeof(p->u),
if (count == -1) TDB_SUBLEVEL_HASH_BITS);
return false; if (count == -1)
tally_add(hashes, count); return false;
} else { tally_add(hashes, count);
tally_add(flists, tally_add(extra, rec_extra_padding(&p->u));
rec_data_length(&p->u)); len = sizeof(p->u)
} + rec_data_length(&p->u)
} else { + rec_extra_padding(&p->u);
tally_add(keys, rec_key_length(&p->u)); } else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
tally_add(data, rec_data_length(&p->u)); len = sizeof(p->u)
} + rec_data_length(&p->u)
+ rec_extra_padding(&p->u);
tally_add(ftables, rec_data_length(&p->u));
tally_add(extra, rec_extra_padding(&p->u));
} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
len = sizeof(p->u)
+ rec_data_length(&p->u)
+ rec_extra_padding(&p->u);
tally_add(chains, 1);
tally_add(extra, rec_extra_padding(&p->u)); tally_add(extra, rec_extra_padding(&p->u));
} } else
len = dead_space(tdb, off);
tdb_access_release(tdb, p);
} }
if (unc) if (unc)
tally_add(uncoal, unc); tally_add(uncoal, unc);
...@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb, ...@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \ "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
"Number of free lists: %zu\n%s" \ "Number of free lists: %zu\n%s" \
"Toplevel hash used: %u of %u\n" \ "Toplevel hash used: %u of %u\n" \
"Number of chains: %zu\n" \
"Number of subhashes: %zu\n" \ "Number of subhashes: %zu\n" \
"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \ "Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n" "Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
...@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb, ...@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
{ {
tdb_len_t len; tdb_len_t len;
struct tally *flists, *hashes, *freet, *keys, *data, *extra, *uncoal, struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
*buckets; *buckets, *chains;
char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg; char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg;
char *ret = NULL; char *ret = NULL;
...@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
} }
/* Start stats off empty. */ /* Start stats off empty. */
flists = tally_new(HISTO_HEIGHT); ftables = tally_new(HISTO_HEIGHT);
hashes = tally_new(HISTO_HEIGHT); hashes = tally_new(HISTO_HEIGHT);
freet = tally_new(HISTO_HEIGHT); freet = tally_new(HISTO_HEIGHT);
keys = tally_new(HISTO_HEIGHT); keys = tally_new(HISTO_HEIGHT);
...@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
extra = tally_new(HISTO_HEIGHT); extra = tally_new(HISTO_HEIGHT);
uncoal = tally_new(HISTO_HEIGHT); uncoal = tally_new(HISTO_HEIGHT);
buckets = tally_new(HISTO_HEIGHT); buckets = tally_new(HISTO_HEIGHT);
if (!flists || !hashes || !freet || !keys || !data || !extra chains = tally_new(HISTO_HEIGHT);
|| !uncoal || !buckets) { if (!ftables || !hashes || !freet || !keys || !data || !extra
tdb->ecode = TDB_ERR_OOM; || !uncoal || !buckets || !chains) {
tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_summary: failed to allocate tally structures");
goto unlock; goto unlock;
} }
if (!summarize(tdb, hashes, flists, freet, keys, data, extra, uncoal, if (!summarize(tdb, hashes, ftables, freet, keys, data, extra, uncoal,
buckets)) buckets, chains))
goto unlock; goto unlock;
if (flags & TDB_SUMMARY_HISTOGRAMS) { if (flags & TDB_SUMMARY_HISTOGRAMS) {
...@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
count_hash(tdb, offsetof(struct tdb_header, hashtable), count_hash(tdb, offsetof(struct tdb_header, hashtable),
TDB_TOPLEVEL_HASH_BITS), TDB_TOPLEVEL_HASH_BITS),
1 << TDB_TOPLEVEL_HASH_BITS, 1 << TDB_TOPLEVEL_HASH_BITS,
tally_num(chains),
tally_num(hashes), tally_num(hashes),
tally_min(hashes), tally_mean(hashes), tally_max(hashes), tally_min(hashes), tally_mean(hashes), tally_max(hashes),
hashesg ? hashesg : "", hashesg ? hashesg : "",
...@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
tally_total(freet, NULL) * 100.0 / tdb->map_size, tally_total(freet, NULL) * 100.0 / tdb->map_size,
(tally_num(keys) + tally_num(freet) + tally_num(hashes)) (tally_num(keys) + tally_num(freet) + tally_num(hashes))
* sizeof(struct tdb_used_record) * 100.0 / tdb->map_size, * sizeof(struct tdb_used_record) * 100.0 / tdb->map_size,
tally_num(flists) * sizeof(struct tdb_freelist) tally_num(ftables) * sizeof(struct tdb_freetable)
* 100.0 / tdb->map_size, * 100.0 / tdb->map_size,
(tally_num(hashes) (tally_num(hashes)
* (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+ (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)) + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+ sizeof(struct tdb_chain) * tally_num(chains))
* 100.0 / tdb->map_size); * 100.0 / tdb->map_size);
unlock: unlock:
...@@ -237,6 +262,8 @@ unlock: ...@@ -237,6 +262,8 @@ unlock:
free(data); free(data);
free(extra); free(extra);
free(uncoal); free(uncoal);
free(ftables);
free(chains);
tdb_allrecord_unlock(tdb, F_RDLCK); tdb_allrecord_unlock(tdb, F_RDLCK);
tdb_unlock_expand(tdb, F_RDLCK); tdb_unlock_expand(tdb, F_RDLCK);
......
#include "private.h" #include "private.h"
#include <ccan/tdb2/tdb2.h> #include <ccan/tdb2/tdb2.h>
#include <ccan/build_assert/build_assert.h>
#include <ccan/likely/likely.h>
#include <assert.h> #include <assert.h>
#include <stdarg.h>
/* The null return. */ /* The null return. */
struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 }; struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
...@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 }; ...@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */ /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static struct tdb_context *tdbs = NULL; static struct tdb_context *tdbs = NULL;
PRINTF_FMT(4, 5) static void
null_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv,
const char *fmt, ...)
{
}
static bool tdb_already_open(dev_t device, ino_t ino) static bool tdb_already_open(dev_t device, ino_t ino)
{ {
struct tdb_context *i; struct tdb_context *i;
...@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
fd = open("/dev/urandom", O_RDONLY); fd = open("/dev/urandom", O_RDONLY);
if (fd >= 0) { if (fd >= 0) {
if (tdb_read_all(fd, &ret, sizeof(ret))) { if (tdb_read_all(fd, &ret, sizeof(ret))) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: random from /dev/urandom\n"); "tdb_open: random from /dev/urandom");
close(fd); close(fd);
return ret; return ret;
} }
...@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
char reply[1 + sizeof(uint64_t)]; char reply[1 + sizeof(uint64_t)];
int r = read(fd, reply, sizeof(reply)); int r = read(fd, reply, sizeof(reply));
if (r > 1) { if (r > 1) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: %u random bytes from" "tdb_open: %u random bytes from"
" /dev/egd-pool\n", r-1); " /dev/egd-pool", r-1);
/* Copy at least some bytes. */ /* Copy at least some bytes. */
memcpy(&ret, reply+1, r - 1); memcpy(&ret, reply+1, r - 1);
if (reply[0] == sizeof(uint64_t) if (reply[0] == sizeof(uint64_t)
...@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
/* Fallback: pid and time. */ /* Fallback: pid and time. */
gettimeofday(&now, NULL); gettimeofday(&now, NULL);
ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec; ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: random from getpid and time\n"); "tdb_open: random from getpid and time");
return ret; return ret;
} }
struct new_database { struct new_database {
struct tdb_header hdr; struct tdb_header hdr;
struct tdb_freelist flist; struct tdb_freetable ftable;
}; };
/* initialise a new database */ /* initialise a new database */
...@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable)); memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
/* Free is empty. */ /* Free is empty. */
newdb.hdr.free_list = offsetof(struct new_database, flist); newdb.hdr.free_table = offsetof(struct new_database, ftable);
memset(&newdb.flist, 0, sizeof(newdb.flist)); memset(&newdb.ftable, 0, sizeof(newdb.ftable));
set_header(NULL, &newdb.flist.hdr, 0, set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
sizeof(newdb.flist) - sizeof(newdb.flist.hdr), sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
sizeof(newdb.flist) - sizeof(newdb.flist.hdr), 1); sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), 0);
/* Magic food */ /* Magic food */
memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food)); memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
...@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
tdb->map_size = sizeof(newdb); tdb->map_size = sizeof(newdb);
tdb->map_ptr = malloc(tdb->map_size); tdb->map_ptr = malloc(tdb->map_size);
if (!tdb->map_ptr) { if (!tdb->map_ptr) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"tdb_new_database: failed to allocate");
return -1; return -1;
} }
memcpy(tdb->map_ptr, &newdb, tdb->map_size); memcpy(tdb->map_ptr, &newdb, tdb->map_size);
...@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
return -1; return -1;
if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) { if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
"tdb_new_database: failed to write: %s",
strerror(errno));
return -1; return -1;
} }
return 0; return 0;
...@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
{ {
struct tdb_context *tdb; struct tdb_context *tdb;
struct stat st; struct stat st;
int save_errno; int saved_errno = 0;
uint64_t hash_test; uint64_t hash_test;
unsigned v; unsigned v;
struct tdb_header hdr; struct tdb_header hdr;
...@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if (!tdb) { if (!tdb) {
/* Can't log this */ /* Can't log this */
errno = ENOMEM; errno = ENOMEM;
goto fail; return NULL;
} }
tdb->name = NULL; tdb->name = NULL;
tdb->map_ptr = NULL; tdb->map_ptr = NULL;
...@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb->map_size = sizeof(struct tdb_header); tdb->map_size = sizeof(struct tdb_header);
tdb->ecode = TDB_SUCCESS; tdb->ecode = TDB_SUCCESS;
tdb->flags = tdb_flags; tdb->flags = tdb_flags;
tdb->log = null_log_fn; tdb->logfn = NULL;
tdb->log_priv = NULL;
tdb->transaction = NULL; tdb->transaction = NULL;
tdb->stats = NULL;
tdb->access = NULL;
tdb_hash_init(tdb); tdb_hash_init(tdb);
tdb_io_init(tdb); tdb_io_init(tdb);
tdb_lock_init(tdb); tdb_lock_init(tdb);
...@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
while (attr) { while (attr) {
switch (attr->base.attr) { switch (attr->base.attr) {
case TDB_ATTRIBUTE_LOG: case TDB_ATTRIBUTE_LOG:
tdb->log = attr->log.log_fn; tdb->logfn = attr->log.log_fn;
tdb->log_priv = attr->log.log_private; tdb->log_private = attr->log.log_private;
break; break;
case TDB_ATTRIBUTE_HASH: case TDB_ATTRIBUTE_HASH:
tdb->khash = attr->hash.hash_fn; tdb->khash = attr->hash.hash_fn;
...@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
case TDB_ATTRIBUTE_SEED: case TDB_ATTRIBUTE_SEED:
seed = &attr->seed; seed = &attr->seed;
break; break;
case TDB_ATTRIBUTE_STATS:
tdb->stats = &attr->stats;
/* They have stats we don't know about? Tell them. */
if (tdb->stats->size > sizeof(attr->stats))
tdb->stats->size = sizeof(attr->stats);
break;
default: default:
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_open: unknown attribute type %u\n", "tdb_open: unknown attribute type %u",
attr->base.attr); attr->base.attr);
errno = EINVAL;
goto fail; goto fail;
} }
attr = attr->base.next; attr = attr->base.next;
} }
if ((open_flags & O_ACCMODE) == O_WRONLY) { if ((open_flags & O_ACCMODE) == O_WRONLY) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_open: can't open tdb %s write-only\n", name); "tdb_open: can't open tdb %s write-only", name);
errno = EINVAL;
goto fail; goto fail;
} }
...@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP); tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
if (tdb_new_database(tdb, seed, &hdr) != 0) { if (tdb_new_database(tdb, seed, &hdr) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_open: tdb_new_database failed!");
goto fail; goto fail;
} }
tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed)); tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
tdb->hash_seed = hdr.hash_seed; tdb->hash_seed = hdr.hash_seed;
tdb_flist_init(tdb); tdb_ftable_init(tdb);
return tdb; return tdb;
} }
if ((tdb->fd = open(name, open_flags, mode)) == -1) { if ((tdb->fd = open(name, open_flags, mode)) == -1) {
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, /* errno set by open(2) */
"tdb_open: could not open file %s: %s\n", saved_errno = errno;
name, strerror(errno)); tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
goto fail; /* errno set by open(2) */ "tdb_open: could not open file %s: %s",
name, strerror(errno));
goto fail;
} }
/* on exec, don't inherit the fd */ /* on exec, don't inherit the fd */
...@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* ensure there is only one process initialising at once */ /* ensure there is only one process initialising at once */
if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) { if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, /* errno set by tdb_brlock */
"tdb_open: failed to get open lock on %s: %s\n", saved_errno = errno;
name, strerror(errno)); goto fail;
goto fail; /* errno set by tdb_brlock */
} }
if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0) if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0)
|| strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
if (!(open_flags & O_CREAT) if (!(open_flags & O_CREAT)) {
|| tdb_new_database(tdb, seed, &hdr) == -1) { tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
if (errno == 0) { "tdb_open: %s is not a tdb file", name);
errno = EIO; /* ie bad format or something */ goto fail;
} }
if (tdb_new_database(tdb, seed, &hdr) == -1) {
goto fail; goto fail;
} }
} else if (hdr.version != TDB_VERSION) { } else if (hdr.version != TDB_VERSION) {
...@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb->flags |= TDB_CONVERT; tdb->flags |= TDB_CONVERT;
else { else {
/* wrong version */ /* wrong version */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: %s is unknown version 0x%llx\n", "tdb_open: %s is unknown version 0x%llx",
name, (long long)hdr.version); name, (long long)hdr.version);
errno = EIO;
goto fail; goto fail;
} }
} }
...@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
if (hdr.hash_test != hash_test) { if (hdr.hash_test != hash_test) {
/* wrong hash variant */ /* wrong hash variant */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: %s uses a different hash function\n", "tdb_open: %s uses a different hash function",
name); name);
errno = EIO;
goto fail; goto fail;
} }
if (fstat(tdb->fd, &st) == -1) if (fstat(tdb->fd, &st) == -1) {
saved_errno = errno;
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: could not stat open %s: %s",
name, strerror(errno));
goto fail; goto fail;
}
/* Is it already in the open list? If so, fail. */ /* Is it already in the open list? If so, fail. */
if (tdb_already_open(st.st_dev, st.st_ino)) { if (tdb_already_open(st.st_dev, st.st_ino)) {
/* FIXME */ /* FIXME */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
"tdb_open: %s (%d,%d) is already open in this process\n", "tdb_open: %s (%d,%d) is already open in this"
name, (int)st.st_dev, (int)st.st_ino); " process",
errno = EBUSY; name, (int)st.st_dev, (int)st.st_ino);
goto fail; goto fail;
} }
tdb->name = strdup(name); tdb->name = strdup(name);
if (!tdb->name) { if (!tdb->name) {
errno = ENOMEM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_open: failed to allocate name");
goto fail; goto fail;
} }
...@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* Now it's fully formed, recover if necessary. */ /* Now it's fully formed, recover if necessary. */
if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) { if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) {
errno = EIO;
goto fail; goto fail;
} }
if (tdb_flist_init(tdb) == -1) if (tdb_ftable_init(tdb) == -1)
goto fail; goto fail;
tdb->next = tdbs; tdb->next = tdbs;
...@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
return tdb; return tdb;
fail: fail:
save_errno = errno; /* Map ecode to some logical errno. */
if (!saved_errno) {
if (!tdb) switch (tdb->ecode) {
return NULL; case TDB_ERR_CORRUPT:
case TDB_ERR_IO:
saved_errno = EIO;
break;
case TDB_ERR_LOCK:
saved_errno = EWOULDBLOCK;
break;
case TDB_ERR_OOM:
saved_errno = ENOMEM;
break;
case TDB_ERR_EINVAL:
saved_errno = EINVAL;
break;
case TDB_ERR_NESTING:
saved_errno = EBUSY;
break;
default:
saved_errno = EINVAL;
break;
}
}
#ifdef TDB_TRACE #ifdef TDB_TRACE
close(tdb->tracefd); close(tdb->tracefd);
...@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
free((char *)tdb->name); free((char *)tdb->name);
if (tdb->fd != -1) if (tdb->fd != -1)
if (close(tdb->fd) != 0) if (close(tdb->fd) != 0)
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: failed to close tdb->fd" "tdb_open: failed to close tdb->fd"
" on error!\n"); " on error!");
free(tdb); free(tdb);
errno = save_errno; errno = saved_errno;
return NULL; return NULL;
} }
/* FIXME: modify, don't rewrite! */
static int update_rec_hdr(struct tdb_context *tdb, static int update_rec_hdr(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
tdb_len_t keylen, tdb_len_t keylen,
...@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb, ...@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
{ {
uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec); uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h)) if (set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
keylen + dataroom, h))
return -1; return -1;
return tdb_write_convert(tdb, off, rec, sizeof(*rec)); return tdb_write_convert(tdb, off, rec, sizeof(*rec));
...@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb, ...@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
tdb_off_t new_off; tdb_off_t new_off;
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, growing); new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
growing);
if (unlikely(new_off == TDB_OFF_ERR)) if (unlikely(new_off == TDB_OFF_ERR))
return -1; return -1;
/* We didn't like the existing one: remove it. */ /* We didn't like the existing one: remove it. */
if (old_off) { if (old_off) {
add_stat(tdb, frees, 1);
add_free_record(tdb, old_off, add_free_record(tdb, old_off,
sizeof(struct tdb_used_record) sizeof(struct tdb_used_record)
+ key.dsize + old_room); + key.dsize + old_room);
...@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb, ...@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
h.hlock_range, F_WRLCK); h.hlock_range, F_WRLCK);
return 0; return 0;
} }
/* FIXME: See if right record is free? */
} else { } else {
if (flag == TDB_MODIFY) { if (flag == TDB_MODIFY) {
/* if the record doesn't exist and we /* if the record doesn't exist and we
...@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb, ...@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
F_WRLCK); F_WRLCK);
return 0; return 0;
} }
/* FIXME: Check right record free? */
/* Slow path. */ /* Slow path. */
newdata = malloc(key.dsize + old_dlen + dbuf.dsize); newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
if (!newdata) { if (!newdata) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_append: failed to allocate %zu bytes",
"tdb_append: cannot allocate %llu bytes!\n", (size_t)(key.dsize+old_dlen+dbuf.dsize));
(long long)key.dsize + old_dlen + dbuf.dsize);
goto fail; goto fail;
} }
if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize, if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize,
...@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key) ...@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
goto unlock_err; goto unlock_err;
/* Free the deleted entry. */ /* Free the deleted entry. */
add_stat(tdb, frees, 1);
if (add_free_record(tdb, off, if (add_free_record(tdb, off,
sizeof(struct tdb_used_record) sizeof(struct tdb_used_record)
+ rec_key_length(&rec) + rec_key_length(&rec)
...@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb) ...@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
struct tdb_context **i; struct tdb_context **i;
int ret = 0; int ret = 0;
/* FIXME: tdb_trace(tdb, "tdb_close");
if (tdb->transaction) { if (tdb->transaction) {
tdb_transaction_cancel(tdb); tdb_transaction_cancel(tdb);
} }
*/
tdb_trace(tdb, "tdb_close");
if (tdb->map_ptr) { if (tdb->map_ptr) {
if (tdb->flags & TDB_INTERNAL) if (tdb->flags & TDB_INTERNAL)
...@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb) ...@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
return ret; return ret;
} }
enum TDB_ERROR tdb_error(struct tdb_context *tdb) enum TDB_ERROR tdb_error(const struct tdb_context *tdb)
{ {
return tdb->ecode; return tdb->ecode;
} }
const char *tdb_errorstr(struct tdb_context *tdb) const char *tdb_errorstr(const struct tdb_context *tdb)
{ {
/* Gcc warns if you miss a case in the switch, so use that. */ /* Gcc warns if you miss a case in the switch, so use that. */
switch (tdb->ecode) { switch (tdb->ecode) {
...@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb) ...@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
} }
return "Invalid error code"; return "Invalid error code";
} }
void COLD tdb_logerr(struct tdb_context *tdb,
enum TDB_ERROR ecode,
enum tdb_debug_level level,
const char *fmt, ...)
{
char *message;
va_list ap;
size_t len;
/* tdb_open paths care about errno, so save it. */
int saved_errno = errno;
tdb->ecode = ecode;
if (!tdb->logfn)
return;
/* FIXME: Doesn't assume asprintf. */
va_start(ap, fmt);
len = vsnprintf(NULL, 0, fmt, ap);
va_end(ap);
message = malloc(len + 1);
if (!message) {
tdb->logfn(tdb, level, tdb->log_private,
"out of memory formatting message");
return;
}
va_start(ap, fmt);
len = vsprintf(message, fmt, ap);
va_end(ap);
tdb->logfn(tdb, level, tdb->log_private, message);
free(message);
errno = saved_errno;
}
...@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, ...@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
/* flags for tdb_summary. Logical or to combine. */ /* flags for tdb_summary. Logical or to combine. */
enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 }; enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 };
/* debugging uses one of the following levels */ /* logging uses one of the following levels */
enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR,
TDB_DEBUG_WARNING, TDB_DEBUG_TRACE}; TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
...@@ -80,14 +80,15 @@ struct tdb_context; ...@@ -80,14 +80,15 @@ struct tdb_context;
/* FIXME: Make typesafe */ /* FIXME: Make typesafe */
typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *); typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_FMT(4, 5); typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *, const char *);
typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed, typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
void *priv); void *priv);
enum tdb_attribute_type { enum tdb_attribute_type {
TDB_ATTRIBUTE_LOG = 0, TDB_ATTRIBUTE_LOG = 0,
TDB_ATTRIBUTE_HASH = 1, TDB_ATTRIBUTE_HASH = 1,
TDB_ATTRIBUTE_SEED = 2 TDB_ATTRIBUTE_SEED = 2,
TDB_ATTRIBUTE_STATS = 3
}; };
struct tdb_attribute_base { struct tdb_attribute_base {
...@@ -112,11 +113,39 @@ struct tdb_attribute_seed { ...@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
uint64_t seed; uint64_t seed;
}; };
struct tdb_attribute_stats {
struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
size_t size; /* = sizeof(struct tdb_attribute_stats) */
uint64_t allocs;
uint64_t alloc_subhash;
uint64_t alloc_chain;
uint64_t alloc_bucket_exact;
uint64_t alloc_bucket_max;
uint64_t alloc_leftover;
uint64_t alloc_coalesce_tried;
uint64_t alloc_coalesce_lockfail;
uint64_t alloc_coalesce_race;
uint64_t alloc_coalesce_succeeded;
uint64_t alloc_coalesce_num_merged;
uint64_t compares;
uint64_t compare_wrong_bucket;
uint64_t compare_wrong_offsetbits;
uint64_t compare_wrong_keylen;
uint64_t compare_wrong_rechash;
uint64_t compare_wrong_keycmp;
uint64_t expands;
uint64_t frees;
uint64_t locks;
uint64_t lock_lowlevel;
uint64_t lock_nonblock;
};
union tdb_attribute { union tdb_attribute {
struct tdb_attribute_base base; struct tdb_attribute_base base;
struct tdb_attribute_log log; struct tdb_attribute_log log;
struct tdb_attribute_hash hash; struct tdb_attribute_hash hash;
struct tdb_attribute_seed seed; struct tdb_attribute_seed seed;
struct tdb_attribute_stats stats;
}; };
struct tdb_context *tdb_open(const char *name, int tdb_flags, struct tdb_context *tdb_open(const char *name, int tdb_flags,
...@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb, ...@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
void *private_data); void *private_data);
enum TDB_ERROR tdb_error(struct tdb_context *tdb); enum TDB_ERROR tdb_error(const struct tdb_context *tdb);
const char *tdb_errorstr(struct tdb_context *tdb); const char *tdb_errorstr(const struct tdb_context *tdb);
int tdb_transaction_start(struct tdb_context *tdb); int tdb_transaction_start(struct tdb_context *tdb);
void tdb_transaction_cancel(struct tdb_context *tdb); void tdb_transaction_cancel(struct tdb_context *tdb);
......
...@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem) ...@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
layout->elem[layout->num_elems++] = elem; layout->elem[layout->num_elems++] = elem;
} }
void tdb_layout_add_freelist(struct tdb_layout *layout) void tdb_layout_add_freetable(struct tdb_layout *layout)
{ {
union tdb_layout_elem elem; union tdb_layout_elem elem;
elem.base.type = FREELIST; elem.base.type = FREETABLE;
add(layout, elem); add(layout, elem);
} }
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
unsigned flist) unsigned ftable)
{ {
union tdb_layout_elem elem; union tdb_layout_elem elem;
elem.base.type = FREE; elem.base.type = FREE;
elem.free.len = len; elem.free.len = len;
elem.free.flist_num = flist; elem.free.ftable_num = ftable;
add(layout, elem); add(layout, elem);
} }
...@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable) ...@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+ htable->extra; + htable->extra;
} }
static tdb_len_t freelist_len(struct tle_freelist *flist) static tdb_len_t freetable_len(struct tle_freetable *ftable)
{ {
return sizeof(struct tdb_freelist); return sizeof(struct tdb_freetable);
} }
static void set_free_record(void *mem, tdb_len_t len) static void set_free_record(void *mem, tdb_len_t len)
...@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb, ...@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
{ {
struct tdb_used_record *u = mem; struct tdb_used_record *u = mem;
set_header(tdb, u, used->key.dsize, used->data.dsize, set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
used->key.dsize + used->data.dsize + used->extra, used->key.dsize + used->data.dsize + used->extra,
tdb_hash(tdb, used->key.dptr, used->key.dsize)); tdb_hash(tdb, used->key.dptr, used->key.dsize));
memcpy(u + 1, used->key.dptr, used->key.dsize); memcpy(u + 1, used->key.dptr, used->key.dsize);
...@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb, ...@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
struct tdb_used_record *u = mem; struct tdb_used_record *u = mem;
tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS; tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
set_header(tdb, u, 0, len, len + htable->extra, 0); set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
memset(u + 1, 0, len); memset(u + 1, 0, len);
} }
static void set_freelist(void *mem, struct tdb_context *tdb, static void set_freetable(void *mem, struct tdb_context *tdb,
struct tle_freelist *freelist, struct tdb_header *hdr, struct tle_freetable *freetable, struct tdb_header *hdr,
tdb_off_t last_flist) tdb_off_t last_ftable)
{ {
struct tdb_freelist *flist = mem; struct tdb_freetable *ftable = mem;
memset(flist, 0, sizeof(*flist)); memset(ftable, 0, sizeof(*ftable));
set_header(tdb, &flist->hdr, 0, set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
sizeof(*flist) - sizeof(flist->hdr), sizeof(*ftable) - sizeof(ftable->hdr),
sizeof(*flist) - sizeof(flist->hdr), 1); sizeof(*ftable) - sizeof(ftable->hdr), 0);
if (last_flist) { if (last_ftable) {
flist = (struct tdb_freelist *)((char *)hdr + last_flist); ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
flist->next = freelist->base.off; ftable->next = freetable->base.off;
} else { } else {
hdr->free_list = freelist->base.off; hdr->free_table = freetable->base.off;
} }
} }
static void add_to_freetable(struct tdb_context *tdb, static void add_to_freetable(struct tdb_context *tdb,
tdb_off_t eoff, tdb_off_t eoff,
tdb_off_t elen, tdb_off_t elen,
struct tle_freelist *freelist) unsigned ftable,
struct tle_freetable *freetable)
{ {
tdb->flist_off = freelist->base.off; tdb->ftable_off = freetable->base.off;
tdb->ftable = ftable;
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen); add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
} }
...@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb, ...@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
abort(); abort();
} }
static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num) static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
{ {
unsigned i; unsigned i;
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
if (layout->elem[i].base.type != FREELIST) if (layout->elem[i].base.type != FREETABLE)
continue; continue;
if (num == 0) if (num == 0)
return &layout->elem[i].flist; return &layout->elem[i].ftable;
num--; num--;
} }
abort(); abort();
...@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num) ...@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
struct tdb_context *tdb_layout_get(struct tdb_layout *layout) struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
{ {
unsigned int i; unsigned int i;
tdb_off_t off, len, last_flist; tdb_off_t off, len, last_ftable;
char *mem; char *mem;
struct tdb_context *tdb; struct tdb_context *tdb;
...@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
union tdb_layout_elem *e = &layout->elem[i]; union tdb_layout_elem *e = &layout->elem[i];
e->base.off = off; e->base.off = off;
switch (e->base.type) { switch (e->base.type) {
case FREELIST: case FREETABLE:
len = freelist_len(&e->flist); len = freetable_len(&e->ftable);
break; break;
case FREE: case FREE:
len = free_record_len(e->free.len); len = free_record_len(e->free.len);
...@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
tdb->map_ptr = mem; tdb->map_ptr = mem;
tdb->map_size = off; tdb->map_size = off;
last_flist = 0; last_ftable = 0;
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i]; union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) { switch (e->base.type) {
case FREELIST: case FREETABLE:
set_freelist(mem + e->base.off, tdb, &e->flist, set_freetable(mem + e->base.off, tdb, &e->ftable,
(struct tdb_header *)mem, last_flist); (struct tdb_header *)mem, last_ftable);
last_flist = e->base.off; last_ftable = e->base.off;
break; break;
case FREE: case FREE:
set_free_record(mem + e->base.off, e->free.len); set_free_record(mem + e->base.off, e->free.len);
...@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
break; break;
} }
} }
/* Must have a free list! */ /* Must have a free table! */
assert(last_flist); assert(last_ftable);
/* Now fill the free and hash tables. */ /* Now fill the free and hash tables. */
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
...@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch (e->base.type) { switch (e->base.type) {
case FREE: case FREE:
add_to_freetable(tdb, e->base.off, e->free.len, add_to_freetable(tdb, e->base.off, e->free.len,
find_flist(layout, e->free.flist_num)); e->free.ftable_num,
find_ftable(layout, e->free.ftable_num));
break; break;
case DATA: case DATA:
add_to_hashtable(tdb, e->base.off, e->used.key); add_to_hashtable(tdb, e->base.off, e->used.key);
...@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
} }
} }
tdb->flist_off = find_flist(layout, 0)->base.off; tdb->ftable_off = find_ftable(layout, 0)->base.off;
/* Get physical if they asked for it. */ /* Get physical if they asked for it. */
if (layout->filename) { if (layout->filename) {
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
#include <ccan/tdb2/private.h> #include <ccan/tdb2/private.h>
struct tdb_layout *new_tdb_layout(const char *filename); struct tdb_layout *new_tdb_layout(const char *filename);
void tdb_layout_add_freelist(struct tdb_layout *layout); void tdb_layout_add_freetable(struct tdb_layout *layout);
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
unsigned flist); unsigned ftable);
void tdb_layout_add_used(struct tdb_layout *layout, void tdb_layout_add_used(struct tdb_layout *layout,
TDB_DATA key, TDB_DATA data, TDB_DATA key, TDB_DATA data,
tdb_len_t extra); tdb_len_t extra);
...@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout, ...@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
struct tdb_context *tdb_layout_get(struct tdb_layout *layout); struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
enum layout_type { enum layout_type {
FREELIST, FREE, DATA, HASHTABLE, FREETABLE, FREE, DATA, HASHTABLE,
}; };
/* Shared by all union members. */ /* Shared by all union members. */
...@@ -27,14 +27,14 @@ struct tle_base { ...@@ -27,14 +27,14 @@ struct tle_base {
tdb_off_t off; tdb_off_t off;
}; };
struct tle_freelist { struct tle_freetable {
struct tle_base base; struct tle_base base;
}; };
struct tle_free { struct tle_free {
struct tle_base base; struct tle_base base;
tdb_len_t len; tdb_len_t len;
unsigned flist_num; unsigned ftable_num;
}; };
struct tle_used { struct tle_used {
...@@ -53,7 +53,7 @@ struct tle_hashtable { ...@@ -53,7 +53,7 @@ struct tle_hashtable {
union tdb_layout_elem { union tdb_layout_elem {
struct tle_base base; struct tle_base base;
struct tle_freelist flist; struct tle_freetable ftable;
struct tle_free free; struct tle_free free;
struct tle_used used; struct tle_used used;
struct tle_hashtable hashtable; struct tle_hashtable hashtable;
......
#define _GNU_SOURCE
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdarg.h>
#include <ccan/tap/tap.h> #include <ccan/tap/tap.h>
#include "logging.h" #include "logging.h"
...@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = { ...@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
void tap_log_fn(struct tdb_context *tdb, void tap_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...) const char *message)
{ {
va_list ap;
char *p;
if (suppress_logging) if (suppress_logging)
return; return;
va_start(ap, fmt); diag("tdb log level %u: %s%s", level, log_prefix, message);
if (vasprintf(&p, fmt, ap) == -1)
abort();
/* Strip trailing \n: diag adds it. */
if (p[strlen(p)-1] == '\n')
p[strlen(p)-1] = '\0';
diag("tdb log level %u: %s%s", level, log_prefix, p);
free(p);
if (level != TDB_DEBUG_TRACE) if (level != TDB_DEBUG_TRACE)
tap_log_messages++; tap_log_messages++;
va_end(ap);
} }
...@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr; ...@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
void tap_log_fn(struct tdb_context *tdb, void tap_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...); const char *message);
static inline bool data_equal(struct tdb_data a, struct tdb_data b) static inline bool data_equal(struct tdb_data a, struct tdb_data b)
{ {
......
...@@ -12,18 +12,20 @@ int main(int argc, char *argv[]) ...@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
{ {
unsigned int i; unsigned int i;
struct tdb_used_record rec; struct tdb_used_record rec;
struct tdb_context tdb = { .log = tap_log_fn, .log_priv = NULL }; struct tdb_context tdb = { .logfn = tap_log_fn };
plan_tests(64 + 32 + 48*6 + 1); plan_tests(64 + 32 + 48*6 + 1);
/* We should be able to encode any data value. */ /* We should be able to encode any data value. */
for (i = 0; i < 64; i++) for (i = 0; i < 64; i++)
ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0); ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
1ULL << i, 0) == 0);
/* And any key and data with < 64 bits between them. */ /* And any key and data with < 64 bits between them. */
for (i = 0; i < 32; i++) { for (i = 0; i < 32; i++) {
tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i; tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0); ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
klen + dlen, 0) == 0);
} }
/* We should neatly encode all values. */ /* We should neatly encode all values. */
...@@ -32,13 +34,13 @@ int main(int argc, char *argv[]) ...@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
uint64_t klen = 1ULL << (i < 16 ? i : 15); uint64_t klen = 1ULL << (i < 16 ? i : 15);
uint64_t dlen = 1ULL << i; uint64_t dlen = 1ULL << i;
uint64_t xlen = 1ULL << (i < 32 ? i : 31); uint64_t xlen = 1ULL << (i < 32 ? i : 31);
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h) ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
== 0); klen+dlen+xlen, h) == 0);
ok1(rec_key_length(&rec) == klen); ok1(rec_key_length(&rec) == klen);
ok1(rec_data_length(&rec) == dlen); ok1(rec_data_length(&rec) == dlen);
ok1(rec_extra_padding(&rec) == xlen); ok1(rec_extra_padding(&rec) == xlen);
ok1((uint64_t)rec_hash(&rec) == h); ok1((uint64_t)rec_hash(&rec) == h);
ok1(rec_magic(&rec) == TDB_MAGIC); ok1(rec_magic(&rec) == TDB_USED_MAGIC);
} }
ok1(tap_log_messages == 0); ok1(tap_log_messages == 0);
return exit_status(); return exit_status();
......
...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) ...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return TDB_OFF_ERR; return TDB_OFF_ERR;
if (frec_magic(&f) != TDB_FREE_MAGIC) if (frec_magic(&f) != TDB_FREE_MAGIC)
return TDB_OFF_ERR; return TDB_OFF_ERR;
return f.data_len; return frec_len(&f);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
...@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) ...@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to EOF */ /* No coalescing can be done due to EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
len = 1024; len = 1024;
tdb_layout_add_free(layout, len, 0); tdb_layout_add_free(layout, len, 0);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) ...@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
ok1(free_record_length(tdb, layout->elem[1].base.off) == len); ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(len)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
/* Lock and fail to coalesce. */ /* Lock and fail to coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
...@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) ...@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to used record */ /* No coalescing can be done due to used record */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) ...@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and fail to coalesce. */ /* Lock and fail to coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
...@@ -76,7 +76,7 @@ int main(int argc, char *argv[]) ...@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then EOF */ /* Coalescing can be done due to two free records, then EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 2048, 0); tdb_layout_add_free(layout, 2048, 0);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -85,7 +85,7 @@ int main(int argc, char *argv[]) ...@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket (first) free entry is. */ /* Figure out which bucket (first) free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
...@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) ...@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then data */ /* Coalescing can be done due to two free records, then data */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 512, 0); tdb_layout_add_free(layout, 512, 0);
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
...@@ -107,7 +107,7 @@ int main(int argc, char *argv[]) ...@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
...@@ -119,7 +119,7 @@ int main(int argc, char *argv[]) ...@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to three free records, then EOF */ /* Coalescing can be done due to three free records, then EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 512, 0); tdb_layout_add_free(layout, 512, 0);
tdb_layout_add_free(layout, 256, 0); tdb_layout_add_free(layout, 256, 0);
...@@ -130,7 +130,7 @@ int main(int argc, char *argv[]) ...@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
......
...@@ -65,7 +65,8 @@ int main(int argc, char *argv[]) ...@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
/* FIXME: Check lock length */ /* FIXME: Check lock length */
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false); new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
TDB_USED_MAGIC, false);
ok1(new_off != TDB_OFF_ERR); ok1(new_off != TDB_OFF_ERR);
/* We should be able to add it now. */ /* We should be able to add it now. */
...@@ -225,7 +226,8 @@ int main(int argc, char *argv[]) ...@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
/* We should be able to add it now. */ /* We should be able to add it now. */
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false); new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
TDB_USED_MAGIC, false);
ok1(new_off != TDB_OFF_ERR); ok1(new_off != TDB_OFF_ERR);
ok1(add_to_hash(tdb, &h, new_off) == 0); ok1(add_to_hash(tdb, &h, new_off) == 0);
......
#include <ccan/tdb2/tdb.c>
#include <ccan/tdb2/free.c>
#include <ccan/tdb2/lock.c>
#include <ccan/tdb2/io.c>
#include <ccan/tdb2/hash.c>
#include <ccan/tdb2/transaction.c>
#include <ccan/tdb2/traverse.c>
#include <ccan/tdb2/check.c>
#include <ccan/tap/tap.h>
#include "logging.h"
static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
{
return 0;
}
static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
{
if (p)
return tdb_delete(tdb, key);
return 0;
}
/* Hash-overload regression test: badhash maps every key to the same
 * group, so stores quickly spill into chains/subhashes.  Verify that
 * store, fetch, traverse and delete all keep working in that state,
 * for every flag combination.  The ok1() count must match
 * plan_tests() exactly. */
int main(int argc, char *argv[])
{
	unsigned int i, j;
	struct tdb_context *tdb;
	/* key and dbuf both alias j: each store inserts key=j, data=j. */
	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
	/* Install the all-collisions hash via the attribute chain. */
	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
						.hash_fn = badhash } };
	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
			TDB_NOMMAP|TDB_CONVERT,
	};

	hattr.base.next = &tap_log_attr;

	plan_tests(5395);
	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
		struct tdb_data d;

		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
		ok1(tdb);
		if (!tdb)
			continue;

		/* Fill a group. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Now store one last value: should form chain. */
		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Check we can find them all.
		 * NOTE(review): fetched buffers are never freed; presumably
		 * acceptable for a short-lived test process. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}

		/* Now add a *lot* more. */
		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
		     j < (16 << TDB_HASH_GROUP_BITS);
		     j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Traverse through them.  j is still the total count. */
		ok1(tdb_traverse(tdb, trav, NULL) == j);

		/* Empty the first chain-worth. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
			ok1(tdb_delete(tdb, key) == 0);

		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* The remaining records must still be fetchable. */
		for (j = (1 << TDB_HASH_GROUP_BITS);
		     j < (16 << TDB_HASH_GROUP_BITS);
		     j++) {
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}

		/* Traverse through them. */
		ok1(tdb_traverse(tdb, trav, NULL)
		    == (15 << TDB_HASH_GROUP_BITS));

		/* Re-add */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Now try deleting as we go (non-NULL p => delete mode). */
		ok1(tdb_traverse(tdb, trav, trav)
		    == (16 << TDB_HASH_GROUP_BITS));
		ok1(tdb_check(tdb, NULL, NULL) == 0);
		/* Everything gone: the final traverse must see nothing. */
		ok1(tdb_traverse(tdb, trav, NULL) == 0);
		tdb_close(tdb);
	}

	ok1(tap_log_messages == 0);
	return exit_status();
}
...@@ -9,13 +9,13 @@ ...@@ -9,13 +9,13 @@
#include <err.h> #include <err.h>
#include "logging.h" #include "logging.h"
static bool empty_freelist(struct tdb_context *tdb) static bool empty_freetable(struct tdb_context *tdb)
{ {
struct tdb_freelist free; struct tdb_freetable free;
unsigned int i; unsigned int i;
/* Now, free list should be completely exhausted in zone 0 */ /* Now, free table should be completely exhausted in zone 0 */
if (tdb_read_convert(tdb, tdb->flist_off, &free, sizeof(free)) != 0) if (tdb_read_convert(tdb, tdb->ftable_off, &free, sizeof(free)) != 0)
abort(); abort();
for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) { for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) {
...@@ -50,26 +50,26 @@ int main(int argc, char *argv[]) ...@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
if (!tdb) if (!tdb)
continue; continue;
ok1(empty_freelist(tdb)); ok1(empty_freetable(tdb));
/* Need some hash lock for expand. */ /* Need some hash lock for expand. */
ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
/* Create some free space. */ /* Create some free space. */
ok1(tdb_expand(tdb, 1) == 0); ok1(tdb_expand(tdb, 1) == 0);
ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(!empty_freelist(tdb)); ok1(!empty_freetable(tdb));
size = tdb->map_size; size = tdb->map_size;
/* Insert minimal-length records until we expand. */ /* Insert minimal-length records until we expand. */
for (j = 0; tdb->map_size == size; j++) { for (j = 0; tdb->map_size == size; j++) {
was_empty = empty_freelist(tdb); was_empty = empty_freetable(tdb);
if (tdb_store(tdb, k, k, TDB_INSERT) != 0) if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
err(1, "Failed to store record %i", j); err(1, "Failed to store record %i", j);
} }
/* Would have been empty before expansion, but no longer. */ /* Would have been empty before expansion, but no longer. */
ok1(was_empty); ok1(was_empty);
ok1(!empty_freelist(tdb)); ok1(!empty_freetable(tdb));
tdb_close(tdb); tdb_close(tdb);
} }
......
...@@ -22,11 +22,11 @@ int main(int argc, char *argv[]) ...@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
data.dsize = 5; data.dsize = 5;
key.dsize = 5; key.dsize = 5;
/* Create a TDB with three free lists. */ /* Create a TDB with three free tables. */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 80, 0); tdb_layout_add_free(layout, 80, 0);
/* Used record prevent coalescing. */ /* Used record prevent coalescing. */
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
...@@ -40,24 +40,28 @@ int main(int argc, char *argv[]) ...@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[3].base.off); ok1(off == layout->elem[3].base.off);
ok1(tdb->flist_off == layout->elem[0].base.off); ok1(tdb->ftable_off == layout->elem[0].base.off);
off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[5].base.off); ok1(off == layout->elem[5].base.off);
ok1(tdb->flist_off == layout->elem[1].base.off); ok1(tdb->ftable_off == layout->elem[1].base.off);
off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[7].base.off); ok1(off == layout->elem[7].base.off);
ok1(tdb->flist_off == layout->elem[2].base.off); ok1(tdb->ftable_off == layout->elem[2].base.off);
off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[9].base.off); ok1(off == layout->elem[9].base.off);
ok1(tdb->flist_off == layout->elem[0].base.off); ok1(tdb->ftable_off == layout->elem[0].base.off);
/* Now we fail. */ /* Now we fail. */
off = get_free(tdb, 0, 0, 1, 0); off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
ok1(off == 0); ok1(off == 0);
tdb_close(tdb); tdb_close(tdb);
......
...@@ -13,7 +13,7 @@ static int log_count = 0; ...@@ -13,7 +13,7 @@ static int log_count = 0;
/* Normally we get a log when setting random seed. */ /* Normally we get a log when setting random seed. */
static void my_log_fn(struct tdb_context *tdb, static void my_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...) const char *message)
{ {
log_count++; log_count++;
} }
......
...@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) ...@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
td->high = val; td->high = val;
if (td->delete) { if (td->delete) {
if (tdb_delete(tdb, key) != 0) { if (tdb_delete(tdb, key) != 0) {
td->delete_error = tdb_error(tdb); td->delete_error = tdb_error(tdb);
return -1; return -1;
...@@ -120,7 +119,7 @@ int main(int argc, char *argv[]) ...@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
hattr.base.next = &tap_log_attr; hattr.base.next = &tap_log_attr;
plan_tests(sizeof(flags) / sizeof(flags[0]) * 50 + 1); plan_tests(sizeof(flags) / sizeof(flags[0]) * 53 + 1);
for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-traverse.tdb", flags[i], tdb = tdb_open("run-traverse.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
...@@ -182,6 +181,7 @@ int main(int argc, char *argv[]) ...@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
ok1(td.low <= NUM_RECORDS / 2); ok1(td.low <= NUM_RECORDS / 2);
ok1(td.high > NUM_RECORDS / 2); ok1(td.high > NUM_RECORDS / 2);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(tap_log_messages == 0);
/* Growing traverse. Expect failure on r/o traverse. */ /* Growing traverse. Expect failure on r/o traverse. */
tgd.calls = 0; tgd.calls = 0;
...@@ -193,6 +193,8 @@ int main(int argc, char *argv[]) ...@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
ok1(tgd.error == TDB_ERR_RDONLY); ok1(tgd.error == TDB_ERR_RDONLY);
ok1(tgd.calls == 1); ok1(tgd.calls == 1);
ok1(!tgd.mismatch); ok1(!tgd.mismatch);
ok1(tap_log_messages == 1);
tap_log_messages = 0;
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Deleting traverse. Expect failure on r/o traverse. */ /* Deleting traverse. Expect failure on r/o traverse. */
...@@ -209,6 +211,8 @@ int main(int argc, char *argv[]) ...@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
ok1(!td.mismatch); ok1(!td.mismatch);
ok1(td.calls == 1); ok1(td.calls == 1);
ok1(td.low == td.high); ok1(td.low == td.high);
ok1(tap_log_messages == 1);
tap_log_messages = 0;
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Deleting traverse (delete everything). */ /* Deleting traverse (delete everything). */
......
OBJS:=../../tdb2.o ../../hash.o ../../tally.o OBJS:=../../tdb2.o ../../hash.o ../../tally.o
CFLAGS:=-I../../.. -Wall -g #-g -O3 #-g -pg CFLAGS:=-I../../.. -Wall -g -O3 #-g -pg
LDFLAGS:=-L../../.. LDFLAGS:=-L../../..
default: tdbtorture tdbtool mktdb default: tdbtorture tdbtool mktdb speed
tdbtorture: tdbtorture.c $(OBJS) tdbtorture: tdbtorture.c $(OBJS)
tdbtool: tdbtool.c $(OBJS) tdbtool: tdbtool.c $(OBJS)
mktdb: mktdb.c $(OBJS) mktdb: mktdb.c $(OBJS)
speed: speed.c $(OBJS)
clean: clean:
rm -f tdbtorture tdbtool mktdb rm -f tdbtorture tdbtool mktdb speed
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb2/tdb2.h>
/* Nanoseconds per operation over the interval [start, stop].
 *
 * @start: wall-clock time before the benchmark loop
 * @stop:  wall-clock time after the benchmark loop
 * @num:   number of operations performed in that interval
 *
 * Returns the (truncated) average cost of one operation in ns. */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	/* Compute the elapsed microseconds directly in floating point.
	 * The original integer form (tv_sec * 1000000 + tv_usec) can
	 * overflow time_t for long runs; doubles cannot, and a possibly
	 * negative tv_usec difference folds in correctly without
	 * needing the nonstandard timersub() normalization. */
	double usec = (double)(stop->tv_sec - start->tv_sec) * 1000000.0
		+ (double)(stop->tv_usec - start->tv_usec);

	/* usec / ops * 1000 = nanoseconds per operation. */
	return usec / num * 1000;
}
/* Current size in bytes of the benchmark database, or (size_t)-1 if
 * /tmp/speed.tdb cannot be stat'ed (e.g. running with --internal). */
static size_t file_size(void)
{
	struct stat st;

	if (stat("/tmp/speed.tdb", &st) == 0)
		return (size_t)st.st_size;
	return (size_t)-1;
}
/* Traverse callback: adds each record's int payload into the int that
 * p points at.  Since every stored value is its own key, the traverse
 * total can be checked against the arithmetic series sum. */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	*sum += *(int *)data.dptr;
	return 0;
}
/* Print every counter from the stats attribute, then zero them all so
 * the next benchmark stage reports only its own activity. */
static void dump_and_clear_stats(struct tdb_attribute_stats *stats)
{
	printf("allocs = %llu\n",
	       (unsigned long long)stats->allocs);
	printf("  alloc_subhash = %llu\n",
	       (unsigned long long)stats->alloc_subhash);
	printf("  alloc_chain = %llu\n",
	       (unsigned long long)stats->alloc_chain);
	printf("  alloc_bucket_exact = %llu\n",
	       (unsigned long long)stats->alloc_bucket_exact);
	printf("  alloc_bucket_max = %llu\n",
	       (unsigned long long)stats->alloc_bucket_max);
	printf("  alloc_leftover = %llu\n",
	       (unsigned long long)stats->alloc_leftover);
	printf("  alloc_coalesce_tried = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_tried);
	printf("    alloc_coalesce_lockfail = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_lockfail);
	printf("    alloc_coalesce_race = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_race);
	printf("    alloc_coalesce_succeeded = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_succeeded);
	printf("       alloc_coalesce_num_merged = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_num_merged);
	printf("compares = %llu\n",
	       (unsigned long long)stats->compares);
	printf("  compare_wrong_bucket = %llu\n",
	       (unsigned long long)stats->compare_wrong_bucket);
	printf("  compare_wrong_offsetbits = %llu\n",
	       (unsigned long long)stats->compare_wrong_offsetbits);
	printf("  compare_wrong_keylen = %llu\n",
	       (unsigned long long)stats->compare_wrong_keylen);
	printf("  compare_wrong_rechash = %llu\n",
	       (unsigned long long)stats->compare_wrong_rechash);
	printf("  compare_wrong_keycmp = %llu\n",
	       (unsigned long long)stats->compare_wrong_keycmp);
	printf("expands = %llu\n",
	       (unsigned long long)stats->expands);
	printf("frees = %llu\n",
	       (unsigned long long)stats->frees);
	printf("locks = %llu\n",
	       (unsigned long long)stats->locks);
	printf("   lock_lowlevel = %llu\n",
	       (unsigned long long)stats->lock_lowlevel);
	printf("   lock_nonblock = %llu\n",
	       (unsigned long long)stats->lock_nonblock);

	/* Now clear.  The counters sit contiguously from 'allocs' to the
	 * end of the struct, so a single memset over that span wipes them
	 * all while leaving the attribute header fields before 'allocs'
	 * (base, size) intact. */
	memset(&stats->allocs, 0, (char *)(stats+1) - (char *)&stats->allocs);
}
/* Benchmark driver: runs a fixed sequence of stages (add, find, miss,
 * traverse, delete, re-add, append, churn) over /tmp/speed.tdb,
 * printing ns/op and file size after each stage.
 *
 * Usage: speed [--internal] [--transaction] [--stats] [num] [stopat]
 * (flags must appear in that order; num = records per stage,
 * stopat = stage number after which to exit). */
int main(int argc, char *argv[])
{
	/* stopat = -1 wraps to UINT_MAX, so by default we never stop
	 * early at a stage boundary. */
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	bool transaction = false;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	union tdb_attribute seed, stats;

	/* Try to keep benchmarks even. */
	seed.base.attr = TDB_ATTRIBUTE_SEED;
	seed.base.next = NULL;
	seed.seed.seed = 0;

	memset(&stats, 0, sizeof(stats));
	stats.base.attr = TDB_ATTRIBUTE_STATS;
	stats.base.next = NULL;
	stats.stats.size = sizeof(stats);

	/* Flags are positional: each one shifts argv so the next check
	 * looks at the following argument. */
	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
		/* Chain the stats attribute after the seed one. */
		seed.base.next = &stats;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
		       0600, &seed);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* key (and data) alias the loop counter i: setting i selects
	 * which record each tdb call operates on. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}
	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Add 1000 records. */
	printf("Adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Finding 1000 records.
	 * NOTE(review): the buffers returned by tdb_fetch() are never
	 * freed; presumably deliberate to keep free() out of the timed
	 * loop, but verify for long runs. */
	printf("Finding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Missing 1000 records: keys [num, 2*num) were never stored,
	 * so every fetch must come back empty. */
	printf("Missing %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num*2; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (dptr)
			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Traverse 1000 records: values are 0..num-1, so the tally must
	 * equal the arithmetic series sum (num-1)*num/2. */
	printf("Traversing %u records: ", num); fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	if (i != (num - 1) * (num / 2))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Delete 1000 records (not in order): 100003 is coprime with
	 * num, so (j + 100003) % num visits every key once. */
	printf("Deleting %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Re-add 1000 records (not in order). */
	printf("Re-adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Append 1000 records: doubles each record's data.
	 * NOTE(review): this stage, unlike the others, does not dump
	 * stats before bumping the stage counter — confirm whether
	 * intentional. */
	printf("Appending %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Churn 1000 records: not in order!  Deletes key i and inserts
	 * fresh key i+num, shifting the key space up by num. */
	printf("Churning %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	return 0;
}
...@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf, ...@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
return 0; return 0;
fail: fail:
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "transaction_read: failed at off=%zu len=%zu",
"transaction_read: failed at off=%llu len=%llu\n", (size_t)off, (size_t)len);
(long long)off, (long long)len);
tdb->transaction->transaction_error = 1; tdb->transaction->transaction_error = 1;
return -1; return -1;
} }
...@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
/* Only a commit is allowed on a prepared transaction */ /* Only a commit is allowed on a prepared transaction */
if (tdb->transaction->prepared) { if (tdb->transaction->prepared) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"transaction_write: transaction already prepared," "transaction_write: transaction already prepared,"
" write not allowed\n"); " write not allowed");
tdb->transaction->transaction_error = 1; goto fail;
return -1;
} }
/* break it up into block sized chunks */ /* break it up into block sized chunks */
...@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
(blk+1)*sizeof(uint8_t *)); (blk+1)*sizeof(uint8_t *));
} }
if (new_blocks == NULL) { if (new_blocks == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_write: failed to allocate");
goto fail; goto fail;
} }
memset(&new_blocks[tdb->transaction->num_blocks], 0, memset(&new_blocks[tdb->transaction->num_blocks], 0,
...@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if (tdb->transaction->blocks[blk] == NULL) { if (tdb->transaction->blocks[blk] == NULL) {
tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1); tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
if (tdb->transaction->blocks[blk] == NULL) { if (tdb->transaction->blocks[blk] == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->transaction->transaction_error = 1; "transaction_write: failed to allocate");
return -1; goto fail;
} }
if (tdb->transaction->old_map_size > blk * getpagesize()) { if (tdb->transaction->old_map_size > blk * getpagesize()) {
tdb_len_t len2 = getpagesize(); tdb_len_t len2 = getpagesize();
...@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(), if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(),
tdb->transaction->blocks[blk], tdb->transaction->blocks[blk],
len2) != 0) { len2) != 0) {
tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_write: failed to"
" read old block: %s",
strerror(errno));
SAFE_FREE(tdb->transaction->blocks[blk]); SAFE_FREE(tdb->transaction->blocks[blk]);
goto fail; goto fail;
} }
...@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
return 0; return 0;
fail: fail:
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"transaction_write: failed at off=%llu len=%llu\n",
(long long)((blk*getpagesize()) + off),
(long long)len);
tdb->transaction->transaction_error = 1; tdb->transaction->transaction_error = 1;
return -1; return -1;
} }
...@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
} }
tdb->ecode = TDB_ERR_IO; tdb->ecode = TDB_ERR_IO;
if (!probe) {
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
"tdb_oob len %lld beyond transaction size %lld",
(long long)len,
(long long)tdb->map_size);
}
return -1; return -1;
} }
...@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition) ...@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
} }
static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off, static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
size_t len) size_t len, bool write)
{ {
/* FIXME */ size_t blk = off / getpagesize(), end_blk;
return NULL;
/* This is wrong for zero-length blocks, but will fail gracefully */
end_blk = (off + len - 1) / getpagesize();
/* Can only do direct if in single block and we've already copied. */
if (write) {
if (blk != end_blk)
return NULL;
if (blk >= tdb->transaction->num_blocks)
return NULL;
if (tdb->transaction->blocks[blk] == NULL)
return NULL;
return tdb->transaction->blocks[blk] + off % getpagesize();
}
/* Single which we have copied? */
if (blk == end_blk
&& blk < tdb->transaction->num_blocks
&& tdb->transaction->blocks[blk])
return tdb->transaction->blocks[blk] + off % getpagesize();
/* Otherwise must be all not copied. */
while (blk < end_blk) {
if (blk >= tdb->transaction->num_blocks)
break;
if (tdb->transaction->blocks[blk])
return NULL;
blk++;
}
return tdb->transaction->io_methods->direct(tdb, off, len, write);
} }
static const struct tdb_methods transaction_methods = { static const struct tdb_methods transaction_methods = {
...@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t ...@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
} }
if (fsync(tdb->fd) != 0) { if (fsync(tdb->fd) != 0) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction: fsync failed: %s",
"tdb_transaction: fsync failed\n"); strerror(errno));
return -1; return -1;
} }
#ifdef MS_SYNC #ifdef MS_SYNC
...@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t ...@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
tdb_off_t moffset = offset & ~(getpagesize()-1); tdb_off_t moffset = offset & ~(getpagesize()-1);
if (msync(moffset + (char *)tdb->map_ptr, if (msync(moffset + (char *)tdb->map_ptr,
length + (offset - moffset), MS_SYNC) != 0) { length + (offset - moffset), MS_SYNC) != 0) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction: msync failed: %s",
"tdb_transaction: msync failed - %s\n", strerror(errno));
strerror(errno));
return -1; return -1;
} }
} }
...@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) ...@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
int i; int i;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_cancel: no transaction");
"tdb_transaction_cancel: no transaction\n");
return; return;
} }
...@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) ...@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
&invalid, sizeof(invalid)) == -1 || &invalid, sizeof(invalid)) == -1 ||
transaction_sync(tdb, tdb->transaction->magic_offset, transaction_sync(tdb, tdb->transaction->magic_offset,
sizeof(invalid)) == -1) { sizeof(invalid)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_cancel: failed to remove" "tdb_transaction_cancel: failed to remove"
" recovery magic\n"); " recovery magic");
} }
} }
...@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb) ...@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
{ {
/* some sanity checks */ /* some sanity checks */
if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) { if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction" " on a read-only or internal db");
" on a read-only or internal db\n");
return -1; return -1;
} }
/* cope with nested tdb_transaction_start() calls */ /* cope with nested tdb_transaction_start() calls */
if (tdb->transaction != NULL) { if (tdb->transaction != NULL) {
tdb->ecode = TDB_ERR_NESTING; tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
"tdb_transaction_start:"
" already inside transaction");
return -1; return -1;
} }
...@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb) ...@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
/* the caller must not have any locks when starting a /* the caller must not have any locks when starting a
transaction as otherwise we'll be screwed by lack transaction as otherwise we'll be screwed by lack
of nested locks in posix */ of nested locks in posix */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction" " with locks held");
" with locks held\n");
return -1; return -1;
} }
tdb->transaction = (struct tdb_transaction *) tdb->transaction = (struct tdb_transaction *)
calloc(sizeof(struct tdb_transaction), 1); calloc(sizeof(struct tdb_transaction), 1);
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_transaction_start: cannot allocate");
return -1; return -1;
} }
...@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
if (recovery_head == TDB_OFF_ERR) { if (recovery_head == TDB_OFF_ERR) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to read recovery head\n"); " failed to read recovery head");
return -1; return -1;
} }
if (recovery_head != 0) { if (recovery_head != 0) {
if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) { if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to read recovery record\n"); " failed to read recovery record");
return -1; return -1;
} }
tdb_convert(tdb, &rec, sizeof(rec)); tdb_convert(tdb, &rec, sizeof(rec));
...@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
us an area that is being currently used (as of the start of us an area that is being currently used (as of the start of
the transaction) */ the transaction) */
if (recovery_head != 0) { if (recovery_head != 0) {
add_stat(tdb, frees, 1);
if (add_free_record(tdb, recovery_head, if (add_free_record(tdb, recovery_head,
sizeof(rec) + rec.max_len) != 0) { sizeof(rec) + rec.max_len) != 0) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to free previous recovery area\n"); " failed to free previous recovery area");
return -1; return -1;
} }
} }
...@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
sizeof(rec) + *recovery_max_size; sizeof(rec) + *recovery_max_size;
tdb->map_size = tdb->transaction->old_map_size; tdb->map_size = tdb->transaction->old_map_size;
if (methods->expand_file(tdb, addition) == -1) { if (methods->expand_file(tdb, addition) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to create recovery area\n"); " failed to create recovery area");
return -1; return -1;
} }
...@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
tdb_convert(tdb, &recovery_head, sizeof(recovery_head)); tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
if (methods->write(tdb, offsetof(struct tdb_header, recovery), if (methods->write(tdb, offsetof(struct tdb_header, recovery),
&recovery_head, sizeof(tdb_off_t)) == -1) { &recovery_head, sizeof(tdb_off_t)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to write recovery head\n"); " failed to write recovery head");
return -1; return -1;
} }
transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
...@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
data = (unsigned char *)malloc(recovery_size + sizeof(*rec)); data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
if (data == NULL) { if (data == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_setup_recovery: cannot allocate");
return -1; return -1;
} }
...@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
continue; continue;
} }
if (offset + length > tdb->map_size) { if (offset + length > tdb->map_size) {
tdb->ecode = TDB_ERR_CORRUPT; tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction_setup_recovery:"
"tdb_transaction_setup_recovery:" " transaction data over new region boundary");
" transaction data over new region boundary\n");
free(data); free(data);
return -1; return -1;
} }
...@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
/* write the recovery data to the recovery area */ /* write the recovery data to the recovery area */
if (methods->write(tdb, recovery_offset, data, if (methods->write(tdb, recovery_offset, data,
sizeof(*rec) + recovery_size) == -1) { sizeof(*rec) + recovery_size) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_setup_recovery:" "tdb_transaction_setup_recovery:"
" failed to write recovery data\n"); " failed to write recovery data");
free(data); free(data);
return -1; return -1;
} }
...@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
magic); magic);
if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_setup_recovery:" "tdb_transaction_setup_recovery:"
" failed to write recovery magic\n"); " failed to write recovery magic");
return -1; return -1;
} }
transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)); transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
...@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
const struct tdb_methods *methods; const struct tdb_methods *methods;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_prepare_commit: no transaction");
"tdb_transaction_prepare_commit: no transaction\n");
return -1; return -1;
} }
if (tdb->transaction->prepared) { if (tdb->transaction->prepared) {
tdb->ecode = TDB_ERR_EINVAL;
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" transaction already prepared\n"); " transaction already prepared");
return -1; return -1;
} }
if (tdb->transaction->transaction_error) { if (tdb->transaction->transaction_error) {
tdb->ecode = TDB_ERR_IO;
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" transaction error pending\n"); " transaction error pending");
return -1; return -1;
} }
...@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* upgrade the main transaction lock region to a write lock */ /* upgrade the main transaction lock region to a write lock */
if (tdb_allrecord_upgrade(tdb) == -1) { if (tdb_allrecord_upgrade(tdb) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to upgrade hash locks\n"); " failed to upgrade hash locks");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* get the open lock - this prevents new users attaching to the database /* get the open lock - this prevents new users attaching to the database
during the commit */ during the commit */
if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) { if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to get open lock\n"); " failed to get open lock");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
if (!(tdb->flags & TDB_NOSYNC)) { if (!(tdb->flags & TDB_NOSYNC)) {
/* write the recovery data to the end of the file */ /* write the recovery data to the end of the file */
if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) { if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to setup recovery data\n"); " failed to setup recovery data");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* Restore original map size for tdb_expand_file */ /* Restore original map size for tdb_expand_file */
tdb->map_size = tdb->transaction->old_map_size; tdb->map_size = tdb->transaction->old_map_size;
if (methods->expand_file(tdb, add) == -1) { if (methods->expand_file(tdb, add) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" expansion failed\n"); " expansion failed");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb) ...@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
int i; int i;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_commit: no transaction");
"tdb_transaction_commit: no transaction\n");
return -1; return -1;
} }
tdb_trace(tdb, "tdb_transaction_commit"); tdb_trace(tdb, "tdb_transaction_commit");
if (tdb->transaction->transaction_error) { if (tdb->transaction->transaction_error) {
tdb->ecode = TDB_ERR_IO;
tdb_transaction_cancel(tdb); tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_transaction_commit: transaction error pending\n"); "tdb_transaction_commit:"
" transaction error pending");
return -1; return -1;
} }
...@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb) ...@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
if (methods->write(tdb, offset, tdb->transaction->blocks[i], if (methods->write(tdb, offset, tdb->transaction->blocks[i],
length) == -1) { length) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_commit:" "tdb_transaction_commit:"
" write failed during commit\n"); " write failed during commit");
/* we've overwritten part of the data and /* we've overwritten part of the data and
possibly expanded the file, so we need to possibly expanded the file, so we need to
...@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* find the recovery area */ /* find the recovery area */
recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
if (recovery_head == TDB_OFF_ERR) { if (recovery_head == TDB_OFF_ERR) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery head\n"); " failed to read recovery head");
return -1; return -1;
} }
...@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* read the recovery record */ /* read the recovery record */
if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) { if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery record\n"); " failed to read recovery record");
return -1; return -1;
} }
...@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
} }
if (tdb->read_only) { if (tdb->read_only) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" attempt to recover read only database\n"); " attempt to recover read only database");
tdb->ecode = TDB_ERR_CORRUPT;
return -1; return -1;
} }
...@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
data = (unsigned char *)malloc(rec.len); data = (unsigned char *)malloc(rec.len);
if (data == NULL) { if (data == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction_recover:"
"tdb_transaction_recover:" " failed to allocate recovery data");
" failed to allocate recovery data\n");
return -1; return -1;
} }
/* read the full recovery data */ /* read the full recovery data */
if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data, if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data,
rec.len) == -1) { rec.len) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery data\n"); " failed to read recovery data");
return -1; return -1;
} }
...@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if (tdb->methods->write(tdb, ofs, p, len) == -1) { if (tdb->methods->write(tdb, ofs, p, len) == -1) {
free(data); free(data);
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to recover %zu bytes at offset %zu\n", " failed to recover %zu bytes at offset %zu",
(size_t)len, (size_t)ofs); (size_t)len, (size_t)ofs);
return -1; return -1;
} }
...@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
free(data); free(data);
if (transaction_sync(tdb, 0, tdb->map_size) == -1) { if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover: failed to sync recovery\n"); "tdb_transaction_recover: failed to sync recovery");
return -1; return -1;
} }
...@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if (recovery_eof <= recovery_head) { if (recovery_eof <= recovery_head) {
if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0) if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0)
== -1) { == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to remove recovery head\n"); " failed to remove recovery head");
return -1; return -1;
} }
} }
...@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
recovery_head recovery_head
+ offsetof(struct tdb_recovery_record, magic), + offsetof(struct tdb_recovery_record, magic),
TDB_RECOVERY_INVALID_MAGIC) == -1) { TDB_RECOVERY_INVALID_MAGIC) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to remove recovery magic\n"); " failed to remove recovery magic");
return -1; return -1;
} }
if (transaction_sync(tdb, 0, recovery_eof) == -1) { if (transaction_sync(tdb, 0, recovery_eof) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover: failed to sync2 recovery\n"); "tdb_transaction_recover: failed to sync2 recovery");
return -1; return -1;
} }
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_transaction_recover: recovered %zu byte database\n", "tdb_transaction_recover: recovered %zu byte database",
(size_t)recovery_eof); (size_t)recovery_eof);
/* all done */ /* all done */
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment