Commit 51a56b52 authored by Rusty Russell's avatar Rusty Russell

Merge branch 'tdb2'

parents 451d97ad a42bba8e
...@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o ...@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
CFLAGS:=-I../../.. -Wall -O3 #-g -pg CFLAGS:=-I../../.. -Wall -O3 #-g -pg
LDFLAGS:=-L../../.. LDFLAGS:=-L../../..
default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb speed
benchmark: replay_trace benchmark: replay_trace
@trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done @trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done
...@@ -30,4 +30,4 @@ check: replay_trace ...@@ -30,4 +30,4 @@ check: replay_trace
@sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@ @sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@
clean: clean:
rm -f replay_trace tdbtorture tdbdump tdbtool *.o rm -f replay_trace tdbtorture tdbdump tdbtool speed *.o
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb/tdb.h>
/* Nanoseconds per operation */
/*
 * Nanoseconds per operation over the interval [start, stop].
 *
 * The subtraction is done by hand in floating point rather than with
 * timersub(): timersub is a BSD extension (not C11/POSIX), and the old
 * integer form (tv_sec * 1000000 + tv_usec) could overflow a 32-bit
 * long for intervals beyond ~35 minutes before the cast to double.
 */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	double usec = (stop->tv_sec - start->tv_sec) * 1000000.0
		+ (stop->tv_usec - start->tv_usec);

	/* Floating point keeps sub-microsecond precision per op. */
	return usec / num * 1000;
}
/* Current on-disk size of the benchmark database, or (size_t)-1 if
 * /tmp/speed.tdb cannot be statted (e.g. --internal mode, no file). */
static size_t file_size(void)
{
	struct stat sbuf;

	if (stat("/tmp/speed.tdb", &sbuf) == 0)
		return sbuf.st_size;
	return -1;
}
/* tdb_traverse callback: add each record's integer payload into the
 * accumulator that p points at.  Always returns 0 (keep traversing). */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	/* Payload is the 4-byte int stored by main(). */
	*sum += *(int *)data.dptr;
	return 0;
}
/*
 * TDB speed benchmark.
 *
 * Usage: speed [--internal] [--transaction] [num] [stopat]
 *   --internal     operate on an in-memory database (TDB_INTERNAL)
 *   --transaction  wrap each timed stage in a transaction
 *   num            records per stage (default 1000)
 *   stopat         exit after this many stages (default: run them all)
 *
 * Stages: add, find, miss, traverse, delete, re-add, append, churn.
 * Each stage prints nanoseconds per operation and the file size.
 */
int main(int argc, char *argv[])
{
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	bool transaction = false;

	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", 100003, flags, O_RDWR|O_CREAT|O_TRUNC,
		       0600);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* Key and data both alias the counter i: each record's key and
	 * payload are the 4-byte value of i at the time of the store. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 1: add num records. */
	printf("Adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 2: fetch each record back and verify its payload. */
	printf("Finding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		TDB_DATA res = tdb_fetch(tdb, key);
		int *dptr = (int *)res.dptr;

		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
		/* tdb_fetch returns malloc'd data; the old code leaked
		 * num allocations here. */
		free(dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 3: look up num keys that are not present. */
	printf("Missing %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num*2; i++) {
		TDB_DATA res = tdb_fetch(tdb, key);

		if (res.dptr)
			errx(1, "Fetching key %u in tdb gave %u", i,
			     *(int *)res.dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 4: traverse all records, summing their payloads. */
	printf("Traversing %u records: ", num); fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	/* Expected tally is sum(0..num-1) = num*(num-1)/2.  Divide the
	 * even factor so integer division is exact: the old form
	 * (num-1)*(num/2) was wrong whenever num was odd. */
	if (i != (num % 2 ? num * ((num - 1) / 2) : (num / 2) * (num - 1)))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 5: delete the records, not in insertion order. */
	printf("Deleting %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		/* 100003 is coprime with num in the common cases, so this
		 * walks every key exactly once in a scrambled order. */
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 6: re-add the records, again out of order. */
	printf("Re-adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 7: append to every record (doubles each payload). */
	printf("Appending %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Stage 8: churn — delete each old key, insert key+num instead. */
	printf("Churning %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	/* Release locks/fds explicitly rather than relying on exit(). */
	tdb_close(tdb);
	return 0;
}
...@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery) ...@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
hash_test = TDB_HASH_MAGIC; hash_test = TDB_HASH_MAGIC;
hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
if (hdr.hash_test != hash_test) { if (hdr.hash_test != hash_test) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"check: hash test %llu should be %llu\n", "check: hash test %llu should be %llu",
(long long)hdr.hash_test, (long long)hdr.hash_test,
(long long)hash_test); (long long)hash_test);
return false; return false;
} }
if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"check: bad magic '%.*s'\n", "check: bad magic '%.*s'",
(unsigned)sizeof(hdr.magic_food), hdr.magic_food); (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
return false; return false;
} }
*recovery = hdr.recovery; *recovery = hdr.recovery;
if (*recovery) { if (*recovery) {
if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) { if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: invalid recovery offset %zu\n", "tdb_check: invalid recovery offset %zu",
(size_t)*recovery); (size_t)*recovery);
return false; return false;
} }
...@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found); size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data);
static bool check_hash_chain(struct tdb_context *tdb,
tdb_off_t off,
uint64_t hash,
tdb_off_t used[],
size_t num_used,
size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{
struct tdb_used_record rec;
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
return false;
if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain magic %llu",
(long long)rec_magic(&rec));
return false;
}
if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain length %llu vs %zu",
(long long)rec_data_length(&rec),
sizeof(struct tdb_chain));
return false;
}
if (rec_key_length(&rec) != 0) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain key length %llu",
(long long)rec_key_length(&rec));
return false;
}
if (rec_hash(&rec) != 0) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash chain hash value %llu",
(long long)rec_hash(&rec));
return false;
}
off += sizeof(rec);
if (!check_hash_tree(tdb, off, 0, hash, 64,
used, num_used, num_found, check, private_data))
return false;
off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
if (off == TDB_OFF_ERR)
return false;
if (off == 0)
return true;
(*num_found)++;
return check_hash_chain(tdb, off, hash, used, num_used, num_found,
check, private_data);
}
static bool check_hash_record(struct tdb_context *tdb, static bool check_hash_record(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
...@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb, ...@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found) size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
struct tdb_used_record rec; struct tdb_used_record rec;
if (hprefix_bits >= 64)
return check_hash_chain(tdb, off, hprefix, used, num_used,
num_found, check, private_data);
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1) if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
return false; return false;
if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table magic %llu",
(long long)rec_magic(&rec));
return false;
}
if (rec_data_length(&rec) if (rec_data_length(&rec)
!= sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) { != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table length %llu vs %llu\n", "tdb_check: Bad hash table length %llu vs %llu",
(long long)rec_data_length(&rec), (long long)rec_data_length(&rec),
(long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS); (long long)sizeof(tdb_off_t)
<< TDB_SUBLEVEL_HASH_BITS);
return false; return false;
} }
if (rec_key_length(&rec) != 0) { if (rec_key_length(&rec) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table key length %llu\n", "tdb_check: Bad hash table key length %llu",
(long long)rec_key_length(&rec)); (long long)rec_key_length(&rec));
return false; return false;
} }
if (rec_hash(&rec) != 0) { if (rec_hash(&rec) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Bad hash table hash value %llu\n", "tdb_check: Bad hash table hash value %llu",
(long long)rec_hash(&rec)); (long long)rec_hash(&rec));
return false; return false;
} }
...@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb, ...@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
return check_hash_tree(tdb, off, return check_hash_tree(tdb, off,
TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
hprefix, hprefix_bits, hprefix, hprefix_bits,
used, num_used, num_found); used, num_used, num_found, check, private_data);
} }
static int off_cmp(const tdb_off_t *a, const tdb_off_t *b) static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
...@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned hprefix_bits, unsigned hprefix_bits,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_used,
size_t *num_found) size_t *num_found,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
unsigned int g, b; unsigned int g, b;
const tdb_off_t *hash; const tdb_off_t *hash;
...@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
off = group[b] & TDB_OFF_MASK; off = group[b] & TDB_OFF_MASK;
p = asearch(&off, used, num_used, off_cmp); p = asearch(&off, used, num_used, off_cmp);
if (!p) { if (!p) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: Invalid offset %llu " TDB_DEBUG_ERROR,
"in hash\n", "tdb_check: Invalid offset %llu "
(long long)off); "in hash", (long long)off);
goto fail; goto fail;
} }
/* Mark it invalid. */ /* Mark it invalid. */
*p ^= 1; *p ^= 1;
(*num_found)++; (*num_found)++;
if (hprefix_bits == 64) {
/* Chained entries are unordered. */
if (is_subhash(group[b])) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: Invalid chain"
" entry subhash");
goto fail;
}
h = hash_record(tdb, off);
if (h != hprefix) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"check: bad hash chain"
" placement"
" 0x%llx vs 0x%llx",
(long long)h,
(long long)hprefix);
goto fail;
}
if (tdb_read_convert(tdb, off, &rec,
sizeof(rec)))
goto fail;
goto check;
}
if (is_subhash(group[b])) { if (is_subhash(group[b])) {
uint64_t subprefix; uint64_t subprefix;
subprefix = (hprefix subprefix = (hprefix
...@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
hprefix_bits hprefix_bits
+ group_bits + group_bits
+ TDB_HASH_GROUP_BITS, + TDB_HASH_GROUP_BITS,
used, num_used, num_found)) used, num_used, num_found,
check, private_data))
goto fail; goto fail;
continue; continue;
} }
...@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
used_bits = 0; used_bits = 0;
if (get_bits(h, hprefix_bits, &used_bits) != hprefix if (get_bits(h, hprefix_bits, &used_bits) != hprefix
&& hprefix_bits) { && hprefix_bits) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad hash placement" TDB_DEBUG_ERROR,
" 0x%llx vs 0x%llx\n", "check: bad hash placement"
" 0x%llx vs 0x%llx",
(long long)h, (long long)hprefix); (long long)h, (long long)hprefix);
goto fail; goto fail;
} }
/* Does it belong in this group? */ /* Does it belong in this group? */
if (get_bits(h, group_bits, &used_bits) != g) { if (get_bits(h, group_bits, &used_bits) != g) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad group %llu vs %u\n", TDB_DEBUG_ERROR,
(long long)h, g); "check: bad group %llu vs %u",
(long long)h, g);
goto fail; goto fail;
} }
...@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits) if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
!= bucket) { != bucket) {
used_bits -= TDB_HASH_GROUP_BITS; used_bits -= TDB_HASH_GROUP_BITS;
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"check: bad bucket %u vs %u\n", TDB_DEBUG_ERROR,
"check: bad bucket %u vs %u",
(unsigned)get_bits(h, (unsigned)get_bits(h,
TDB_HASH_GROUP_BITS, TDB_HASH_GROUP_BITS,
&used_bits), &used_bits),
bucket); bucket);
goto fail; goto fail;
} }
...@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb, ...@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
i != b; i != b;
i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) { i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
if (group[i] == 0) { if (group[i] == 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"check: bad group placement" "check: bad group placement"
" %u vs %u\n", " %u vs %u",
b, bucket); b, bucket);
goto fail; goto fail;
} }
} }
if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1) if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
goto fail; goto fail;
/* Bottom bits must match header. */ /* Bottom bits must match header. */
if ((h & ((1 << 11)-1)) != rec_hash(&rec)) { if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: Bad hash magic at" TDB_DEBUG_ERROR,
" offset %llu (0x%llx vs 0x%llx)\n", "tdb_check: Bad hash magic at"
(long long)off, " offset %llu (0x%llx vs 0x%llx)",
(long long)h, (long long)off,
(long long)rec_hash(&rec)); (long long)h,
(long long)rec_hash(&rec));
goto fail; goto fail;
} }
check:
if (check) {
TDB_DATA key, data;
key.dsize = rec_key_length(&rec);
data.dsize = rec_data_length(&rec);
key.dptr = (void *)tdb_access_read(tdb,
off + sizeof(rec),
key.dsize + data.dsize,
false);
if (!key.dptr)
goto fail;
data.dptr = key.dptr + key.dsize;
if (check(key, data, private_data) != 0)
goto fail;
tdb_access_release(tdb, key.dptr);
}
} }
} }
tdb_access_release(tdb, hash); tdb_access_release(tdb, hash);
...@@ -268,19 +389,22 @@ fail: ...@@ -268,19 +389,22 @@ fail:
static bool check_hash(struct tdb_context *tdb, static bool check_hash(struct tdb_context *tdb,
tdb_off_t used[], tdb_off_t used[],
size_t num_used, size_t num_flists) size_t num_used, size_t num_ftables,
int (*check)(TDB_DATA, TDB_DATA, void *),
void *private_data)
{ {
/* Free lists also show up as used. */ /* Free tables also show up as used. */
size_t num_found = num_flists; size_t num_found = num_ftables;
if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable), if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
0, 0, used, num_used, &num_found)) 0, 0, used, num_used, &num_found,
check, private_data))
return false; return false;
if (num_found != num_used) { if (num_found != num_used) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Not all entries are in hash\n"); "tdb_check: Not all entries are in hash");
return false; return false;
} }
return true; return true;
...@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb, ...@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
static bool check_free(struct tdb_context *tdb, static bool check_free(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
const struct tdb_free_record *frec, const struct tdb_free_record *frec,
tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket) tdb_off_t prev, unsigned int ftable,
unsigned int bucket)
{ {
if (frec_magic(frec) != TDB_FREE_MAGIC) { if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad magic 0x%llx\n", "tdb_check: offset %llu bad magic 0x%llx",
(long long)off, (long long)frec->magic_and_meta); (long long)off, (long long)frec->magic_and_prev);
return false; return false;
} }
if (frec_flist(frec) != flist_off) { if (frec_ftable(frec) != ftable) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad freelist 0x%llx\n", "tdb_check: offset %llu bad freetable %u",
(long long)off, (long long)frec_flist(frec)); (long long)off, frec_ftable(frec));
return false; return false;
} }
if (tdb->methods->oob(tdb, off if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record), + frec_len(frec) + sizeof(struct tdb_used_record),
false)) false))
return false; return false;
if (size_to_bucket(frec->data_len) != bucket) { if (size_to_bucket(frec_len(frec)) != bucket) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu in wrong bucket %u vs %u\n", "tdb_check: offset %llu in wrong bucket %u vs %u",
(long long)off, (long long)off,
bucket, size_to_bucket(frec->data_len)); bucket, size_to_bucket(frec_len(frec)));
return false; return false;
} }
if (prev != frec->prev) { if (prev != frec_prev(frec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: offset %llu bad prev %llu vs %llu\n", "tdb_check: offset %llu bad prev %llu vs %llu",
(long long)off, (long long)off,
(long long)prev, (long long)frec->prev); (long long)prev, (long long)frec_len(frec));
return false; return false;
} }
return true; return true;
} }
static bool check_free_list(struct tdb_context *tdb, static bool check_free_table(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t free[], unsigned ftable_num,
size_t num_free, tdb_off_t free[],
size_t *num_found) size_t num_free,
size_t *num_found)
{ {
struct tdb_freelist flist; struct tdb_freetable ft;
tdb_off_t h; tdb_off_t h;
unsigned int i; unsigned int i;
if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1) if (tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)) == -1)
return false; return false;
if (rec_magic(&flist.hdr) != TDB_MAGIC if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
|| rec_key_length(&flist.hdr) != 0 || rec_key_length(&ft.hdr) != 0
|| rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr) || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
|| rec_hash(&flist.hdr) != 1) { || rec_hash(&ft.hdr) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
tdb->log_priv, "tdb_check: Invalid header on free table");
"tdb_check: Invalid header on free list\n");
return false; return false;
} }
...@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb, ...@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
tdb_off_t off, prev = 0, *p; tdb_off_t off, prev = 0, *p;
struct tdb_free_record f; struct tdb_free_record f;
h = bucket_off(flist_off, i); h = bucket_off(ftable_off, i);
for (off = tdb_read_off(tdb, h); off; off = f.next) { for (off = tdb_read_off(tdb, h); off; off = f.next) {
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
return false; return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f))) if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false; return false;
if (!check_free(tdb, off, &f, prev, flist_off, i)) if (!check_free(tdb, off, &f, prev, ftable_num, i))
return false; return false;
/* FIXME: Check hash bits */ /* FIXME: Check hash bits */
p = asearch(&off, free, num_free, off_cmp); p = asearch(&off, free, num_free, off_cmp);
if (!p) { if (!p) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"tdb_check: Invalid offset" "tdb_check: Invalid offset"
" %llu in free table\n", " %llu in free table",
(long long)off); (long long)off);
return false; return false;
} }
/* Mark it invalid. */ /* Mark it invalid. */
...@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb, ...@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
} }
/* Slow, but should be very rare. */ /* Slow, but should be very rare. */
static size_t dead_space(struct tdb_context *tdb, tdb_off_t off) size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
{ {
size_t len; size_t len;
...@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb, ...@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
struct tdb_used_record u; struct tdb_used_record u;
struct tdb_free_record f; struct tdb_free_record f;
struct tdb_recovery_record r; struct tdb_recovery_record r;
} pad, *p; } rec;
p = tdb_get(tdb, off, &pad, sizeof(pad)); /* r is larger: only get that if we need to. */
if (!p) if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
return false; return false;
/* If we crash after ftruncate, we can get zeroes or fill. */ /* If we crash after ftruncate, we can get zeroes or fill. */
if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
|| p->r.magic == 0x4343434343434343ULL) { || rec.r.magic == 0x4343434343434343ULL) {
if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
return false;
if (recovery == off) { if (recovery == off) {
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(rec.r) + rec.r.max_len;
} else { } else {
len = dead_space(tdb, off); len = dead_space(tdb, off);
if (len < sizeof(p->r)) { if (len < sizeof(rec.r)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb_logerr(tdb, TDB_ERR_CORRUPT,
tdb->log_priv, TDB_DEBUG_ERROR,
"tdb_check: invalid dead space" "tdb_check: invalid dead"
" at %zu\n", (size_t)off); " space at %zu",
(size_t)off);
return false; return false;
} }
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
"Dead space at %zu-%zu (of %zu)\n", "Dead space at %zu-%zu (of %zu)",
(size_t)off, (size_t)(off + len), (size_t)off, (size_t)(off + len),
(size_t)tdb->map_size); (size_t)tdb->map_size);
} }
} else if (p->r.magic == TDB_RECOVERY_MAGIC) { } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
return false;
if (recovery != off) { if (recovery != off) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: unexpected recovery" TDB_DEBUG_ERROR,
" record at offset %zu\n", "tdb_check: unexpected recovery"
(size_t)off); " record at offset %zu",
(size_t)off);
return false;
}
if (rec.r.len > rec.r.max_len) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: invalid recovery length"
" %zu", (size_t)rec.r.len);
return false;
}
if (rec.r.eof > tdb->map_size) {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: invalid old EOF"
" %zu", (size_t)rec.r.eof);
return false; return false;
} }
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(rec.r) + rec.r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
|| frec_magic(&p->f) == TDB_COALESCING_MAGIC) { len = sizeof(rec.u) + frec_len(&rec.f);
len = sizeof(p->u) + p->f.data_len;
if (off + len > tdb->map_size) { if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: free overlength %llu" TDB_DEBUG_ERROR,
" at offset %llu\n", "tdb_check: free overlength %llu"
(long long)len, (long long)off); " at offset %llu",
(long long)len, (long long)off);
return false; return false;
} }
/* This record is free! */ /* This record should be in free lists. */
if (frec_magic(&p->f) == TDB_FREE_MAGIC if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
&& !append(free, num_free, off)) && !append(free, num_free, off))
return false; return false;
} else { } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
|| rec_magic(&rec.u) == TDB_CHAIN_MAGIC
|| rec_magic(&rec.u) == TDB_HTABLE_MAGIC
|| rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
uint64_t klen, dlen, extra; uint64_t klen, dlen, extra;
/* This record is used! */ /* This record is used! */
if (rec_magic(&p->u) != TDB_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: Bad magic 0x%llx"
" at offset %llu\n",
(long long)rec_magic(&p->u),
(long long)off);
return false;
}
if (!append(used, num_used, off)) if (!append(used, num_used, off))
return false; return false;
klen = rec_key_length(&p->u); klen = rec_key_length(&rec.u);
dlen = rec_data_length(&p->u); dlen = rec_data_length(&rec.u);
extra = rec_extra_padding(&p->u); extra = rec_extra_padding(&rec.u);
len = sizeof(p->u) + klen + dlen + extra; len = sizeof(rec.u) + klen + dlen + extra;
if (off + len > tdb->map_size) { if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: used overlength %llu" TDB_DEBUG_ERROR,
" at offset %llu\n", "tdb_check: used overlength %llu"
(long long)len, (long long)off); " at offset %llu",
(long long)len, (long long)off);
return false; return false;
} }
if (len < sizeof(p->f)) { if (len < sizeof(rec.f)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"tdb_check: too short record %llu at" TDB_DEBUG_ERROR,
" %llu\n", "tdb_check: too short record %llu"
(long long)len, (long long)off); " at %llu",
(long long)len, (long long)off);
return false; return false;
} }
} else {
tdb_logerr(tdb, TDB_ERR_CORRUPT,
TDB_DEBUG_ERROR,
"tdb_check: Bad magic 0x%llx at offset %zu",
(long long)rec_magic(&rec.u), (size_t)off);
return false;
} }
} }
/* We must have found recovery area if there was one. */ /* We must have found recovery area if there was one. */
if (recovery != 0 && !found_recovery) { if (recovery != 0 && !found_recovery) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: expected a recovery area at %zu\n", "tdb_check: expected a recovery area at %zu",
(size_t)recovery); (size_t)recovery);
return false; return false;
} }
return true; return true;
} }
/* FIXME: call check() function. */
int tdb_check(struct tdb_context *tdb, int tdb_check(struct tdb_context *tdb,
int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
void *private_data) void *private_data)
{ {
tdb_off_t *free = NULL, *used = NULL, flist, recovery; tdb_off_t *free = NULL, *used = NULL, ft, recovery;
size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0; size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0) if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
return -1; return -1;
...@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb, ...@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery)) if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
goto fail; goto fail;
for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) { for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
if (flist == TDB_OFF_ERR) if (ft == TDB_OFF_ERR)
goto fail; goto fail;
if (!check_free_list(tdb, flist, free, num_free, &num_found)) if (!check_free_table(tdb, ft, num_ftables, free, num_free,
&num_found))
goto fail; goto fail;
num_flists++; num_ftables++;
} }
/* FIXME: Check key uniqueness? */ /* FIXME: Check key uniqueness? */
if (!check_hash(tdb, used, num_used, num_flists)) if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
goto fail; goto fail;
if (num_found != num_free) { if (num_found != num_free) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
"tdb_check: Not all entries are in free table\n"); "tdb_check: Not all entries are in free table");
return false; return -1;
} }
tdb_allrecord_unlock(tdb, F_RDLCK); tdb_allrecord_unlock(tdb, F_RDLCK);
......
#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ #LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345 \lyxformat 345
\begin_document \begin_document
\begin_header \begin_header
...@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation ...@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation
\end_layout \end_layout
\begin_layout Date \begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout \end_layout
\begin_layout Abstract \begin_layout Abstract
...@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional ...@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "attributes" name "attributes"
...@@ -485,8 +477,6 @@ name "attributes" ...@@ -485,8 +477,6 @@ name "attributes"
\end_inset \end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size ...@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size
of the union. of the union.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_traverse Makes Impossible Guarantees tdb_traverse Makes Impossible Guarantees
\end_layout \end_layout
...@@ -631,6 +629,16 @@ Abandon the guarantee. ...@@ -631,6 +629,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API. You can prevent changes by using a transaction or the locking API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection \begin_layout Subsection
Nesting of Transactions Is Fraught Nesting of Transactions Is Fraught
\end_layout \end_layout
...@@ -685,6 +693,14 @@ least-surprise ...@@ -685,6 +693,14 @@ least-surprise
-obscure case. -obscure case.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Incorrect Hash Function is Not Detected Incorrect Hash Function is Not Detected
\end_layout \end_layout
...@@ -706,6 +722,14 @@ The header should contain an example hash result (eg. ...@@ -706,6 +722,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call. hash function produces the same answer, or fail the tdb_open call.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout \end_layout
...@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed. ...@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op. tuning, but initially will become a no-op.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether ...@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API. an API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB API Is Not POSIX Thread-safe TDB API Is Not POSIX Thread-safe
\end_layout \end_layout
...@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap ...@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard \begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist. version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook" ...@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset \end_inset
could be used to enable pthread locking at runtime. could be used to enable pthread locking at runtime.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when ...@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed. It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_chainlock Functions Expose Implementation tdb_chainlock Functions Expose Implementation
\end_layout \end_layout
...@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having ...@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals The API Uses Gratuitous Typedefs, Capitals
\end_layout \end_layout
...@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break ...@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI. the API/ABI.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Various Callback Functions Are Not Typesafe Various Callback Functions Are Not Typesafe
\end_layout \end_layout
...@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give ...@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout \end_layout
...@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" ...@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset \end_inset
. .
\change_inserted 0 1284015637 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\change_inserted 0 1284015716 \begin_layout Subsection
Extending The Header Is Difficult Extending The Header Is Difficult
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for We have reserved (zeroed) words in the TDB header, which can be used for
future features. future features.
If the future features are compulsory, the version number must be updated If the future features are compulsory, the version number must be updated
...@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for ...@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016114
The header should contain a The header should contain a
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -1249,58 +1325,48 @@ format variant ...@@ -1249,58 +1325,48 @@ format variant
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing The lower part reflects the format variant understood by code accessing
the database. the database.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading). to the database (otherwise you can only open for reading).
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant. the variant of the code is lower than the current lowest variant.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database. if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete.
\end_layout
\change_inserted 0 1284015634 \begin_layout Subsection
Record Headers Are Not Expandible Record Headers Are Not Expandible
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid. another format change, which we'd like to avoid.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record. We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a have a way for future changes to detect code which doesn't understand a
...@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record. ...@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record.
not present on that record. not present on that record.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\change_inserted 0 1284422568 \begin_layout Subsection
TDB Does Not Use Talloc TDB Does Not Use Talloc
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently. have to wrap TDB in a talloc context to use it conveniently.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB talloc, and I am reluctant to force another (excellent) library on TDB
users. users.
...@@ -1356,15 +1422,19 @@ context ...@@ -1356,15 +1422,19 @@ context
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc heirarchy as expected, but the caller would still This would form a talloc heirarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to have to attach a destructor to the tdb context returned from tdb_open to
close it. close it.
All TDB_DATA fields would be children of the tdb_context, and the caller All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()). would still have to manage them (using talloc_free() or talloc_steal()).
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Section \begin_layout Section
...@@ -1422,6 +1492,14 @@ Remove the flag. ...@@ -1422,6 +1492,14 @@ Remove the flag.
point. point.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Files Have a 4G Limit TDB Files Have a 4G Limit
\end_layout \end_layout
...@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August ...@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!) be erased and initialized as a fresh tdb!)
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Records Have a 4G Limit TDB Records Have a 4G Limit
\end_layout \end_layout
...@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A" ...@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A"
). ).
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Hash Size Is Determined At TDB Creation Time Hash Size Is Determined At TDB Creation Time
\end_layout \end_layout
...@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified ...@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "sub:Hash-Size-Solution" name "sub:Hash-Size-Solution"
\end_inset \end_inset
\change_unchanged
Proposed Solution Proposed Solution
\end_layout \end_layout
...@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin ...@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which , it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation. doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and : the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks. others might be holding locks.
...@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert ...@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills, each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree. turning into a hash tree.
...@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with ...@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions. hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits. We can use the otherwise-unused lower 3 bits.
...@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more ...@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid. bits are valid.
This means we can choose not to re-hash all entries when we expand a hash This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid. group; simply use the next bits we need and mark them invalid.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues ...@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_deleted 0 1283336858
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention. ...@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets. we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address. for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable. Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented" ...@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where ) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size. all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or set of free is to divide the file into zones, and using a free list (or table of free
lists) for each. lists) for each.
This approximates address ordering. This approximates address ordering.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
Note that this means we need to split the free lists when we expand the Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already. create a
In the case of increasing the file size, there is an optimization we can \begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the \end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary, recovery area
\emph on \begin_inset Quotes erd
and \end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive by simply appending to the file (difficult if it would need to create a
pair of free lists. new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1818,7 +1854,7 @@ The basic algorithm is as follows. ...@@ -1818,7 +1854,7 @@ The basic algorithm is as follows.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Identify the correct zone. Identify the correct free list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1826,12 +1862,12 @@ Lock the corresponding list. ...@@ -1826,12 +1862,12 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone (we didn't have a lock, sizes could have changed): relock Re-check the list (we didn't have a lock, sizes could have changed): relock
if necessary. if necessary.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Place the freed entry in the list for that zone. Place the freed entry in the list.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing ...@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1856,16 +1884,16 @@ Lock the corresponding list. ...@@ -1856,16 +1884,16 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone: relock if necessary. If the top entry is -large enough, remove it from the list and return it.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it. Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough, If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A" ...@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
I anticipate that the number of entries in each free zone would be small, Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2165,8 +2128,6 @@ miss ...@@ -2165,8 +2128,6 @@ miss
it reduces 99.9% of false memcmp). it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here. resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see bits in the hash table itself (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution" ...@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -2214,11 +2173,11 @@ struct tdb_used_record { ...@@ -2214,11 +2173,11 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t magic : 16, uint32_t used_magic : 16,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
prev_is_free: 1,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2226,7 +2185,7 @@ struct tdb_used_record { ...@@ -2226,7 +2185,7 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
top_hash: 10; top_hash: 11;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2250,29 +2209,27 @@ struct tdb_free_record { ...@@ -2250,29 +2209,27 @@ struct tdb_free_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t free_magic; uint64_t free_magic: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t total_length; prev : 56;
\change_inserted 0 1283337133
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
\change_inserted 0 1283337139 \end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
... total_length : 56
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t tailer; uint64_t next;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2281,20 +2238,19 @@ struct tdb_free_record { ...@@ -2281,20 +2238,19 @@ struct tdb_free_record {
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337235 \change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged \change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-bit words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum ...@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set. a transaction in progress; we need only check for recovery if this is set.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support ...@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
None.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say At some point you say
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -2413,13 +2371,6 @@ use a real database ...@@ -2413,13 +2371,6 @@ use a real database
\begin_inset Quotes erd \begin_inset Quotes erd
\end_inset \end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2428,8 +2379,6 @@ reference "replay-attribute" ...@@ -2428,8 +2379,6 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack ...@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard \begin_layout Standard
We could then implement snapshots using a similar method, using multiple We could then implement snapshots using a similar method, using multiple
different hash tables/free tables. different hash tables/free tables.
\change_inserted 0 1284423495 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2473,8 +2428,6 @@ Proposed Solution ...@@ -2473,8 +2428,6 @@ Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284424201
None (but see None (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2483,15 +2436,21 @@ reference "replay-attribute" ...@@ -2483,15 +2436,21 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done. until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit. commit.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Default Hash Function Is Suboptimal Default Hash Function Is Suboptimal
\end_layout \end_layout
...@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source, ...@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing. hash bombing.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution" ...@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution"
. .
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Fcntl Locking Adds Overhead Fcntl Locking Adds Overhead
\end_layout \end_layout
...@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the ...@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled). free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum. transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External Tracing Is Fragile, Replay Is External
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default. as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay In a similar way, the ctdb code has an external wrapper which does replay
...@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks ...@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution Proposed Solution
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2703,8 +2670,6 @@ name "replay-attribute" ...@@ -2703,8 +2670,6 @@ name "replay-attribute"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2715,8 +2680,14 @@ reference "attributes" ...@@ -2715,8 +2680,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and ) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support. future parallel transactions and snapshot support.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\end_body \end_body
......
head 1.10; head 1.13;
access; access;
symbols; symbols;
locks; strict; locks; strict;
comment @# @; comment @# @;
1.13
date 2010.12.01.12.22.08; author rusty; state Exp;
branches;
next 1.12;
1.12
date 2010.12.01.12.20.49; author rusty; state Exp;
branches;
next 1.11;
1.11
date 2010.12.01.11.55.20; author rusty; state Exp;
branches;
next 1.10;
1.10 1.10
date 2010.09.14.00.33.57; author rusty; state Exp; date 2010.09.14.00.33.57; author rusty; state Exp;
branches; branches;
...@@ -61,12 +76,12 @@ desc ...@@ -61,12 +76,12 @@ desc
@ @
1.10 1.13
log log
@Tracing attribute, talloc support. @Merged changes.
@ @
text text
@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ @#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345 \lyxformat 345
\begin_document \begin_document
\begin_header \begin_header
...@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation ...@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation
\end_layout \end_layout
\begin_layout Date \begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout \end_layout
\begin_layout Abstract \begin_layout Abstract
...@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional ...@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "attributes" name "attributes"
...@@ -553,8 +560,6 @@ name "attributes" ...@@ -553,8 +560,6 @@ name "attributes"
\end_inset \end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size ...@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size
of the union. of the union.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_traverse Makes Impossible Guarantees tdb_traverse Makes Impossible Guarantees
\end_layout \end_layout
...@@ -699,6 +712,16 @@ Abandon the guarantee. ...@@ -699,6 +712,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API. You can prevent changes by using a transaction or the locking API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection \begin_layout Subsection
Nesting of Transactions Is Fraught Nesting of Transactions Is Fraught
\end_layout \end_layout
...@@ -753,6 +776,14 @@ least-surprise ...@@ -753,6 +776,14 @@ least-surprise
-obscure case. -obscure case.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Incorrect Hash Function is Not Detected Incorrect Hash Function is Not Detected
\end_layout \end_layout
...@@ -774,6 +805,14 @@ The header should contain an example hash result (eg. ...@@ -774,6 +805,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call. hash function produces the same answer, or fail the tdb_open call.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout \end_layout
...@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed. ...@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op. tuning, but initially will become a no-op.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether ...@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API. an API.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB API Is Not POSIX Thread-safe TDB API Is Not POSIX Thread-safe
\end_layout \end_layout
...@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap ...@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard \begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist. version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook" ...@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset \end_inset
could be used to enable pthread locking at runtime. could be used to enable pthread locking at runtime.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when ...@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed. It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
tdb_chainlock Functions Expose Implementation tdb_chainlock Functions Expose Implementation
\end_layout \end_layout
...@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having ...@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals The API Uses Gratuitous Typedefs, Capitals
\end_layout \end_layout
...@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break ...@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI. the API/ABI.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Various Callback Functions Are Not Typesafe Various Callback Functions Are Not Typesafe
\end_layout \end_layout
...@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give ...@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout \end_layout
...@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" ...@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset \end_inset
. .
\change_inserted 0 1284015637 \end_layout
\begin_layout Subsubsection
Status
\end_layout \end_layout
\begin_layout Subsection \begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\change_inserted 0 1284015716 \begin_layout Subsection
Extending The Header Is Difficult Extending The Header Is Difficult
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for We have reserved (zeroed) words in the TDB header, which can be used for
future features. future features.
If the future features are compulsory, the version number must be updated If the future features are compulsory, the version number must be updated
...@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for ...@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016114
The header should contain a The header should contain a
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -1317,58 +1408,48 @@ format variant ...@@ -1317,58 +1408,48 @@ format variant
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing The lower part reflects the format variant understood by code accessing
the database. the database.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading). to the database (otherwise you can only open for reading).
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant. the variant of the code is lower than the current lowest variant.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database. if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible Record Headers Are Not Expandible
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid. another format change, which we'd like to avoid.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record. We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a have a way for future changes to detect code which doesn't understand a
...@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record. ...@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record.
not present on that record. not present on that record.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsubsection
Status
\end_layout
\change_inserted 0 1284422568 \begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
TDB Does Not Use Talloc TDB Does Not Use Talloc
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently. have to wrap TDB in a talloc context to use it conveniently.
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB talloc, and I am reluctant to force another (excellent) library on TDB
users. users.
...@@ -1424,15 +1505,19 @@ context ...@@ -1424,15 +1505,19 @@ context
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc hierarchy as expected, but the caller would still This would form a talloc hierarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to have to attach a destructor to the tdb context returned from tdb_open to
close it. close it.
All TDB_DATA fields would be children of the tdb_context, and the caller All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()). would still have to manage them (using talloc_free() or talloc_steal()).
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Section \begin_layout Section
...@@ -1490,6 +1575,14 @@ Remove the flag. ...@@ -1490,6 +1575,14 @@ Remove the flag.
point. point.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Files Have a 4G Limit TDB Files Have a 4G Limit
\end_layout \end_layout
...@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August ...@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!) be erased and initialized as a fresh tdb!)
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
TDB Records Have a 4G Limit TDB Records Have a 4G Limit
\end_layout \end_layout
...@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A" ...@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A"
). ).
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Hash Size Is Determined At TDB Creation Time Hash Size Is Determined At TDB Creation Time
\end_layout \end_layout
...@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified ...@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
name "sub:Hash-Size-Solution" name "sub:Hash-Size-Solution"
\end_inset \end_inset
\change_unchanged
Proposed Solution Proposed Solution
\end_layout \end_layout
...@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin ...@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which , it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation. doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and : the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks. others might be holding locks.
...@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert ...@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills, each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree. turning into a hash tree.
...@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with ...@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions. hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits. We can use the otherwise-unused lower 3 bits.
...@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more ...@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid. bits are valid.
This means we can choose not to re-hash all entries when we expand a hash This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid. group; simply use the next bits we need and mark them invalid.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues ...@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed Solution
\change_deleted 0 1283336858
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention. ...@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets. we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address. for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable. Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented" ...@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where ) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size. all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or set of free is to divide the file into zones, and using a free list (or table of free
lists) for each. lists) for each.
This approximates address ordering. This approximates address ordering.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
Note that this means we need to split the free lists when we expand the Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already. create a
In the case of increasing the file size, there is an optimization we can \begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the \end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary, recovery area
\emph on \begin_inset Quotes erd
and \end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive by simply appending to the file (difficult if it would need to create a
pair of free lists. new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1886,7 +1937,7 @@ The basic algorithm is as follows. ...@@ -1886,7 +1937,7 @@ The basic algorithm is as follows.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Identify the correct zone. Identify the correct free list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1894,12 +1945,12 @@ Lock the corresponding list. ...@@ -1894,12 +1945,12 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone (we didn't have a lock, sizes could have changed): relock Re-check the list (we didn't have a lock, sizes could have changed): relock
if necessary. if necessary.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Place the freed entry in the list for that zone. Place the freed entry in the list.
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing ...@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1924,16 +1967,16 @@ Lock the corresponding list. ...@@ -1924,16 +1967,16 @@ Lock the corresponding list.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Re-check the zone: relock if necessary. If the top entry is -large enough, remove it from the list and return it.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it. Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough, If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A" ...@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
I anticipate that the number of entries in each free zone would be small, Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2233,8 +2211,6 @@ miss ...@@ -2233,8 +2211,6 @@ miss
it reduces 99.9% of false memcmp). it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here. resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see bits in the hash table itself (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution" ...@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Enumerate \begin_layout Enumerate
...@@ -2282,11 +2256,11 @@ struct tdb_used_record { ...@@ -2282,11 +2256,11 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t magic : 16, uint32_t used_magic : 16,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
prev_is_free: 1,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2294,7 +2268,7 @@ struct tdb_used_record { ...@@ -2294,7 +2268,7 @@ struct tdb_used_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
top_hash: 10; top_hash: 11;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2318,29 +2292,27 @@ struct tdb_free_record { ...@@ -2318,29 +2292,27 @@ struct tdb_free_record {
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint32_t free_magic; uint64_t free_magic: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t total_length; prev : 56;
\change_inserted 0 1283337133
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
\change_inserted 0 1283337139 \end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
... total_length : 56
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
uint64_t tailer; uint64_t next;;
\end_layout \end_layout
\begin_layout LyX-Code \begin_layout LyX-Code
...@@ -2349,20 +2321,19 @@ struct tdb_free_record { ...@@ -2349,20 +2321,19 @@ struct tdb_free_record {
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1283337235 \change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged \change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-byte words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum ...@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set. a transaction in progress; we need only check for recovery if this is set.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support ...@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
Proposed Solution Proposed SolutionNone.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say At some point you say
\begin_inset Quotes eld \begin_inset Quotes eld
\end_inset \end_inset
...@@ -2481,13 +2454,6 @@ use a real database ...@@ -2481,13 +2454,6 @@ use a real database
\begin_inset Quotes erd \begin_inset Quotes erd
\end_inset \end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2496,8 +2462,6 @@ reference "replay-attribute" ...@@ -2496,8 +2462,6 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
\change_unchanged
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
...@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack ...@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard \begin_layout Standard
We could then implement snapshots using a similar method, using multiple We could then implement snapshots using a similar method, using multiple
different hash tables/free tables. different hash tables/free tables.
\change_inserted 0 1284423495 \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
...@@ -2541,8 +2511,6 @@ Proposed Solution ...@@ -2541,8 +2511,6 @@ Proposed Solution
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284424201
None (but see None (but see
\begin_inset CommandInset ref \begin_inset CommandInset ref
LatexCommand ref LatexCommand ref
...@@ -2551,15 +2519,21 @@ reference "replay-attribute" ...@@ -2551,15 +2519,21 @@ reference "replay-attribute"
\end_inset \end_inset
). ).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done. until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit. commit.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Default Hash Function Is Suboptimal Default Hash Function Is Suboptimal
\end_layout \end_layout
...@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source, ...@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing. hash bombing.
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution" ...@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution"
. .
\end_layout \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection \begin_layout Subsection
Fcntl Locking Adds Overhead Fcntl Locking Adds Overhead
\end_layout \end_layout
...@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the ...@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled). free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum. transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout \end_layout
\begin_layout Subsection \begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External Tracing Is Fragile, Replay Is External
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default. as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay In a similar way, the ctdb code has an external wrapper which does replay
...@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks ...@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout \end_layout
\begin_layout Subsubsection \begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution Proposed Solution
\begin_inset CommandInset label \begin_inset CommandInset label
LatexCommand label LatexCommand label
...@@ -2771,8 +2753,6 @@ name "replay-attribute" ...@@ -2771,8 +2753,6 @@ name "replay-attribute"
\end_layout \end_layout
\begin_layout Standard \begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref \begin_inset CommandInset ref
...@@ -2783,8 +2763,14 @@ reference "attributes" ...@@ -2783,8 +2763,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and ) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support. future parallel transactions and snapshot support.
\change_unchanged \end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout \end_layout
\end_body \end_body
...@@ -2792,6 +2778,996 @@ reference "attributes" ...@@ -2792,6 +2778,996 @@ reference "attributes"
@ @
1.12
log
@Add status, some fixes, linked freelists.
@
text
@d53 1
a53 7
\change_deleted 0 1291204535
14-September
\change_inserted 0 1291204533
1-December
\change_unchanged
-2010
a580 2
\change_inserted 0 1291204563
a583 2
\change_inserted 0 1291204572
a587 2
\change_inserted 0 1291204573
a588 2
\change_unchanged
a629 2
\change_inserted 0 1291204588
a632 2
\change_inserted 0 1291204588
a636 2
\change_inserted 0 1291204631
a639 2
\change_unchanged
a693 2
\change_inserted 0 1291204639
a696 2
\change_inserted 0 1291204640
a700 2
\change_inserted 0 1291204665
a701 2
\change_unchanged
a722 2
\change_inserted 0 1291204671
a725 2
\change_inserted 0 1291204671
a729 2
\change_inserted 0 1291204673
a730 2
\change_unchanged
a774 2
\change_inserted 0 1291204731
a777 2
\change_inserted 0 1291204732
a781 2
\change_inserted 0 1291204779
a784 2
\change_unchanged
a836 2
\change_inserted 0 1291204830
a839 2
\change_inserted 0 1291204831
a843 2
\change_inserted 0 1291204834
a844 2
\change_unchanged
a898 2
\change_inserted 0 1291204847
a901 2
\change_inserted 0 1291204847
a905 2
\change_inserted 0 1291204852
a906 2
\change_unchanged
a1021 2
\change_inserted 0 1291204881
a1024 2
\change_inserted 0 1291204881
a1028 2
\change_inserted 0 1291204885
a1029 2
\change_unchanged
a1110 2
\change_inserted 0 1291204898
a1113 2
\change_inserted 0 1291204898
a1117 2
\change_inserted 0 1291204901
a1118 2
\change_unchanged
a1194 2
\change_inserted 0 1291204908
a1197 2
\change_inserted 0 1291204908
a1201 2
\change_inserted 0 1291204908
a1202 2
\change_unchanged
a1241 2
\change_inserted 0 1291204917
a1244 2
\change_inserted 0 1291204917
a1248 2
\change_inserted 0 1291204920
a1249 2
\change_unchanged
a1286 2
\change_inserted 0 1291204927
a1289 2
\change_inserted 0 1291204928
a1293 2
\change_inserted 0 1291204942
a1294 2
\change_unchanged
a1345 2
\change_inserted 0 1291205003
a1348 2
\change_inserted 0 1291205004
a1352 2
\change_inserted 0 1291205007
a1375 2
\change_inserted 0 1291205019
a1378 2
\change_inserted 0 1291205019
a1382 2
\change_inserted 0 1291205023
a1383 2
\change_unchanged
a1429 2
\change_inserted 0 1291205029
a1432 2
\change_inserted 0 1291205029
a1436 2
\change_inserted 0 1291206020
a1437 2
\change_unchanged
a1492 2
\change_inserted 0 1291205043
a1495 2
\change_inserted 0 1291205043
a1499 2
\change_inserted 0 1291205057
a1500 2
\change_unchanged
a1547 2
\change_inserted 0 1291205062
a1550 2
\change_inserted 0 1291205062
a1554 2
\change_inserted 0 1291205062
a1555 2
\change_unchanged
a1584 2
\change_inserted 0 1291205072
a1587 2
\change_inserted 0 1291205073
a1591 2
\change_inserted 0 1291205073
a1592 2
\change_unchanged
a1632 4
\change_deleted 0 1291204504
\change_unchanged
a1657 2
\change_inserted 0 1291205079
a1660 2
\change_inserted 0 1291205080
a1664 2
\change_inserted 0 1291205080
a1665 2
\change_unchanged
a1791 2
\change_inserted 0 1291205090
d1827 2
a1828 7
is to divide the file into zones, and using a free list (or
\change_inserted 0 1291205498
table
\change_deleted 0 1291205497
set
\change_unchanged
of free lists) for each.
a1829 2
\change_inserted 0 1291205203
a1832 2
\change_inserted 0 1291205358
a1848 21
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205198
Note that this means we need to split the free lists when we expand the
file; this is probably acceptable when we double the hash table size, since
that is such an expensive operation already.
In the case of increasing the file size, there is an optimization we can
use: if we use M in the formula above as the file size rounded up to the
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary,
\emph on
and
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive
pair of free lists.
\change_unchanged
d1857 1
a1857 7
Identify the correct
\change_inserted 0 1291205366
free list
\change_deleted 0 1291205364
zone
\change_unchanged
.
d1865 2
a1866 7
Re-check the
\change_inserted 0 1291205372
list
\change_deleted 0 1291205371
zone
\change_unchanged
(we didn't have a lock, sizes could have changed): relock if necessary.
d1870 1
a1870 5
Place the freed entry in the list
\change_deleted 0 1291205382
for that zone
\change_unchanged
.
d1879 1
a1879 15
Pick a
\change_deleted 0 1291205403
zone either the zone we last freed into, or based on a
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\change_inserted 0 1291205411
free table; usually the previous one.
\change_unchanged
a1883 10
\change_deleted 0 1291205432
\end_layout
\begin_layout Enumerate
\change_deleted 0 1291205428
Re-check the zone: relock if necessary.
\change_unchanged
d1892 1
a1892 7
unlock the list and try the next
\change_inserted 0 1291205455
largest list
\change_deleted 0 1291205452
zone.
\change_inserted 0 1291205457
a1895 2
\change_inserted 0 1291205476
a1896 2
\change_unchanged
a1924 2
\change_inserted 0 1291205542
a1927 2
\change_inserted 0 1291205591
a1929 70
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205539
I anticipate that the number of entries in each free zone would be small,
but it might be worth using one free entry to hold pointers to the others
for cache efficiency.
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205534
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\end_layout
\end_inset
\change_inserted 0 1291205139
d2176 1
a2176 5
uint32_t
\change_inserted 0 1291205758
used_
\change_unchanged
magic : 16,
a2180 4
\change_deleted 0 1291205693
prev_is_free: 1,
\change_unchanged
d2188 1
a2188 7
top_hash: 1
\change_inserted 0 1291205704
1
\change_deleted 0 1291205704
0
\change_unchanged
;
d2212 1
a2212 9
uint
\change_inserted 0 1291205725
64
\change_deleted 0 1291205723
32
\change_unchanged
_t
\change_inserted 0 1291205753
free_magic: 8,
a2215 2
\change_inserted 0 1291205746
a2220 24
\change_deleted 0 1291205749
free_magic;
\change_unchanged
\end_layout
\begin_layout LyX-Code
uint64_t
\change_inserted 0 1291205786
free_table: 8,
\end_layout
\begin_layout LyX-Code
\change_inserted 0 1291205788
\change_unchanged
total_length
\change_inserted 0 1291205792
: 56
\change_deleted 0 1291205790
;
\change_unchanged
d2224 1
a2224 7
uint64_t
\change_deleted 0 1291205801
prev,
\change_unchanged
next;
\change_deleted 0 1291205811
d2228 1
a2228 3
\change_deleted 0 1291205811
...
d2232 1
a2232 5
\change_deleted 0 1291205808
uint64_t tailer
\change_unchanged
;
d2241 5
a2245 16
\change_deleted 0 1291205827
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_inserted 0 1291205885
Note that by limiting valid offsets to 56 bits, we can pack everything
we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
a2248 2
\change_inserted 0 1291205886
a2252 2
\change_inserted 0 1291205886
a2253 2
\change_unchanged
a2343 2
\change_inserted 0 1291205894
a2346 2
\change_inserted 0 1291205894
a2350 2
\change_inserted 0 1291205902
a2351 2
\change_unchanged
a2373 4
\change_deleted 0 1291204504
\change_unchanged
a2403 2
\change_inserted 0 1291205910
a2406 2
\change_inserted 0 1291205910
a2410 2
\change_inserted 0 1291205914
a2411 2
\change_unchanged
a2443 2
\change_inserted 0 1291205919
a2446 2
\change_inserted 0 1291205919
a2450 2
\change_inserted 0 1291205922
a2451 2
\change_unchanged
a2491 2
\change_inserted 0 1291205929
a2494 2
\change_inserted 0 1291205929
a2498 2
\change_inserted 0 1291205929
a2499 2
\change_unchanged
a2536 2
\change_inserted 0 1291205932
a2539 2
\change_inserted 0 1291205933
a2543 2
\change_inserted 0 1291205933
a2544 2
\change_unchanged
a2682 2
\change_inserted 0 1291205944
a2685 2
\change_inserted 0 1291205945
a2689 2
\change_inserted 0 1291205948
a2690 2
\change_unchanged
@
1.11
log
@Merge changes
@
text
@d53 7
a59 1
14-September-2010
d587 16
d644 18
d716 16
d753 16
d813 18
d883 16
d953 16
d1084 16
d1181 16
d1273 16
d1328 16
d1381 16
d1447 19
a1465 2
if older code (which doesn't understand the feature) writes to the database.Reco
rd Headers Are Not Expandible
d1484 16
d1546 16
d1617 16
d1680 16
d1725 16
d1810 16
d1951 8
a1958 3
Proposed SolutionThe first step is to remove all the current heuristics,
as they obviously interact, then examine them once the lock contention
is addressed.
d1989 7
a1995 2
is to divide the file into zones, and using a free list (or set of free
lists) for each.
d1997 2
d2002 25
d2039 2
d2049 7
a2055 1
Identify the correct zone.
d2063 7
a2069 2
Re-check the zone (we didn't have a lock, sizes could have changed): relock
if necessary.
d2073 5
a2077 1
Place the freed entry in the list for that zone.
d2086 3
a2088 1
Pick a zone either the zone we last freed into, or based on a
d2097 4
d2105 2
d2110 2
d2113 2
d2123 15
a2137 1
unlock the list and try the next zone.
d2166 11
d2180 2
d2185 2
d2190 2
d2223 1
a2223 1
status open
d2243 2
d2491 5
a2495 1
uint32_t magic : 16,
d2499 2
d2502 2
d2511 7
a2517 1
top_hash: 10;
d2541 29
a2569 1
uint32_t free_magic;
d2573 11
a2583 1
uint64_t total_length;
d2587 7
a2593 1
uint64_t prev, next;
d2597 2
d2603 5
a2607 1
uint64_t tailer;
d2615 2
d2628 18
d2736 16
d2808 16
d2856 16
d2912 16
d2965 16
d3119 16
@
1.10
log
@Tracing attribute, talloc support.
@
text
@d1 1
a1 1
#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
d53 1
a53 7
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
a472 2
\change_inserted 0 1284422789
a479 2
\change_unchanged
a838 2
\change_inserted 0 1284016998
a846 2
\change_unchanged
a1194 2
\change_inserted 0 1284015637
a1197 2
\change_inserted 0 1284015716
a1201 2
\change_inserted 0 1284015906
a1210 2
\change_inserted 0 1284015637
a1214 2
\change_inserted 0 1284016114
a1227 2
\change_inserted 0 1284016149
a1232 2
\change_inserted 0 1284016639
a1237 2
\change_inserted 0 1284016821
a1243 2
\change_inserted 0 1284016803
d1245 2
a1246 9
if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101
\end_layout
\begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible
a1249 2
\change_inserted 0 1284015634
a1254 2
\change_inserted 0 1284015634
a1258 2
\change_inserted 0 1284422552
a1267 2
\change_inserted 0 1284422568
a1271 2
\change_inserted 0 1284422646
a1276 2
\change_inserted 0 1284422656
a1280 2
\change_inserted 0 1284423065
a1305 2
\change_inserted 0 1284423042
a1310 2
\change_unchanged
a1457 2
\change_inserted 0 1283336713
a1463 2
\change_unchanged
d1482 2
d1485 1
a1485 51
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
a1492 2
\change_inserted 0 1283336187
a1500 2
\change_inserted 0 1283336586
a1510 2
\change_unchanged
d1636 3
a1638 8
Proposed Solution
\change_deleted 0 1283336858
\end_layout
\begin_layout Standard
The first step is to remove all the current heuristics, as they obviously
interact, then examine them once the lock contention is addressed.
a1647 2
\change_inserted 0 1283336910
a1650 2
\change_inserted 0 1283337052
a1655 2
\change_unchanged
a1776 2
\change_inserted 0 1283309850
a1779 2
\change_inserted 0 1283337216
a1813 2
\change_inserted 0 1284424151
a1825 2
\change_unchanged
a1830 2
\change_unchanged
a2031 2
\change_inserted 0 1283336739
a2040 2
\change_unchanged
a2117 2
\change_inserted 0 1283337133
a2120 2
\change_inserted 0 1283337139
a2121 2
\change_unchanged
a2136 2
\change_inserted 0 1283337235
a2147 2
\change_unchanged
d2251 1
a2251 7
Proposed Solution
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
d2261 1
a2261 1
\change_inserted 0 1284423891
d2263 1
a2263 4
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
a2271 2
\change_unchanged
a2293 2
\change_inserted 0 1284423495
a2312 2
\change_inserted 0 1284424201
d2321 1
a2321 3
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
a2505 2
\change_inserted 0 1284423555
a2508 2
\change_inserted 0 1284423617
a2512 2
\change_inserted 0 1284423719
a2519 2
\change_inserted 0 1284423864
a2530 2
\change_inserted 0 1284423850
a2540 2
\change_unchanged
@
1.9 1.9
log log
@Extension mechanism. @Extension mechanism.
......
No preview for this file type
...@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase ...@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
Rusty Russell, IBM Corporation Rusty Russell, IBM Corporation
14-September-2010 1-December-2010
Abstract Abstract
...@@ -129,6 +129,10 @@ union tdb_attribute { ...@@ -129,6 +129,10 @@ union tdb_attribute {
This allows future attributes to be added, even if this expands This allows future attributes to be added, even if this expands
the size of the union. the size of the union.
2.1.2 Status
Complete.
2.2 tdb_traverse Makes Impossible Guarantees 2.2 tdb_traverse Makes Impossible Guarantees
tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
...@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset. ...@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
You can prevent changes by using a transaction or the locking You can prevent changes by using a transaction or the locking
API. API.
2.2.2 Status
Complete. Delete-during-traverse will still delete every record,
too (assuming no other changes).
2.3 Nesting of Transactions Is Fraught 2.3 Nesting of Transactions Is Fraught
TDB has alternated between allowing nested transactions and not TDB has alternated between allowing nested transactions and not
...@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses ...@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
tdb_add_flags() and tdb_remove_flags(), so the API should not be tdb_add_flags() and tdb_remove_flags(), so the API should not be
expanded for this relatively-obscure case. expanded for this relatively-obscure case.
2.3.2 Status
Incomplete; nesting flag is still defined as per tdb1.
2.4 Incorrect Hash Function is Not Detected 2.4 Incorrect Hash Function is Not Detected
tdb_open_ex() allows the calling code to specify a different hash tdb_open_ex() allows the calling code to specify a different hash
...@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of ...@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
0xdeadbeef), and tdb_open_ex() should check that the given hash 0xdeadbeef), and tdb_open_ex() should check that the given hash
function produces the same answer, or fail the tdb_open call. function produces the same answer, or fail the tdb_open call.
2.4.2 Status
Complete.
2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation 2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
In response to scalability issues with the free list ([TDB-Freelist-Is] In response to scalability issues with the free list ([TDB-Freelist-Is]
...@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common ...@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
as fetch in order to allow some internal tuning, but initially as fetch in order to allow some internal tuning, but initially
will become a no-op. will become a no-op.
2.5.2 Status
Incomplete. TDB_VOLATILE still defined, but implementation should
fail on unknown flags to be future-proof.
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
In The Same Process In The Same Process
...@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some ...@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
benefit to adding a call to detect when a tdb_context is shared, benefit to adding a call to detect when a tdb_context is shared,
to allow other to create such an API. to allow other to create such an API.
2.6.2 Status
Incomplete.
2.7 TDB API Is Not POSIX Thread-safe 2.7 TDB API Is Not POSIX Thread-safe
The TDB API uses an error code which can be queried after an The TDB API uses an error code which can be queried after an
...@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that ...@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
proposed for [Proposed-Solution-locking-hook] could be used to proposed for [Proposed-Solution-locking-hook] could be used to
enable pthread locking at runtime. enable pthread locking at runtime.
2.7.2 Status
Incomplete.
2.8 *_nonblock Functions And *_mark Functions Expose 2.8 *_nonblock Functions And *_mark Functions Expose
Implementation Implementation
...@@ -343,6 +373,10 @@ locks it doesn't need to obtain. ...@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
It also keeps the complexity out of the API, and in ctdbd where It also keeps the complexity out of the API, and in ctdbd where
it is needed. it is needed.
2.8.2 Status
Incomplete.
2.9 tdb_chainlock Functions Expose Implementation 2.9 tdb_chainlock Functions Expose Implementation
tdb_chainlock locks some number of records, including the record tdb_chainlock locks some number of records, including the record
...@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered, ...@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
otherwise EAGAIN. otherwise EAGAIN.
] ]
2.10.2 Status
Incomplete.
2.11 The API Uses Gratuitous Typedefs, Capitals 2.11 The API Uses Gratuitous Typedefs, Capitals
typedefs are useful for providing source compatibility when types typedefs are useful for providing source compatibility when types
...@@ -433,6 +471,10 @@ the tdb_open_ex for logging. ...@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
It should simply take an extra argument, since we are prepared to It should simply take an extra argument, since we are prepared to
break the API/ABI. break the API/ABI.
2.12.2 Status
Complete.
2.13 Various Callback Functions Are Not Typesafe 2.13 Various Callback Functions Are Not Typesafe
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
...@@ -455,6 +497,10 @@ their parameter. ...@@ -455,6 +497,10 @@ their parameter.
See CCAN's typesafe_cb module at See CCAN's typesafe_cb module at
http://ccan.ozlabs.org/info/typesafe_cb.html http://ccan.ozlabs.org/info/typesafe_cb.html
2.13.2 Status
Incomplete.
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
tdb_reopen_all Problematic tdb_reopen_all Problematic
...@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it. ...@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
2.14.2 Status
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
implemented.
2.15 Extending The Header Is Difficult 2.15 Extending The Header Is Difficult
We have reserved (zeroed) words in the TDB header, which can be We have reserved (zeroed) words in the TDB header, which can be
...@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and ...@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
detection if older code (which doesn't understand the feature) detection if older code (which doesn't understand the feature)
writes to the database. writes to the database.
2.15.2 Status
Incomplete.
2.16 Record Headers Are Not Expandible 2.16 Record Headers Are Not Expandible
If we later want to add (say) checksums on keys and data, it If we later want to add (say) checksums on keys and data, it
...@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at ...@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
the tail, and thus if there is no tail or the first byte is 0, we the tail, and thus if there is no tail or the first byte is 0, we
would know the extension is not present on that record. would know the extension is not present on that record.
2.16.2 Status
Incomplete.
2.17 TDB Does Not Use Talloc 2.17 TDB Does Not Use Talloc
Many users of TDB (particularly Samba) use the talloc allocator, Many users of TDB (particularly Samba) use the talloc allocator,
...@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be ...@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
children of the tdb_context, and the caller would still have to children of the tdb_context, and the caller would still have to
manage them (using talloc_free() or talloc_steal()). manage them (using talloc_free() or talloc_steal()).
2.17.2 Status
Deferred.
3 Performance And Scalability Issues 3 Performance And Scalability Issues
3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
...@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers ...@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
tend to know when they are initializing for the first time and tend to know when they are initializing for the first time and
can simply unlink the old tdb at that point. can simply unlink the old tdb at that point.
3.1.2 Status
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
3.2 TDB Files Have a 4G Limit 3.2 TDB Files Have a 4G Limit
This seems to be becoming an issue (so much for “trivial”!), This seems to be becoming an issue (so much for “trivial”!),
...@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28 ...@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
August 2009, commit 398d0c29290: prior to that any unrecognized August 2009, commit 398d0c29290: prior to that any unrecognized
file format would be erased and initialized as a fresh tdb!) file format would be erased and initialized as a fresh tdb!)
3.2.2 Status
Complete.
3.3 TDB Records Have a 4G Limit 3.3 TDB Records Have a 4G Limit
This has not been a reported problem, and the API uses size_t This has not been a reported problem, and the API uses size_t
...@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It ...@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
seems unlikely that 32 bit keys will be a limitation, so the seems unlikely that 32 bit keys will be a limitation, so the
implementation may not support this (see [sub:Records-Incur-A]). implementation may not support this (see [sub:Records-Incur-A]).
3.3.2 Status
Complete.
3.4 Hash Size Is Determined At TDB Creation Time 3.4 Hash Size Is Determined At TDB Creation Time
TDB contains a number of hash chains in the header; the number is TDB contains a number of hash chains in the header; the number is
...@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an ...@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
expanding tree of hashes would be very close to optimal. expanding tree of hashes would be very close to optimal.
], it became clear that it is hard to beat a straight linear hash ], it became clear that it is hard to beat a straight linear hash
table which doubles in size when it reaches saturation. table which doubles in size when it reaches saturation.
Unfortunately, altering the hash table introduces serious locking
1. complications: the entire hash table needs to be locked to
enlarge the hash table, and others might be holding locks.
2.
3.
Unfortunately, altering the hash table introduces serious
locking complications: the entire hash table needs to be locked
to enlarge the hash table, and others might be holding locks.
Particularly insidious are insertions done under tdb_chainlock. Particularly insidious are insertions done under tdb_chainlock.
Thus an expanding layered hash will be used: an array of hash Thus an expanding layered hash will be used: an array of hash
...@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a ...@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
hash group; simply use the next bits we need and mark them hash group; simply use the next bits we need and mark them
invalid. invalid.
3.4.2 Status
Complete.
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
TDB uses a single linked list for the free list. Allocation TDB uses a single linked list for the free list. Allocation
...@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr ...@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
case where all processes are allocating/freeing the same size. case where all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most Thus we almost certainly need to divide in other ways: the most
obvious is to divide the file into zones, and using a free list obvious is to divide the file into zones, and using a free list
(or set of free lists) for each. This approximates address (or table of free lists) for each. This approximates address
ordering. ordering.
Note that this means we need to split the free lists when we Unfortunately it is difficult to know what heuristics should be
expand the file; this is probably acceptable when we double the used to determine zone sizes, and our transaction code relies on
hash table size, since that is such an expensive operation being able to create a “recovery area” by simply appending to the
already. In the case of increasing the file size, there is an file (difficult if it would need to create a new zone header).
optimization we can use: if we use M in the formula above as the Thus we use a linked-list of free tables; currently we only ever
file size rounded up to the next power of 2, we only need create one, but if there is more than one we choose one at random
reshuffle free lists when the file size crosses a power of 2 to use. In future we may use heuristics to add new free tables on
boundary, and reshuffling the free lists is trivial: we simply contention. We only expand the file when all free tables are
merge every consecutive pair of free lists. exhausted.
The basic algorithm is as follows. Freeing is simple: The basic algorithm is as follows. Freeing is simple:
1. Identify the correct zone. 1. Identify the correct free list.
2. Lock the corresponding list. 2. Lock the corresponding list.
3. Re-check the zone (we didn't have a lock, sizes could have 3. Re-check the list (we didn't have a lock, sizes could have
changed): relock if necessary. changed): relock if necessary.
4. Place the freed entry in the list for that zone. 4. Place the freed entry in the list.
Allocation is a little more complicated, as we perform delayed Allocation is a little more complicated, as we perform delayed
coalescing at this point: coalescing at this point:
1. Pick a zone either the zone we last freed into, or based on a “ 1. Pick a free table; usually the previous one.
random” number.
2. Lock the corresponding list. 2. Lock the corresponding list.
3. Re-check the zone: relock if necessary. 3. If the top entry is -large enough, remove it from the list and
4. If the top entry is -large enough, remove it from the list and
return it. return it.
5. Otherwise, coalesce entries in the list.If there was no entry 4. Otherwise, coalesce entries in the list.If there was no entry
large enough, unlock the list and try the next zone. large enough, unlock the list and try the next largest list
5. If no list has an entry which meets our needs, try the next
free table.
6. If no zone satisfies, expand the file. 6. If no zone satisfies, expand the file.
...@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low ...@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
does not need a tailer to coalesce, though if we needed one we does not need a tailer to coalesce, though if we needed one we
could have one cheaply: see [sub:Records-Incur-A]. could have one cheaply: see [sub:Records-Incur-A].
I anticipate that the number of entries in each free zone would Each free entry has the free table number in the header: less
be small, but it might be worth using one free entry to hold than 255. It also contains a doubly-linked list for easy
pointers to the others for cache efficiency. deletion.
<freelist-in-zone>If we want to avoid locking complexity
(enlarging the free lists when we enlarge the file) we could
place the array of free lists at the beginning of each zone. This
means existing array lists never move, but means that a record
cannot be larger than a zone. That in turn implies that zones
should be variable sized (say, power of 2), which makes the
question “what zone is this record in?” much harder (and “pick a
random zone”, but that's less common). It could be done with as
few as 4 bits from the record header.[footnote:
Using 2^{16+N*3}means 0 gives a minimal 65536-byte zone, 15 gives
the maximal 2^{61} byte zone. Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can
determine the start of the zone.
]
3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented 3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
...@@ -944,13 +997,13 @@ This produces a 16 byte used header like this: ...@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
struct tdb_used_record { struct tdb_used_record {
uint32_t magic : 16, uint32_t used_magic : 16,
prev_is_free: 1,
key_data_divide: 5, key_data_divide: 5,
top_hash: 10; top_hash: 11;
uint32_t extra_octets; uint32_t extra_octets;
...@@ -962,21 +1015,27 @@ And a free record like this: ...@@ -962,21 +1015,27 @@ And a free record like this:
struct tdb_free_record { struct tdb_free_record {
uint32_t free_magic; uint64_t free_magic: 8,
prev : 56;
uint64_t total_length;
uint64_t prev, next;
... uint64_t free_table: 8,
uint64_t tailer; total_length : 56
uint64_t next;;
}; };
We might want to take some bits from the used record's top_hash Note that by limiting valid offsets to 56 bits, we can pack
(and the free record which has 32 bits of padding to spare everything we need into 3 64-byte words, meaning our minimum
anyway) if we use variable sized zones. See [freelist-in-zone]. record size is 8 bytes.
3.7.2 Status
Complete.
3.8 Transaction Commit Requires 4 fdatasync 3.8 Transaction Commit Requires 4 fdatasync
...@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate ...@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
header field can be used to indicate a transaction in progress; header field can be used to indicate a transaction in progress;
we need only check for recovery if this is set. we need only check for recovery if this is set.
3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support 3.8.2 Status
3.9.1 Proposed Solution Deferred.
None. At some point you say “use a real database” (but see [replay-attribute] 3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
).
3.9.1 Proposed SolutionNone. At some point you say “use a real
database” (but see [replay-attribute]).
But as a thought experiment, if we implemented transactions to But as a thought experiment, if we implemented transactions to
only overwrite free entries (this is tricky: there must not be a only overwrite free entries (this is tricky: there must not be a
...@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too. ...@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
We could then implement snapshots using a similar method, using We could then implement snapshots using a similar method, using
multiple different hash tables/free tables. multiple different hash tables/free tables.
3.9.2 Status
Deferred.
3.10 Transactions Cannot Operate in Parallel 3.10 Transactions Cannot Operate in Parallel
This would be useless for ldb, as it hits the index records with This would be useless for ldb, as it hits the index records with
...@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit ...@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
until all r/o transactions are done. This would require a new until all r/o transactions are done. This would require a new
RO_TRANSACTION_LOCK, which would be upgraded on commit. RO_TRANSACTION_LOCK, which would be upgraded on commit.
3.10.2 Status
Deferred.
3.11 Default Hash Function Is Suboptimal 3.11 Default Hash Function Is Suboptimal
The Knuth-inspired multiplicative hash used by tdb is fairly slow The Knuth-inspired multiplicative hash used by tdb is fairly slow
...@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random ...@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
source, and placed in the header. This is far from foolproof, but source, and placed in the header. This is far from foolproof, but
adds a little bit of protection against hash bombing. adds a little bit of protection against hash bombing.
3.11.2 Status
Complete.
3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity 3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
We lock a record during traversal iteration, and try to grab that We lock a record during traversal iteration, and try to grab that
...@@ -1104,6 +1177,10 @@ indefinitely. ...@@ -1104,6 +1177,10 @@ indefinitely.
Remove reliability guarantees; see [traverse-Proposed-Solution]. Remove reliability guarantees; see [traverse-Proposed-Solution].
3.12.2 Status
Complete.
3.13 Fcntl Locking Adds Overhead 3.13 Fcntl Locking Adds Overhead
Placing a fcntl lock means a system call, as does removing one. Placing a fcntl lock means a system call, as does removing one.
...@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which ...@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
could become the basis for this and future parallel transactions could become the basis for this and future parallel transactions
and snapshot support. and snapshot support.
3.15.2 Status
Deferred.
...@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len) ...@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
return bucket; return bucket;
} }
tdb_off_t first_flist(struct tdb_context *tdb) tdb_off_t first_ftable(struct tdb_context *tdb)
{ {
return tdb_read_off(tdb, offsetof(struct tdb_header, free_list)); return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
} }
tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist) tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
{ {
return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next)); return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
} }
int tdb_flist_init(struct tdb_context *tdb) int tdb_ftable_init(struct tdb_context *tdb)
{ {
/* Use reservoir sampling algorithm to select a free list at random. */ /* Use reservoir sampling algorithm to select a free list at random. */
unsigned int rnd, max = 0; unsigned int rnd, max = 0, count = 0;
tdb_off_t off; tdb_off_t off;
tdb->flist_off = off = first_flist(tdb); tdb->ftable_off = off = first_ftable(tdb);
tdb->ftable = 0;
while (off) { while (off) {
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
...@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb) ...@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
rnd = random(); rnd = random();
if (rnd >= max) { if (rnd >= max) {
tdb->flist_off = off; tdb->ftable_off = off;
tdb->ftable = count;
max = rnd; max = rnd;
} }
off = next_flist(tdb, off); off = next_ftable(tdb, off);
count++;
} }
return 0; return 0;
} }
/* Offset of a given bucket. */ /* Offset of a given bucket. */
tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket) tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
{ {
return flist_off + offsetof(struct tdb_freelist, buckets) return ftable_off + offsetof(struct tdb_freetable, buckets)
+ bucket * sizeof(tdb_off_t); + bucket * sizeof(tdb_off_t);
} }
/* Returns free_buckets + 1, or list number to search. */ /* Returns free_buckets + 1, or list number to search. */
static tdb_off_t find_free_head(struct tdb_context *tdb, static tdb_off_t find_free_head(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t bucket) tdb_off_t bucket)
{ {
/* Speculatively search for a non-zero bucket. */ /* Speculatively search for a non-zero bucket. */
return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0), return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
bucket, TDB_FREE_BUCKETS); bucket, TDB_FREE_BUCKETS);
} }
/* Remove from free bucket. */ /* Remove from free bucket. */
static int remove_from_list(struct tdb_context *tdb, static int remove_from_list(struct tdb_context *tdb,
tdb_off_t b_off, tdb_off_t r_off, tdb_off_t b_off, tdb_off_t r_off,
struct tdb_free_record *r) const struct tdb_free_record *r)
{ {
tdb_off_t off; tdb_off_t off;
/* Front of list? */ /* Front of list? */
if (r->prev == 0) { if (frec_prev(r) == 0) {
off = b_off; off = b_off;
} else { } else {
off = r->prev + offsetof(struct tdb_free_record, next); off = frec_prev(r) + offsetof(struct tdb_free_record, next);
} }
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, off) != r_off) { if (tdb_read_off(tdb, off) != r_off) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"remove_from_list: %llu bad prev in list %llu\n", "remove_from_list: %llu bad prev in list %llu",
(long long)r_off, (long long)b_off); (long long)r_off, (long long)b_off);
return -1; return -1;
} }
...@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb, ...@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
} }
if (r->next != 0) { if (r->next != 0) {
off = r->next + offsetof(struct tdb_free_record, prev); off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
/* r->next->prev = r->prev */ /* r->next->prev = r->prev */
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, off) != r_off) { if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"remove_from_list: %llu bad list %llu\n", "remove_from_list: %llu bad list %llu",
(long long)r_off, (long long)b_off); (long long)r_off, (long long)b_off);
return -1; return -1;
} }
#endif #endif
if (tdb_write_off(tdb, off, r->prev)) { if (tdb_write_off(tdb, off, r->magic_and_prev)) {
return -1; return -1;
} }
} }
...@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb, ...@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
static int enqueue_in_free(struct tdb_context *tdb, static int enqueue_in_free(struct tdb_context *tdb,
tdb_off_t b_off, tdb_off_t b_off,
tdb_off_t off, tdb_off_t off,
struct tdb_free_record *new) tdb_len_t len)
{ {
new->prev = 0; struct tdb_free_record new;
uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
| len;
/* prev = 0. */
new.magic_and_prev = magic;
/* new->next = head. */ /* new->next = head. */
new->next = tdb_read_off(tdb, b_off); new.next = tdb_read_off(tdb, b_off);
if (new->next == TDB_OFF_ERR) if (new.next == TDB_OFF_ERR)
return -1; return -1;
if (new->next) { if (new.next) {
#ifdef DEBUG #ifdef DEBUG
if (tdb_read_off(tdb, if (tdb_read_off(tdb,
new->next new.next + offsetof(struct tdb_free_record,
+ offsetof(struct tdb_free_record, prev)) magic_and_prev))
!= 0) { != magic) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"enqueue_in_free: %llu bad head prev %llu\n", "enqueue_in_free: %llu bad head"
(long long)new->next, (long long)b_off); " prev %llu",
(long long)new.next, (long long)b_off);
return -1; return -1;
} }
#endif #endif
/* next->prev = new. */ /* next->prev = new. */
if (tdb_write_off(tdb, new->next if (tdb_write_off(tdb, new.next
+ offsetof(struct tdb_free_record, prev), + offsetof(struct tdb_free_record,
off) != 0) magic_and_prev),
off | magic) != 0)
return -1; return -1;
} }
/* head = new */ /* head = new */
if (tdb_write_off(tdb, b_off, off) != 0) if (tdb_write_off(tdb, b_off, off) != 0)
return -1; return -1;
return tdb_write_convert(tdb, off, new, sizeof(*new)); return tdb_write_convert(tdb, off, &new, sizeof(new));
} }
/* List need not be locked. */ /* List need not be locked. */
int add_free_record(struct tdb_context *tdb, int add_free_record(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len_with_header) tdb_off_t off, tdb_len_t len_with_header)
{ {
struct tdb_free_record new;
tdb_off_t b_off; tdb_off_t b_off;
tdb_len_t len;
int ret; int ret;
assert(len_with_header >= sizeof(new)); assert(len_with_header >= sizeof(struct tdb_free_record));
new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL) len = len_with_header - sizeof(struct tdb_used_record);
| tdb->flist_off;
new.data_len = len_with_header - sizeof(struct tdb_used_record);
b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0) if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
return -1; return -1;
ret = enqueue_in_free(tdb, b_off, off, &new); ret = enqueue_in_free(tdb, b_off, off, len);
tdb_unlock_free_bucket(tdb, b_off); tdb_unlock_free_bucket(tdb, b_off);
return ret; return ret;
} }
...@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen, ...@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
return leftover; return leftover;
} }
static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
{
tdb_off_t off;
unsigned int i;
if (likely(tdb->ftable == ftable))
return tdb->ftable_off;
off = first_ftable(tdb);
for (i = 0; i < ftable; i++)
off = next_ftable(tdb, off);
return off;
}
/* Note: we unlock the current bucket if we coalesce or fail. */ /* Note: we unlock the current bucket if we coalesce or fail. */
static int coalesce(struct tdb_context *tdb, static int coalesce(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len) tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
{ {
struct tdb_free_record pad, *r;
tdb_off_t end; tdb_off_t end;
struct tdb_free_record rec;
add_stat(tdb, alloc_coalesce_tried, 1);
end = off + sizeof(struct tdb_used_record) + data_len; end = off + sizeof(struct tdb_used_record) + data_len;
while (end < tdb->map_size) { while (end < tdb->map_size) {
const struct tdb_free_record *r;
tdb_off_t nb_off; tdb_off_t nb_off;
unsigned ftable, bucket;
/* FIXME: do tdb_get here and below really win? */ r = tdb_access_read(tdb, end, sizeof(*r), true);
r = tdb_get(tdb, end, &pad, sizeof(pad));
if (!r) if (!r)
goto err; goto err;
if (frec_magic(r) != TDB_FREE_MAGIC) if (frec_magic(r) != TDB_FREE_MAGIC
|| frec_ftable(r) == TDB_FTABLE_NONE) {
tdb_access_release(tdb, r);
break; break;
}
nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len)); ftable = frec_ftable(r);
bucket = size_to_bucket(frec_len(r));
nb_off = bucket_off(ftable_offset(tdb, ftable), bucket);
tdb_access_release(tdb, r);
/* We may be violating lock order here, so best effort. */ /* We may be violating lock order here, so best effort. */
if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) {
add_stat(tdb, alloc_coalesce_lockfail, 1);
break; break;
}
/* Now we have lock, re-check. */ /* Now we have lock, re-check. */
r = tdb_get(tdb, end, &pad, sizeof(pad)); if (tdb_read_convert(tdb, end, &rec, sizeof(rec))) {
if (!r) {
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
goto err; goto err;
} }
if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) { if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
add_stat(tdb, alloc_coalesce_race, 1);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
break; break;
} }
if (unlikely(bucket_off(frec_flist(r), if (unlikely(frec_ftable(&rec) != ftable)
size_to_bucket(r->data_len)) || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
!= nb_off)) { add_stat(tdb, alloc_coalesce_race, 1);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
break; break;
} }
if (remove_from_list(tdb, nb_off, end, r) == -1) { if (remove_from_list(tdb, nb_off, end, &rec) == -1) {
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
goto err; goto err;
} }
end += sizeof(struct tdb_used_record) + r->data_len; end += sizeof(struct tdb_used_record) + frec_len(&rec);
tdb_unlock_free_bucket(tdb, nb_off); tdb_unlock_free_bucket(tdb, nb_off);
add_stat(tdb, alloc_coalesce_num_merged, 1);
} }
/* Didn't find any adjacent free? */ /* Didn't find any adjacent free? */
if (end == off + sizeof(struct tdb_used_record) + data_len) if (end == off + sizeof(struct tdb_used_record) + data_len)
return 0; return 0;
/* OK, expand record */ /* OK, expand initial record */
r = tdb_get(tdb, off, &pad, sizeof(pad)); if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
if (!r)
goto err; goto err;
if (r->data_len != data_len) { if (frec_len(&rec) != data_len) {
tdb->ecode = TDB_ERR_CORRUPT; tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "coalesce: expected data len %zu not %zu",
"coalesce: expected data len %llu not %llu\n", (size_t)data_len, (size_t)frec_len(&rec));
(long long)data_len, (long long)r->data_len);
goto err; goto err;
} }
if (remove_from_list(tdb, b_off, off, r) == -1) if (remove_from_list(tdb, b_off, off, &rec) == -1)
goto err;
r = tdb_access_write(tdb, off, sizeof(*r), true);
if (!r)
goto err; goto err;
/* We have to drop this to avoid deadlocks, so make sure record /* We have to drop this to avoid deadlocks, so make sure record
* doesn't get coalesced by someone else! */ * doesn't get coalesced by someone else! */
r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL); rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL))
r->data_len = end - off - sizeof(struct tdb_used_record); | (end - off - sizeof(struct tdb_used_record));
if (tdb_access_commit(tdb, r) != 0) if (tdb_write_off(tdb, off + offsetof(struct tdb_free_record,
ftable_and_len),
rec.ftable_and_len) != 0)
goto err; goto err;
add_stat(tdb, alloc_coalesce_succeeded, 1);
tdb_unlock_free_bucket(tdb, b_off); tdb_unlock_free_bucket(tdb, b_off);
if (add_free_record(tdb, off, end - off) == -1) if (add_free_record(tdb, off, end - off) == -1)
...@@ -333,19 +366,21 @@ err: ...@@ -333,19 +366,21 @@ err:
/* We need size bytes to put our key and data in. */ /* We need size bytes to put our key and data in. */
static tdb_off_t lock_and_alloc(struct tdb_context *tdb, static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t ftable_off,
tdb_off_t bucket, tdb_off_t bucket,
size_t keylen, size_t datalen, size_t keylen, size_t datalen,
bool want_extra, bool want_extra,
unsigned magic,
unsigned hashlow) unsigned hashlow)
{ {
tdb_off_t off, b_off,best_off; tdb_off_t off, b_off,best_off;
struct tdb_free_record pad, best = { 0 }, *r; struct tdb_free_record best = { 0 };
double multiplier; double multiplier;
size_t size = adjust_size(keylen, datalen); size_t size = adjust_size(keylen, datalen);
add_stat(tdb, allocs, 1);
again: again:
b_off = bucket_off(flist_off, bucket); b_off = bucket_off(ftable_off, bucket);
/* FIXME: Try non-blocking wait first, to measure contention. */ /* FIXME: Try non-blocking wait first, to measure contention. */
/* Lock this bucket. */ /* Lock this bucket. */
...@@ -353,7 +388,7 @@ again: ...@@ -353,7 +388,7 @@ again:
return TDB_OFF_ERR; return TDB_OFF_ERR;
} }
best.data_len = -1ULL; best.ftable_and_len = -1ULL;
best_off = 0; best_off = 0;
/* Get slack if we're after extra. */ /* Get slack if we're after extra. */
...@@ -369,30 +404,40 @@ again: ...@@ -369,30 +404,40 @@ again:
goto unlock_err; goto unlock_err;
while (off) { while (off) {
/* FIXME: Does tdb_get win anything here? */ const struct tdb_free_record *r;
r = tdb_get(tdb, off, &pad, sizeof(*r)); tdb_len_t len;
tdb_off_t next;
r = tdb_access_read(tdb, off, sizeof(*r), true);
if (!r) if (!r)
goto unlock_err; goto unlock_err;
if (frec_magic(r) != TDB_FREE_MAGIC) { if (frec_magic(r) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_access_release(tdb, r);
"lock_and_alloc: %llu non-free 0x%llx\n", tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
(long long)off, (long long)r->magic_and_meta); "lock_and_alloc: %llu non-free 0x%llx",
(long long)off, (long long)r->magic_and_prev);
goto unlock_err; goto unlock_err;
} }
if (r->data_len >= size && r->data_len < best.data_len) { if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
best_off = off; best_off = off;
best = *r; best = *r;
} }
if (best.data_len < size * multiplier && best_off) if (frec_len(&best) < size * multiplier && best_off) {
tdb_access_release(tdb, r);
break; break;
}
multiplier *= 1.01; multiplier *= 1.01;
next = r->next;
len = frec_len(r);
tdb_access_release(tdb, r);
/* Since we're going slow anyway, try coalescing here. */ /* Since we're going slow anyway, try coalescing here. */
switch (coalesce(tdb, off, b_off, r->data_len)) { switch (coalesce(tdb, off, b_off, len)) {
case -1: case -1:
/* This has already unlocked on error. */ /* This has already unlocked on error. */
return -1; return -1;
...@@ -400,7 +445,7 @@ again: ...@@ -400,7 +445,7 @@ again:
/* This has unlocked list, restart. */ /* This has unlocked list, restart. */
goto again; goto again;
} }
off = r->next; off = next;
} }
/* If we found anything at all, use it. */ /* If we found anything at all, use it. */
...@@ -413,28 +458,30 @@ again: ...@@ -413,28 +458,30 @@ again:
goto unlock_err; goto unlock_err;
leftover = record_leftover(keylen, datalen, want_extra, leftover = record_leftover(keylen, datalen, want_extra,
best.data_len); frec_len(&best));
assert(keylen + datalen + leftover <= best.data_len); assert(keylen + datalen + leftover <= frec_len(&best));
/* We need to mark non-free before we drop lock, otherwise /* We need to mark non-free before we drop lock, otherwise
* coalesce() could try to merge it! */ * coalesce() could try to merge it! */
if (set_header(tdb, &rec, keylen, datalen, if (set_header(tdb, &rec, magic, keylen, datalen,
best.data_len - leftover, frec_len(&best) - leftover, hashlow) != 0)
hashlow) != 0)
goto unlock_err; goto unlock_err;
if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0) if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
goto unlock_err; goto unlock_err;
tdb_unlock_free_bucket(tdb, b_off); /* Bucket of leftover will be <= current bucket, so nested
* locking is allowed. */
if (leftover) { if (leftover) {
add_stat(tdb, alloc_leftover, 1);
if (add_free_record(tdb, if (add_free_record(tdb,
best_off + sizeof(rec) best_off + sizeof(rec)
+ best.data_len - leftover, + frec_len(&best) - leftover,
leftover)) leftover))
return TDB_OFF_ERR; best_off = TDB_OFF_ERR;
} }
tdb_unlock_free_bucket(tdb, b_off);
return best_off; return best_off;
} }
...@@ -449,10 +496,10 @@ unlock_err: ...@@ -449,10 +496,10 @@ unlock_err:
/* Get a free block from current free list, or 0 if none. */ /* Get a free block from current free list, or 0 if none. */
static tdb_off_t get_free(struct tdb_context *tdb, static tdb_off_t get_free(struct tdb_context *tdb,
size_t keylen, size_t datalen, bool want_extra, size_t keylen, size_t datalen, bool want_extra,
unsigned hashlow) unsigned magic, unsigned hashlow)
{ {
tdb_off_t off, flist; tdb_off_t off, ftable_off;
unsigned start_b, b; unsigned start_b, b, ftable;
bool wrapped = false; bool wrapped = false;
/* If they are growing, add 50% to get to higher bucket. */ /* If they are growing, add 50% to get to higher bucket. */
...@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb, ...@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
else else
start_b = size_to_bucket(adjust_size(keylen, datalen)); start_b = size_to_bucket(adjust_size(keylen, datalen));
flist = tdb->flist_off; ftable_off = tdb->ftable_off;
while (!wrapped || flist != tdb->flist_off) { ftable = tdb->ftable;
while (!wrapped || ftable_off != tdb->ftable_off) {
/* Start at exact size bucket, and search up... */ /* Start at exact size bucket, and search up... */
for (b = find_free_head(tdb, flist, start_b); for (b = find_free_head(tdb, ftable_off, start_b);
b < TDB_FREE_BUCKETS; b < TDB_FREE_BUCKETS;
b = find_free_head(tdb, flist, b + 1)) { b = find_free_head(tdb, ftable_off, b + 1)) {
/* Try getting one from list. */ /* Try getting one from list. */
off = lock_and_alloc(tdb, flist, off = lock_and_alloc(tdb, ftable_off,
b, keylen, datalen, want_extra, b, keylen, datalen, want_extra,
hashlow); magic, hashlow);
if (off == TDB_OFF_ERR) if (off == TDB_OFF_ERR)
return TDB_OFF_ERR; return TDB_OFF_ERR;
if (off != 0) { if (off != 0) {
if (b == start_b)
add_stat(tdb, alloc_bucket_exact, 1);
if (b == TDB_FREE_BUCKETS - 1)
add_stat(tdb, alloc_bucket_max, 1);
/* Worked? Stay using this list. */ /* Worked? Stay using this list. */
tdb->flist_off = flist; tdb->ftable_off = ftable_off;
tdb->ftable = ftable;
return off; return off;
} }
/* Didn't work. Try next bucket. */ /* Didn't work. Try next bucket. */
} }
/* Hmm, try next list. */ /* Hmm, try next table. */
flist = next_flist(tdb, flist); ftable_off = next_ftable(tdb, ftable_off);
if (flist == 0) { ftable++;
if (ftable_off == 0) {
wrapped = true; wrapped = true;
flist = first_flist(tdb); ftable_off = first_ftable(tdb);
ftable = 0;
} }
} }
...@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb, ...@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
int set_header(struct tdb_context *tdb, int set_header(struct tdb_context *tdb,
struct tdb_used_record *rec, struct tdb_used_record *rec,
uint64_t keylen, uint64_t datalen, unsigned magic, uint64_t keylen, uint64_t datalen,
uint64_t actuallen, unsigned hashlow) uint64_t actuallen, unsigned hashlow)
{ {
uint64_t keybits = (fls64(keylen) + 1) / 2; uint64_t keybits = (fls64(keylen) + 1) / 2;
...@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb, ...@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
rec->magic_and_meta = (hashlow & ((1 << 11)-1)) rec->magic_and_meta = (hashlow & ((1 << 11)-1))
| ((actuallen - (keylen + datalen)) << 11) | ((actuallen - (keylen + datalen)) << 11)
| (keybits << 43) | (keybits << 43)
| (TDB_MAGIC << 48); | ((uint64_t)magic << 48);
rec->key_and_data_len = (keylen | (datalen << (keybits*2))); rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
/* Encoding can fail on big values. */ /* Encoding can fail on big values. */
if (rec_key_length(rec) != keylen if (rec_key_length(rec) != keylen
|| rec_data_length(rec) != datalen || rec_data_length(rec) != datalen
|| rec_extra_padding(rec) != actuallen - (keylen + datalen)) { || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "Could not encode k=%llu,d=%llu,a=%llu",
"Could not encode k=%llu,d=%llu,a=%llu\n",
(long long)keylen, (long long)datalen, (long long)keylen, (long long)datalen,
(long long)actuallen); (long long)actuallen);
return -1; return -1;
...@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* Need to hold a hash lock to expand DB: transactions rely on it. */ /* Need to hold a hash lock to expand DB: transactions rely on it. */
if (!(tdb->flags & TDB_NOLOCK) if (!(tdb->flags & TDB_NOLOCK)
&& !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) { && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_expand: must hold lock during expand\n"); "tdb_expand: must hold lock during expand");
return -1; return -1;
} }
/* always make room for at least 100 more records, and at
least 25% more space. */
if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
wanted = size * TDB_EXTENSION_FACTOR;
else
wanted = tdb->map_size / 4;
wanted = adjust_size(0, wanted);
/* Only one person can expand file at a time. */ /* Only one person can expand file at a time. */
if (tdb_lock_expand(tdb, F_WRLCK) != 0) if (tdb_lock_expand(tdb, F_WRLCK) != 0)
return -1; return -1;
...@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
return 0; return 0;
} }
if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) { if (tdb->methods->expand_file(tdb, wanted) == -1) {
tdb_unlock_expand(tdb, F_WRLCK); tdb_unlock_expand(tdb, F_WRLCK);
return -1; return -1;
} }
...@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size) ...@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* We need to drop this lock before adding free record. */ /* We need to drop this lock before adding free record. */
tdb_unlock_expand(tdb, F_WRLCK); tdb_unlock_expand(tdb, F_WRLCK);
return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR); add_stat(tdb, expands, 1);
return add_free_record(tdb, old_size, wanted);
} }
/* This won't fail: it will expand the database if it has to. */ /* This won't fail: it will expand the database if it has to. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing) uint64_t hash, unsigned magic, bool growing)
{ {
tdb_off_t off; tdb_off_t off;
...@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, ...@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
assert(!tdb->direct_access); assert(!tdb->direct_access);
for (;;) { for (;;) {
off = get_free(tdb, keylen, datalen, growing, hash); off = get_free(tdb, keylen, datalen, growing, magic, hash);
if (likely(off != 0)) if (likely(off != 0))
break; break;
......
...@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len) ...@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off) uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
{ {
struct tdb_used_record pad, *r; const struct tdb_used_record *r;
const void *key; const void *key;
uint64_t klen, hash; uint64_t klen, hash;
r = tdb_get(tdb, off, &pad, sizeof(pad)); r = tdb_access_read(tdb, off, sizeof(*r), true);
if (!r) if (!r)
/* FIXME */ /* FIXME */
return 0; return 0;
klen = rec_key_length(r); klen = rec_key_length(r);
key = tdb_access_read(tdb, off + sizeof(pad), klen, false); tdb_access_release(tdb, r);
key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
if (!key) if (!key)
return 0; return 0;
...@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num) ...@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
return bits(h->h, 64 - h->hash_used, num); return bits(h->h, 64 - h->hash_used, num);
} }
static bool key_matches(struct tdb_context *tdb,
const struct tdb_used_record *rec,
tdb_off_t off,
const struct tdb_data *key)
{
bool ret = false;
const char *rkey;
if (rec_key_length(rec) != key->dsize) {
add_stat(tdb, compare_wrong_keylen, 1);
return ret;
}
rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
if (!rkey)
return ret;
if (memcmp(rkey, key->dptr, key->dsize) == 0)
ret = true;
else
add_stat(tdb, compare_wrong_keycmp, 1);
tdb_access_release(tdb, rkey);
return ret;
}
/* Does entry match? */ /* Does entry match? */
static bool match(struct tdb_context *tdb, static bool match(struct tdb_context *tdb,
struct hash_info *h, struct hash_info *h,
...@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb, ...@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
tdb_off_t val, tdb_off_t val,
struct tdb_used_record *rec) struct tdb_used_record *rec)
{ {
bool ret;
const unsigned char *rkey;
tdb_off_t off; tdb_off_t off;
/* FIXME: Handle hash value truncated. */ add_stat(tdb, compares, 1);
if (bits(val, TDB_OFF_HASH_TRUNCATED_BIT, 1))
abort();
/* Desired bucket must match. */ /* Desired bucket must match. */
if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
add_stat(tdb, compare_wrong_bucket, 1);
return false; return false;
}
/* Top bits of offset == next bits of hash. */ /* Top bits of offset == next bits of hash. */
if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA) if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
!= bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, != bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
TDB_OFF_UPPER_STEAL_EXTRA)) TDB_OFF_UPPER_STEAL_EXTRA)) {
add_stat(tdb, compare_wrong_offsetbits, 1);
return false; return false;
}
off = val & TDB_OFF_MASK; off = val & TDB_OFF_MASK;
if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1) if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
return false; return false;
/* FIXME: check extra bits in header? */ if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
if (rec_key_length(rec) != key->dsize) add_stat(tdb, compare_wrong_rechash, 1);
return false; return false;
}
rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false); return key_matches(tdb, rec, off, key);
if (!rkey)
return false;
ret = (memcmp(rkey, key->dptr, key->dsize) == 0);
tdb_access_release(tdb, rkey);
return ret;
} }
static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
...@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) ...@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); + (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
} }
/* Truncated hashes can't be all 1: that's how we spot a sub-hash */
bool is_subhash(tdb_off_t val) bool is_subhash(tdb_off_t val)
{ {
return val >> (64-TDB_OFF_UPPER_STEAL) == (1<<TDB_OFF_UPPER_STEAL) - 1; return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
} }
/* FIXME: Guess the depth, don't over-lock! */ /* FIXME: Guess the depth, don't over-lock! */
...@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size) ...@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
} }
static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
struct tdb_data key,
tdb_off_t chain,
struct hash_info *h,
struct tdb_used_record *rec,
struct traverse_info *tinfo)
{
tdb_off_t off, next;
/* In case nothing is free, we set these to zero. */
h->home_bucket = h->found_bucket = 0;
for (off = chain; off; off = next) {
unsigned int i;
h->group_start = off;
if (tdb_read_convert(tdb, off, h->group, sizeof(h->group)))
return TDB_OFF_ERR;
for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
tdb_off_t recoff;
if (!h->group[i]) {
/* Remember this empty bucket. */
h->home_bucket = h->found_bucket = i;
continue;
}
/* We can insert extra bits via add_to_hash
* empty bucket logic. */
recoff = h->group[i] & TDB_OFF_MASK;
if (tdb_read_convert(tdb, recoff, rec, sizeof(*rec)))
return TDB_OFF_ERR;
if (key_matches(tdb, rec, recoff, &key)) {
h->home_bucket = h->found_bucket = i;
if (tinfo) {
tinfo->levels[tinfo->num_levels]
.hashtable = off;
tinfo->levels[tinfo->num_levels]
.total_buckets
= 1 << TDB_HASH_GROUP_BITS;
tinfo->levels[tinfo->num_levels].entry
= i;
tinfo->num_levels++;
}
return recoff;
}
}
next = tdb_read_off(tdb, off
+ offsetof(struct tdb_chain, next));
if (next == TDB_OFF_ERR)
return TDB_OFF_ERR;
if (next)
next += sizeof(struct tdb_used_record);
}
return 0;
}
/* This is the core routine which searches the hashtable for an entry. /* This is the core routine which searches the hashtable for an entry.
* On error, no locks are held and TDB_OFF_ERR is returned. * On error, no locks are held and TDB_OFF_ERR is returned.
* Otherwise, hinfo is filled in (and the optional tinfo). * Otherwise, hinfo is filled in (and the optional tinfo).
...@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb, ...@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS; tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
} }
while (likely(h->hash_used < 64)) { while (h->hash_used <= 64) {
/* Read in the hash group. */ /* Read in the hash group. */
h->group_start = hashtable h->group_start = hashtable
+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
...@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb, ...@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
return 0; return 0;
} }
/* FIXME: We hit the bottom. Chain! */ return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
abort();
fail: fail:
tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype); tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
...@@ -239,8 +317,8 @@ fail: ...@@ -239,8 +317,8 @@ fail:
/* I wrote a simple test, expanding a hash to 2GB, for the following /* I wrote a simple test, expanding a hash to 2GB, for the following
* cases: * cases:
* 1) Expanding all the buckets at once, * 1) Expanding all the buckets at once,
* 2) Expanding the most-populated bucket, * 2) Expanding the bucket we wanted to place the new entry into.
* 3) Expanding the bucket we wanted to place the new entry ito. * 3) Expanding the most-populated bucket,
* *
* I measured the worst/average/best density during this process. * I measured the worst/average/best density during this process.
* 1) 3%/16%/30% * 1) 3%/16%/30%
...@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb, ...@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
encode_offset(new_off, h)); encode_offset(new_off, h));
} }
/* We slot in anywhere that's empty in the chain. */
static int COLD add_to_chain(struct tdb_context *tdb,
tdb_off_t subhash,
tdb_off_t new_off)
{
size_t entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
if (entry == 1 << TDB_HASH_GROUP_BITS) {
tdb_off_t next;
next = tdb_read_off(tdb, subhash
+ offsetof(struct tdb_chain, next));
if (next == TDB_OFF_ERR)
return -1;
if (!next) {
next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
TDB_CHAIN_MAGIC, false);
if (next == TDB_OFF_ERR)
return -1;
if (zero_out(tdb, next+sizeof(struct tdb_used_record),
sizeof(struct tdb_chain)))
return -1;
if (tdb_write_off(tdb, subhash
+ offsetof(struct tdb_chain, next),
next) != 0)
return -1;
}
return add_to_chain(tdb, next, new_off);
}
return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
new_off);
}
/* Add into a newly created subhash. */ /* Add into a newly created subhash. */
static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
unsigned hash_used, tdb_off_t val) unsigned hash_used, tdb_off_t val)
...@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, ...@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
h.hash_used = hash_used; h.hash_used = hash_used;
/* FIXME chain if hash_used == 64 */
if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64) if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
abort(); return add_to_chain(tdb, subhash, off);
/* FIXME: Do truncated hash bits if we can! */
h.h = hash_record(tdb, off); h.h = hash_record(tdb, off);
gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS); gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
h.group_start = subhash + sizeof(struct tdb_used_record) h.group_start = subhash
+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS); h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
...@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, ...@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
static int expand_group(struct tdb_context *tdb, struct hash_info *h) static int expand_group(struct tdb_context *tdb, struct hash_info *h)
{ {
unsigned bucket, num_vals, i; unsigned bucket, num_vals, i, magic;
size_t subsize;
tdb_off_t subhash; tdb_off_t subhash;
tdb_off_t vals[1 << TDB_HASH_GROUP_BITS]; tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
/* Attach new empty subhash under fullest bucket. */ /* Attach new empty subhash under fullest bucket. */
bucket = fullest_bucket(tdb, h->group, h->home_bucket); bucket = fullest_bucket(tdb, h->group, h->home_bucket);
subhash = alloc(tdb, 0, sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS, if (h->hash_used == 64) {
0, false); add_stat(tdb, alloc_chain, 1);
subsize = sizeof(struct tdb_chain);
magic = TDB_CHAIN_MAGIC;
} else {
add_stat(tdb, alloc_subhash, 1);
subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
magic = TDB_HTABLE_MAGIC;
}
subhash = alloc(tdb, 0, subsize, 0, magic, false);
if (subhash == TDB_OFF_ERR) if (subhash == TDB_OFF_ERR)
return -1; return -1;
if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), subsize))
sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) == -1)
return -1; return -1;
/* Remove any which are destined for bucket or are in wrong place. */ /* Remove any which are destined for bucket or are in wrong place. */
...@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h) ...@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
/* assert(num_vals); */ /* assert(num_vals); */
/* Overwrite expanded bucket with subhash pointer. */ /* Overwrite expanded bucket with subhash pointer. */
h->group[bucket] = subhash | ~((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1); h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
/* Point to actual contents of record. */
subhash += sizeof(struct tdb_used_record);
/* Put values back. */ /* Put values back. */
for (i = 0; i < num_vals; i++) { for (i = 0; i < num_vals; i++) {
...@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h) ...@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off) int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
{ {
/* FIXME: chain! */
if (h->hash_used >= 64)
abort();
/* We hit an empty bucket during search? That's where it goes. */ /* We hit an empty bucket during search? That's where it goes. */
if (!h->group[h->found_bucket]) { if (!h->group[h->found_bucket]) {
h->group[h->found_bucket] = encode_offset(new_off, h); h->group[h->found_bucket] = encode_offset(new_off, h);
...@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off) ...@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
h->group, sizeof(h->group)); h->group, sizeof(h->group));
} }
if (h->hash_used > 64)
return add_to_chain(tdb, h->group_start, new_off);
/* We're full. Expand. */ /* We're full. Expand. */
if (expand_group(tdb, h) == -1) if (expand_group(tdb, h) == -1)
return -1; return -1;
...@@ -523,7 +645,11 @@ again: ...@@ -523,7 +645,11 @@ again:
tlevel++; tlevel++;
tlevel->hashtable = off + sizeof(struct tdb_used_record); tlevel->hashtable = off + sizeof(struct tdb_used_record);
tlevel->entry = 0; tlevel->entry = 0;
tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS); /* Next level is a chain? */
if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
else
tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
goto again; goto again;
} }
...@@ -531,6 +657,20 @@ again: ...@@ -531,6 +657,20 @@ again:
if (tinfo->num_levels == 1) if (tinfo->num_levels == 1)
return 0; return 0;
/* Handle chained entries. */
if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+ offsetof(struct tdb_chain,
next));
if (tlevel->hashtable == TDB_OFF_ERR)
return TDB_OFF_ERR;
if (tlevel->hashtable) {
tlevel->hashtable += sizeof(struct tdb_used_record);
tlevel->entry = 0;
goto again;
}
}
/* Go back up and keep searching. */ /* Go back up and keep searching. */
tinfo->num_levels--; tinfo->num_levels--;
tlevel--; tlevel--;
...@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype, ...@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
ltype); ltype);
return -1; return -1;
} }
if (rec_magic(&rec) != TDB_MAGIC) { if (rec_magic(&rec) != TDB_USED_MAGIC) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT,
"next_in_hash:" TDB_DEBUG_FATAL,
" corrupt record at %llu\n", "next_in_hash:"
(long long)off); " corrupt record at %llu",
(long long)off);
return -1; return -1;
} }
......
...@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb) ...@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
*/ */
if (tdb->map_ptr == MAP_FAILED) { if (tdb->map_ptr == MAP_FAILED) {
tdb->map_ptr = NULL; tdb->map_ptr = NULL;
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
"tdb_mmap failed for size %lld (%s)\n", "tdb_mmap failed for size %lld (%s)",
(long long)tdb->map_size, strerror(errno)); (long long)tdb->map_size, strerror(errno));
} }
} }
...@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb) ...@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
{ {
struct stat st; struct stat st;
int ret;
/* We can't hold pointers during this: we could unmap! */ /* We can't hold pointers during this: we could unmap! */
assert(!tdb->direct_access assert(!tdb->direct_access
...@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
if (!probe) { if (!probe) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO;
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"tdb_oob len %lld beyond internal" "tdb_oob len %lld beyond internal"
" malloc size %lld\n", " malloc size %lld",
(long long)len, (long long)len,
(long long)tdb->map_size); (long long)tdb->map_size);
} }
...@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
if (tdb_lock_expand(tdb, F_RDLCK) != 0) if (tdb_lock_expand(tdb, F_RDLCK) != 0)
return -1; return -1;
ret = fstat(tdb->fd, &st); if (fstat(tdb->fd, &st) != 0) {
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb_unlock_expand(tdb, F_RDLCK); "Failed to fstat file: %s", strerror(errno));
tdb_unlock_expand(tdb, F_RDLCK);
if (ret == -1) {
tdb->ecode = TDB_ERR_IO;
return -1; return -1;
} }
tdb_unlock_expand(tdb, F_RDLCK);
if (st.st_size < (size_t)len) { if (st.st_size < (size_t)len) {
if (!probe) { if (!probe) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO; "tdb_oob len %zu beyond eof at %zu",
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, (size_t)len, st.st_size);
"tdb_oob len %lld beyond eof at %lld\n",
(long long)len, (long long)st.st_size);
} }
return -1; return -1;
} }
...@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
} }
/* Either make a copy into pad and return that, or return ptr into mmap. */
/* Note: pad has to be a real object, so we can't get here if len
* overflows size_t */
void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
{
if (likely(!(tdb->flags & TDB_CONVERT))) {
void *ret = tdb->methods->direct(tdb, off, len);
if (ret)
return ret;
}
return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
}
/* Endian conversion: we only ever deal with 8 byte quantities */ /* Endian conversion: we only ever deal with 8 byte quantities */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size) void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
{ {
...@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, ...@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
{ {
char buf[8192] = { 0 }; char buf[8192] = { 0 };
void *p = tdb->methods->direct(tdb, off, len); void *p = tdb->methods->direct(tdb, off, len, true);
assert(!tdb->read_only);
if (p) { if (p) {
memset(p, 0, len); memset(p, 0, len);
return 0; return 0;
...@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) ...@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off) tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
{ {
tdb_off_t pad, *ret; tdb_off_t ret;
ret = tdb_get(tdb, off, &pad, sizeof(pad)); if (likely(!(tdb->flags & TDB_CONVERT))) {
if (!ret) { tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
return TDB_OFF_ERR; false);
if (p)
return *p;
} }
return *ret;
if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
return TDB_OFF_ERR;
return ret;
} }
/* Even on files, we can get partial writes due to signals. */ /* Even on files, we can get partial writes due to signals. */
...@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len) ...@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
static int tdb_write(struct tdb_context *tdb, tdb_off_t off, static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
const void *buf, tdb_len_t len) const void *buf, tdb_len_t len)
{ {
if (len == 0) {
return 0;
}
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return -1; return -1;
} }
/* FIXME: Bogus optimization? */
if (len == 0) {
return 0;
}
if (tdb->methods->oob(tdb, off + len, 0) != 0) if (tdb->methods->oob(tdb, off + len, 0) != 0)
return -1; return -1;
...@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
memcpy(off + (char *)tdb->map_ptr, buf, len); memcpy(off + (char *)tdb->map_ptr, buf, len);
} else { } else {
if (!tdb_pwrite_all(tdb->fd, buf, len, off)) { if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_write failed at %zu len=%zu (%s)",
"tdb_write failed at %llu len=%llu (%s)\n", (size_t)off, (size_t)len, strerror(errno));
(long long)off, (long long)len,
strerror(errno));
return -1; return -1;
} }
} }
...@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, ...@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
memcpy(buf, off + (char *)tdb->map_ptr, len); memcpy(buf, off + (char *)tdb->map_ptr, len);
} else { } else {
if (!tdb_pread_all(tdb->fd, buf, len, off)) { if (!tdb_pread_all(tdb->fd, buf, len, off)) {
/* Ensure ecode is set for log fn. */ tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->ecode = TDB_ERR_IO; "tdb_read failed at %zu "
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "len=%zu (%s) map_size=%zu",
"tdb_read failed at %lld " (size_t)off, (size_t)len,
"len=%lld (%s) map_size=%lld\n",
(long long)off, (long long)len,
strerror(errno), strerror(errno),
(long long)tdb->map_size); (size_t)tdb->map_size);
return -1; return -1;
} }
} }
...@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
if (unlikely((tdb->flags & TDB_CONVERT))) { if (unlikely((tdb->flags & TDB_CONVERT))) {
void *conv = malloc(len); void *conv = malloc(len);
if (!conv) { if (!conv) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_write: no memory converting"
"tdb_write: no memory converting %zu bytes\n", " %zu bytes", len);
len);
return -1; return -1;
} }
memcpy(conv, rec, len); memcpy(conv, rec, len);
...@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val) int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
{ {
if (tdb->read_only) {
tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return -1;
}
if (likely(!(tdb->flags & TDB_CONVERT))) {
tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
true);
if (p) {
*p = val;
return 0;
}
}
return tdb_write_convert(tdb, off, &val, sizeof(val)); return tdb_write_convert(tdb, off, &val, sizeof(val));
} }
...@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, ...@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
/* some systems don't like zero length malloc */ /* some systems don't like zero length malloc */
buf = malloc(prefix + len ? prefix + len : 1); buf = malloc(prefix + len ? prefix + len : 1);
if (unlikely(!buf)) { if (!buf) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_alloc_read malloc failed len=%zu",
"tdb_alloc_read malloc failed len=%lld\n", (size_t)(prefix + len));
(long long)prefix + len); } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) { len) == -1)) {
free(buf); free(buf);
buf = NULL; buf = NULL;
} }
...@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb, ...@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
size_t n = len > size ? size : len; size_t n = len > size ? size : len;
if (!tdb_pwrite_all(tdb->fd, buf, n, off)) { if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "fill write failed: giving up!");
"fill write failed: giving up!\n");
return -1; return -1;
} }
len -= n; len -= n;
...@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
char buf[8192]; char buf[8192];
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Expand on read-only database");
return -1; return -1;
} }
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
char *new = realloc(tdb->map_ptr, tdb->map_size + addition); char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
if (!new) { if (!new) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"No memory to expand database");
return -1; return -1;
} }
tdb->map_ptr = new; tdb->map_ptr = new;
...@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
file isn't sparse, which would be very bad if we ran out of file isn't sparse, which would be very bad if we ran out of
disk. This must be done with write, not via mmap */ disk. This must be done with write, not via mmap */
memset(buf, 0x43, sizeof(buf)); memset(buf, 0x43, sizeof(buf));
if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1) if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
return -1; return -1;
tdb->map_size += addition; tdb->map_size += addition;
tdb_mmap(tdb); tdb_mmap(tdb);
...@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition) ...@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
return 0; return 0;
} }
/* This is only neded for tdb_access_commit, but used everywhere to simplify. */
struct tdb_access_hdr {
tdb_off_t off;
tdb_len_t len;
bool convert;
};
const void *tdb_access_read(struct tdb_context *tdb, const void *tdb_access_read(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len, bool convert) tdb_off_t off, tdb_len_t len, bool convert)
{ {
const void *ret = NULL; const void *ret = NULL;
if (likely(!(tdb->flags & TDB_CONVERT))) if (likely(!(tdb->flags & TDB_CONVERT)))
ret = tdb->methods->direct(tdb, off, len); ret = tdb->methods->direct(tdb, off, len, false);
if (!ret) { if (!ret) {
struct tdb_access_hdr *hdr; struct tdb_access_hdr *hdr;
hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
if (hdr) { if (hdr) {
hdr->next = tdb->access;
tdb->access = hdr;
ret = hdr + 1; ret = hdr + 1;
if (convert) if (convert)
tdb_convert(tdb, (void *)ret, len); tdb_convert(tdb, (void *)ret, len);
...@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb, ...@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
{ {
void *ret = NULL; void *ret = NULL;
if (tdb->read_only) {
tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write to read-only database");
return NULL;
}
if (likely(!(tdb->flags & TDB_CONVERT))) if (likely(!(tdb->flags & TDB_CONVERT)))
ret = tdb->methods->direct(tdb, off, len); ret = tdb->methods->direct(tdb, off, len, true);
if (!ret) { if (!ret) {
struct tdb_access_hdr *hdr; struct tdb_access_hdr *hdr;
hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
if (hdr) { if (hdr) {
hdr->next = tdb->access;
tdb->access = hdr;
hdr->off = off; hdr->off = off;
hdr->len = len; hdr->len = len;
hdr->convert = convert; hdr->convert = convert;
...@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb, ...@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
return ret; return ret;
} }
static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
{
struct tdb_access_hdr **hp;
for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
if (*hp + 1 == p)
return hp;
}
return NULL;
}
void tdb_access_release(struct tdb_context *tdb, const void *p) void tdb_access_release(struct tdb_context *tdb, const void *p)
{ {
if (!tdb->map_ptr struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
|| (char *)p < (char *)tdb->map_ptr
|| (char *)p >= (char *)tdb->map_ptr + tdb->map_size) if (hp) {
free((struct tdb_access_hdr *)p - 1); hdr = *hp;
else *hp = hdr->next;
free(hdr);
} else
tdb->direct_access--; tdb->direct_access--;
} }
int tdb_access_commit(struct tdb_context *tdb, void *p) int tdb_access_commit(struct tdb_context *tdb, void *p)
{ {
struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
int ret = 0; int ret = 0;
if (!tdb->map_ptr if (hp) {
|| (char *)p < (char *)tdb->map_ptr hdr = *hp;
|| (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
struct tdb_access_hdr *hdr;
hdr = (struct tdb_access_hdr *)p - 1;
if (hdr->convert) if (hdr->convert)
ret = tdb_write_convert(tdb, hdr->off, p, hdr->len); ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
else else
ret = tdb_write(tdb, hdr->off, p, hdr->len); ret = tdb_write(tdb, hdr->off, p, hdr->len);
*hp = hdr->next;
free(hdr); free(hdr);
} else } else
tdb->direct_access--; tdb->direct_access--;
...@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p) ...@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
return ret; return ret;
} }
static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len) static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
bool write)
{ {
if (unlikely(!tdb->map_ptr)) if (unlikely(!tdb->map_ptr))
return NULL; return NULL;
...@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len) ...@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
return (char *)tdb->map_ptr + off; return (char *)tdb->map_ptr + off;
} }
void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
{
if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
*stat += val;
}
static const struct tdb_methods io_methods = { static const struct tdb_methods io_methods = {
tdb_read, tdb_read,
tdb_write, tdb_write,
......
...@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb, ...@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
fl.l_len = len; fl.l_len = len;
fl.l_pid = 0; fl.l_pid = 0;
add_stat(tdb, lock_lowlevel, 1);
if (waitflag) if (waitflag)
return fcntl(tdb->fd, F_SETLKW, &fl); return fcntl(tdb->fd, F_SETLKW, &fl);
else else {
add_stat(tdb, lock_nonblock, 1);
return fcntl(tdb->fd, F_SETLK, &fl); return fcntl(tdb->fd, F_SETLK, &fl);
}
} }
static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
...@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) ...@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
} }
if (!found) { if (!found) {
fprintf(stderr, "Unlock on %u@%u not found!\n", fprintf(stderr, "Unlock on %u@%u not found!",
(int)off, (int)len); (int)off, (int)len);
abort(); abort();
} }
...@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb, ...@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
} }
if (rw_type == F_WRLCK && tdb->read_only) { if (rw_type == F_WRLCK && tdb->read_only) {
tdb->ecode = TDB_ERR_RDONLY; tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
"Write lock attempted on read-only database");
return -1; return -1;
} }
/* A 32 bit system cannot open a 64-bit file, but it could have /* A 32 bit system cannot open a 64-bit file, but it could have
* expanded since then: check here. */ * expanded since then: check here. */
if ((size_t)(offset + len) != offset + len) { if ((size_t)(offset + len) != offset + len) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_brlock: lock on giant offset %llu",
"tdb_brlock: lock on giant offset %llu\n",
(long long)(offset + len)); (long long)(offset + len));
return -1; return -1;
} }
...@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb, ...@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
* EAGAIN is an expected return from non-blocking * EAGAIN is an expected return from non-blocking
* locks. */ * locks. */
if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) { if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_brlock failed (fd=%d) at" "tdb_brlock failed (fd=%d) at"
" offset %llu rw_type=%d flags=%d len=%llu\n", " offset %zu rw_type=%d flags=%d len=%zu:"
tdb->fd, (long long)offset, rw_type, " %s",
flags, (long long)len); tdb->fd, (size_t)offset, rw_type,
flags, (size_t)len, strerror(errno));
} }
return -1; return -1;
} }
...@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb, ...@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
} while (ret == -1 && errno == EINTR); } while (ret == -1 && errno == EINTR);
if (ret == -1) { if (ret == -1) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_TRACE,
"tdb_brunlock failed (fd=%d) at offset %llu" "tdb_brunlock failed (fd=%d) at offset %zu"
" rw_type=%d len=%llu\n", " rw_type=%d len=%zu",
tdb->fd, (long long)offset, rw_type, (long long)len); tdb->fd, (size_t)offset, rw_type, (size_t)len);
} }
return ret; return ret;
} }
...@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) ...@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
int count = 1000; int count = 1000;
if (tdb->allrecord_lock.count != 1) { if (tdb->allrecord_lock.count != 1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_allrecord_upgrade failed: count %u too high\n", "tdb_allrecord_upgrade failed: count %u too high",
tdb->allrecord_lock.count); tdb->allrecord_lock.count);
return -1; return -1;
} }
if (tdb->allrecord_lock.off != 1) { if (tdb->allrecord_lock.off != 1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
"tdb_allrecord_upgrade failed: already upgraded?\n"); "tdb_allrecord_upgrade failed: already upgraded?");
return -1; return -1;
} }
...@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) ...@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
tv.tv_usec = 1; tv.tv_usec = 1;
select(0, NULL, NULL, NULL, &tv); select(0, NULL, NULL, NULL, &tv);
} }
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_WARNING,
"tdb_allrecord_upgrade failed\n"); "tdb_allrecord_upgrade failed");
return -1; return -1;
} }
...@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
struct tdb_lock_type *new_lck; struct tdb_lock_type *new_lck;
if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) { if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: invalid offset %zu ltype=%d",
"tdb_nest_lock: invalid offset %llu ltype=%d\n", (size_t)offset, ltype);
(long long)offset, ltype);
return -1; return -1;
} }
if (tdb->flags & TDB_NOLOCK) if (tdb->flags & TDB_NOLOCK)
return 0; return 0;
add_stat(tdb, locks, 1);
new_lck = find_nestlock(tdb, offset); new_lck = find_nestlock(tdb, offset);
if (new_lck) { if (new_lck) {
if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) { if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: offset %zu has read lock",
"tdb_nest_lock: offset %llu has read lock\n", (size_t)offset);
(long long)offset);
return -1; return -1;
} }
/* Just increment the struct, posix locks don't stack. */ /* Just increment the struct, posix locks don't stack. */
...@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
if (tdb->num_lockrecs if (tdb->num_lockrecs
&& offset >= TDB_HASH_LOCK_START && offset >= TDB_HASH_LOCK_START
&& offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) { && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_nest_lock: already have a hash lock?");
"tdb_nest_lock: already have a hash lock?\n");
return -1; return -1;
} }
...@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, ...@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
tdb->lockrecs, tdb->lockrecs,
sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1)); sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
if (new_lck == NULL) { if (new_lck == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_nest_lock: unable to allocate %zu lock struct",
"tdb_nest_lock: unable to allocate %llu lock struct", tdb->num_lockrecs + 1);
(long long)(tdb->num_lockrecs + 1));
errno = ENOMEM; errno = ENOMEM;
return -1; return -1;
} }
...@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype) ...@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
lck = find_nestlock(tdb, off); lck = find_nestlock(tdb, off);
if ((lck == NULL) || (lck->count == 0)) { if ((lck == NULL) || (lck->count == 0)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_nest_unlock: no lock for %zu", (size_t)off);
"tdb_nest_unlock: no lock for %llu\n", (long long)off);
return -1; return -1;
} }
...@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, ...@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
{ {
/* FIXME: There are no locks on read-only dbs */ /* FIXME: There are no locks on read-only dbs */
if (tdb->read_only) { if (tdb->read_only) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: read-only");
"tdb_allrecord_lock: read-only\n");
return -1; return -1;
} }
...@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, ...@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
/* a global lock of a different type exists */ /* a global lock of a different type exists */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: already have %s lock",
"tdb_allrecord_lock: already have %s lock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
if (tdb_has_hash_locks(tdb)) { if (tdb_has_hash_locks(tdb)) {
/* can't combine global and chain locks */ /* can't combine global and chain locks */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: already have chain lock");
"tdb_allrecord_lock: already have chain lock\n");
return -1; return -1;
} }
if (upgradable && ltype != F_RDLCK) { if (upgradable && ltype != F_RDLCK) {
/* tdb error: you can't upgrade a write lock! */ /* tdb error: you can't upgrade a write lock! */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_lock: can't upgrade a write lock");
"tdb_allrecord_lock: can't upgrade a write lock\n");
return -1; return -1;
} }
add_stat(tdb, locks, 1);
again: again:
/* Lock hashes, gradually. */ /* Lock hashes, gradually. */
if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START, if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
TDB_HASH_LOCK_RANGE)) { TDB_HASH_LOCK_RANGE)) {
if (!(flags & TDB_LOCK_PROBE)) { if (!(flags & TDB_LOCK_PROBE)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_allrecord_lock hashes failed (%s)\n", "tdb_allrecord_lock hashes failed");
strerror(errno));
} }
return -1; return -1;
} }
/* Lock free lists: there to end of file. */ /* Lock free tables: there to end of file. */
if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE, if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
0, flags)) { 0, flags)) {
if (!(flags & TDB_LOCK_PROBE)) { if (!(flags & TDB_LOCK_PROBE)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_allrecord_lock freelist failed (%s)\n", "tdb_allrecord_lock freetables failed");
strerror(errno));
} }
tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
TDB_HASH_LOCK_RANGE); TDB_HASH_LOCK_RANGE);
...@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype) ...@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
/* unlock entire db */ /* unlock entire db */
int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype) int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
{ {
/* FIXME: There are no locks on read-only dbs */
if (tdb->read_only) {
tdb->ecode = TDB_ERR_LOCK;
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_allrecord_unlock: read-only\n");
return -1;
}
if (tdb->allrecord_lock.count == 0) { if (tdb->allrecord_lock.count == 0) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_unlock: not locked!");
"tdb_allrecord_unlock: not locked!\n");
return -1; return -1;
} }
/* Upgradable locks are marked as write locks. */ /* Upgradable locks are marked as write locks. */
if (tdb->allrecord_lock.ltype != ltype if (tdb->allrecord_lock.ltype != ltype
&& (!tdb->allrecord_lock.off || ltype != F_RDLCK)) { && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_allrecord_unlock: have %s lock",
"tdb_allrecord_unlock: have %s lock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
...@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb, ...@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
} }
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have %s allrecordlock",
"tdb_lock_hashes: have %s allrecordlock\n", tdb->allrecord_lock.ltype == F_RDLCK
tdb->allrecord_lock.ltype == F_RDLCK ? "read" : "write");
? "read" : "write");
return -1; return -1;
} }
if (tdb_has_free_lock(tdb)) { if (tdb_has_free_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have free lock");
"tdb_lock_hashes: have free lock already\n");
return -1; return -1;
} }
if (tdb_has_expansion_lock(tdb)) { if (tdb_has_expansion_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_hashes: already have expansion lock");
"tdb_lock_hashes: have expansion lock already\n");
return -1; return -1;
} }
...@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb, ...@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
if (tdb->allrecord_lock.ltype == F_RDLCK if (tdb->allrecord_lock.ltype == F_RDLCK
&& ltype == F_WRLCK) { && ltype == F_WRLCK) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_unlock_hashes RO allrecord!");
"tdb_unlock_hashes RO allrecord!\n");
return -1; return -1;
} }
return 0; return 0;
...@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off, ...@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
if (tdb->allrecord_lock.count) { if (tdb->allrecord_lock.count) {
if (tdb->allrecord_lock.ltype == F_WRLCK) if (tdb->allrecord_lock.ltype == F_WRLCK)
return 0; return 0;
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_lock_free_bucket with RO allrecordlock!");
"tdb_lock_free_bucket with RO allrecordlock!\n");
return -1; return -1;
} }
#if 0 /* FIXME */ #if 0 /* FIXME */
if (tdb_has_expansion_lock(tdb)) { if (tdb_has_expansion_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_lock_free_bucket: already have expansion lock");
"tdb_lock_free_bucket: have expansion lock already\n");
return -1; return -1;
} }
#endif #endif
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include "config.h" #include "config.h"
#include <ccan/tdb2/tdb2.h> #include <ccan/tdb2/tdb2.h>
#include <ccan/likely/likely.h> #include <ccan/likely/likely.h>
#include <ccan/compiler/compiler.h>
#ifdef HAVE_BYTESWAP_H #ifdef HAVE_BYTESWAP_H
#include <byteswap.h> #include <byteswap.h>
#endif #endif
...@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t; ...@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
#define TDB_MAGIC_FOOD "TDB file\n" #define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7)) #define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_MAGIC ((uint64_t)0x1999) #define TDB_USED_MAGIC ((uint64_t)0x1999)
#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
#define TDB_FREE_MAGIC ((uint64_t)0xFE) #define TDB_FREE_MAGIC ((uint64_t)0xFE)
#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL) #define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL) #define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL) #define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
...@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t; ...@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
#define TDB_SUBLEVEL_HASH_BITS 6 #define TDB_SUBLEVEL_HASH_BITS 6
/* And 8 entries in each group, ie 8 groups per sublevel. */ /* And 8 entries in each group, ie 8 groups per sublevel. */
#define TDB_HASH_GROUP_BITS 3 #define TDB_HASH_GROUP_BITS 3
/* This is currently 10: beyond this we chain. */
#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
/* Extend file by least 32 times larger than needed. */ /* Extend file by least 100 times larger than needed. */
#define TDB_EXTENSION_FACTOR 32 #define TDB_EXTENSION_FACTOR 100
/* We steal bits from the offsets to store hash info. */ /* We steal bits from the offsets to store hash info. */
#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1) #define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */ /* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
#define TDB_OFF_UPPER_STEAL 8 #define TDB_OFF_UPPER_STEAL 8
#define TDB_OFF_UPPER_STEAL_EXTRA 7 #define TDB_OFF_UPPER_STEAL_EXTRA 7
#define TDB_OFF_UPPER_STEAL_TRUNCBIT 1 /* The bit number where we store extra hash bits. */
/* If this is set, hash is truncated (only 1 bit is valid). */
#define TDB_OFF_HASH_TRUNCATED_BIT 56
/* The bit number where we store next level of hash. */
#define TDB_OFF_HASH_EXTRA_BIT 57 #define TDB_OFF_HASH_EXTRA_BIT 57
#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
/* The bit number where we store the extra hash bits. */
/* Convenience mask to get actual offset. */ /* Convenience mask to get actual offset. */
#define TDB_OFF_MASK \ #define TDB_OFF_MASK \
(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK) (((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
...@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t; ...@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
#define TDB_MIN_DATA_LEN \ #define TDB_MIN_DATA_LEN \
(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record)) (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
/* Indicates this entry is not on an flist (can happen during coalescing) */
#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
#if !HAVE_BSWAP_64 #if !HAVE_BSWAP_64
static inline uint64_t bswap_64(uint64_t x) static inline uint64_t bswap_64(uint64_t x)
{ {
...@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r) ...@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
} }
struct tdb_free_record { struct tdb_free_record {
uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */ uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
uint64_t data_len; /* Not counting these two fields. */ uint64_t ftable_and_len; /* Len not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */ /* This is why the minimum record size is 8 bytes. */
uint64_t next, prev; uint64_t next;
}; };
static inline uint64_t frec_prev(const struct tdb_free_record *f)
{
return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
}
static inline uint64_t frec_magic(const struct tdb_free_record *f) static inline uint64_t frec_magic(const struct tdb_free_record *f)
{ {
return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL); return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
} }
static inline uint64_t frec_flist(const struct tdb_free_record *f) static inline uint64_t frec_len(const struct tdb_free_record *f)
{ {
return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
}
static inline unsigned frec_ftable(const struct tdb_free_record *f)
{
return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
} }
struct tdb_recovery_record { struct tdb_recovery_record {
...@@ -199,6 +217,12 @@ struct tdb_recovery_record { ...@@ -199,6 +217,12 @@ struct tdb_recovery_record {
uint64_t eof; uint64_t eof;
}; };
/* If we bottom out of the subhashes, we chain. */
struct tdb_chain {
tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
tdb_off_t next;
};
/* this is stored at the front of every database */ /* this is stored at the front of every database */
struct tdb_header { struct tdb_header {
char magic_food[64]; /* for /etc/magic */ char magic_food[64]; /* for /etc/magic */
...@@ -206,7 +230,7 @@ struct tdb_header { ...@@ -206,7 +230,7 @@ struct tdb_header {
uint64_t version; /* version of the code */ uint64_t version; /* version of the code */
uint64_t hash_test; /* result of hashing HASH_MAGIC. */ uint64_t hash_test; /* result of hashing HASH_MAGIC. */
uint64_t hash_seed; /* "random" seed written at creation time. */ uint64_t hash_seed; /* "random" seed written at creation time. */
tdb_off_t free_list; /* (First) free list. */ tdb_off_t free_table; /* (First) free table. */
tdb_off_t recovery; /* Transaction recovery area. */ tdb_off_t recovery; /* Transaction recovery area. */
tdb_off_t reserved[26]; tdb_off_t reserved[26];
...@@ -215,7 +239,7 @@ struct tdb_header { ...@@ -215,7 +239,7 @@ struct tdb_header {
tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS]; tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
}; };
struct tdb_freelist { struct tdb_freetable {
struct tdb_used_record hdr; struct tdb_used_record hdr;
tdb_off_t next; tdb_off_t next;
tdb_off_t buckets[TDB_FREE_BUCKETS]; tdb_off_t buckets[TDB_FREE_BUCKETS];
...@@ -246,7 +270,7 @@ struct traverse_info { ...@@ -246,7 +270,7 @@ struct traverse_info {
/* We ignore groups here, and treat it as a big array. */ /* We ignore groups here, and treat it as a big array. */
unsigned entry; unsigned entry;
unsigned int total_buckets; unsigned int total_buckets;
} levels[64 / TDB_SUBLEVEL_HASH_BITS]; } levels[TDB_MAX_LEVELS + 1];
unsigned int num_levels; unsigned int num_levels;
unsigned int toplevel_group; unsigned int toplevel_group;
/* This makes delete-everything-inside-traverse work as expected. */ /* This makes delete-everything-inside-traverse work as expected. */
...@@ -269,6 +293,15 @@ struct tdb_lock_type { ...@@ -269,6 +293,15 @@ struct tdb_lock_type {
uint32_t ltype; uint32_t ltype;
}; };
/* This is only needed for tdb_access_commit, but used everywhere to
* simplify. */
struct tdb_access_hdr {
struct tdb_access_hdr *next;
tdb_off_t off;
tdb_len_t len;
bool convert;
};
struct tdb_context { struct tdb_context {
/* Filename of the database. */ /* Filename of the database. */
const char *name; const char *name;
...@@ -298,8 +331,8 @@ struct tdb_context { ...@@ -298,8 +331,8 @@ struct tdb_context {
uint32_t flags; uint32_t flags;
/* Logging function */ /* Logging function */
tdb_logfn_t log; tdb_logfn_t logfn;
void *log_priv; void *log_private;
/* Hash function. */ /* Hash function. */
tdb_hashfn_t khash; tdb_hashfn_t khash;
...@@ -309,17 +342,23 @@ struct tdb_context { ...@@ -309,17 +342,23 @@ struct tdb_context {
/* Set if we are in a transaction. */ /* Set if we are in a transaction. */
struct tdb_transaction *transaction; struct tdb_transaction *transaction;
/* What freelist are we using? */ /* What free table are we using? */
uint64_t flist_off; tdb_off_t ftable_off;
unsigned int ftable;
/* IO methods: changes for transactions. */ /* IO methods: changes for transactions. */
const struct tdb_methods *methods; const struct tdb_methods *methods;
/* Lock information */ /* Lock information */
struct tdb_lock_type allrecord_lock; struct tdb_lock_type allrecord_lock;
uint64_t num_lockrecs; size_t num_lockrecs;
struct tdb_lock_type *lockrecs; struct tdb_lock_type *lockrecs;
struct tdb_attribute_stats *stats;
/* Direct access information */
struct tdb_access_hdr *access;
/* Single list of all TDBs, to avoid multiple opens. */ /* Single list of all TDBs, to avoid multiple opens. */
struct tdb_context *next; struct tdb_context *next;
dev_t device; dev_t device;
...@@ -331,7 +370,7 @@ struct tdb_methods { ...@@ -331,7 +370,7 @@ struct tdb_methods {
int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t); int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
int (*oob)(struct tdb_context *, tdb_off_t, bool); int (*oob)(struct tdb_context *, tdb_off_t, bool);
int (*expand_file)(struct tdb_context *, tdb_len_t); int (*expand_file)(struct tdb_context *, tdb_len_t);
void *(*direct)(struct tdb_context *, tdb_off_t, size_t); void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
}; };
/* /*
...@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h); ...@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
bool is_subhash(tdb_off_t val); bool is_subhash(tdb_off_t val);
/* free.c: */ /* free.c: */
int tdb_flist_init(struct tdb_context *tdb); int tdb_ftable_init(struct tdb_context *tdb);
/* check.c needs these to iterate through free lists. */ /* check.c needs these to iterate through free lists. */
tdb_off_t first_flist(struct tdb_context *tdb); tdb_off_t first_ftable(struct tdb_context *tdb);
tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist); tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
/* If this fails, try tdb_expand. */ /* This returns space or TDB_OFF_ERR. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing); uint64_t hash, unsigned magic, bool growing);
/* Put this record in a free list. */ /* Put this record in a free list. */
int add_free_record(struct tdb_context *tdb, int add_free_record(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len_with_header); tdb_off_t off, tdb_len_t len_with_header);
/* Set up header for a used record. */ /* Set up header for a used/ftable/htable/chain record. */
int set_header(struct tdb_context *tdb, int set_header(struct tdb_context *tdb,
struct tdb_used_record *rec, struct tdb_used_record *rec,
uint64_t keylen, uint64_t datalen, unsigned magic, uint64_t keylen, uint64_t datalen,
uint64_t actuallen, unsigned hashlow); uint64_t actuallen, unsigned hashlow);
/* Used by tdb_check to verify. */ /* Used by tdb_check to verify. */
unsigned int size_to_bucket(tdb_len_t data_len); unsigned int size_to_bucket(tdb_len_t data_len);
tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket); tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
/* Used by tdb_summary */
size_t dead_space(struct tdb_context *tdb, tdb_off_t off);
/* io.c: */ /* io.c: */
/* Initialize tdb->methods. */ /* Initialize tdb->methods. */
...@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size); ...@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
void tdb_munmap(struct tdb_context *tdb); void tdb_munmap(struct tdb_context *tdb);
void tdb_mmap(struct tdb_context *tdb); void tdb_mmap(struct tdb_context *tdb);
/* Either make a copy into pad and return that, or return ptr into mmap.
* Converts endian (ie. will use pad in that case). */
void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
/* Either alloc a copy, or give direct access. Release frees or noop. */ /* Either alloc a copy, or give direct access. Release frees or noop. */
const void *tdb_access_read(struct tdb_context *tdb, const void *tdb_access_read(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len, bool convert); tdb_off_t off, tdb_len_t len, bool convert);
...@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, ...@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
void *rec, size_t len); void *rec, size_t len);
/* Adds a stat, if it's in range. */
void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val);
#define add_stat(tdb, statname, val) \
do { \
if (unlikely((tdb)->stats)) \
add_stat_((tdb), &(tdb)->stats->statname, (val)); \
} while (0)
/* lock.c: */ /* lock.c: */
void tdb_lock_init(struct tdb_context *tdb); void tdb_lock_init(struct tdb_context *tdb);
...@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype, ...@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
int tdb_transaction_recover(struct tdb_context *tdb); int tdb_transaction_recover(struct tdb_context *tdb);
bool tdb_needs_recovery(struct tdb_context *tdb); bool tdb_needs_recovery(struct tdb_context *tdb);
/* tdb.c: */
void COLD tdb_logerr(struct tdb_context *tdb,
enum TDB_ERROR ecode,
enum tdb_debug_level level,
const char *fmt, ...);
#ifdef TDB_TRACE #ifdef TDB_TRACE
void tdb_trace(struct tdb_context *tdb, const char *op); void tdb_trace(struct tdb_context *tdb, const char *op);
void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op); void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
......
...@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb, ...@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
static bool summarize(struct tdb_context *tdb, static bool summarize(struct tdb_context *tdb,
struct tally *hashes, struct tally *hashes,
struct tally *flists, struct tally *ftables,
struct tally *free, struct tally *free,
struct tally *keys, struct tally *keys,
struct tally *data, struct tally *data,
struct tally *extra, struct tally *extra,
struct tally *uncoal, struct tally *uncoal,
struct tally *buckets) struct tally *buckets,
struct tally *chains)
{ {
tdb_off_t off; tdb_off_t off;
tdb_len_t len; tdb_len_t len;
tdb_len_t unc = 0; tdb_len_t unc = 0;
for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) { for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
union { const union {
struct tdb_used_record u; struct tdb_used_record u;
struct tdb_free_record f; struct tdb_free_record f;
} pad, *p; struct tdb_recovery_record r;
p = tdb_get(tdb, off, &pad, sizeof(pad)); } *p;
/* We might not be able to get the whole thing. */
p = tdb_access_read(tdb, off, sizeof(p->f), true);
if (!p) if (!p)
return false; return false;
if (rec_magic(&p->u) != TDB_MAGIC) { if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
len = p->f.data_len; || p->r.magic == TDB_RECOVERY_MAGIC) {
if (unc) {
tally_add(uncoal, unc);
unc = 0;
}
len = sizeof(p->r) + p->r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
len = frec_len(&p->f);
tally_add(free, len); tally_add(free, len);
tally_add(buckets, size_to_bucket(len)); tally_add(buckets, size_to_bucket(len));
len += sizeof(p->u); len += sizeof(p->u);
unc++; unc++;
} else { } else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
if (unc) { if (unc) {
tally_add(uncoal, unc); tally_add(uncoal, unc);
unc = 0; unc = 0;
...@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb, ...@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
+ rec_data_length(&p->u) + rec_data_length(&p->u)
+ rec_extra_padding(&p->u); + rec_extra_padding(&p->u);
/* FIXME: Use different magic for hashes, flists. */ tally_add(keys, rec_key_length(&p->u));
if (!rec_key_length(&p->u) && rec_hash(&p->u) < 2) { tally_add(data, rec_data_length(&p->u));
if (rec_hash(&p->u) == 0) { tally_add(extra, rec_extra_padding(&p->u));
int count = count_hash(tdb, } else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
off + sizeof(p->u), int count = count_hash(tdb,
TDB_SUBLEVEL_HASH_BITS); off + sizeof(p->u),
if (count == -1) TDB_SUBLEVEL_HASH_BITS);
return false; if (count == -1)
tally_add(hashes, count); return false;
} else { tally_add(hashes, count);
tally_add(flists, tally_add(extra, rec_extra_padding(&p->u));
rec_data_length(&p->u)); len = sizeof(p->u)
} + rec_data_length(&p->u)
} else { + rec_extra_padding(&p->u);
tally_add(keys, rec_key_length(&p->u)); } else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
tally_add(data, rec_data_length(&p->u)); len = sizeof(p->u)
} + rec_data_length(&p->u)
+ rec_extra_padding(&p->u);
tally_add(ftables, rec_data_length(&p->u));
tally_add(extra, rec_extra_padding(&p->u));
} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
len = sizeof(p->u)
+ rec_data_length(&p->u)
+ rec_extra_padding(&p->u);
tally_add(chains, 1);
tally_add(extra, rec_extra_padding(&p->u)); tally_add(extra, rec_extra_padding(&p->u));
} } else
len = dead_space(tdb, off);
tdb_access_release(tdb, p);
} }
if (unc) if (unc)
tally_add(uncoal, unc); tally_add(uncoal, unc);
...@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb, ...@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \ "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
"Number of free lists: %zu\n%s" \ "Number of free lists: %zu\n%s" \
"Toplevel hash used: %u of %u\n" \ "Toplevel hash used: %u of %u\n" \
"Number of chains: %zu\n" \
"Number of subhashes: %zu\n" \ "Number of subhashes: %zu\n" \
"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \ "Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n" "Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
...@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb, ...@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
{ {
tdb_len_t len; tdb_len_t len;
struct tally *flists, *hashes, *freet, *keys, *data, *extra, *uncoal, struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
*buckets; *buckets, *chains;
char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg; char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg;
char *ret = NULL; char *ret = NULL;
...@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
} }
/* Start stats off empty. */ /* Start stats off empty. */
flists = tally_new(HISTO_HEIGHT); ftables = tally_new(HISTO_HEIGHT);
hashes = tally_new(HISTO_HEIGHT); hashes = tally_new(HISTO_HEIGHT);
freet = tally_new(HISTO_HEIGHT); freet = tally_new(HISTO_HEIGHT);
keys = tally_new(HISTO_HEIGHT); keys = tally_new(HISTO_HEIGHT);
...@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
extra = tally_new(HISTO_HEIGHT); extra = tally_new(HISTO_HEIGHT);
uncoal = tally_new(HISTO_HEIGHT); uncoal = tally_new(HISTO_HEIGHT);
buckets = tally_new(HISTO_HEIGHT); buckets = tally_new(HISTO_HEIGHT);
if (!flists || !hashes || !freet || !keys || !data || !extra chains = tally_new(HISTO_HEIGHT);
|| !uncoal || !buckets) { if (!ftables || !hashes || !freet || !keys || !data || !extra
tdb->ecode = TDB_ERR_OOM; || !uncoal || !buckets || !chains) {
tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_summary: failed to allocate tally structures");
goto unlock; goto unlock;
} }
if (!summarize(tdb, hashes, flists, freet, keys, data, extra, uncoal, if (!summarize(tdb, hashes, ftables, freet, keys, data, extra, uncoal,
buckets)) buckets, chains))
goto unlock; goto unlock;
if (flags & TDB_SUMMARY_HISTOGRAMS) { if (flags & TDB_SUMMARY_HISTOGRAMS) {
...@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
count_hash(tdb, offsetof(struct tdb_header, hashtable), count_hash(tdb, offsetof(struct tdb_header, hashtable),
TDB_TOPLEVEL_HASH_BITS), TDB_TOPLEVEL_HASH_BITS),
1 << TDB_TOPLEVEL_HASH_BITS, 1 << TDB_TOPLEVEL_HASH_BITS,
tally_num(chains),
tally_num(hashes), tally_num(hashes),
tally_min(hashes), tally_mean(hashes), tally_max(hashes), tally_min(hashes), tally_mean(hashes), tally_max(hashes),
hashesg ? hashesg : "", hashesg ? hashesg : "",
...@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags) ...@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
tally_total(freet, NULL) * 100.0 / tdb->map_size, tally_total(freet, NULL) * 100.0 / tdb->map_size,
(tally_num(keys) + tally_num(freet) + tally_num(hashes)) (tally_num(keys) + tally_num(freet) + tally_num(hashes))
* sizeof(struct tdb_used_record) * 100.0 / tdb->map_size, * sizeof(struct tdb_used_record) * 100.0 / tdb->map_size,
tally_num(flists) * sizeof(struct tdb_freelist) tally_num(ftables) * sizeof(struct tdb_freetable)
* 100.0 / tdb->map_size, * 100.0 / tdb->map_size,
(tally_num(hashes) (tally_num(hashes)
* (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+ (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)) + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+ sizeof(struct tdb_chain) * tally_num(chains))
* 100.0 / tdb->map_size); * 100.0 / tdb->map_size);
unlock: unlock:
...@@ -237,6 +262,8 @@ unlock: ...@@ -237,6 +262,8 @@ unlock:
free(data); free(data);
free(extra); free(extra);
free(uncoal); free(uncoal);
free(ftables);
free(chains);
tdb_allrecord_unlock(tdb, F_RDLCK); tdb_allrecord_unlock(tdb, F_RDLCK);
tdb_unlock_expand(tdb, F_RDLCK); tdb_unlock_expand(tdb, F_RDLCK);
......
#include "private.h" #include "private.h"
#include <ccan/tdb2/tdb2.h> #include <ccan/tdb2/tdb2.h>
#include <ccan/build_assert/build_assert.h>
#include <ccan/likely/likely.h>
#include <assert.h> #include <assert.h>
#include <stdarg.h>
/* The null return. */ /* The null return. */
struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 }; struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
...@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 }; ...@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */ /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static struct tdb_context *tdbs = NULL; static struct tdb_context *tdbs = NULL;
PRINTF_FMT(4, 5) static void
null_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv,
const char *fmt, ...)
{
}
static bool tdb_already_open(dev_t device, ino_t ino) static bool tdb_already_open(dev_t device, ino_t ino)
{ {
struct tdb_context *i; struct tdb_context *i;
...@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
fd = open("/dev/urandom", O_RDONLY); fd = open("/dev/urandom", O_RDONLY);
if (fd >= 0) { if (fd >= 0) {
if (tdb_read_all(fd, &ret, sizeof(ret))) { if (tdb_read_all(fd, &ret, sizeof(ret))) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: random from /dev/urandom\n"); "tdb_open: random from /dev/urandom");
close(fd); close(fd);
return ret; return ret;
} }
...@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
char reply[1 + sizeof(uint64_t)]; char reply[1 + sizeof(uint64_t)];
int r = read(fd, reply, sizeof(reply)); int r = read(fd, reply, sizeof(reply));
if (r > 1) { if (r > 1) {
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: %u random bytes from" "tdb_open: %u random bytes from"
" /dev/egd-pool\n", r-1); " /dev/egd-pool", r-1);
/* Copy at least some bytes. */ /* Copy at least some bytes. */
memcpy(&ret, reply+1, r - 1); memcpy(&ret, reply+1, r - 1);
if (reply[0] == sizeof(uint64_t) if (reply[0] == sizeof(uint64_t)
...@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb) ...@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
/* Fallback: pid and time. */ /* Fallback: pid and time. */
gettimeofday(&now, NULL); gettimeofday(&now, NULL);
ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec; ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_open: random from getpid and time\n"); "tdb_open: random from getpid and time");
return ret; return ret;
} }
struct new_database { struct new_database {
struct tdb_header hdr; struct tdb_header hdr;
struct tdb_freelist flist; struct tdb_freetable ftable;
}; };
/* initialise a new database */ /* initialise a new database */
...@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable)); memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
/* Free is empty. */ /* Free is empty. */
newdb.hdr.free_list = offsetof(struct new_database, flist); newdb.hdr.free_table = offsetof(struct new_database, ftable);
memset(&newdb.flist, 0, sizeof(newdb.flist)); memset(&newdb.ftable, 0, sizeof(newdb.ftable));
set_header(NULL, &newdb.flist.hdr, 0, set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
sizeof(newdb.flist) - sizeof(newdb.flist.hdr), sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
sizeof(newdb.flist) - sizeof(newdb.flist.hdr), 1); sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), 0);
/* Magic food */ /* Magic food */
memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food)); memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
...@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
tdb->map_size = sizeof(newdb); tdb->map_size = sizeof(newdb);
tdb->map_ptr = malloc(tdb->map_size); tdb->map_ptr = malloc(tdb->map_size);
if (!tdb->map_ptr) { if (!tdb->map_ptr) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"tdb_new_database: failed to allocate");
return -1; return -1;
} }
memcpy(tdb->map_ptr, &newdb, tdb->map_size); memcpy(tdb->map_ptr, &newdb, tdb->map_size);
...@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb, ...@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
return -1; return -1;
if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) { if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
"tdb_new_database: failed to write: %s",
strerror(errno));
return -1; return -1;
} }
return 0; return 0;
...@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
{ {
struct tdb_context *tdb; struct tdb_context *tdb;
struct stat st; struct stat st;
int save_errno; int saved_errno = 0;
uint64_t hash_test; uint64_t hash_test;
unsigned v; unsigned v;
struct tdb_header hdr; struct tdb_header hdr;
...@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if (!tdb) { if (!tdb) {
/* Can't log this */ /* Can't log this */
errno = ENOMEM; errno = ENOMEM;
goto fail; return NULL;
} }
tdb->name = NULL; tdb->name = NULL;
tdb->map_ptr = NULL; tdb->map_ptr = NULL;
...@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb->map_size = sizeof(struct tdb_header); tdb->map_size = sizeof(struct tdb_header);
tdb->ecode = TDB_SUCCESS; tdb->ecode = TDB_SUCCESS;
tdb->flags = tdb_flags; tdb->flags = tdb_flags;
tdb->log = null_log_fn; tdb->logfn = NULL;
tdb->log_priv = NULL;
tdb->transaction = NULL; tdb->transaction = NULL;
tdb->stats = NULL;
tdb->access = NULL;
tdb_hash_init(tdb); tdb_hash_init(tdb);
tdb_io_init(tdb); tdb_io_init(tdb);
tdb_lock_init(tdb); tdb_lock_init(tdb);
...@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
while (attr) { while (attr) {
switch (attr->base.attr) { switch (attr->base.attr) {
case TDB_ATTRIBUTE_LOG: case TDB_ATTRIBUTE_LOG:
tdb->log = attr->log.log_fn; tdb->logfn = attr->log.log_fn;
tdb->log_priv = attr->log.log_private; tdb->log_private = attr->log.log_private;
break; break;
case TDB_ATTRIBUTE_HASH: case TDB_ATTRIBUTE_HASH:
tdb->khash = attr->hash.hash_fn; tdb->khash = attr->hash.hash_fn;
...@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
case TDB_ATTRIBUTE_SEED: case TDB_ATTRIBUTE_SEED:
seed = &attr->seed; seed = &attr->seed;
break; break;
case TDB_ATTRIBUTE_STATS:
tdb->stats = &attr->stats;
/* They have stats we don't know about? Tell them. */
if (tdb->stats->size > sizeof(attr->stats))
tdb->stats->size = sizeof(attr->stats);
break;
default: default:
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_open: unknown attribute type %u\n", "tdb_open: unknown attribute type %u",
attr->base.attr); attr->base.attr);
errno = EINVAL;
goto fail; goto fail;
} }
attr = attr->base.next; attr = attr->base.next;
} }
if ((open_flags & O_ACCMODE) == O_WRONLY) { if ((open_flags & O_ACCMODE) == O_WRONLY) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_open: can't open tdb %s write-only\n", name); "tdb_open: can't open tdb %s write-only", name);
errno = EINVAL;
goto fail; goto fail;
} }
...@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if (tdb->flags & TDB_INTERNAL) { if (tdb->flags & TDB_INTERNAL) {
tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP); tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
if (tdb_new_database(tdb, seed, &hdr) != 0) { if (tdb_new_database(tdb, seed, &hdr) != 0) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_open: tdb_new_database failed!");
goto fail; goto fail;
} }
tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed)); tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
tdb->hash_seed = hdr.hash_seed; tdb->hash_seed = hdr.hash_seed;
tdb_flist_init(tdb); tdb_ftable_init(tdb);
return tdb; return tdb;
} }
if ((tdb->fd = open(name, open_flags, mode)) == -1) { if ((tdb->fd = open(name, open_flags, mode)) == -1) {
tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, /* errno set by open(2) */
"tdb_open: could not open file %s: %s\n", saved_errno = errno;
name, strerror(errno)); tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
goto fail; /* errno set by open(2) */ "tdb_open: could not open file %s: %s",
name, strerror(errno));
goto fail;
} }
/* on exec, don't inherit the fd */ /* on exec, don't inherit the fd */
...@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* ensure there is only one process initialising at once */ /* ensure there is only one process initialising at once */
if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) { if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, /* errno set by tdb_brlock */
"tdb_open: failed to get open lock on %s: %s\n", saved_errno = errno;
name, strerror(errno)); goto fail;
goto fail; /* errno set by tdb_brlock */
} }
if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0) if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0)
|| strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
if (!(open_flags & O_CREAT) if (!(open_flags & O_CREAT)) {
|| tdb_new_database(tdb, seed, &hdr) == -1) { tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
if (errno == 0) { "tdb_open: %s is not a tdb file", name);
errno = EIO; /* ie bad format or something */ goto fail;
} }
if (tdb_new_database(tdb, seed, &hdr) == -1) {
goto fail; goto fail;
} }
} else if (hdr.version != TDB_VERSION) { } else if (hdr.version != TDB_VERSION) {
...@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb->flags |= TDB_CONVERT; tdb->flags |= TDB_CONVERT;
else { else {
/* wrong version */ /* wrong version */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: %s is unknown version 0x%llx\n", "tdb_open: %s is unknown version 0x%llx",
name, (long long)hdr.version); name, (long long)hdr.version);
errno = EIO;
goto fail; goto fail;
} }
} }
...@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
if (hdr.hash_test != hash_test) { if (hdr.hash_test != hash_test) {
/* wrong hash variant */ /* wrong hash variant */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: %s uses a different hash function\n", "tdb_open: %s uses a different hash function",
name); name);
errno = EIO;
goto fail; goto fail;
} }
if (fstat(tdb->fd, &st) == -1) if (fstat(tdb->fd, &st) == -1) {
saved_errno = errno;
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: could not stat open %s: %s",
name, strerror(errno));
goto fail; goto fail;
}
/* Is it already in the open list? If so, fail. */ /* Is it already in the open list? If so, fail. */
if (tdb_already_open(st.st_dev, st.st_ino)) { if (tdb_already_open(st.st_dev, st.st_ino)) {
/* FIXME */ /* FIXME */
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
"tdb_open: %s (%d,%d) is already open in this process\n", "tdb_open: %s (%d,%d) is already open in this"
name, (int)st.st_dev, (int)st.st_ino); " process",
errno = EBUSY; name, (int)st.st_dev, (int)st.st_ino);
goto fail; goto fail;
} }
tdb->name = strdup(name); tdb->name = strdup(name);
if (!tdb->name) { if (!tdb->name) {
errno = ENOMEM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_open: failed to allocate name");
goto fail; goto fail;
} }
...@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* Now it's fully formed, recover if necessary. */ /* Now it's fully formed, recover if necessary. */
if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) { if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) {
errno = EIO;
goto fail; goto fail;
} }
if (tdb_flist_init(tdb) == -1) if (tdb_ftable_init(tdb) == -1)
goto fail; goto fail;
tdb->next = tdbs; tdb->next = tdbs;
...@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
return tdb; return tdb;
fail: fail:
save_errno = errno; /* Map ecode to some logical errno. */
if (!saved_errno) {
if (!tdb) switch (tdb->ecode) {
return NULL; case TDB_ERR_CORRUPT:
case TDB_ERR_IO:
saved_errno = EIO;
break;
case TDB_ERR_LOCK:
saved_errno = EWOULDBLOCK;
break;
case TDB_ERR_OOM:
saved_errno = ENOMEM;
break;
case TDB_ERR_EINVAL:
saved_errno = EINVAL;
break;
case TDB_ERR_NESTING:
saved_errno = EBUSY;
break;
default:
saved_errno = EINVAL;
break;
}
}
#ifdef TDB_TRACE #ifdef TDB_TRACE
close(tdb->tracefd); close(tdb->tracefd);
...@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags, ...@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
free((char *)tdb->name); free((char *)tdb->name);
if (tdb->fd != -1) if (tdb->fd != -1)
if (close(tdb->fd) != 0) if (close(tdb->fd) != 0)
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_open: failed to close tdb->fd" "tdb_open: failed to close tdb->fd"
" on error!\n"); " on error!");
free(tdb); free(tdb);
errno = save_errno; errno = saved_errno;
return NULL; return NULL;
} }
/* FIXME: modify, don't rewrite! */
static int update_rec_hdr(struct tdb_context *tdb, static int update_rec_hdr(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
tdb_len_t keylen, tdb_len_t keylen,
...@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb, ...@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
{ {
uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec); uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h)) if (set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
keylen + dataroom, h))
return -1; return -1;
return tdb_write_convert(tdb, off, rec, sizeof(*rec)); return tdb_write_convert(tdb, off, rec, sizeof(*rec));
...@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb, ...@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
tdb_off_t new_off; tdb_off_t new_off;
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, growing); new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
growing);
if (unlikely(new_off == TDB_OFF_ERR)) if (unlikely(new_off == TDB_OFF_ERR))
return -1; return -1;
/* We didn't like the existing one: remove it. */ /* We didn't like the existing one: remove it. */
if (old_off) { if (old_off) {
add_stat(tdb, frees, 1);
add_free_record(tdb, old_off, add_free_record(tdb, old_off,
sizeof(struct tdb_used_record) sizeof(struct tdb_used_record)
+ key.dsize + old_room); + key.dsize + old_room);
...@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb, ...@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
h.hlock_range, F_WRLCK); h.hlock_range, F_WRLCK);
return 0; return 0;
} }
/* FIXME: See if right record is free? */
} else { } else {
if (flag == TDB_MODIFY) { if (flag == TDB_MODIFY) {
/* if the record doesn't exist and we /* if the record doesn't exist and we
...@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb, ...@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
F_WRLCK); F_WRLCK);
return 0; return 0;
} }
/* FIXME: Check right record free? */
/* Slow path. */ /* Slow path. */
newdata = malloc(key.dsize + old_dlen + dbuf.dsize); newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
if (!newdata) { if (!newdata) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_append: failed to allocate %zu bytes",
"tdb_append: cannot allocate %llu bytes!\n", (size_t)(key.dsize+old_dlen+dbuf.dsize));
(long long)key.dsize + old_dlen + dbuf.dsize);
goto fail; goto fail;
} }
if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize, if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize,
...@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key) ...@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
goto unlock_err; goto unlock_err;
/* Free the deleted entry. */ /* Free the deleted entry. */
add_stat(tdb, frees, 1);
if (add_free_record(tdb, off, if (add_free_record(tdb, off,
sizeof(struct tdb_used_record) sizeof(struct tdb_used_record)
+ rec_key_length(&rec) + rec_key_length(&rec)
...@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb) ...@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
struct tdb_context **i; struct tdb_context **i;
int ret = 0; int ret = 0;
/* FIXME: tdb_trace(tdb, "tdb_close");
if (tdb->transaction) { if (tdb->transaction) {
tdb_transaction_cancel(tdb); tdb_transaction_cancel(tdb);
} }
*/
tdb_trace(tdb, "tdb_close");
if (tdb->map_ptr) { if (tdb->map_ptr) {
if (tdb->flags & TDB_INTERNAL) if (tdb->flags & TDB_INTERNAL)
...@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb) ...@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
return ret; return ret;
} }
enum TDB_ERROR tdb_error(struct tdb_context *tdb) enum TDB_ERROR tdb_error(const struct tdb_context *tdb)
{ {
return tdb->ecode; return tdb->ecode;
} }
const char *tdb_errorstr(struct tdb_context *tdb) const char *tdb_errorstr(const struct tdb_context *tdb)
{ {
/* Gcc warns if you miss a case in the switch, so use that. */ /* Gcc warns if you miss a case in the switch, so use that. */
switch (tdb->ecode) { switch (tdb->ecode) {
...@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb) ...@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
} }
return "Invalid error code"; return "Invalid error code";
} }
void COLD tdb_logerr(struct tdb_context *tdb,
enum TDB_ERROR ecode,
enum tdb_debug_level level,
const char *fmt, ...)
{
char *message;
va_list ap;
size_t len;
/* tdb_open paths care about errno, so save it. */
int saved_errno = errno;
tdb->ecode = ecode;
if (!tdb->logfn)
return;
/* FIXME: Doesn't assume asprintf. */
va_start(ap, fmt);
len = vsnprintf(NULL, 0, fmt, ap);
va_end(ap);
message = malloc(len + 1);
if (!message) {
tdb->logfn(tdb, level, tdb->log_private,
"out of memory formatting message");
return;
}
va_start(ap, fmt);
len = vsprintf(message, fmt, ap);
va_end(ap);
tdb->logfn(tdb, level, tdb->log_private, message);
free(message);
errno = saved_errno;
}
...@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, ...@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
/* flags for tdb_summary. Logical or to combine. */ /* flags for tdb_summary. Logical or to combine. */
enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 }; enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 };
/* debugging uses one of the following levels */ /* logging uses one of the following levels */
enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR,
TDB_DEBUG_WARNING, TDB_DEBUG_TRACE}; TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
...@@ -80,14 +80,15 @@ struct tdb_context; ...@@ -80,14 +80,15 @@ struct tdb_context;
/* FIXME: Make typesafe */ /* FIXME: Make typesafe */
typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *); typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_FMT(4, 5); typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *, const char *);
typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed, typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
void *priv); void *priv);
enum tdb_attribute_type { enum tdb_attribute_type {
TDB_ATTRIBUTE_LOG = 0, TDB_ATTRIBUTE_LOG = 0,
TDB_ATTRIBUTE_HASH = 1, TDB_ATTRIBUTE_HASH = 1,
TDB_ATTRIBUTE_SEED = 2 TDB_ATTRIBUTE_SEED = 2,
TDB_ATTRIBUTE_STATS = 3
}; };
struct tdb_attribute_base { struct tdb_attribute_base {
...@@ -112,11 +113,39 @@ struct tdb_attribute_seed { ...@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
uint64_t seed; uint64_t seed;
}; };
struct tdb_attribute_stats {
struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
size_t size; /* = sizeof(struct tdb_attribute_stats) */
uint64_t allocs;
uint64_t alloc_subhash;
uint64_t alloc_chain;
uint64_t alloc_bucket_exact;
uint64_t alloc_bucket_max;
uint64_t alloc_leftover;
uint64_t alloc_coalesce_tried;
uint64_t alloc_coalesce_lockfail;
uint64_t alloc_coalesce_race;
uint64_t alloc_coalesce_succeeded;
uint64_t alloc_coalesce_num_merged;
uint64_t compares;
uint64_t compare_wrong_bucket;
uint64_t compare_wrong_offsetbits;
uint64_t compare_wrong_keylen;
uint64_t compare_wrong_rechash;
uint64_t compare_wrong_keycmp;
uint64_t expands;
uint64_t frees;
uint64_t locks;
uint64_t lock_lowlevel;
uint64_t lock_nonblock;
};
union tdb_attribute { union tdb_attribute {
struct tdb_attribute_base base; struct tdb_attribute_base base;
struct tdb_attribute_log log; struct tdb_attribute_log log;
struct tdb_attribute_hash hash; struct tdb_attribute_hash hash;
struct tdb_attribute_seed seed; struct tdb_attribute_seed seed;
struct tdb_attribute_stats stats;
}; };
struct tdb_context *tdb_open(const char *name, int tdb_flags, struct tdb_context *tdb_open(const char *name, int tdb_flags,
...@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb, ...@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
void *private_data); void *private_data);
enum TDB_ERROR tdb_error(struct tdb_context *tdb); enum TDB_ERROR tdb_error(const struct tdb_context *tdb);
const char *tdb_errorstr(struct tdb_context *tdb); const char *tdb_errorstr(const struct tdb_context *tdb);
int tdb_transaction_start(struct tdb_context *tdb); int tdb_transaction_start(struct tdb_context *tdb);
void tdb_transaction_cancel(struct tdb_context *tdb); void tdb_transaction_cancel(struct tdb_context *tdb);
......
...@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem) ...@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
layout->elem[layout->num_elems++] = elem; layout->elem[layout->num_elems++] = elem;
} }
void tdb_layout_add_freelist(struct tdb_layout *layout) void tdb_layout_add_freetable(struct tdb_layout *layout)
{ {
union tdb_layout_elem elem; union tdb_layout_elem elem;
elem.base.type = FREELIST; elem.base.type = FREETABLE;
add(layout, elem); add(layout, elem);
} }
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
unsigned flist) unsigned ftable)
{ {
union tdb_layout_elem elem; union tdb_layout_elem elem;
elem.base.type = FREE; elem.base.type = FREE;
elem.free.len = len; elem.free.len = len;
elem.free.flist_num = flist; elem.free.ftable_num = ftable;
add(layout, elem); add(layout, elem);
} }
...@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable) ...@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+ htable->extra; + htable->extra;
} }
static tdb_len_t freelist_len(struct tle_freelist *flist) static tdb_len_t freetable_len(struct tle_freetable *ftable)
{ {
return sizeof(struct tdb_freelist); return sizeof(struct tdb_freetable);
} }
static void set_free_record(void *mem, tdb_len_t len) static void set_free_record(void *mem, tdb_len_t len)
...@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb, ...@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
{ {
struct tdb_used_record *u = mem; struct tdb_used_record *u = mem;
set_header(tdb, u, used->key.dsize, used->data.dsize, set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
used->key.dsize + used->data.dsize + used->extra, used->key.dsize + used->data.dsize + used->extra,
tdb_hash(tdb, used->key.dptr, used->key.dsize)); tdb_hash(tdb, used->key.dptr, used->key.dsize));
memcpy(u + 1, used->key.dptr, used->key.dsize); memcpy(u + 1, used->key.dptr, used->key.dsize);
...@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb, ...@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
struct tdb_used_record *u = mem; struct tdb_used_record *u = mem;
tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS; tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
set_header(tdb, u, 0, len, len + htable->extra, 0); set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
memset(u + 1, 0, len); memset(u + 1, 0, len);
} }
static void set_freelist(void *mem, struct tdb_context *tdb, static void set_freetable(void *mem, struct tdb_context *tdb,
struct tle_freelist *freelist, struct tdb_header *hdr, struct tle_freetable *freetable, struct tdb_header *hdr,
tdb_off_t last_flist) tdb_off_t last_ftable)
{ {
struct tdb_freelist *flist = mem; struct tdb_freetable *ftable = mem;
memset(flist, 0, sizeof(*flist)); memset(ftable, 0, sizeof(*ftable));
set_header(tdb, &flist->hdr, 0, set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
sizeof(*flist) - sizeof(flist->hdr), sizeof(*ftable) - sizeof(ftable->hdr),
sizeof(*flist) - sizeof(flist->hdr), 1); sizeof(*ftable) - sizeof(ftable->hdr), 0);
if (last_flist) { if (last_ftable) {
flist = (struct tdb_freelist *)((char *)hdr + last_flist); ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
flist->next = freelist->base.off; ftable->next = freetable->base.off;
} else { } else {
hdr->free_list = freelist->base.off; hdr->free_table = freetable->base.off;
} }
} }
static void add_to_freetable(struct tdb_context *tdb, static void add_to_freetable(struct tdb_context *tdb,
tdb_off_t eoff, tdb_off_t eoff,
tdb_off_t elen, tdb_off_t elen,
struct tle_freelist *freelist) unsigned ftable,
struct tle_freetable *freetable)
{ {
tdb->flist_off = freelist->base.off; tdb->ftable_off = freetable->base.off;
tdb->ftable = ftable;
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen); add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
} }
...@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb, ...@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
abort(); abort();
} }
static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num) static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
{ {
unsigned i; unsigned i;
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
if (layout->elem[i].base.type != FREELIST) if (layout->elem[i].base.type != FREETABLE)
continue; continue;
if (num == 0) if (num == 0)
return &layout->elem[i].flist; return &layout->elem[i].ftable;
num--; num--;
} }
abort(); abort();
...@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num) ...@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
struct tdb_context *tdb_layout_get(struct tdb_layout *layout) struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
{ {
unsigned int i; unsigned int i;
tdb_off_t off, len, last_flist; tdb_off_t off, len, last_ftable;
char *mem; char *mem;
struct tdb_context *tdb; struct tdb_context *tdb;
...@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
union tdb_layout_elem *e = &layout->elem[i]; union tdb_layout_elem *e = &layout->elem[i];
e->base.off = off; e->base.off = off;
switch (e->base.type) { switch (e->base.type) {
case FREELIST: case FREETABLE:
len = freelist_len(&e->flist); len = freetable_len(&e->ftable);
break; break;
case FREE: case FREE:
len = free_record_len(e->free.len); len = free_record_len(e->free.len);
...@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
tdb->map_ptr = mem; tdb->map_ptr = mem;
tdb->map_size = off; tdb->map_size = off;
last_flist = 0; last_ftable = 0;
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i]; union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) { switch (e->base.type) {
case FREELIST: case FREETABLE:
set_freelist(mem + e->base.off, tdb, &e->flist, set_freetable(mem + e->base.off, tdb, &e->ftable,
(struct tdb_header *)mem, last_flist); (struct tdb_header *)mem, last_ftable);
last_flist = e->base.off; last_ftable = e->base.off;
break; break;
case FREE: case FREE:
set_free_record(mem + e->base.off, e->free.len); set_free_record(mem + e->base.off, e->free.len);
...@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
break; break;
} }
} }
/* Must have a free list! */ /* Must have a free table! */
assert(last_flist); assert(last_ftable);
/* Now fill the free and hash tables. */ /* Now fill the free and hash tables. */
for (i = 0; i < layout->num_elems; i++) { for (i = 0; i < layout->num_elems; i++) {
...@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch (e->base.type) { switch (e->base.type) {
case FREE: case FREE:
add_to_freetable(tdb, e->base.off, e->free.len, add_to_freetable(tdb, e->base.off, e->free.len,
find_flist(layout, e->free.flist_num)); e->free.ftable_num,
find_ftable(layout, e->free.ftable_num));
break; break;
case DATA: case DATA:
add_to_hashtable(tdb, e->base.off, e->used.key); add_to_hashtable(tdb, e->base.off, e->used.key);
...@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
} }
} }
tdb->flist_off = find_flist(layout, 0)->base.off; tdb->ftable_off = find_ftable(layout, 0)->base.off;
/* Get physical if they asked for it. */ /* Get physical if they asked for it. */
if (layout->filename) { if (layout->filename) {
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
#include <ccan/tdb2/private.h> #include <ccan/tdb2/private.h>
struct tdb_layout *new_tdb_layout(const char *filename); struct tdb_layout *new_tdb_layout(const char *filename);
void tdb_layout_add_freelist(struct tdb_layout *layout); void tdb_layout_add_freetable(struct tdb_layout *layout);
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
unsigned flist); unsigned ftable);
void tdb_layout_add_used(struct tdb_layout *layout, void tdb_layout_add_used(struct tdb_layout *layout,
TDB_DATA key, TDB_DATA data, TDB_DATA key, TDB_DATA data,
tdb_len_t extra); tdb_len_t extra);
...@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout, ...@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
struct tdb_context *tdb_layout_get(struct tdb_layout *layout); struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
enum layout_type { enum layout_type {
FREELIST, FREE, DATA, HASHTABLE, FREETABLE, FREE, DATA, HASHTABLE,
}; };
/* Shared by all union members. */ /* Shared by all union members. */
...@@ -27,14 +27,14 @@ struct tle_base { ...@@ -27,14 +27,14 @@ struct tle_base {
tdb_off_t off; tdb_off_t off;
}; };
struct tle_freelist { struct tle_freetable {
struct tle_base base; struct tle_base base;
}; };
struct tle_free { struct tle_free {
struct tle_base base; struct tle_base base;
tdb_len_t len; tdb_len_t len;
unsigned flist_num; unsigned ftable_num;
}; };
struct tle_used { struct tle_used {
...@@ -53,7 +53,7 @@ struct tle_hashtable { ...@@ -53,7 +53,7 @@ struct tle_hashtable {
union tdb_layout_elem { union tdb_layout_elem {
struct tle_base base; struct tle_base base;
struct tle_freelist flist; struct tle_freetable ftable;
struct tle_free free; struct tle_free free;
struct tle_used used; struct tle_used used;
struct tle_hashtable hashtable; struct tle_hashtable hashtable;
......
#define _GNU_SOURCE
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdarg.h>
#include <ccan/tap/tap.h> #include <ccan/tap/tap.h>
#include "logging.h" #include "logging.h"
...@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = { ...@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
void tap_log_fn(struct tdb_context *tdb, void tap_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...) const char *message)
{ {
va_list ap;
char *p;
if (suppress_logging) if (suppress_logging)
return; return;
va_start(ap, fmt); diag("tdb log level %u: %s%s", level, log_prefix, message);
if (vasprintf(&p, fmt, ap) == -1)
abort();
/* Strip trailing \n: diag adds it. */
if (p[strlen(p)-1] == '\n')
p[strlen(p)-1] = '\0';
diag("tdb log level %u: %s%s", level, log_prefix, p);
free(p);
if (level != TDB_DEBUG_TRACE) if (level != TDB_DEBUG_TRACE)
tap_log_messages++; tap_log_messages++;
va_end(ap);
} }
...@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr; ...@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
void tap_log_fn(struct tdb_context *tdb, void tap_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...); const char *message);
static inline bool data_equal(struct tdb_data a, struct tdb_data b) static inline bool data_equal(struct tdb_data a, struct tdb_data b)
{ {
......
...@@ -12,18 +12,20 @@ int main(int argc, char *argv[]) ...@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
{ {
unsigned int i; unsigned int i;
struct tdb_used_record rec; struct tdb_used_record rec;
struct tdb_context tdb = { .log = tap_log_fn, .log_priv = NULL }; struct tdb_context tdb = { .logfn = tap_log_fn };
plan_tests(64 + 32 + 48*6 + 1); plan_tests(64 + 32 + 48*6 + 1);
/* We should be able to encode any data value. */ /* We should be able to encode any data value. */
for (i = 0; i < 64; i++) for (i = 0; i < 64; i++)
ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0); ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
1ULL << i, 0) == 0);
/* And any key and data with < 64 bits between them. */ /* And any key and data with < 64 bits between them. */
for (i = 0; i < 32; i++) { for (i = 0; i < 32; i++) {
tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i; tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0); ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
klen + dlen, 0) == 0);
} }
/* We should neatly encode all values. */ /* We should neatly encode all values. */
...@@ -32,13 +34,13 @@ int main(int argc, char *argv[]) ...@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
uint64_t klen = 1ULL << (i < 16 ? i : 15); uint64_t klen = 1ULL << (i < 16 ? i : 15);
uint64_t dlen = 1ULL << i; uint64_t dlen = 1ULL << i;
uint64_t xlen = 1ULL << (i < 32 ? i : 31); uint64_t xlen = 1ULL << (i < 32 ? i : 31);
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h) ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
== 0); klen+dlen+xlen, h) == 0);
ok1(rec_key_length(&rec) == klen); ok1(rec_key_length(&rec) == klen);
ok1(rec_data_length(&rec) == dlen); ok1(rec_data_length(&rec) == dlen);
ok1(rec_extra_padding(&rec) == xlen); ok1(rec_extra_padding(&rec) == xlen);
ok1((uint64_t)rec_hash(&rec) == h); ok1((uint64_t)rec_hash(&rec) == h);
ok1(rec_magic(&rec) == TDB_MAGIC); ok1(rec_magic(&rec) == TDB_USED_MAGIC);
} }
ok1(tap_log_messages == 0); ok1(tap_log_messages == 0);
return exit_status(); return exit_status();
......
...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) ...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return TDB_OFF_ERR; return TDB_OFF_ERR;
if (frec_magic(&f) != TDB_FREE_MAGIC) if (frec_magic(&f) != TDB_FREE_MAGIC)
return TDB_OFF_ERR; return TDB_OFF_ERR;
return f.data_len; return frec_len(&f);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
...@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) ...@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to EOF */ /* No coalescing can be done due to EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
len = 1024; len = 1024;
tdb_layout_add_free(layout, len, 0); tdb_layout_add_free(layout, len, 0);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) ...@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
ok1(free_record_length(tdb, layout->elem[1].base.off) == len); ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(len)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
/* Lock and fail to coalesce. */ /* Lock and fail to coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
...@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) ...@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to used record */ /* No coalescing can be done due to used record */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) ...@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and fail to coalesce. */ /* Lock and fail to coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
...@@ -76,7 +76,7 @@ int main(int argc, char *argv[]) ...@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then EOF */ /* Coalescing can be done due to two free records, then EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 2048, 0); tdb_layout_add_free(layout, 2048, 0);
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
...@@ -85,7 +85,7 @@ int main(int argc, char *argv[]) ...@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket (first) free entry is. */ /* Figure out which bucket (first) free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
...@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) ...@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then data */ /* Coalescing can be done due to two free records, then data */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 512, 0); tdb_layout_add_free(layout, 512, 0);
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
...@@ -107,7 +107,7 @@ int main(int argc, char *argv[]) ...@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
...@@ -119,7 +119,7 @@ int main(int argc, char *argv[]) ...@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to three free records, then EOF */ /* Coalescing can be done due to three free records, then EOF */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 1024, 0); tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_free(layout, 512, 0); tdb_layout_add_free(layout, 512, 0);
tdb_layout_add_free(layout, 256, 0); tdb_layout_add_free(layout, 256, 0);
...@@ -130,7 +130,7 @@ int main(int argc, char *argv[]) ...@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Figure out which bucket free entry is. */ /* Figure out which bucket free entry is. */
b_off = bucket_off(tdb->flist_off, size_to_bucket(1024)); b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
/* Lock and coalesce. */ /* Lock and coalesce. */
ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1); ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
......
...@@ -65,7 +65,8 @@ int main(int argc, char *argv[]) ...@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
/* FIXME: Check lock length */ /* FIXME: Check lock length */
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false); new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
TDB_USED_MAGIC, false);
ok1(new_off != TDB_OFF_ERR); ok1(new_off != TDB_OFF_ERR);
/* We should be able to add it now. */ /* We should be able to add it now. */
...@@ -225,7 +226,8 @@ int main(int argc, char *argv[]) ...@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
/* We should be able to add it now. */ /* We should be able to add it now. */
/* Allocate a new record. */ /* Allocate a new record. */
new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false); new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
TDB_USED_MAGIC, false);
ok1(new_off != TDB_OFF_ERR); ok1(new_off != TDB_OFF_ERR);
ok1(add_to_hash(tdb, &h, new_off) == 0); ok1(add_to_hash(tdb, &h, new_off) == 0);
......
#include <ccan/tdb2/tdb.c>
#include <ccan/tdb2/free.c>
#include <ccan/tdb2/lock.c>
#include <ccan/tdb2/io.c>
#include <ccan/tdb2/hash.c>
#include <ccan/tdb2/transaction.c>
#include <ccan/tdb2/traverse.c>
#include <ccan/tdb2/check.c>
#include <ccan/tap/tap.h>
#include "logging.h"
static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
{
return 0;
}
static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
{
if (p)
return tdb_delete(tdb, key);
return 0;
}
/* Hash-overload regression test: badhash maps every key to the same
 * group, so stores quickly spill into chains/subhashes.  Verify that
 * store, fetch, traverse and delete all keep working in that state,
 * for every flag combination.  The ok1() count must match
 * plan_tests() exactly. */
int main(int argc, char *argv[])
{
	unsigned int i, j;
	struct tdb_context *tdb;
	/* key and dbuf both alias j: each store inserts key=j, data=j. */
	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
	/* Install the all-collisions hash via the attribute chain. */
	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
						.hash_fn = badhash } };
	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
			TDB_NOMMAP|TDB_CONVERT,
	};

	hattr.base.next = &tap_log_attr;

	plan_tests(5395);
	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
		struct tdb_data d;

		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
		ok1(tdb);
		if (!tdb)
			continue;

		/* Fill a group. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Now store one last value: should form chain. */
		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Check we can find them all.
		 * NOTE(review): fetched buffers are never freed; presumably
		 * acceptable for a short-lived test process. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}

		/* Now add a *lot* more. */
		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
		     j < (16 << TDB_HASH_GROUP_BITS);
		     j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Traverse through them.  j is still the total count. */
		ok1(tdb_traverse(tdb, trav, NULL) == j);

		/* Empty the first chain-worth. */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
			ok1(tdb_delete(tdb, key) == 0);

		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* The remaining records must still be fetchable. */
		for (j = (1 << TDB_HASH_GROUP_BITS);
		     j < (16 << TDB_HASH_GROUP_BITS);
		     j++) {
			d = tdb_fetch(tdb, key);
			ok1(d.dsize == sizeof(j));
			ok1(d.dptr != NULL);
			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
		}

		/* Traverse through them. */
		ok1(tdb_traverse(tdb, trav, NULL)
		    == (15 << TDB_HASH_GROUP_BITS));

		/* Re-add */
		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
		}
		ok1(tdb_check(tdb, NULL, NULL) == 0);

		/* Now try deleting as we go (non-NULL p => delete mode). */
		ok1(tdb_traverse(tdb, trav, trav)
		    == (16 << TDB_HASH_GROUP_BITS));
		ok1(tdb_check(tdb, NULL, NULL) == 0);
		/* Everything gone: the final traverse must see nothing. */
		ok1(tdb_traverse(tdb, trav, NULL) == 0);
		tdb_close(tdb);
	}

	ok1(tap_log_messages == 0);
	return exit_status();
}
...@@ -9,13 +9,13 @@ ...@@ -9,13 +9,13 @@
#include <err.h> #include <err.h>
#include "logging.h" #include "logging.h"
static bool empty_freelist(struct tdb_context *tdb) static bool empty_freetable(struct tdb_context *tdb)
{ {
struct tdb_freelist free; struct tdb_freetable free;
unsigned int i; unsigned int i;
/* Now, free list should be completely exhausted in zone 0 */ /* Now, free table should be completely exhausted in zone 0 */
if (tdb_read_convert(tdb, tdb->flist_off, &free, sizeof(free)) != 0) if (tdb_read_convert(tdb, tdb->ftable_off, &free, sizeof(free)) != 0)
abort(); abort();
for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) { for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) {
...@@ -50,26 +50,26 @@ int main(int argc, char *argv[]) ...@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
if (!tdb) if (!tdb)
continue; continue;
ok1(empty_freelist(tdb)); ok1(empty_freetable(tdb));
/* Need some hash lock for expand. */ /* Need some hash lock for expand. */
ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
/* Create some free space. */ /* Create some free space. */
ok1(tdb_expand(tdb, 1) == 0); ok1(tdb_expand(tdb, 1) == 0);
ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(!empty_freelist(tdb)); ok1(!empty_freetable(tdb));
size = tdb->map_size; size = tdb->map_size;
/* Insert minimal-length records until we expand. */ /* Insert minimal-length records until we expand. */
for (j = 0; tdb->map_size == size; j++) { for (j = 0; tdb->map_size == size; j++) {
was_empty = empty_freelist(tdb); was_empty = empty_freetable(tdb);
if (tdb_store(tdb, k, k, TDB_INSERT) != 0) if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
err(1, "Failed to store record %i", j); err(1, "Failed to store record %i", j);
} }
/* Would have been empty before expansion, but no longer. */ /* Would have been empty before expansion, but no longer. */
ok1(was_empty); ok1(was_empty);
ok1(!empty_freelist(tdb)); ok1(!empty_freetable(tdb));
tdb_close(tdb); tdb_close(tdb);
} }
......
...@@ -22,11 +22,11 @@ int main(int argc, char *argv[]) ...@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
data.dsize = 5; data.dsize = 5;
key.dsize = 5; key.dsize = 5;
/* Create a TDB with three free lists. */ /* Create a TDB with three free tables. */
layout = new_tdb_layout(NULL); layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_freelist(layout); tdb_layout_add_freetable(layout);
tdb_layout_add_free(layout, 80, 0); tdb_layout_add_free(layout, 80, 0);
/* Used record prevent coalescing. */ /* Used record prevent coalescing. */
tdb_layout_add_used(layout, key, data, 6); tdb_layout_add_used(layout, key, data, 6);
...@@ -40,24 +40,28 @@ int main(int argc, char *argv[]) ...@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
tdb = tdb_layout_get(layout); tdb = tdb_layout_get(layout);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[3].base.off); ok1(off == layout->elem[3].base.off);
ok1(tdb->flist_off == layout->elem[0].base.off); ok1(tdb->ftable_off == layout->elem[0].base.off);
off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[5].base.off); ok1(off == layout->elem[5].base.off);
ok1(tdb->flist_off == layout->elem[1].base.off); ok1(tdb->ftable_off == layout->elem[1].base.off);
off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[7].base.off); ok1(off == layout->elem[7].base.off);
ok1(tdb->flist_off == layout->elem[2].base.off); ok1(tdb->ftable_off == layout->elem[2].base.off);
off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, 0); off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
TDB_USED_MAGIC, 0);
ok1(off == layout->elem[9].base.off); ok1(off == layout->elem[9].base.off);
ok1(tdb->flist_off == layout->elem[0].base.off); ok1(tdb->ftable_off == layout->elem[0].base.off);
/* Now we fail. */ /* Now we fail. */
off = get_free(tdb, 0, 0, 1, 0); off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
ok1(off == 0); ok1(off == 0);
tdb_close(tdb); tdb_close(tdb);
......
...@@ -13,7 +13,7 @@ static int log_count = 0; ...@@ -13,7 +13,7 @@ static int log_count = 0;
/* Normally we get a log when setting random seed. */ /* Normally we get a log when setting random seed. */
static void my_log_fn(struct tdb_context *tdb, static void my_log_fn(struct tdb_context *tdb,
enum tdb_debug_level level, void *priv, enum tdb_debug_level level, void *priv,
const char *fmt, ...) const char *message)
{ {
log_count++; log_count++;
} }
......
...@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) ...@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
td->high = val; td->high = val;
if (td->delete) { if (td->delete) {
if (tdb_delete(tdb, key) != 0) { if (tdb_delete(tdb, key) != 0) {
td->delete_error = tdb_error(tdb); td->delete_error = tdb_error(tdb);
return -1; return -1;
...@@ -120,7 +119,7 @@ int main(int argc, char *argv[]) ...@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
hattr.base.next = &tap_log_attr; hattr.base.next = &tap_log_attr;
plan_tests(sizeof(flags) / sizeof(flags[0]) * 50 + 1); plan_tests(sizeof(flags) / sizeof(flags[0]) * 53 + 1);
for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-traverse.tdb", flags[i], tdb = tdb_open("run-traverse.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
...@@ -182,6 +181,7 @@ int main(int argc, char *argv[]) ...@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
ok1(td.low <= NUM_RECORDS / 2); ok1(td.low <= NUM_RECORDS / 2);
ok1(td.high > NUM_RECORDS / 2); ok1(td.high > NUM_RECORDS / 2);
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(tap_log_messages == 0);
/* Growing traverse. Expect failure on r/o traverse. */ /* Growing traverse. Expect failure on r/o traverse. */
tgd.calls = 0; tgd.calls = 0;
...@@ -193,6 +193,8 @@ int main(int argc, char *argv[]) ...@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
ok1(tgd.error == TDB_ERR_RDONLY); ok1(tgd.error == TDB_ERR_RDONLY);
ok1(tgd.calls == 1); ok1(tgd.calls == 1);
ok1(!tgd.mismatch); ok1(!tgd.mismatch);
ok1(tap_log_messages == 1);
tap_log_messages = 0;
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Deleting traverse. Expect failure on r/o traverse. */ /* Deleting traverse. Expect failure on r/o traverse. */
...@@ -209,6 +211,8 @@ int main(int argc, char *argv[]) ...@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
ok1(!td.mismatch); ok1(!td.mismatch);
ok1(td.calls == 1); ok1(td.calls == 1);
ok1(td.low == td.high); ok1(td.low == td.high);
ok1(tap_log_messages == 1);
tap_log_messages = 0;
ok1(tdb_check(tdb, NULL, NULL) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0);
/* Deleting traverse (delete everything). */ /* Deleting traverse (delete everything). */
......
OBJS:=../../tdb2.o ../../hash.o ../../tally.o OBJS:=../../tdb2.o ../../hash.o ../../tally.o
CFLAGS:=-I../../.. -Wall -g #-g -O3 #-g -pg CFLAGS:=-I../../.. -Wall -g -O3 #-g -pg
LDFLAGS:=-L../../.. LDFLAGS:=-L../../..
default: tdbtorture tdbtool mktdb default: tdbtorture tdbtool mktdb speed
tdbtorture: tdbtorture.c $(OBJS) tdbtorture: tdbtorture.c $(OBJS)
tdbtool: tdbtool.c $(OBJS) tdbtool: tdbtool.c $(OBJS)
mktdb: mktdb.c $(OBJS) mktdb: mktdb.c $(OBJS)
speed: speed.c $(OBJS)
clean: clean:
rm -f tdbtorture tdbtool mktdb rm -f tdbtorture tdbtool mktdb speed
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb2/tdb2.h>
/* Nanoseconds per operation over the interval [start, stop].
 *
 * @start: wall-clock time before the benchmark loop
 * @stop:  wall-clock time after the benchmark loop
 * @num:   number of operations performed in that interval
 *
 * Returns the (truncated) average cost of one operation in ns. */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	/* Compute the elapsed microseconds directly in floating point.
	 * The original integer form (tv_sec * 1000000 + tv_usec) can
	 * overflow time_t for long runs; doubles cannot, and a possibly
	 * negative tv_usec difference folds in correctly without
	 * needing the nonstandard timersub() normalization. */
	double usec = (double)(stop->tv_sec - start->tv_sec) * 1000000.0
		+ (double)(stop->tv_usec - start->tv_usec);

	/* usec / ops * 1000 = nanoseconds per operation. */
	return usec / num * 1000;
}
/* Current size in bytes of the benchmark database, or (size_t)-1 if
 * /tmp/speed.tdb cannot be stat'ed (e.g. running with --internal). */
static size_t file_size(void)
{
	struct stat st;

	if (stat("/tmp/speed.tdb", &st) == 0)
		return (size_t)st.st_size;
	return (size_t)-1;
}
/* Traverse callback: adds each record's int payload into the int that
 * p points at.  Since every stored value is its own key, the traverse
 * total can be checked against the arithmetic series sum. */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	*sum += *(int *)data.dptr;
	return 0;
}
/* Print every counter from the stats attribute, then zero them all so
 * the next benchmark stage reports only its own activity. */
static void dump_and_clear_stats(struct tdb_attribute_stats *stats)
{
	printf("allocs = %llu\n",
	       (unsigned long long)stats->allocs);
	printf("  alloc_subhash = %llu\n",
	       (unsigned long long)stats->alloc_subhash);
	printf("  alloc_chain = %llu\n",
	       (unsigned long long)stats->alloc_chain);
	printf("  alloc_bucket_exact = %llu\n",
	       (unsigned long long)stats->alloc_bucket_exact);
	printf("  alloc_bucket_max = %llu\n",
	       (unsigned long long)stats->alloc_bucket_max);
	printf("  alloc_leftover = %llu\n",
	       (unsigned long long)stats->alloc_leftover);
	printf("  alloc_coalesce_tried = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_tried);
	printf("    alloc_coalesce_lockfail = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_lockfail);
	printf("    alloc_coalesce_race = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_race);
	printf("    alloc_coalesce_succeeded = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_succeeded);
	printf("       alloc_coalesce_num_merged = %llu\n",
	       (unsigned long long)stats->alloc_coalesce_num_merged);
	printf("compares = %llu\n",
	       (unsigned long long)stats->compares);
	printf("  compare_wrong_bucket = %llu\n",
	       (unsigned long long)stats->compare_wrong_bucket);
	printf("  compare_wrong_offsetbits = %llu\n",
	       (unsigned long long)stats->compare_wrong_offsetbits);
	printf("  compare_wrong_keylen = %llu\n",
	       (unsigned long long)stats->compare_wrong_keylen);
	printf("  compare_wrong_rechash = %llu\n",
	       (unsigned long long)stats->compare_wrong_rechash);
	printf("  compare_wrong_keycmp = %llu\n",
	       (unsigned long long)stats->compare_wrong_keycmp);
	printf("expands = %llu\n",
	       (unsigned long long)stats->expands);
	printf("frees = %llu\n",
	       (unsigned long long)stats->frees);
	printf("locks = %llu\n",
	       (unsigned long long)stats->locks);
	printf("   lock_lowlevel = %llu\n",
	       (unsigned long long)stats->lock_lowlevel);
	printf("   lock_nonblock = %llu\n",
	       (unsigned long long)stats->lock_nonblock);

	/* Now clear.  The counters sit contiguously from 'allocs' to the
	 * end of the struct, so a single memset over that span wipes them
	 * all while leaving the attribute header fields before 'allocs'
	 * (base, size) intact. */
	memset(&stats->allocs, 0, (char *)(stats+1) - (char *)&stats->allocs);
}
/* Benchmark driver: runs a fixed sequence of stages (add, find, miss,
 * traverse, delete, re-add, append, churn) over /tmp/speed.tdb,
 * printing ns/op and file size after each stage.
 *
 * Usage: speed [--internal] [--transaction] [--stats] [num] [stopat]
 * (flags must appear in that order; num = records per stage,
 * stopat = stage number after which to exit). */
int main(int argc, char *argv[])
{
	/* stopat = -1 wraps to UINT_MAX, so by default we never stop
	 * early at a stage boundary. */
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	bool transaction = false;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	union tdb_attribute seed, stats;

	/* Try to keep benchmarks even. */
	seed.base.attr = TDB_ATTRIBUTE_SEED;
	seed.base.next = NULL;
	seed.seed.seed = 0;

	memset(&stats, 0, sizeof(stats));
	stats.base.attr = TDB_ATTRIBUTE_STATS;
	stats.base.next = NULL;
	stats.stats.size = sizeof(stats);

	/* Flags are positional: each one shifts argv so the next check
	 * looks at the following argument. */
	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
		/* Chain the stats attribute after the seed one. */
		seed.base.next = &stats;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
		       0600, &seed);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* key (and data) alias the loop counter i: setting i selects
	 * which record each tdb call operates on. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}
	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Add 1000 records. */
	printf("Adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Finding 1000 records.
	 * NOTE(review): the buffers returned by tdb_fetch() are never
	 * freed; presumably deliberate to keep free() out of the timed
	 * loop, but verify for long runs. */
	printf("Finding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Missing 1000 records: keys [num, 2*num) were never stored,
	 * so every fetch must come back empty. */
	printf("Missing %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num*2; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (dptr)
			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Traverse 1000 records: values are 0..num-1, so the tally must
	 * equal the arithmetic series sum (num-1)*num/2. */
	printf("Traversing %u records: ", num); fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	if (i != (num - 1) * (num / 2))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Delete 1000 records (not in order): 100003 is coprime with
	 * num, so (j + 100003) % num visits every key once. */
	printf("Deleting %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Re-add 1000 records (not in order). */
	printf("Re-adding %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Append 1000 records: doubles each record's data.
	 * NOTE(review): this stage, unlike the others, does not dump
	 * stats before bumping the stage counter — confirm whether
	 * intentional. */
	printf("Appending %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());

	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Churn 1000 records: not in order!  Deletes key i and inserts
	 * fresh key i+num, shifting the key space up by num. */
	printf("Churning %u records: ", num); fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);

	if (++stage == stopat)
		exit(0);

	return 0;
}
...@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf, ...@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
return 0; return 0;
fail: fail:
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "transaction_read: failed at off=%zu len=%zu",
"transaction_read: failed at off=%llu len=%llu\n", (size_t)off, (size_t)len);
(long long)off, (long long)len);
tdb->transaction->transaction_error = 1; tdb->transaction->transaction_error = 1;
return -1; return -1;
} }
...@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
/* Only a commit is allowed on a prepared transaction */ /* Only a commit is allowed on a prepared transaction */
if (tdb->transaction->prepared) { if (tdb->transaction->prepared) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"transaction_write: transaction already prepared," "transaction_write: transaction already prepared,"
" write not allowed\n"); " write not allowed");
tdb->transaction->transaction_error = 1; goto fail;
return -1;
} }
/* break it up into block sized chunks */ /* break it up into block sized chunks */
...@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
(blk+1)*sizeof(uint8_t *)); (blk+1)*sizeof(uint8_t *));
} }
if (new_blocks == NULL) { if (new_blocks == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_write: failed to allocate");
goto fail; goto fail;
} }
memset(&new_blocks[tdb->transaction->num_blocks], 0, memset(&new_blocks[tdb->transaction->num_blocks], 0,
...@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if (tdb->transaction->blocks[blk] == NULL) { if (tdb->transaction->blocks[blk] == NULL) {
tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1); tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
if (tdb->transaction->blocks[blk] == NULL) { if (tdb->transaction->blocks[blk] == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->transaction->transaction_error = 1; "transaction_write: failed to allocate");
return -1; goto fail;
} }
if (tdb->transaction->old_map_size > blk * getpagesize()) { if (tdb->transaction->old_map_size > blk * getpagesize()) {
tdb_len_t len2 = getpagesize(); tdb_len_t len2 = getpagesize();
...@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(), if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(),
tdb->transaction->blocks[blk], tdb->transaction->blocks[blk],
len2) != 0) { len2) != 0) {
tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_write: failed to"
" read old block: %s",
strerror(errno));
SAFE_FREE(tdb->transaction->blocks[blk]); SAFE_FREE(tdb->transaction->blocks[blk]);
goto fail; goto fail;
} }
...@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, ...@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
return 0; return 0;
fail: fail:
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"transaction_write: failed at off=%llu len=%llu\n",
(long long)((blk*getpagesize()) + off),
(long long)len);
tdb->transaction->transaction_error = 1; tdb->transaction->transaction_error = 1;
return -1; return -1;
} }
...@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) ...@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return 0; return 0;
} }
tdb->ecode = TDB_ERR_IO; tdb->ecode = TDB_ERR_IO;
if (!probe) {
tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
"tdb_oob len %lld beyond transaction size %lld",
(long long)len,
(long long)tdb->map_size);
}
return -1; return -1;
} }
...@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition) ...@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
} }
static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off, static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
size_t len) size_t len, bool write)
{ {
/* FIXME */ size_t blk = off / getpagesize(), end_blk;
return NULL;
/* This is wrong for zero-length blocks, but will fail gracefully */
end_blk = (off + len - 1) / getpagesize();
/* Can only do direct if in single block and we've already copied. */
if (write) {
if (blk != end_blk)
return NULL;
if (blk >= tdb->transaction->num_blocks)
return NULL;
if (tdb->transaction->blocks[blk] == NULL)
return NULL;
return tdb->transaction->blocks[blk] + off % getpagesize();
}
/* Single which we have copied? */
if (blk == end_blk
&& blk < tdb->transaction->num_blocks
&& tdb->transaction->blocks[blk])
return tdb->transaction->blocks[blk] + off % getpagesize();
/* Otherwise must be all not copied. */
while (blk < end_blk) {
if (blk >= tdb->transaction->num_blocks)
break;
if (tdb->transaction->blocks[blk])
return NULL;
blk++;
}
return tdb->transaction->io_methods->direct(tdb, off, len, write);
} }
static const struct tdb_methods transaction_methods = { static const struct tdb_methods transaction_methods = {
...@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t ...@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
} }
if (fsync(tdb->fd) != 0) { if (fsync(tdb->fd) != 0) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction: fsync failed: %s",
"tdb_transaction: fsync failed\n"); strerror(errno));
return -1; return -1;
} }
#ifdef MS_SYNC #ifdef MS_SYNC
...@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t ...@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
tdb_off_t moffset = offset & ~(getpagesize()-1); tdb_off_t moffset = offset & ~(getpagesize()-1);
if (msync(moffset + (char *)tdb->map_ptr, if (msync(moffset + (char *)tdb->map_ptr,
length + (offset - moffset), MS_SYNC) != 0) { length + (offset - moffset), MS_SYNC) != 0) {
tdb->ecode = TDB_ERR_IO; tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction: msync failed: %s",
"tdb_transaction: msync failed - %s\n", strerror(errno));
strerror(errno));
return -1; return -1;
} }
} }
...@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) ...@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
int i; int i;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_cancel: no transaction");
"tdb_transaction_cancel: no transaction\n");
return; return;
} }
...@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) ...@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
&invalid, sizeof(invalid)) == -1 || &invalid, sizeof(invalid)) == -1 ||
transaction_sync(tdb, tdb->transaction->magic_offset, transaction_sync(tdb, tdb->transaction->magic_offset,
sizeof(invalid)) == -1) { sizeof(invalid)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_cancel: failed to remove" "tdb_transaction_cancel: failed to remove"
" recovery magic\n"); " recovery magic");
} }
} }
...@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb) ...@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
{ {
/* some sanity checks */ /* some sanity checks */
if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) { if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction" " on a read-only or internal db");
" on a read-only or internal db\n");
return -1; return -1;
} }
/* cope with nested tdb_transaction_start() calls */ /* cope with nested tdb_transaction_start() calls */
if (tdb->transaction != NULL) { if (tdb->transaction != NULL) {
tdb->ecode = TDB_ERR_NESTING; tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
"tdb_transaction_start:"
" already inside transaction");
return -1; return -1;
} }
...@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb) ...@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
/* the caller must not have any locks when starting a /* the caller must not have any locks when starting a
transaction as otherwise we'll be screwed by lack transaction as otherwise we'll be screwed by lack
of nested locks in posix */ of nested locks in posix */
tdb->ecode = TDB_ERR_LOCK; tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction" " with locks held");
" with locks held\n");
return -1; return -1;
} }
tdb->transaction = (struct tdb_transaction *) tdb->transaction = (struct tdb_transaction *)
calloc(sizeof(struct tdb_transaction), 1); calloc(sizeof(struct tdb_transaction), 1);
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
"tdb_transaction_start: cannot allocate");
return -1; return -1;
} }
...@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
if (recovery_head == TDB_OFF_ERR) { if (recovery_head == TDB_OFF_ERR) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to read recovery head\n"); " failed to read recovery head");
return -1; return -1;
} }
if (recovery_head != 0) { if (recovery_head != 0) {
if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) { if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to read recovery record\n"); " failed to read recovery record");
return -1; return -1;
} }
tdb_convert(tdb, &rec, sizeof(rec)); tdb_convert(tdb, &rec, sizeof(rec));
...@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
us an area that is being currently used (as of the start of us an area that is being currently used (as of the start of
the transaction) */ the transaction) */
if (recovery_head != 0) { if (recovery_head != 0) {
add_stat(tdb, frees, 1);
if (add_free_record(tdb, recovery_head, if (add_free_record(tdb, recovery_head,
sizeof(rec) + rec.max_len) != 0) { sizeof(rec) + rec.max_len) != 0) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to free previous recovery area\n"); " failed to free previous recovery area");
return -1; return -1;
} }
} }
...@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
sizeof(rec) + *recovery_max_size; sizeof(rec) + *recovery_max_size;
tdb->map_size = tdb->transaction->old_map_size; tdb->map_size = tdb->transaction->old_map_size;
if (methods->expand_file(tdb, addition) == -1) { if (methods->expand_file(tdb, addition) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to create recovery area\n"); " failed to create recovery area");
return -1; return -1;
} }
...@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, ...@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
tdb_convert(tdb, &recovery_head, sizeof(recovery_head)); tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
if (methods->write(tdb, offsetof(struct tdb_header, recovery), if (methods->write(tdb, offsetof(struct tdb_header, recovery),
&recovery_head, sizeof(tdb_off_t)) == -1) { &recovery_head, sizeof(tdb_off_t)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_recovery_allocate:" "tdb_recovery_allocate:"
" failed to write recovery head\n"); " failed to write recovery head");
return -1; return -1;
} }
transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
...@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
data = (unsigned char *)malloc(recovery_size + sizeof(*rec)); data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
if (data == NULL) { if (data == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
"transaction_setup_recovery: cannot allocate");
return -1; return -1;
} }
...@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
continue; continue;
} }
if (offset + length > tdb->map_size) { if (offset + length > tdb->map_size) {
tdb->ecode = TDB_ERR_CORRUPT; tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction_setup_recovery:"
"tdb_transaction_setup_recovery:" " transaction data over new region boundary");
" transaction data over new region boundary\n");
free(data); free(data);
return -1; return -1;
} }
...@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
/* write the recovery data to the recovery area */ /* write the recovery data to the recovery area */
if (methods->write(tdb, recovery_offset, data, if (methods->write(tdb, recovery_offset, data,
sizeof(*rec) + recovery_size) == -1) { sizeof(*rec) + recovery_size) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_setup_recovery:" "tdb_transaction_setup_recovery:"
" failed to write recovery data\n"); " failed to write recovery data");
free(data); free(data);
return -1; return -1;
} }
...@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb, ...@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
magic); magic);
if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_setup_recovery:" "tdb_transaction_setup_recovery:"
" failed to write recovery magic\n"); " failed to write recovery magic");
return -1; return -1;
} }
transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)); transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
...@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
const struct tdb_methods *methods; const struct tdb_methods *methods;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_prepare_commit: no transaction");
"tdb_transaction_prepare_commit: no transaction\n");
return -1; return -1;
} }
if (tdb->transaction->prepared) { if (tdb->transaction->prepared) {
tdb->ecode = TDB_ERR_EINVAL;
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" transaction already prepared\n"); " transaction already prepared");
return -1; return -1;
} }
if (tdb->transaction->transaction_error) { if (tdb->transaction->transaction_error) {
tdb->ecode = TDB_ERR_IO;
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" transaction error pending\n"); " transaction error pending");
return -1; return -1;
} }
...@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* upgrade the main transaction lock region to a write lock */ /* upgrade the main transaction lock region to a write lock */
if (tdb_allrecord_upgrade(tdb) == -1) { if (tdb_allrecord_upgrade(tdb) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to upgrade hash locks\n"); " failed to upgrade hash locks");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* get the open lock - this prevents new users attaching to the database /* get the open lock - this prevents new users attaching to the database
during the commit */ during the commit */
if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) { if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to get open lock\n"); " failed to get open lock");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
if (!(tdb->flags & TDB_NOSYNC)) { if (!(tdb->flags & TDB_NOSYNC)) {
/* write the recovery data to the end of the file */ /* write the recovery data to the end of the file */
if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) { if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" failed to setup recovery data\n"); " failed to setup recovery data");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) ...@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* Restore original map size for tdb_expand_file */ /* Restore original map size for tdb_expand_file */
tdb->map_size = tdb->transaction->old_map_size; tdb->map_size = tdb->transaction->old_map_size;
if (methods->expand_file(tdb, add) == -1) { if (methods->expand_file(tdb, add) == -1) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
"tdb_transaction_prepare_commit:" "tdb_transaction_prepare_commit:"
" expansion failed\n"); " expansion failed");
_tdb_transaction_cancel(tdb); _tdb_transaction_cancel(tdb);
return -1; return -1;
} }
...@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb) ...@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
int i; int i;
if (tdb->transaction == NULL) { if (tdb->transaction == NULL) {
tdb->ecode = TDB_ERR_EINVAL; tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_transaction_commit: no transaction");
"tdb_transaction_commit: no transaction\n");
return -1; return -1;
} }
tdb_trace(tdb, "tdb_transaction_commit"); tdb_trace(tdb, "tdb_transaction_commit");
if (tdb->transaction->transaction_error) { if (tdb->transaction->transaction_error) {
tdb->ecode = TDB_ERR_IO;
tdb_transaction_cancel(tdb); tdb_transaction_cancel(tdb);
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
"tdb_transaction_commit: transaction error pending\n"); "tdb_transaction_commit:"
" transaction error pending");
return -1; return -1;
} }
...@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb) ...@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
if (methods->write(tdb, offset, tdb->transaction->blocks[i], if (methods->write(tdb, offset, tdb->transaction->blocks[i],
length) == -1) { length) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_commit:" "tdb_transaction_commit:"
" write failed during commit\n"); " write failed during commit");
/* we've overwritten part of the data and /* we've overwritten part of the data and
possibly expanded the file, so we need to possibly expanded the file, so we need to
...@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* find the recovery area */ /* find the recovery area */
recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
if (recovery_head == TDB_OFF_ERR) { if (recovery_head == TDB_OFF_ERR) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery head\n"); " failed to read recovery head");
return -1; return -1;
} }
...@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* read the recovery record */ /* read the recovery record */
if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) { if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery record\n"); " failed to read recovery record");
return -1; return -1;
} }
...@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
} }
if (tdb->read_only) { if (tdb->read_only) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" attempt to recover read only database\n"); " attempt to recover read only database");
tdb->ecode = TDB_ERR_CORRUPT;
return -1; return -1;
} }
...@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
data = (unsigned char *)malloc(rec.len); data = (unsigned char *)malloc(rec.len);
if (data == NULL) { if (data == NULL) {
tdb->ecode = TDB_ERR_OOM; tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "tdb_transaction_recover:"
"tdb_transaction_recover:" " failed to allocate recovery data");
" failed to allocate recovery data\n");
return -1; return -1;
} }
/* read the full recovery data */ /* read the full recovery data */
if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data, if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data,
rec.len) == -1) { rec.len) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to read recovery data\n"); " failed to read recovery data");
return -1; return -1;
} }
...@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if (tdb->methods->write(tdb, ofs, p, len) == -1) { if (tdb->methods->write(tdb, ofs, p, len) == -1) {
free(data); free(data);
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to recover %zu bytes at offset %zu\n", " failed to recover %zu bytes at offset %zu",
(size_t)len, (size_t)ofs); (size_t)len, (size_t)ofs);
return -1; return -1;
} }
...@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
free(data); free(data);
if (transaction_sync(tdb, 0, tdb->map_size) == -1) { if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover: failed to sync recovery\n"); "tdb_transaction_recover: failed to sync recovery");
return -1; return -1;
} }
...@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if (recovery_eof <= recovery_head) { if (recovery_eof <= recovery_head) {
if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0) if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0)
== -1) { == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to remove recovery head\n"); " failed to remove recovery head");
return -1; return -1;
} }
} }
...@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb) ...@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
recovery_head recovery_head
+ offsetof(struct tdb_recovery_record, magic), + offsetof(struct tdb_recovery_record, magic),
TDB_RECOVERY_INVALID_MAGIC) == -1) { TDB_RECOVERY_INVALID_MAGIC) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover:" "tdb_transaction_recover:"
" failed to remove recovery magic\n"); " failed to remove recovery magic");
return -1; return -1;
} }
if (transaction_sync(tdb, 0, recovery_eof) == -1) { if (transaction_sync(tdb, 0, recovery_eof) == -1) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
"tdb_transaction_recover: failed to sync2 recovery\n"); "tdb_transaction_recover: failed to sync2 recovery");
return -1; return -1;
} }
tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
"tdb_transaction_recover: recovered %zu byte database\n", "tdb_transaction_recover: recovered %zu byte database",
(size_t)recovery_eof); (size_t)recovery_eof);
/* all done */ /* all done */
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment