Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
ccan
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
mirror
ccan
Commits
51a56b52
Commit
51a56b52
authored
Dec 01, 2010
by
Rusty Russell
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'tdb2'
parents
451d97ad
a42bba8e
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
30 changed files
with
3815 additions
and
1543 deletions
+3815
-1543
ccan/tdb/tools/Makefile
ccan/tdb/tools/Makefile
+2
-2
ccan/tdb/tools/speed.c
ccan/tdb/tools/speed.c
+247
-0
ccan/tdb2/check.c
ccan/tdb2/check.c
+310
-162
ccan/tdb2/doc/design.lyx
ccan/tdb2/doc/design.lyx
+249
-278
ccan/tdb2/doc/design.lyx,v
ccan/tdb2/doc/design.lyx,v
+1257
-281
ccan/tdb2/doc/design.pdf
ccan/tdb2/doc/design.pdf
+0
-0
ccan/tdb2/doc/design.txt
ccan/tdb2/doc/design.txt
+149
-68
ccan/tdb2/free.c
ccan/tdb2/free.c
+187
-123
ccan/tdb2/hash.c
ccan/tdb2/hash.c
+187
-46
ccan/tdb2/io.c
ccan/tdb2/io.c
+114
-92
ccan/tdb2/lock.c
ccan/tdb2/lock.c
+77
-97
ccan/tdb2/private.h
ccan/tdb2/private.h
+87
-36
ccan/tdb2/summary.c
ccan/tdb2/summary.c
+63
-36
ccan/tdb2/tdb.c
ccan/tdb2/tdb.c
+148
-91
ccan/tdb2/tdb2.h
ccan/tdb2/tdb2.h
+34
-5
ccan/tdb2/test/layout.c
ccan/tdb2/test/layout.c
+41
-38
ccan/tdb2/test/layout.h
ccan/tdb2/test/layout.h
+6
-6
ccan/tdb2/test/logging.c
ccan/tdb2/test/logging.c
+2
-15
ccan/tdb2/test/logging.h
ccan/tdb2/test/logging.h
+1
-1
ccan/tdb2/test/run-001-encode.c
ccan/tdb2/test/run-001-encode.c
+8
-6
ccan/tdb2/test/run-03-coalesce.c
ccan/tdb2/test/run-03-coalesce.c
+11
-11
ccan/tdb2/test/run-04-basichash.c
ccan/tdb2/test/run-04-basichash.c
+4
-2
ccan/tdb2/test/run-25-hashoverload.c
ccan/tdb2/test/run-25-hashoverload.c
+117
-0
ccan/tdb2/test/run-30-exhaust-before-expand.c
ccan/tdb2/test/run-30-exhaust-before-expand.c
+8
-8
ccan/tdb2/test/run-50-multiple-freelists.c
ccan/tdb2/test/run-50-multiple-freelists.c
+17
-13
ccan/tdb2/test/run-seed.c
ccan/tdb2/test/run-seed.c
+1
-1
ccan/tdb2/test/run-traverse.c
ccan/tdb2/test/run-traverse.c
+6
-2
ccan/tdb2/tools/Makefile
ccan/tdb2/tools/Makefile
+4
-3
ccan/tdb2/tools/speed.c
ccan/tdb2/tools/speed.c
+331
-0
ccan/tdb2/transaction.c
ccan/tdb2/transaction.c
+147
-120
No files found.
ccan/tdb/tools/Makefile
View file @
51a56b52
...
@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
...
@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
CFLAGS
:=
-I
../../..
-Wall
-O3
#-g -pg
CFLAGS
:=
-I
../../..
-Wall
-O3
#-g -pg
LDFLAGS
:=
-L
../../..
LDFLAGS
:=
-L
../../..
default
:
replay_trace tdbtorture tdbdump tdbtool starvation mktdb
default
:
replay_trace tdbtorture tdbdump tdbtool starvation mktdb
speed
benchmark
:
replay_trace
benchmark
:
replay_trace
@
trap
"rm -f /tmp/trace.
$$$$
"
0
;
for
f
in
benchmarks/
*
.rz
;
do if
runzip
-k
$$
f
-o
/tmp/trace.
$$$$
&&
echo
-n
"
$$
f"
:
&&
./replay_trace
--quiet
-n
5 replay.tdb /tmp/trace.
$$$$
&&
rm
/tmp/trace.
$$$$
;
then
rm
-f
/tmp/trace.
$$$$
;
else
exit
1
;
fi
;
done
@
trap
"rm -f /tmp/trace.
$$$$
"
0
;
for
f
in
benchmarks/
*
.rz
;
do if
runzip
-k
$$
f
-o
/tmp/trace.
$$$$
&&
echo
-n
"
$$
f"
:
&&
./replay_trace
--quiet
-n
5 replay.tdb /tmp/trace.
$$$$
&&
rm
/tmp/trace.
$$$$
;
then
rm
-f
/tmp/trace.
$$$$
;
else
exit
1
;
fi
;
done
...
@@ -30,4 +30,4 @@ check: replay_trace
...
@@ -30,4 +30,4 @@ check: replay_trace
@
sed
's/\(^[0-9]* traverse\) .*/\1fn/'
<
$^
>
$@
@
sed
's/\(^[0-9]* traverse\) .*/\1fn/'
<
$^
>
$@
clean
:
clean
:
rm
-f
replay_trace tdbtorture tdbdump tdbtool
*
.o
rm
-f
replay_trace tdbtorture tdbdump tdbtool
speed
*
.o
ccan/tdb/tools/speed.c
0 → 100644
View file @
51a56b52
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb/tdb.h>
/* Nanoseconds per operation */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	/*
	 * Compute the elapsed time entirely in floating point.  The
	 * original multiplied tv_sec by 1000000 in integer arithmetic
	 * before converting, which can overflow a 32-bit time_t/long for
	 * long intervals; doing the subtraction and scaling as doubles
	 * avoids that, and also drops the dependency on the non-standard
	 * BSD timersub() macro.  Floating point is more accurate here.
	 */
	double usec = (double)(stop->tv_sec - start->tv_sec) * 1000000.0
		+ (double)(stop->tv_usec - start->tv_usec);

	/* Per-operation cost, converted from microseconds to nanoseconds. */
	return usec / num * 1000;
}
/*
 * Size in bytes of the on-disk benchmark database, or (size_t)-1 if it
 * cannot be stat'ed (e.g. when running with TDB_INTERNAL, where no file
 * is created).
 */
static size_t file_size(void)
{
	struct stat st;

	if (stat("/tmp/speed.tdb", &st) == 0)
		return st.st_size;
	return -1;
}
/*
 * tdb_traverse() callback: treat each record's data as an int and add it
 * to the running total pointed to by p.  Always returns 0 so the
 * traversal continues over every record.
 */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	*sum += *(int *)data.dptr;
	return 0;
}
/*
 * Benchmark driver: runs a fixed sequence of stages (add, find, miss,
 * traverse, delete, re-add, append, churn) against /tmp/speed.tdb and
 * prints the per-operation cost and file size after each stage.
 *
 * Usage: speed [--internal] [--transaction] [num-records] [stop-stage]
 */
int main(int argc, char *argv[])
{
	/* stopat is unsigned, so the -1 default wraps to UINT_MAX,
	 * meaning "run every stage". */
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	bool transaction = false;

	/* Optional leading flags; each consumes one argv slot. */
	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", 100003, flags,
		       O_RDWR|O_CREAT|O_TRUNC, 0600);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* Key and data both alias the loop counter i: each store/fetch in
	 * the loops below implicitly uses the current value of i. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	/* Optional positional arguments: record count, then the stage
	 * number at which to exit early. */
	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Add 1000 records. */
	printf("Adding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Finding 1000 records. */
	printf("Finding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		int *dptr;

		/* NOTE(review): tdb_fetch() result is malloc'ed and never
		 * freed here — leaked, presumably acceptable for a
		 * short-lived benchmark. */
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Missing 1000 records: keys num..2*num-1 were never stored, so
	 * every fetch must fail. */
	printf("Missing %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num * 2; i++) {
		int *dptr;

		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (dptr)
			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Traverse 1000 records. */
	printf("Traversing %u records: ", num);
	fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	/* count_record summed the values 0..num-1:
	 * total = (num-1)*(num/2) for even num. */
	if (i != (num - 1) * (num / 2))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Delete 1000 records (not in order). */
	printf("Deleting %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		/* 100003 is coprime with num, so this visits every key
		 * exactly once, out of order. */
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Re-add 1000 records (not in order). */
	printf("Re-adding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Append 1000 records. */
	printf("Appending %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Churn 1000 records: not in order! */
	printf("Churning %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		/* Delete key (j+1000019)%num, then insert a fresh key in
		 * the next block of num values (i += num). */
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n", normalize(&start, &stop, num),
	       file_size());

	return 0;
}
ccan/tdb2/check.c
View file @
51a56b52
...
@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
...
@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
hash_test
=
TDB_HASH_MAGIC
;
hash_test
=
TDB_HASH_MAGIC
;
hash_test
=
tdb_hash
(
tdb
,
&
hash_test
,
sizeof
(
hash_test
));
hash_test
=
tdb_hash
(
tdb
,
&
hash_test
,
sizeof
(
hash_test
));
if
(
hdr
.
hash_test
!=
hash_test
)
{
if
(
hdr
.
hash_test
!=
hash_test
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"check: hash test %llu should be %llu
\n
"
,
"check: hash test %llu should be %llu
"
,
(
long
long
)
hdr
.
hash_test
,
(
long
long
)
hdr
.
hash_test
,
(
long
long
)
hash_test
);
(
long
long
)
hash_test
);
return
false
;
return
false
;
}
}
if
(
strcmp
(
hdr
.
magic_food
,
TDB_MAGIC_FOOD
)
!=
0
)
{
if
(
strcmp
(
hdr
.
magic_food
,
TDB_MAGIC_FOOD
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"check: bad magic '%.*s'
\n
"
,
"check: bad magic '%.*s'
"
,
(
unsigned
)
sizeof
(
hdr
.
magic_food
),
hdr
.
magic_food
);
(
unsigned
)
sizeof
(
hdr
.
magic_food
),
hdr
.
magic_food
);
return
false
;
return
false
;
}
}
*
recovery
=
hdr
.
recovery
;
*
recovery
=
hdr
.
recovery
;
if
(
*
recovery
)
{
if
(
*
recovery
)
{
if
(
*
recovery
<
sizeof
(
hdr
)
||
*
recovery
>
tdb
->
map_size
)
{
if
(
*
recovery
<
sizeof
(
hdr
)
||
*
recovery
>
tdb
->
map_size
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: invalid recovery offset %zu
\n
"
,
"tdb_check: invalid recovery offset %zu"
,
(
size_t
)
*
recovery
);
(
size_t
)
*
recovery
);
return
false
;
return
false
;
}
}
...
@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned
hprefix_bits
,
unsigned
hprefix_bits
,
tdb_off_t
used
[],
tdb_off_t
used
[],
size_t
num_used
,
size_t
num_used
,
size_t
*
num_found
);
size_t
*
num_found
,
int
(
*
check
)(
TDB_DATA
,
TDB_DATA
,
void
*
),
void
*
private_data
);
static
bool
check_hash_chain
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
uint64_t
hash
,
tdb_off_t
used
[],
size_t
num_used
,
size_t
*
num_found
,
int
(
*
check
)(
TDB_DATA
,
TDB_DATA
,
void
*
),
void
*
private_data
)
{
struct
tdb_used_record
rec
;
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
return
false
;
if
(
rec_magic
(
&
rec
)
!=
TDB_CHAIN_MAGIC
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash chain magic %llu"
,
(
long
long
)
rec_magic
(
&
rec
));
return
false
;
}
if
(
rec_data_length
(
&
rec
)
!=
sizeof
(
struct
tdb_chain
))
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash chain length %llu vs %zu"
,
(
long
long
)
rec_data_length
(
&
rec
),
sizeof
(
struct
tdb_chain
));
return
false
;
}
if
(
rec_key_length
(
&
rec
)
!=
0
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash chain key length %llu"
,
(
long
long
)
rec_key_length
(
&
rec
));
return
false
;
}
if
(
rec_hash
(
&
rec
)
!=
0
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash chain hash value %llu"
,
(
long
long
)
rec_hash
(
&
rec
));
return
false
;
}
off
+=
sizeof
(
rec
);
if
(
!
check_hash_tree
(
tdb
,
off
,
0
,
hash
,
64
,
used
,
num_used
,
num_found
,
check
,
private_data
))
return
false
;
off
=
tdb_read_off
(
tdb
,
off
+
offsetof
(
struct
tdb_chain
,
next
));
if
(
off
==
TDB_OFF_ERR
)
return
false
;
if
(
off
==
0
)
return
true
;
(
*
num_found
)
++
;
return
check_hash_chain
(
tdb
,
off
,
hash
,
used
,
num_used
,
num_found
,
check
,
private_data
);
}
static
bool
check_hash_record
(
struct
tdb_context
*
tdb
,
static
bool
check_hash_record
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
off
,
...
@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
...
@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
unsigned
hprefix_bits
,
unsigned
hprefix_bits
,
tdb_off_t
used
[],
tdb_off_t
used
[],
size_t
num_used
,
size_t
num_used
,
size_t
*
num_found
)
size_t
*
num_found
,
int
(
*
check
)(
TDB_DATA
,
TDB_DATA
,
void
*
),
void
*
private_data
)
{
{
struct
tdb_used_record
rec
;
struct
tdb_used_record
rec
;
if
(
hprefix_bits
>=
64
)
return
check_hash_chain
(
tdb
,
off
,
hprefix
,
used
,
num_used
,
num_found
,
check
,
private_data
);
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
return
false
;
return
false
;
if
(
rec_magic
(
&
rec
)
!=
TDB_HTABLE_MAGIC
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash table magic %llu"
,
(
long
long
)
rec_magic
(
&
rec
));
return
false
;
}
if
(
rec_data_length
(
&
rec
)
if
(
rec_data_length
(
&
rec
)
!=
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
)
{
!=
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash table length %llu vs %llu
\n
"
,
"tdb_check: Bad hash table length %llu vs %llu"
,
(
long
long
)
rec_data_length
(
&
rec
),
(
long
long
)
rec_data_length
(
&
rec
),
(
long
long
)
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
);
(
long
long
)
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
);
return
false
;
return
false
;
}
}
if
(
rec_key_length
(
&
rec
)
!=
0
)
{
if
(
rec_key_length
(
&
rec
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash table key length %llu
\n
"
,
"tdb_check: Bad hash table key length %llu"
,
(
long
long
)
rec_key_length
(
&
rec
));
(
long
long
)
rec_key_length
(
&
rec
));
return
false
;
return
false
;
}
}
if
(
rec_hash
(
&
rec
)
!=
0
)
{
if
(
rec_hash
(
&
rec
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad hash table hash value %llu
\n
"
,
"tdb_check: Bad hash table hash value %llu"
,
(
long
long
)
rec_hash
(
&
rec
));
(
long
long
)
rec_hash
(
&
rec
));
return
false
;
return
false
;
}
}
...
@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
...
@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
return
check_hash_tree
(
tdb
,
off
,
return
check_hash_tree
(
tdb
,
off
,
TDB_SUBLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
,
TDB_SUBLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
,
hprefix
,
hprefix_bits
,
hprefix
,
hprefix_bits
,
used
,
num_used
,
num_found
);
used
,
num_used
,
num_found
,
check
,
private_data
);
}
}
static
int
off_cmp
(
const
tdb_off_t
*
a
,
const
tdb_off_t
*
b
)
static
int
off_cmp
(
const
tdb_off_t
*
a
,
const
tdb_off_t
*
b
)
...
@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
unsigned
hprefix_bits
,
unsigned
hprefix_bits
,
tdb_off_t
used
[],
tdb_off_t
used
[],
size_t
num_used
,
size_t
num_used
,
size_t
*
num_found
)
size_t
*
num_found
,
int
(
*
check
)(
TDB_DATA
,
TDB_DATA
,
void
*
),
void
*
private_data
)
{
{
unsigned
int
g
,
b
;
unsigned
int
g
,
b
;
const
tdb_off_t
*
hash
;
const
tdb_off_t
*
hash
;
...
@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
off
=
group
[
b
]
&
TDB_OFF_MASK
;
off
=
group
[
b
]
&
TDB_OFF_MASK
;
p
=
asearch
(
&
off
,
used
,
num_used
,
off_cmp
);
p
=
asearch
(
&
off
,
used
,
num_used
,
off_cmp
);
if
(
!
p
)
{
if
(
!
p
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: Invalid offset %llu "
TDB_DEBUG_ERROR
,
"in hash
\n
"
,
"tdb_check: Invalid offset %llu "
(
long
long
)
off
);
"in hash"
,
(
long
long
)
off
);
goto
fail
;
goto
fail
;
}
}
/* Mark it invalid. */
/* Mark it invalid. */
*
p
^=
1
;
*
p
^=
1
;
(
*
num_found
)
++
;
(
*
num_found
)
++
;
if
(
hprefix_bits
==
64
)
{
/* Chained entries are unordered. */
if
(
is_subhash
(
group
[
b
]))
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Invalid chain"
" entry subhash"
);
goto
fail
;
}
h
=
hash_record
(
tdb
,
off
);
if
(
h
!=
hprefix
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"check: bad hash chain"
" placement"
" 0x%llx vs 0x%llx"
,
(
long
long
)
h
,
(
long
long
)
hprefix
);
goto
fail
;
}
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
)))
goto
fail
;
goto
check
;
}
if
(
is_subhash
(
group
[
b
]))
{
if
(
is_subhash
(
group
[
b
]))
{
uint64_t
subprefix
;
uint64_t
subprefix
;
subprefix
=
(
hprefix
subprefix
=
(
hprefix
...
@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
hprefix_bits
hprefix_bits
+
group_bits
+
group_bits
+
TDB_HASH_GROUP_BITS
,
+
TDB_HASH_GROUP_BITS
,
used
,
num_used
,
num_found
))
used
,
num_used
,
num_found
,
check
,
private_data
))
goto
fail
;
goto
fail
;
continue
;
continue
;
}
}
...
@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
used_bits
=
0
;
used_bits
=
0
;
if
(
get_bits
(
h
,
hprefix_bits
,
&
used_bits
)
!=
hprefix
if
(
get_bits
(
h
,
hprefix_bits
,
&
used_bits
)
!=
hprefix
&&
hprefix_bits
)
{
&&
hprefix_bits
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"check: bad hash placement"
TDB_DEBUG_ERROR
,
" 0x%llx vs 0x%llx
\n
"
,
"check: bad hash placement"
" 0x%llx vs 0x%llx"
,
(
long
long
)
h
,
(
long
long
)
hprefix
);
(
long
long
)
h
,
(
long
long
)
hprefix
);
goto
fail
;
goto
fail
;
}
}
/* Does it belong in this group? */
/* Does it belong in this group? */
if
(
get_bits
(
h
,
group_bits
,
&
used_bits
)
!=
g
)
{
if
(
get_bits
(
h
,
group_bits
,
&
used_bits
)
!=
g
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"check: bad group %llu vs %u
\n
"
,
TDB_DEBUG_ERROR
,
(
long
long
)
h
,
g
);
"check: bad group %llu vs %u"
,
(
long
long
)
h
,
g
);
goto
fail
;
goto
fail
;
}
}
...
@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
if
(
get_bits
(
h
,
TDB_HASH_GROUP_BITS
,
&
used_bits
)
if
(
get_bits
(
h
,
TDB_HASH_GROUP_BITS
,
&
used_bits
)
!=
bucket
)
{
!=
bucket
)
{
used_bits
-=
TDB_HASH_GROUP_BITS
;
used_bits
-=
TDB_HASH_GROUP_BITS
;
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"check: bad bucket %u vs %u
\n
"
,
TDB_DEBUG_ERROR
,
"check: bad bucket %u vs %u"
,
(
unsigned
)
get_bits
(
h
,
(
unsigned
)
get_bits
(
h
,
TDB_HASH_GROUP_BITS
,
TDB_HASH_GROUP_BITS
,
&
used_bits
),
&
used_bits
),
bucket
);
bucket
);
goto
fail
;
goto
fail
;
}
}
...
@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
...
@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
i
!=
b
;
i
!=
b
;
i
=
(
i
+
1
)
%
(
1
<<
TDB_HASH_GROUP_BITS
))
{
i
=
(
i
+
1
)
%
(
1
<<
TDB_HASH_GROUP_BITS
))
{
if
(
group
[
i
]
==
0
)
{
if
(
group
[
i
]
==
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
tdb
->
log_priv
,
TDB_DEBUG_ERROR
,
"check: bad group placement"
"check: bad group placement"
" %u vs %u
\n
"
,
" %u vs %u
"
,
b
,
bucket
);
b
,
bucket
);
goto
fail
;
goto
fail
;
}
}
}
}
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
)))
goto
fail
;
goto
fail
;
/* Bottom bits must match header. */
/* Bottom bits must match header. */
if
((
h
&
((
1
<<
11
)
-
1
))
!=
rec_hash
(
&
rec
))
{
if
((
h
&
((
1
<<
11
)
-
1
))
!=
rec_hash
(
&
rec
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: Bad hash magic at"
TDB_DEBUG_ERROR
,
" offset %llu (0x%llx vs 0x%llx)
\n
"
,
"tdb_check: Bad hash magic at"
(
long
long
)
off
,
" offset %llu (0x%llx vs 0x%llx)"
,
(
long
long
)
h
,
(
long
long
)
off
,
(
long
long
)
rec_hash
(
&
rec
));
(
long
long
)
h
,
(
long
long
)
rec_hash
(
&
rec
));
goto
fail
;
goto
fail
;
}
}
check:
if
(
check
)
{
TDB_DATA
key
,
data
;
key
.
dsize
=
rec_key_length
(
&
rec
);
data
.
dsize
=
rec_data_length
(
&
rec
);
key
.
dptr
=
(
void
*
)
tdb_access_read
(
tdb
,
off
+
sizeof
(
rec
),
key
.
dsize
+
data
.
dsize
,
false
);
if
(
!
key
.
dptr
)
goto
fail
;
data
.
dptr
=
key
.
dptr
+
key
.
dsize
;
if
(
check
(
key
,
data
,
private_data
)
!=
0
)
goto
fail
;
tdb_access_release
(
tdb
,
key
.
dptr
);
}
}
}
}
}
tdb_access_release
(
tdb
,
hash
);
tdb_access_release
(
tdb
,
hash
);
...
@@ -268,19 +389,22 @@ fail:
...
@@ -268,19 +389,22 @@ fail:
static
bool
check_hash
(
struct
tdb_context
*
tdb
,
static
bool
check_hash
(
struct
tdb_context
*
tdb
,
tdb_off_t
used
[],
tdb_off_t
used
[],
size_t
num_used
,
size_t
num_flists
)
size_t
num_used
,
size_t
num_ftables
,
int
(
*
check
)(
TDB_DATA
,
TDB_DATA
,
void
*
),
void
*
private_data
)
{
{
/* Free
list
s also show up as used. */
/* Free
table
s also show up as used. */
size_t
num_found
=
num_f
list
s
;
size_t
num_found
=
num_f
table
s
;
if
(
!
check_hash_tree
(
tdb
,
offsetof
(
struct
tdb_header
,
hashtable
),
if
(
!
check_hash_tree
(
tdb
,
offsetof
(
struct
tdb_header
,
hashtable
),
TDB_TOPLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
,
TDB_TOPLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
,
0
,
0
,
used
,
num_used
,
&
num_found
))
0
,
0
,
used
,
num_used
,
&
num_found
,
check
,
private_data
))
return
false
;
return
false
;
if
(
num_found
!=
num_used
)
{
if
(
num_found
!=
num_used
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Not all entries are in hash
\n
"
);
"tdb_check: Not all entries are in hash
"
);
return
false
;
return
false
;
}
}
return
true
;
return
true
;
...
@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
...
@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
static
bool
check_free
(
struct
tdb_context
*
tdb
,
static
bool
check_free
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
off
,
const
struct
tdb_free_record
*
frec
,
const
struct
tdb_free_record
*
frec
,
tdb_off_t
prev
,
tdb_off_t
flist_off
,
unsigned
int
bucket
)
tdb_off_t
prev
,
unsigned
int
ftable
,
unsigned
int
bucket
)
{
{
if
(
frec_magic
(
frec
)
!=
TDB_FREE_MAGIC
)
{
if
(
frec_magic
(
frec
)
!=
TDB_FREE_MAGIC
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: offset %llu bad magic 0x%llx
\n
"
,
"tdb_check: offset %llu bad magic 0x%llx
"
,
(
long
long
)
off
,
(
long
long
)
frec
->
magic_and_meta
);
(
long
long
)
off
,
(
long
long
)
frec
->
magic_and_prev
);
return
false
;
return
false
;
}
}
if
(
frec_f
list
(
frec
)
!=
flist_off
)
{
if
(
frec_f
table
(
frec
)
!=
ftable
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: offset %llu bad freelist 0x%llx
\n
"
,
"tdb_check: offset %llu bad freetable %u
"
,
(
long
long
)
off
,
(
long
long
)
frec_flist
(
frec
));
(
long
long
)
off
,
frec_ftable
(
frec
));
return
false
;
return
false
;
}
}
if
(
tdb
->
methods
->
oob
(
tdb
,
off
if
(
tdb
->
methods
->
oob
(
tdb
,
off
+
frec
->
data_len
+
sizeof
(
struct
tdb_used_record
),
+
frec
_len
(
frec
)
+
sizeof
(
struct
tdb_used_record
),
false
))
false
))
return
false
;
return
false
;
if
(
size_to_bucket
(
frec
->
data_len
)
!=
bucket
)
{
if
(
size_to_bucket
(
frec
_len
(
frec
)
)
!=
bucket
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: offset %llu in wrong bucket %u vs %u
\n
"
,
"tdb_check: offset %llu in wrong bucket %u vs %u
"
,
(
long
long
)
off
,
(
long
long
)
off
,
bucket
,
size_to_bucket
(
frec
->
data_len
));
bucket
,
size_to_bucket
(
frec_len
(
frec
)
));
return
false
;
return
false
;
}
}
if
(
prev
!=
frec
->
prev
)
{
if
(
prev
!=
frec
_prev
(
frec
)
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: offset %llu bad prev %llu vs %llu
\n
"
,
"tdb_check: offset %llu bad prev %llu vs %llu
"
,
(
long
long
)
off
,
(
long
long
)
off
,
(
long
long
)
prev
,
(
long
long
)
frec
->
prev
);
(
long
long
)
prev
,
(
long
long
)
frec_len
(
frec
)
);
return
false
;
return
false
;
}
}
return
true
;
return
true
;
}
}
static
bool
check_free_list
(
struct
tdb_context
*
tdb
,
static
bool
check_free_table
(
struct
tdb_context
*
tdb
,
tdb_off_t
flist_off
,
tdb_off_t
ftable_off
,
tdb_off_t
free
[],
unsigned
ftable_num
,
size_t
num_free
,
tdb_off_t
free
[],
size_t
*
num_found
)
size_t
num_free
,
size_t
*
num_found
)
{
{
struct
tdb_free
list
flis
t
;
struct
tdb_free
table
f
t
;
tdb_off_t
h
;
tdb_off_t
h
;
unsigned
int
i
;
unsigned
int
i
;
if
(
tdb_read_convert
(
tdb
,
f
list_off
,
&
flist
,
sizeof
(
flis
t
))
==
-
1
)
if
(
tdb_read_convert
(
tdb
,
f
table_off
,
&
ft
,
sizeof
(
f
t
))
==
-
1
)
return
false
;
return
false
;
if
(
rec_magic
(
&
flist
.
hdr
)
!=
TDB_MAGIC
if
(
rec_magic
(
&
ft
.
hdr
)
!=
TDB_FTABLE_MAGIC
||
rec_key_length
(
&
flist
.
hdr
)
!=
0
||
rec_key_length
(
&
ft
.
hdr
)
!=
0
||
rec_data_length
(
&
flist
.
hdr
)
!=
sizeof
(
flist
)
-
sizeof
(
flist
.
hdr
)
||
rec_data_length
(
&
ft
.
hdr
)
!=
sizeof
(
ft
)
-
sizeof
(
ft
.
hdr
)
||
rec_hash
(
&
flist
.
hdr
)
!=
1
)
{
||
rec_hash
(
&
ft
.
hdr
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_check: Invalid header on free table"
);
"tdb_check: Invalid header on free list
\n
"
);
return
false
;
return
false
;
}
}
...
@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
...
@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
tdb_off_t
off
,
prev
=
0
,
*
p
;
tdb_off_t
off
,
prev
=
0
,
*
p
;
struct
tdb_free_record
f
;
struct
tdb_free_record
f
;
h
=
bucket_off
(
f
list
_off
,
i
);
h
=
bucket_off
(
f
table
_off
,
i
);
for
(
off
=
tdb_read_off
(
tdb
,
h
);
off
;
off
=
f
.
next
)
{
for
(
off
=
tdb_read_off
(
tdb
,
h
);
off
;
off
=
f
.
next
)
{
if
(
off
==
TDB_OFF_ERR
)
if
(
off
==
TDB_OFF_ERR
)
return
false
;
return
false
;
if
(
tdb_read_convert
(
tdb
,
off
,
&
f
,
sizeof
(
f
)))
if
(
tdb_read_convert
(
tdb
,
off
,
&
f
,
sizeof
(
f
)))
return
false
;
return
false
;
if
(
!
check_free
(
tdb
,
off
,
&
f
,
prev
,
f
list_off
,
i
))
if
(
!
check_free
(
tdb
,
off
,
&
f
,
prev
,
f
table_num
,
i
))
return
false
;
return
false
;
/* FIXME: Check hash bits */
/* FIXME: Check hash bits */
p
=
asearch
(
&
off
,
free
,
num_free
,
off_cmp
);
p
=
asearch
(
&
off
,
free
,
num_free
,
off_cmp
);
if
(
!
p
)
{
if
(
!
p
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
tdb
->
log_priv
,
TDB_DEBUG_ERROR
,
"tdb_check: Invalid offset"
"tdb_check: Invalid offset"
" %llu in free table
\n
"
,
" %llu in free table
"
,
(
long
long
)
off
);
(
long
long
)
off
);
return
false
;
return
false
;
}
}
/* Mark it invalid. */
/* Mark it invalid. */
...
@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
...
@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
}
}
/* Slow, but should be very rare. */
/* Slow, but should be very rare. */
s
tatic
s
ize_t
dead_space
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
size_t
dead_space
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
{
{
size_t
len
;
size_t
len
;
...
@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
...
@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
struct
tdb_used_record
u
;
struct
tdb_used_record
u
;
struct
tdb_free_record
f
;
struct
tdb_free_record
f
;
struct
tdb_recovery_record
r
;
struct
tdb_recovery_record
r
;
}
pad
,
*
p
;
}
rec
;
p
=
tdb_get
(
tdb
,
off
,
&
pad
,
sizeof
(
pad
));
/* r is larger: only get that if we need to. */
if
(
!
p
)
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
.
f
))
==
-
1
)
return
false
;
return
false
;
/* If we crash after ftruncate, we can get zeroes or fill. */
/* If we crash after ftruncate, we can get zeroes or fill. */
if
(
p
->
r
.
magic
==
TDB_RECOVERY_INVALID_MAGIC
if
(
rec
.
r
.
magic
==
TDB_RECOVERY_INVALID_MAGIC
||
p
->
r
.
magic
==
0x4343434343434343ULL
)
{
||
rec
.
r
.
magic
==
0x4343434343434343ULL
)
{
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
.
r
)))
return
false
;
if
(
recovery
==
off
)
{
if
(
recovery
==
off
)
{
found_recovery
=
true
;
found_recovery
=
true
;
len
=
sizeof
(
p
->
r
)
+
p
->
r
.
max_len
;
len
=
sizeof
(
rec
.
r
)
+
rec
.
r
.
max_len
;
}
else
{
}
else
{
len
=
dead_space
(
tdb
,
off
);
len
=
dead_space
(
tdb
,
off
);
if
(
len
<
sizeof
(
p
->
r
))
{
if
(
len
<
sizeof
(
rec
.
r
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
tdb
->
log_priv
,
TDB_DEBUG_ERROR
,
"tdb_check: invalid dead space"
"tdb_check: invalid dead"
" at %zu
\n
"
,
(
size_t
)
off
);
" space at %zu"
,
(
size_t
)
off
);
return
false
;
return
false
;
}
}
tdb
->
log
(
tdb
,
TDB_DEBUG_WARNING
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_WARNING
,
"Dead space at %zu-%zu (of %zu)
\n
"
,
"Dead space at %zu-%zu (of %zu)
"
,
(
size_t
)
off
,
(
size_t
)(
off
+
len
),
(
size_t
)
off
,
(
size_t
)(
off
+
len
),
(
size_t
)
tdb
->
map_size
);
(
size_t
)
tdb
->
map_size
);
}
}
}
else
if
(
p
->
r
.
magic
==
TDB_RECOVERY_MAGIC
)
{
}
else
if
(
rec
.
r
.
magic
==
TDB_RECOVERY_MAGIC
)
{
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
.
r
)))
return
false
;
if
(
recovery
!=
off
)
{
if
(
recovery
!=
off
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: unexpected recovery"
TDB_DEBUG_ERROR
,
" record at offset %zu
\n
"
,
"tdb_check: unexpected recovery"
(
size_t
)
off
);
" record at offset %zu"
,
(
size_t
)
off
);
return
false
;
}
if
(
rec
.
r
.
len
>
rec
.
r
.
max_len
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: invalid recovery length"
" %zu"
,
(
size_t
)
rec
.
r
.
len
);
return
false
;
}
if
(
rec
.
r
.
eof
>
tdb
->
map_size
)
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: invalid old EOF"
" %zu"
,
(
size_t
)
rec
.
r
.
eof
);
return
false
;
return
false
;
}
}
found_recovery
=
true
;
found_recovery
=
true
;
len
=
sizeof
(
p
->
r
)
+
p
->
r
.
max_len
;
len
=
sizeof
(
rec
.
r
)
+
rec
.
r
.
max_len
;
}
else
if
(
frec_magic
(
&
p
->
f
)
==
TDB_FREE_MAGIC
}
else
if
(
frec_magic
(
&
rec
.
f
)
==
TDB_FREE_MAGIC
)
{
||
frec_magic
(
&
p
->
f
)
==
TDB_COALESCING_MAGIC
)
{
len
=
sizeof
(
rec
.
u
)
+
frec_len
(
&
rec
.
f
);
len
=
sizeof
(
p
->
u
)
+
p
->
f
.
data_len
;
if
(
off
+
len
>
tdb
->
map_size
)
{
if
(
off
+
len
>
tdb
->
map_size
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: free overlength %llu"
TDB_DEBUG_ERROR
,
" at offset %llu
\n
"
,
"tdb_check: free overlength %llu"
(
long
long
)
len
,
(
long
long
)
off
);
" at offset %llu"
,
(
long
long
)
len
,
(
long
long
)
off
);
return
false
;
return
false
;
}
}
/* This record
is free!
*/
/* This record
should be in free lists.
*/
if
(
frec_
magic
(
&
p
->
f
)
==
TDB_FREE_MAGIC
if
(
frec_
ftable
(
&
rec
.
f
)
!=
TDB_FTABLE_NONE
&&
!
append
(
free
,
num_free
,
off
))
&&
!
append
(
free
,
num_free
,
off
))
return
false
;
return
false
;
}
else
{
}
else
if
(
rec_magic
(
&
rec
.
u
)
==
TDB_USED_MAGIC
||
rec_magic
(
&
rec
.
u
)
==
TDB_CHAIN_MAGIC
||
rec_magic
(
&
rec
.
u
)
==
TDB_HTABLE_MAGIC
||
rec_magic
(
&
rec
.
u
)
==
TDB_FTABLE_MAGIC
)
{
uint64_t
klen
,
dlen
,
extra
;
uint64_t
klen
,
dlen
,
extra
;
/* This record is used! */
/* This record is used! */
if
(
rec_magic
(
&
p
->
u
)
!=
TDB_MAGIC
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_check: Bad magic 0x%llx"
" at offset %llu
\n
"
,
(
long
long
)
rec_magic
(
&
p
->
u
),
(
long
long
)
off
);
return
false
;
}
if
(
!
append
(
used
,
num_used
,
off
))
if
(
!
append
(
used
,
num_used
,
off
))
return
false
;
return
false
;
klen
=
rec_key_length
(
&
p
->
u
);
klen
=
rec_key_length
(
&
rec
.
u
);
dlen
=
rec_data_length
(
&
p
->
u
);
dlen
=
rec_data_length
(
&
rec
.
u
);
extra
=
rec_extra_padding
(
&
p
->
u
);
extra
=
rec_extra_padding
(
&
rec
.
u
);
len
=
sizeof
(
p
->
u
)
+
klen
+
dlen
+
extra
;
len
=
sizeof
(
rec
.
u
)
+
klen
+
dlen
+
extra
;
if
(
off
+
len
>
tdb
->
map_size
)
{
if
(
off
+
len
>
tdb
->
map_size
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: used overlength %llu"
TDB_DEBUG_ERROR
,
" at offset %llu
\n
"
,
"tdb_check: used overlength %llu"
(
long
long
)
len
,
(
long
long
)
off
);
" at offset %llu"
,
(
long
long
)
len
,
(
long
long
)
off
);
return
false
;
return
false
;
}
}
if
(
len
<
sizeof
(
p
->
f
))
{
if
(
len
<
sizeof
(
rec
.
f
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"tdb_check: too short record %llu at"
TDB_DEBUG_ERROR
,
" %llu
\n
"
,
"tdb_check: too short record %llu"
(
long
long
)
len
,
(
long
long
)
off
);
" at %llu"
,
(
long
long
)
len
,
(
long
long
)
off
);
return
false
;
return
false
;
}
}
}
else
{
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Bad magic 0x%llx at offset %zu"
,
(
long
long
)
rec_magic
(
&
rec
.
u
),
(
size_t
)
off
);
return
false
;
}
}
}
}
/* We must have found recovery area if there was one. */
/* We must have found recovery area if there was one. */
if
(
recovery
!=
0
&&
!
found_recovery
)
{
if
(
recovery
!=
0
&&
!
found_recovery
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: expected a recovery area at %zu
\n
"
,
"tdb_check: expected a recovery area at %zu
"
,
(
size_t
)
recovery
);
(
size_t
)
recovery
);
return
false
;
return
false
;
}
}
return
true
;
return
true
;
}
}
/* FIXME: call check() function. */
int
tdb_check
(
struct
tdb_context
*
tdb
,
int
tdb_check
(
struct
tdb_context
*
tdb
,
int
(
*
check
)(
TDB_DATA
key
,
TDB_DATA
data
,
void
*
private_data
),
int
(
*
check
)(
TDB_DATA
key
,
TDB_DATA
data
,
void
*
private_data
),
void
*
private_data
)
void
*
private_data
)
{
{
tdb_off_t
*
free
=
NULL
,
*
used
=
NULL
,
f
lis
t
,
recovery
;
tdb_off_t
*
free
=
NULL
,
*
used
=
NULL
,
ft
,
recovery
;
size_t
num_free
=
0
,
num_used
=
0
,
num_found
=
0
,
num_f
list
s
=
0
;
size_t
num_free
=
0
,
num_used
=
0
,
num_found
=
0
,
num_f
table
s
=
0
;
if
(
tdb_allrecord_lock
(
tdb
,
F_RDLCK
,
TDB_LOCK_WAIT
,
false
)
!=
0
)
if
(
tdb_allrecord_lock
(
tdb
,
F_RDLCK
,
TDB_LOCK_WAIT
,
false
)
!=
0
)
return
-
1
;
return
-
1
;
...
@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
...
@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
if
(
!
check_linear
(
tdb
,
&
used
,
&
num_used
,
&
free
,
&
num_free
,
recovery
))
if
(
!
check_linear
(
tdb
,
&
used
,
&
num_used
,
&
free
,
&
num_free
,
recovery
))
goto
fail
;
goto
fail
;
for
(
f
list
=
first_flist
(
tdb
);
flist
;
flist
=
next_flist
(
tdb
,
flis
t
))
{
for
(
f
t
=
first_ftable
(
tdb
);
ft
;
ft
=
next_ftable
(
tdb
,
f
t
))
{
if
(
f
lis
t
==
TDB_OFF_ERR
)
if
(
ft
==
TDB_OFF_ERR
)
goto
fail
;
goto
fail
;
if
(
!
check_free_list
(
tdb
,
flist
,
free
,
num_free
,
&
num_found
))
if
(
!
check_free_table
(
tdb
,
ft
,
num_ftables
,
free
,
num_free
,
&
num_found
))
goto
fail
;
goto
fail
;
num_f
list
s
++
;
num_f
table
s
++
;
}
}
/* FIXME: Check key uniqueness? */
/* FIXME: Check key uniqueness? */
if
(
!
check_hash
(
tdb
,
used
,
num_used
,
num_f
lists
))
if
(
!
check_hash
(
tdb
,
used
,
num_used
,
num_f
tables
,
check
,
private_data
))
goto
fail
;
goto
fail
;
if
(
num_found
!=
num_free
)
{
if
(
num_found
!=
num_free
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_ERROR
,
"tdb_check: Not all entries are in free table
\n
"
);
"tdb_check: Not all entries are in free table
"
);
return
false
;
return
-
1
;
}
}
tdb_allrecord_unlock
(
tdb
,
F_RDLCK
);
tdb_allrecord_unlock
(
tdb
,
F_RDLCK
);
...
...
ccan/tdb2/doc/design.lyx
View file @
51a56b52
#LyX 1.6.
5
created this file. For more info see http://www.lyx.org/
#LyX 1.6.
7
created this file. For more info see http://www.lyx.org/
\lyxformat 345
\lyxformat 345
\begin_document
\begin_document
\begin_header
\begin_header
...
@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation
...
@@ -50,13 +50,7 @@ Rusty Russell, IBM Corporation
\end_layout
\end_layout
\begin_layout Date
\begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout
\end_layout
\begin_layout Abstract
\begin_layout Abstract
...
@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
...
@@ -476,8 +470,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
name "attributes"
name "attributes"
...
@@ -485,8 +477,6 @@ name "attributes"
...
@@ -485,8 +477,6 @@ name "attributes"
\end_inset
\end_inset
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size
...
@@ -590,6 +580,14 @@ This allows future attributes to be added, even if this expands the size
of the union.
of the union.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_traverse Makes Impossible Guarantees
tdb_traverse Makes Impossible Guarantees
\end_layout
\end_layout
...
@@ -631,6 +629,16 @@ Abandon the guarantee.
...
@@ -631,6 +629,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API.
You can prevent changes by using a transaction or the locking API.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Nesting of Transactions Is Fraught
Nesting of Transactions Is Fraught
\end_layout
\end_layout
...
@@ -685,6 +693,14 @@ least-surprise
...
@@ -685,6 +693,14 @@ least-surprise
-obscure case.
-obscure case.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Incorrect Hash Function is Not Detected
Incorrect Hash Function is Not Detected
\end_layout
\end_layout
...
@@ -706,6 +722,14 @@ The header should contain an example hash result (eg.
...
@@ -706,6 +722,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call.
hash function produces the same answer, or fail the tdb_open call.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation
tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout
\end_layout
...
@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed.
...
@@ -750,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op.
tuning, but initially will become a no-op.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
...
@@ -802,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API.
an API.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB API Is Not POSIX Thread-safe
TDB API Is Not POSIX Thread-safe
\end_layout
\end_layout
...
@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
...
@@ -846,8 +888,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard
\begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist.
version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for
Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook"
...
@@ -856,8 +896,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset
\end_inset
could be used to enable pthread locking at runtime.
could be used to enable pthread locking at runtime.
\
change_unchanged
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when
...
@@ -975,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed.
It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_chainlock Functions Expose Implementation
tdb_chainlock Functions Expose Implementation
\end_layout
\end_layout
...
@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having
...
@@ -1056,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals
The API Uses Gratuitous Typedefs, Capitals
\end_layout
\end_layout
...
@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break
...
@@ -1132,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI.
the API/ABI.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Various Callback Functions Are Not Typesafe
Various Callback Functions Are Not Typesafe
\end_layout
\end_layout
...
@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give
...
@@ -1171,6 +1241,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout
\end_layout
...
@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
...
@@ -1206,19 +1284,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset
\end_inset
.
.
\
change_inserted 0 1284015637
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Standard
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\
change_inserted 0 1284015716
\
begin_layout Subsection
Extending The Header Is Difficult
Extending The Header Is Difficult
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for
We have reserved (zeroed) words in the TDB header, which can be used for
future features.
future features.
If the future features are compulsory, the version number must be updated
If the future features are compulsory, the version number must be updated
...
@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
...
@@ -1228,14 +1308,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016114
The header should contain a
The header should contain a
\begin_inset Quotes eld
\begin_inset Quotes eld
\end_inset
\end_inset
...
@@ -1249,58 +1325,48 @@ format variant
...
@@ -1249,58 +1325,48 @@ format variant
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing
The lower part reflects the format variant understood by code accessing
the database.
the database.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write
The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading).
to the database (otherwise you can only open for reading).
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should
The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if
be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant.
the variant of the code is lower than the current lowest variant.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection
This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database.
if older code (which doesn't understand the feature) writes to the database.
\
change_deleted 0 1284016101
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Standard
Incomplete.
\end_layout
\
change_inserted 0 1284015634
\
begin_layout Subsection
Record Headers Are Not Expandible
Record Headers Are Not Expandible
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require
If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid.
another format change, which we'd like to avoid.
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record.
We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will
If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a
have a way for future changes to detect code which doesn't understand a
...
@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record.
...
@@ -1309,28 +1375,28 @@ We often have extra padding at the tail of a record.
not present on that record.
not present on that record.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\
change_inserted 0 1284422568
\
begin_layout Subsection
TDB Does Not Use Talloc
TDB Does Not Use Talloc
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus
Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently.
have to wrap TDB in a talloc context to use it conveniently.
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of
The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB
talloc, and I am reluctant to force another (excellent) library on TDB
users.
users.
...
@@ -1356,15 +1422,19 @@ context
...
@@ -1356,15 +1422,19 @@ context
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc heirarchy as expected, but the caller would still
This would form a talloc heirarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to
have to attach a destructor to the tdb context returned from tdb_open to
close it.
close it.
All TDB_DATA fields would be children of the tdb_context, and the caller
All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()).
would still have to manage them (using talloc_free() or talloc_steal()).
\
change_unchanged
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\begin_layout Section
\begin_layout Section
...
@@ -1422,6 +1492,14 @@ Remove the flag.
...
@@ -1422,6 +1492,14 @@ Remove the flag.
point.
point.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB Files Have a 4G Limit
TDB Files Have a 4G Limit
\end_layout
\end_layout
...
@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
...
@@ -1469,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!)
be erased and initialized as a fresh tdb!)
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB Records Have a 4G Limit
TDB Records Have a 4G Limit
\end_layout
\end_layout
...
@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A"
...
@@ -1498,6 +1584,14 @@ reference "sub:Records-Incur-A"
).
).
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Hash Size Is Determined At TDB Creation Time
Hash Size Is Determined At TDB Creation Time
\end_layout
\end_layout
...
@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified
...
@@ -1512,16 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
name "sub:Hash-Size-Solution"
name "sub:Hash-Size-Solution"
\end_inset
\end_inset
\change_unchanged
Proposed Solution
Proposed Solution
\end_layout
\end_layout
...
@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
...
@@ -1540,58 +1630,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which
, it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation.
doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications
Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and
: the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks.
others might be holding locks.
...
@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert
...
@@ -1599,8 +1637,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with
Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills,
each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree.
turning into a hash tree.
...
@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
...
@@ -1609,8 +1645,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more
Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions.
hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits.
We can use the otherwise-unused lower 3 bits.
...
@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more
...
@@ -1621,8 +1655,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid.
bits are valid.
This means we can choose not to re-hash all entries when we expand a hash
This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid.
group; simply use the next bits we need and mark them invalid.
\
change_unchanged
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues
...
@@ -1749,8 +1789,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed Solution
\change_deleted 0 1283336858
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention.
...
@@ -1765,20 +1803,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the
This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries
hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets.
we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for
It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio
records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes
n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address.
for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable.
Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented"
...
@@ -1792,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where
) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size.
all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious
Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or
set
of free
is to divide the file into zones, and using a free list (or
table
of free
lists) for each.
lists) for each.
This approximates address ordering.
This approximates address ordering.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
Note that this means we need to split the free lists when we expand the
Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since
determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already.
create a
In the case of increasing the file size, there is an optimization we can
\begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the
\end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary,
recovery area
\emph on
\begin_inset Quotes erd
and
\end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive
by simply appending to the file (difficult if it would need to create a
pair of free lists.
new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1818,7 +1854,7 @@ The basic algorithm is as follows.
...
@@ -1818,7 +1854,7 @@ The basic algorithm is as follows.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Identify the correct
zone
.
Identify the correct
free list
.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1826,12 +1862,12 @@ Lock the corresponding list.
...
@@ -1826,12 +1862,12 @@ Lock the corresponding list.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Re-check the
zone
(we didn't have a lock, sizes could have changed): relock
Re-check the
list
(we didn't have a lock, sizes could have changed): relock
if necessary.
if necessary.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Place the freed entry in the list
for that zone
.
Place the freed entry in the list.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
...
@@ -1840,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a
Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1856,16 +1884,16 @@ Lock the corresponding list.
...
@@ -1856,16 +1884,16 @@ Lock the corresponding list.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Re-check the zone: relock if necessary
.
If the top entry is -large enough, remove it from the list and return it
.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it.
Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough,
If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A"
...
@@ -1897,73 +1925,8 @@ reference "sub:Records-Incur-A"
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
I anticipate that the number of entries in each free zone would be small,
Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others
It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2165,8 +2128,6 @@ miss
...
@@ -2165,8 +2128,6 @@ miss
it reduces 99.9% of false memcmp).
it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table
As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here.
resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra
Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see
bits in the hash table itself (see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
...
@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution"
...
@@ -2176,8 +2137,6 @@ reference "sub:Hash-Size-Solution"
\end_inset
\end_inset
).
).
\change_unchanged
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -2214,11 +2173,11 @@ struct tdb_used_record {
...
@@ -2214,11 +2173,11 @@ struct tdb_used_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint32_t magic : 16,
uint32_t
used_
magic : 16,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
prev_is_free: 1,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2226,7 +2185,7 @@ struct tdb_used_record {
...
@@ -2226,7 +2185,7 @@ struct tdb_used_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
top_hash: 1
0
;
top_hash: 1
1
;
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2250,29 +2209,27 @@ struct tdb_free_record {
...
@@ -2250,29 +2209,27 @@ struct tdb_free_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint
32_t free_magic;
uint
64_t free_magic: 8,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint64_t total_length;
prev : 56;
\change_inserted 0 1283337133
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
\change_inserted 0 1283337139
\end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
total_length : 56
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint64_t
tailer
;
uint64_t
next;
;
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2281,20 +2238,19 @@ struct tdb_free_record {
...
@@ -2281,20 +2238,19 @@ struct tdb_free_record {
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283337235
\change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged
\change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-byte words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
...
@@ -2387,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set.
a transaction in progress; we need only check for recovery if this is set.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support
...
@@ -2398,13 +2362,7 @@ TDB Does Not Have Snapshot Support
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed SolutionNone.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say
At some point you say
\begin_inset Quotes eld
\begin_inset Quotes eld
\end_inset
\end_inset
...
@@ -2413,13 +2371,6 @@ use a real database
...
@@ -2413,13 +2371,6 @@ use a real database
\begin_inset Quotes erd
\begin_inset Quotes erd
\end_inset
\end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see
(but see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -2428,8 +2379,6 @@ reference "replay-attribute"
...
@@ -2428,8 +2379,6 @@ reference "replay-attribute"
\end_inset
\end_inset
).
).
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
...
@@ -2452,8 +2401,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard
\begin_layout Standard
We could then implement snapshots using a similar method, using multiple
We could then implement snapshots using a similar method, using multiple
different hash tables/free tables.
different hash tables/free tables.
\
change_inserted 0 1284423495
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2473,8 +2428,6 @@ Proposed Solution
...
@@ -2473,8 +2428,6 @@ Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284424201
None (but see
None (but see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -2483,15 +2436,21 @@ reference "replay-attribute"
...
@@ -2483,15 +2436,21 @@ reference "replay-attribute"
\end_inset
\end_inset
).
).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit
These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done.
until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit.
commit.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Default Hash Function Is Suboptimal
Default Hash Function Is Suboptimal
\end_layout
\end_layout
...
@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source,
...
@@ -2532,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing.
hash bombing.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution"
...
@@ -2569,6 +2536,14 @@ reference "traverse-Proposed-Solution"
.
.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Fcntl Locking Adds Overhead
Fcntl Locking Adds Overhead
\end_layout
\end_layout
...
@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the
...
@@ -2670,19 +2645,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled).
free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the
On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum.
transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External
Tracing Is Fragile, Replay Is External
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks
The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default.
as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay
In a similar way, the ctdb code has an external wrapper which does replay
...
@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
...
@@ -2690,8 +2659,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution
Proposed Solution
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2703,8 +2670,6 @@ name "replay-attribute"
...
@@ -2703,8 +2670,6 @@ name "replay-attribute"
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see
Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
...
@@ -2715,8 +2680,14 @@ reference "attributes"
...
@@ -2715,8 +2680,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and
) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support.
future parallel transactions and snapshot support.
\
change_unchanged
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\end_body
\end_body
...
...
ccan/tdb2/doc/design.lyx,v
View file @
51a56b52
head 1.1
0
;
head 1.1
3
;
access;
access;
symbols;
symbols;
locks; strict;
locks; strict;
comment @# @;
comment @# @;
1.13
date 2010.12.01.12.22.08; author rusty; state Exp;
branches;
next 1.12;
1.12
date 2010.12.01.12.20.49; author rusty; state Exp;
branches;
next 1.11;
1.11
date 2010.12.01.11.55.20; author rusty; state Exp;
branches;
next 1.10;
1.10
1.10
date 2010.09.14.00.33.57; author rusty; state Exp;
date 2010.09.14.00.33.57; author rusty; state Exp;
branches;
branches;
...
@@ -61,12 +76,12 @@ desc
...
@@ -61,12 +76,12 @@ desc
@
@
1.1
0
1.1
3
log
log
@
Tracing attribute, talloc support
.
@
Merged changes
.
@
@
text
text
@#LyX 1.6.
5
created this file. For more info see http://www.lyx.org/
@#LyX 1.6.
7
created this file. For more info see http://www.lyx.org/
\lyxformat 345
\lyxformat 345
\begin_document
\begin_document
\begin_header
\begin_header
...
@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation
...
@@ -118,13 +133,7 @@ Rusty Russell, IBM Corporation
\end_layout
\end_layout
\begin_layout Date
\begin_layout Date
1-December-2010
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
\end_layout
\end_layout
\begin_layout Abstract
\begin_layout Abstract
...
@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
...
@@ -544,8 +553,6 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed Solution
\change_inserted 0 1284422789
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
name "attributes"
name "attributes"
...
@@ -553,8 +560,6 @@ name "attributes"
...
@@ -553,8 +560,6 @@ name "attributes"
\end_inset
\end_inset
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size
...
@@ -658,6 +663,14 @@ This allows future attributes to be added, even if this expands the size
of the union.
of the union.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_traverse Makes Impossible Guarantees
tdb_traverse Makes Impossible Guarantees
\end_layout
\end_layout
...
@@ -699,6 +712,16 @@ Abandon the guarantee.
...
@@ -699,6 +712,16 @@ Abandon the guarantee.
You can prevent changes by using a transaction or the locking API.
You can prevent changes by using a transaction or the locking API.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
Delete-during-traverse will still delete every record, too (assuming no
other changes).
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Nesting of Transactions Is Fraught
Nesting of Transactions Is Fraught
\end_layout
\end_layout
...
@@ -753,6 +776,14 @@ least-surprise
...
@@ -753,6 +776,14 @@ least-surprise
-obscure case.
-obscure case.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; nesting flag is still defined as per tdb1.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Incorrect Hash Function is Not Detected
Incorrect Hash Function is Not Detected
\end_layout
\end_layout
...
@@ -774,6 +805,14 @@ The header should contain an example hash result (eg.
...
@@ -774,6 +805,14 @@ The header should contain an example hash result (eg.
hash function produces the same answer, or fail the tdb_open call.
hash function produces the same answer, or fail the tdb_open call.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation
tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout
\end_layout
...
@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed.
...
@@ -818,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed.
tuning, but initially will become a no-op.
tuning, but initially will become a no-op.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
TDB_VOLATILE still defined, but implementation should fail on unknown flags
to be future-proof.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
...
@@ -870,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether
an API.
an API.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB API Is Not POSIX Thread-safe
TDB API Is Not POSIX Thread-safe
\end_layout
\end_layout
...
@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
...
@@ -914,8 +971,6 @@ Internal locking is required to make sure that fcntl locks do not overlap
\begin_layout Standard
\begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist.
version of the library, and otherwise no overhead will exist.
\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for
Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook"
...
@@ -924,8 +979,14 @@ reference "Proposed-Solution-locking-hook"
\end_inset
\end_inset
could be used to enable pthread locking at runtime.
could be used to enable pthread locking at runtime.
\change_unchanged
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when
...
@@ -1043,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when
It also keeps the complexity out of the API, and in ctdbd where it is needed.
It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
tdb_chainlock Functions Expose Implementation
tdb_chainlock Functions Expose Implementation
\end_layout
\end_layout
...
@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having
...
@@ -1124,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals
The API Uses Gratuitous Typedefs, Capitals
\end_layout
\end_layout
...
@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break
...
@@ -1200,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break
the API/ABI.
the API/ABI.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Various Callback Functions Are Not Typesafe
Various Callback Functions Are Not Typesafe
\end_layout
\end_layout
...
@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give
...
@@ -1239,6 +1324,14 @@ With careful use of macros, we can create callback functions which give
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout
\end_layout
...
@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
...
@@ -1274,19 +1367,21 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
\end_inset
\end_inset
.
.
\
change_inserted 0 1284015637
\
end_layout
\begin_layout Subsubsection
Status
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Standard
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
\end_layout
\
change_inserted 0 1284015716
\
begin_layout Subsection
Extending The Header Is Difficult
Extending The Header Is Difficult
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for
We have reserved (zeroed) words in the TDB header, which can be used for
future features.
future features.
If the future features are compulsory, the version number must be updated
If the future features are compulsory, the version number must be updated
...
@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
...
@@ -1296,14 +1391,10 @@ We have reserved (zeroed) words in the TDB header, which can be used for
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284015637
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016114
The header should contain a
The header should contain a
\begin_inset Quotes eld
\begin_inset Quotes eld
\end_inset
\end_inset
...
@@ -1317,58 +1408,48 @@ format variant
...
@@ -1317,58 +1408,48 @@ format variant
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing
The lower part reflects the format variant understood by code accessing
the database.
the database.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write
The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading).
to the database (otherwise you can only open for reading).
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should
The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if
be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant.
the variant of the code is lower than the current lowest variant.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection
This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database.
if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible
Record Headers Are Not Expandible
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require
If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid.
another format change, which we'd like to avoid.
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284015634
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284422552
We often have extra padding at the tail of a record.
We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will
If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a
have a way for future changes to detect code which doesn't understand a
...
@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record.
...
@@ -1377,28 +1458,28 @@ We often have extra padding at the tail of a record.
not present on that record.
not present on that record.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsubsection
Status
\end_layout
\change_inserted 0 1284422568
\begin_layout Standard
Incomplete.
\end_layout
\begin_layout Subsection
TDB Does Not Use Talloc
TDB Does Not Use Talloc
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284422646
Many users of TDB (particularly Samba) use the talloc allocator, and thus
Many users of TDB (particularly Samba) use the talloc allocator, and thus
have to wrap TDB in a talloc context to use it conveniently.
have to wrap TDB in a talloc context to use it conveniently.
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284422656
Proposed Solution
Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423065
The allocation within TDB is not complicated enough to justify the use of
The allocation within TDB is not complicated enough to justify the use of
talloc, and I am reluctant to force another (excellent) library on TDB
talloc, and I am reluctant to force another (excellent) library on TDB
users.
users.
...
@@ -1424,15 +1505,19 @@ context
...
@@ -1424,15 +1505,19 @@ context
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423042
This would form a talloc heirarchy as expected, but the caller would still
This would form a talloc heirarchy as expected, but the caller would still
have to attach a destructor to the tdb context returned from tdb_open to
have to attach a destructor to the tdb context returned from tdb_open to
close it.
close it.
All TDB_DATA fields would be children of the tdb_context, and the caller
All TDB_DATA fields would be children of the tdb_context, and the caller
would still have to manage them (using talloc_free() or talloc_steal()).
would still have to manage them (using talloc_free() or talloc_steal()).
\change_unchanged
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\begin_layout Section
\begin_layout Section
...
@@ -1490,6 +1575,14 @@ Remove the flag.
...
@@ -1490,6 +1575,14 @@ Remove the flag.
point.
point.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB Files Have a 4G Limit
TDB Files Have a 4G Limit
\end_layout
\end_layout
...
@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
...
@@ -1537,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August
be erased and initialized as a fresh tdb!)
be erased and initialized as a fresh tdb!)
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
TDB Records Have a 4G Limit
TDB Records Have a 4G Limit
\end_layout
\end_layout
...
@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A"
...
@@ -1566,6 +1667,14 @@ reference "sub:Records-Incur-A"
).
).
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Hash Size Is Determined At TDB Creation Time
Hash Size Is Determined At TDB Creation Time
\end_layout
\end_layout
...
@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified
...
@@ -1580,16 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1283336713
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
name "sub:Hash-Size-Solution"
name "sub:Hash-Size-Solution"
\end_inset
\end_inset
\change_unchanged
Proposed Solution
Proposed Solution
\end_layout
\end_layout
...
@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
...
@@ -1608,58 +1713,6 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
, it became clear that it is hard to beat a straight linear hash table which
, it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation.
doubles in size when it reaches saturation.
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
Unfortunately, altering the hash table introduces serious locking complications
Unfortunately, altering the hash table introduces serious locking complications
: the entire hash table needs to be locked to enlarge the hash table, and
: the entire hash table needs to be locked to enlarge the hash table, and
others might be holding locks.
others might be holding locks.
...
@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert
...
@@ -1667,8 +1720,6 @@ One possible optimization is to only re-check the hash size on an insert
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283336187
Thus an expanding layered hash will be used: an array of hash groups, with
Thus an expanding layered hash will be used: an array of hash groups, with
each hash group exploding into pointers to lower hash groups once it fills,
each hash group exploding into pointers to lower hash groups once it fills,
turning into a hash tree.
turning into a hash tree.
...
@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
...
@@ -1677,8 +1728,6 @@ Thus an expanding layered hash will be used: an array of hash groups, with
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283336586
Note that bits from the hash table entries should be stolen to hold more
Note that bits from the hash table entries should be stolen to hold more
hash bits to reduce the penalty of collisions.
hash bits to reduce the penalty of collisions.
We can use the otherwise-unused lower 3 bits.
We can use the otherwise-unused lower 3 bits.
...
@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more
...
@@ -1689,8 +1738,14 @@ Note that bits from the hash table entries should be stolen to hold more
bits are valid.
bits are valid.
This means we can choose not to re-hash all entries when we expand a hash
This means we can choose not to re-hash all entries when we expand a hash
group; simply use the next bits we need and mark them invalid.
group; simply use the next bits we need and mark them invalid.
\change_unchanged
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues
...
@@ -1817,8 +1872,6 @@ The single list lock limits our allocation rate; due to the other issues
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed Solution
\change_deleted 0 1283336858
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention.
...
@@ -1833,20 +1886,14 @@ The free list must be split to reduce contention.
This implies that the number of free lists is related to the size of the
This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries
hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets.
we can use far fewer, say 1/32 of the number of hash buckets.
\change_inserted 0 1283336910
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for
It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio
records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes
n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address.
for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable.
Thus an array of doubly-linked free lists seems preferable.
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented"
...
@@ -1860,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented"
) but it's not clear this would reduce contention in the common case where
) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size.
all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious
Thus we almost certainly need to divide in other ways: the most obvious
is to divide the file into zones, and using a free list (or
set
of free
is to divide the file into zones, and using a free list (or
table
of free
lists) for each.
lists) for each.
This approximates address ordering.
This approximates address ordering.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
Note that this means we need to split the free lists when we expand the
Unfortunately it is difficult to know what heuristics should be used to
file; this is probably acceptable when we double the hash table size, since
determine zone sizes, and our transaction code relies on being able to
that is such an expensive operation already.
create a
In the case of increasing the file size, there is an optimization we can
\begin_inset Quotes eld
use: if we use M in the formula above as the file size rounded up to the
\end_inset
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary,
recovery area
\emph on
\begin_inset Quotes erd
and
\end_inset
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive
by simply appending to the file (difficult if it would need to create a
pair of free lists.
new zone header).
Thus we use a linked-list of free tables; currently we only ever create
one, but if there is more than one we choose one at random to use.
In future we may use heuristics to add new free tables on contention.
We only expand the file when all free tables are exhausted.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1886,7 +1937,7 @@ The basic algorithm is as follows.
...
@@ -1886,7 +1937,7 @@ The basic algorithm is as follows.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Identify the correct
zone
.
Identify the correct
free list
.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1894,12 +1945,12 @@ Lock the corresponding list.
...
@@ -1894,12 +1945,12 @@ Lock the corresponding list.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Re-check the
zone
(we didn't have a lock, sizes could have changed): relock
Re-check the
list
(we didn't have a lock, sizes could have changed): relock
if necessary.
if necessary.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Place the freed entry in the list
for that zone
.
Place the freed entry in the list.
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
...
@@ -1908,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Pick a zone either the zone we last freed into, or based on a
Pick a free table; usually the previous one.
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1924,16 +1967,16 @@ Lock the corresponding list.
...
@@ -1924,16 +1967,16 @@ Lock the corresponding list.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Re-check the zone: relock if necessary
.
If the top entry is -large enough, remove it from the list and return it
.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
If the top entry is -large enough, remove it from the list and return it.
Otherwise, coalesce entries in the list.If there was no entry large enough,
unlock the list and try the next largest list
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
Otherwise, coalesce entries in the list.If there was no entry large enough,
If no list has an entry which meets our needs, try the next free table.
unlock the list and try the next zone.
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A"
...
@@ -1965,73 +2008,8 @@ reference "sub:Records-Incur-A"
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
I anticipate that the number of entries in each free zone would be small,
Each free entry has the free table number in the header: less than 255.
but it might be worth using one free entry to hold pointers to the others
It also contains a doubly-linked list for easy deletion.
for cache efficiency.
\change_inserted 0 1283309850
\end_layout
\begin_layout Standard
\change_inserted 0 1283337216
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status open
\begin_layout Plain Layout
\change_inserted 0 1284424151
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\change_unchanged
\end_layout
\end_inset
\change_unchanged
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2233,8 +2211,6 @@ miss
...
@@ -2233,8 +2211,6 @@ miss
it reduces 99.9% of false memcmp).
it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table
As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here.
resolution, the upper bits should be used here.
\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra
Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see
bits in the hash table itself (see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
...
@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution"
...
@@ -2244,8 +2220,6 @@ reference "sub:Hash-Size-Solution"
\end_inset
\end_inset
).
).
\change_unchanged
\end_layout
\end_layout
\begin_layout Enumerate
\begin_layout Enumerate
...
@@ -2282,11 +2256,11 @@ struct tdb_used_record {
...
@@ -2282,11 +2256,11 @@ struct tdb_used_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint32_t magic : 16,
uint32_t
used_
magic : 16,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
prev_is_free: 1,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2294,7 +2268,7 @@ struct tdb_used_record {
...
@@ -2294,7 +2268,7 @@ struct tdb_used_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
top_hash: 1
0
;
top_hash: 1
1
;
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2318,29 +2292,27 @@ struct tdb_free_record {
...
@@ -2318,29 +2292,27 @@ struct tdb_free_record {
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint
32_t free_magic;
uint
64_t free_magic: 8,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint64_t total_length;
prev : 56;
\change_inserted 0 1283337133
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
\change_inserted 0 1283337139
\end_layout
uint64_t prev, next;
\change_unchanged
\begin_layout LyX-Code
uint64_t free_table: 8,
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
total_length : 56
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
uint64_t
tailer
;
uint64_t
next;
;
\end_layout
\end_layout
\begin_layout LyX-Code
\begin_layout LyX-Code
...
@@ -2349,20 +2321,19 @@ struct tdb_free_record {
...
@@ -2349,20 +2321,19 @@ struct tdb_free_record {
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1283337235
\change_deleted 0 1291206079
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_unchanged
\change_unchanged
Note that by limiting valid offsets to 56 bits, we can pack everything we
need into 3 64-byte words, meaning our minimum record size is 8 bytes.
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
...
@@ -2455,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum
a transaction in progress; we need only check for recovery if this is set.
a transaction in progress; we need only check for recovery if this is set.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support
...
@@ -2466,13 +2445,7 @@ TDB Does Not Have Snapshot Support
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
Proposed Solution
Proposed SolutionNone.
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
At some point you say
At some point you say
\begin_inset Quotes eld
\begin_inset Quotes eld
\end_inset
\end_inset
...
@@ -2481,13 +2454,6 @@ use a real database
...
@@ -2481,13 +2454,6 @@ use a real database
\begin_inset Quotes erd
\begin_inset Quotes erd
\end_inset
\end_inset
\change_inserted 0 1284423891
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
(but see
(but see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -2496,8 +2462,6 @@ reference "replay-attribute"
...
@@ -2496,8 +2462,6 @@ reference "replay-attribute"
\end_inset
\end_inset
).
).
\change_unchanged
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
...
@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
...
@@ -2520,8 +2484,14 @@ This would not allow arbitrary changes to the database, such as tdb_repack
\begin_layout Standard
\begin_layout Standard
We could then implement snapshots using a similar method, using multiple
We could then implement snapshots using a similar method, using multiple
different hash tables/free tables.
different hash tables/free tables.
\change_inserted 0 1284423495
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
...
@@ -2541,8 +2511,6 @@ Proposed Solution
...
@@ -2541,8 +2511,6 @@ Proposed Solution
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284424201
None (but see
None (but see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
LatexCommand ref
LatexCommand ref
...
@@ -2551,15 +2519,21 @@ reference "replay-attribute"
...
@@ -2551,15 +2519,21 @@ reference "replay-attribute"
\end_inset
\end_inset
).
).
We could solve a small part of the problem by providing read-only transactions.
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit
These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done.
until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit.
commit.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Default Hash Function Is Suboptimal
Default Hash Function Is Suboptimal
\end_layout
\end_layout
...
@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source,
...
@@ -2600,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source,
hash bombing.
hash bombing.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution"
...
@@ -2637,6 +2619,14 @@ reference "traverse-Proposed-Solution"
.
.
\end_layout
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Complete.
\end_layout
\begin_layout Subsection
\begin_layout Subsection
Fcntl Locking Adds Overhead
Fcntl Locking Adds Overhead
\end_layout
\end_layout
...
@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the
...
@@ -2738,19 +2728,13 @@ At some later point, a sync would allow recovery of the old data into the
free lists (perhaps when the array of top-level pointers filled).
free lists (perhaps when the array of top-level pointers filled).
On crash, tdb_open() would examine the array of top levels, and apply the
On crash, tdb_open() would examine the array of top levels, and apply the
transactions until it encountered an invalid checksum.
transactions until it encountered an invalid checksum.
\change_inserted 0 1284423555
\end_layout
\end_layout
\begin_layout Subsection
\begin_layout Subsection
\change_inserted 0 1284423617
Tracing Is Fragile, Replay Is External
Tracing Is Fragile, Replay Is External
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423719
The current TDB has compile-time-enabled tracing code, but it often breaks
The current TDB has compile-time-enabled tracing code, but it often breaks
as it is not enabled by default.
as it is not enabled by default.
In a similar way, the ctdb code has an external wrapper which does replay
In a similar way, the ctdb code has an external wrapper which does replay
...
@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
...
@@ -2758,8 +2742,6 @@ The current TDB has compile-time-enabled tracing code, but it often breaks
\end_layout
\end_layout
\begin_layout Subsubsection
\begin_layout Subsubsection
\change_inserted 0 1284423864
Proposed Solution
Proposed Solution
\begin_inset CommandInset label
\begin_inset CommandInset label
LatexCommand label
LatexCommand label
...
@@ -2771,8 +2753,6 @@ name "replay-attribute"
...
@@ -2771,8 +2753,6 @@ name "replay-attribute"
\end_layout
\end_layout
\begin_layout Standard
\begin_layout Standard
\change_inserted 0 1284423850
Tridge points out that an attribute can be later added to tdb_open (see
Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref
\begin_inset CommandInset ref
...
@@ -2783,8 +2763,14 @@ reference "attributes"
...
@@ -2783,8 +2763,14 @@ reference "attributes"
) to provide replay/trace hooks, which could become the basis for this and
) to provide replay/trace hooks, which could become the basis for this and
future parallel transactions and snapshot support.
future parallel transactions and snapshot support.
\change_unchanged
\end_layout
\begin_layout Subsubsection
Status
\end_layout
\begin_layout Standard
Deferred.
\end_layout
\end_layout
\end_body
\end_body
...
@@ -2792,6 +2778,996 @@ reference "attributes"
...
@@ -2792,6 +2778,996 @@ reference "attributes"
@
@
1.12
log
@Add status, some fixes, linked freelists.
@
text
@d53 1
a53 7
\change_deleted 0 1291204535
14-September
\change_inserted 0 1291204533
1-December
\change_unchanged
-2010
a580 2
\change_inserted 0 1291204563
a583 2
\change_inserted 0 1291204572
a587 2
\change_inserted 0 1291204573
a588 2
\change_unchanged
a629 2
\change_inserted 0 1291204588
a632 2
\change_inserted 0 1291204588
a636 2
\change_inserted 0 1291204631
a639 2
\change_unchanged
a693 2
\change_inserted 0 1291204639
a696 2
\change_inserted 0 1291204640
a700 2
\change_inserted 0 1291204665
a701 2
\change_unchanged
a722 2
\change_inserted 0 1291204671
a725 2
\change_inserted 0 1291204671
a729 2
\change_inserted 0 1291204673
a730 2
\change_unchanged
a774 2
\change_inserted 0 1291204731
a777 2
\change_inserted 0 1291204732
a781 2
\change_inserted 0 1291204779
a784 2
\change_unchanged
a836 2
\change_inserted 0 1291204830
a839 2
\change_inserted 0 1291204831
a843 2
\change_inserted 0 1291204834
a844 2
\change_unchanged
a898 2
\change_inserted 0 1291204847
a901 2
\change_inserted 0 1291204847
a905 2
\change_inserted 0 1291204852
a906 2
\change_unchanged
a1021 2
\change_inserted 0 1291204881
a1024 2
\change_inserted 0 1291204881
a1028 2
\change_inserted 0 1291204885
a1029 2
\change_unchanged
a1110 2
\change_inserted 0 1291204898
a1113 2
\change_inserted 0 1291204898
a1117 2
\change_inserted 0 1291204901
a1118 2
\change_unchanged
a1194 2
\change_inserted 0 1291204908
a1197 2
\change_inserted 0 1291204908
a1201 2
\change_inserted 0 1291204908
a1202 2
\change_unchanged
a1241 2
\change_inserted 0 1291204917
a1244 2
\change_inserted 0 1291204917
a1248 2
\change_inserted 0 1291204920
a1249 2
\change_unchanged
a1286 2
\change_inserted 0 1291204927
a1289 2
\change_inserted 0 1291204928
a1293 2
\change_inserted 0 1291204942
a1294 2
\change_unchanged
a1345 2
\change_inserted 0 1291205003
a1348 2
\change_inserted 0 1291205004
a1352 2
\change_inserted 0 1291205007
a1375 2
\change_inserted 0 1291205019
a1378 2
\change_inserted 0 1291205019
a1382 2
\change_inserted 0 1291205023
a1383 2
\change_unchanged
a1429 2
\change_inserted 0 1291205029
a1432 2
\change_inserted 0 1291205029
a1436 2
\change_inserted 0 1291206020
a1437 2
\change_unchanged
a1492 2
\change_inserted 0 1291205043
a1495 2
\change_inserted 0 1291205043
a1499 2
\change_inserted 0 1291205057
a1500 2
\change_unchanged
a1547 2
\change_inserted 0 1291205062
a1550 2
\change_inserted 0 1291205062
a1554 2
\change_inserted 0 1291205062
a1555 2
\change_unchanged
a1584 2
\change_inserted 0 1291205072
a1587 2
\change_inserted 0 1291205073
a1591 2
\change_inserted 0 1291205073
a1592 2
\change_unchanged
a1632 4
\change_deleted 0 1291204504
\change_unchanged
a1657 2
\change_inserted 0 1291205079
a1660 2
\change_inserted 0 1291205080
a1664 2
\change_inserted 0 1291205080
a1665 2
\change_unchanged
a1791 2
\change_inserted 0 1291205090
d1827 2
a1828 7
is to divide the file into zones, and using a free list (or
\change_inserted 0 1291205498
table
\change_deleted 0 1291205497
set
\change_unchanged
of free lists) for each.
a1829 2
\change_inserted 0 1291205203
a1832 2
\change_inserted 0 1291205358
a1848 21
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205198
Note that this means we need to split the free lists when we expand the
file; this is probably acceptable when we double the hash table size, since
that is such an expensive operation already.
In the case of increasing the file size, there is an optimization we can
use: if we use M in the formula above as the file size rounded up to the
next power of 2, we only need reshuffle free lists when the file size crosses
a power of 2 boundary,
\emph on
and
\emph default
reshuffling the free lists is trivial: we simply merge every consecutive
pair of free lists.
\change_unchanged
d1857 1
a1857 7
Identify the correct
\change_inserted 0 1291205366
free list
\change_deleted 0 1291205364
zone
\change_unchanged
.
d1865 2
a1866 7
Re-check the
\change_inserted 0 1291205372
list
\change_deleted 0 1291205371
zone
\change_unchanged
(we didn't have a lock, sizes could have changed): relock if necessary.
d1870 1
a1870 5
Place the freed entry in the list
\change_deleted 0 1291205382
for that zone
\change_unchanged
.
d1879 1
a1879 15
Pick a
\change_deleted 0 1291205403
zone either the zone we last freed into, or based on a
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
number.
\change_inserted 0 1291205411
free table; usually the previous one.
\change_unchanged
a1883 10
\change_deleted 0 1291205432
\end_layout
\begin_layout Enumerate
\change_deleted 0 1291205428
Re-check the zone: relock if necessary.
\change_unchanged
d1892 1
a1892 7
unlock the list and try the next
\change_inserted 0 1291205455
largest list
\change_deleted 0 1291205452
zone.
\change_inserted 0 1291205457
a1895 2
\change_inserted 0 1291205476
a1896 2
\change_unchanged
a1924 2
\change_inserted 0 1291205542
a1927 2
\change_inserted 0 1291205591
a1929 70
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205539
I anticipate that the number of entries in each free zone would be small,
but it might be worth using one free entry to hold pointers to the others
for cache efficiency.
\change_unchanged
\end_layout
\begin_layout Standard
\change_deleted 0 1291205534
\begin_inset CommandInset label
LatexCommand label
name "freelist-in-zone"
\end_inset
If we want to avoid locking complexity (enlarging the free lists when we
enlarge the file) we could place the array of free lists at the beginning
of each zone.
This means existing array lists never move, but means that a record cannot
be larger than a zone.
That in turn implies that zones should be variable sized (say, power of
2), which makes the question
\begin_inset Quotes eld
\end_inset
what zone is this record in?
\begin_inset Quotes erd
\end_inset
much harder (and
\begin_inset Quotes eld
\end_inset
pick a random zone
\begin_inset Quotes erd
\end_inset
, but that's less common).
It could be done with as few as 4 bits from the record header.
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
Using
\begin_inset Formula $2^{16+N*3}$
\end_inset
means 0 gives a minimal 65536-byte zone, 15 gives the maximal
\begin_inset Formula $2^{61}$
\end_inset
byte zone.
Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can determine
the start of the zone.
\end_layout
\end_inset
\change_inserted 0 1291205139
d2176 1
a2176 5
uint32_t
\change_inserted 0 1291205758
used_
\change_unchanged
magic : 16,
a2180 4
\change_deleted 0 1291205693
prev_is_free: 1,
\change_unchanged
d2188 1
a2188 7
top_hash: 1
\change_inserted 0 1291205704
1
\change_deleted 0 1291205704
0
\change_unchanged
;
d2212 1
a2212 9
uint
\change_inserted 0 1291205725
64
\change_deleted 0 1291205723
32
\change_unchanged
_t
\change_inserted 0 1291205753
free_magic: 8,
a2215 2
\change_inserted 0 1291205746
a2220 24
\change_deleted 0 1291205749
free_magic;
\change_unchanged
\end_layout
\begin_layout LyX-Code
uint64_t
\change_inserted 0 1291205786
free_table: 8,
\end_layout
\begin_layout LyX-Code
\change_inserted 0 1291205788
\change_unchanged
total_length
\change_inserted 0 1291205792
: 56
\change_deleted 0 1291205790
;
\change_unchanged
d2224 1
a2224 7
uint64_t
\change_deleted 0 1291205801
prev,
\change_unchanged
next;
\change_deleted 0 1291205811
d2228 1
a2228 3
\change_deleted 0 1291205811
...
d2232 1
a2232 5
\change_deleted 0 1291205808
uint64_t tailer
\change_unchanged
;
d2241 5
a2245 16
\change_deleted 0 1291205827
We might want to take some bits from the used record's top_hash (and the
free record which has 32 bits of padding to spare anyway) if we use variable
sized zones.
See
\begin_inset CommandInset ref
LatexCommand ref
reference "freelist-in-zone"
\end_inset
.
\change_inserted 0 1291205885
Note that by limiting valid offsets to 56 bits, we can pack everything
we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
a2248 2
\change_inserted 0 1291205886
a2252 2
\change_inserted 0 1291205886
a2253 2
\change_unchanged
a2343 2
\change_inserted 0 1291205894
a2346 2
\change_inserted 0 1291205894
a2350 2
\change_inserted 0 1291205902
a2351 2
\change_unchanged
a2373 4
\change_deleted 0 1291204504
\change_unchanged
a2403 2
\change_inserted 0 1291205910
a2406 2
\change_inserted 0 1291205910
a2410 2
\change_inserted 0 1291205914
a2411 2
\change_unchanged
a2443 2
\change_inserted 0 1291205919
a2446 2
\change_inserted 0 1291205919
a2450 2
\change_inserted 0 1291205922
a2451 2
\change_unchanged
a2491 2
\change_inserted 0 1291205929
a2494 2
\change_inserted 0 1291205929
a2498 2
\change_inserted 0 1291205929
a2499 2
\change_unchanged
a2536 2
\change_inserted 0 1291205932
a2539 2
\change_inserted 0 1291205933
a2543 2
\change_inserted 0 1291205933
a2544 2
\change_unchanged
a2682 2
\change_inserted 0 1291205944
a2685 2
\change_inserted 0 1291205945
a2689 2
\change_inserted 0 1291205948
a2690 2
\change_unchanged
@
1.11
log
@Merge changes
@
text
@d53 7
a59 1
14-September-2010
d587 16
d644 18
d716 16
d753 16
d813 18
d883 16
d953 16
d1084 16
d1181 16
d1273 16
d1328 16
d1381 16
d1447 19
a1465 2
if older code (which doesn't understand the feature) writes to the database.Reco
rd Headers Are Not Expandible
d1484 16
d1546 16
d1617 16
d1680 16
d1725 16
d1810 16
d1951 8
a1958 3
Proposed SolutionThe first step is to remove all the current heuristics,
as they obviously interact, then examine them once the lock contention
is addressed.
d1989 7
a1995 2
is to divide the file into zones, and using a free list (or set of free
lists) for each.
d1997 2
d2002 25
d2039 2
d2049 7
a2055 1
Identify the correct zone.
d2063 7
a2069 2
Re-check the zone (we didn't have a lock, sizes could have changed): relock
if necessary.
d2073 5
a2077 1
Place the freed entry in the list for that zone.
d2086 3
a2088 1
Pick a zone either the zone we last freed into, or based on a
d2097 4
d2105 2
d2110 2
d2113 2
d2123 15
a2137 1
unlock the list and try the next zone.
d2166 11
d2180 2
d2185 2
d2190 2
d2223 1
a2223 1
status open
d2243 2
d2491 5
a2495 1
uint32_t magic : 16,
d2499 2
d2502 2
d2511 7
a2517 1
top_hash: 10;
d2541 29
a2569 1
uint32_t free_magic;
d2573 11
a2583 1
uint64_t total_length;
d2587 7
a2593 1
uint64_t prev, next;
d2597 2
d2603 5
a2607 1
uint64_t tailer;
d2615 2
d2628 18
d2736 16
d2808 16
d2856 16
d2912 16
d2965 16
d3119 16
@
1.10
log
@Tracing attribute, talloc support.
@
text
@d1 1
a1 1
#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
d53 1
a53 7
\change_deleted 0 1283307542
26-July
\change_inserted 0 1284423485
14-September
\change_unchanged
-2010
a472 2
\change_inserted 0 1284422789
a479 2
\change_unchanged
a838 2
\change_inserted 0 1284016998
a846 2
\change_unchanged
a1194 2
\change_inserted 0 1284015637
a1197 2
\change_inserted 0 1284015716
a1201 2
\change_inserted 0 1284015906
a1210 2
\change_inserted 0 1284015637
a1214 2
\change_inserted 0 1284016114
a1227 2
\change_inserted 0 1284016149
a1232 2
\change_inserted 0 1284016639
a1237 2
\change_inserted 0 1284016821
a1243 2
\change_inserted 0 1284016803
d1245 2
a1246 9
if older code (which doesn't understand the feature) writes to the database.
\change_deleted 0 1284016101
\end_layout
\begin_layout Subsection
\change_inserted 0 1284015634
Record Headers Are Not Expandible
a1249 2
\change_inserted 0 1284015634
a1254 2
\change_inserted 0 1284015634
a1258 2
\change_inserted 0 1284422552
a1267 2
\change_inserted 0 1284422568
a1271 2
\change_inserted 0 1284422646
a1276 2
\change_inserted 0 1284422656
a1280 2
\change_inserted 0 1284423065
a1305 2
\change_inserted 0 1284423042
a1310 2
\change_unchanged
a1457 2
\change_inserted 0 1283336713
a1463 2
\change_unchanged
d1482 2
d1485 1
a1485 51
\change_deleted 0 1283307675
There are three details which become important:
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
On encountering a full bucket, we use the next bucket.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
Extra hash bits are stored with the offset, to reduce comparisons.
\end_layout
\begin_layout Enumerate
\change_deleted 0 1283307675
A marker entry is used on deleting an entry.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The doubling of the table must be done under a transaction; we will not
reduce it on deletion, so it will be an unusual case.
It will either be placed at the head (other entries will be moved out the
way so we can expand).
We could have a pointer in the header to the current hashtable location,
but that pointer would have to be read frequently to check for hashtable
moves.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
The locking for this is slightly more complex than the chained case; we
currently have one lock per bucket, and that means we would need to expand
the lock if we overflow to the next bucket.
The frequency of such collisions will effect our locking heuristics: we
can always lock more buckets than we need.
\end_layout
\begin_layout Standard
\change_deleted 0 1283307675
One possible optimization is to only re-check the hash size on an insert
or a lookup miss.
\change_inserted 0 1283307770
a1492 2
\change_inserted 0 1283336187
a1500 2
\change_inserted 0 1283336586
a1510 2
\change_unchanged
d1636 3
a1638 8
Proposed Solution
\change_deleted 0 1283336858
\end_layout
\begin_layout Standard
The first step is to remove all the current heuristics, as they obviously
interact, then examine them once the lock contention is addressed.
a1647 2
\change_inserted 0 1283336910
a1650 2
\change_inserted 0 1283337052
a1655 2
\change_unchanged
a1776 2
\change_inserted 0 1283309850
a1779 2
\change_inserted 0 1283337216
a1813 2
\change_inserted 0 1284424151
a1825 2
\change_unchanged
a1830 2
\change_unchanged
a2031 2
\change_inserted 0 1283336739
a2040 2
\change_unchanged
a2117 2
\change_inserted 0 1283337133
a2120 2
\change_inserted 0 1283337139
a2121 2
\change_unchanged
a2136 2
\change_inserted 0 1283337235
a2147 2
\change_unchanged
d2251 1
a2251 7
Proposed Solution
\change_deleted 0 1284423472
\end_layout
\begin_layout Standard
None.
d2261 1
a2261 1
\change_inserted 0 1284423891
d2263 1
a2263 4
\change_deleted 0 1284423891
.
\change_inserted 0 1284423901
a2271 2
\change_unchanged
a2293 2
\change_inserted 0 1284423495
a2312 2
\change_inserted 0 1284424201
d2321 1
a2321 3
\change_unchanged
We could solve a small part of the problem by providing read-only transactions.
a2505 2
\change_inserted 0 1284423555
a2508 2
\change_inserted 0 1284423617
a2512 2
\change_inserted 0 1284423719
a2519 2
\change_inserted 0 1284423864
a2530 2
\change_inserted 0 1284423850
a2540 2
\change_unchanged
@
1.9
1.9
log
log
@Extension mechanism.
@Extension mechanism.
...
...
ccan/tdb2/doc/design.pdf
View file @
51a56b52
No preview for this file type
ccan/tdb2/doc/design.txt
View file @
51a56b52
...
@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
...
@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
Rusty Russell, IBM Corporation
Rusty Russell, IBM Corporation
1
4-Sept
ember-2010
1
-Dec
ember-2010
Abstract
Abstract
...
@@ -129,6 +129,10 @@ union tdb_attribute {
...
@@ -129,6 +129,10 @@ union tdb_attribute {
This allows future attributes to be added, even if this expands
This allows future attributes to be added, even if this expands
the size of the union.
the size of the union.
2.1.2 Status
Complete.
2.2 tdb_traverse Makes Impossible Guarantees
2.2 tdb_traverse Makes Impossible Guarantees
tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
...
@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
...
@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
You can prevent changes by using a transaction or the locking
You can prevent changes by using a transaction or the locking
API.
API.
2.2.2 Status
Complete. Delete-during-traverse will still delete every record,
too (assuming no other changes).
2.3 Nesting of Transactions Is Fraught
2.3 Nesting of Transactions Is Fraught
TDB has alternated between allowing nested transactions and not
TDB has alternated between allowing nested transactions and not
...
@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
...
@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
tdb_add_flags() and tdb_remove_flags(), so the API should not be
tdb_add_flags() and tdb_remove_flags(), so the API should not be
expanded for this relatively-obscure case.
expanded for this relatively-obscure case.
2.3.2 Status
Incomplete; nesting flag is still defined as per tdb1.
2.4 Incorrect Hash Function is Not Detected
2.4 Incorrect Hash Function is Not Detected
tdb_open_ex() allows the calling code to specify a different hash
tdb_open_ex() allows the calling code to specify a different hash
...
@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
...
@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
0xdeadbeef), and tdb_open_ex() should check that the given hash
0xdeadbeef), and tdb_open_ex() should check that the given hash
function produces the same answer, or fail the tdb_open call.
function produces the same answer, or fail the tdb_open call.
2.4.2 Status
Complete.
2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
In response to scalability issues with the free list ([TDB-Freelist-Is]
In response to scalability issues with the free list ([TDB-Freelist-Is]
...
@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
...
@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
as fetch in order to allow some internal tuning, but initially
as fetch in order to allow some internal tuning, but initially
will become a no-op.
will become a no-op.
2.5.2 Status
Incomplete. TDB_VOLATILE still defined, but implementation should
fail on unknown flags to be future-proof.
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
In The Same Process
In The Same Process
...
@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
...
@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
benefit to adding a call to detect when a tdb_context is shared,
benefit to adding a call to detect when a tdb_context is shared,
to allow other to create such an API.
to allow other to create such an API.
2.6.2 Status
Incomplete.
2.7 TDB API Is Not POSIX Thread-safe
2.7 TDB API Is Not POSIX Thread-safe
The TDB API uses an error code which can be queried after an
The TDB API uses an error code which can be queried after an
...
@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
...
@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
proposed for [Proposed-Solution-locking-hook] could be used to
proposed for [Proposed-Solution-locking-hook] could be used to
enable pthread locking at runtime.
enable pthread locking at runtime.
2.7.2 Status
Incomplete.
2.8 *_nonblock Functions And *_mark Functions Expose
2.8 *_nonblock Functions And *_mark Functions Expose
Implementation
Implementation
...
@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
...
@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
It also keeps the complexity out of the API, and in ctdbd where
It also keeps the complexity out of the API, and in ctdbd where
it is needed.
it is needed.
2.8.2 Status
Incomplete.
2.9 tdb_chainlock Functions Expose Implementation
2.9 tdb_chainlock Functions Expose Implementation
tdb_chainlock locks some number of records, including the record
tdb_chainlock locks some number of records, including the record
...
@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
...
@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
otherwise EAGAIN.
otherwise EAGAIN.
]
]
2.10.2 Status
Incomplete.
2.11 The API Uses Gratuitous Typedefs, Capitals
2.11 The API Uses Gratuitous Typedefs, Capitals
typedefs are useful for providing source compatibility when types
typedefs are useful for providing source compatibility when types
...
@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
...
@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
It should simply take an extra argument, since we are prepared to
It should simply take an extra argument, since we are prepared to
break the API/ABI.
break the API/ABI.
2.12.2 Status
Complete.
2.13 Various Callback Functions Are Not Typesafe
2.13 Various Callback Functions Are Not Typesafe
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
...
@@ -455,6 +497,10 @@ their parameter.
...
@@ -455,6 +497,10 @@ their parameter.
See CCAN's typesafe_cb module at
See CCAN's typesafe_cb module at
http://ccan.ozlabs.org/info/typesafe_cb.html
http://ccan.ozlabs.org/info/typesafe_cb.html
2.13.2 Status
Incomplete.
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
tdb_reopen_all Problematic
tdb_reopen_all Problematic
...
@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
...
@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
2.14.2 Status
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
implemented.
2.15 Extending The Header Is Difficult
2.15 Extending The Header Is Difficult
We have reserved (zeroed) words in the TDB header, which can be
We have reserved (zeroed) words in the TDB header, which can be
...
@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
...
@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
detection if older code (which doesn't understand the feature)
detection if older code (which doesn't understand the feature)
writes to the database.
writes to the database.
2.15.2 Status
Incomplete.
2.16 Record Headers Are Not Expandible
2.16 Record Headers Are Not Expandible
If we later want to add (say) checksums on keys and data, it
If we later want to add (say) checksums on keys and data, it
...
@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
...
@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
the tail, and thus if there is no tail or the first byte is 0, we
the tail, and thus if there is no tail or the first byte is 0, we
would know the extension is not present on that record.
would know the extension is not present on that record.
2.16.2 Status
Incomplete.
2.17 TDB Does Not Use Talloc
2.17 TDB Does Not Use Talloc
Many users of TDB (particularly Samba) use the talloc allocator,
Many users of TDB (particularly Samba) use the talloc allocator,
...
@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
...
@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
children of the tdb_context, and the caller would still have to
children of the tdb_context, and the caller would still have to
manage them (using talloc_free() or talloc_steal()).
manage them (using talloc_free() or talloc_steal()).
2.17.2 Status
Deferred.
3 Performance And Scalability Issues
3 Performance And Scalability Issues
3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
...
@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
...
@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
tend to know when they are initializing for the first time and
tend to know when they are initializing for the first time and
can simply unlink the old tdb at that point.
can simply unlink the old tdb at that point.
3.1.2 Status
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
3.2 TDB Files Have a 4G Limit
3.2 TDB Files Have a 4G Limit
This seems to be becoming an issue (so much for “trivial”!),
This seems to be becoming an issue (so much for “trivial”!),
...
@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
...
@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
August 2009, commit 398d0c29290: prior to that any unrecognized
August 2009, commit 398d0c29290: prior to that any unrecognized
file format would be erased and initialized as a fresh tdb!)
file format would be erased and initialized as a fresh tdb!)
3.2.2 Status
Complete.
3.3 TDB Records Have a 4G Limit
3.3 TDB Records Have a 4G Limit
This has not been a reported problem, and the API uses size_t
This has not been a reported problem, and the API uses size_t
...
@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
...
@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
seems unlikely that 32 bit keys will be a limitation, so the
seems unlikely that 32 bit keys will be a limitation, so the
implementation may not support this (see [sub:Records-Incur-A]).
implementation may not support this (see [sub:Records-Incur-A]).
3.3.2 Status
Complete.
3.4 Hash Size Is Determined At TDB Creation Time
3.4 Hash Size Is Determined At TDB Creation Time
TDB contains a number of hash chains in the header; the number is
TDB contains a number of hash chains in the header; the number is
...
@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
...
@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
expanding tree of hashes would be very close to optimal.
expanding tree of hashes would be very close to optimal.
], it became clear that it is hard to beat a straight linear hash
], it became clear that it is hard to beat a straight linear hash
table which doubles in size when it reaches saturation.
table which doubles in size when it reaches saturation.
Unfortunately, altering the hash table introduces serious locking
1.
complications: the entire hash table needs to be locked to
enlarge the hash table, and others might be holding locks.
2.
3.
Unfortunately, altering the hash table introduces serious
locking complications: the entire hash table needs to be locked
to enlarge the hash table, and others might be holding locks.
Particularly insidious are insertions done under tdb_chainlock.
Particularly insidious are insertions done under tdb_chainlock.
Thus an expanding layered hash will be used: an array of hash
Thus an expanding layered hash will be used: an array of hash
...
@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
...
@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
hash group; simply use the next bits we need and mark them
hash group; simply use the next bits we need and mark them
invalid.
invalid.
3.4.2 Status
Complete.
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
TDB uses a single linked list for the free list. Allocation
TDB uses a single linked list for the free list. Allocation
...
@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
...
@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
case where all processes are allocating/freeing the same size.
case where all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most
Thus we almost certainly need to divide in other ways: the most
obvious is to divide the file into zones, and using a free list
obvious is to divide the file into zones, and using a free list
(or
set
of free lists) for each. This approximates address
(or
table
of free lists) for each. This approximates address
ordering.
ordering.
Note that this means we need to split the free lists when w
e
Unfortunately it is difficult to know what heuristics should b
e
expand the file; this is probably acceptable when we double the
used to determine zone sizes, and our transaction code relies on
hash table size, since that is such an expensive operation
being able to create a “recovery area” by simply appending to the
already. In the case of increasing the file size, there is an
file (difficult if it would need to create a new zone header).
optimization we can use: if we use M in the formula above as the
Thus we use a linked-list of free tables; currently we only ever
file size rounded up to the next power of 2, we only need
create one, but if there is more than one we choose one at random
reshuffle free lists when the file size crosses a power of 2
to use. In future we may use heuristics to add new free tables on
boundary, and reshuffling the free lists is trivial: we simply
contention. We only expand the file when all free tables are
merge every consecutive pair of free lists
.
exhausted
.
The basic algorithm is as follows. Freeing is simple:
The basic algorithm is as follows. Freeing is simple:
1. Identify the correct
zone
.
1. Identify the correct
free list
.
2. Lock the corresponding list.
2. Lock the corresponding list.
3. Re-check the
zone
(we didn't have a lock, sizes could have
3. Re-check the
list
(we didn't have a lock, sizes could have
changed): relock if necessary.
changed): relock if necessary.
4. Place the freed entry in the list
for that zone
.
4. Place the freed entry in the list.
Allocation is a little more complicated, as we perform delayed
Allocation is a little more complicated, as we perform delayed
coalescing at this point:
coalescing at this point:
1. Pick a zone either the zone we last freed into, or based on a “
1. Pick a free table; usually the previous one.
random” number.
2. Lock the corresponding list.
2. Lock the corresponding list.
3. Re-check the zone: relock if necessary.
3. If the top entry is -large enough, remove it from the list and
4. If the top entry is -large enough, remove it from the list and
return it.
return it.
5. Otherwise, coalesce entries in the list.If there was no entry
4. Otherwise, coalesce entries in the list.If there was no entry
large enough, unlock the list and try the next zone.
large enough, unlock the list and try the next largest list
5. If no list has an entry which meets our needs, try the next
free table.
6. If no zone satisfies, expand the file.
6. If no zone satisfies, expand the file.
...
@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
...
@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
does not need a tailer to coalesce, though if we needed one we
does not need a tailer to coalesce, though if we needed one we
could have one cheaply: see [sub:Records-Incur-A].
could have one cheaply: see [sub:Records-Incur-A].
I anticipate that the number of entries in each free zone would
Each free entry has the free table number in the header: less
be small, but it might be worth using one free entry to hold
than 255. It also contains a doubly-linked list for easy
pointers to the others for cache efficiency.
deletion.
<freelist-in-zone>If we want to avoid locking complexity
(enlarging the free lists when we enlarge the file) we could
place the array of free lists at the beginning of each zone. This
means existing array lists never move, but means that a record
cannot be larger than a zone. That in turn implies that zones
should be variable sized (say, power of 2), which makes the
question “what zone is this record in?” much harder (and “pick a
random zone”, but that's less common). It could be done with as
few as 4 bits from the record header.[footnote:
Using 2^{16+N*3}means 0 gives a minimal 65536-byte zone, 15 gives
the maximal 2^{61} byte zone. Zones range in factor of 8 steps.
Given the zone size for the zone the current record is in, we can
determine the start of the zone.
]
3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
...
@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
...
@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
struct tdb_used_record {
struct tdb_used_record {
uint32_t magic : 16,
uint32_t used_magic : 16,
prev_is_free: 1,
key_data_divide: 5,
key_data_divide: 5,
top_hash: 1
0
;
top_hash: 1
1
;
uint32_t extra_octets;
uint32_t extra_octets;
...
@@ -962,21 +1015,27 @@ And a free record like this:
...
@@ -962,21 +1015,27 @@ And a free record like this:
struct tdb_free_record {
struct tdb_free_record {
uint32_t free_magic;
uint64_t free_magic: 8,
prev : 56;
uint64_t total_length;
uint64_t prev, next;
...
uint64_t free_table: 8,
uint64_t tailer;
total_length : 56
uint64_t next;;
};
};
We might want to take some bits from the used record's top_hash
Note that by limiting valid offsets to 56 bits, we can pack
(and the free record which has 32 bits of padding to spare
everything we need into 3 64-byte words, meaning our minimum
anyway) if we use variable sized zones. See [freelist-in-zone].
record size is 8 bytes.
3.7.2 Status
Complete.
3.8 Transaction Commit Requires 4 fdatasync
3.8 Transaction Commit Requires 4 fdatasync
...
@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
...
@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
header field can be used to indicate a transaction in progress;
header field can be used to indicate a transaction in progress;
we need only check for recovery if this is set.
we need only check for recovery if this is set.
3.
9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
3.
8.2 Status
3.9.1 Proposed Solution
Deferred.
None. At some point you say “use a real database” (but see [replay-attribute]
3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
).
3.9.1 Proposed SolutionNone. At some point you say “use a real
database” (but see [replay-attribute]).
But as a thought experiment, if we implemented transactions to
But as a thought experiment, if we implemented transactions to
only overwrite free entries (this is tricky: there must not be a
only overwrite free entries (this is tricky: there must not be a
...
@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
...
@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
We could then implement snapshots using a similar method, using
We could then implement snapshots using a similar method, using
multiple different hash tables/free tables.
multiple different hash tables/free tables.
3.9.2 Status
Deferred.
3.10 Transactions Cannot Operate in Parallel
3.10 Transactions Cannot Operate in Parallel
This would be useless for ldb, as it hits the index records with
This would be useless for ldb, as it hits the index records with
...
@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
...
@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
until all r/o transactions are done. This would require a new
until all r/o transactions are done. This would require a new
RO_TRANSACTION_LOCK, which would be upgraded on commit.
RO_TRANSACTION_LOCK, which would be upgraded on commit.
3.10.2 Status
Deferred.
3.11 Default Hash Function Is Suboptimal
3.11 Default Hash Function Is Suboptimal
The Knuth-inspired multiplicative hash used by tdb is fairly slow
The Knuth-inspired multiplicative hash used by tdb is fairly slow
...
@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
...
@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
source, and placed in the header. This is far from foolproof, but
source, and placed in the header. This is far from foolproof, but
adds a little bit of protection against hash bombing.
adds a little bit of protection against hash bombing.
3.11.2 Status
Complete.
3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
We lock a record during traversal iteration, and try to grab that
We lock a record during traversal iteration, and try to grab that
...
@@ -1104,6 +1177,10 @@ indefinitely.
...
@@ -1104,6 +1177,10 @@ indefinitely.
Remove reliability guarantees; see [traverse-Proposed-Solution].
Remove reliability guarantees; see [traverse-Proposed-Solution].
3.12.2 Status
Complete.
3.13 Fcntl Locking Adds Overhead
3.13 Fcntl Locking Adds Overhead
Placing a fcntl lock means a system call, as does removing one.
Placing a fcntl lock means a system call, as does removing one.
...
@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
...
@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
could become the basis for this and future parallel transactions
could become the basis for this and future parallel transactions
and snapshot support.
and snapshot support.
3.15.2 Status
Deferred.
ccan/tdb2/free.c
View file @
51a56b52
...
@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
...
@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
return
bucket
;
return
bucket
;
}
}
tdb_off_t
first_f
list
(
struct
tdb_context
*
tdb
)
tdb_off_t
first_f
table
(
struct
tdb_context
*
tdb
)
{
{
return
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
free_
list
));
return
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
free_
table
));
}
}
tdb_off_t
next_f
list
(
struct
tdb_context
*
tdb
,
tdb_off_t
flist
)
tdb_off_t
next_f
table
(
struct
tdb_context
*
tdb
,
tdb_off_t
ftable
)
{
{
return
tdb_read_off
(
tdb
,
f
list
+
offsetof
(
struct
tdb_freelist
,
next
));
return
tdb_read_off
(
tdb
,
f
table
+
offsetof
(
struct
tdb_freetable
,
next
));
}
}
int
tdb_f
list
_init
(
struct
tdb_context
*
tdb
)
int
tdb_f
table
_init
(
struct
tdb_context
*
tdb
)
{
{
/* Use reservoir sampling algorithm to select a free list at random. */
/* Use reservoir sampling algorithm to select a free list at random. */
unsigned
int
rnd
,
max
=
0
;
unsigned
int
rnd
,
max
=
0
,
count
=
0
;
tdb_off_t
off
;
tdb_off_t
off
;
tdb
->
flist_off
=
off
=
first_flist
(
tdb
);
tdb
->
ftable_off
=
off
=
first_ftable
(
tdb
);
tdb
->
ftable
=
0
;
while
(
off
)
{
while
(
off
)
{
if
(
off
==
TDB_OFF_ERR
)
if
(
off
==
TDB_OFF_ERR
)
...
@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
...
@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
rnd
=
random
();
rnd
=
random
();
if
(
rnd
>=
max
)
{
if
(
rnd
>=
max
)
{
tdb
->
flist_off
=
off
;
tdb
->
ftable_off
=
off
;
tdb
->
ftable
=
count
;
max
=
rnd
;
max
=
rnd
;
}
}
off
=
next_flist
(
tdb
,
off
);
off
=
next_ftable
(
tdb
,
off
);
count
++
;
}
}
return
0
;
return
0
;
}
}
/* Offset of a given bucket. */
/* Offset of a given bucket. */
tdb_off_t
bucket_off
(
tdb_off_t
f
list
_off
,
unsigned
bucket
)
tdb_off_t
bucket_off
(
tdb_off_t
f
table
_off
,
unsigned
bucket
)
{
{
return
f
list_off
+
offsetof
(
struct
tdb_freelist
,
buckets
)
return
f
table_off
+
offsetof
(
struct
tdb_freetable
,
buckets
)
+
bucket
*
sizeof
(
tdb_off_t
);
+
bucket
*
sizeof
(
tdb_off_t
);
}
}
/* Returns free_buckets + 1, or list number to search. */
/* Returns free_buckets + 1, or list number to search. */
static
tdb_off_t
find_free_head
(
struct
tdb_context
*
tdb
,
static
tdb_off_t
find_free_head
(
struct
tdb_context
*
tdb
,
tdb_off_t
f
list
_off
,
tdb_off_t
f
table
_off
,
tdb_off_t
bucket
)
tdb_off_t
bucket
)
{
{
/* Speculatively search for a non-zero bucket. */
/* Speculatively search for a non-zero bucket. */
return
tdb_find_nonzero_off
(
tdb
,
bucket_off
(
f
list
_off
,
0
),
return
tdb_find_nonzero_off
(
tdb
,
bucket_off
(
f
table
_off
,
0
),
bucket
,
TDB_FREE_BUCKETS
);
bucket
,
TDB_FREE_BUCKETS
);
}
}
/* Remove from free bucket. */
/* Remove from free bucket. */
static
int
remove_from_list
(
struct
tdb_context
*
tdb
,
static
int
remove_from_list
(
struct
tdb_context
*
tdb
,
tdb_off_t
b_off
,
tdb_off_t
r_off
,
tdb_off_t
b_off
,
tdb_off_t
r_off
,
struct
tdb_free_record
*
r
)
const
struct
tdb_free_record
*
r
)
{
{
tdb_off_t
off
;
tdb_off_t
off
;
/* Front of list? */
/* Front of list? */
if
(
r
->
prev
==
0
)
{
if
(
frec_prev
(
r
)
==
0
)
{
off
=
b_off
;
off
=
b_off
;
}
else
{
}
else
{
off
=
r
->
prev
+
offsetof
(
struct
tdb_free_record
,
next
);
off
=
frec_prev
(
r
)
+
offsetof
(
struct
tdb_free_record
,
next
);
}
}
#ifdef DEBUG
#ifdef DEBUG
if
(
tdb_read_off
(
tdb
,
off
)
!=
r_off
)
{
if
(
tdb_read_off
(
tdb
,
off
)
!=
r_off
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
"remove_from_list: %llu bad prev in list %llu
\n
"
,
"remove_from_list: %llu bad prev in list %llu"
,
(
long
long
)
r_off
,
(
long
long
)
b_off
);
(
long
long
)
r_off
,
(
long
long
)
b_off
);
return
-
1
;
return
-
1
;
}
}
...
@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
...
@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
}
}
if
(
r
->
next
!=
0
)
{
if
(
r
->
next
!=
0
)
{
off
=
r
->
next
+
offsetof
(
struct
tdb_free_record
,
prev
);
off
=
r
->
next
+
offsetof
(
struct
tdb_free_record
,
magic_and_
prev
);
/* r->next->prev = r->prev */
/* r->next->prev = r->prev */
#ifdef DEBUG
#ifdef DEBUG
if
(
tdb_read_off
(
tdb
,
off
)
!=
r_off
)
{
if
(
tdb_read_off
(
tdb
,
off
)
&
TDB_OFF_MASK
!=
r_off
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
"remove_from_list: %llu bad list %llu
\n
"
,
"remove_from_list: %llu bad list %llu
"
,
(
long
long
)
r_off
,
(
long
long
)
b_off
);
(
long
long
)
r_off
,
(
long
long
)
b_off
);
return
-
1
;
return
-
1
;
}
}
#endif
#endif
if
(
tdb_write_off
(
tdb
,
off
,
r
->
prev
))
{
if
(
tdb_write_off
(
tdb
,
off
,
r
->
magic_and_
prev
))
{
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
...
@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
static
int
enqueue_in_free
(
struct
tdb_context
*
tdb
,
static
int
enqueue_in_free
(
struct
tdb_context
*
tdb
,
tdb_off_t
b_off
,
tdb_off_t
b_off
,
tdb_off_t
off
,
tdb_off_t
off
,
struct
tdb_free_record
*
new
)
tdb_len_t
len
)
{
{
new
->
prev
=
0
;
struct
tdb_free_record
new
;
uint64_t
magic
=
(
TDB_FREE_MAGIC
<<
(
64
-
TDB_OFF_UPPER_STEAL
));
/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
new
.
ftable_and_len
=
((
uint64_t
)
tdb
->
ftable
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
|
len
;
/* prev = 0. */
new
.
magic_and_prev
=
magic
;
/* new->next = head. */
/* new->next = head. */
new
->
next
=
tdb_read_off
(
tdb
,
b_off
);
new
.
next
=
tdb_read_off
(
tdb
,
b_off
);
if
(
new
->
next
==
TDB_OFF_ERR
)
if
(
new
.
next
==
TDB_OFF_ERR
)
return
-
1
;
return
-
1
;
if
(
new
->
next
)
{
if
(
new
.
next
)
{
#ifdef DEBUG
#ifdef DEBUG
if
(
tdb_read_off
(
tdb
,
if
(
tdb_read_off
(
tdb
,
new
->
next
new
.
next
+
offsetof
(
struct
tdb_free_record
,
+
offsetof
(
struct
tdb_free_record
,
prev
))
magic_and_prev
))
!=
0
)
{
!=
magic
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
"enqueue_in_free: %llu bad head prev %llu
\n
"
,
"enqueue_in_free: %llu bad head"
(
long
long
)
new
->
next
,
(
long
long
)
b_off
);
" prev %llu"
,
(
long
long
)
new
.
next
,
(
long
long
)
b_off
);
return
-
1
;
return
-
1
;
}
}
#endif
#endif
/* next->prev = new. */
/* next->prev = new. */
if
(
tdb_write_off
(
tdb
,
new
->
next
if
(
tdb_write_off
(
tdb
,
new
.
next
+
offsetof
(
struct
tdb_free_record
,
prev
),
+
offsetof
(
struct
tdb_free_record
,
off
)
!=
0
)
magic_and_prev
),
off
|
magic
)
!=
0
)
return
-
1
;
return
-
1
;
}
}
/* head = new */
/* head = new */
if
(
tdb_write_off
(
tdb
,
b_off
,
off
)
!=
0
)
if
(
tdb_write_off
(
tdb
,
b_off
,
off
)
!=
0
)
return
-
1
;
return
-
1
;
return
tdb_write_convert
(
tdb
,
off
,
new
,
sizeof
(
*
new
));
return
tdb_write_convert
(
tdb
,
off
,
&
new
,
sizeof
(
new
));
}
}
/* List need not be locked. */
/* List need not be locked. */
int
add_free_record
(
struct
tdb_context
*
tdb
,
int
add_free_record
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len_with_header
)
tdb_off_t
off
,
tdb_len_t
len_with_header
)
{
{
struct
tdb_free_record
new
;
tdb_off_t
b_off
;
tdb_off_t
b_off
;
tdb_len_t
len
;
int
ret
;
int
ret
;
assert
(
len_with_header
>=
sizeof
(
new
));
assert
(
len_with_header
>=
sizeof
(
struct
tdb_free_record
));
new
.
magic_and_meta
=
TDB_FREE_MAGIC
<<
(
64
-
TDB_OFF_UPPER_STEAL
)
len
=
len_with_header
-
sizeof
(
struct
tdb_used_record
);
|
tdb
->
flist_off
;
new
.
data_len
=
len_with_header
-
sizeof
(
struct
tdb_used_record
);
b_off
=
bucket_off
(
tdb
->
f
list_off
,
size_to_bucket
(
new
.
data_
len
));
b_off
=
bucket_off
(
tdb
->
f
table_off
,
size_to_bucket
(
len
));
if
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
!=
0
)
if
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
!=
0
)
return
-
1
;
return
-
1
;
ret
=
enqueue_in_free
(
tdb
,
b_off
,
off
,
&
new
);
ret
=
enqueue_in_free
(
tdb
,
b_off
,
off
,
len
);
tdb_unlock_free_bucket
(
tdb
,
b_off
);
tdb_unlock_free_bucket
(
tdb
,
b_off
);
return
ret
;
return
ret
;
}
}
...
@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
...
@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
return
leftover
;
return
leftover
;
}
}
static
tdb_off_t
ftable_offset
(
struct
tdb_context
*
tdb
,
unsigned
int
ftable
)
{
tdb_off_t
off
;
unsigned
int
i
;
if
(
likely
(
tdb
->
ftable
==
ftable
))
return
tdb
->
ftable_off
;
off
=
first_ftable
(
tdb
);
for
(
i
=
0
;
i
<
ftable
;
i
++
)
off
=
next_ftable
(
tdb
,
off
);
return
off
;
}
/* Note: we unlock the current bucket if we coalesce or fail. */
/* Note: we unlock the current bucket if we coalesce or fail. */
static
int
coalesce
(
struct
tdb_context
*
tdb
,
static
int
coalesce
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
b_off
,
tdb_len_t
data_len
)
tdb_off_t
off
,
tdb_off_t
b_off
,
tdb_len_t
data_len
)
{
{
struct
tdb_free_record
pad
,
*
r
;
tdb_off_t
end
;
tdb_off_t
end
;
struct
tdb_free_record
rec
;
add_stat
(
tdb
,
alloc_coalesce_tried
,
1
);
end
=
off
+
sizeof
(
struct
tdb_used_record
)
+
data_len
;
end
=
off
+
sizeof
(
struct
tdb_used_record
)
+
data_len
;
while
(
end
<
tdb
->
map_size
)
{
while
(
end
<
tdb
->
map_size
)
{
const
struct
tdb_free_record
*
r
;
tdb_off_t
nb_off
;
tdb_off_t
nb_off
;
unsigned
ftable
,
bucket
;
/* FIXME: do tdb_get here and below really win? */
r
=
tdb_access_read
(
tdb
,
end
,
sizeof
(
*
r
),
true
);
r
=
tdb_get
(
tdb
,
end
,
&
pad
,
sizeof
(
pad
));
if
(
!
r
)
if
(
!
r
)
goto
err
;
goto
err
;
if
(
frec_magic
(
r
)
!=
TDB_FREE_MAGIC
)
if
(
frec_magic
(
r
)
!=
TDB_FREE_MAGIC
||
frec_ftable
(
r
)
==
TDB_FTABLE_NONE
)
{
tdb_access_release
(
tdb
,
r
);
break
;
break
;
}
nb_off
=
bucket_off
(
frec_flist
(
r
),
size_to_bucket
(
r
->
data_len
));
ftable
=
frec_ftable
(
r
);
bucket
=
size_to_bucket
(
frec_len
(
r
));
nb_off
=
bucket_off
(
ftable_offset
(
tdb
,
ftable
),
bucket
);
tdb_access_release
(
tdb
,
r
);
/* We may be violating lock order here, so best effort. */
/* We may be violating lock order here, so best effort. */
if
(
tdb_lock_free_bucket
(
tdb
,
nb_off
,
TDB_LOCK_NOWAIT
)
==
-
1
)
if
(
tdb_lock_free_bucket
(
tdb
,
nb_off
,
TDB_LOCK_NOWAIT
)
==
-
1
)
{
add_stat
(
tdb
,
alloc_coalesce_lockfail
,
1
);
break
;
break
;
}
/* Now we have lock, re-check. */
/* Now we have lock, re-check. */
r
=
tdb_get
(
tdb
,
end
,
&
pad
,
sizeof
(
pad
));
if
(
tdb_read_convert
(
tdb
,
end
,
&
rec
,
sizeof
(
rec
)))
{
if
(
!
r
)
{
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
goto
err
;
goto
err
;
}
}
if
(
unlikely
(
frec_magic
(
r
)
!=
TDB_FREE_MAGIC
))
{
if
(
unlikely
(
frec_magic
(
&
rec
)
!=
TDB_FREE_MAGIC
))
{
add_stat
(
tdb
,
alloc_coalesce_race
,
1
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
break
;
break
;
}
}
if
(
unlikely
(
bucket_off
(
frec_flist
(
r
),
if
(
unlikely
(
frec_ftable
(
&
rec
)
!=
ftable
)
size_to_bucket
(
r
->
data_len
))
||
unlikely
(
size_to_bucket
(
frec_len
(
&
rec
))
!=
bucket
))
{
!=
nb_off
))
{
add_stat
(
tdb
,
alloc_coalesce_race
,
1
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
break
;
break
;
}
}
if
(
remove_from_list
(
tdb
,
nb_off
,
end
,
r
)
==
-
1
)
{
if
(
remove_from_list
(
tdb
,
nb_off
,
end
,
&
rec
)
==
-
1
)
{
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
goto
err
;
goto
err
;
}
}
end
+=
sizeof
(
struct
tdb_used_record
)
+
r
->
data_len
;
end
+=
sizeof
(
struct
tdb_used_record
)
+
frec_len
(
&
rec
)
;
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
tdb_unlock_free_bucket
(
tdb
,
nb_off
);
add_stat
(
tdb
,
alloc_coalesce_num_merged
,
1
);
}
}
/* Didn't find any adjacent free? */
/* Didn't find any adjacent free? */
if
(
end
==
off
+
sizeof
(
struct
tdb_used_record
)
+
data_len
)
if
(
end
==
off
+
sizeof
(
struct
tdb_used_record
)
+
data_len
)
return
0
;
return
0
;
/* OK, expand record */
/* OK, expand initial record */
r
=
tdb_get
(
tdb
,
off
,
&
pad
,
sizeof
(
pad
));
if
(
tdb_read_convert
(
tdb
,
off
,
&
rec
,
sizeof
(
rec
)))
if
(
!
r
)
goto
err
;
goto
err
;
if
(
r
->
data_len
!=
data_len
)
{
if
(
frec_len
(
&
rec
)
!=
data_len
)
{
tdb
->
ecode
=
TDB_ERR_CORRUPT
;
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"coalesce: expected data len %zu not %zu"
,
"coalesce: expected data len %llu not %llu
\n
"
,
(
size_t
)
data_len
,
(
size_t
)
frec_len
(
&
rec
));
(
long
long
)
data_len
,
(
long
long
)
r
->
data_len
);
goto
err
;
goto
err
;
}
}
if
(
remove_from_list
(
tdb
,
b_off
,
off
,
r
)
==
-
1
)
if
(
remove_from_list
(
tdb
,
b_off
,
off
,
&
rec
)
==
-
1
)
goto
err
;
r
=
tdb_access_write
(
tdb
,
off
,
sizeof
(
*
r
),
true
);
if
(
!
r
)
goto
err
;
goto
err
;
/* We have to drop this to avoid deadlocks, so make sure record
/* We have to drop this to avoid deadlocks, so make sure record
* doesn't get coalesced by someone else! */
* doesn't get coalesced by someone else! */
r
->
magic_and_meta
=
TDB_COALESCING_MAGIC
<<
(
64
-
TDB_OFF_UPPER_STEAL
);
rec
.
ftable_and_len
=
(
TDB_FTABLE_NONE
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
r
->
data_len
=
end
-
off
-
sizeof
(
struct
tdb_used_record
);
|
(
end
-
off
-
sizeof
(
struct
tdb_used_record
));
if
(
tdb_access_commit
(
tdb
,
r
)
!=
0
)
if
(
tdb_write_off
(
tdb
,
off
+
offsetof
(
struct
tdb_free_record
,
ftable_and_len
),
rec
.
ftable_and_len
)
!=
0
)
goto
err
;
goto
err
;
add_stat
(
tdb
,
alloc_coalesce_succeeded
,
1
);
tdb_unlock_free_bucket
(
tdb
,
b_off
);
tdb_unlock_free_bucket
(
tdb
,
b_off
);
if
(
add_free_record
(
tdb
,
off
,
end
-
off
)
==
-
1
)
if
(
add_free_record
(
tdb
,
off
,
end
-
off
)
==
-
1
)
...
@@ -333,19 +366,21 @@ err:
...
@@ -333,19 +366,21 @@ err:
/* We need size bytes to put our key and data in. */
/* We need size bytes to put our key and data in. */
static
tdb_off_t
lock_and_alloc
(
struct
tdb_context
*
tdb
,
static
tdb_off_t
lock_and_alloc
(
struct
tdb_context
*
tdb
,
tdb_off_t
f
list
_off
,
tdb_off_t
f
table
_off
,
tdb_off_t
bucket
,
tdb_off_t
bucket
,
size_t
keylen
,
size_t
datalen
,
size_t
keylen
,
size_t
datalen
,
bool
want_extra
,
bool
want_extra
,
unsigned
magic
,
unsigned
hashlow
)
unsigned
hashlow
)
{
{
tdb_off_t
off
,
b_off
,
best_off
;
tdb_off_t
off
,
b_off
,
best_off
;
struct
tdb_free_record
pad
,
best
=
{
0
},
*
r
;
struct
tdb_free_record
best
=
{
0
}
;
double
multiplier
;
double
multiplier
;
size_t
size
=
adjust_size
(
keylen
,
datalen
);
size_t
size
=
adjust_size
(
keylen
,
datalen
);
add_stat
(
tdb
,
allocs
,
1
);
again:
again:
b_off
=
bucket_off
(
f
list
_off
,
bucket
);
b_off
=
bucket_off
(
f
table
_off
,
bucket
);
/* FIXME: Try non-blocking wait first, to measure contention. */
/* FIXME: Try non-blocking wait first, to measure contention. */
/* Lock this bucket. */
/* Lock this bucket. */
...
@@ -353,7 +388,7 @@ again:
...
@@ -353,7 +388,7 @@ again:
return
TDB_OFF_ERR
;
return
TDB_OFF_ERR
;
}
}
best
.
data
_len
=
-
1ULL
;
best
.
ftable_and
_len
=
-
1ULL
;
best_off
=
0
;
best_off
=
0
;
/* Get slack if we're after extra. */
/* Get slack if we're after extra. */
...
@@ -369,30 +404,40 @@ again:
...
@@ -369,30 +404,40 @@ again:
goto
unlock_err
;
goto
unlock_err
;
while
(
off
)
{
while
(
off
)
{
/* FIXME: Does tdb_get win anything here? */
const
struct
tdb_free_record
*
r
;
r
=
tdb_get
(
tdb
,
off
,
&
pad
,
sizeof
(
*
r
));
tdb_len_t
len
;
tdb_off_t
next
;
r
=
tdb_access_read
(
tdb
,
off
,
sizeof
(
*
r
),
true
);
if
(
!
r
)
if
(
!
r
)
goto
unlock_err
;
goto
unlock_err
;
if
(
frec_magic
(
r
)
!=
TDB_FREE_MAGIC
)
{
if
(
frec_magic
(
r
)
!=
TDB_FREE_MAGIC
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_access_release
(
tdb
,
r
);
"lock_and_alloc: %llu non-free 0x%llx
\n
"
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
(
long
long
)
off
,
(
long
long
)
r
->
magic_and_meta
);
"lock_and_alloc: %llu non-free 0x%llx"
,
(
long
long
)
off
,
(
long
long
)
r
->
magic_and_prev
);
goto
unlock_err
;
goto
unlock_err
;
}
}
if
(
r
->
data_len
>=
size
&&
r
->
data_len
<
best
.
data_len
)
{
if
(
frec_len
(
r
)
>=
size
&&
frec_len
(
r
)
<
frec_len
(
&
best
)
)
{
best_off
=
off
;
best_off
=
off
;
best
=
*
r
;
best
=
*
r
;
}
}
if
(
best
.
data_len
<
size
*
multiplier
&&
best_off
)
if
(
frec_len
(
&
best
)
<
size
*
multiplier
&&
best_off
)
{
tdb_access_release
(
tdb
,
r
);
break
;
break
;
}
multiplier
*=
1
.
01
;
multiplier
*=
1
.
01
;
next
=
r
->
next
;
len
=
frec_len
(
r
);
tdb_access_release
(
tdb
,
r
);
/* Since we're going slow anyway, try coalescing here. */
/* Since we're going slow anyway, try coalescing here. */
switch
(
coalesce
(
tdb
,
off
,
b_off
,
r
->
data_
len
))
{
switch
(
coalesce
(
tdb
,
off
,
b_off
,
len
))
{
case
-
1
:
case
-
1
:
/* This has already unlocked on error. */
/* This has already unlocked on error. */
return
-
1
;
return
-
1
;
...
@@ -400,7 +445,7 @@ again:
...
@@ -400,7 +445,7 @@ again:
/* This has unlocked list, restart. */
/* This has unlocked list, restart. */
goto
again
;
goto
again
;
}
}
off
=
r
->
next
;
off
=
next
;
}
}
/* If we found anything at all, use it. */
/* If we found anything at all, use it. */
...
@@ -413,28 +458,30 @@ again:
...
@@ -413,28 +458,30 @@ again:
goto
unlock_err
;
goto
unlock_err
;
leftover
=
record_leftover
(
keylen
,
datalen
,
want_extra
,
leftover
=
record_leftover
(
keylen
,
datalen
,
want_extra
,
best
.
data_len
);
frec_len
(
&
best
)
);
assert
(
keylen
+
datalen
+
leftover
<=
best
.
data_len
);
assert
(
keylen
+
datalen
+
leftover
<=
frec_len
(
&
best
)
);
/* We need to mark non-free before we drop lock, otherwise
/* We need to mark non-free before we drop lock, otherwise
* coalesce() could try to merge it! */
* coalesce() could try to merge it! */
if
(
set_header
(
tdb
,
&
rec
,
keylen
,
datalen
,
if
(
set_header
(
tdb
,
&
rec
,
magic
,
keylen
,
datalen
,
best
.
data_len
-
leftover
,
frec_len
(
&
best
)
-
leftover
,
hashlow
)
!=
0
)
hashlow
)
!=
0
)
goto
unlock_err
;
goto
unlock_err
;
if
(
tdb_write_convert
(
tdb
,
best_off
,
&
rec
,
sizeof
(
rec
))
!=
0
)
if
(
tdb_write_convert
(
tdb
,
best_off
,
&
rec
,
sizeof
(
rec
))
!=
0
)
goto
unlock_err
;
goto
unlock_err
;
tdb_unlock_free_bucket
(
tdb
,
b_off
);
/* Bucket of leftover will be <= current bucket, so nested
* locking is allowed. */
if
(
leftover
)
{
if
(
leftover
)
{
add_stat
(
tdb
,
alloc_leftover
,
1
);
if
(
add_free_record
(
tdb
,
if
(
add_free_record
(
tdb
,
best_off
+
sizeof
(
rec
)
best_off
+
sizeof
(
rec
)
+
best
.
data_len
-
leftover
,
+
frec_len
(
&
best
)
-
leftover
,
leftover
))
leftover
))
return
TDB_OFF_ERR
;
best_off
=
TDB_OFF_ERR
;
}
}
tdb_unlock_free_bucket
(
tdb
,
b_off
);
return
best_off
;
return
best_off
;
}
}
...
@@ -449,10 +496,10 @@ unlock_err:
...
@@ -449,10 +496,10 @@ unlock_err:
/* Get a free block from current free list, or 0 if none. */
/* Get a free block from current free list, or 0 if none. */
static
tdb_off_t
get_free
(
struct
tdb_context
*
tdb
,
static
tdb_off_t
get_free
(
struct
tdb_context
*
tdb
,
size_t
keylen
,
size_t
datalen
,
bool
want_extra
,
size_t
keylen
,
size_t
datalen
,
bool
want_extra
,
unsigned
hashlow
)
unsigned
magic
,
unsigned
hashlow
)
{
{
tdb_off_t
off
,
f
list
;
tdb_off_t
off
,
f
table_off
;
unsigned
start_b
,
b
;
unsigned
start_b
,
b
,
ftable
;
bool
wrapped
=
false
;
bool
wrapped
=
false
;
/* If they are growing, add 50% to get to higher bucket. */
/* If they are growing, add 50% to get to higher bucket. */
...
@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
...
@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
else
else
start_b
=
size_to_bucket
(
adjust_size
(
keylen
,
datalen
));
start_b
=
size_to_bucket
(
adjust_size
(
keylen
,
datalen
));
flist
=
tdb
->
flist_off
;
ftable_off
=
tdb
->
ftable_off
;
while
(
!
wrapped
||
flist
!=
tdb
->
flist_off
)
{
ftable
=
tdb
->
ftable
;
while
(
!
wrapped
||
ftable_off
!=
tdb
->
ftable_off
)
{
/* Start at exact size bucket, and search up... */
/* Start at exact size bucket, and search up... */
for
(
b
=
find_free_head
(
tdb
,
f
list
,
start_b
);
for
(
b
=
find_free_head
(
tdb
,
f
table_off
,
start_b
);
b
<
TDB_FREE_BUCKETS
;
b
<
TDB_FREE_BUCKETS
;
b
=
find_free_head
(
tdb
,
f
list
,
b
+
1
))
{
b
=
find_free_head
(
tdb
,
f
table_off
,
b
+
1
))
{
/* Try getting one from list. */
/* Try getting one from list. */
off
=
lock_and_alloc
(
tdb
,
f
list
,
off
=
lock_and_alloc
(
tdb
,
f
table_off
,
b
,
keylen
,
datalen
,
want_extra
,
b
,
keylen
,
datalen
,
want_extra
,
hashlow
);
magic
,
hashlow
);
if
(
off
==
TDB_OFF_ERR
)
if
(
off
==
TDB_OFF_ERR
)
return
TDB_OFF_ERR
;
return
TDB_OFF_ERR
;
if
(
off
!=
0
)
{
if
(
off
!=
0
)
{
if
(
b
==
start_b
)
add_stat
(
tdb
,
alloc_bucket_exact
,
1
);
if
(
b
==
TDB_FREE_BUCKETS
-
1
)
add_stat
(
tdb
,
alloc_bucket_max
,
1
);
/* Worked? Stay using this list. */
/* Worked? Stay using this list. */
tdb
->
flist_off
=
flist
;
tdb
->
ftable_off
=
ftable_off
;
tdb
->
ftable
=
ftable
;
return
off
;
return
off
;
}
}
/* Didn't work. Try next bucket. */
/* Didn't work. Try next bucket. */
}
}
/* Hmm, try next list. */
/* Hmm, try next table. */
flist
=
next_flist
(
tdb
,
flist
);
ftable_off
=
next_ftable
(
tdb
,
ftable_off
);
if
(
flist
==
0
)
{
ftable
++
;
if
(
ftable_off
==
0
)
{
wrapped
=
true
;
wrapped
=
true
;
flist
=
first_flist
(
tdb
);
ftable_off
=
first_ftable
(
tdb
);
ftable
=
0
;
}
}
}
}
...
@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
...
@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
int
set_header
(
struct
tdb_context
*
tdb
,
int
set_header
(
struct
tdb_context
*
tdb
,
struct
tdb_used_record
*
rec
,
struct
tdb_used_record
*
rec
,
uint64_t
keylen
,
uint64_t
datalen
,
u
nsigned
magic
,
u
int64_t
keylen
,
uint64_t
datalen
,
uint64_t
actuallen
,
unsigned
hashlow
)
uint64_t
actuallen
,
unsigned
hashlow
)
{
{
uint64_t
keybits
=
(
fls64
(
keylen
)
+
1
)
/
2
;
uint64_t
keybits
=
(
fls64
(
keylen
)
+
1
)
/
2
;
...
@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
...
@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
rec
->
magic_and_meta
=
(
hashlow
&
((
1
<<
11
)
-
1
))
rec
->
magic_and_meta
=
(
hashlow
&
((
1
<<
11
)
-
1
))
|
((
actuallen
-
(
keylen
+
datalen
))
<<
11
)
|
((
actuallen
-
(
keylen
+
datalen
))
<<
11
)
|
(
keybits
<<
43
)
|
(
keybits
<<
43
)
|
(
TDB_MAGIC
<<
48
);
|
(
(
uint64_t
)
magic
<<
48
);
rec
->
key_and_data_len
=
(
keylen
|
(
datalen
<<
(
keybits
*
2
)));
rec
->
key_and_data_len
=
(
keylen
|
(
datalen
<<
(
keybits
*
2
)));
/* Encoding can fail on big values. */
/* Encoding can fail on big values. */
if
(
rec_key_length
(
rec
)
!=
keylen
if
(
rec_key_length
(
rec
)
!=
keylen
||
rec_data_length
(
rec
)
!=
datalen
||
rec_data_length
(
rec
)
!=
datalen
||
rec_extra_padding
(
rec
)
!=
actuallen
-
(
keylen
+
datalen
))
{
||
rec_extra_padding
(
rec
)
!=
actuallen
-
(
keylen
+
datalen
))
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"Could not encode k=%llu,d=%llu,a=%llu"
,
"Could not encode k=%llu,d=%llu,a=%llu
\n
"
,
(
long
long
)
keylen
,
(
long
long
)
datalen
,
(
long
long
)
keylen
,
(
long
long
)
datalen
,
(
long
long
)
actuallen
);
(
long
long
)
actuallen
);
return
-
1
;
return
-
1
;
...
@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
...
@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* Need to hold a hash lock to expand DB: transactions rely on it. */
/* Need to hold a hash lock to expand DB: transactions rely on it. */
if
(
!
(
tdb
->
flags
&
TDB_NOLOCK
)
if
(
!
(
tdb
->
flags
&
TDB_NOLOCK
)
&&
!
tdb
->
allrecord_lock
.
count
&&
!
tdb_has_hash_locks
(
tdb
))
{
&&
!
tdb
->
allrecord_lock
.
count
&&
!
tdb_has_hash_locks
(
tdb
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
"tdb_expand: must hold lock during expand
\n
"
);
"tdb_expand: must hold lock during expand
"
);
return
-
1
;
return
-
1
;
}
}
/* always make room for at least 100 more records, and at
least 25% more space. */
if
(
size
*
TDB_EXTENSION_FACTOR
>
tdb
->
map_size
/
4
)
wanted
=
size
*
TDB_EXTENSION_FACTOR
;
else
wanted
=
tdb
->
map_size
/
4
;
wanted
=
adjust_size
(
0
,
wanted
);
/* Only one person can expand file at a time. */
/* Only one person can expand file at a time. */
if
(
tdb_lock_expand
(
tdb
,
F_WRLCK
)
!=
0
)
if
(
tdb_lock_expand
(
tdb
,
F_WRLCK
)
!=
0
)
return
-
1
;
return
-
1
;
...
@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
...
@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
return
0
;
return
0
;
}
}
if
(
tdb
->
methods
->
expand_file
(
tdb
,
wanted
*
TDB_EXTENSION_FACTOR
)
==
-
1
)
{
if
(
tdb
->
methods
->
expand_file
(
tdb
,
wanted
)
==
-
1
)
{
tdb_unlock_expand
(
tdb
,
F_WRLCK
);
tdb_unlock_expand
(
tdb
,
F_WRLCK
);
return
-
1
;
return
-
1
;
}
}
...
@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
...
@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
/* We need to drop this lock before adding free record. */
/* We need to drop this lock before adding free record. */
tdb_unlock_expand
(
tdb
,
F_WRLCK
);
tdb_unlock_expand
(
tdb
,
F_WRLCK
);
return
add_free_record
(
tdb
,
old_size
,
wanted
*
TDB_EXTENSION_FACTOR
);
add_stat
(
tdb
,
expands
,
1
);
return
add_free_record
(
tdb
,
old_size
,
wanted
);
}
}
/* This won't fail: it will expand the database if it has to. */
/* This won't fail: it will expand the database if it has to. */
tdb_off_t
alloc
(
struct
tdb_context
*
tdb
,
size_t
keylen
,
size_t
datalen
,
tdb_off_t
alloc
(
struct
tdb_context
*
tdb
,
size_t
keylen
,
size_t
datalen
,
uint64_t
hash
,
bool
growing
)
uint64_t
hash
,
unsigned
magic
,
bool
growing
)
{
{
tdb_off_t
off
;
tdb_off_t
off
;
...
@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
...
@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
assert
(
!
tdb
->
direct_access
);
assert
(
!
tdb
->
direct_access
);
for
(;;)
{
for
(;;)
{
off
=
get_free
(
tdb
,
keylen
,
datalen
,
growing
,
hash
);
off
=
get_free
(
tdb
,
keylen
,
datalen
,
growing
,
magic
,
hash
);
if
(
likely
(
off
!=
0
))
if
(
likely
(
off
!=
0
))
break
;
break
;
...
...
ccan/tdb2/hash.c
View file @
51a56b52
...
@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
...
@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
uint64_t
hash_record
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
uint64_t
hash_record
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
{
{
struct
tdb_used_record
pad
,
*
r
;
const
struct
tdb_used_record
*
r
;
const
void
*
key
;
const
void
*
key
;
uint64_t
klen
,
hash
;
uint64_t
klen
,
hash
;
r
=
tdb_
get
(
tdb
,
off
,
&
pad
,
sizeof
(
pad
)
);
r
=
tdb_
access_read
(
tdb
,
off
,
sizeof
(
*
r
),
true
);
if
(
!
r
)
if
(
!
r
)
/* FIXME */
/* FIXME */
return
0
;
return
0
;
klen
=
rec_key_length
(
r
);
klen
=
rec_key_length
(
r
);
key
=
tdb_access_read
(
tdb
,
off
+
sizeof
(
pad
),
klen
,
false
);
tdb_access_release
(
tdb
,
r
);
key
=
tdb_access_read
(
tdb
,
off
+
sizeof
(
*
r
),
klen
,
false
);
if
(
!
key
)
if
(
!
key
)
return
0
;
return
0
;
...
@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
...
@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
return
bits
(
h
->
h
,
64
-
h
->
hash_used
,
num
);
return
bits
(
h
->
h
,
64
-
h
->
hash_used
,
num
);
}
}
static
bool
key_matches
(
struct
tdb_context
*
tdb
,
const
struct
tdb_used_record
*
rec
,
tdb_off_t
off
,
const
struct
tdb_data
*
key
)
{
bool
ret
=
false
;
const
char
*
rkey
;
if
(
rec_key_length
(
rec
)
!=
key
->
dsize
)
{
add_stat
(
tdb
,
compare_wrong_keylen
,
1
);
return
ret
;
}
rkey
=
tdb_access_read
(
tdb
,
off
+
sizeof
(
*
rec
),
key
->
dsize
,
false
);
if
(
!
rkey
)
return
ret
;
if
(
memcmp
(
rkey
,
key
->
dptr
,
key
->
dsize
)
==
0
)
ret
=
true
;
else
add_stat
(
tdb
,
compare_wrong_keycmp
,
1
);
tdb_access_release
(
tdb
,
rkey
);
return
ret
;
}
/* Does entry match? */
/* Does entry match? */
static
bool
match
(
struct
tdb_context
*
tdb
,
static
bool
match
(
struct
tdb_context
*
tdb
,
struct
hash_info
*
h
,
struct
hash_info
*
h
,
...
@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
...
@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
tdb_off_t
val
,
tdb_off_t
val
,
struct
tdb_used_record
*
rec
)
struct
tdb_used_record
*
rec
)
{
{
bool
ret
;
const
unsigned
char
*
rkey
;
tdb_off_t
off
;
tdb_off_t
off
;
/* FIXME: Handle hash value truncated. */
add_stat
(
tdb
,
compares
,
1
);
if
(
bits
(
val
,
TDB_OFF_HASH_TRUNCATED_BIT
,
1
))
abort
();
/* Desired bucket must match. */
/* Desired bucket must match. */
if
(
h
->
home_bucket
!=
(
val
&
TDB_OFF_HASH_GROUP_MASK
))
if
(
h
->
home_bucket
!=
(
val
&
TDB_OFF_HASH_GROUP_MASK
))
{
add_stat
(
tdb
,
compare_wrong_bucket
,
1
);
return
false
;
return
false
;
}
/* Top bits of offset == next bits of hash. */
/* Top bits of offset == next bits of hash. */
if
(
bits
(
val
,
TDB_OFF_HASH_EXTRA_BIT
,
TDB_OFF_UPPER_STEAL_EXTRA
)
if
(
bits
(
val
,
TDB_OFF_HASH_EXTRA_BIT
,
TDB_OFF_UPPER_STEAL_EXTRA
)
!=
bits
(
h
->
h
,
64
-
h
->
hash_used
-
TDB_OFF_UPPER_STEAL_EXTRA
,
!=
bits
(
h
->
h
,
64
-
h
->
hash_used
-
TDB_OFF_UPPER_STEAL_EXTRA
,
TDB_OFF_UPPER_STEAL_EXTRA
))
TDB_OFF_UPPER_STEAL_EXTRA
))
{
add_stat
(
tdb
,
compare_wrong_offsetbits
,
1
);
return
false
;
return
false
;
}
off
=
val
&
TDB_OFF_MASK
;
off
=
val
&
TDB_OFF_MASK
;
if
(
tdb_read_convert
(
tdb
,
off
,
rec
,
sizeof
(
*
rec
))
==
-
1
)
if
(
tdb_read_convert
(
tdb
,
off
,
rec
,
sizeof
(
*
rec
))
==
-
1
)
return
false
;
return
false
;
/* FIXME: check extra bits in header? */
if
((
h
->
h
&
((
1
<<
11
)
-
1
))
!=
rec_hash
(
rec
))
{
if
(
rec_key_length
(
rec
)
!=
key
->
dsize
)
add_stat
(
tdb
,
compare_wrong_rechash
,
1
);
return
false
;
return
false
;
}
rkey
=
tdb_access_read
(
tdb
,
off
+
sizeof
(
*
rec
),
key
->
dsize
,
false
);
return
key_matches
(
tdb
,
rec
,
off
,
key
);
if
(
!
rkey
)
return
false
;
ret
=
(
memcmp
(
rkey
,
key
->
dptr
,
key
->
dsize
)
==
0
);
tdb_access_release
(
tdb
,
rkey
);
return
ret
;
}
}
static
tdb_off_t
hbucket_off
(
tdb_off_t
group_start
,
unsigned
bucket
)
static
tdb_off_t
hbucket_off
(
tdb_off_t
group_start
,
unsigned
bucket
)
...
@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
...
@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
+
(
bucket
%
(
1
<<
TDB_HASH_GROUP_BITS
))
*
sizeof
(
tdb_off_t
);
+
(
bucket
%
(
1
<<
TDB_HASH_GROUP_BITS
))
*
sizeof
(
tdb_off_t
);
}
}
/* Truncated hashes can't be all 1: that's how we spot a sub-hash */
bool
is_subhash
(
tdb_off_t
val
)
bool
is_subhash
(
tdb_off_t
val
)
{
{
return
val
>>
(
64
-
TDB_OFF_UPPER_STEAL
)
==
(
1
<<
TDB_OFF_UPPER_STEAL
)
-
1
;
return
(
val
>>
TDB_OFF_UPPER_STEAL_SUBHASH_BIT
)
&
1
;
}
}
/* FIXME: Guess the depth, don't over-lock! */
/* FIXME: Guess the depth, don't over-lock! */
...
@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
...
@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
return
group
<<
(
64
-
(
TDB_TOPLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
));
return
group
<<
(
64
-
(
TDB_TOPLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
));
}
}
static
tdb_off_t
COLD
find_in_chain
(
struct
tdb_context
*
tdb
,
struct
tdb_data
key
,
tdb_off_t
chain
,
struct
hash_info
*
h
,
struct
tdb_used_record
*
rec
,
struct
traverse_info
*
tinfo
)
{
tdb_off_t
off
,
next
;
/* In case nothing is free, we set these to zero. */
h
->
home_bucket
=
h
->
found_bucket
=
0
;
for
(
off
=
chain
;
off
;
off
=
next
)
{
unsigned
int
i
;
h
->
group_start
=
off
;
if
(
tdb_read_convert
(
tdb
,
off
,
h
->
group
,
sizeof
(
h
->
group
)))
return
TDB_OFF_ERR
;
for
(
i
=
0
;
i
<
(
1
<<
TDB_HASH_GROUP_BITS
);
i
++
)
{
tdb_off_t
recoff
;
if
(
!
h
->
group
[
i
])
{
/* Remember this empty bucket. */
h
->
home_bucket
=
h
->
found_bucket
=
i
;
continue
;
}
/* We can insert extra bits via add_to_hash
* empty bucket logic. */
recoff
=
h
->
group
[
i
]
&
TDB_OFF_MASK
;
if
(
tdb_read_convert
(
tdb
,
recoff
,
rec
,
sizeof
(
*
rec
)))
return
TDB_OFF_ERR
;
if
(
key_matches
(
tdb
,
rec
,
recoff
,
&
key
))
{
h
->
home_bucket
=
h
->
found_bucket
=
i
;
if
(
tinfo
)
{
tinfo
->
levels
[
tinfo
->
num_levels
]
.
hashtable
=
off
;
tinfo
->
levels
[
tinfo
->
num_levels
]
.
total_buckets
=
1
<<
TDB_HASH_GROUP_BITS
;
tinfo
->
levels
[
tinfo
->
num_levels
].
entry
=
i
;
tinfo
->
num_levels
++
;
}
return
recoff
;
}
}
next
=
tdb_read_off
(
tdb
,
off
+
offsetof
(
struct
tdb_chain
,
next
));
if
(
next
==
TDB_OFF_ERR
)
return
TDB_OFF_ERR
;
if
(
next
)
next
+=
sizeof
(
struct
tdb_used_record
);
}
return
0
;
}
/* This is the core routine which searches the hashtable for an entry.
/* This is the core routine which searches the hashtable for an entry.
* On error, no locks are held and TDB_OFF_ERR is returned.
* On error, no locks are held and TDB_OFF_ERR is returned.
* Otherwise, hinfo is filled in (and the optional tinfo).
* Otherwise, hinfo is filled in (and the optional tinfo).
...
@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
...
@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
tinfo
->
levels
[
0
].
total_buckets
=
1
<<
TDB_HASH_GROUP_BITS
;
tinfo
->
levels
[
0
].
total_buckets
=
1
<<
TDB_HASH_GROUP_BITS
;
}
}
while
(
likely
(
h
->
hash_used
<
64
)
)
{
while
(
h
->
hash_used
<=
64
)
{
/* Read in the hash group. */
/* Read in the hash group. */
h
->
group_start
=
hashtable
h
->
group_start
=
hashtable
+
group
*
(
sizeof
(
tdb_off_t
)
<<
TDB_HASH_GROUP_BITS
);
+
group
*
(
sizeof
(
tdb_off_t
)
<<
TDB_HASH_GROUP_BITS
);
...
@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
...
@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
return
0
;
return
0
;
}
}
/* FIXME: We hit the bottom. Chain! */
return
find_in_chain
(
tdb
,
key
,
hashtable
,
h
,
rec
,
tinfo
);
abort
();
fail:
fail:
tdb_unlock_hashes
(
tdb
,
h
->
hlock_start
,
h
->
hlock_range
,
ltype
);
tdb_unlock_hashes
(
tdb
,
h
->
hlock_start
,
h
->
hlock_range
,
ltype
);
...
@@ -239,8 +317,8 @@ fail:
...
@@ -239,8 +317,8 @@ fail:
/* I wrote a simple test, expanding a hash to 2GB, for the following
/* I wrote a simple test, expanding a hash to 2GB, for the following
* cases:
* cases:
* 1) Expanding all the buckets at once,
* 1) Expanding all the buckets at once,
* 2) Expanding the
most-populated bucket,
* 2) Expanding the
bucket we wanted to place the new entry into.
* 3) Expanding the
bucket we wanted to place the new entry ito.
* 3) Expanding the
most-populated bucket,
*
*
* I measured the worst/average/best density during this process.
* I measured the worst/average/best density during this process.
* 1) 3%/16%/30%
* 1) 3%/16%/30%
...
@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
...
@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
encode_offset
(
new_off
,
h
));
encode_offset
(
new_off
,
h
));
}
}
/* We slot in anywhere that's empty in the chain. */
static
int
COLD
add_to_chain
(
struct
tdb_context
*
tdb
,
tdb_off_t
subhash
,
tdb_off_t
new_off
)
{
size_t
entry
=
tdb_find_zero_off
(
tdb
,
subhash
,
1
<<
TDB_HASH_GROUP_BITS
);
if
(
entry
==
1
<<
TDB_HASH_GROUP_BITS
)
{
tdb_off_t
next
;
next
=
tdb_read_off
(
tdb
,
subhash
+
offsetof
(
struct
tdb_chain
,
next
));
if
(
next
==
TDB_OFF_ERR
)
return
-
1
;
if
(
!
next
)
{
next
=
alloc
(
tdb
,
0
,
sizeof
(
struct
tdb_chain
),
0
,
TDB_CHAIN_MAGIC
,
false
);
if
(
next
==
TDB_OFF_ERR
)
return
-
1
;
if
(
zero_out
(
tdb
,
next
+
sizeof
(
struct
tdb_used_record
),
sizeof
(
struct
tdb_chain
)))
return
-
1
;
if
(
tdb_write_off
(
tdb
,
subhash
+
offsetof
(
struct
tdb_chain
,
next
),
next
)
!=
0
)
return
-
1
;
}
return
add_to_chain
(
tdb
,
next
,
new_off
);
}
return
tdb_write_off
(
tdb
,
subhash
+
entry
*
sizeof
(
tdb_off_t
),
new_off
);
}
/* Add into a newly created subhash. */
/* Add into a newly created subhash. */
static
int
add_to_subhash
(
struct
tdb_context
*
tdb
,
tdb_off_t
subhash
,
static
int
add_to_subhash
(
struct
tdb_context
*
tdb
,
tdb_off_t
subhash
,
unsigned
hash_used
,
tdb_off_t
val
)
unsigned
hash_used
,
tdb_off_t
val
)
...
@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
...
@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
h
.
hash_used
=
hash_used
;
h
.
hash_used
=
hash_used
;
/* FIXME chain if hash_used == 64 */
if
(
hash_used
+
TDB_SUBLEVEL_HASH_BITS
>
64
)
if
(
hash_used
+
TDB_SUBLEVEL_HASH_BITS
>
64
)
abort
(
);
return
add_to_chain
(
tdb
,
subhash
,
off
);
/* FIXME: Do truncated hash bits if we can! */
h
.
h
=
hash_record
(
tdb
,
off
);
h
.
h
=
hash_record
(
tdb
,
off
);
gnum
=
use_bits
(
&
h
,
TDB_SUBLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
);
gnum
=
use_bits
(
&
h
,
TDB_SUBLEVEL_HASH_BITS
-
TDB_HASH_GROUP_BITS
);
h
.
group_start
=
subhash
+
sizeof
(
struct
tdb_used_record
)
h
.
group_start
=
subhash
+
gnum
*
(
sizeof
(
tdb_off_t
)
<<
TDB_HASH_GROUP_BITS
);
+
gnum
*
(
sizeof
(
tdb_off_t
)
<<
TDB_HASH_GROUP_BITS
);
h
.
home_bucket
=
use_bits
(
&
h
,
TDB_HASH_GROUP_BITS
);
h
.
home_bucket
=
use_bits
(
&
h
,
TDB_HASH_GROUP_BITS
);
...
@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
...
@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
static
int
expand_group
(
struct
tdb_context
*
tdb
,
struct
hash_info
*
h
)
static
int
expand_group
(
struct
tdb_context
*
tdb
,
struct
hash_info
*
h
)
{
{
unsigned
bucket
,
num_vals
,
i
;
unsigned
bucket
,
num_vals
,
i
,
magic
;
size_t
subsize
;
tdb_off_t
subhash
;
tdb_off_t
subhash
;
tdb_off_t
vals
[
1
<<
TDB_HASH_GROUP_BITS
];
tdb_off_t
vals
[
1
<<
TDB_HASH_GROUP_BITS
];
/* Attach new empty subhash under fullest bucket. */
/* Attach new empty subhash under fullest bucket. */
bucket
=
fullest_bucket
(
tdb
,
h
->
group
,
h
->
home_bucket
);
bucket
=
fullest_bucket
(
tdb
,
h
->
group
,
h
->
home_bucket
);
subhash
=
alloc
(
tdb
,
0
,
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
,
if
(
h
->
hash_used
==
64
)
{
0
,
false
);
add_stat
(
tdb
,
alloc_chain
,
1
);
subsize
=
sizeof
(
struct
tdb_chain
);
magic
=
TDB_CHAIN_MAGIC
;
}
else
{
add_stat
(
tdb
,
alloc_subhash
,
1
);
subsize
=
(
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
);
magic
=
TDB_HTABLE_MAGIC
;
}
subhash
=
alloc
(
tdb
,
0
,
subsize
,
0
,
magic
,
false
);
if
(
subhash
==
TDB_OFF_ERR
)
if
(
subhash
==
TDB_OFF_ERR
)
return
-
1
;
return
-
1
;
if
(
zero_out
(
tdb
,
subhash
+
sizeof
(
struct
tdb_used_record
),
if
(
zero_out
(
tdb
,
subhash
+
sizeof
(
struct
tdb_used_record
),
subsize
))
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
)
==
-
1
)
return
-
1
;
return
-
1
;
/* Remove any which are destined for bucket or are in wrong place. */
/* Remove any which are destined for bucket or are in wrong place. */
...
@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
...
@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
/* assert(num_vals); */
/* assert(num_vals); */
/* Overwrite expanded bucket with subhash pointer. */
/* Overwrite expanded bucket with subhash pointer. */
h
->
group
[
bucket
]
=
subhash
|
~
((
1ULL
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
-
1
);
h
->
group
[
bucket
]
=
subhash
|
(
1ULL
<<
TDB_OFF_UPPER_STEAL_SUBHASH_BIT
);
/* Point to actual contents of record. */
subhash
+=
sizeof
(
struct
tdb_used_record
);
/* Put values back. */
/* Put values back. */
for
(
i
=
0
;
i
<
num_vals
;
i
++
)
{
for
(
i
=
0
;
i
<
num_vals
;
i
++
)
{
...
@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
...
@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
int
add_to_hash
(
struct
tdb_context
*
tdb
,
struct
hash_info
*
h
,
tdb_off_t
new_off
)
int
add_to_hash
(
struct
tdb_context
*
tdb
,
struct
hash_info
*
h
,
tdb_off_t
new_off
)
{
{
/* FIXME: chain! */
if
(
h
->
hash_used
>=
64
)
abort
();
/* We hit an empty bucket during search? That's where it goes. */
/* We hit an empty bucket during search? That's where it goes. */
if
(
!
h
->
group
[
h
->
found_bucket
])
{
if
(
!
h
->
group
[
h
->
found_bucket
])
{
h
->
group
[
h
->
found_bucket
]
=
encode_offset
(
new_off
,
h
);
h
->
group
[
h
->
found_bucket
]
=
encode_offset
(
new_off
,
h
);
...
@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
...
@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
h
->
group
,
sizeof
(
h
->
group
));
h
->
group
,
sizeof
(
h
->
group
));
}
}
if
(
h
->
hash_used
>
64
)
return
add_to_chain
(
tdb
,
h
->
group_start
,
new_off
);
/* We're full. Expand. */
/* We're full. Expand. */
if
(
expand_group
(
tdb
,
h
)
==
-
1
)
if
(
expand_group
(
tdb
,
h
)
==
-
1
)
return
-
1
;
return
-
1
;
...
@@ -523,7 +645,11 @@ again:
...
@@ -523,7 +645,11 @@ again:
tlevel
++
;
tlevel
++
;
tlevel
->
hashtable
=
off
+
sizeof
(
struct
tdb_used_record
);
tlevel
->
hashtable
=
off
+
sizeof
(
struct
tdb_used_record
);
tlevel
->
entry
=
0
;
tlevel
->
entry
=
0
;
tlevel
->
total_buckets
=
(
1
<<
TDB_SUBLEVEL_HASH_BITS
);
/* Next level is a chain? */
if
(
unlikely
(
tinfo
->
num_levels
==
TDB_MAX_LEVELS
+
1
))
tlevel
->
total_buckets
=
(
1
<<
TDB_HASH_GROUP_BITS
);
else
tlevel
->
total_buckets
=
(
1
<<
TDB_SUBLEVEL_HASH_BITS
);
goto
again
;
goto
again
;
}
}
...
@@ -531,6 +657,20 @@ again:
...
@@ -531,6 +657,20 @@ again:
if
(
tinfo
->
num_levels
==
1
)
if
(
tinfo
->
num_levels
==
1
)
return
0
;
return
0
;
/* Handle chained entries. */
if
(
unlikely
(
tinfo
->
num_levels
==
TDB_MAX_LEVELS
+
1
))
{
tlevel
->
hashtable
=
tdb_read_off
(
tdb
,
tlevel
->
hashtable
+
offsetof
(
struct
tdb_chain
,
next
));
if
(
tlevel
->
hashtable
==
TDB_OFF_ERR
)
return
TDB_OFF_ERR
;
if
(
tlevel
->
hashtable
)
{
tlevel
->
hashtable
+=
sizeof
(
struct
tdb_used_record
);
tlevel
->
entry
=
0
;
goto
again
;
}
}
/* Go back up and keep searching. */
/* Go back up and keep searching. */
tinfo
->
num_levels
--
;
tinfo
->
num_levels
--
;
tlevel
--
;
tlevel
--
;
...
@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
...
@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
ltype
);
ltype
);
return
-
1
;
return
-
1
;
}
}
if
(
rec_magic
(
&
rec
)
!=
TDB_MAGIC
)
{
if
(
rec_magic
(
&
rec
)
!=
TDB_USED_MAGIC
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
"next_in_hash:"
TDB_DEBUG_FATAL
,
" corrupt record at %llu
\n
"
,
"next_in_hash:"
(
long
long
)
off
);
" corrupt record at %llu"
,
(
long
long
)
off
);
return
-
1
;
return
-
1
;
}
}
...
...
ccan/tdb2/io.c
View file @
51a56b52
...
@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
...
@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
*/
*/
if
(
tdb
->
map_ptr
==
MAP_FAILED
)
{
if
(
tdb
->
map_ptr
==
MAP_FAILED
)
{
tdb
->
map_ptr
=
NULL
;
tdb
->
map_ptr
=
NULL
;
tdb
->
log
(
tdb
,
TDB_DEBUG_WARNING
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_WARNING
,
"tdb_mmap failed for size %lld (%s)
\n
"
,
"tdb_mmap failed for size %lld (%s)"
,
(
long
long
)
tdb
->
map_size
,
strerror
(
errno
));
(
long
long
)
tdb
->
map_size
,
strerror
(
errno
));
}
}
}
}
...
@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
...
@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
static
int
tdb_oob
(
struct
tdb_context
*
tdb
,
tdb_off_t
len
,
bool
probe
)
static
int
tdb_oob
(
struct
tdb_context
*
tdb
,
tdb_off_t
len
,
bool
probe
)
{
{
struct
stat
st
;
struct
stat
st
;
int
ret
;
/* We can't hold pointers during this: we could unmap! */
/* We can't hold pointers during this: we could unmap! */
assert
(
!
tdb
->
direct_access
assert
(
!
tdb
->
direct_access
...
@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
...
@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return
0
;
return
0
;
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
if
(
!
probe
)
{
if
(
!
probe
)
{
/* Ensure ecode is set for log fn. */
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
ecode
=
TDB_ERR_IO
;
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_oob len %lld beyond internal"
"tdb_oob len %lld beyond internal"
" malloc size %lld
\n
"
,
" malloc size %lld"
,
(
long
long
)
len
,
(
long
long
)
len
,
(
long
long
)
tdb
->
map_size
);
(
long
long
)
tdb
->
map_size
);
}
}
...
@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
...
@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
if
(
tdb_lock_expand
(
tdb
,
F_RDLCK
)
!=
0
)
if
(
tdb_lock_expand
(
tdb
,
F_RDLCK
)
!=
0
)
return
-
1
;
return
-
1
;
ret
=
fstat
(
tdb
->
fd
,
&
st
);
if
(
fstat
(
tdb
->
fd
,
&
st
)
!=
0
)
{
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb_unlock_expand
(
tdb
,
F_RDLCK
);
"Failed to fstat file: %s"
,
strerror
(
errno
));
tdb_unlock_expand
(
tdb
,
F_RDLCK
);
if
(
ret
==
-
1
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
return
-
1
;
return
-
1
;
}
}
tdb_unlock_expand
(
tdb
,
F_RDLCK
);
if
(
st
.
st_size
<
(
size_t
)
len
)
{
if
(
st
.
st_size
<
(
size_t
)
len
)
{
if
(
!
probe
)
{
if
(
!
probe
)
{
/* Ensure ecode is set for log fn. */
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
ecode
=
TDB_ERR_IO
;
"tdb_oob len %zu beyond eof at %zu"
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
(
size_t
)
len
,
st
.
st_size
);
"tdb_oob len %lld beyond eof at %lld
\n
"
,
(
long
long
)
len
,
(
long
long
)
st
.
st_size
);
}
}
return
-
1
;
return
-
1
;
}
}
...
@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
...
@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return
0
;
return
0
;
}
}
/* Either make a copy into pad and return that, or return ptr into mmap. */
/* Note: pad has to be a real object, so we can't get here if len
* overflows size_t */
void
*
tdb_get
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
void
*
pad
,
size_t
len
)
{
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
{
void
*
ret
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
);
if
(
ret
)
return
ret
;
}
return
tdb_read_convert
(
tdb
,
off
,
pad
,
len
)
==
-
1
?
NULL
:
pad
;
}
/* Endian conversion: we only ever deal with 8 byte quantities */
/* Endian conversion: we only ever deal with 8 byte quantities */
void
*
tdb_convert
(
const
struct
tdb_context
*
tdb
,
void
*
buf
,
tdb_len_t
size
)
void
*
tdb_convert
(
const
struct
tdb_context
*
tdb
,
void
*
buf
,
tdb_len_t
size
)
{
{
...
@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
...
@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
int
zero_out
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len
)
int
zero_out
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len
)
{
{
char
buf
[
8192
]
=
{
0
};
char
buf
[
8192
]
=
{
0
};
void
*
p
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
);
void
*
p
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
,
true
);
assert
(
!
tdb
->
read_only
);
if
(
p
)
{
if
(
p
)
{
memset
(
p
,
0
,
len
);
memset
(
p
,
0
,
len
);
return
0
;
return
0
;
...
@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
...
@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
tdb_off_t
tdb_read_off
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
tdb_off_t
tdb_read_off
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
)
{
{
tdb_off_t
pad
,
*
ret
;
tdb_off_t
ret
;
ret
=
tdb_get
(
tdb
,
off
,
&
pad
,
sizeof
(
pad
));
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
{
if
(
!
ret
)
{
tdb_off_t
*
p
=
tdb
->
methods
->
direct
(
tdb
,
off
,
sizeof
(
*
p
),
return
TDB_OFF_ERR
;
false
);
if
(
p
)
return
*
p
;
}
}
return
*
ret
;
if
(
tdb_read_convert
(
tdb
,
off
,
&
ret
,
sizeof
(
ret
))
==
-
1
)
return
TDB_OFF_ERR
;
return
ret
;
}
}
/* Even on files, we can get partial writes due to signals. */
/* Even on files, we can get partial writes due to signals. */
...
@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
...
@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
static
int
tdb_write
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
static
int
tdb_write
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
const
void
*
buf
,
tdb_len_t
len
)
const
void
*
buf
,
tdb_len_t
len
)
{
{
if
(
len
==
0
)
{
return
0
;
}
if
(
tdb
->
read_only
)
{
if
(
tdb
->
read_only
)
{
tdb
->
ecode
=
TDB_ERR_RDONLY
;
tdb_logerr
(
tdb
,
TDB_ERR_RDONLY
,
TDB_DEBUG_WARNING
,
"Write to read-only database"
);
return
-
1
;
return
-
1
;
}
}
/* FIXME: Bogus optimization? */
if
(
len
==
0
)
{
return
0
;
}
if
(
tdb
->
methods
->
oob
(
tdb
,
off
+
len
,
0
)
!=
0
)
if
(
tdb
->
methods
->
oob
(
tdb
,
off
+
len
,
0
)
!=
0
)
return
-
1
;
return
-
1
;
...
@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
memcpy
(
off
+
(
char
*
)
tdb
->
map_ptr
,
buf
,
len
);
memcpy
(
off
+
(
char
*
)
tdb
->
map_ptr
,
buf
,
len
);
}
else
{
}
else
{
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
buf
,
len
,
off
))
{
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
buf
,
len
,
off
))
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_write failed at %zu len=%zu (%s)"
,
"tdb_write failed at %llu len=%llu (%s)
\n
"
,
(
size_t
)
off
,
(
size_t
)
len
,
strerror
(
errno
));
(
long
long
)
off
,
(
long
long
)
len
,
strerror
(
errno
));
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
...
@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
memcpy
(
buf
,
off
+
(
char
*
)
tdb
->
map_ptr
,
len
);
memcpy
(
buf
,
off
+
(
char
*
)
tdb
->
map_ptr
,
len
);
}
else
{
}
else
{
if
(
!
tdb_pread_all
(
tdb
->
fd
,
buf
,
len
,
off
))
{
if
(
!
tdb_pread_all
(
tdb
->
fd
,
buf
,
len
,
off
))
{
/* Ensure ecode is set for log fn. */
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
ecode
=
TDB_ERR_IO
;
"tdb_read failed at %zu "
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"len=%zu (%s) map_size=%zu"
,
"tdb_read failed at %lld "
(
size_t
)
off
,
(
size_t
)
len
,
"len=%lld (%s) map_size=%lld
\n
"
,
(
long
long
)
off
,
(
long
long
)
len
,
strerror
(
errno
),
strerror
(
errno
),
(
long
long
)
tdb
->
map_size
);
(
size_t
)
tdb
->
map_size
);
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
...
@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
if
(
unlikely
((
tdb
->
flags
&
TDB_CONVERT
)))
{
if
(
unlikely
((
tdb
->
flags
&
TDB_CONVERT
)))
{
void
*
conv
=
malloc
(
len
);
void
*
conv
=
malloc
(
len
);
if
(
!
conv
)
{
if
(
!
conv
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_write: no memory converting"
"tdb_write: no memory converting %zu bytes
\n
"
,
" %zu bytes"
,
len
);
len
);
return
-
1
;
return
-
1
;
}
}
memcpy
(
conv
,
rec
,
len
);
memcpy
(
conv
,
rec
,
len
);
...
@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
...
@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
int
tdb_write_off
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
val
)
int
tdb_write_off
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
val
)
{
{
if
(
tdb
->
read_only
)
{
tdb_logerr
(
tdb
,
TDB_ERR_RDONLY
,
TDB_DEBUG_WARNING
,
"Write to read-only database"
);
return
-
1
;
}
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
{
tdb_off_t
*
p
=
tdb
->
methods
->
direct
(
tdb
,
off
,
sizeof
(
*
p
),
true
);
if
(
p
)
{
*
p
=
val
;
return
0
;
}
}
return
tdb_write_convert
(
tdb
,
off
,
&
val
,
sizeof
(
val
));
return
tdb_write_convert
(
tdb
,
off
,
&
val
,
sizeof
(
val
));
}
}
...
@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
...
@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
/* some systems don't like zero length malloc */
/* some systems don't like zero length malloc */
buf
=
malloc
(
prefix
+
len
?
prefix
+
len
:
1
);
buf
=
malloc
(
prefix
+
len
?
prefix
+
len
:
1
);
if
(
unlikely
(
!
buf
)
)
{
if
(
!
buf
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb
_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_alloc_read malloc failed len=%zu"
,
"tdb_alloc_read malloc failed len=%lld
\n
"
,
(
size_t
)(
prefix
+
len
));
(
long
long
)
prefix
+
len
);
}
else
if
(
unlikely
(
tdb
->
methods
->
read
(
tdb
,
offset
,
buf
+
prefix
,
}
else
if
(
unlikely
(
tdb
->
methods
->
read
(
tdb
,
offset
,
buf
+
prefix
,
len
)
))
{
len
)
==
-
1
))
{
free
(
buf
);
free
(
buf
);
buf
=
NULL
;
buf
=
NULL
;
}
}
...
@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
...
@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
size_t
n
=
len
>
size
?
size
:
len
;
size_t
n
=
len
>
size
?
size
:
len
;
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
buf
,
n
,
off
))
{
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
buf
,
n
,
off
))
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"fill write failed: giving up!"
);
"fill write failed: giving up!
\n
"
);
return
-
1
;
return
-
1
;
}
}
len
-=
n
;
len
-=
n
;
...
@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
...
@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
char
buf
[
8192
];
char
buf
[
8192
];
if
(
tdb
->
read_only
)
{
if
(
tdb
->
read_only
)
{
tdb
->
ecode
=
TDB_ERR_RDONLY
;
tdb_logerr
(
tdb
,
TDB_ERR_RDONLY
,
TDB_DEBUG_WARNING
,
"Expand on read-only database"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
char
*
new
=
realloc
(
tdb
->
map_ptr
,
tdb
->
map_size
+
addition
);
char
*
new
=
realloc
(
tdb
->
map_ptr
,
tdb
->
map_size
+
addition
);
if
(
!
new
)
{
if
(
!
new
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
"No memory to expand database"
);
return
-
1
;
return
-
1
;
}
}
tdb
->
map_ptr
=
new
;
tdb
->
map_ptr
=
new
;
...
@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
...
@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
file isn't sparse, which would be very bad if we ran out of
file isn't sparse, which would be very bad if we ran out of
disk. This must be done with write, not via mmap */
disk. This must be done with write, not via mmap */
memset
(
buf
,
0x43
,
sizeof
(
buf
));
memset
(
buf
,
0x43
,
sizeof
(
buf
));
if
(
fill
(
tdb
,
buf
,
sizeof
(
buf
),
tdb
->
map_size
,
addition
)
==
-
1
)
if
(
0
||
fill
(
tdb
,
buf
,
sizeof
(
buf
),
tdb
->
map_size
,
addition
)
==
-
1
)
return
-
1
;
return
-
1
;
tdb
->
map_size
+=
addition
;
tdb
->
map_size
+=
addition
;
tdb_mmap
(
tdb
);
tdb_mmap
(
tdb
);
...
@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
...
@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
return
0
;
return
0
;
}
}
/* This is only neded for tdb_access_commit, but used everywhere to simplify. */
struct
tdb_access_hdr
{
tdb_off_t
off
;
tdb_len_t
len
;
bool
convert
;
};
const
void
*
tdb_access_read
(
struct
tdb_context
*
tdb
,
const
void
*
tdb_access_read
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len
,
bool
convert
)
tdb_off_t
off
,
tdb_len_t
len
,
bool
convert
)
{
{
const
void
*
ret
=
NULL
;
const
void
*
ret
=
NULL
;
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
ret
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
);
ret
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
,
false
);
if
(
!
ret
)
{
if
(
!
ret
)
{
struct
tdb_access_hdr
*
hdr
;
struct
tdb_access_hdr
*
hdr
;
hdr
=
_tdb_alloc_read
(
tdb
,
off
,
len
,
sizeof
(
*
hdr
));
hdr
=
_tdb_alloc_read
(
tdb
,
off
,
len
,
sizeof
(
*
hdr
));
if
(
hdr
)
{
if
(
hdr
)
{
hdr
->
next
=
tdb
->
access
;
tdb
->
access
=
hdr
;
ret
=
hdr
+
1
;
ret
=
hdr
+
1
;
if
(
convert
)
if
(
convert
)
tdb_convert
(
tdb
,
(
void
*
)
ret
,
len
);
tdb_convert
(
tdb
,
(
void
*
)
ret
,
len
);
...
@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
...
@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
{
{
void
*
ret
=
NULL
;
void
*
ret
=
NULL
;
if
(
tdb
->
read_only
)
{
tdb_logerr
(
tdb
,
TDB_ERR_RDONLY
,
TDB_DEBUG_WARNING
,
"Write to read-only database"
);
return
NULL
;
}
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
if
(
likely
(
!
(
tdb
->
flags
&
TDB_CONVERT
)))
ret
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
);
ret
=
tdb
->
methods
->
direct
(
tdb
,
off
,
len
,
true
);
if
(
!
ret
)
{
if
(
!
ret
)
{
struct
tdb_access_hdr
*
hdr
;
struct
tdb_access_hdr
*
hdr
;
hdr
=
_tdb_alloc_read
(
tdb
,
off
,
len
,
sizeof
(
*
hdr
));
hdr
=
_tdb_alloc_read
(
tdb
,
off
,
len
,
sizeof
(
*
hdr
));
if
(
hdr
)
{
if
(
hdr
)
{
hdr
->
next
=
tdb
->
access
;
tdb
->
access
=
hdr
;
hdr
->
off
=
off
;
hdr
->
off
=
off
;
hdr
->
len
=
len
;
hdr
->
len
=
len
;
hdr
->
convert
=
convert
;
hdr
->
convert
=
convert
;
...
@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
...
@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
return
ret
;
return
ret
;
}
}
static
struct
tdb_access_hdr
**
find_hdr
(
struct
tdb_context
*
tdb
,
const
void
*
p
)
{
struct
tdb_access_hdr
**
hp
;
for
(
hp
=
&
tdb
->
access
;
*
hp
;
hp
=
&
(
*
hp
)
->
next
)
{
if
(
*
hp
+
1
==
p
)
return
hp
;
}
return
NULL
;
}
void
tdb_access_release
(
struct
tdb_context
*
tdb
,
const
void
*
p
)
void
tdb_access_release
(
struct
tdb_context
*
tdb
,
const
void
*
p
)
{
{
if
(
!
tdb
->
map_ptr
struct
tdb_access_hdr
*
hdr
,
**
hp
=
find_hdr
(
tdb
,
p
);
||
(
char
*
)
p
<
(
char
*
)
tdb
->
map_ptr
||
(
char
*
)
p
>=
(
char
*
)
tdb
->
map_ptr
+
tdb
->
map_size
)
if
(
hp
)
{
free
((
struct
tdb_access_hdr
*
)
p
-
1
);
hdr
=
*
hp
;
else
*
hp
=
hdr
->
next
;
free
(
hdr
);
}
else
tdb
->
direct_access
--
;
tdb
->
direct_access
--
;
}
}
int
tdb_access_commit
(
struct
tdb_context
*
tdb
,
void
*
p
)
int
tdb_access_commit
(
struct
tdb_context
*
tdb
,
void
*
p
)
{
{
struct
tdb_access_hdr
*
hdr
,
**
hp
=
find_hdr
(
tdb
,
p
);
int
ret
=
0
;
int
ret
=
0
;
if
(
!
tdb
->
map_ptr
if
(
hp
)
{
||
(
char
*
)
p
<
(
char
*
)
tdb
->
map_ptr
hdr
=
*
hp
;
||
(
char
*
)
p
>=
(
char
*
)
tdb
->
map_ptr
+
tdb
->
map_size
)
{
struct
tdb_access_hdr
*
hdr
;
hdr
=
(
struct
tdb_access_hdr
*
)
p
-
1
;
if
(
hdr
->
convert
)
if
(
hdr
->
convert
)
ret
=
tdb_write_convert
(
tdb
,
hdr
->
off
,
p
,
hdr
->
len
);
ret
=
tdb_write_convert
(
tdb
,
hdr
->
off
,
p
,
hdr
->
len
);
else
else
ret
=
tdb_write
(
tdb
,
hdr
->
off
,
p
,
hdr
->
len
);
ret
=
tdb_write
(
tdb
,
hdr
->
off
,
p
,
hdr
->
len
);
*
hp
=
hdr
->
next
;
free
(
hdr
);
free
(
hdr
);
}
else
}
else
tdb
->
direct_access
--
;
tdb
->
direct_access
--
;
...
@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
...
@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
return
ret
;
return
ret
;
}
}
static
void
*
tdb_direct
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
size_t
len
)
static
void
*
tdb_direct
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
size_t
len
,
bool
write
)
{
{
if
(
unlikely
(
!
tdb
->
map_ptr
))
if
(
unlikely
(
!
tdb
->
map_ptr
))
return
NULL
;
return
NULL
;
...
@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
...
@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
return
(
char
*
)
tdb
->
map_ptr
+
off
;
return
(
char
*
)
tdb
->
map_ptr
+
off
;
}
}
void
add_stat_
(
struct
tdb_context
*
tdb
,
uint64_t
*
stat
,
size_t
val
)
{
if
((
uintptr_t
)
stat
<
(
uintptr_t
)
tdb
->
stats
+
tdb
->
stats
->
size
)
*
stat
+=
val
;
}
static
const
struct
tdb_methods
io_methods
=
{
static
const
struct
tdb_methods
io_methods
=
{
tdb_read
,
tdb_read
,
tdb_write
,
tdb_write
,
...
...
ccan/tdb2/lock.c
View file @
51a56b52
...
@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
...
@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
fl
.
l_len
=
len
;
fl
.
l_len
=
len
;
fl
.
l_pid
=
0
;
fl
.
l_pid
=
0
;
add_stat
(
tdb
,
lock_lowlevel
,
1
);
if
(
waitflag
)
if
(
waitflag
)
return
fcntl
(
tdb
->
fd
,
F_SETLKW
,
&
fl
);
return
fcntl
(
tdb
->
fd
,
F_SETLKW
,
&
fl
);
else
else
{
add_stat
(
tdb
,
lock_nonblock
,
1
);
return
fcntl
(
tdb
->
fd
,
F_SETLK
,
&
fl
);
return
fcntl
(
tdb
->
fd
,
F_SETLK
,
&
fl
);
}
}
}
static
int
fcntl_unlock
(
struct
tdb_context
*
tdb
,
int
rw
,
off_t
off
,
off_t
len
)
static
int
fcntl_unlock
(
struct
tdb_context
*
tdb
,
int
rw
,
off_t
off
,
off_t
len
)
...
@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
...
@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
}
}
if (!found) {
if (!found) {
fprintf(stderr, "Unlock on %u@%u not found!
\n
",
fprintf(stderr, "Unlock on %u@%u not found!",
(int)off, (int)len);
(int)off, (int)len);
abort();
abort();
}
}
...
@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
...
@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
}
}
if
(
rw_type
==
F_WRLCK
&&
tdb
->
read_only
)
{
if
(
rw_type
==
F_WRLCK
&&
tdb
->
read_only
)
{
tdb
->
ecode
=
TDB_ERR_RDONLY
;
tdb_logerr
(
tdb
,
TDB_ERR_RDONLY
,
TDB_DEBUG_WARNING
,
"Write lock attempted on read-only database"
);
return
-
1
;
return
-
1
;
}
}
/* A 32 bit system cannot open a 64-bit file, but it could have
/* A 32 bit system cannot open a 64-bit file, but it could have
* expanded since then: check here. */
* expanded since then: check here. */
if
((
size_t
)(
offset
+
len
)
!=
offset
+
len
)
{
if
((
size_t
)(
offset
+
len
)
!=
offset
+
len
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_brlock: lock on giant offset %llu"
,
"tdb_brlock: lock on giant offset %llu
\n
"
,
(
long
long
)(
offset
+
len
));
(
long
long
)(
offset
+
len
));
return
-
1
;
return
-
1
;
}
}
...
@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
...
@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
* EAGAIN is an expected return from non-blocking
* EAGAIN is an expected return from non-blocking
* locks. */
* locks. */
if
(
!
(
flags
&
TDB_LOCK_PROBE
)
&&
errno
!=
EAGAIN
)
{
if
(
!
(
flags
&
TDB_LOCK_PROBE
)
&&
errno
!=
EAGAIN
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
"tdb_brlock failed (fd=%d) at"
"tdb_brlock failed (fd=%d) at"
" offset %llu rw_type=%d flags=%d len=%llu
\n
"
,
" offset %zu rw_type=%d flags=%d len=%zu:"
tdb
->
fd
,
(
long
long
)
offset
,
rw_type
,
" %s"
,
flags
,
(
long
long
)
len
);
tdb
->
fd
,
(
size_t
)
offset
,
rw_type
,
flags
,
(
size_t
)
len
,
strerror
(
errno
));
}
}
return
-
1
;
return
-
1
;
}
}
...
@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
...
@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
}
while
(
ret
==
-
1
&&
errno
==
EINTR
);
}
while
(
ret
==
-
1
&&
errno
==
EINTR
);
if
(
ret
==
-
1
)
{
if
(
ret
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_TRACE
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_TRACE
,
"tdb_brunlock failed (fd=%d) at offset %ll
u"
"tdb_brunlock failed (fd=%d) at offset %z
u"
" rw_type=%d len=%llu
\n
"
,
" rw_type=%d len=%zu
"
,
tdb
->
fd
,
(
long
long
)
offset
,
rw_type
,
(
long
long
)
len
);
tdb
->
fd
,
(
size_t
)
offset
,
rw_type
,
(
size_t
)
len
);
}
}
return
ret
;
return
ret
;
}
}
...
@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
...
@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
int
count
=
1000
;
int
count
=
1000
;
if
(
tdb
->
allrecord_lock
.
count
!=
1
)
{
if
(
tdb
->
allrecord_lock
.
count
!=
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
"tdb_allrecord_upgrade failed: count %u too high
\n
"
,
"tdb_allrecord_upgrade failed: count %u too high
"
,
tdb
->
allrecord_lock
.
count
);
tdb
->
allrecord_lock
.
count
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb
->
allrecord_lock
.
off
!=
1
)
{
if
(
tdb
->
allrecord_lock
.
off
!=
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
"tdb_allrecord_upgrade failed: already upgraded?
\n
"
);
"tdb_allrecord_upgrade failed: already upgraded?
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
...
@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
tv
.
tv_usec
=
1
;
tv
.
tv_usec
=
1
;
select
(
0
,
NULL
,
NULL
,
NULL
,
&
tv
);
select
(
0
,
NULL
,
NULL
,
NULL
,
&
tv
);
}
}
tdb
->
log
(
tdb
,
TDB_DEBUG_WARNING
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_WARNING
,
"tdb_allrecord_upgrade failed
\n
"
);
"tdb_allrecord_upgrade failed
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
...
@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
struct
tdb_lock_type
*
new_lck
;
struct
tdb_lock_type
*
new_lck
;
if
(
offset
>
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
+
tdb
->
map_size
/
8
)
{
if
(
offset
>
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
+
tdb
->
map_size
/
8
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_nest_lock: invalid offset %zu ltype=%d"
,
"tdb_nest_lock: invalid offset %llu ltype=%d
\n
"
,
(
size_t
)
offset
,
ltype
);
(
long
long
)
offset
,
ltype
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb
->
flags
&
TDB_NOLOCK
)
if
(
tdb
->
flags
&
TDB_NOLOCK
)
return
0
;
return
0
;
add_stat
(
tdb
,
locks
,
1
);
new_lck
=
find_nestlock
(
tdb
,
offset
);
new_lck
=
find_nestlock
(
tdb
,
offset
);
if
(
new_lck
)
{
if
(
new_lck
)
{
if
(
new_lck
->
ltype
==
F_RDLCK
&&
ltype
==
F_WRLCK
)
{
if
(
new_lck
->
ltype
==
F_RDLCK
&&
ltype
==
F_WRLCK
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_nest_lock: offset %zu has read lock"
,
"tdb_nest_lock: offset %llu has read lock
\n
"
,
(
size_t
)
offset
);
(
long
long
)
offset
);
return
-
1
;
return
-
1
;
}
}
/* Just increment the struct, posix locks don't stack. */
/* Just increment the struct, posix locks don't stack. */
...
@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
...
@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
if
(
tdb
->
num_lockrecs
if
(
tdb
->
num_lockrecs
&&
offset
>=
TDB_HASH_LOCK_START
&&
offset
>=
TDB_HASH_LOCK_START
&&
offset
<
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
)
{
&&
offset
<
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_nest_lock: already have a hash lock?"
);
"tdb_nest_lock: already have a hash lock?
\n
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
...
@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
tdb
->
lockrecs
,
tdb
->
lockrecs
,
sizeof
(
*
tdb
->
lockrecs
)
*
(
tdb
->
num_lockrecs
+
1
));
sizeof
(
*
tdb
->
lockrecs
)
*
(
tdb
->
num_lockrecs
+
1
));
if
(
new_lck
==
NULL
)
{
if
(
new_lck
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_nest_lock: unable to allocate %zu lock struct"
,
"tdb_nest_lock: unable to allocate %llu lock struct"
,
tdb
->
num_lockrecs
+
1
);
(
long
long
)(
tdb
->
num_lockrecs
+
1
));
errno
=
ENOMEM
;
errno
=
ENOMEM
;
return
-
1
;
return
-
1
;
}
}
...
@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
...
@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
lck
=
find_nestlock
(
tdb
,
off
);
lck
=
find_nestlock
(
tdb
,
off
);
if
((
lck
==
NULL
)
||
(
lck
->
count
==
0
))
{
if
((
lck
==
NULL
)
||
(
lck
->
count
==
0
))
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_nest_unlock: no lock for %zu"
,
(
size_t
)
off
);
"tdb_nest_unlock: no lock for %llu
\n
"
,
(
long
long
)
off
);
return
-
1
;
return
-
1
;
}
}
...
@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
...
@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
{
{
/* FIXME: There are no locks on read-only dbs */
/* FIXME: There are no locks on read-only dbs */
if
(
tdb
->
read_only
)
{
if
(
tdb
->
read_only
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_lock: read-only"
);
"tdb_allrecord_lock: read-only
\n
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
...
@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
count
)
{
/* a global lock of a different type exists */
/* a global lock of a different type exists */
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_lock: already have %s lock"
,
"tdb_allrecord_lock: already have %s lock
\n
"
,
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
?
"read"
:
"write"
);
?
"read"
:
"write"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb_has_hash_locks
(
tdb
))
{
if
(
tdb_has_hash_locks
(
tdb
))
{
/* can't combine global and chain locks */
/* can't combine global and chain locks */
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_lock: already have chain lock"
);
"tdb_allrecord_lock: already have chain lock
\n
"
);
return
-
1
;
return
-
1
;
}
}
if
(
upgradable
&&
ltype
!=
F_RDLCK
)
{
if
(
upgradable
&&
ltype
!=
F_RDLCK
)
{
/* tdb error: you can't upgrade a write lock! */
/* tdb error: you can't upgrade a write lock! */
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_lock: can't upgrade a write lock"
);
"tdb_allrecord_lock: can't upgrade a write lock
\n
"
);
return
-
1
;
return
-
1
;
}
}
add_stat
(
tdb
,
locks
,
1
);
again:
again:
/* Lock hashes, gradually. */
/* Lock hashes, gradually. */
if
(
tdb_lock_gradual
(
tdb
,
ltype
,
flags
,
TDB_HASH_LOCK_START
,
if
(
tdb_lock_gradual
(
tdb
,
ltype
,
flags
,
TDB_HASH_LOCK_START
,
TDB_HASH_LOCK_RANGE
))
{
TDB_HASH_LOCK_RANGE
))
{
if
(
!
(
flags
&
TDB_LOCK_PROBE
))
{
if
(
!
(
flags
&
TDB_LOCK_PROBE
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_ERROR
,
"tdb_allrecord_lock hashes failed (%s)
\n
"
,
"tdb_allrecord_lock hashes failed"
);
strerror
(
errno
));
}
}
return
-
1
;
return
-
1
;
}
}
/* Lock free
list
s: there to end of file. */
/* Lock free
table
s: there to end of file. */
if
(
tdb_brlock
(
tdb
,
ltype
,
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
,
if
(
tdb_brlock
(
tdb
,
ltype
,
TDB_HASH_LOCK_START
+
TDB_HASH_LOCK_RANGE
,
0
,
flags
))
{
0
,
flags
))
{
if
(
!
(
flags
&
TDB_LOCK_PROBE
))
{
if
(
!
(
flags
&
TDB_LOCK_PROBE
))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_ERROR
,
"tdb_allrecord_lock freelist failed (%s)
\n
"
,
"tdb_allrecord_lock freetables failed"
);
strerror
(
errno
));
}
}
tdb_brunlock
(
tdb
,
ltype
,
TDB_HASH_LOCK_START
,
tdb_brunlock
(
tdb
,
ltype
,
TDB_HASH_LOCK_START
,
TDB_HASH_LOCK_RANGE
);
TDB_HASH_LOCK_RANGE
);
...
@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
...
@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
/* unlock entire db */
/* unlock entire db */
int
tdb_allrecord_unlock
(
struct
tdb_context
*
tdb
,
int
ltype
)
int
tdb_allrecord_unlock
(
struct
tdb_context
*
tdb
,
int
ltype
)
{
{
/* FIXME: There are no locks on read-only dbs */
if
(
tdb
->
read_only
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_unlock: read-only
\n
"
);
return
-
1
;
}
if
(
tdb
->
allrecord_lock
.
count
==
0
)
{
if
(
tdb
->
allrecord_lock
.
count
==
0
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_unlock: not locked!"
);
"tdb_allrecord_unlock: not locked!
\n
"
);
return
-
1
;
return
-
1
;
}
}
/* Upgradable locks are marked as write locks. */
/* Upgradable locks are marked as write locks. */
if
(
tdb
->
allrecord_lock
.
ltype
!=
ltype
if
(
tdb
->
allrecord_lock
.
ltype
!=
ltype
&&
(
!
tdb
->
allrecord_lock
.
off
||
ltype
!=
F_RDLCK
))
{
&&
(
!
tdb
->
allrecord_lock
.
off
||
ltype
!=
F_RDLCK
))
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_allrecord_unlock: have %s lock"
,
"tdb_allrecord_unlock: have %s lock
\n
"
,
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
?
"read"
:
"write"
);
?
"read"
:
"write"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
...
@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
}
}
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
count
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_lock_hashes: already have %s allrecordlock"
,
"tdb_lock_hashes: have %s allrecordlock
\n
"
,
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
?
"read"
:
"write"
);
?
"read"
:
"write"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb_has_free_lock
(
tdb
))
{
if
(
tdb_has_free_lock
(
tdb
))
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_lock_hashes: already have free lock"
);
"tdb_lock_hashes: have free lock already
\n
"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb_has_expansion_lock
(
tdb
))
{
if
(
tdb_has_expansion_lock
(
tdb
))
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_lock_hashes: already have expansion lock"
);
"tdb_lock_hashes: have expansion lock already
\n
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
...
@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
if
(
tdb
->
allrecord_lock
.
ltype
==
F_RDLCK
&&
ltype
==
F_WRLCK
)
{
&&
ltype
==
F_WRLCK
)
{
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_unlock_hashes RO allrecord!"
);
"tdb_unlock_hashes RO allrecord!
\n
"
);
return
-
1
;
return
-
1
;
}
}
return
0
;
return
0
;
...
@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
...
@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
count
)
{
if
(
tdb
->
allrecord_lock
.
ltype
==
F_WRLCK
)
if
(
tdb
->
allrecord_lock
.
ltype
==
F_WRLCK
)
return
0
;
return
0
;
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_lock_free_bucket with RO allrecordlock!"
);
"tdb_lock_free_bucket with RO allrecordlock!
\n
"
);
return
-
1
;
return
-
1
;
}
}
#if 0 /* FIXME */
#if 0 /* FIXME */
if (tdb_has_expansion_lock(tdb)) {
if (tdb_has_expansion_lock(tdb)) {
tdb->ecode = TDB_ERR_LOCK;
tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_lock_free_bucket: already have expansion lock");
"tdb_lock_free_bucket: have expansion lock already\n");
return -1;
return -1;
}
}
#endif
#endif
...
...
ccan/tdb2/private.h
View file @
51a56b52
...
@@ -36,6 +36,7 @@
...
@@ -36,6 +36,7 @@
#include "config.h"
#include "config.h"
#include <ccan/tdb2/tdb2.h>
#include <ccan/tdb2/tdb2.h>
#include <ccan/likely/likely.h>
#include <ccan/likely/likely.h>
#include <ccan/compiler/compiler.h>
#ifdef HAVE_BYTESWAP_H
#ifdef HAVE_BYTESWAP_H
#include <byteswap.h>
#include <byteswap.h>
#endif
#endif
...
@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
...
@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_MAGIC ((uint64_t)0x1999)
#define TDB_USED_MAGIC ((uint64_t)0x1999)
#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
#define TDB_FREE_MAGIC ((uint64_t)0xFE)
#define TDB_FREE_MAGIC ((uint64_t)0xFE)
#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
...
@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
...
@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
#define TDB_SUBLEVEL_HASH_BITS 6
#define TDB_SUBLEVEL_HASH_BITS 6
/* And 8 entries in each group, ie 8 groups per sublevel. */
/* And 8 entries in each group, ie 8 groups per sublevel. */
#define TDB_HASH_GROUP_BITS 3
#define TDB_HASH_GROUP_BITS 3
/* This is currently 10: beyond this we chain. */
#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
/* Extend file by least
32
times larger than needed. */
/* Extend file by least
100
times larger than needed. */
#define TDB_EXTENSION_FACTOR
32
#define TDB_EXTENSION_FACTOR
100
/* We steal bits from the offsets to store hash info. */
/* We steal bits from the offsets to store hash info. */
#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
#define TDB_OFF_UPPER_STEAL 8
#define TDB_OFF_UPPER_STEAL 8
#define TDB_OFF_UPPER_STEAL_EXTRA 7
#define TDB_OFF_UPPER_STEAL_EXTRA 7
#define TDB_OFF_UPPER_STEAL_TRUNCBIT 1
/* The bit number where we store extra hash bits. */
/* If this is set, hash is truncated (only 1 bit is valid). */
#define TDB_OFF_HASH_TRUNCATED_BIT 56
/* The bit number where we store next level of hash. */
#define TDB_OFF_HASH_EXTRA_BIT 57
#define TDB_OFF_HASH_EXTRA_BIT 57
#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
/* The bit number where we store the extra hash bits. */
/* Convenience mask to get actual offset. */
/* Convenience mask to get actual offset. */
#define TDB_OFF_MASK \
#define TDB_OFF_MASK \
(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
...
@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
...
@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
#define TDB_MIN_DATA_LEN \
#define TDB_MIN_DATA_LEN \
(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
/* Indicates this entry is not on an flist (can happen during coalescing) */
#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
#if !HAVE_BSWAP_64
#if !HAVE_BSWAP_64
static
inline
uint64_t
bswap_64
(
uint64_t
x
)
static
inline
uint64_t
bswap_64
(
uint64_t
x
)
{
{
...
@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
...
@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
}
}
struct
tdb_free_record
{
struct
tdb_free_record
{
uint64_t
magic_and_
meta
;
/* TDB_OFF_UPPER_STEAL bits of magic
*/
uint64_t
magic_and_
prev
;
/* TDB_OFF_UPPER_STEAL bits magic, then prev
*/
uint64_t
data_len
;
/* N
ot counting these two fields. */
uint64_t
ftable_and_len
;
/* Len n
ot counting these two fields. */
/* This is why the minimum record size is
16
bytes. */
/* This is why the minimum record size is
8
bytes. */
uint64_t
next
,
prev
;
uint64_t
next
;
};
};
static
inline
uint64_t
frec_prev
(
const
struct
tdb_free_record
*
f
)
{
return
f
->
magic_and_prev
&
((
1ULL
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
-
1
);
}
static
inline
uint64_t
frec_magic
(
const
struct
tdb_free_record
*
f
)
static
inline
uint64_t
frec_magic
(
const
struct
tdb_free_record
*
f
)
{
{
return
f
->
magic_and_
meta
>>
(
64
-
TDB_OFF_UPPER_STEAL
);
return
f
->
magic_and_
prev
>>
(
64
-
TDB_OFF_UPPER_STEAL
);
}
}
static
inline
uint64_t
frec_
flist
(
const
struct
tdb_free_record
*
f
)
static
inline
uint64_t
frec_
len
(
const
struct
tdb_free_record
*
f
)
{
{
return
f
->
magic_and_meta
&
((
1ULL
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
-
1
);
return
f
->
ftable_and_len
&
((
1ULL
<<
(
64
-
TDB_OFF_UPPER_STEAL
))
-
1
);
}
static
inline
unsigned
frec_ftable
(
const
struct
tdb_free_record
*
f
)
{
return
f
->
ftable_and_len
>>
(
64
-
TDB_OFF_UPPER_STEAL
);
}
}
struct
tdb_recovery_record
{
struct
tdb_recovery_record
{
...
@@ -199,6 +217,12 @@ struct tdb_recovery_record {
...
@@ -199,6 +217,12 @@ struct tdb_recovery_record {
uint64_t
eof
;
uint64_t
eof
;
};
};
/* If we bottom out of the subhashes, we chain. */
struct
tdb_chain
{
tdb_off_t
rec
[
1
<<
TDB_HASH_GROUP_BITS
];
tdb_off_t
next
;
};
/* this is stored at the front of every database */
/* this is stored at the front of every database */
struct
tdb_header
{
struct
tdb_header
{
char
magic_food
[
64
];
/* for /etc/magic */
char
magic_food
[
64
];
/* for /etc/magic */
...
@@ -206,7 +230,7 @@ struct tdb_header {
...
@@ -206,7 +230,7 @@ struct tdb_header {
uint64_t
version
;
/* version of the code */
uint64_t
version
;
/* version of the code */
uint64_t
hash_test
;
/* result of hashing HASH_MAGIC. */
uint64_t
hash_test
;
/* result of hashing HASH_MAGIC. */
uint64_t
hash_seed
;
/* "random" seed written at creation time. */
uint64_t
hash_seed
;
/* "random" seed written at creation time. */
tdb_off_t
free_
list
;
/* (First) free list
. */
tdb_off_t
free_
table
;
/* (First) free table
. */
tdb_off_t
recovery
;
/* Transaction recovery area. */
tdb_off_t
recovery
;
/* Transaction recovery area. */
tdb_off_t
reserved
[
26
];
tdb_off_t
reserved
[
26
];
...
@@ -215,7 +239,7 @@ struct tdb_header {
...
@@ -215,7 +239,7 @@ struct tdb_header {
tdb_off_t
hashtable
[
1ULL
<<
TDB_TOPLEVEL_HASH_BITS
];
tdb_off_t
hashtable
[
1ULL
<<
TDB_TOPLEVEL_HASH_BITS
];
};
};
struct
tdb_free
list
{
struct
tdb_free
table
{
struct
tdb_used_record
hdr
;
struct
tdb_used_record
hdr
;
tdb_off_t
next
;
tdb_off_t
next
;
tdb_off_t
buckets
[
TDB_FREE_BUCKETS
];
tdb_off_t
buckets
[
TDB_FREE_BUCKETS
];
...
@@ -246,7 +270,7 @@ struct traverse_info {
...
@@ -246,7 +270,7 @@ struct traverse_info {
/* We ignore groups here, and treat it as a big array. */
/* We ignore groups here, and treat it as a big array. */
unsigned
entry
;
unsigned
entry
;
unsigned
int
total_buckets
;
unsigned
int
total_buckets
;
}
levels
[
64
/
TDB_SUBLEVEL_HASH_BITS
];
}
levels
[
TDB_MAX_LEVELS
+
1
];
unsigned
int
num_levels
;
unsigned
int
num_levels
;
unsigned
int
toplevel_group
;
unsigned
int
toplevel_group
;
/* This makes delete-everything-inside-traverse work as expected. */
/* This makes delete-everything-inside-traverse work as expected. */
...
@@ -269,6 +293,15 @@ struct tdb_lock_type {
...
@@ -269,6 +293,15 @@ struct tdb_lock_type {
uint32_t
ltype
;
uint32_t
ltype
;
};
};
/* This is only needed for tdb_access_commit, but used everywhere to
* simplify. */
struct
tdb_access_hdr
{
struct
tdb_access_hdr
*
next
;
tdb_off_t
off
;
tdb_len_t
len
;
bool
convert
;
};
struct
tdb_context
{
struct
tdb_context
{
/* Filename of the database. */
/* Filename of the database. */
const
char
*
name
;
const
char
*
name
;
...
@@ -298,8 +331,8 @@ struct tdb_context {
...
@@ -298,8 +331,8 @@ struct tdb_context {
uint32_t
flags
;
uint32_t
flags
;
/* Logging function */
/* Logging function */
tdb_logfn_t
log
;
tdb_logfn_t
log
fn
;
void
*
log_priv
;
void
*
log_priv
ate
;
/* Hash function. */
/* Hash function. */
tdb_hashfn_t
khash
;
tdb_hashfn_t
khash
;
...
@@ -309,17 +342,23 @@ struct tdb_context {
...
@@ -309,17 +342,23 @@ struct tdb_context {
/* Set if we are in a transaction. */
/* Set if we are in a transaction. */
struct
tdb_transaction
*
transaction
;
struct
tdb_transaction
*
transaction
;
/* What freelist are we using? */
/* What free table are we using? */
uint64_t
flist_off
;
tdb_off_t
ftable_off
;
unsigned
int
ftable
;
/* IO methods: changes for transactions. */
/* IO methods: changes for transactions. */
const
struct
tdb_methods
*
methods
;
const
struct
tdb_methods
*
methods
;
/* Lock information */
/* Lock information */
struct
tdb_lock_type
allrecord_lock
;
struct
tdb_lock_type
allrecord_lock
;
uint64
_t
num_lockrecs
;
size
_t
num_lockrecs
;
struct
tdb_lock_type
*
lockrecs
;
struct
tdb_lock_type
*
lockrecs
;
struct
tdb_attribute_stats
*
stats
;
/* Direct access information */
struct
tdb_access_hdr
*
access
;
/* Single list of all TDBs, to avoid multiple opens. */
/* Single list of all TDBs, to avoid multiple opens. */
struct
tdb_context
*
next
;
struct
tdb_context
*
next
;
dev_t
device
;
dev_t
device
;
...
@@ -331,7 +370,7 @@ struct tdb_methods {
...
@@ -331,7 +370,7 @@ struct tdb_methods {
int
(
*
write
)(
struct
tdb_context
*
,
tdb_off_t
,
const
void
*
,
tdb_len_t
);
int
(
*
write
)(
struct
tdb_context
*
,
tdb_off_t
,
const
void
*
,
tdb_len_t
);
int
(
*
oob
)(
struct
tdb_context
*
,
tdb_off_t
,
bool
);
int
(
*
oob
)(
struct
tdb_context
*
,
tdb_off_t
,
bool
);
int
(
*
expand_file
)(
struct
tdb_context
*
,
tdb_len_t
);
int
(
*
expand_file
)(
struct
tdb_context
*
,
tdb_len_t
);
void
*
(
*
direct
)(
struct
tdb_context
*
,
tdb_off_t
,
size_t
);
void
*
(
*
direct
)(
struct
tdb_context
*
,
tdb_off_t
,
size_t
,
bool
);
};
};
/*
/*
...
@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
...
@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
bool
is_subhash
(
tdb_off_t
val
);
bool
is_subhash
(
tdb_off_t
val
);
/* free.c: */
/* free.c: */
int
tdb_f
list
_init
(
struct
tdb_context
*
tdb
);
int
tdb_f
table
_init
(
struct
tdb_context
*
tdb
);
/* check.c needs these to iterate through free lists. */
/* check.c needs these to iterate through free lists. */
tdb_off_t
first_f
list
(
struct
tdb_context
*
tdb
);
tdb_off_t
first_f
table
(
struct
tdb_context
*
tdb
);
tdb_off_t
next_f
list
(
struct
tdb_context
*
tdb
,
tdb_off_t
flist
);
tdb_off_t
next_f
table
(
struct
tdb_context
*
tdb
,
tdb_off_t
ftable
);
/*
If this fails, try tdb_expand
. */
/*
This returns space or TDB_OFF_ERR
. */
tdb_off_t
alloc
(
struct
tdb_context
*
tdb
,
size_t
keylen
,
size_t
datalen
,
tdb_off_t
alloc
(
struct
tdb_context
*
tdb
,
size_t
keylen
,
size_t
datalen
,
uint64_t
hash
,
bool
growing
);
uint64_t
hash
,
unsigned
magic
,
bool
growing
);
/* Put this record in a free list. */
/* Put this record in a free list. */
int
add_free_record
(
struct
tdb_context
*
tdb
,
int
add_free_record
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len_with_header
);
tdb_off_t
off
,
tdb_len_t
len_with_header
);
/* Set up header for a used record. */
/* Set up header for a used
/ftable/htable/chain
record. */
int
set_header
(
struct
tdb_context
*
tdb
,
int
set_header
(
struct
tdb_context
*
tdb
,
struct
tdb_used_record
*
rec
,
struct
tdb_used_record
*
rec
,
uint64_t
keylen
,
uint64_t
datalen
,
u
nsigned
magic
,
u
int64_t
keylen
,
uint64_t
datalen
,
uint64_t
actuallen
,
unsigned
hashlow
);
uint64_t
actuallen
,
unsigned
hashlow
);
/* Used by tdb_check to verify. */
/* Used by tdb_check to verify. */
unsigned
int
size_to_bucket
(
tdb_len_t
data_len
);
unsigned
int
size_to_bucket
(
tdb_len_t
data_len
);
tdb_off_t
bucket_off
(
tdb_off_t
flist_off
,
unsigned
bucket
);
tdb_off_t
bucket_off
(
tdb_off_t
ftable_off
,
unsigned
bucket
);
/* Used by tdb_summary */
size_t
dead_space
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
);
/* io.c: */
/* io.c: */
/* Initialize tdb->methods. */
/* Initialize tdb->methods. */
...
@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
...
@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
void
tdb_munmap
(
struct
tdb_context
*
tdb
);
void
tdb_munmap
(
struct
tdb_context
*
tdb
);
void
tdb_mmap
(
struct
tdb_context
*
tdb
);
void
tdb_mmap
(
struct
tdb_context
*
tdb
);
/* Either make a copy into pad and return that, or return ptr into mmap.
* Converts endian (ie. will use pad in that case). */
void
*
tdb_get
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
void
*
pad
,
size_t
len
);
/* Either alloc a copy, or give direct access. Release frees or noop. */
/* Either alloc a copy, or give direct access. Release frees or noop. */
const
void
*
tdb_access_read
(
struct
tdb_context
*
tdb
,
const
void
*
tdb_access_read
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_len_t
len
,
bool
convert
);
tdb_off_t
off
,
tdb_len_t
len
,
bool
convert
);
...
@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
...
@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
int
tdb_read_convert
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
int
tdb_read_convert
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
void
*
rec
,
size_t
len
);
void
*
rec
,
size_t
len
);
/* Adds a stat, if it's in range. */
void
add_stat_
(
struct
tdb_context
*
tdb
,
uint64_t
*
stat
,
size_t
val
);
#define add_stat(tdb, statname, val) \
do { \
if (unlikely((tdb)->stats)) \
add_stat_((tdb), &(tdb)->stats->statname, (val)); \
} while (0)
/* lock.c: */
/* lock.c: */
void
tdb_lock_init
(
struct
tdb_context
*
tdb
);
void
tdb_lock_init
(
struct
tdb_context
*
tdb
);
...
@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
...
@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
int
tdb_transaction_recover
(
struct
tdb_context
*
tdb
);
int
tdb_transaction_recover
(
struct
tdb_context
*
tdb
);
bool
tdb_needs_recovery
(
struct
tdb_context
*
tdb
);
bool
tdb_needs_recovery
(
struct
tdb_context
*
tdb
);
/* tdb.c: */
void
COLD
tdb_logerr
(
struct
tdb_context
*
tdb
,
enum
TDB_ERROR
ecode
,
enum
tdb_debug_level
level
,
const
char
*
fmt
,
...);
#ifdef TDB_TRACE
#ifdef TDB_TRACE
void
tdb_trace
(
struct
tdb_context
*
tdb
,
const
char
*
op
);
void
tdb_trace
(
struct
tdb_context
*
tdb
,
const
char
*
op
);
void
tdb_trace_seqnum
(
struct
tdb_context
*
tdb
,
uint32_t
seqnum
,
const
char
*
op
);
void
tdb_trace_seqnum
(
struct
tdb_context
*
tdb
,
uint32_t
seqnum
,
const
char
*
op
);
...
...
ccan/tdb2/summary.c
View file @
51a56b52
...
@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
...
@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
static
bool
summarize
(
struct
tdb_context
*
tdb
,
static
bool
summarize
(
struct
tdb_context
*
tdb
,
struct
tally
*
hashes
,
struct
tally
*
hashes
,
struct
tally
*
f
list
s
,
struct
tally
*
f
table
s
,
struct
tally
*
free
,
struct
tally
*
free
,
struct
tally
*
keys
,
struct
tally
*
keys
,
struct
tally
*
data
,
struct
tally
*
data
,
struct
tally
*
extra
,
struct
tally
*
extra
,
struct
tally
*
uncoal
,
struct
tally
*
uncoal
,
struct
tally
*
buckets
)
struct
tally
*
buckets
,
struct
tally
*
chains
)
{
{
tdb_off_t
off
;
tdb_off_t
off
;
tdb_len_t
len
;
tdb_len_t
len
;
tdb_len_t
unc
=
0
;
tdb_len_t
unc
=
0
;
for
(
off
=
sizeof
(
struct
tdb_header
);
off
<
tdb
->
map_size
;
off
+=
len
)
{
for
(
off
=
sizeof
(
struct
tdb_header
);
off
<
tdb
->
map_size
;
off
+=
len
)
{
union
{
const
union
{
struct
tdb_used_record
u
;
struct
tdb_used_record
u
;
struct
tdb_free_record
f
;
struct
tdb_free_record
f
;
}
pad
,
*
p
;
struct
tdb_recovery_record
r
;
p
=
tdb_get
(
tdb
,
off
,
&
pad
,
sizeof
(
pad
));
}
*
p
;
/* We might not be able to get the whole thing. */
p
=
tdb_access_read
(
tdb
,
off
,
sizeof
(
p
->
f
),
true
);
if
(
!
p
)
if
(
!
p
)
return
false
;
return
false
;
if
(
rec_magic
(
&
p
->
u
)
!=
TDB_MAGIC
)
{
if
(
p
->
r
.
magic
==
TDB_RECOVERY_INVALID_MAGIC
len
=
p
->
f
.
data_len
;
||
p
->
r
.
magic
==
TDB_RECOVERY_MAGIC
)
{
if
(
unc
)
{
tally_add
(
uncoal
,
unc
);
unc
=
0
;
}
len
=
sizeof
(
p
->
r
)
+
p
->
r
.
max_len
;
}
else
if
(
frec_magic
(
&
p
->
f
)
==
TDB_FREE_MAGIC
)
{
len
=
frec_len
(
&
p
->
f
);
tally_add
(
free
,
len
);
tally_add
(
free
,
len
);
tally_add
(
buckets
,
size_to_bucket
(
len
));
tally_add
(
buckets
,
size_to_bucket
(
len
));
len
+=
sizeof
(
p
->
u
);
len
+=
sizeof
(
p
->
u
);
unc
++
;
unc
++
;
}
else
{
}
else
if
(
rec_magic
(
&
p
->
u
)
==
TDB_USED_MAGIC
)
{
if
(
unc
)
{
if
(
unc
)
{
tally_add
(
uncoal
,
unc
);
tally_add
(
uncoal
,
unc
);
unc
=
0
;
unc
=
0
;
...
@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
...
@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
+
rec_data_length
(
&
p
->
u
)
+
rec_data_length
(
&
p
->
u
)
+
rec_extra_padding
(
&
p
->
u
);
+
rec_extra_padding
(
&
p
->
u
);
/* FIXME: Use different magic for hashes, flists. */
tally_add
(
keys
,
rec_key_length
(
&
p
->
u
));
if
(
!
rec_key_length
(
&
p
->
u
)
&&
rec_hash
(
&
p
->
u
)
<
2
)
{
tally_add
(
data
,
rec_data_length
(
&
p
->
u
));
if
(
rec_hash
(
&
p
->
u
)
==
0
)
{
tally_add
(
extra
,
rec_extra_padding
(
&
p
->
u
));
int
count
=
count_hash
(
tdb
,
}
else
if
(
rec_magic
(
&
p
->
u
)
==
TDB_HTABLE_MAGIC
)
{
off
+
sizeof
(
p
->
u
),
int
count
=
count_hash
(
tdb
,
TDB_SUBLEVEL_HASH_BITS
);
off
+
sizeof
(
p
->
u
),
if
(
count
==
-
1
)
TDB_SUBLEVEL_HASH_BITS
);
return
false
;
if
(
count
==
-
1
)
tally_add
(
hashes
,
count
);
return
false
;
}
else
{
tally_add
(
hashes
,
count
);
tally_add
(
flists
,
tally_add
(
extra
,
rec_extra_padding
(
&
p
->
u
));
rec_data_length
(
&
p
->
u
));
len
=
sizeof
(
p
->
u
)
}
+
rec_data_length
(
&
p
->
u
)
}
else
{
+
rec_extra_padding
(
&
p
->
u
);
tally_add
(
keys
,
rec_key_length
(
&
p
->
u
));
}
else
if
(
rec_magic
(
&
p
->
u
)
==
TDB_FTABLE_MAGIC
)
{
tally_add
(
data
,
rec_data_length
(
&
p
->
u
));
len
=
sizeof
(
p
->
u
)
}
+
rec_data_length
(
&
p
->
u
)
+
rec_extra_padding
(
&
p
->
u
);
tally_add
(
ftables
,
rec_data_length
(
&
p
->
u
));
tally_add
(
extra
,
rec_extra_padding
(
&
p
->
u
));
}
else
if
(
rec_magic
(
&
p
->
u
)
==
TDB_CHAIN_MAGIC
)
{
len
=
sizeof
(
p
->
u
)
+
rec_data_length
(
&
p
->
u
)
+
rec_extra_padding
(
&
p
->
u
);
tally_add
(
chains
,
1
);
tally_add
(
extra
,
rec_extra_padding
(
&
p
->
u
));
tally_add
(
extra
,
rec_extra_padding
(
&
p
->
u
));
}
}
else
len
=
dead_space
(
tdb
,
off
);
tdb_access_release
(
tdb
,
p
);
}
}
if
(
unc
)
if
(
unc
)
tally_add
(
uncoal
,
unc
);
tally_add
(
uncoal
,
unc
);
...
@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
...
@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
"Number of free lists: %zu\n%s" \
"Number of free lists: %zu\n%s" \
"Toplevel hash used: %u of %u\n" \
"Toplevel hash used: %u of %u\n" \
"Number of chains: %zu\n" \
"Number of subhashes: %zu\n" \
"Number of subhashes: %zu\n" \
"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
...
@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
...
@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
char
*
tdb_summary
(
struct
tdb_context
*
tdb
,
enum
tdb_summary_flags
flags
)
char
*
tdb_summary
(
struct
tdb_context
*
tdb
,
enum
tdb_summary_flags
flags
)
{
{
tdb_len_t
len
;
tdb_len_t
len
;
struct
tally
*
f
list
s
,
*
hashes
,
*
freet
,
*
keys
,
*
data
,
*
extra
,
*
uncoal
,
struct
tally
*
f
table
s
,
*
hashes
,
*
freet
,
*
keys
,
*
data
,
*
extra
,
*
uncoal
,
*
buckets
;
*
buckets
,
*
chains
;
char
*
hashesg
,
*
freeg
,
*
keysg
,
*
datag
,
*
extrag
,
*
uncoalg
,
*
bucketsg
;
char
*
hashesg
,
*
freeg
,
*
keysg
,
*
datag
,
*
extrag
,
*
uncoalg
,
*
bucketsg
;
char
*
ret
=
NULL
;
char
*
ret
=
NULL
;
...
@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
...
@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
}
}
/* Start stats off empty. */
/* Start stats off empty. */
f
list
s
=
tally_new
(
HISTO_HEIGHT
);
f
table
s
=
tally_new
(
HISTO_HEIGHT
);
hashes
=
tally_new
(
HISTO_HEIGHT
);
hashes
=
tally_new
(
HISTO_HEIGHT
);
freet
=
tally_new
(
HISTO_HEIGHT
);
freet
=
tally_new
(
HISTO_HEIGHT
);
keys
=
tally_new
(
HISTO_HEIGHT
);
keys
=
tally_new
(
HISTO_HEIGHT
);
...
@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
...
@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
extra
=
tally_new
(
HISTO_HEIGHT
);
extra
=
tally_new
(
HISTO_HEIGHT
);
uncoal
=
tally_new
(
HISTO_HEIGHT
);
uncoal
=
tally_new
(
HISTO_HEIGHT
);
buckets
=
tally_new
(
HISTO_HEIGHT
);
buckets
=
tally_new
(
HISTO_HEIGHT
);
if
(
!
flists
||
!
hashes
||
!
freet
||
!
keys
||
!
data
||
!
extra
chains
=
tally_new
(
HISTO_HEIGHT
);
||
!
uncoal
||
!
buckets
)
{
if
(
!
ftables
||
!
hashes
||
!
freet
||
!
keys
||
!
data
||
!
extra
tdb
->
ecode
=
TDB_ERR_OOM
;
||
!
uncoal
||
!
buckets
||
!
chains
)
{
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_ERROR
,
"tdb_summary: failed to allocate tally structures"
);
goto
unlock
;
goto
unlock
;
}
}
if
(
!
summarize
(
tdb
,
hashes
,
f
list
s
,
freet
,
keys
,
data
,
extra
,
uncoal
,
if
(
!
summarize
(
tdb
,
hashes
,
f
table
s
,
freet
,
keys
,
data
,
extra
,
uncoal
,
buckets
))
buckets
,
chains
))
goto
unlock
;
goto
unlock
;
if
(
flags
&
TDB_SUMMARY_HISTOGRAMS
)
{
if
(
flags
&
TDB_SUMMARY_HISTOGRAMS
)
{
...
@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
...
@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
count_hash
(
tdb
,
offsetof
(
struct
tdb_header
,
hashtable
),
count_hash
(
tdb
,
offsetof
(
struct
tdb_header
,
hashtable
),
TDB_TOPLEVEL_HASH_BITS
),
TDB_TOPLEVEL_HASH_BITS
),
1
<<
TDB_TOPLEVEL_HASH_BITS
,
1
<<
TDB_TOPLEVEL_HASH_BITS
,
tally_num
(
chains
),
tally_num
(
hashes
),
tally_num
(
hashes
),
tally_min
(
hashes
),
tally_mean
(
hashes
),
tally_max
(
hashes
),
tally_min
(
hashes
),
tally_mean
(
hashes
),
tally_max
(
hashes
),
hashesg
?
hashesg
:
""
,
hashesg
?
hashesg
:
""
,
...
@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
...
@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
tally_total
(
freet
,
NULL
)
*
100
.
0
/
tdb
->
map_size
,
tally_total
(
freet
,
NULL
)
*
100
.
0
/
tdb
->
map_size
,
(
tally_num
(
keys
)
+
tally_num
(
freet
)
+
tally_num
(
hashes
))
(
tally_num
(
keys
)
+
tally_num
(
freet
)
+
tally_num
(
hashes
))
*
sizeof
(
struct
tdb_used_record
)
*
100
.
0
/
tdb
->
map_size
,
*
sizeof
(
struct
tdb_used_record
)
*
100
.
0
/
tdb
->
map_size
,
tally_num
(
f
lists
)
*
sizeof
(
struct
tdb_freelist
)
tally_num
(
f
tables
)
*
sizeof
(
struct
tdb_freetable
)
*
100
.
0
/
tdb
->
map_size
,
*
100
.
0
/
tdb
->
map_size
,
(
tally_num
(
hashes
)
(
tally_num
(
hashes
)
*
(
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
)
*
(
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
)
+
(
sizeof
(
tdb_off_t
)
<<
TDB_TOPLEVEL_HASH_BITS
))
+
(
sizeof
(
tdb_off_t
)
<<
TDB_TOPLEVEL_HASH_BITS
)
+
sizeof
(
struct
tdb_chain
)
*
tally_num
(
chains
))
*
100
.
0
/
tdb
->
map_size
);
*
100
.
0
/
tdb
->
map_size
);
unlock:
unlock:
...
@@ -237,6 +262,8 @@ unlock:
...
@@ -237,6 +262,8 @@ unlock:
free
(
data
);
free
(
data
);
free
(
extra
);
free
(
extra
);
free
(
uncoal
);
free
(
uncoal
);
free
(
ftables
);
free
(
chains
);
tdb_allrecord_unlock
(
tdb
,
F_RDLCK
);
tdb_allrecord_unlock
(
tdb
,
F_RDLCK
);
tdb_unlock_expand
(
tdb
,
F_RDLCK
);
tdb_unlock_expand
(
tdb
,
F_RDLCK
);
...
...
ccan/tdb2/tdb.c
View file @
51a56b52
#include "private.h"
#include "private.h"
#include <ccan/tdb2/tdb2.h>
#include <ccan/tdb2/tdb2.h>
#include <ccan/build_assert/build_assert.h>
#include <ccan/likely/likely.h>
#include <assert.h>
#include <assert.h>
#include <stdarg.h>
/* The null return. */
/* The null return. */
struct
tdb_data
tdb_null
=
{
.
dptr
=
NULL
,
.
dsize
=
0
};
struct
tdb_data
tdb_null
=
{
.
dptr
=
NULL
,
.
dsize
=
0
};
...
@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
...
@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static
struct
tdb_context
*
tdbs
=
NULL
;
static
struct
tdb_context
*
tdbs
=
NULL
;
PRINTF_FMT
(
4
,
5
)
static
void
null_log_fn
(
struct
tdb_context
*
tdb
,
enum
tdb_debug_level
level
,
void
*
priv
,
const
char
*
fmt
,
...)
{
}
static
bool
tdb_already_open
(
dev_t
device
,
ino_t
ino
)
static
bool
tdb_already_open
(
dev_t
device
,
ino_t
ino
)
{
{
struct
tdb_context
*
i
;
struct
tdb_context
*
i
;
...
@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
...
@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
fd
=
open
(
"/dev/urandom"
,
O_RDONLY
);
fd
=
open
(
"/dev/urandom"
,
O_RDONLY
);
if
(
fd
>=
0
)
{
if
(
fd
>=
0
)
{
if
(
tdb_read_all
(
fd
,
&
ret
,
sizeof
(
ret
)))
{
if
(
tdb_read_all
(
fd
,
&
ret
,
sizeof
(
ret
)))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_TRACE
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_TRACE
,
"tdb_open: random from /dev/urandom
\n
"
);
"tdb_open: random from /dev/urandom"
);
close
(
fd
);
close
(
fd
);
return
ret
;
return
ret
;
}
}
...
@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
...
@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
char
reply
[
1
+
sizeof
(
uint64_t
)];
char
reply
[
1
+
sizeof
(
uint64_t
)];
int
r
=
read
(
fd
,
reply
,
sizeof
(
reply
));
int
r
=
read
(
fd
,
reply
,
sizeof
(
reply
));
if
(
r
>
1
)
{
if
(
r
>
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_TRACE
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_TRACE
,
"tdb_open: %u random bytes from"
"tdb_open: %u random bytes from"
" /dev/egd-pool
\n
"
,
r
-
1
);
" /dev/egd-pool
"
,
r
-
1
);
/* Copy at least some bytes. */
/* Copy at least some bytes. */
memcpy
(
&
ret
,
reply
+
1
,
r
-
1
);
memcpy
(
&
ret
,
reply
+
1
,
r
-
1
);
if
(
reply
[
0
]
==
sizeof
(
uint64_t
)
if
(
reply
[
0
]
==
sizeof
(
uint64_t
)
...
@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
...
@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
/* Fallback: pid and time. */
/* Fallback: pid and time. */
gettimeofday
(
&
now
,
NULL
);
gettimeofday
(
&
now
,
NULL
);
ret
=
getpid
()
*
100132289ULL
+
now
.
tv_sec
*
1000000ULL
+
now
.
tv_usec
;
ret
=
getpid
()
*
100132289ULL
+
now
.
tv_sec
*
1000000ULL
+
now
.
tv_usec
;
tdb
->
log
(
tdb
,
TDB_DEBUG_TRACE
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_TRACE
,
"tdb_open: random from getpid and time
\n
"
);
"tdb_open: random from getpid and time
"
);
return
ret
;
return
ret
;
}
}
struct
new_database
{
struct
new_database
{
struct
tdb_header
hdr
;
struct
tdb_header
hdr
;
struct
tdb_free
list
flist
;
struct
tdb_free
table
ftable
;
};
};
/* initialise a new database */
/* initialise a new database */
...
@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
...
@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
memset
(
newdb
.
hdr
.
hashtable
,
0
,
sizeof
(
newdb
.
hdr
.
hashtable
));
memset
(
newdb
.
hdr
.
hashtable
,
0
,
sizeof
(
newdb
.
hdr
.
hashtable
));
/* Free is empty. */
/* Free is empty. */
newdb
.
hdr
.
free_
list
=
offsetof
(
struct
new_database
,
flist
);
newdb
.
hdr
.
free_
table
=
offsetof
(
struct
new_database
,
ftable
);
memset
(
&
newdb
.
f
list
,
0
,
sizeof
(
newdb
.
flist
));
memset
(
&
newdb
.
f
table
,
0
,
sizeof
(
newdb
.
ftable
));
set_header
(
NULL
,
&
newdb
.
f
list
.
hdr
,
0
,
set_header
(
NULL
,
&
newdb
.
f
table
.
hdr
,
TDB_FTABLE_MAGIC
,
0
,
sizeof
(
newdb
.
f
list
)
-
sizeof
(
newdb
.
flist
.
hdr
),
sizeof
(
newdb
.
f
table
)
-
sizeof
(
newdb
.
ftable
.
hdr
),
sizeof
(
newdb
.
f
list
)
-
sizeof
(
newdb
.
flist
.
hdr
),
1
);
sizeof
(
newdb
.
f
table
)
-
sizeof
(
newdb
.
ftable
.
hdr
),
0
);
/* Magic food */
/* Magic food */
memset
(
newdb
.
hdr
.
magic_food
,
0
,
sizeof
(
newdb
.
hdr
.
magic_food
));
memset
(
newdb
.
hdr
.
magic_food
,
0
,
sizeof
(
newdb
.
hdr
.
magic_food
));
...
@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
...
@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
tdb
->
map_size
=
sizeof
(
newdb
);
tdb
->
map_size
=
sizeof
(
newdb
);
tdb
->
map_ptr
=
malloc
(
tdb
->
map_size
);
tdb
->
map_ptr
=
malloc
(
tdb
->
map_size
);
if
(
!
tdb
->
map_ptr
)
{
if
(
!
tdb
->
map_ptr
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
"tdb_new_database: failed to allocate"
);
return
-
1
;
return
-
1
;
}
}
memcpy
(
tdb
->
map_ptr
,
&
newdb
,
tdb
->
map_size
);
memcpy
(
tdb
->
map_ptr
,
&
newdb
,
tdb
->
map_size
);
...
@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
...
@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
return
-
1
;
return
-
1
;
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
&
newdb
,
sizeof
(
newdb
),
0
))
{
if
(
!
tdb_pwrite_all
(
tdb
->
fd
,
&
newdb
,
sizeof
(
newdb
),
0
))
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
"tdb_new_database: failed to write: %s"
,
strerror
(
errno
));
return
-
1
;
return
-
1
;
}
}
return
0
;
return
0
;
...
@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
{
{
struct
tdb_context
*
tdb
;
struct
tdb_context
*
tdb
;
struct
stat
st
;
struct
stat
st
;
int
save
_errno
;
int
save
d_errno
=
0
;
uint64_t
hash_test
;
uint64_t
hash_test
;
unsigned
v
;
unsigned
v
;
struct
tdb_header
hdr
;
struct
tdb_header
hdr
;
...
@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if
(
!
tdb
)
{
if
(
!
tdb
)
{
/* Can't log this */
/* Can't log this */
errno
=
ENOMEM
;
errno
=
ENOMEM
;
goto
fail
;
return
NULL
;
}
}
tdb
->
name
=
NULL
;
tdb
->
name
=
NULL
;
tdb
->
map_ptr
=
NULL
;
tdb
->
map_ptr
=
NULL
;
...
@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb
->
map_size
=
sizeof
(
struct
tdb_header
);
tdb
->
map_size
=
sizeof
(
struct
tdb_header
);
tdb
->
ecode
=
TDB_SUCCESS
;
tdb
->
ecode
=
TDB_SUCCESS
;
tdb
->
flags
=
tdb_flags
;
tdb
->
flags
=
tdb_flags
;
tdb
->
log
=
null_log_fn
;
tdb
->
logfn
=
NULL
;
tdb
->
log_priv
=
NULL
;
tdb
->
transaction
=
NULL
;
tdb
->
transaction
=
NULL
;
tdb
->
stats
=
NULL
;
tdb
->
access
=
NULL
;
tdb_hash_init
(
tdb
);
tdb_hash_init
(
tdb
);
tdb_io_init
(
tdb
);
tdb_io_init
(
tdb
);
tdb_lock_init
(
tdb
);
tdb_lock_init
(
tdb
);
...
@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
while
(
attr
)
{
while
(
attr
)
{
switch
(
attr
->
base
.
attr
)
{
switch
(
attr
->
base
.
attr
)
{
case
TDB_ATTRIBUTE_LOG
:
case
TDB_ATTRIBUTE_LOG
:
tdb
->
log
=
attr
->
log
.
log_fn
;
tdb
->
log
fn
=
attr
->
log
.
log_fn
;
tdb
->
log_priv
=
attr
->
log
.
log_private
;
tdb
->
log_priv
ate
=
attr
->
log
.
log_private
;
break
;
break
;
case
TDB_ATTRIBUTE_HASH
:
case
TDB_ATTRIBUTE_HASH
:
tdb
->
khash
=
attr
->
hash
.
hash_fn
;
tdb
->
khash
=
attr
->
hash
.
hash_fn
;
...
@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
case
TDB_ATTRIBUTE_SEED
:
case
TDB_ATTRIBUTE_SEED
:
seed
=
&
attr
->
seed
;
seed
=
&
attr
->
seed
;
break
;
break
;
case
TDB_ATTRIBUTE_STATS
:
tdb
->
stats
=
&
attr
->
stats
;
/* They have stats we don't know about? Tell them. */
if
(
tdb
->
stats
->
size
>
sizeof
(
attr
->
stats
))
tdb
->
stats
->
size
=
sizeof
(
attr
->
stats
);
break
;
default:
default:
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
"tdb_open: unknown attribute type %u
\n
"
,
"tdb_open: unknown attribute type %u"
,
attr
->
base
.
attr
);
attr
->
base
.
attr
);
errno
=
EINVAL
;
goto
fail
;
goto
fail
;
}
}
attr
=
attr
->
base
.
next
;
attr
=
attr
->
base
.
next
;
}
}
if
((
open_flags
&
O_ACCMODE
)
==
O_WRONLY
)
{
if
((
open_flags
&
O_ACCMODE
)
==
O_WRONLY
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
"tdb_open: can't open tdb %s write-only
\n
"
,
name
);
"tdb_open: can't open tdb %s write-only"
,
name
);
errno
=
EINVAL
;
goto
fail
;
goto
fail
;
}
}
...
@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
if
(
tdb
->
flags
&
TDB_INTERNAL
)
{
tdb
->
flags
|=
(
TDB_NOLOCK
|
TDB_NOMMAP
);
tdb
->
flags
|=
(
TDB_NOLOCK
|
TDB_NOMMAP
);
if
(
tdb_new_database
(
tdb
,
seed
,
&
hdr
)
!=
0
)
{
if
(
tdb_new_database
(
tdb
,
seed
,
&
hdr
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_open: tdb_new_database failed!"
);
goto
fail
;
goto
fail
;
}
}
tdb_convert
(
tdb
,
&
hdr
.
hash_seed
,
sizeof
(
hdr
.
hash_seed
));
tdb_convert
(
tdb
,
&
hdr
.
hash_seed
,
sizeof
(
hdr
.
hash_seed
));
tdb
->
hash_seed
=
hdr
.
hash_seed
;
tdb
->
hash_seed
=
hdr
.
hash_seed
;
tdb_f
list
_init
(
tdb
);
tdb_f
table
_init
(
tdb
);
return
tdb
;
return
tdb
;
}
}
if
((
tdb
->
fd
=
open
(
name
,
open_flags
,
mode
))
==
-
1
)
{
if
((
tdb
->
fd
=
open
(
name
,
open_flags
,
mode
))
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_WARNING
,
tdb
->
log_priv
,
/* errno set by open(2) */
"tdb_open: could not open file %s: %s
\n
"
,
saved_errno
=
errno
;
name
,
strerror
(
errno
));
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
goto
fail
;
/* errno set by open(2) */
"tdb_open: could not open file %s: %s"
,
name
,
strerror
(
errno
));
goto
fail
;
}
}
/* on exec, don't inherit the fd */
/* on exec, don't inherit the fd */
...
@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* ensure there is only one process initialising at once */
/* ensure there is only one process initialising at once */
if
(
tdb_lock_open
(
tdb
,
TDB_LOCK_WAIT
|
TDB_LOCK_NOCHECK
)
==
-
1
)
{
if
(
tdb_lock_open
(
tdb
,
TDB_LOCK_WAIT
|
TDB_LOCK_NOCHECK
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
/* errno set by tdb_brlock */
"tdb_open: failed to get open lock on %s: %s
\n
"
,
saved_errno
=
errno
;
name
,
strerror
(
errno
));
goto
fail
;
goto
fail
;
/* errno set by tdb_brlock */
}
}
if
(
!
tdb_pread_all
(
tdb
->
fd
,
&
hdr
,
sizeof
(
hdr
),
0
)
if
(
!
tdb_pread_all
(
tdb
->
fd
,
&
hdr
,
sizeof
(
hdr
),
0
)
||
strcmp
(
hdr
.
magic_food
,
TDB_MAGIC_FOOD
)
!=
0
)
{
||
strcmp
(
hdr
.
magic_food
,
TDB_MAGIC_FOOD
)
!=
0
)
{
if
(
!
(
open_flags
&
O_CREAT
)
if
(
!
(
open_flags
&
O_CREAT
))
{
||
tdb_new_database
(
tdb
,
seed
,
&
hdr
)
==
-
1
)
{
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
if
(
errno
==
0
)
{
"tdb_open: %s is not a tdb file"
,
name
);
errno
=
EIO
;
/* ie bad format or something */
goto
fail
;
}
}
if
(
tdb_new_database
(
tdb
,
seed
,
&
hdr
)
==
-
1
)
{
goto
fail
;
goto
fail
;
}
}
}
else
if
(
hdr
.
version
!=
TDB_VERSION
)
{
}
else
if
(
hdr
.
version
!=
TDB_VERSION
)
{
...
@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb
->
flags
|=
TDB_CONVERT
;
tdb
->
flags
|=
TDB_CONVERT
;
else
{
else
{
/* wrong version */
/* wrong version */
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
"tdb_open: %s is unknown version 0x%llx
\n
"
,
"tdb_open: %s is unknown version 0x%llx"
,
name
,
(
long
long
)
hdr
.
version
);
name
,
(
long
long
)
hdr
.
version
);
errno
=
EIO
;
goto
fail
;
goto
fail
;
}
}
}
}
...
@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
hash_test
=
tdb_hash
(
tdb
,
&
hash_test
,
sizeof
(
hash_test
));
hash_test
=
tdb_hash
(
tdb
,
&
hash_test
,
sizeof
(
hash_test
));
if
(
hdr
.
hash_test
!=
hash_test
)
{
if
(
hdr
.
hash_test
!=
hash_test
)
{
/* wrong hash variant */
/* wrong hash variant */
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
"tdb_open: %s uses a different hash function
\n
"
,
"tdb_open: %s uses a different hash function"
,
name
);
name
);
errno
=
EIO
;
goto
fail
;
goto
fail
;
}
}
if
(
fstat
(
tdb
->
fd
,
&
st
)
==
-
1
)
if
(
fstat
(
tdb
->
fd
,
&
st
)
==
-
1
)
{
saved_errno
=
errno
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
"tdb_open: could not stat open %s: %s"
,
name
,
strerror
(
errno
));
goto
fail
;
goto
fail
;
}
/* Is it already in the open list? If so, fail. */
/* Is it already in the open list? If so, fail. */
if
(
tdb_already_open
(
st
.
st_dev
,
st
.
st_ino
))
{
if
(
tdb_already_open
(
st
.
st_dev
,
st
.
st_ino
))
{
/* FIXME */
/* FIXME */
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_NESTING
,
TDB_DEBUG_ERROR
,
"tdb_open: %s (%d,%d) is already open in this process
\n
"
,
"tdb_open: %s (%d,%d) is already open in this"
name
,
(
int
)
st
.
st_dev
,
(
int
)
st
.
st_ino
);
" process"
,
errno
=
EBUSY
;
name
,
(
int
)
st
.
st_dev
,
(
int
)
st
.
st_ino
)
;
goto
fail
;
goto
fail
;
}
}
tdb
->
name
=
strdup
(
name
);
tdb
->
name
=
strdup
(
name
);
if
(
!
tdb
->
name
)
{
if
(
!
tdb
->
name
)
{
errno
=
ENOMEM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_ERROR
,
"tdb_open: failed to allocate name"
);
goto
fail
;
goto
fail
;
}
}
...
@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
/* Now it's fully formed, recover if necessary. */
/* Now it's fully formed, recover if necessary. */
if
(
tdb_needs_recovery
(
tdb
)
&&
tdb_lock_and_recover
(
tdb
)
==
-
1
)
{
if
(
tdb_needs_recovery
(
tdb
)
&&
tdb_lock_and_recover
(
tdb
)
==
-
1
)
{
errno
=
EIO
;
goto
fail
;
goto
fail
;
}
}
if
(
tdb_f
list
_init
(
tdb
)
==
-
1
)
if
(
tdb_f
table
_init
(
tdb
)
==
-
1
)
goto
fail
;
goto
fail
;
tdb
->
next
=
tdbs
;
tdb
->
next
=
tdbs
;
...
@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
return
tdb
;
return
tdb
;
fail:
fail:
save_errno
=
errno
;
/* Map ecode to some logical errno. */
if
(
!
saved_errno
)
{
if
(
!
tdb
)
switch
(
tdb
->
ecode
)
{
return
NULL
;
case
TDB_ERR_CORRUPT
:
case
TDB_ERR_IO
:
saved_errno
=
EIO
;
break
;
case
TDB_ERR_LOCK
:
saved_errno
=
EWOULDBLOCK
;
break
;
case
TDB_ERR_OOM
:
saved_errno
=
ENOMEM
;
break
;
case
TDB_ERR_EINVAL
:
saved_errno
=
EINVAL
;
break
;
case
TDB_ERR_NESTING
:
saved_errno
=
EBUSY
;
break
;
default:
saved_errno
=
EINVAL
;
break
;
}
}
#ifdef TDB_TRACE
#ifdef TDB_TRACE
close
(
tdb
->
tracefd
);
close
(
tdb
->
tracefd
);
...
@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
...
@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
free
((
char
*
)
tdb
->
name
);
free
((
char
*
)
tdb
->
name
);
if
(
tdb
->
fd
!=
-
1
)
if
(
tdb
->
fd
!=
-
1
)
if
(
close
(
tdb
->
fd
)
!=
0
)
if
(
close
(
tdb
->
fd
)
!=
0
)
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
"tdb_open: failed to close tdb->fd"
"tdb_open: failed to close tdb->fd"
" on error!
\n
"
);
" on error!
"
);
free
(
tdb
);
free
(
tdb
);
errno
=
save_errno
;
errno
=
save
d
_errno
;
return
NULL
;
return
NULL
;
}
}
/* FIXME: modify, don't rewrite! */
static
int
update_rec_hdr
(
struct
tdb_context
*
tdb
,
static
int
update_rec_hdr
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
tdb_off_t
off
,
tdb_len_t
keylen
,
tdb_len_t
keylen
,
...
@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
...
@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
{
{
uint64_t
dataroom
=
rec_data_length
(
rec
)
+
rec_extra_padding
(
rec
);
uint64_t
dataroom
=
rec_data_length
(
rec
)
+
rec_extra_padding
(
rec
);
if
(
set_header
(
tdb
,
rec
,
keylen
,
datalen
,
keylen
+
dataroom
,
h
))
if
(
set_header
(
tdb
,
rec
,
TDB_USED_MAGIC
,
keylen
,
datalen
,
keylen
+
dataroom
,
h
))
return
-
1
;
return
-
1
;
return
tdb_write_convert
(
tdb
,
off
,
rec
,
sizeof
(
*
rec
));
return
tdb_write_convert
(
tdb
,
off
,
rec
,
sizeof
(
*
rec
));
...
@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
...
@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
tdb_off_t
new_off
;
tdb_off_t
new_off
;
/* Allocate a new record. */
/* Allocate a new record. */
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
->
h
,
growing
);
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
->
h
,
TDB_USED_MAGIC
,
growing
);
if
(
unlikely
(
new_off
==
TDB_OFF_ERR
))
if
(
unlikely
(
new_off
==
TDB_OFF_ERR
))
return
-
1
;
return
-
1
;
/* We didn't like the existing one: remove it. */
/* We didn't like the existing one: remove it. */
if
(
old_off
)
{
if
(
old_off
)
{
add_stat
(
tdb
,
frees
,
1
);
add_free_record
(
tdb
,
old_off
,
add_free_record
(
tdb
,
old_off
,
sizeof
(
struct
tdb_used_record
)
sizeof
(
struct
tdb_used_record
)
+
key
.
dsize
+
old_room
);
+
key
.
dsize
+
old_room
);
...
@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
...
@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
h
.
hlock_range
,
F_WRLCK
);
h
.
hlock_range
,
F_WRLCK
);
return
0
;
return
0
;
}
}
/* FIXME: See if right record is free? */
}
else
{
}
else
{
if
(
flag
==
TDB_MODIFY
)
{
if
(
flag
==
TDB_MODIFY
)
{
/* if the record doesn't exist and we
/* if the record doesn't exist and we
...
@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
...
@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
F_WRLCK
);
F_WRLCK
);
return
0
;
return
0
;
}
}
/* FIXME: Check right record free? */
/* Slow path. */
/* Slow path. */
newdata
=
malloc
(
key
.
dsize
+
old_dlen
+
dbuf
.
dsize
);
newdata
=
malloc
(
key
.
dsize
+
old_dlen
+
dbuf
.
dsize
);
if
(
!
newdata
)
{
if
(
!
newdata
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_append: failed to allocate %zu bytes"
,
"tdb_append: cannot allocate %llu bytes!
\n
"
,
(
size_t
)(
key
.
dsize
+
old_dlen
+
dbuf
.
dsize
));
(
long
long
)
key
.
dsize
+
old_dlen
+
dbuf
.
dsize
);
goto
fail
;
goto
fail
;
}
}
if
(
tdb
->
methods
->
read
(
tdb
,
off
+
sizeof
(
rec
)
+
key
.
dsize
,
if
(
tdb
->
methods
->
read
(
tdb
,
off
+
sizeof
(
rec
)
+
key
.
dsize
,
...
@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
...
@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
goto
unlock_err
;
goto
unlock_err
;
/* Free the deleted entry. */
/* Free the deleted entry. */
add_stat
(
tdb
,
frees
,
1
);
if
(
add_free_record
(
tdb
,
off
,
if
(
add_free_record
(
tdb
,
off
,
sizeof
(
struct
tdb_used_record
)
sizeof
(
struct
tdb_used_record
)
+
rec_key_length
(
&
rec
)
+
rec_key_length
(
&
rec
)
...
@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
...
@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
struct
tdb_context
**
i
;
struct
tdb_context
**
i
;
int
ret
=
0
;
int
ret
=
0
;
/* FIXME:
tdb_trace
(
tdb
,
"tdb_close"
);
if
(
tdb
->
transaction
)
{
if
(
tdb
->
transaction
)
{
tdb_transaction_cancel
(
tdb
);
tdb_transaction_cancel
(
tdb
);
}
}
*/
tdb_trace
(
tdb
,
"tdb_close"
);
if
(
tdb
->
map_ptr
)
{
if
(
tdb
->
map_ptr
)
{
if
(
tdb
->
flags
&
TDB_INTERNAL
)
if
(
tdb
->
flags
&
TDB_INTERNAL
)
...
@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
...
@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
return
ret
;
return
ret
;
}
}
enum
TDB_ERROR
tdb_error
(
struct
tdb_context
*
tdb
)
enum
TDB_ERROR
tdb_error
(
const
struct
tdb_context
*
tdb
)
{
{
return
tdb
->
ecode
;
return
tdb
->
ecode
;
}
}
const
char
*
tdb_errorstr
(
struct
tdb_context
*
tdb
)
const
char
*
tdb_errorstr
(
const
struct
tdb_context
*
tdb
)
{
{
/* Gcc warns if you miss a case in the switch, so use that. */
/* Gcc warns if you miss a case in the switch, so use that. */
switch
(
tdb
->
ecode
)
{
switch
(
tdb
->
ecode
)
{
...
@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
...
@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
}
}
return
"Invalid error code"
;
return
"Invalid error code"
;
}
}
void
COLD
tdb_logerr
(
struct
tdb_context
*
tdb
,
enum
TDB_ERROR
ecode
,
enum
tdb_debug_level
level
,
const
char
*
fmt
,
...)
{
char
*
message
;
va_list
ap
;
size_t
len
;
/* tdb_open paths care about errno, so save it. */
int
saved_errno
=
errno
;
tdb
->
ecode
=
ecode
;
if
(
!
tdb
->
logfn
)
return
;
/* FIXME: Doesn't assume asprintf. */
va_start
(
ap
,
fmt
);
len
=
vsnprintf
(
NULL
,
0
,
fmt
,
ap
);
va_end
(
ap
);
message
=
malloc
(
len
+
1
);
if
(
!
message
)
{
tdb
->
logfn
(
tdb
,
level
,
tdb
->
log_private
,
"out of memory formatting message"
);
return
;
}
va_start
(
ap
,
fmt
);
len
=
vsprintf
(
message
,
fmt
,
ap
);
va_end
(
ap
);
tdb
->
logfn
(
tdb
,
level
,
tdb
->
log_private
,
message
);
free
(
message
);
errno
=
saved_errno
;
}
ccan/tdb2/tdb2.h
View file @
51a56b52
...
@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
...
@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
/* flags for tdb_summary. Logical or to combine. */
/* flags for tdb_summary. Logical or to combine. */
enum
tdb_summary_flags
{
TDB_SUMMARY_HISTOGRAMS
=
1
};
enum
tdb_summary_flags
{
TDB_SUMMARY_HISTOGRAMS
=
1
};
/*
debu
gging uses one of the following levels */
/*
lo
gging uses one of the following levels */
enum
tdb_debug_level
{
TDB_DEBUG_FATAL
=
0
,
TDB_DEBUG_ERROR
,
enum
tdb_debug_level
{
TDB_DEBUG_FATAL
=
0
,
TDB_DEBUG_ERROR
,
TDB_DEBUG_WARNING
,
TDB_DEBUG_TRACE
};
TDB_DEBUG_WARNING
,
TDB_DEBUG_TRACE
};
...
@@ -80,14 +80,15 @@ struct tdb_context;
...
@@ -80,14 +80,15 @@ struct tdb_context;
/* FIXME: Make typesafe */
/* FIXME: Make typesafe */
typedef
int
(
*
tdb_traverse_func
)(
struct
tdb_context
*
,
TDB_DATA
,
TDB_DATA
,
void
*
);
typedef
int
(
*
tdb_traverse_func
)(
struct
tdb_context
*
,
TDB_DATA
,
TDB_DATA
,
void
*
);
typedef
void
(
*
tdb_logfn_t
)(
struct
tdb_context
*
,
enum
tdb_debug_level
,
void
*
priv
,
const
char
*
,
...)
PRINTF_FMT
(
4
,
5
);
typedef
void
(
*
tdb_logfn_t
)(
struct
tdb_context
*
,
enum
tdb_debug_level
,
void
*
,
const
char
*
);
typedef
uint64_t
(
*
tdb_hashfn_t
)(
const
void
*
key
,
size_t
len
,
uint64_t
seed
,
typedef
uint64_t
(
*
tdb_hashfn_t
)(
const
void
*
key
,
size_t
len
,
uint64_t
seed
,
void
*
priv
);
void
*
priv
);
enum
tdb_attribute_type
{
enum
tdb_attribute_type
{
TDB_ATTRIBUTE_LOG
=
0
,
TDB_ATTRIBUTE_LOG
=
0
,
TDB_ATTRIBUTE_HASH
=
1
,
TDB_ATTRIBUTE_HASH
=
1
,
TDB_ATTRIBUTE_SEED
=
2
TDB_ATTRIBUTE_SEED
=
2
,
TDB_ATTRIBUTE_STATS
=
3
};
};
struct
tdb_attribute_base
{
struct
tdb_attribute_base
{
...
@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
...
@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
uint64_t
seed
;
uint64_t
seed
;
};
};
struct
tdb_attribute_stats
{
struct
tdb_attribute_base
base
;
/* .attr = TDB_ATTRIBUTE_STATS */
size_t
size
;
/* = sizeof(struct tdb_attribute_stats) */
uint64_t
allocs
;
uint64_t
alloc_subhash
;
uint64_t
alloc_chain
;
uint64_t
alloc_bucket_exact
;
uint64_t
alloc_bucket_max
;
uint64_t
alloc_leftover
;
uint64_t
alloc_coalesce_tried
;
uint64_t
alloc_coalesce_lockfail
;
uint64_t
alloc_coalesce_race
;
uint64_t
alloc_coalesce_succeeded
;
uint64_t
alloc_coalesce_num_merged
;
uint64_t
compares
;
uint64_t
compare_wrong_bucket
;
uint64_t
compare_wrong_offsetbits
;
uint64_t
compare_wrong_keylen
;
uint64_t
compare_wrong_rechash
;
uint64_t
compare_wrong_keycmp
;
uint64_t
expands
;
uint64_t
frees
;
uint64_t
locks
;
uint64_t
lock_lowlevel
;
uint64_t
lock_nonblock
;
};
union
tdb_attribute
{
union
tdb_attribute
{
struct
tdb_attribute_base
base
;
struct
tdb_attribute_base
base
;
struct
tdb_attribute_log
log
;
struct
tdb_attribute_log
log
;
struct
tdb_attribute_hash
hash
;
struct
tdb_attribute_hash
hash
;
struct
tdb_attribute_seed
seed
;
struct
tdb_attribute_seed
seed
;
struct
tdb_attribute_stats
stats
;
};
};
struct
tdb_context
*
tdb_open
(
const
char
*
name
,
int
tdb_flags
,
struct
tdb_context
*
tdb_open
(
const
char
*
name
,
int
tdb_flags
,
...
@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
...
@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
int
(
*
check
)(
TDB_DATA
key
,
TDB_DATA
data
,
void
*
private_data
),
int
(
*
check
)(
TDB_DATA
key
,
TDB_DATA
data
,
void
*
private_data
),
void
*
private_data
);
void
*
private_data
);
enum
TDB_ERROR
tdb_error
(
struct
tdb_context
*
tdb
);
enum
TDB_ERROR
tdb_error
(
const
struct
tdb_context
*
tdb
);
const
char
*
tdb_errorstr
(
struct
tdb_context
*
tdb
);
const
char
*
tdb_errorstr
(
const
struct
tdb_context
*
tdb
);
int
tdb_transaction_start
(
struct
tdb_context
*
tdb
);
int
tdb_transaction_start
(
struct
tdb_context
*
tdb
);
void
tdb_transaction_cancel
(
struct
tdb_context
*
tdb
);
void
tdb_transaction_cancel
(
struct
tdb_context
*
tdb
);
...
...
ccan/tdb2/test/layout.c
View file @
51a56b52
...
@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
...
@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
layout
->
elem
[
layout
->
num_elems
++
]
=
elem
;
layout
->
elem
[
layout
->
num_elems
++
]
=
elem
;
}
}
void
tdb_layout_add_free
list
(
struct
tdb_layout
*
layout
)
void
tdb_layout_add_free
table
(
struct
tdb_layout
*
layout
)
{
{
union
tdb_layout_elem
elem
;
union
tdb_layout_elem
elem
;
elem
.
base
.
type
=
FREE
LIST
;
elem
.
base
.
type
=
FREE
TABLE
;
add
(
layout
,
elem
);
add
(
layout
,
elem
);
}
}
void
tdb_layout_add_free
(
struct
tdb_layout
*
layout
,
tdb_len_t
len
,
void
tdb_layout_add_free
(
struct
tdb_layout
*
layout
,
tdb_len_t
len
,
unsigned
f
list
)
unsigned
f
table
)
{
{
union
tdb_layout_elem
elem
;
union
tdb_layout_elem
elem
;
elem
.
base
.
type
=
FREE
;
elem
.
base
.
type
=
FREE
;
elem
.
free
.
len
=
len
;
elem
.
free
.
len
=
len
;
elem
.
free
.
f
list_num
=
flist
;
elem
.
free
.
f
table_num
=
ftable
;
add
(
layout
,
elem
);
add
(
layout
,
elem
);
}
}
...
@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
...
@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+
htable
->
extra
;
+
htable
->
extra
;
}
}
static
tdb_len_t
free
list_len
(
struct
tle_freelist
*
flist
)
static
tdb_len_t
free
table_len
(
struct
tle_freetable
*
ftable
)
{
{
return
sizeof
(
struct
tdb_free
list
);
return
sizeof
(
struct
tdb_free
table
);
}
}
static
void
set_free_record
(
void
*
mem
,
tdb_len_t
len
)
static
void
set_free_record
(
void
*
mem
,
tdb_len_t
len
)
...
@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
...
@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
{
{
struct
tdb_used_record
*
u
=
mem
;
struct
tdb_used_record
*
u
=
mem
;
set_header
(
tdb
,
u
,
used
->
key
.
dsize
,
used
->
data
.
dsize
,
set_header
(
tdb
,
u
,
TDB_USED_MAGIC
,
used
->
key
.
dsize
,
used
->
data
.
dsize
,
used
->
key
.
dsize
+
used
->
data
.
dsize
+
used
->
extra
,
used
->
key
.
dsize
+
used
->
data
.
dsize
+
used
->
extra
,
tdb_hash
(
tdb
,
used
->
key
.
dptr
,
used
->
key
.
dsize
));
tdb_hash
(
tdb
,
used
->
key
.
dptr
,
used
->
key
.
dsize
));
memcpy
(
u
+
1
,
used
->
key
.
dptr
,
used
->
key
.
dsize
);
memcpy
(
u
+
1
,
used
->
key
.
dptr
,
used
->
key
.
dsize
);
...
@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
...
@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
struct
tdb_used_record
*
u
=
mem
;
struct
tdb_used_record
*
u
=
mem
;
tdb_len_t
len
=
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
;
tdb_len_t
len
=
sizeof
(
tdb_off_t
)
<<
TDB_SUBLEVEL_HASH_BITS
;
set_header
(
tdb
,
u
,
0
,
len
,
len
+
htable
->
extra
,
0
);
set_header
(
tdb
,
u
,
TDB_HTABLE_MAGIC
,
0
,
len
,
len
+
htable
->
extra
,
0
);
memset
(
u
+
1
,
0
,
len
);
memset
(
u
+
1
,
0
,
len
);
}
}
static
void
set_free
list
(
void
*
mem
,
struct
tdb_context
*
tdb
,
static
void
set_free
table
(
void
*
mem
,
struct
tdb_context
*
tdb
,
struct
tle_free
list
*
freelist
,
struct
tdb_header
*
hdr
,
struct
tle_free
table
*
freetable
,
struct
tdb_header
*
hdr
,
tdb_off_t
last_f
list
)
tdb_off_t
last_f
table
)
{
{
struct
tdb_free
list
*
flist
=
mem
;
struct
tdb_free
table
*
ftable
=
mem
;
memset
(
f
list
,
0
,
sizeof
(
*
flist
));
memset
(
f
table
,
0
,
sizeof
(
*
ftable
));
set_header
(
tdb
,
&
f
list
->
hdr
,
0
,
set_header
(
tdb
,
&
f
table
->
hdr
,
TDB_FTABLE_MAGIC
,
0
,
sizeof
(
*
flist
)
-
sizeof
(
flist
->
hdr
),
sizeof
(
*
ftable
)
-
sizeof
(
ftable
->
hdr
),
sizeof
(
*
flist
)
-
sizeof
(
flist
->
hdr
),
1
);
sizeof
(
*
ftable
)
-
sizeof
(
ftable
->
hdr
),
0
);
if
(
last_f
list
)
{
if
(
last_f
table
)
{
f
list
=
(
struct
tdb_freelist
*
)((
char
*
)
hdr
+
last_flist
);
f
table
=
(
struct
tdb_freetable
*
)((
char
*
)
hdr
+
last_ftable
);
f
list
->
next
=
freelist
->
base
.
off
;
f
table
->
next
=
freetable
->
base
.
off
;
}
else
{
}
else
{
hdr
->
free_
list
=
freelist
->
base
.
off
;
hdr
->
free_
table
=
freetable
->
base
.
off
;
}
}
}
}
static
void
add_to_freetable
(
struct
tdb_context
*
tdb
,
static
void
add_to_freetable
(
struct
tdb_context
*
tdb
,
tdb_off_t
eoff
,
tdb_off_t
eoff
,
tdb_off_t
elen
,
tdb_off_t
elen
,
struct
tle_freelist
*
freelist
)
unsigned
ftable
,
struct
tle_freetable
*
freetable
)
{
{
tdb
->
flist_off
=
freelist
->
base
.
off
;
tdb
->
ftable_off
=
freetable
->
base
.
off
;
tdb
->
ftable
=
ftable
;
add_free_record
(
tdb
,
eoff
,
sizeof
(
struct
tdb_used_record
)
+
elen
);
add_free_record
(
tdb
,
eoff
,
sizeof
(
struct
tdb_used_record
)
+
elen
);
}
}
...
@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
...
@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
abort
();
abort
();
}
}
static
struct
tle_free
list
*
find_flist
(
struct
tdb_layout
*
layout
,
unsigned
num
)
static
struct
tle_free
table
*
find_ftable
(
struct
tdb_layout
*
layout
,
unsigned
num
)
{
{
unsigned
i
;
unsigned
i
;
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
if
(
layout
->
elem
[
i
].
base
.
type
!=
FREE
LIST
)
if
(
layout
->
elem
[
i
].
base
.
type
!=
FREE
TABLE
)
continue
;
continue
;
if
(
num
==
0
)
if
(
num
==
0
)
return
&
layout
->
elem
[
i
].
f
list
;
return
&
layout
->
elem
[
i
].
f
table
;
num
--
;
num
--
;
}
}
abort
();
abort
();
...
@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
...
@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
struct
tdb_context
*
tdb_layout_get
(
struct
tdb_layout
*
layout
)
struct
tdb_context
*
tdb_layout_get
(
struct
tdb_layout
*
layout
)
{
{
unsigned
int
i
;
unsigned
int
i
;
tdb_off_t
off
,
len
,
last_f
list
;
tdb_off_t
off
,
len
,
last_f
table
;
char
*
mem
;
char
*
mem
;
struct
tdb_context
*
tdb
;
struct
tdb_context
*
tdb
;
...
@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
...
@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
union
tdb_layout_elem
*
e
=
&
layout
->
elem
[
i
];
union
tdb_layout_elem
*
e
=
&
layout
->
elem
[
i
];
e
->
base
.
off
=
off
;
e
->
base
.
off
=
off
;
switch
(
e
->
base
.
type
)
{
switch
(
e
->
base
.
type
)
{
case
FREE
LIST
:
case
FREE
TABLE
:
len
=
free
list_len
(
&
e
->
flist
);
len
=
free
table_len
(
&
e
->
ftable
);
break
;
break
;
case
FREE
:
case
FREE
:
len
=
free_record_len
(
e
->
free
.
len
);
len
=
free_record_len
(
e
->
free
.
len
);
...
@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
...
@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
tdb
->
map_ptr
=
mem
;
tdb
->
map_ptr
=
mem
;
tdb
->
map_size
=
off
;
tdb
->
map_size
=
off
;
last_f
list
=
0
;
last_f
table
=
0
;
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
union
tdb_layout_elem
*
e
=
&
layout
->
elem
[
i
];
union
tdb_layout_elem
*
e
=
&
layout
->
elem
[
i
];
switch
(
e
->
base
.
type
)
{
switch
(
e
->
base
.
type
)
{
case
FREE
LIST
:
case
FREE
TABLE
:
set_free
list
(
mem
+
e
->
base
.
off
,
tdb
,
&
e
->
flist
,
set_free
table
(
mem
+
e
->
base
.
off
,
tdb
,
&
e
->
ftable
,
(
struct
tdb_header
*
)
mem
,
last_f
list
);
(
struct
tdb_header
*
)
mem
,
last_f
table
);
last_f
list
=
e
->
base
.
off
;
last_f
table
=
e
->
base
.
off
;
break
;
break
;
case
FREE
:
case
FREE
:
set_free_record
(
mem
+
e
->
base
.
off
,
e
->
free
.
len
);
set_free_record
(
mem
+
e
->
base
.
off
,
e
->
free
.
len
);
...
@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
...
@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
break
;
break
;
}
}
}
}
/* Must have a free
list
! */
/* Must have a free
table
! */
assert
(
last_f
list
);
assert
(
last_f
table
);
/* Now fill the free and hash tables. */
/* Now fill the free and hash tables. */
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
for
(
i
=
0
;
i
<
layout
->
num_elems
;
i
++
)
{
...
@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
...
@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch
(
e
->
base
.
type
)
{
switch
(
e
->
base
.
type
)
{
case
FREE
:
case
FREE
:
add_to_freetable
(
tdb
,
e
->
base
.
off
,
e
->
free
.
len
,
add_to_freetable
(
tdb
,
e
->
base
.
off
,
e
->
free
.
len
,
find_flist
(
layout
,
e
->
free
.
flist_num
));
e
->
free
.
ftable_num
,
find_ftable
(
layout
,
e
->
free
.
ftable_num
));
break
;
break
;
case
DATA
:
case
DATA
:
add_to_hashtable
(
tdb
,
e
->
base
.
off
,
e
->
used
.
key
);
add_to_hashtable
(
tdb
,
e
->
base
.
off
,
e
->
used
.
key
);
...
@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
...
@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
}
}
}
}
tdb
->
f
list_off
=
find_flist
(
layout
,
0
)
->
base
.
off
;
tdb
->
f
table_off
=
find_ftable
(
layout
,
0
)
->
base
.
off
;
/* Get physical if they asked for it. */
/* Get physical if they asked for it. */
if
(
layout
->
filename
)
{
if
(
layout
->
filename
)
{
...
...
ccan/tdb2/test/layout.h
View file @
51a56b52
...
@@ -3,9 +3,9 @@
...
@@ -3,9 +3,9 @@
#include <ccan/tdb2/private.h>
#include <ccan/tdb2/private.h>
struct
tdb_layout
*
new_tdb_layout
(
const
char
*
filename
);
struct
tdb_layout
*
new_tdb_layout
(
const
char
*
filename
);
void
tdb_layout_add_free
list
(
struct
tdb_layout
*
layout
);
void
tdb_layout_add_free
table
(
struct
tdb_layout
*
layout
);
void
tdb_layout_add_free
(
struct
tdb_layout
*
layout
,
tdb_len_t
len
,
void
tdb_layout_add_free
(
struct
tdb_layout
*
layout
,
tdb_len_t
len
,
unsigned
f
list
);
unsigned
f
table
);
void
tdb_layout_add_used
(
struct
tdb_layout
*
layout
,
void
tdb_layout_add_used
(
struct
tdb_layout
*
layout
,
TDB_DATA
key
,
TDB_DATA
data
,
TDB_DATA
key
,
TDB_DATA
data
,
tdb_len_t
extra
);
tdb_len_t
extra
);
...
@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
...
@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
struct
tdb_context
*
tdb_layout_get
(
struct
tdb_layout
*
layout
);
struct
tdb_context
*
tdb_layout_get
(
struct
tdb_layout
*
layout
);
enum
layout_type
{
enum
layout_type
{
FREE
LIST
,
FREE
,
DATA
,
HASHTABLE
,
FREE
TABLE
,
FREE
,
DATA
,
HASHTABLE
,
};
};
/* Shared by all union members. */
/* Shared by all union members. */
...
@@ -27,14 +27,14 @@ struct tle_base {
...
@@ -27,14 +27,14 @@ struct tle_base {
tdb_off_t
off
;
tdb_off_t
off
;
};
};
struct
tle_free
list
{
struct
tle_free
table
{
struct
tle_base
base
;
struct
tle_base
base
;
};
};
struct
tle_free
{
struct
tle_free
{
struct
tle_base
base
;
struct
tle_base
base
;
tdb_len_t
len
;
tdb_len_t
len
;
unsigned
f
list
_num
;
unsigned
f
table
_num
;
};
};
struct
tle_used
{
struct
tle_used
{
...
@@ -53,7 +53,7 @@ struct tle_hashtable {
...
@@ -53,7 +53,7 @@ struct tle_hashtable {
union
tdb_layout_elem
{
union
tdb_layout_elem
{
struct
tle_base
base
;
struct
tle_base
base
;
struct
tle_free
list
flist
;
struct
tle_free
table
ftable
;
struct
tle_free
free
;
struct
tle_free
free
;
struct
tle_used
used
;
struct
tle_used
used
;
struct
tle_hashtable
hashtable
;
struct
tle_hashtable
hashtable
;
...
...
ccan/tdb2/test/logging.c
View file @
51a56b52
#define _GNU_SOURCE
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ccan/tap/tap.h>
#include <ccan/tap/tap.h>
#include "logging.h"
#include "logging.h"
...
@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
...
@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
void
tap_log_fn
(
struct
tdb_context
*
tdb
,
void
tap_log_fn
(
struct
tdb_context
*
tdb
,
enum
tdb_debug_level
level
,
void
*
priv
,
enum
tdb_debug_level
level
,
void
*
priv
,
const
char
*
fmt
,
...
)
const
char
*
message
)
{
{
va_list
ap
;
char
*
p
;
if
(
suppress_logging
)
if
(
suppress_logging
)
return
;
return
;
va_start
(
ap
,
fmt
);
diag
(
"tdb log level %u: %s%s"
,
level
,
log_prefix
,
message
);
if
(
vasprintf
(
&
p
,
fmt
,
ap
)
==
-
1
)
abort
();
/* Strip trailing \n: diag adds it. */
if
(
p
[
strlen
(
p
)
-
1
]
==
'\n'
)
p
[
strlen
(
p
)
-
1
]
=
'\0'
;
diag
(
"tdb log level %u: %s%s"
,
level
,
log_prefix
,
p
);
free
(
p
);
if
(
level
!=
TDB_DEBUG_TRACE
)
if
(
level
!=
TDB_DEBUG_TRACE
)
tap_log_messages
++
;
tap_log_messages
++
;
va_end
(
ap
);
}
}
ccan/tdb2/test/logging.h
View file @
51a56b52
...
@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
...
@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
void
tap_log_fn
(
struct
tdb_context
*
tdb
,
void
tap_log_fn
(
struct
tdb_context
*
tdb
,
enum
tdb_debug_level
level
,
void
*
priv
,
enum
tdb_debug_level
level
,
void
*
priv
,
const
char
*
fmt
,
...
);
const
char
*
message
);
static
inline
bool
data_equal
(
struct
tdb_data
a
,
struct
tdb_data
b
)
static
inline
bool
data_equal
(
struct
tdb_data
a
,
struct
tdb_data
b
)
{
{
...
...
ccan/tdb2/test/run-001-encode.c
View file @
51a56b52
...
@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
...
@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
{
{
unsigned
int
i
;
unsigned
int
i
;
struct
tdb_used_record
rec
;
struct
tdb_used_record
rec
;
struct
tdb_context
tdb
=
{
.
log
=
tap_log_fn
,
.
log_priv
=
NULL
};
struct
tdb_context
tdb
=
{
.
log
fn
=
tap_log_fn
};
plan_tests
(
64
+
32
+
48
*
6
+
1
);
plan_tests
(
64
+
32
+
48
*
6
+
1
);
/* We should be able to encode any data value. */
/* We should be able to encode any data value. */
for
(
i
=
0
;
i
<
64
;
i
++
)
for
(
i
=
0
;
i
<
64
;
i
++
)
ok1
(
set_header
(
&
tdb
,
&
rec
,
0
,
1ULL
<<
i
,
1ULL
<<
i
,
0
)
==
0
);
ok1
(
set_header
(
&
tdb
,
&
rec
,
TDB_USED_MAGIC
,
0
,
1ULL
<<
i
,
1ULL
<<
i
,
0
)
==
0
);
/* And any key and data with < 64 bits between them. */
/* And any key and data with < 64 bits between them. */
for
(
i
=
0
;
i
<
32
;
i
++
)
{
for
(
i
=
0
;
i
<
32
;
i
++
)
{
tdb_len_t
dlen
=
1ULL
>>
(
63
-
i
),
klen
=
1ULL
<<
i
;
tdb_len_t
dlen
=
1ULL
>>
(
63
-
i
),
klen
=
1ULL
<<
i
;
ok1
(
set_header
(
&
tdb
,
&
rec
,
klen
,
dlen
,
klen
+
dlen
,
0
)
==
0
);
ok1
(
set_header
(
&
tdb
,
&
rec
,
TDB_USED_MAGIC
,
klen
,
dlen
,
klen
+
dlen
,
0
)
==
0
);
}
}
/* We should neatly encode all values. */
/* We should neatly encode all values. */
...
@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
...
@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
uint64_t
klen
=
1ULL
<<
(
i
<
16
?
i
:
15
);
uint64_t
klen
=
1ULL
<<
(
i
<
16
?
i
:
15
);
uint64_t
dlen
=
1ULL
<<
i
;
uint64_t
dlen
=
1ULL
<<
i
;
uint64_t
xlen
=
1ULL
<<
(
i
<
32
?
i
:
31
);
uint64_t
xlen
=
1ULL
<<
(
i
<
32
?
i
:
31
);
ok1
(
set_header
(
&
tdb
,
&
rec
,
klen
,
dlen
,
klen
+
dlen
+
xlen
,
h
)
ok1
(
set_header
(
&
tdb
,
&
rec
,
TDB_USED_MAGIC
,
klen
,
dlen
,
==
0
);
klen
+
dlen
+
xlen
,
h
)
==
0
);
ok1
(
rec_key_length
(
&
rec
)
==
klen
);
ok1
(
rec_key_length
(
&
rec
)
==
klen
);
ok1
(
rec_data_length
(
&
rec
)
==
dlen
);
ok1
(
rec_data_length
(
&
rec
)
==
dlen
);
ok1
(
rec_extra_padding
(
&
rec
)
==
xlen
);
ok1
(
rec_extra_padding
(
&
rec
)
==
xlen
);
ok1
((
uint64_t
)
rec_hash
(
&
rec
)
==
h
);
ok1
((
uint64_t
)
rec_hash
(
&
rec
)
==
h
);
ok1
(
rec_magic
(
&
rec
)
==
TDB_MAGIC
);
ok1
(
rec_magic
(
&
rec
)
==
TDB_
USED_
MAGIC
);
}
}
ok1
(
tap_log_messages
==
0
);
ok1
(
tap_log_messages
==
0
);
return
exit_status
();
return
exit_status
();
...
...
ccan/tdb2/test/run-03-coalesce.c
View file @
51a56b52
...
@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
...
@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return
TDB_OFF_ERR
;
return
TDB_OFF_ERR
;
if
(
frec_magic
(
&
f
)
!=
TDB_FREE_MAGIC
)
if
(
frec_magic
(
&
f
)
!=
TDB_FREE_MAGIC
)
return
TDB_OFF_ERR
;
return
TDB_OFF_ERR
;
return
f
.
data_len
;
return
f
rec_len
(
&
f
)
;
}
}
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
...
@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
...
@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to EOF */
/* No coalescing can be done due to EOF */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
len
=
1024
;
len
=
1024
;
tdb_layout_add_free
(
layout
,
len
,
0
);
tdb_layout_add_free
(
layout
,
len
,
0
);
tdb
=
tdb_layout_get
(
layout
);
tdb
=
tdb_layout_get
(
layout
);
...
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
...
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
ok1
(
free_record_length
(
tdb
,
layout
->
elem
[
1
].
base
.
off
)
==
len
);
ok1
(
free_record_length
(
tdb
,
layout
->
elem
[
1
].
base
.
off
)
==
len
);
/* Figure out which bucket free entry is. */
/* Figure out which bucket free entry is. */
b_off
=
bucket_off
(
tdb
->
f
list
_off
,
size_to_bucket
(
len
));
b_off
=
bucket_off
(
tdb
->
f
table
_off
,
size_to_bucket
(
len
));
/* Lock and fail to coalesce. */
/* Lock and fail to coalesce. */
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
len
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
len
)
==
0
);
...
@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
...
@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
/* No coalescing can be done due to used record */
/* No coalescing can be done due to used record */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
tdb
=
tdb_layout_get
(
layout
);
tdb
=
tdb_layout_get
(
layout
);
...
@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
...
@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Figure out which bucket free entry is. */
/* Figure out which bucket free entry is. */
b_off
=
bucket_off
(
tdb
->
f
list
_off
,
size_to_bucket
(
1024
));
b_off
=
bucket_off
(
tdb
->
f
table
_off
,
size_to_bucket
(
1024
));
/* Lock and fail to coalesce. */
/* Lock and fail to coalesce. */
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
0
);
...
@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
...
@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then EOF */
/* Coalescing can be done due to two free records, then EOF */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
2048
,
0
);
tdb_layout_add_free
(
layout
,
2048
,
0
);
tdb
=
tdb_layout_get
(
layout
);
tdb
=
tdb_layout_get
(
layout
);
...
@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
...
@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Figure out which bucket (first) free entry is. */
/* Figure out which bucket (first) free entry is. */
b_off
=
bucket_off
(
tdb
->
f
list
_off
,
size_to_bucket
(
1024
));
b_off
=
bucket_off
(
tdb
->
f
table
_off
,
size_to_bucket
(
1024
));
/* Lock and coalesce. */
/* Lock and coalesce. */
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
...
@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
...
@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to two free records, then data */
/* Coalescing can be done due to two free records, then data */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
512
,
0
);
tdb_layout_add_free
(
layout
,
512
,
0
);
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
...
@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
...
@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Figure out which bucket free entry is. */
/* Figure out which bucket free entry is. */
b_off
=
bucket_off
(
tdb
->
f
list
_off
,
size_to_bucket
(
1024
));
b_off
=
bucket_off
(
tdb
->
f
table
_off
,
size_to_bucket
(
1024
));
/* Lock and coalesce. */
/* Lock and coalesce. */
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
...
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
...
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
/* Coalescing can be done due to three free records, then EOF */
/* Coalescing can be done due to three free records, then EOF */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
1024
,
0
);
tdb_layout_add_free
(
layout
,
512
,
0
);
tdb_layout_add_free
(
layout
,
512
,
0
);
tdb_layout_add_free
(
layout
,
256
,
0
);
tdb_layout_add_free
(
layout
,
256
,
0
);
...
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
...
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Figure out which bucket free entry is. */
/* Figure out which bucket free entry is. */
b_off
=
bucket_off
(
tdb
->
f
list
_off
,
size_to_bucket
(
1024
));
b_off
=
bucket_off
(
tdb
->
f
table
_off
,
size_to_bucket
(
1024
));
/* Lock and coalesce. */
/* Lock and coalesce. */
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_free_bucket
(
tdb
,
b_off
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
ok1
(
coalesce
(
tdb
,
layout
->
elem
[
1
].
base
.
off
,
b_off
,
1024
)
==
1
);
...
...
ccan/tdb2/test/run-04-basichash.c
View file @
51a56b52
...
@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
...
@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
/* FIXME: Check lock length */
/* FIXME: Check lock length */
/* Allocate a new record. */
/* Allocate a new record. */
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
.
h
,
false
);
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
.
h
,
TDB_USED_MAGIC
,
false
);
ok1
(
new_off
!=
TDB_OFF_ERR
);
ok1
(
new_off
!=
TDB_OFF_ERR
);
/* We should be able to add it now. */
/* We should be able to add it now. */
...
@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
...
@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
/* We should be able to add it now. */
/* We should be able to add it now. */
/* Allocate a new record. */
/* Allocate a new record. */
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
.
h
,
false
);
new_off
=
alloc
(
tdb
,
key
.
dsize
,
dbuf
.
dsize
,
h
.
h
,
TDB_USED_MAGIC
,
false
);
ok1
(
new_off
!=
TDB_OFF_ERR
);
ok1
(
new_off
!=
TDB_OFF_ERR
);
ok1
(
add_to_hash
(
tdb
,
&
h
,
new_off
)
==
0
);
ok1
(
add_to_hash
(
tdb
,
&
h
,
new_off
)
==
0
);
...
...
ccan/tdb2/test/run-25-hashoverload.c
0 → 100644
View file @
51a56b52
#include <ccan/tdb2/tdb.c>
#include <ccan/tdb2/free.c>
#include <ccan/tdb2/lock.c>
#include <ccan/tdb2/io.c>
#include <ccan/tdb2/hash.c>
#include <ccan/tdb2/transaction.c>
#include <ccan/tdb2/traverse.c>
#include <ccan/tdb2/check.c>
#include <ccan/tap/tap.h>
#include "logging.h"
static
uint64_t
badhash
(
const
void
*
key
,
size_t
len
,
uint64_t
seed
,
void
*
priv
)
{
return
0
;
}
static
int
trav
(
struct
tdb_context
*
tdb
,
TDB_DATA
key
,
TDB_DATA
dbuf
,
void
*
p
)
{
if
(
p
)
return
tdb_delete
(
tdb
,
key
);
return
0
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
unsigned
int
i
,
j
;
struct
tdb_context
*
tdb
;
struct
tdb_data
key
=
{
(
unsigned
char
*
)
&
j
,
sizeof
(
j
)
};
struct
tdb_data
dbuf
=
{
(
unsigned
char
*
)
&
j
,
sizeof
(
j
)
};
union
tdb_attribute
hattr
=
{
.
hash
=
{
.
base
=
{
TDB_ATTRIBUTE_HASH
},
.
hash_fn
=
badhash
}
};
int
flags
[]
=
{
TDB_INTERNAL
,
TDB_DEFAULT
,
TDB_NOMMAP
,
TDB_INTERNAL
|
TDB_CONVERT
,
TDB_CONVERT
,
TDB_NOMMAP
|
TDB_CONVERT
,
};
hattr
.
base
.
next
=
&
tap_log_attr
;
plan_tests
(
5395
);
for
(
i
=
0
;
i
<
sizeof
(
flags
)
/
sizeof
(
flags
[
0
]);
i
++
)
{
struct
tdb_data
d
;
tdb
=
tdb_open
(
"run-25-hashoverload.tdb"
,
flags
[
i
],
O_RDWR
|
O_CREAT
|
O_TRUNC
,
0600
,
&
hattr
);
ok1
(
tdb
);
if
(
!
tdb
)
continue
;
/* Fill a group. */
for
(
j
=
0
;
j
<
(
1
<<
TDB_HASH_GROUP_BITS
);
j
++
)
{
ok1
(
tdb_store
(
tdb
,
key
,
dbuf
,
TDB_INSERT
)
==
0
);
}
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Now store one last value: should form chain. */
ok1
(
tdb_store
(
tdb
,
key
,
dbuf
,
TDB_INSERT
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Check we can find them all. */
for
(
j
=
0
;
j
<
(
1
<<
TDB_HASH_GROUP_BITS
)
+
1
;
j
++
)
{
d
=
tdb_fetch
(
tdb
,
key
);
ok1
(
d
.
dsize
==
sizeof
(
j
));
ok1
(
d
.
dptr
!=
NULL
);
ok1
(
d
.
dptr
&&
memcmp
(
d
.
dptr
,
&
j
,
d
.
dsize
)
==
0
);
}
/* Now add a *lot* more. */
for
(
j
=
(
1
<<
TDB_HASH_GROUP_BITS
)
+
1
;
j
<
(
16
<<
TDB_HASH_GROUP_BITS
);
j
++
)
{
ok1
(
tdb_store
(
tdb
,
key
,
dbuf
,
TDB_INSERT
)
==
0
);
d
=
tdb_fetch
(
tdb
,
key
);
ok1
(
d
.
dsize
==
sizeof
(
j
));
ok1
(
d
.
dptr
!=
NULL
);
ok1
(
d
.
dptr
&&
memcmp
(
d
.
dptr
,
&
j
,
d
.
dsize
)
==
0
);
}
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Traverse through them. */
ok1
(
tdb_traverse
(
tdb
,
trav
,
NULL
)
==
j
);
/* Empty the first chain-worth. */
for
(
j
=
0
;
j
<
(
1
<<
TDB_HASH_GROUP_BITS
);
j
++
)
ok1
(
tdb_delete
(
tdb
,
key
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
for
(
j
=
(
1
<<
TDB_HASH_GROUP_BITS
);
j
<
(
16
<<
TDB_HASH_GROUP_BITS
);
j
++
)
{
d
=
tdb_fetch
(
tdb
,
key
);
ok1
(
d
.
dsize
==
sizeof
(
j
));
ok1
(
d
.
dptr
!=
NULL
);
ok1
(
d
.
dptr
&&
memcmp
(
d
.
dptr
,
&
j
,
d
.
dsize
)
==
0
);
}
/* Traverse through them. */
ok1
(
tdb_traverse
(
tdb
,
trav
,
NULL
)
==
(
15
<<
TDB_HASH_GROUP_BITS
));
/* Re-add */
for
(
j
=
0
;
j
<
(
1
<<
TDB_HASH_GROUP_BITS
);
j
++
)
{
ok1
(
tdb_store
(
tdb
,
key
,
dbuf
,
TDB_INSERT
)
==
0
);
}
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Now try deleting as we go. */
ok1
(
tdb_traverse
(
tdb
,
trav
,
trav
)
==
(
16
<<
TDB_HASH_GROUP_BITS
));
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_traverse
(
tdb
,
trav
,
NULL
)
==
0
);
tdb_close
(
tdb
);
}
ok1
(
tap_log_messages
==
0
);
return
exit_status
();
}
ccan/tdb2/test/run-30-exhaust-before-expand.c
View file @
51a56b52
...
@@ -9,13 +9,13 @@
...
@@ -9,13 +9,13 @@
#include <err.h>
#include <err.h>
#include "logging.h"
#include "logging.h"
static
bool
empty_free
list
(
struct
tdb_context
*
tdb
)
static
bool
empty_free
table
(
struct
tdb_context
*
tdb
)
{
{
struct
tdb_free
list
free
;
struct
tdb_free
table
free
;
unsigned
int
i
;
unsigned
int
i
;
/* Now, free
list
should be completely exhausted in zone 0 */
/* Now, free
table
should be completely exhausted in zone 0 */
if
(
tdb_read_convert
(
tdb
,
tdb
->
f
list
_off
,
&
free
,
sizeof
(
free
))
!=
0
)
if
(
tdb_read_convert
(
tdb
,
tdb
->
f
table
_off
,
&
free
,
sizeof
(
free
))
!=
0
)
abort
();
abort
();
for
(
i
=
0
;
i
<
sizeof
(
free
.
buckets
)
/
sizeof
(
free
.
buckets
[
0
]);
i
++
)
{
for
(
i
=
0
;
i
<
sizeof
(
free
.
buckets
)
/
sizeof
(
free
.
buckets
[
0
]);
i
++
)
{
...
@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
...
@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
if
(
!
tdb
)
if
(
!
tdb
)
continue
;
continue
;
ok1
(
empty_free
list
(
tdb
));
ok1
(
empty_free
table
(
tdb
));
/* Need some hash lock for expand. */
/* Need some hash lock for expand. */
ok1
(
tdb_lock_hashes
(
tdb
,
0
,
1
,
F_WRLCK
,
TDB_LOCK_WAIT
)
==
0
);
ok1
(
tdb_lock_hashes
(
tdb
,
0
,
1
,
F_WRLCK
,
TDB_LOCK_WAIT
)
==
0
);
/* Create some free space. */
/* Create some free space. */
ok1
(
tdb_expand
(
tdb
,
1
)
==
0
);
ok1
(
tdb_expand
(
tdb
,
1
)
==
0
);
ok1
(
tdb_unlock_hashes
(
tdb
,
0
,
1
,
F_WRLCK
)
==
0
);
ok1
(
tdb_unlock_hashes
(
tdb
,
0
,
1
,
F_WRLCK
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
!
empty_free
list
(
tdb
));
ok1
(
!
empty_free
table
(
tdb
));
size
=
tdb
->
map_size
;
size
=
tdb
->
map_size
;
/* Insert minimal-length records until we expand. */
/* Insert minimal-length records until we expand. */
for
(
j
=
0
;
tdb
->
map_size
==
size
;
j
++
)
{
for
(
j
=
0
;
tdb
->
map_size
==
size
;
j
++
)
{
was_empty
=
empty_free
list
(
tdb
);
was_empty
=
empty_free
table
(
tdb
);
if
(
tdb_store
(
tdb
,
k
,
k
,
TDB_INSERT
)
!=
0
)
if
(
tdb_store
(
tdb
,
k
,
k
,
TDB_INSERT
)
!=
0
)
err
(
1
,
"Failed to store record %i"
,
j
);
err
(
1
,
"Failed to store record %i"
,
j
);
}
}
/* Would have been empty before expansion, but no longer. */
/* Would have been empty before expansion, but no longer. */
ok1
(
was_empty
);
ok1
(
was_empty
);
ok1
(
!
empty_free
list
(
tdb
));
ok1
(
!
empty_free
table
(
tdb
));
tdb_close
(
tdb
);
tdb_close
(
tdb
);
}
}
...
...
ccan/tdb2/test/run-50-multiple-freelists.c
View file @
51a56b52
...
@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
...
@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
data
.
dsize
=
5
;
data
.
dsize
=
5
;
key
.
dsize
=
5
;
key
.
dsize
=
5
;
/* Create a TDB with three free
list
s. */
/* Create a TDB with three free
table
s. */
layout
=
new_tdb_layout
(
NULL
);
layout
=
new_tdb_layout
(
NULL
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
list
(
layout
);
tdb_layout_add_free
table
(
layout
);
tdb_layout_add_free
(
layout
,
80
,
0
);
tdb_layout_add_free
(
layout
,
80
,
0
);
/* Used record prevent coalescing. */
/* Used record prevent coalescing. */
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
tdb_layout_add_used
(
layout
,
key
,
data
,
6
);
...
@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
...
@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
tdb
=
tdb_layout_get
(
layout
);
tdb
=
tdb_layout_get
(
layout
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
off
=
get_free
(
tdb
,
0
,
80
-
sizeof
(
struct
tdb_used_record
),
0
,
0
);
off
=
get_free
(
tdb
,
0
,
80
-
sizeof
(
struct
tdb_used_record
),
0
,
TDB_USED_MAGIC
,
0
);
ok1
(
off
==
layout
->
elem
[
3
].
base
.
off
);
ok1
(
off
==
layout
->
elem
[
3
].
base
.
off
);
ok1
(
tdb
->
f
list
_off
==
layout
->
elem
[
0
].
base
.
off
);
ok1
(
tdb
->
f
table
_off
==
layout
->
elem
[
0
].
base
.
off
);
off
=
get_free
(
tdb
,
0
,
160
-
sizeof
(
struct
tdb_used_record
),
0
,
0
);
off
=
get_free
(
tdb
,
0
,
160
-
sizeof
(
struct
tdb_used_record
),
0
,
TDB_USED_MAGIC
,
0
);
ok1
(
off
==
layout
->
elem
[
5
].
base
.
off
);
ok1
(
off
==
layout
->
elem
[
5
].
base
.
off
);
ok1
(
tdb
->
f
list
_off
==
layout
->
elem
[
1
].
base
.
off
);
ok1
(
tdb
->
f
table
_off
==
layout
->
elem
[
1
].
base
.
off
);
off
=
get_free
(
tdb
,
0
,
320
-
sizeof
(
struct
tdb_used_record
),
0
,
0
);
off
=
get_free
(
tdb
,
0
,
320
-
sizeof
(
struct
tdb_used_record
),
0
,
TDB_USED_MAGIC
,
0
);
ok1
(
off
==
layout
->
elem
[
7
].
base
.
off
);
ok1
(
off
==
layout
->
elem
[
7
].
base
.
off
);
ok1
(
tdb
->
f
list
_off
==
layout
->
elem
[
2
].
base
.
off
);
ok1
(
tdb
->
f
table
_off
==
layout
->
elem
[
2
].
base
.
off
);
off
=
get_free
(
tdb
,
0
,
40
-
sizeof
(
struct
tdb_used_record
),
0
,
0
);
off
=
get_free
(
tdb
,
0
,
40
-
sizeof
(
struct
tdb_used_record
),
0
,
TDB_USED_MAGIC
,
0
);
ok1
(
off
==
layout
->
elem
[
9
].
base
.
off
);
ok1
(
off
==
layout
->
elem
[
9
].
base
.
off
);
ok1
(
tdb
->
f
list
_off
==
layout
->
elem
[
0
].
base
.
off
);
ok1
(
tdb
->
f
table
_off
==
layout
->
elem
[
0
].
base
.
off
);
/* Now we fail. */
/* Now we fail. */
off
=
get_free
(
tdb
,
0
,
0
,
1
,
0
);
off
=
get_free
(
tdb
,
0
,
0
,
1
,
TDB_USED_MAGIC
,
0
);
ok1
(
off
==
0
);
ok1
(
off
==
0
);
tdb_close
(
tdb
);
tdb_close
(
tdb
);
...
...
ccan/tdb2/test/run-seed.c
View file @
51a56b52
...
@@ -13,7 +13,7 @@ static int log_count = 0;
...
@@ -13,7 +13,7 @@ static int log_count = 0;
/* Normally we get a log when setting random seed. */
/* Normally we get a log when setting random seed. */
static
void
my_log_fn
(
struct
tdb_context
*
tdb
,
static
void
my_log_fn
(
struct
tdb_context
*
tdb
,
enum
tdb_debug_level
level
,
void
*
priv
,
enum
tdb_debug_level
level
,
void
*
priv
,
const
char
*
fmt
,
...
)
const
char
*
message
)
{
{
log_count
++
;
log_count
++
;
}
}
...
...
ccan/tdb2/test/run-traverse.c
View file @
51a56b52
...
@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
...
@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
td
->
high
=
val
;
td
->
high
=
val
;
if
(
td
->
delete
)
{
if
(
td
->
delete
)
{
if
(
tdb_delete
(
tdb
,
key
)
!=
0
)
{
if
(
tdb_delete
(
tdb
,
key
)
!=
0
)
{
td
->
delete_error
=
tdb_error
(
tdb
);
td
->
delete_error
=
tdb_error
(
tdb
);
return
-
1
;
return
-
1
;
...
@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
...
@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
hattr
.
base
.
next
=
&
tap_log_attr
;
hattr
.
base
.
next
=
&
tap_log_attr
;
plan_tests
(
sizeof
(
flags
)
/
sizeof
(
flags
[
0
])
*
5
0
+
1
);
plan_tests
(
sizeof
(
flags
)
/
sizeof
(
flags
[
0
])
*
5
3
+
1
);
for
(
i
=
0
;
i
<
sizeof
(
flags
)
/
sizeof
(
flags
[
0
]);
i
++
)
{
for
(
i
=
0
;
i
<
sizeof
(
flags
)
/
sizeof
(
flags
[
0
]);
i
++
)
{
tdb
=
tdb_open
(
"run-traverse.tdb"
,
flags
[
i
],
tdb
=
tdb_open
(
"run-traverse.tdb"
,
flags
[
i
],
O_RDWR
|
O_CREAT
|
O_TRUNC
,
0600
,
&
hattr
);
O_RDWR
|
O_CREAT
|
O_TRUNC
,
0600
,
&
hattr
);
...
@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
...
@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
ok1
(
td
.
low
<=
NUM_RECORDS
/
2
);
ok1
(
td
.
low
<=
NUM_RECORDS
/
2
);
ok1
(
td
.
high
>
NUM_RECORDS
/
2
);
ok1
(
td
.
high
>
NUM_RECORDS
/
2
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tap_log_messages
==
0
);
/* Growing traverse. Expect failure on r/o traverse. */
/* Growing traverse. Expect failure on r/o traverse. */
tgd
.
calls
=
0
;
tgd
.
calls
=
0
;
...
@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
...
@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
ok1
(
tgd
.
error
==
TDB_ERR_RDONLY
);
ok1
(
tgd
.
error
==
TDB_ERR_RDONLY
);
ok1
(
tgd
.
calls
==
1
);
ok1
(
tgd
.
calls
==
1
);
ok1
(
!
tgd
.
mismatch
);
ok1
(
!
tgd
.
mismatch
);
ok1
(
tap_log_messages
==
1
);
tap_log_messages
=
0
;
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Deleting traverse. Expect failure on r/o traverse. */
/* Deleting traverse. Expect failure on r/o traverse. */
...
@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
...
@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
ok1
(
!
td
.
mismatch
);
ok1
(
!
td
.
mismatch
);
ok1
(
td
.
calls
==
1
);
ok1
(
td
.
calls
==
1
);
ok1
(
td
.
low
==
td
.
high
);
ok1
(
td
.
low
==
td
.
high
);
ok1
(
tap_log_messages
==
1
);
tap_log_messages
=
0
;
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
ok1
(
tdb_check
(
tdb
,
NULL
,
NULL
)
==
0
);
/* Deleting traverse (delete everything). */
/* Deleting traverse (delete everything). */
...
...
ccan/tdb2/tools/Makefile
View file @
51a56b52
OBJS
:=
../../tdb2.o ../../hash.o ../../tally.o
OBJS
:=
../../tdb2.o ../../hash.o ../../tally.o
CFLAGS
:=
-I
../../..
-Wall
-g
#-g
-O3 #-g -pg
CFLAGS
:=
-I
../../..
-Wall
-g
-O3
#-g -pg
LDFLAGS
:=
-L
../../..
LDFLAGS
:=
-L
../../..
default
:
tdbtorture tdbtool mktdb
default
:
tdbtorture tdbtool mktdb
speed
tdbtorture
:
tdbtorture.c $(OBJS)
tdbtorture
:
tdbtorture.c $(OBJS)
tdbtool
:
tdbtool.c $(OBJS)
tdbtool
:
tdbtool.c $(OBJS)
mktdb
:
mktdb.c $(OBJS)
mktdb
:
mktdb.c $(OBJS)
speed
:
speed.c $(OBJS)
clean
:
clean
:
rm
-f
tdbtorture tdbtool mktdb
rm
-f
tdbtorture tdbtool mktdb
speed
ccan/tdb2/tools/speed.c
0 → 100644
View file @
51a56b52
/* Simple speed test for TDB */
#include <err.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ccan/tdb2/tdb2.h>
/* Nanoseconds per operation.
 *
 * Converts the elapsed time between @start and @stop into nanoseconds
 * per operation over @num operations.  Returns 0 when num is 0.
 */
static size_t normalize(const struct timeval *start,
			const struct timeval *stop,
			unsigned int num)
{
	double usec;

	/* Subtract in floating point: the old "tv_sec * 1000000" was done
	 * in integer arithmetic first, which can overflow a 32-bit time_t
	 * product on long runs.  Doubles are accurate enough here, and this
	 * also avoids the non-ISO timersub() macro. */
	usec = ((double)stop->tv_sec - (double)start->tv_sec) * 1000000.0
		+ ((double)stop->tv_usec - (double)start->tv_usec);

	/* Guard against division by zero for an empty benchmark run. */
	if (num == 0)
		return 0;

	return usec / num * 1000.0;
}
/* Size in bytes of the on-disk benchmark database, or (size_t)-1 when
 * /tmp/speed.tdb cannot be stat'd (e.g. running with --internal). */
static size_t file_size(void)
{
	struct stat sb;

	return stat("/tmp/speed.tdb", &sb) == 0 ? (size_t)sb.st_size
						: (size_t)-1;
}
/* tdb_traverse callback: accumulate each record's integer payload into
 * the int pointed to by @p.  Always returns 0 so the traverse continues. */
static int count_record(struct tdb_context *tdb,
			TDB_DATA key, TDB_DATA data, void *p)
{
	int *sum = p;

	*sum += *(int *)data.dptr;
	return 0;
}
/* Print every counter in the stats attribute, then zero them so the next
 * benchmark stage reports fresh numbers. */
static void dump_and_clear_stats(struct tdb_attribute_stats *stats)
{
	/* Each counter prints as "<indent><name> = <value>"; stringifying
	 * the member keeps the label and the field from drifting apart. */
#define DUMP_STAT(indent, field)					\
	printf(indent #field " = %llu\n",				\
	       (unsigned long long)stats->field)

	DUMP_STAT("", allocs);
	DUMP_STAT(" ", alloc_subhash);
	DUMP_STAT(" ", alloc_chain);
	DUMP_STAT(" ", alloc_bucket_exact);
	DUMP_STAT(" ", alloc_bucket_max);
	DUMP_STAT(" ", alloc_leftover);
	DUMP_STAT(" ", alloc_coalesce_tried);
	DUMP_STAT(" ", alloc_coalesce_lockfail);
	DUMP_STAT(" ", alloc_coalesce_race);
	DUMP_STAT(" ", alloc_coalesce_succeeded);
	DUMP_STAT(" ", alloc_coalesce_num_merged);
	DUMP_STAT("", compares);
	DUMP_STAT(" ", compare_wrong_bucket);
	DUMP_STAT(" ", compare_wrong_offsetbits);
	DUMP_STAT(" ", compare_wrong_keylen);
	DUMP_STAT(" ", compare_wrong_rechash);
	DUMP_STAT(" ", compare_wrong_keycmp);
	DUMP_STAT("", expands);
	DUMP_STAT("", frees);
	DUMP_STAT("", locks);
	DUMP_STAT(" ", lock_lowlevel);
	DUMP_STAT(" ", lock_nonblock);
#undef DUMP_STAT

	/* Now clear: wipe from the first counter to the end of the struct,
	 * leaving the attribute header (base/size) intact. */
	memset(&stats->allocs, 0,
	       (char *)(stats + 1) - (char *)&stats->allocs);
}
/* Benchmark driver: times eight stages over the database — add, find,
 * miss, traverse, delete, re-add, append and churn "num" fixed-size
 * records — printing nanoseconds-per-op and file size after each stage.
 *
 * Usage: speed [--internal] [--transaction] [--stats] [num] [stopat]
 *   --internal     use an in-memory (TDB_INTERNAL) database
 *   --transaction  wrap each stage in a transaction
 *   --stats        attach TDB_ATTRIBUTE_STATS and dump counters per stage
 *   num            records per stage (default 1000)
 *   stopat         exit after this many stages (default: run all)
 *
 * NOTE: flags must be given in the order shown, before num/stopat.
 */
int main(int argc, char *argv[])
{
	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
	int flags = TDB_DEFAULT;
	bool transaction = false;
	TDB_DATA key, data;
	struct tdb_context *tdb;
	struct timeval start, stop;
	union tdb_attribute seed, stats;

	/* Try to keep benchmarks even: fix the hash seed. */
	seed.base.attr = TDB_ATTRIBUTE_SEED;
	seed.base.next = NULL;
	seed.seed.seed = 0;

	memset(&stats, 0, sizeof(stats));
	stats.base.attr = TDB_ATTRIBUTE_STATS;
	stats.base.next = NULL;
	stats.stats.size = sizeof(stats);

	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
		flags = TDB_INTERNAL;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
		transaction = true;
		argc--;
		argv++;
	}

	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
		/* Chain the stats attribute after the seed attribute. */
		seed.base.next = &stats;
		argc--;
		argv++;
	}

	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
		       0600, &seed);
	if (!tdb)
		err(1, "Opening /tmp/speed.tdb");

	/* Key and data both alias the loop counter: every record is the
	 * 4-byte value of i, keyed by itself. */
	key.dptr = (void *)&i;
	key.dsize = sizeof(i);
	data = key;

	if (argv[1]) {
		num = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (argv[1]) {
		stopat = atoi(argv[1]);
		argv++;
		argc--;
	}

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Add 1000 records. */
	printf("Adding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Finding 1000 records. */
	printf("Finding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (!dptr || *dptr != i)
			errx(1, "Fetching key %u in tdb gave %u",
			     i, dptr ? *dptr : -1);
		/* BUGFIX: tdb_fetch() mallocs the result; the old code
		 * leaked one allocation per record. */
		free(dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Missing 1000 records: keys num..2*num-1 must not exist. */
	printf("Missing %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = num; i < num * 2; i++) {
		int *dptr;
		dptr = (int *)tdb_fetch(tdb, key).dptr;
		if (dptr)
			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Traverse 1000 records: the payloads must sum to 0+1+...+num-1. */
	printf("Traversing %u records: ", num);
	fflush(stdout);
	i = 0;
	gettimeofday(&start, NULL);
	if (tdb_traverse(tdb, count_record, &i) != num)
		errx(1, "Traverse returned wrong number of records");
	if (i != (num - 1) * (num / 2))
		errx(1, "Traverse tallied to %u", i);
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Delete 1000 records (not in order). */
	printf("Deleting %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		/* 100003 is prime, so this visits every key exactly once
		 * in a scrambled order. */
		i = (j + 100003) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Re-add 1000 records (not in order). */
	printf("Re-adding %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 100003) % num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Append 1000 records. */
	printf("Appending %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (i = 0; i < num; i++)
		if (tdb_append(tdb, key, data) != 0)
			errx(1, "Appending key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	/* BUGFIX: this was the only stage not reporting --stats counters,
	 * silently dropping the append-stage numbers. */
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	if (transaction && tdb_transaction_start(tdb))
		errx(1, "starting transaction: %s", tdb_errorstr(tdb));

	/* Churn 1000 records: not in order!  Delete each old key and
	 * insert a fresh key num higher, exercising free-list reuse. */
	printf("Churning %u records: ", num);
	fflush(stdout);
	gettimeofday(&start, NULL);
	for (j = 0; j < num; j++) {
		i = (j + 1000019) % num;
		if (tdb_delete(tdb, key) != 0)
			errx(1, "Deleting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
		i += num;
		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
			errx(1, "Inserting key %u in tdb: %s",
			     i, tdb_errorstr(tdb));
	}
	gettimeofday(&stop, NULL);
	if (transaction && tdb_transaction_commit(tdb))
		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
	printf(" %zu ns (%zu bytes)\n",
	       normalize(&start, &stop, num), file_size());
	if (seed.base.next)
		dump_and_clear_stats(&stats.stats);
	if (++stage == stopat)
		exit(0);

	return 0;
}
ccan/tdb2/transaction.c
View file @
51a56b52
...
@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
...
@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
return
0
;
return
0
;
fail:
fail:
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"transaction_read: failed at off=%zu len=%zu"
,
"transaction_read: failed at off=%llu len=%llu
\n
"
,
(
size_t
)
off
,
(
size_t
)
len
);
(
long
long
)
off
,
(
long
long
)
len
);
tdb
->
transaction
->
transaction_error
=
1
;
tdb
->
transaction
->
transaction_error
=
1
;
return
-
1
;
return
-
1
;
}
}
...
@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
/* Only a commit is allowed on a prepared transaction */
/* Only a commit is allowed on a prepared transaction */
if
(
tdb
->
transaction
->
prepared
)
{
if
(
tdb
->
transaction
->
prepared
)
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"transaction_write: transaction already prepared,"
"transaction_write: transaction already prepared,"
" write not allowed
\n
"
);
" write not allowed"
);
tdb
->
transaction
->
transaction_error
=
1
;
goto
fail
;
return
-
1
;
}
}
/* break it up into block sized chunks */
/* break it up into block sized chunks */
...
@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
(
blk
+
1
)
*
sizeof
(
uint8_t
*
));
(
blk
+
1
)
*
sizeof
(
uint8_t
*
));
}
}
if
(
new_blocks
==
NULL
)
{
if
(
new_blocks
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
"transaction_write: failed to allocate"
);
goto
fail
;
goto
fail
;
}
}
memset
(
&
new_blocks
[
tdb
->
transaction
->
num_blocks
],
0
,
memset
(
&
new_blocks
[
tdb
->
transaction
->
num_blocks
],
0
,
...
@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if
(
tdb
->
transaction
->
blocks
[
blk
]
==
NULL
)
{
if
(
tdb
->
transaction
->
blocks
[
blk
]
==
NULL
)
{
tdb
->
transaction
->
blocks
[
blk
]
=
(
uint8_t
*
)
calloc
(
getpagesize
(),
1
);
tdb
->
transaction
->
blocks
[
blk
]
=
(
uint8_t
*
)
calloc
(
getpagesize
(),
1
);
if
(
tdb
->
transaction
->
blocks
[
blk
]
==
NULL
)
{
if
(
tdb
->
transaction
->
blocks
[
blk
]
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb
_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
tdb
->
transaction
->
transaction_error
=
1
;
"transaction_write: failed to allocate"
)
;
return
-
1
;
goto
fail
;
}
}
if
(
tdb
->
transaction
->
old_map_size
>
blk
*
getpagesize
())
{
if
(
tdb
->
transaction
->
old_map_size
>
blk
*
getpagesize
())
{
tdb_len_t
len2
=
getpagesize
();
tdb_len_t
len2
=
getpagesize
();
...
@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
if
(
tdb
->
transaction
->
io_methods
->
read
(
tdb
,
blk
*
getpagesize
(),
if
(
tdb
->
transaction
->
io_methods
->
read
(
tdb
,
blk
*
getpagesize
(),
tdb
->
transaction
->
blocks
[
blk
],
tdb
->
transaction
->
blocks
[
blk
],
len2
)
!=
0
)
{
len2
)
!=
0
)
{
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
"transaction_write: failed to"
" read old block: %s"
,
strerror
(
errno
));
SAFE_FREE
(
tdb
->
transaction
->
blocks
[
blk
]);
SAFE_FREE
(
tdb
->
transaction
->
blocks
[
blk
]);
goto
fail
;
goto
fail
;
}
}
...
@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
...
@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
return
0
;
return
0
;
fail:
fail:
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"transaction_write: failed at off=%llu len=%llu
\n
"
,
(
long
long
)((
blk
*
getpagesize
())
+
off
),
(
long
long
)
len
);
tdb
->
transaction
->
transaction_error
=
1
;
tdb
->
transaction
->
transaction_error
=
1
;
return
-
1
;
return
-
1
;
}
}
...
@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
...
@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return
0
;
return
0
;
}
}
tdb
->
ecode
=
TDB_ERR_IO
;
tdb
->
ecode
=
TDB_ERR_IO
;
if
(
!
probe
)
{
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
"tdb_oob len %lld beyond transaction size %lld"
,
(
long
long
)
len
,
(
long
long
)
tdb
->
map_size
);
}
return
-
1
;
return
-
1
;
}
}
...
@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
...
@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
}
}
static
void
*
transaction_direct
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
static
void
*
transaction_direct
(
struct
tdb_context
*
tdb
,
tdb_off_t
off
,
size_t
len
)
size_t
len
,
bool
write
)
{
{
/* FIXME */
size_t
blk
=
off
/
getpagesize
(),
end_blk
;
return
NULL
;
/* This is wrong for zero-length blocks, but will fail gracefully */
end_blk
=
(
off
+
len
-
1
)
/
getpagesize
();
/* Can only do direct if in single block and we've already copied. */
if
(
write
)
{
if
(
blk
!=
end_blk
)
return
NULL
;
if
(
blk
>=
tdb
->
transaction
->
num_blocks
)
return
NULL
;
if
(
tdb
->
transaction
->
blocks
[
blk
]
==
NULL
)
return
NULL
;
return
tdb
->
transaction
->
blocks
[
blk
]
+
off
%
getpagesize
();
}
/* Single which we have copied? */
if
(
blk
==
end_blk
&&
blk
<
tdb
->
transaction
->
num_blocks
&&
tdb
->
transaction
->
blocks
[
blk
])
return
tdb
->
transaction
->
blocks
[
blk
]
+
off
%
getpagesize
();
/* Otherwise must be all not copied. */
while
(
blk
<
end_blk
)
{
if
(
blk
>=
tdb
->
transaction
->
num_blocks
)
break
;
if
(
tdb
->
transaction
->
blocks
[
blk
])
return
NULL
;
blk
++
;
}
return
tdb
->
transaction
->
io_methods
->
direct
(
tdb
,
off
,
len
,
write
);
}
}
static
const
struct
tdb_methods
transaction_methods
=
{
static
const
struct
tdb_methods
transaction_methods
=
{
...
@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
...
@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
}
}
if
(
fsync
(
tdb
->
fd
)
!=
0
)
{
if
(
fsync
(
tdb
->
fd
)
!=
0
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb
_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_transaction: fsync failed: %s"
,
"tdb_transaction: fsync failed
\n
"
);
strerror
(
errno
)
);
return
-
1
;
return
-
1
;
}
}
#ifdef MS_SYNC
#ifdef MS_SYNC
...
@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
...
@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
tdb_off_t
moffset
=
offset
&
~
(
getpagesize
()
-
1
);
tdb_off_t
moffset
=
offset
&
~
(
getpagesize
()
-
1
);
if
(
msync
(
moffset
+
(
char
*
)
tdb
->
map_ptr
,
if
(
msync
(
moffset
+
(
char
*
)
tdb
->
map_ptr
,
length
+
(
offset
-
moffset
),
MS_SYNC
)
!=
0
)
{
length
+
(
offset
-
moffset
),
MS_SYNC
)
!=
0
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_transaction: msync failed: %s"
,
"tdb_transaction: msync failed - %s
\n
"
,
strerror
(
errno
));
strerror
(
errno
));
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
...
@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
int
i
;
int
i
;
if
(
tdb
->
transaction
==
NULL
)
{
if
(
tdb
->
transaction
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_transaction_cancel: no transaction"
);
"tdb_transaction_cancel: no transaction
\n
"
);
return
;
return
;
}
}
...
@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
...
@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
&
invalid
,
sizeof
(
invalid
))
==
-
1
||
&
invalid
,
sizeof
(
invalid
))
==
-
1
||
transaction_sync
(
tdb
,
tdb
->
transaction
->
magic_offset
,
transaction_sync
(
tdb
,
tdb
->
transaction
->
magic_offset
,
sizeof
(
invalid
))
==
-
1
)
{
sizeof
(
invalid
))
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_cancel: failed to remove"
"tdb_transaction_cancel: failed to remove"
" recovery magic
\n
"
);
" recovery magic
"
);
}
}
}
}
...
@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
...
@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
{
{
/* some sanity checks */
/* some sanity checks */
if
(
tdb
->
read_only
||
(
tdb
->
flags
&
TDB_INTERNAL
))
{
if
(
tdb
->
read_only
||
(
tdb
->
flags
&
TDB_INTERNAL
))
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction"
" on a read-only or internal db"
);
" on a read-only or internal db
\n
"
);
return
-
1
;
return
-
1
;
}
}
/* cope with nested tdb_transaction_start() calls */
/* cope with nested tdb_transaction_start() calls */
if
(
tdb
->
transaction
!=
NULL
)
{
if
(
tdb
->
transaction
!=
NULL
)
{
tdb
->
ecode
=
TDB_ERR_NESTING
;
tdb_logerr
(
tdb
,
TDB_ERR_NESTING
,
TDB_DEBUG_ERROR
,
"tdb_transaction_start:"
" already inside transaction"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
...
@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
/* the caller must not have any locks when starting a
/* the caller must not have any locks when starting a
transaction as otherwise we'll be screwed by lack
transaction as otherwise we'll be screwed by lack
of nested locks in posix */
of nested locks in posix */
tdb
->
ecode
=
TDB_ERR_LOCK
;
tdb_logerr
(
tdb
,
TDB_ERR_LOCK
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_transaction_start: cannot start a transaction"
"tdb_transaction_start: cannot start a transaction"
" with locks held"
);
" with locks held
\n
"
);
return
-
1
;
return
-
1
;
}
}
tdb
->
transaction
=
(
struct
tdb_transaction
*
)
tdb
->
transaction
=
(
struct
tdb_transaction
*
)
calloc
(
sizeof
(
struct
tdb_transaction
),
1
);
calloc
(
sizeof
(
struct
tdb_transaction
),
1
);
if
(
tdb
->
transaction
==
NULL
)
{
if
(
tdb
->
transaction
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_ERROR
,
"tdb_transaction_start: cannot allocate"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
...
@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
recovery_head
=
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
));
recovery_head
=
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
));
if
(
recovery_head
==
TDB_OFF_ERR
)
{
if
(
recovery_head
==
TDB_OFF_ERR
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_recovery_allocate:"
"tdb_recovery_allocate:"
" failed to read recovery head
\n
"
);
" failed to read recovery head"
);
return
-
1
;
return
-
1
;
}
}
if
(
recovery_head
!=
0
)
{
if
(
recovery_head
!=
0
)
{
if
(
methods
->
read
(
tdb
,
recovery_head
,
&
rec
,
sizeof
(
rec
)))
{
if
(
methods
->
read
(
tdb
,
recovery_head
,
&
rec
,
sizeof
(
rec
)))
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_recovery_allocate:"
"tdb_recovery_allocate:"
" failed to read recovery record
\n
"
);
" failed to read recovery record"
);
return
-
1
;
return
-
1
;
}
}
tdb_convert
(
tdb
,
&
rec
,
sizeof
(
rec
));
tdb_convert
(
tdb
,
&
rec
,
sizeof
(
rec
));
...
@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
...
@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
us an area that is being currently used (as of the start of
us an area that is being currently used (as of the start of
the transaction) */
the transaction) */
if
(
recovery_head
!=
0
)
{
if
(
recovery_head
!=
0
)
{
add_stat
(
tdb
,
frees
,
1
);
if
(
add_free_record
(
tdb
,
recovery_head
,
if
(
add_free_record
(
tdb
,
recovery_head
,
sizeof
(
rec
)
+
rec
.
max_len
)
!=
0
)
{
sizeof
(
rec
)
+
rec
.
max_len
)
!=
0
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_recovery_allocate:"
"tdb_recovery_allocate:"
" failed to free previous recovery area
\n
"
);
" failed to free previous recovery area
"
);
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
...
@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
sizeof
(
rec
)
+
*
recovery_max_size
;
sizeof
(
rec
)
+
*
recovery_max_size
;
tdb
->
map_size
=
tdb
->
transaction
->
old_map_size
;
tdb
->
map_size
=
tdb
->
transaction
->
old_map_size
;
if
(
methods
->
expand_file
(
tdb
,
addition
)
==
-
1
)
{
if
(
methods
->
expand_file
(
tdb
,
addition
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_recovery_allocate:"
"tdb_recovery_allocate:"
" failed to create recovery area
\n
"
);
" failed to create recovery area"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
...
@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
tdb_convert
(
tdb
,
&
recovery_head
,
sizeof
(
recovery_head
));
tdb_convert
(
tdb
,
&
recovery_head
,
sizeof
(
recovery_head
));
if
(
methods
->
write
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
if
(
methods
->
write
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
&
recovery_head
,
sizeof
(
tdb_off_t
))
==
-
1
)
{
&
recovery_head
,
sizeof
(
tdb_off_t
))
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_recovery_allocate:"
"tdb_recovery_allocate:"
" failed to write recovery head
\n
"
);
" failed to write recovery head"
);
return
-
1
;
return
-
1
;
}
}
transaction_write_existing
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
transaction_write_existing
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
...
@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
...
@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
data
=
(
unsigned
char
*
)
malloc
(
recovery_size
+
sizeof
(
*
rec
));
data
=
(
unsigned
char
*
)
malloc
(
recovery_size
+
sizeof
(
*
rec
));
if
(
data
==
NULL
)
{
if
(
data
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
"transaction_setup_recovery: cannot allocate"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
...
@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
continue
;
continue
;
}
}
if
(
offset
+
length
>
tdb
->
map_size
)
{
if
(
offset
+
length
>
tdb
->
map_size
)
{
tdb
->
ecode
=
TDB_ERR_CORRUPT
;
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_transaction_setup_recovery:"
"tdb_transaction_setup_recovery:"
" transaction data over new region boundary"
);
" transaction data over new region boundary
\n
"
);
free
(
data
);
free
(
data
);
return
-
1
;
return
-
1
;
}
}
...
@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
...
@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
/* write the recovery data to the recovery area */
/* write the recovery data to the recovery area */
if
(
methods
->
write
(
tdb
,
recovery_offset
,
data
,
if
(
methods
->
write
(
tdb
,
recovery_offset
,
data
,
sizeof
(
*
rec
)
+
recovery_size
)
==
-
1
)
{
sizeof
(
*
rec
)
+
recovery_size
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_setup_recovery:"
"tdb_transaction_setup_recovery:"
" failed to write recovery data
\n
"
);
" failed to write recovery data"
);
free
(
data
);
free
(
data
);
return
-
1
;
return
-
1
;
}
}
...
@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
...
@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
magic
);
magic
);
if
(
methods
->
write
(
tdb
,
*
magic_offset
,
&
magic
,
sizeof
(
magic
))
==
-
1
)
{
if
(
methods
->
write
(
tdb
,
*
magic_offset
,
&
magic
,
sizeof
(
magic
))
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_setup_recovery:"
"tdb_transaction_setup_recovery:"
" failed to write recovery magic
\n
"
);
" failed to write recovery magic"
);
return
-
1
;
return
-
1
;
}
}
transaction_write_existing
(
tdb
,
*
magic_offset
,
&
magic
,
sizeof
(
magic
));
transaction_write_existing
(
tdb
,
*
magic_offset
,
&
magic
,
sizeof
(
magic
));
...
@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
...
@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
const
struct
tdb_methods
*
methods
;
const
struct
tdb_methods
*
methods
;
if
(
tdb
->
transaction
==
NULL
)
{
if
(
tdb
->
transaction
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_transaction_prepare_commit: no transaction"
);
"tdb_transaction_prepare_commit: no transaction
\n
"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb
->
transaction
->
prepared
)
{
if
(
tdb
->
transaction
->
prepared
)
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" transaction already prepared
\n
"
);
" transaction already prepared
"
);
return
-
1
;
return
-
1
;
}
}
if
(
tdb
->
transaction
->
transaction_error
)
{
if
(
tdb
->
transaction
->
transaction_error
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" transaction error pending
\n
"
);
" transaction error pending
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
...
@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* upgrade the main transaction lock region to a write lock */
/* upgrade the main transaction lock region to a write lock */
if
(
tdb_allrecord_upgrade
(
tdb
)
==
-
1
)
{
if
(
tdb_allrecord_upgrade
(
tdb
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_ERROR
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" failed to upgrade hash locks
\n
"
);
" failed to upgrade hash locks"
);
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
return
-
1
;
return
-
1
;
}
}
...
@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
...
@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* get the open lock - this prevents new users attaching to the database
/* get the open lock - this prevents new users attaching to the database
during the commit */
during the commit */
if
(
tdb_lock_open
(
tdb
,
TDB_LOCK_WAIT
|
TDB_LOCK_NOCHECK
)
==
-
1
)
{
if
(
tdb_lock_open
(
tdb
,
TDB_LOCK_WAIT
|
TDB_LOCK_NOCHECK
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_ERROR
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" failed to get open lock
\n
"
);
" failed to get open lock"
);
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
return
-
1
;
return
-
1
;
}
}
...
@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
...
@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
if
(
!
(
tdb
->
flags
&
TDB_NOSYNC
))
{
if
(
!
(
tdb
->
flags
&
TDB_NOSYNC
))
{
/* write the recovery data to the end of the file */
/* write the recovery data to the end of the file */
if
(
transaction_setup_recovery
(
tdb
,
&
tdb
->
transaction
->
magic_offset
)
==
-
1
)
{
if
(
transaction_setup_recovery
(
tdb
,
&
tdb
->
transaction
->
magic_offset
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" failed to setup recovery data
\n
"
);
" failed to setup recovery data"
);
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
return
-
1
;
return
-
1
;
}
}
...
@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
...
@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
/* Restore original map size for tdb_expand_file */
/* Restore original map size for tdb_expand_file */
tdb
->
map_size
=
tdb
->
transaction
->
old_map_size
;
tdb
->
map_size
=
tdb
->
transaction
->
old_map_size
;
if
(
methods
->
expand_file
(
tdb
,
add
)
==
-
1
)
{
if
(
methods
->
expand_file
(
tdb
,
add
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_ERROR
,
"tdb_transaction_prepare_commit:"
"tdb_transaction_prepare_commit:"
" expansion failed
\n
"
);
" expansion failed"
);
_tdb_transaction_cancel
(
tdb
);
_tdb_transaction_cancel
(
tdb
);
return
-
1
;
return
-
1
;
}
}
...
@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
...
@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
int
i
;
int
i
;
if
(
tdb
->
transaction
==
NULL
)
{
if
(
tdb
->
transaction
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_EINVAL
;
tdb_logerr
(
tdb
,
TDB_ERR_EINVAL
,
TDB_DEBUG_ERROR
,
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
"tdb_transaction_commit: no transaction"
);
"tdb_transaction_commit: no transaction
\n
"
);
return
-
1
;
return
-
1
;
}
}
tdb_trace
(
tdb
,
"tdb_transaction_commit"
);
tdb_trace
(
tdb
,
"tdb_transaction_commit"
);
if
(
tdb
->
transaction
->
transaction_error
)
{
if
(
tdb
->
transaction
->
transaction_error
)
{
tdb
->
ecode
=
TDB_ERR_IO
;
tdb_transaction_cancel
(
tdb
);
tdb_transaction_cancel
(
tdb
);
tdb
->
log
(
tdb
,
TDB_DEBUG_ERROR
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_IO
,
TDB_DEBUG_ERROR
,
"tdb_transaction_commit: transaction error pending
\n
"
);
"tdb_transaction_commit:"
" transaction error pending"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
...
@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
if
(
methods
->
write
(
tdb
,
offset
,
tdb
->
transaction
->
blocks
[
i
],
if
(
methods
->
write
(
tdb
,
offset
,
tdb
->
transaction
->
blocks
[
i
],
length
)
==
-
1
)
{
length
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_commit:"
"tdb_transaction_commit:"
" write failed during commit
\n
"
);
" write failed during commit
"
);
/* we've overwritten part of the data and
/* we've overwritten part of the data and
possibly expanded the file, so we need to
possibly expanded the file, so we need to
...
@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* find the recovery area */
/* find the recovery area */
recovery_head
=
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
));
recovery_head
=
tdb_read_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
));
if
(
recovery_head
==
TDB_OFF_ERR
)
{
if
(
recovery_head
==
TDB_OFF_ERR
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to read recovery head
\n
"
);
" failed to read recovery head"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
/* read the recovery record */
/* read the recovery record */
if
(
tdb_read_convert
(
tdb
,
recovery_head
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
{
if
(
tdb_read_convert
(
tdb
,
recovery_head
,
&
rec
,
sizeof
(
rec
))
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to read recovery record
\n
"
);
" failed to read recovery record
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
}
}
if
(
tdb
->
read_only
)
{
if
(
tdb
->
read_only
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb_logerr
(
tdb
,
TDB_ERR_CORRUPT
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" attempt to recover read only database
\n
"
);
" attempt to recover read only database"
);
tdb
->
ecode
=
TDB_ERR_CORRUPT
;
return
-
1
;
return
-
1
;
}
}
...
@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
data
=
(
unsigned
char
*
)
malloc
(
rec
.
len
);
data
=
(
unsigned
char
*
)
malloc
(
rec
.
len
);
if
(
data
==
NULL
)
{
if
(
data
==
NULL
)
{
tdb
->
ecode
=
TDB_ERR_OOM
;
tdb_logerr
(
tdb
,
TDB_ERR_OOM
,
TDB_DEBUG_FATAL
,
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to allocate recovery data"
);
" failed to allocate recovery data
\n
"
);
return
-
1
;
return
-
1
;
}
}
/* read the full recovery data */
/* read the full recovery data */
if
(
tdb
->
methods
->
read
(
tdb
,
recovery_head
+
sizeof
(
rec
),
data
,
if
(
tdb
->
methods
->
read
(
tdb
,
recovery_head
+
sizeof
(
rec
),
data
,
rec
.
len
)
==
-
1
)
{
rec
.
len
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to read recovery data
\n
"
);
" failed to read recovery data
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if
(
tdb
->
methods
->
write
(
tdb
,
ofs
,
p
,
len
)
==
-
1
)
{
if
(
tdb
->
methods
->
write
(
tdb
,
ofs
,
p
,
len
)
==
-
1
)
{
free
(
data
);
free
(
data
);
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to recover %zu bytes at offset %zu
\n
"
,
" failed to recover %zu bytes at offset %zu"
,
(
size_t
)
len
,
(
size_t
)
ofs
);
(
size_t
)
len
,
(
size_t
)
ofs
);
return
-
1
;
return
-
1
;
}
}
...
@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
free
(
data
);
free
(
data
);
if
(
transaction_sync
(
tdb
,
0
,
tdb
->
map_size
)
==
-
1
)
{
if
(
transaction_sync
(
tdb
,
0
,
tdb
->
map_size
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover: failed to sync recovery
\n
"
);
"tdb_transaction_recover: failed to sync recovery
"
);
return
-
1
;
return
-
1
;
}
}
...
@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
if
(
recovery_eof
<=
recovery_head
)
{
if
(
recovery_eof
<=
recovery_head
)
{
if
(
tdb_write_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
0
)
if
(
tdb_write_off
(
tdb
,
offsetof
(
struct
tdb_header
,
recovery
),
0
)
==
-
1
)
{
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to remove recovery head
\n
"
);
" failed to remove recovery head"
);
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
...
@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
recovery_head
recovery_head
+
offsetof
(
struct
tdb_recovery_record
,
magic
),
+
offsetof
(
struct
tdb_recovery_record
,
magic
),
TDB_RECOVERY_INVALID_MAGIC
)
==
-
1
)
{
TDB_RECOVERY_INVALID_MAGIC
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover:"
"tdb_transaction_recover:"
" failed to remove recovery magic
\n
"
);
" failed to remove recovery magic"
);
return
-
1
;
return
-
1
;
}
}
if
(
transaction_sync
(
tdb
,
0
,
recovery_eof
)
==
-
1
)
{
if
(
transaction_sync
(
tdb
,
0
,
recovery_eof
)
==
-
1
)
{
tdb
->
log
(
tdb
,
TDB_DEBUG_FATAL
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
tdb
->
ecode
,
TDB_DEBUG_FATAL
,
"tdb_transaction_recover: failed to sync2 recovery
\n
"
);
"tdb_transaction_recover: failed to sync2 recovery"
);
return
-
1
;
return
-
1
;
}
}
tdb
->
log
(
tdb
,
TDB_DEBUG_TRACE
,
tdb
->
log_priv
,
tdb
_logerr
(
tdb
,
TDB_SUCCESS
,
TDB_DEBUG_TRACE
,
"tdb_transaction_recover: recovered %zu byte database
\n
"
,
"tdb_transaction_recover: recovered %zu byte database
"
,
(
size_t
)
recovery_eof
);
(
size_t
)
recovery_eof
);
/* all done */
/* all done */
return
0
;
return
0
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment