Commit f3dc3fde authored by Zardosht Kasheff's avatar Zardosht Kasheff Committed by Yoni Fogel

refs #5842, merge Direct I/O to main

git-svn-id: file:///svn/toku/tokudb@52238 c7de825b-a66e-492c-adef-691d508d4ae1
parent bba428fd
...@@ -693,6 +693,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) { ...@@ -693,6 +693,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
printf("int log_compare (const DB_LSN*, const DB_LSN *) %s;\n", VISIBLE); printf("int log_compare (const DB_LSN*, const DB_LSN *) %s;\n", VISIBLE);
printf("int toku_set_trace_file (const char *fname) %s;\n", VISIBLE); printf("int toku_set_trace_file (const char *fname) %s;\n", VISIBLE);
printf("int toku_close_trace_file (void) %s;\n", VISIBLE); printf("int toku_close_trace_file (void) %s;\n", VISIBLE);
printf("void db_env_set_direct_io (bool direct_io_on) %s;\n", VISIBLE);
printf("void db_env_set_func_fsync (int (*)(int)) %s;\n", VISIBLE); printf("void db_env_set_func_fsync (int (*)(int)) %s;\n", VISIBLE);
printf("void db_env_set_func_free (void (*)(void*)) %s;\n", VISIBLE); printf("void db_env_set_func_free (void (*)(void*)) %s;\n", VISIBLE);
printf("void db_env_set_func_malloc (void *(*)(size_t)) %s;\n", VISIBLE); printf("void db_env_set_func_malloc (void *(*)(size_t)) %s;\n", VISIBLE);
......
...@@ -53,6 +53,9 @@ include(CheckSymbolExists) ...@@ -53,6 +53,9 @@ include(CheckSymbolExists)
check_symbol_exists(M_MMAP_THRESHOLD "malloc.h" HAVE_M_MMAP_THRESHOLD) check_symbol_exists(M_MMAP_THRESHOLD "malloc.h" HAVE_M_MMAP_THRESHOLD)
## check whether we have CLOCK_REALTIME ## check whether we have CLOCK_REALTIME
check_symbol_exists(CLOCK_REALTIME "time.h" HAVE_CLOCK_REALTIME) check_symbol_exists(CLOCK_REALTIME "time.h" HAVE_CLOCK_REALTIME)
## check how to do direct I/O
check_symbol_exists(O_DIRECT "fcntl.h" HAVE_O_DIRECT)
check_symbol_exists(F_NOCACHE "fcntl.h" HAVE_F_NOCACHE)
include(CheckFunctionExists) include(CheckFunctionExists)
......
...@@ -108,7 +108,7 @@ set_ldflags_if_supported( ...@@ -108,7 +108,7 @@ set_ldflags_if_supported(
## set extra debugging flags and preprocessor definitions ## set extra debugging flags and preprocessor definitions
set(CMAKE_C_FLAGS_DEBUG "-g3 -O0 ${CMAKE_C_FLAGS_DEBUG}") set(CMAKE_C_FLAGS_DEBUG "-g3 -O0 ${CMAKE_C_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0 ${CMAKE_CXX_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0 ${CMAKE_CXX_FLAGS_DEBUG}")
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2) #set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2)
## set extra release flags ## set extra release flags
if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang) if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)
......
...@@ -58,6 +58,7 @@ block_allocator_print (BLOCK_ALLOCATOR ba) { ...@@ -58,6 +58,7 @@ block_allocator_print (BLOCK_ALLOCATOR ba) {
void void
create_block_allocator (BLOCK_ALLOCATOR *ba, uint64_t reserve_at_beginning, uint64_t alignment) { create_block_allocator (BLOCK_ALLOCATOR *ba, uint64_t reserve_at_beginning, uint64_t alignment) {
assert(alignment>=512 && 0==(alignment%512)); // the alignment must be at least 512 and aligned with 512 to make DIRECT_IO happy.
BLOCK_ALLOCATOR XMALLOC(result); BLOCK_ALLOCATOR XMALLOC(result);
result->reserve_at_beginning = reserve_at_beginning; result->reserve_at_beginning = reserve_at_beginning;
result->alignment = alignment; result->alignment = alignment;
...@@ -176,8 +177,9 @@ align (uint64_t value, BLOCK_ALLOCATOR ba) ...@@ -176,8 +177,9 @@ align (uint64_t value, BLOCK_ALLOCATOR ba)
return ((value+ba->alignment-1)/ba->alignment)*ba->alignment; return ((value+ba->alignment-1)/ba->alignment)*ba->alignment;
} }
void void block_allocator_alloc_block(BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset)
block_allocator_alloc_block (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset) { // Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
{
invariant(size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block. invariant(size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block.
grow_blocks_array(ba); grow_blocks_array(ba);
ba->n_bytes_in_use += size; ba->n_bytes_in_use += size;
......
...@@ -459,9 +459,11 @@ pair_is_unallocated(struct block_translation_pair *pair) { ...@@ -459,9 +459,11 @@ pair_is_unallocated(struct block_translation_pair *pair) {
return pair->size == 0 && pair->u.diskoff == diskoff_unused; return pair->size == 0 && pair->u.diskoff == diskoff_unused;
} }
// Purpose of this function is to figure out where to put the inprogress btt on disk, allocate space for it there. static void blocknum_alloc_translation_on_disk_unlocked(BLOCK_TABLE bt)
static void // Effect: figure out where to put the inprogress btt on disk, allocate space for it there.
blocknum_alloc_translation_on_disk_unlocked (BLOCK_TABLE bt) { // The space must be 512-byte aligned (both the starting address and the size).
// As a result, the allocated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt.
{
toku_mutex_assert_locked(&bt->mutex); toku_mutex_assert_locked(&bt->mutex);
struct translation *t = &bt->inprogress; struct translation *t = &bt->inprogress;
...@@ -479,24 +481,29 @@ PRNTF("blokAllokator", 1L, size, offset, bt); ...@@ -479,24 +481,29 @@ PRNTF("blokAllokator", 1L, size, offset, bt);
t->block_translation[b.b].size = size; t->block_translation[b.b].size = size;
} }
//Fills wbuf with bt void toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
//A clean shutdown runs checkpoint start so that current and inprogress are copies. int64_t *address, int64_t *size)
void // Effect: Fills wbuf (which starts uninitialized) with bt
toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w, // A clean shutdown runs checkpoint start so that current and inprogress are copies.
int64_t *address, int64_t *size) { // The resulting wbuf buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed)
// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
// It *is* guaranteed that we can read up to the next 512-byte boundary, however
{
lock_for_blocktable(bt); lock_for_blocktable(bt);
struct translation *t = &bt->inprogress; struct translation *t = &bt->inprogress;
BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
blocknum_alloc_translation_on_disk_unlocked(bt); blocknum_alloc_translation_on_disk_unlocked(bt); // The allocated block must be 512-byte aligned to make O_DIRECT happy.
uint64_t size_translation = calculate_size_on_disk(t);
uint64_t size_aligned = roundup_to_multiple(512, size_translation);
assert((int64_t)size_translation==t->block_translation[b.b].size);
{ {
//Init wbuf //Init wbuf
uint64_t size_translation = calculate_size_on_disk(t);
assert((int64_t)size_translation==t->block_translation[b.b].size);
if (0) if (0)
printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff); printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff);
wbuf_init(w, toku_malloc(size_translation), size_translation); char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
assert(w->size==size_translation); for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros.
wbuf_init(w, buf, size_aligned);
} }
wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum);
wbuf_BLOCKNUM(w, t->blocknum_freelist_head); wbuf_BLOCKNUM(w, t->blocknum_freelist_head);
...@@ -510,9 +517,10 @@ toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w, ...@@ -510,9 +517,10 @@ toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
uint32_t checksum = x1764_finish(&w->checksum); uint32_t checksum = x1764_finish(&w->checksum);
wbuf_int(w, checksum); wbuf_int(w, checksum);
*address = t->block_translation[b.b].u.diskoff; *address = t->block_translation[b.b].u.diskoff;
*size = t->block_translation[b.b].size; *size = size_translation;
assert((*address)%512 == 0);
ensure_safe_write_unlocked(bt, fd, *size, *address); ensure_safe_write_unlocked(bt, fd, size_aligned, *address);
unlock_for_blocktable(bt); unlock_for_blocktable(bt);
} }
......
...@@ -3397,6 +3397,20 @@ int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *ft_handle_ ...@@ -3397,6 +3397,20 @@ int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *ft_handle_
return r; return r;
} }
// File-scope switch consulted by ft_open_maybe_direct() when opening
// fractal-tree files. Starts out enabled (true).
static bool use_direct_io = true;

// Turn direct I/O on or off for subsequent opens made through
// ft_open_maybe_direct(). This only stores the flag; it does not touch
// any file descriptor that is already open.
void toku_ft_set_direct_io(bool direct_io_on)
{
    use_direct_io = direct_io_on;
}
// Open `filename` with the given open flags and mode, dispatching on the
// use_direct_io switch: toku_os_open_direct() when it is set, otherwise
// toku_os_open(). The callee's return value is passed through unchanged.
// NOTE(review): presumably toku_os_open_direct adds O_DIRECT (or the
// platform's equivalent) to the open -- confirm against the portability layer.
static inline int ft_open_maybe_direct(const char *filename, int oflag, int mode)
{
    return use_direct_io ? toku_os_open_direct(filename, oflag, mode)
                         : toku_os_open(filename, oflag, mode);
}
// open a file for use by the brt // open a file for use by the brt
// Requires: File does not exist. // Requires: File does not exist.
static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) { static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
...@@ -3404,12 +3418,12 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) { ...@@ -3404,12 +3418,12 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
int r; int r;
int fd; int fd;
int er; int er;
fd = open(fname, O_RDWR | O_BINARY, mode); fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
assert(fd==-1); assert(fd==-1);
if ((er = get_maybe_error_errno()) != ENOENT) { if ((er = get_maybe_error_errno()) != ENOENT) {
return er; return er;
} }
fd = open(fname, O_RDWR | O_CREAT | O_BINARY, mode); fd = ft_open_maybe_direct(fname, O_RDWR | O_CREAT | O_BINARY, mode);
if (fd==-1) { if (fd==-1) {
r = get_error_errno(); r = get_error_errno();
return r; return r;
...@@ -3426,7 +3440,7 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) { ...@@ -3426,7 +3440,7 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
static int ft_open_file(const char *fname, int *fdp) { static int ft_open_file(const char *fname, int *fdp) {
mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO; mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO;
int fd; int fd;
fd = open(fname, O_RDWR | O_BINARY, mode); fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
if (fd==-1) { if (fd==-1) {
return get_error_errno(); return get_error_errno();
} }
......
...@@ -257,4 +257,5 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen); ...@@ -257,4 +257,5 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);
extern bool garbage_collection_debug; extern bool garbage_collection_debug;
void toku_ft_set_direct_io(bool direct_io_on);
#endif #endif
...@@ -39,8 +39,11 @@ void ...@@ -39,8 +39,11 @@ void
toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset) { toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset) {
// make the checksum // make the checksum
int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
int64_t size_aligned = roundup_to_multiple(512, size);
struct wbuf w; struct wbuf w;
wbuf_init(&w, toku_xmalloc(size), size); char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
wbuf_init(&w, aligned_buf, size);
toku_serialize_descriptor_contents_to_wbuf(&w, desc); toku_serialize_descriptor_contents_to_wbuf(&w, desc);
{ {
//Add checksum //Add checksum
...@@ -50,7 +53,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF ...@@ -50,7 +53,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF
lazy_assert(w.ndone==w.size); lazy_assert(w.ndone==w.size);
{ {
//Actual Write translation table //Actual Write translation table
toku_os_full_pwrite(fd, w.buf, size, offset); toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
} }
toku_free(w.buf); toku_free(w.buf);
} }
...@@ -88,10 +91,12 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_ ...@@ -88,10 +91,12 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
if (size > 0) { if (size > 0) {
lazy_assert(size>=4); //4 for checksum lazy_assert(size>=4); //4 for checksum
{ {
XMALLOC_N(size, dbuf); ssize_t size_to_malloc = roundup_to_multiple(512, size);
XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
{ {
ssize_t sz_read = toku_os_pread(fd, dbuf, size, offset);
lazy_assert(sz_read==size); ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
lazy_assert(sz_read==size_to_malloc);
} }
{ {
// check the checksum // check the checksum
...@@ -118,9 +123,9 @@ exit: ...@@ -118,9 +123,9 @@ exit:
return r; return r;
} }
// We only deserialize brt header once and then share everything with all the brts. int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
int // Effect: Deserialize the ft header.
deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) // We deserialize brt header only once and then share everything with all the brts.
{ {
int r; int r;
FT ft = NULL; FT ft = NULL;
...@@ -179,14 +184,16 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) ...@@ -179,14 +184,16 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
//Load translation table //Load translation table
{ {
unsigned char *XMALLOC_N(translation_size_on_disk, tbuf); size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
{ {
// This cast is messed up in 32-bits if the block translation // This cast is messed up in 32-bits if the block translation
// table is ever more than 4GB. But in that case, the // table is ever more than 4GB. But in that case, the
// translation table itself won't fit in main memory. // translation table itself won't fit in main memory.
ssize_t readsz = toku_os_pread(fd, tbuf, translation_size_on_disk, ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
translation_address_on_disk); translation_address_on_disk);
lazy_assert(readsz == translation_size_on_disk); assert(readsz >= translation_size_on_disk);
assert(readsz <= (ssize_t)size_to_read);
} }
// Create table and read in data. // Create table and read in data.
r = toku_blocktable_create_from_buffer(fd, r = toku_blocktable_create_from_buffer(fd,
...@@ -427,28 +434,30 @@ serialize_ft_min_size (uint32_t version) { ...@@ -427,28 +434,30 @@ serialize_ft_min_size (uint32_t version) {
return size; return size;
} }
// Simply reading the raw bytes of the header into an rbuf is insensitive int deserialize_ft_from_fd_into_rbuf(int fd,
// to disk format version. If that ever changes, then modify this. toku_off_t offset_of_header,
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
// Effect: Read and parse the header of a fractal tree
// //
// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the // Simply reading the raw bytes of the header into an rbuf is insensitive
// file AND the header is useless // to disk format version. If that ever changes, then modify this.
int //
deserialize_ft_from_fd_into_rbuf(int fd, // TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
toku_off_t offset_of_header, // file AND the header is useless
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
{ {
int r = 0; int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata") const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version 4 + // version
4 + // build_id 4 + // build_id
4; // size 4; // size
unsigned char prefix[prefix_size]; const int64_t read_size = roundup_to_multiple(512, prefix_size);
unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
rb->buf = NULL; rb->buf = NULL;
int64_t n = toku_os_pread(fd, prefix, prefix_size, offset_of_header); int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
if (n != prefix_size) { if (n != read_size) {
if (n==0) { if (n==0) {
r = TOKUDB_DICTIONARY_NO_HEADER; r = TOKUDB_DICTIONARY_NO_HEADER;
} else if (n<0) { } else if (n<0) {
...@@ -504,16 +513,21 @@ deserialize_ft_from_fd_into_rbuf(int fd, ...@@ -504,16 +513,21 @@ deserialize_ft_from_fd_into_rbuf(int fd,
lazy_assert(rb->ndone==prefix_size); lazy_assert(rb->ndone==prefix_size);
rb->size = size; rb->size = size;
XMALLOC_N(rb->size, rb->buf); {
toku_free(rb->buf);
n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header); uint32_t size_to_read = roundup_to_multiple(512, size);
if (n != rb->size) { XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
if (n < 0) {
r = get_error_errno(); assert(offset_of_header%512==0);
} else { n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error. if (n != size_to_read) {
if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
}
goto exit;
} }
goto exit;
} }
//It's version 14 or later. Magic looks OK. //It's version 14 or later. Magic looks OK.
//We have an rbuf that represents the header. //We have an rbuf that represents the header.
...@@ -549,9 +563,7 @@ deserialize_ft_from_fd_into_rbuf(int fd, ...@@ -549,9 +563,7 @@ deserialize_ft_from_fd_into_rbuf(int fd,
exit: exit:
if (r != 0 && rb->buf != NULL) { if (r != 0 && rb->buf != NULL) {
if (rb->buf != prefix) { // don't free prefix, it's stack alloc'd toku_free(rb->buf);
toku_free(rb->buf);
}
rb->buf = NULL; rb->buf = NULL;
} }
return r; return r;
...@@ -718,16 +730,23 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL ...@@ -718,16 +730,23 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
toku_serialize_translation_to_wbuf(blocktable, fd, &w_translation, toku_serialize_translation_to_wbuf(blocktable, fd, &w_translation,
&address_translation, &address_translation,
&size_translation); &size_translation);
lazy_assert(size_translation == w_translation.size); assert(size_translation == w_translation.ndone); // the bytes written are the size
assert(w_translation.size % 512 == 0); // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
struct wbuf w_main; struct wbuf w_main;
size_t size_main = toku_serialize_ft_size(h); size_t size_main = toku_serialize_ft_size(h);
wbuf_init(&w_main, toku_xmalloc(size_main), size_main); size_t size_main_aligned = roundup_to_multiple(512, size_main);
assert(size_main_aligned<BLOCK_ALLOCATOR_HEADER_RESERVE);
char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
wbuf_init(&w_main, mainbuf, size_main);
toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation); toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
lazy_assert(w_main.ndone == size_main); lazy_assert(w_main.ndone == size_main);
//Actual Write translation table // Actually write translation table
toku_os_full_pwrite(fd, w_translation.buf, size_translation, address_translation); // This write is guaranteed to read good data at the end of the buffer, since the
// w_translation.buf is padded with zeros to a 512-byte boundary.
toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
//Everything but the header MUST be on disk before header starts. //Everything but the header MUST be on disk before header starts.
//Otherwise we will think the header is good and some blocks might not //Otherwise we will think the header is good and some blocks might not
...@@ -746,7 +765,7 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL ...@@ -746,7 +765,7 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
// Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
toku_off_t main_offset; toku_off_t main_offset;
main_offset = (h->checkpoint_count & 0x1) ? 0 : BLOCK_ALLOCATOR_HEADER_RESERVE; main_offset = (h->checkpoint_count & 0x1) ? 0 : BLOCK_ALLOCATOR_HEADER_RESERVE;
toku_os_full_pwrite(fd, w_main.buf, w_main.ndone, main_offset); toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
toku_free(w_main.buf); toku_free(w_main.buf);
toku_free(w_translation.buf); toku_free(w_translation.buf);
} }
...@@ -134,7 +134,9 @@ toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int ...@@ -134,7 +134,9 @@ toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int
to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width); to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width);
} }
if (to_write > 0) { if (to_write > 0) {
char *XCALLOC_N(to_write, wbuf); assert(to_write%512==0);
char *XMALLOC_N_ALIGNED(512, to_write, wbuf);
memset(wbuf, 0, to_write);
toku_off_t start_write = alignup64(file_size, stripe_width); toku_off_t start_write = alignup64(file_size, stripe_width);
invariant(start_write >= file_size); invariant(start_write >= file_size);
toku_os_full_pwrite(fd, wbuf, to_write, start_write); toku_os_full_pwrite(fd, wbuf, to_write, start_write);
...@@ -773,20 +775,21 @@ serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb, ...@@ -773,20 +775,21 @@ serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb,
st->compress_time += t2 - t1; st->compress_time += t2 - t1;
} }
// Writes out each child to a separate malloc'd buffer, then compresses int toku_serialize_ftnode_to_memory(FTNODE node,
// all of them, and writes the uncompressed header, to bytes_to_write, FTNODE_DISK_DATA* ndd,
// which is malloc'd. unsigned int basementnodesize,
enum toku_compression_method compression_method,
bool do_rebalancing,
bool in_parallel, // for loader is true, for toku_ftnode_flush_callback, is false
/*out*/ size_t *n_bytes_to_write,
/*out*/ size_t *n_uncompressed_bytes,
/*out*/ char **bytes_to_write)
// Effect: Writes out each child to a separate malloc'd buffer, then compresses
// all of them, and writes the uncompressed header, to bytes_to_write,
// which is malloc'd.
// //
int // The resulting buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
toku_serialize_ftnode_to_memory (FTNODE node, // 512-byte padding is for O_DIRECT to work.
FTNODE_DISK_DATA* ndd,
unsigned int basementnodesize,
enum toku_compression_method compression_method,
bool do_rebalancing,
bool in_parallel, // for loader is true, for toku_ftnode_flush_callback, is false
/*out*/ size_t *n_bytes_to_write,
/*out*/ size_t *n_uncompressed_bytes,
/*out*/ char **bytes_to_write)
{ {
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
...@@ -849,7 +852,9 @@ toku_serialize_ftnode_to_memory (FTNODE node, ...@@ -849,7 +852,9 @@ toku_serialize_ftnode_to_memory (FTNODE node,
total_uncompressed_size += sb[i].uncompressed_size + 4; total_uncompressed_size += sb[i].uncompressed_size + 4;
} }
char *XMALLOC_N(total_node_size, data); uint32_t total_buffer_size = roundup_to_multiple(512, total_node_size); // round the buffer size up to a multiple of 512 bytes.
char *XMALLOC_N_ALIGNED(512, total_buffer_size, data);
char *curr_ptr = data; char *curr_ptr = data;
// now create the final serialized node // now create the final serialized node
...@@ -874,9 +879,14 @@ toku_serialize_ftnode_to_memory (FTNODE node, ...@@ -874,9 +879,14 @@ toku_serialize_ftnode_to_memory (FTNODE node,
*(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum); *(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum);
curr_ptr += sizeof(sb[i].xsum); curr_ptr += sizeof(sb[i].xsum);
} }
// Zero the rest of the buffer
for (uint32_t i=total_node_size; i<total_buffer_size; i++) {
data[i]=0;
}
assert(curr_ptr - data == total_node_size); assert(curr_ptr - data == total_node_size);
*bytes_to_write = data; *bytes_to_write = data;
*n_bytes_to_write = total_node_size; *n_bytes_to_write = total_buffer_size;
*n_uncompressed_bytes = total_uncompressed_size; *n_uncompressed_bytes = total_uncompressed_size;
// //
...@@ -890,6 +900,8 @@ toku_serialize_ftnode_to_memory (FTNODE node, ...@@ -890,6 +900,8 @@ toku_serialize_ftnode_to_memory (FTNODE node,
toku_free(sb[i].uncompressed_ptr); toku_free(sb[i].uncompressed_ptr);
} }
assert(0 == (*n_bytes_to_write)%512);
assert(0 == ((unsigned long long)(*bytes_to_write))%512);
toku_free(sb); toku_free(sb);
return 0; return 0;
} }
...@@ -1152,11 +1164,13 @@ void read_block_from_fd_into_rbuf( ...@@ -1152,11 +1164,13 @@ void read_block_from_fd_into_rbuf(
// get the file offset and block size for the block // get the file offset and block size for the block
DISKOFF offset, size; DISKOFF offset, size;
toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, &size); toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, &size);
uint8_t *XMALLOC_N(size, raw_block); DISKOFF size_aligned = roundup_to_multiple(512, size);
uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
rbuf_init(rb, raw_block, size); rbuf_init(rb, raw_block, size);
// read the block // read the block
ssize_t rlen = toku_os_pread(fd, raw_block, size, offset); ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
lazy_assert((DISKOFF)rlen == size); assert((DISKOFF)rlen >= size);
assert((DISKOFF)rlen <= size_aligned);
} }
static const int read_header_heuristic_max = 32*1024; static const int read_header_heuristic_max = 32*1024;
...@@ -1170,8 +1184,8 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN ...@@ -1170,8 +1184,8 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN
{ {
DISKOFF offset, size; DISKOFF offset, size;
toku_translate_blocknum_to_offset_size(ft->blocktable, blocknum, &offset, &size); toku_translate_blocknum_to_offset_size(ft->blocktable, blocknum, &offset, &size);
DISKOFF read_size = MIN(read_header_heuristic_max, size); DISKOFF read_size = roundup_to_multiple(512, MIN(read_header_heuristic_max, size));
uint8_t *XMALLOC_N(size, raw_block); uint8_t *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, size), raw_block);
rbuf_init(rb, raw_block, read_size); rbuf_init(rb, raw_block, read_size);
// read the block // read the block
...@@ -2418,14 +2432,20 @@ toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, i ...@@ -2418,14 +2432,20 @@ toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, i
uint32_t curr_size = BP_SIZE (ndd, childnum); uint32_t curr_size = BP_SIZE (ndd, childnum);
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
uint8_t *XMALLOC_N(curr_size, raw_block); uint32_t pad_at_beginning = (node_offset+curr_offset)%512;
rbuf_init(&rb, raw_block, curr_size); uint32_t padded_size = roundup_to_multiple(512, pad_at_beginning + curr_size);
uint8_t *XMALLOC_N_ALIGNED(512, padded_size, raw_block);
rbuf_init(&rb, pad_at_beginning+raw_block, curr_size);
tokutime_t t0 = toku_time_now(); tokutime_t t0 = toku_time_now();
// read // read the block
ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset); assert(0==((unsigned long long)raw_block)%512); // for O_DIRECT
lazy_assert((DISKOFF)rlen == curr_size); assert(0==(padded_size)%512);
assert(0==(node_offset+curr_offset-pad_at_beginning)%512);
ssize_t rlen = toku_os_pread(fd, raw_block, padded_size, node_offset+curr_offset-pad_at_beginning);
assert((DISKOFF)rlen >= pad_at_beginning + curr_size); // we read in at least enough to get what we wanted
assert((DISKOFF)rlen <= padded_size); // we didn't read in too much.
tokutime_t t1 = toku_time_now(); tokutime_t t1 = toku_time_now();
...@@ -2627,12 +2647,14 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf, ...@@ -2627,12 +2647,14 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
struct sub_block sub_block[/*n_sub_blocks*/], struct sub_block sub_block[/*n_sub_blocks*/],
enum toku_compression_method method, enum toku_compression_method method,
/*out*/ size_t *n_bytes_to_write, /*out*/ size_t *n_bytes_to_write,
/*out*/ char **bytes_to_write) { /*out*/ char **bytes_to_write)
// Guarantees that the malloc'd BYTES_TO_WRITE is 512-byte aligned (so that O_DIRECT will work)
{
// allocate space for the compressed uncompressed_buf // allocate space for the compressed uncompressed_buf
size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method); size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method);
size_t sub_block_header_len = sub_block_header_size(n_sub_blocks); size_t sub_block_header_len = sub_block_header_size(n_sub_blocks);
size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum
char *XMALLOC_N(header_len + compressed_len, compressed_buf); char *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, header_len + compressed_len), compressed_buf);
// copy the header // copy the header
memcpy(compressed_buf, uncompressed_buf, node_header_overhead); memcpy(compressed_buf, uncompressed_buf, node_header_overhead);
...@@ -2662,7 +2684,12 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf, ...@@ -2662,7 +2684,12 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
uint32_t xsum = x1764_memory(compressed_buf, header_length); uint32_t xsum = x1764_memory(compressed_buf, header_length);
*ptr = toku_htod32(xsum); *ptr = toku_htod32(xsum);
*n_bytes_to_write = header_len + compressed_len; uint32_t padded_len = roundup_to_multiple(512, header_len + compressed_len);
// Zero out padding.
for (uint32_t i = header_len+compressed_len; i < padded_len; i++) {
compressed_buf[i] = 0;
}
*n_bytes_to_write = padded_len;
*bytes_to_write = compressed_buf; *bytes_to_write = compressed_buf;
} }
...@@ -2933,11 +2960,13 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, ...@@ -2933,11 +2960,13 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
int r = 0; int r = 0;
if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b); if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
uint8_t *XMALLOC_N(size, raw_block); DISKOFF size_aligned = roundup_to_multiple(512, size);
uint8_t *XMALLOC_N_ALIGNED(512, size, raw_block);
{ {
// read the (partially compressed) block // read the (partially compressed) block
ssize_t rlen = toku_os_pread(fd, raw_block, size, offset); ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
lazy_assert((DISKOFF)rlen == size); lazy_assert((DISKOFF)rlen >= size);
lazy_assert((DISKOFF)rlen <= size_aligned);
} }
// get the layout_version // get the layout_version
int layout_version; int layout_version;
......
...@@ -2102,6 +2102,7 @@ static void allocate_node (struct subtrees_info *sts, int64_t b) { ...@@ -2102,6 +2102,7 @@ static void allocate_node (struct subtrees_info *sts, int64_t b) {
sts->n_subtrees++; sts->n_subtrees++;
} }
// dbuf will always contain a 512-byte aligned buffer, but the length might not be a multiple of 512 bytes. If that's what you want, then pad it.
struct dbuf { struct dbuf {
unsigned char *buf; unsigned char *buf;
int buflen; int buflen;
...@@ -2225,7 +2226,7 @@ static void putbuf_bytes (struct dbuf *dbuf, const void *bytes, int nbytes) { ...@@ -2225,7 +2226,7 @@ static void putbuf_bytes (struct dbuf *dbuf, const void *bytes, int nbytes) {
int oldbuflen = dbuf->buflen; int oldbuflen = dbuf->buflen;
dbuf->buflen += dbuf->off + nbytes; dbuf->buflen += dbuf->off + nbytes;
dbuf->buflen *= 2; dbuf->buflen *= 2;
REALLOC_N(dbuf->buflen, dbuf->buf); REALLOC_N_ALIGNED(512, dbuf->buflen, dbuf->buf);
if (dbuf->buf == NULL) { if (dbuf->buf == NULL) {
dbuf->error = get_error_errno(); dbuf->error = get_error_errno();
dbuf->buf = oldbuf; dbuf->buf = oldbuf;
...@@ -2905,9 +2906,17 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla ...@@ -2905,9 +2906,17 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
} }
unsigned int checksum = x1764_memory(ttable.buf, ttable.off); unsigned int checksum = x1764_memory(ttable.buf, ttable.off);
putbuf_int32(&ttable, checksum); putbuf_int32(&ttable, checksum);
// pad it with zeros up to the next multiple of 512 bytes
long long encoded_length = ttable.off;
{
int nbytes_to_add = roundup_to_multiple(512, ttable.off) - encoded_length;
char zeros[nbytes_to_add];
for (int i=0; i<nbytes_to_add; i++) zeros[i]=0;
putbuf_bytes(&ttable, zeros, nbytes_to_add);
}
int result = ttable.error; int result = ttable.error;
if (result == 0) { if (result == 0) {
invariant(bt_size_on_disk==ttable.off); invariant(bt_size_on_disk==encoded_length);
result = toku_os_pwrite(out->fd, ttable.buf, ttable.off, off_of_translation); result = toku_os_pwrite(out->fd, ttable.buf, ttable.off, off_of_translation);
} }
dbuf_destroy(&ttable); dbuf_destroy(&ttable);
...@@ -2919,18 +2928,22 @@ static int ...@@ -2919,18 +2928,22 @@ static int
write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) { write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) {
int result = 0; int result = 0;
size_t size = toku_serialize_ft_size(out->h->h); size_t size = toku_serialize_ft_size(out->h->h);
size_t alloced_size = roundup_to_multiple(512, size);
struct wbuf wbuf; struct wbuf wbuf;
char *MALLOC_N(size, buf); char *MALLOC_N_ALIGNED(512, alloced_size, buf);
if (buf == NULL) { if (buf == NULL) {
result = get_error_errno(); result = get_error_errno();
} else { } else {
wbuf_init(&wbuf, buf, size); wbuf_init(&wbuf, buf, size);
out->h->h->on_disk_stats = out->h->in_memory_stats; out->h->h->on_disk_stats = out->h->in_memory_stats;
toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk); toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk);
for (size_t i=size; i<alloced_size; i++) buf[i]=0; // initialize all those unused spots to zero
if (wbuf.ndone != size) if (wbuf.ndone != size)
result = EINVAL; result = EINVAL;
else else {
result = toku_os_pwrite(out->fd, wbuf.buf, wbuf.ndone, 0); assert(wbuf.ndone <= alloced_size);
result = toku_os_pwrite(out->fd, wbuf.buf, alloced_size, 0);
}
toku_free(buf); toku_free(buf);
} }
return result; return result;
......
...@@ -8,19 +8,22 @@ ...@@ -8,19 +8,22 @@
static void ba_alloc_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset) { static void ba_alloc_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset) {
block_allocator_validate(ba); block_allocator_validate(ba);
block_allocator_alloc_block_at(ba, size, offset); block_allocator_alloc_block_at(ba, size*512, offset*512);
block_allocator_validate(ba); block_allocator_validate(ba);
} }
static void ba_alloc (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *answer) { static void ba_alloc (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *answer) {
block_allocator_validate(ba); block_allocator_validate(ba);
block_allocator_alloc_block(ba, size, answer); uint64_t actual_answer;
block_allocator_alloc_block(ba, 512*size, &actual_answer);
block_allocator_validate(ba); block_allocator_validate(ba);
assert(actual_answer%512==0);
*answer = actual_answer/512;
} }
static void ba_free (BLOCK_ALLOCATOR ba, uint64_t offset) { static void ba_free (BLOCK_ALLOCATOR ba, uint64_t offset) {
block_allocator_validate(ba); block_allocator_validate(ba);
block_allocator_free_block(ba, offset); block_allocator_free_block(ba, offset*512);
block_allocator_validate(ba); block_allocator_validate(ba);
} }
...@@ -30,8 +33,8 @@ ba_check_l (BLOCK_ALLOCATOR ba, uint64_t blocknum_in_layout_order, uint64_t expe ...@@ -30,8 +33,8 @@ ba_check_l (BLOCK_ALLOCATOR ba, uint64_t blocknum_in_layout_order, uint64_t expe
uint64_t actual_offset, actual_size; uint64_t actual_offset, actual_size;
int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size); int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==0); assert(r==0);
assert(expected_offset == actual_offset); assert(expected_offset*512 == actual_offset);
assert(expected_size == actual_size); assert(expected_size *512 == actual_size);
} }
static void static void
...@@ -48,10 +51,10 @@ static void ...@@ -48,10 +51,10 @@ static void
test_ba0 (void) { test_ba0 (void) {
BLOCK_ALLOCATOR ba; BLOCK_ALLOCATOR ba;
uint64_t b0, b1; uint64_t b0, b1;
create_block_allocator(&ba, 100, 1); create_block_allocator(&ba, 100*512, 1*512);
assert(block_allocator_allocated_limit(ba)==100); assert(block_allocator_allocated_limit(ba)==100*512);
ba_alloc_at(ba, 50, 100); ba_alloc_at(ba, 50, 100);
assert(block_allocator_allocated_limit(ba)==150); assert(block_allocator_allocated_limit(ba)==150*512);
ba_alloc_at(ba, 25, 150); ba_alloc_at(ba, 25, 150);
ba_alloc (ba, 10, &b0); ba_alloc (ba, 10, &b0);
ba_check_l (ba, 0, 0, 100); ba_check_l (ba, 0, 0, 100);
...@@ -66,9 +69,9 @@ test_ba0 (void) { ...@@ -66,9 +69,9 @@ test_ba0 (void) {
assert(b0==160); assert(b0==160);
ba_alloc(ba, 10, &b0); ba_alloc(ba, 10, &b0);
ba_alloc(ba, 113, &b1); ba_alloc(ba, 113, &b1);
assert(113==block_allocator_block_size(ba, b1)); assert(113*512==block_allocator_block_size(ba, b1 *512));
assert(10==block_allocator_block_size(ba, b0)); assert(10 *512==block_allocator_block_size(ba, b0 *512));
assert(50==block_allocator_block_size(ba, 100)); assert(50 *512==block_allocator_block_size(ba, 100*512));
uint64_t b2, b3, b4, b5, b6, b7; uint64_t b2, b3, b4, b5, b6, b7;
ba_alloc(ba, 100, &b2); ba_alloc(ba, 100, &b2);
...@@ -103,7 +106,7 @@ test_ba0 (void) { ...@@ -103,7 +106,7 @@ test_ba0 (void) {
static void static void
test_ba1 (int n_initial) { test_ba1 (int n_initial) {
BLOCK_ALLOCATOR ba; BLOCK_ALLOCATOR ba;
create_block_allocator(&ba, 0, 1); create_block_allocator(&ba, 0*512, 1*512);
int i; int i;
int n_blocks=0; int n_blocks=0;
uint64_t blocks[1000]; uint64_t blocks[1000];
...@@ -136,8 +139,8 @@ test_ba2 (void) ...@@ -136,8 +139,8 @@ test_ba2 (void)
BLOCK_ALLOCATOR ba; BLOCK_ALLOCATOR ba;
uint64_t b[6]; uint64_t b[6];
enum { BSIZE = 1024 }; enum { BSIZE = 1024 };
create_block_allocator(&ba, 100, BSIZE); create_block_allocator(&ba, 100*512, BSIZE*512);
assert(block_allocator_allocated_limit(ba)==100); assert(block_allocator_allocated_limit(ba)==100*512);
ba_check_l (ba, 0, 0, 100); ba_check_l (ba, 0, 0, 100);
ba_check_none (ba, 1); ba_check_none (ba, 1);
......
...@@ -188,7 +188,7 @@ static void *my_malloc(size_t n) { ...@@ -188,7 +188,7 @@ static void *my_malloc(size_t n) {
} }
} }
} }
return malloc(n); return os_malloc(n);
} }
static int do_realloc_errors = 1; static int do_realloc_errors = 1;
...@@ -207,7 +207,7 @@ static void *my_realloc(void *p, size_t n) { ...@@ -207,7 +207,7 @@ static void *my_realloc(void *p, size_t n) {
} }
} }
} }
return realloc(p, n); return os_realloc(p, n);
} }
......
...@@ -28,7 +28,7 @@ static void *my_malloc(size_t n) { ...@@ -28,7 +28,7 @@ static void *my_malloc(size_t n) {
errno = ENOSPC; errno = ENOSPC;
return NULL; return NULL;
} else } else
return malloc(n); return os_malloc(n);
} }
static int my_compare(DB *UU(desc), const DBT *UU(akey), const DBT *UU(bkey)) { static int my_compare(DB *UU(desc), const DBT *UU(akey), const DBT *UU(bkey)) {
......
...@@ -17,6 +17,7 @@ target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT ...@@ -17,6 +17,7 @@ target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT
add_library(tokuportability_static_conv STATIC ${tokuportability_srcs}) add_library(tokuportability_static_conv STATIC ${tokuportability_srcs})
set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_dependencies(tokuportability_static_conv build_jemalloc)
set(tokuportability_source_libs tokuportability_static_conv jemalloc ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS}) set(tokuportability_source_libs tokuportability_static_conv jemalloc ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}") merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}")
......
...@@ -196,6 +196,8 @@ toku_os_write (int fd, const void *buf, size_t len) { ...@@ -196,6 +196,8 @@ toku_os_write (int fd, const void *buf, size_t len) {
void void
toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) { toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
assert(0==((long long)buf)%512);
assert((len%512 == 0) && (off%512)==0); // to make pwrite work.
const char *bp = (const char *) buf; const char *bp = (const char *) buf;
while (len > 0) { while (len > 0) {
ssize_t r; ssize_t r;
...@@ -218,6 +220,9 @@ toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) { ...@@ -218,6 +220,9 @@ toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
ssize_t ssize_t
toku_os_pwrite (int fd, const void *buf, size_t len, toku_off_t off) { toku_os_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
assert(0==((long long)buf)%512); // these asserts are to ensure that direct I/O will work.
assert(0==len %512);
assert(0==off %512);
const char *bp = (const char *) buf; const char *bp = (const char *) buf;
ssize_t result = 0; ssize_t result = 0;
while (len > 0) { while (len > 0) {
...@@ -269,6 +274,25 @@ toku_os_open(const char *path, int oflag, int mode) { ...@@ -269,6 +274,25 @@ toku_os_open(const char *path, int oflag, int mode) {
return rval; return rval;
} }
int
toku_os_open_direct(const char *path, int oflag, int mode) {
int rval;
#if defined(HAVE_O_DIRECT)
rval = toku_os_open(path, oflag | O_DIRECT, mode);
#elif defined(HAVE_F_NOCACHE)
rval = toku_os_open(path, oflag, mode);
if (rval >= 0) {
int r = fcntl(rval, F_NOCACHE, 1);
if (r == -1) {
perror("setting F_NOCACHE");
}
}
#else
# error "No direct I/O implementation found."
#endif
return rval;
}
int int
toku_os_fclose(FILE * stream) { toku_os_fclose(FILE * stream) {
int rval = -1; int rval = -1;
...@@ -310,6 +334,9 @@ toku_os_read(int fd, void *buf, size_t count) { ...@@ -310,6 +334,9 @@ toku_os_read(int fd, void *buf, size_t count) {
ssize_t ssize_t
toku_os_pread (int fd, void *buf, size_t count, off_t offset) { toku_os_pread (int fd, void *buf, size_t count, off_t offset) {
assert(0==((long long)buf)%512);
assert(0==count%512);
assert(0==offset%512);
ssize_t r; ssize_t r;
if (t_pread) { if (t_pread) {
r = t_pread(fd, buf, count, offset); r = t_pread(fd, buf, count, offset);
......
...@@ -21,17 +21,17 @@ ...@@ -21,17 +21,17 @@
#include <portability/toku_atomic.h> #include <portability/toku_atomic.h>
static malloc_fun_t t_malloc = 0; static malloc_fun_t t_malloc = 0;
static malloc_aligned_fun_t t_malloc_aligned = 0;
static malloc_fun_t t_xmalloc = 0; static malloc_fun_t t_xmalloc = 0;
static malloc_aligned_fun_t t_xmalloc_aligned = 0;
static free_fun_t t_free = 0; static free_fun_t t_free = 0;
static realloc_fun_t t_realloc = 0; static realloc_fun_t t_realloc = 0;
static realloc_aligned_fun_t t_realloc_aligned = 0;
static realloc_fun_t t_xrealloc = 0; static realloc_fun_t t_xrealloc = 0;
static LOCAL_MEMORY_STATUS_S status; static LOCAL_MEMORY_STATUS_S status;
int toku_memory_do_stats = 0; int toku_memory_do_stats = 0;
typedef size_t (*malloc_usable_size_fun_t)(const void *);
static malloc_usable_size_fun_t malloc_usable_size_f;
static bool memory_startup_complete; static bool memory_startup_complete;
int int
...@@ -76,14 +76,6 @@ toku_memory_startup(void) { ...@@ -76,14 +76,6 @@ toku_memory_startup(void) {
} }
} }
malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_usable_size");
if (!malloc_usable_size_f) {
malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_size"); // darwin
if (!malloc_usable_size_f) {
result = EINVAL; // couldn't find a malloc size function
}
}
return result; return result;
} }
...@@ -105,7 +97,7 @@ toku_memory_get_status(LOCAL_MEMORY_STATUS s) { ...@@ -105,7 +97,7 @@ toku_memory_get_status(LOCAL_MEMORY_STATUS s) {
// jemalloc's malloc_usable_size does not work with a NULL pointer, so we implement a version that works // jemalloc's malloc_usable_size does not work with a NULL pointer, so we implement a version that works
static size_t static size_t
my_malloc_usable_size(void *p) { my_malloc_usable_size(void *p) {
return p == NULL ? 0 : malloc_usable_size_f(p); return p == NULL ? 0 : os_malloc_usable_size(p);
} }
// Note that max_in_use may be slightly off because use of max_in_use is not thread-safe. // Note that max_in_use may be slightly off because use of max_in_use is not thread-safe.
...@@ -162,6 +154,23 @@ toku_malloc(size_t size) { ...@@ -162,6 +154,23 @@ toku_malloc(size_t size) {
return p; return p;
} }
// Allocate SIZE bytes aligned to ALIGNMENT, using the installed t_malloc_aligned
// hook when there is one and os_malloc_aligned() otherwise.
// Returns NULL on failure (and bumps status.malloc_fail); on success the usual
// malloc statistics are updated when toku_memory_do_stats is set.
void *toku_malloc_aligned(size_t alignment, size_t size) {
    void *block;
    if (t_malloc_aligned) {
        block = t_malloc_aligned(alignment, size);
    } else {
        block = os_malloc_aligned(alignment, size);
    }

    if (block == NULL) {
        toku_sync_add_and_fetch(&status.malloc_fail, 1);
        return NULL;
    }

    TOKU_ANNOTATE_NEW_MEMORY(block, size); // see #4671 and https://bugs.kde.org/show_bug.cgi?id=297147
    if (toku_memory_do_stats) {
        size_t usable = my_malloc_usable_size(block);
        toku_sync_add_and_fetch(&status.malloc_count, 1);
        toku_sync_add_and_fetch(&status.requested, size);
        toku_sync_add_and_fetch(&status.used, usable);
        set_max(status.used, status.freed);
    }
    return block;
}
void * void *
toku_calloc(size_t nmemb, size_t size) { toku_calloc(size_t nmemb, size_t size) {
size_t newsize = nmemb * size; size_t newsize = nmemb * size;
...@@ -189,6 +198,25 @@ toku_realloc(void *p, size_t size) { ...@@ -189,6 +198,25 @@ toku_realloc(void *p, size_t size) {
return q; return q;
} }
// Resize P to SIZE bytes with the result aligned to ALIGNMENT, using the
// installed t_realloc_aligned hook when there is one and os_realloc_aligned()
// otherwise.  Returns NULL on failure (and bumps status.realloc_fail); on
// success the realloc statistics are updated when toku_memory_do_stats is set.
void *toku_realloc_aligned(size_t alignment, void *p, size_t size) {
    // The old block has to be measured before the reallocation call is made.
    size_t old_usable = 0;
    if (p) {
        old_usable = my_malloc_usable_size(p);
    }

    void *resized;
    if (t_realloc_aligned) {
        resized = t_realloc_aligned(alignment, p, size);
    } else {
        resized = os_realloc_aligned(alignment, p, size);
    }

    if (resized == NULL) {
        toku_sync_add_and_fetch(&status.realloc_fail, 1);
        return NULL;
    }

    if (toku_memory_do_stats) {
        size_t new_usable = my_malloc_usable_size(resized);
        toku_sync_add_and_fetch(&status.realloc_count, 1);
        toku_sync_add_and_fetch(&status.requested, size);
        toku_sync_add_and_fetch(&status.used, new_usable);
        toku_sync_add_and_fetch(&status.freed, old_usable);
        set_max(status.used, status.freed);
    }
    return resized;
}
void * void *
toku_memdup(const void *v, size_t len) { toku_memdup(const void *v, size_t len) {
void *p = toku_malloc(len); void *p = toku_malloc(len);
...@@ -232,6 +260,23 @@ toku_xmalloc(size_t size) { ...@@ -232,6 +260,23 @@ toku_xmalloc(size_t size) {
return p; return p;
} }
void* toku_xmalloc_aligned(size_t alignment, size_t size)
// Effect: Allocate SIZE bytes whose address is a multiple of ALIGNMENT, using the
//  t_xmalloc_aligned hook when one is installed and os_malloc_aligned() otherwise.
//  A failed allocation trips resource_assert instead of being reported to the caller.
// Requires: alignment is a power of two.
{
    void *p;
    if (t_xmalloc_aligned) {
        p = t_xmalloc_aligned(alignment, size);
    } else {
        p = os_malloc_aligned(alignment, size);
    }
    resource_assert(p);

    if (!toku_memory_do_stats) {
        return p;
    }
    size_t usable = my_malloc_usable_size(p);
    toku_sync_add_and_fetch(&status.malloc_count, 1);
    toku_sync_add_and_fetch(&status.requested, size);
    toku_sync_add_and_fetch(&status.used, usable);
    set_max(status.used, status.freed);
    return p;
}
void * void *
toku_xcalloc(size_t nmemb, size_t size) { toku_xcalloc(size_t nmemb, size_t size) {
size_t newsize = nmemb * size; size_t newsize = nmemb * size;
......
...@@ -8,11 +8,174 @@ ...@@ -8,11 +8,174 @@
#include <toku_portability.h> #include <toku_portability.h>
#include <stdlib.h> #include <stdlib.h>
#include <jemalloc/include/jemalloc/jemalloc.h>
#if defined(HAVE_MALLOC_H) #if defined(HAVE_MALLOC_H)
# include <malloc.h> # include <malloc.h>
#elif defined(HAVE_SYS_MALLOC_H) #elif defined(HAVE_SYS_MALLOC_H)
# include <sys/malloc.h> # include <sys/malloc.h>
#endif #endif
#include <dlfcn.h>
#include <string.h>
// #define this to use a version of os_malloc that helps to debug certain features.
// This version uses the real malloc (so that valgrind should still work) but it forces things to be slightly
// misaligned (in particular, avoiding 512-byte alignment if possible, to find situations where O_DIRECT will fail).
// #define USE_DEBUGGING_MALLOCS
#ifdef USE_DEBUGGING_MALLOCS
#include <pthread.h>
// Make things misaligned on 512-byte boundaries
static size_t malloced_now_count=0, malloced_now_size=0;
// Bookkeeping record for one live allocation made through the debugging allocator.
struct malloc_pair {
    void *returned_pointer;    // address handed to the caller; find_malloced_pair() matches on this field
    void *true_pointer;        // address obtained from the underlying allocator; os_free() passes this one to free()
    size_t requested_size = 0; // size the caller asked for; this is what os_malloc_usable_size() reports
};
static struct malloc_pair *malloced_now;
static pthread_mutex_t malloc_mutex = PTHREAD_MUTEX_INITIALIZER;
// Acquire malloc_mutex; a locking failure trips the assert.
static void malloc_lock(void) {
    const int r = pthread_mutex_lock(&malloc_mutex);
    assert(r==0);
}
// Release malloc_mutex; an unlocking failure trips the assert.
static void malloc_unlock(void) {
    const int r = pthread_mutex_unlock(&malloc_mutex);
    assert(r==0);
}
// Record a new live allocation in the malloced_now table (growing the table when
// it is full).  Takes and releases the malloc lock itself.
//   returned_pointer: the address that is handed to the caller of os_malloc*().
//   true_pointer:     the address the underlying allocator returned.
//   requested_size:   the size the caller asked for.
// Aborts if the table cannot be grown: without the record the pointer could never
// be freed through os_free().
static void push_to_malloced_memory(void *returned_pointer, void *true_pointer, size_t requested_size) {
    malloc_lock();
    if (malloced_now_count == malloced_now_size) {
        const size_t new_capacity = 2*malloced_now_size + 1;
        // Keep the old table pointer until the reallocation is known to have succeeded.
        struct malloc_pair *grown = (struct malloc_pair *)realloc(malloced_now, new_capacity * sizeof(*malloced_now));
        if (grown == NULL) {
            abort();
        }
        malloced_now = grown;
        malloced_now_size = new_capacity;
    }
    malloced_now[malloced_now_count].returned_pointer = returned_pointer;
    malloced_now[malloced_now_count].true_pointer     = true_pointer;
    malloced_now[malloced_now_count].requested_size   = requested_size;
    malloced_now_count++;
    malloc_unlock();
}
static struct malloc_pair *find_malloced_pair(const void *p)
// Requires: Lock must be held before calling.
{
for (size_t i=0; i<malloced_now_count; i++) {
if (malloced_now[i].returned_pointer==p) return &malloced_now[i];
}
return 0;
}
// Debugging malloc: allocates 16 spare bytes so that, if the underlying block
// happens to be 512-byte aligned, the pointer handed out can be shifted by 16 and
// is therefore NOT 512-byte aligned.  Every returned pointer is registered so
// os_free()/os_malloc_usable_size() can find the underlying block again.
// Returns NULL (nothing registered) when the allocation fails.
void *os_malloc(size_t size) {
    if (size > (size_t)-1 - 16) {
        // size+16 would wrap around and silently allocate a tiny block.
        errno = ENOMEM;
        return NULL;
    }
    void *raw_ptr = malloc(size+16); // allocate 16 extra bytes
    if (raw_ptr == NULL) {
        // Without this check NULL counts as "512-byte aligned" below and the
        // bogus address 16 would be returned to the caller.
        return NULL;
    }
    char *user_ptr = (char*)raw_ptr;
    if ((size_t)raw_ptr % 512 == 0) {
        user_ptr += 16;
    }
    push_to_malloced_memory(user_ptr, raw_ptr, size);
    return user_ptr;
}
void *os_malloc_aligned(size_t alignment, size_t size)
// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  Debugging twist: unless the caller explicitly asked for a multiple of 512, the
//  returned pointer is deliberately kept off 512-byte boundaries.
//  Every returned pointer is registered so that os_free(), os_malloc_usable_size()
//  and os_realloc_aligned() can find it.  (An unconditional early return used to
//  skip that registration, which made those functions assert on any pointer
//  obtained here.)
//  On failure returns nullptr with errno set to posix_memalign's error code.
// Requires: alignment is a power of two.
{
    const bool caller_wants_512 = (alignment%512==0);
    // When misaligning, reserve ALIGNMENT spare bytes so the pointer can be shifted.
    const size_t alloc_size = caller_wants_512 ? size : size+alignment;

    void *raw_ptr;
    int r = posix_memalign(&raw_ptr, alignment, alloc_size);
    if (r != 0) {
        errno = r;
        return nullptr;
    }

    char *user_ptr = (char*)raw_ptr;
    if (!caller_wants_512 && (size_t)raw_ptr % 512 == 0) {
        // Make sure it isn't 512-byte aligned; adding ALIGNMENT keeps the requested alignment.
        user_ptr += alignment;
    }
    push_to_malloced_memory(user_ptr, raw_ptr, size);
    return user_ptr;
}
// Smaller of two sizes.
static size_t min(size_t a, size_t b) {
    return (b <= a) ? b : a;
}
// Debugging realloc: picks an alignment from the requested size (1, 4, 8 or 16
// for sizes below 4, 8, 16 and above) and defers to os_realloc_aligned().
// The alignment is then raised to at least sizeof(void*), because
// posix_memalign() -- which os_malloc_aligned() is built on -- rejects smaller
// alignments with EINVAL, so small reallocations could never succeed.
void *os_realloc(void *p, size_t size) {
    size_t alignment;
    if (size<4) {
        alignment = 1;
    } else if (size<8) {
        alignment = 4;
    } else if (size<16) {
        alignment = 8;
    } else {
        alignment = 16;
    }
    if (alignment < sizeof(void*)) {
        alignment = sizeof(void*);
    }
    return os_realloc_aligned(alignment, p, size);
}
void * os_realloc_aligned(size_t alignment, void *p, size_t size)
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  The debugging version always allocates a fresh block, copies
//  min(size, originally requested size) bytes across and frees the old block.
//  If the fresh block cannot be allocated, returns NULL and leaves P untouched.
// Requires: alignment is a power of two.
{
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    }

    void *result = os_malloc_aligned(alignment, size);
    if (result == NULL) {
        // Do not copy into NULL; the caller's block is still valid.
        return NULL;
    }

    malloc_lock();
    struct malloc_pair *mp = find_malloced_pair(p);
    assert(mp);
    // now copy all the good stuff from p to result
    memcpy(result, p, min(size, mp->requested_size));
    malloc_unlock();

    os_free(p);
    return result;
}
// Debugging free: looks up the record for P, frees the underlying block and
// removes the record by moving the last table entry into its slot.
// A NULL pointer is accepted and ignored, matching the non-debugging os_free()
// (which just calls free()).  Any other unregistered pointer trips the assert.
void os_free(void* p) {
    if (p == NULL) {
        return;
    }
    malloc_lock();
    struct malloc_pair *mp = find_malloced_pair(p);
    assert(mp);
    free(mp->true_pointer);
    *mp = malloced_now[--malloced_now_count];
    malloc_unlock();
}
// Debugging usable-size query: reports the size that was originally requested
// for P (taken from its registration record).
// Returns 0 for a NULL pointer, matching the non-debugging version; any other
// unregistered pointer trips the assert.
size_t os_malloc_usable_size(const void *p) {
    if (p == NULL) {
        return 0;
    }
    malloc_lock();
    struct malloc_pair *mp = find_malloced_pair(p);
    assert(mp);
    size_t size = mp->requested_size;
    malloc_unlock();
    return size;
}
#else
void * void *
os_malloc(size_t size) os_malloc(size_t size)
...@@ -20,14 +183,81 @@ os_malloc(size_t size) ...@@ -20,14 +183,81 @@ os_malloc(size_t size)
return malloc(size); return malloc(size);
} }
void *os_malloc_aligned(size_t alignment, size_t size)
// Effect: Allocate SIZE bytes via posix_memalign so that the returned address is a multiple of ALIGNMENT.
//  On failure returns nullptr and stores posix_memalign's error code in errno.
// Requires: alignment is a power of two.
{
    void *block;
    const int err = posix_memalign(&block, alignment, size);
    if (err == 0) {
        return block;
    }
    errno = err;
    return nullptr;
}
void * void *
os_realloc(void *p, size_t size) os_realloc(void *p, size_t size)
{ {
return realloc(p, size); return realloc(p, size);
} }
void * os_realloc_aligned(size_t alignment, void *p, size_t size)
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  Strategy: let realloc() do the work; if its result happens not to be aligned,
//  copy the data into a freshly allocated aligned block and free realloc's block.
//  Returns NULL on failure.  If realloc() itself fails, P is untouched.  If only
//  the aligned fallback allocation fails, P is still valid when realloc() resized
//  it in place; when realloc() had already moved the data, P is gone and the
//  moved block is released here (it cannot be returned because it is not aligned).
// Requires: alignment is a power of two.
{
#if 1
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    } else {
        // Remember the old address as an integer: after realloc() the pointer value itself may be stale.
        const long long old_addr = (long long)p;
        void *newp = realloc(p, size);
        if (newp != NULL && 0!=((long long)newp%alignment)) {
            // it's not aligned, so align it ourselves.
            void *newp2 = os_malloc_aligned(alignment, size);
            if (newp2 == NULL) {
                // Never memcpy into NULL.  Keep the caller's block alive if it was
                // resized in place; otherwise release the moved block to avoid a leak.
                if ((long long)newp != old_addr) {
                    const int saved_errno = errno;
                    free(newp);
                    errno = saved_errno;
                }
                return NULL;
            }
            memcpy(newp2, newp, size);
            free(newp);
            newp = newp2;
        }
        return newp;
    }
#else
    // THIS STUFF SEEMS TO FAIL VALGRIND
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    } else {
        size_t ignore;
        int r = rallocm(&p,        // returned pointer
                        &ignore,   // actual size of returned object.
                        size,      // the size we want
                        0,         // extra bytes to "try" to allocate at the end
                        ALLOCM_ALIGN(alignment));
        if (r!=0) return NULL;
        else return p;
    }
#endif
}
void void
os_free(void* p) os_free(void* p)
{ {
free(p); free(p);
} }
typedef size_t (*malloc_usable_size_fun_t)(const void *);
// Resolved on first use by os_malloc_usable_size(); NULL until then.
static malloc_usable_size_fun_t malloc_usable_size_f = NULL;

// Report the allocator's usable size for P, or 0 when P is NULL.
// The underlying function is looked up with dlsym() the first time it is needed:
// "malloc_usable_size" first, then "malloc_size" (darwin).  Aborts if neither
// symbol can be found.
size_t os_malloc_usable_size(const void *p) {
    if (p==NULL) {
        return 0;
    }
    if (malloc_usable_size_f == NULL) {
        static const char *const candidate_symbols[] = { "malloc_usable_size", "malloc_size" };
        const size_t n_candidates = sizeof(candidate_symbols) / sizeof(candidate_symbols[0]);
        for (size_t i = 0; i < n_candidates && malloc_usable_size_f == NULL; i++) {
            malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, candidate_symbols[i]);
        }
        if (malloc_usable_size_f == NULL) {
            abort(); // couldn't find a malloc size function
        }
    }
    return malloc_usable_size_f(p);
}
#endif
...@@ -50,9 +50,8 @@ int test_main(int UU(argc), char *const argv[] UU()) { ...@@ -50,9 +50,8 @@ int test_main(int UU(argc), char *const argv[] UU()) {
} }
{ {
struct unpackedsevenbytestruct *usevenbytestructs; struct unpackedsevenbytestruct *MALLOC_N_ALIGNED(cachelinesize, 10, usevenbytestructs);
int r = posix_memalign((void **) &usevenbytestructs, cachelinesize, sizeof(unpackedsevenbytestruct) * 10); if (usevenbytestructs == NULL) {
if (r) {
// this test is supposed to crash, so exiting cleanly is a failure // this test is supposed to crash, so exiting cleanly is a failure
perror("posix_memalign"); perror("posix_memalign");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
...@@ -65,8 +64,9 @@ int test_main(int UU(argc), char *const argv[] UU()) { ...@@ -65,8 +64,9 @@ int test_main(int UU(argc), char *const argv[] UU()) {
toku_free(usevenbytestructs); toku_free(usevenbytestructs);
} }
int r = posix_memalign((void **) &psevenbytestructs, cachelinesize, sizeof(packedsevenbytestruct) * 10);
if (r) { MALLOC_N_ALIGNED(cachelinesize, 10, psevenbytestructs);
if (psevenbytestructs == NULL) {
// this test is supposed to crash, so exiting cleanly is a failure // this test is supposed to crash, so exiting cleanly is a failure
perror("posix_memalign"); perror("posix_memalign");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <test.h> #include <test.h>
#include <fcntl.h> #include <fcntl.h>
#include <toku_assert.h> #include <toku_assert.h>
#include <memory.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
...@@ -28,8 +29,9 @@ int test_main(int argc, char *const argv[]) { ...@@ -28,8 +29,9 @@ int test_main(int argc, char *const argv[]) {
unlink(fname); unlink(fname);
int fd = open(fname, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); int fd = open(fname, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
assert(fd>=0); assert(fd>=0);
char buf[] = "hello"; char *XMALLOC_N_ALIGNED(512, 512, buf);
int64_t offset = (1LL<<32) + 100; strcpy(buf, "hello");
int64_t offset = (1LL<<32) + 512;
toku_os_full_pwrite(fd, buf, sizeof buf, offset); toku_os_full_pwrite(fd, buf, sizeof buf, offset);
char newbuf[sizeof buf]; char newbuf[sizeof buf];
r = pread(fd, newbuf, sizeof newbuf, 100); r = pread(fd, newbuf, sizeof newbuf, 100);
...@@ -42,6 +44,7 @@ int test_main(int argc, char *const argv[]) { ...@@ -42,6 +44,7 @@ int test_main(int argc, char *const argv[]) {
r = toku_os_get_file_size(fd, &fsize); r = toku_os_get_file_size(fd, &fsize);
assert(r == 0); assert(r == 0);
assert(fsize > 100 + (signed)sizeof(buf)); assert(fsize > 100 + (signed)sizeof(buf));
toku_free(buf);
r = close(fd); r = close(fd);
assert(r==0); assert(r==0);
return 0; return 0;
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
db_env_create; db_env_create;
db_strerror; db_strerror;
db_version; db_version;
db_env_set_direct_io;
db_env_set_func_fsync; db_env_set_func_fsync;
db_env_set_func_malloc; db_env_set_func_malloc;
db_env_set_func_realloc; db_env_set_func_realloc;
......
...@@ -27,6 +27,10 @@ void * checkpoint_callback2_extra = NULL; ...@@ -27,6 +27,10 @@ void * checkpoint_callback2_extra = NULL;
bool engine_status_enable = true; // if false, suppress engine status output on failed assert, for test programs only bool engine_status_enable = true; // if false, suppress engine status output on failed assert, for test programs only
// Exported setter: passes the direct_io_on flag straight through to toku_ft_set_direct_io().
// NOTE(review): presumably this is what makes the ft layer open its files for direct I/O -- confirm in toku_ft_set_direct_io().
void db_env_set_direct_io (bool direct_io_on) {
    toku_ft_set_direct_io(direct_io_on);
}
void db_env_set_func_fsync (int (*fsync_function)(int)) { void db_env_set_func_fsync (int (*fsync_function)(int)) {
toku_set_func_fsync(fsync_function); toku_set_func_fsync(fsync_function);
} }
......
...@@ -44,6 +44,8 @@ ...@@ -44,6 +44,8 @@
#cmakedefine HAVE_M_MMAP_THRESHOLD 1 #cmakedefine HAVE_M_MMAP_THRESHOLD 1
#cmakedefine HAVE_CLOCK_REALTIME 1 #cmakedefine HAVE_CLOCK_REALTIME 1
#cmakedefine HAVE_O_DIRECT 1
#cmakedefine HAVE_F_NOCACHE 1
#cmakedefine HAVE_MALLOC_SIZE 1 #cmakedefine HAVE_MALLOC_SIZE 1
#cmakedefine HAVE_MALLOC_USABLE_SIZE 1 #cmakedefine HAVE_MALLOC_USABLE_SIZE 1
......
...@@ -18,17 +18,28 @@ void toku_memory_shutdown(void) __attribute__((destructor)); ...@@ -18,17 +18,28 @@ void toku_memory_shutdown(void) __attribute__((destructor));
/* Generally: errno is set to 0 or a value to indicate problems. */ /* Generally: errno is set to 0 or a value to indicate problems. */
/* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */ // Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc()
// That way the tests can, e.g., replace the malloc function using toku_set_func_malloc().
void *toku_calloc(size_t nmemb, size_t size) __attribute__((__visibility__("default"))); void *toku_calloc(size_t nmemb, size_t size) __attribute__((__visibility__("default")));
void *toku_xcalloc(size_t nmemb, size_t size) __attribute__((__visibility__("default"))); void *toku_xcalloc(size_t nmemb, size_t size) __attribute__((__visibility__("default")));
void *toku_malloc(size_t size) __attribute__((__visibility__("default"))); void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
void *toku_malloc_aligned(size_t alignment, size_t size) __attribute__((__visibility__("default")));
// xmalloc aborts instead of return NULL if we run out of memory // xmalloc aborts instead of return NULL if we run out of memory
void *toku_xmalloc(size_t size); void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
void *toku_xrealloc(void*, size_t size) __attribute__((__visibility__("default"))); void *toku_xrealloc(void*, size_t size) __attribute__((__visibility__("default")));
void *toku_xmalloc_aligned(size_t alignment, size_t size) __attribute__((__visibility__("default")));
// Effect: Perform an os_malloc_aligned(alignment, size), which has the additional property that the returned pointer is a multiple of ALIGNMENT.
// Fail with a resource_assert if the allocation fails (don't return an error code).
// If the alloc_aligned function has been set then call it instead.
// Requires: alignment is a power of two.
void toku_free(void*) __attribute__((__visibility__("default"))); void toku_free(void*) __attribute__((__visibility__("default")));
void *toku_realloc(void *, size_t size) __attribute__((__visibility__("default"))); void *toku_realloc(void *, size_t size) __attribute__((__visibility__("default")));
void *toku_realloc_aligned(size_t alignment, void *p, size_t size) __attribute__((__visibility__("default")));
// Effect: Perform an os_realloc_aligned(alignment, p, size), which has the additional property that the returned pointer is a multiple of ALIGNMENT.
// If the realloc_aligned function has been set then call it instead.
// Requires: alignment is a power of two.
size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default"))); size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")));
...@@ -50,6 +61,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default") ...@@ -50,6 +61,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
* to make an array of 5 integers. * to make an array of 5 integers.
*/ */
#define MALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_malloc((n)*sizeof(*v))) #define MALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_malloc((n)*sizeof(*v)))
#define MALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n)*sizeof(*v)))
//CALLOC_N is like calloc with auto-figuring out size of members //CALLOC_N is like calloc with auto-figuring out size of members
#define CALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v))) #define CALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
...@@ -57,6 +70,7 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default") ...@@ -57,6 +70,7 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
#define CALLOC(v) CALLOC_N(1,v) #define CALLOC(v) CALLOC_N(1,v)
#define REALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_realloc(v, (n)*sizeof(*v))) #define REALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_realloc(v, (n)*sizeof(*v)))
#define REALLOC_N_ALIGNED(align, n,v) CAST_FROM_VOIDP(v, toku_realloc_aligned((align), v, (n)*sizeof(*v)))
// XMALLOC macros are like MALLOC except they abort if the operation fails // XMALLOC macros are like MALLOC except they abort if the operation fails
#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v))) #define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
...@@ -66,6 +80,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default") ...@@ -66,6 +80,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
#define XREALLOC(v,s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s)) #define XREALLOC(v,s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
#define XREALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n)*sizeof(*v))) #define XREALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n)*sizeof(*v)))
// XMALLOC_N_ALIGNED requests (n)*sizeof(*v) bytes from toku_xmalloc_aligned, with ALIGN passed as the alignment argument.
// NOTE(review): presumably aborts on failure like the other XMALLOC macros -- confirm against toku_xmalloc_aligned.
#define XMALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n)*sizeof(*v)))
#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src))) #define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len)) #define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
...@@ -94,6 +110,8 @@ void toku_do_memory_check(void); ...@@ -94,6 +110,8 @@ void toku_do_memory_check(void);
typedef void *(*malloc_fun_t)(size_t); typedef void *(*malloc_fun_t)(size_t);
typedef void (*free_fun_t)(void*); typedef void (*free_fun_t)(void*);
typedef void *(*realloc_fun_t)(void*,size_t); typedef void *(*realloc_fun_t)(void*,size_t);
// Function-pointer types for aligned allocators.
//  malloc_aligned_fun_t takes (alignment, size) and returns a pointer.
//  realloc_aligned_fun_t takes (alignment, pointer, size) and returns a pointer.
typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void */*pointer*/, size_t /*size*/);
void toku_set_func_malloc(malloc_fun_t f); void toku_set_func_malloc(malloc_fun_t f);
void toku_set_func_xmalloc_only(malloc_fun_t f); void toku_set_func_xmalloc_only(malloc_fun_t f);
......
...@@ -247,8 +247,26 @@ extern void *realloc(void*, size_t) __THROW __attribute__((__deprecat ...@@ -247,8 +247,26 @@ extern void *realloc(void*, size_t) __THROW __attribute__((__deprecat
#endif #endif
void *os_malloc(size_t) __attribute__((__visibility__("default"))); void *os_malloc(size_t) __attribute__((__visibility__("default")));
// Effect: See man malloc(3)
void *os_malloc_aligned(size_t /*alignment*/, size_t /*size*/) __attribute__((__visibility__("default")));
// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT
//  (the first argument).
// Requires: alignment is a power of two.
void *os_realloc(void*,size_t) __attribute__((__visibility__("default"))); void *os_realloc(void*,size_t) __attribute__((__visibility__("default")));
// Effect: See man realloc(3)
void *os_realloc_aligned(size_t/*alignment*/, void*,size_t) __attribute__((__visibility__("default")));
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  The arguments are, in order: the alignment, the pointer P to resize, and the new size.
// Requires: alignment is a power of two.
void os_free(void*) __attribute__((__visibility__("default"))); void os_free(void*) __attribute__((__visibility__("default")));
// Effect: See man free(3)
size_t os_malloc_usable_size(const void *p) __attribute__((__visibility__("default")));
// Effect: Return an estimate of the usable size inside a pointer.  If this function is not defined, memory.cc will
//  look for the jemalloc, libc, or darwin versions of the function for computing memory footprint.
// full_pwrite and full_write performs a pwrite, and checks errors. It doesn't return unless all the data was written. */ // full_pwrite and full_write performs a pwrite, and checks errors. It doesn't return unless all the data was written. */
void toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) __attribute__((__visibility__("default"))); void toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) __attribute__((__visibility__("default")));
...@@ -262,6 +280,7 @@ int toku_os_write (int fd, const void *buf, size_t len) __attribute__((__visibil ...@@ -262,6 +280,7 @@ int toku_os_write (int fd, const void *buf, size_t len) __attribute__((__visibil
FILE * toku_os_fdopen(int fildes, const char *mode); FILE * toku_os_fdopen(int fildes, const char *mode);
FILE * toku_os_fopen(const char *filename, const char *mode); FILE * toku_os_fopen(const char *filename, const char *mode);
int toku_os_open(const char *path, int oflag, int mode); int toku_os_open(const char *path, int oflag, int mode);
// NOTE(review): takes the same (path, oflag, mode) arguments as toku_os_open; presumably it opens the file for
// direct I/O.  The implementation is not visible from this header -- confirm before relying on this.
int toku_os_open_direct(const char *path, int oflag, int mode);
int toku_os_close(int fd); int toku_os_close(int fd);
int toku_os_fclose(FILE * stream); int toku_os_fclose(FILE * stream);
ssize_t toku_os_read(int fd, void *buf, size_t count); ssize_t toku_os_read(int fd, void *buf, size_t count);
...@@ -293,4 +312,17 @@ void toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)); ...@@ -293,4 +312,17 @@ void toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t));
int toku_portability_init(void); int toku_portability_init(void);
void toku_portability_destroy(void); void toku_portability_destroy(void);
static inline uint64_t roundup_to_multiple(uint64_t alignment, uint64_t v)
// Effect: Return X, where X is the smallest multiple of ALIGNMENT such that X>=V.
// Requires: ALIGNMENT is a nonzero power of two, and the rounded-up value fits in 64 bits.
{
    // Zero satisfies the bit test below (0 & (0-1) == 0), so it must be rejected explicitly;
    // otherwise the modulo check further down would divide by zero.
    assert(alignment!=0);
    assert(0==(alignment&(alignment-1)));  // alignment must be a power of two
    uint64_t result = (v+alignment-1)&~(alignment-1);
    assert(result>=v);                     // The result is >=V (this also catches wraparound of v+alignment-1).
    assert(result%alignment==0);           // The result is a multiple of alignment.
    // The result is the smallest such multiple.  Compare the difference rather than
    // result<v+alignment: v+alignment can wrap for large V even when the result is valid.
    assert(result-v<alignment);
    return result;
}
#endif /* TOKU_PORTABILITY_H */ #endif /* TOKU_PORTABILITY_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment