Commit cd28a5bd authored by Vladislav Vaintroub

MDEV-4338 - Support FusionIO/directFS atomic writes

parent 83e983ad
......@@ -4023,6 +4023,24 @@ fil_extend_space_to_desired_size(
start_page_no = space->size;
file_start_page_no = space->size - node->size;
#ifdef HAVE_POSIX_FALLOCATE
if (srv_use_posix_fallocate) {
offset_high = size_after_extend * page_size / (4ULL*1024*1024*1024);
offset_low = size_after_extend * page_size % (4ULL*1024*1024*1024);
mutex_exit(&fil_system->mutex);
success = os_file_set_size(node->name, node->handle,
offset_low, offset_high);
mutex_enter(&fil_system->mutex);
if (success) {
node->size += (size_after_extend - start_page_no);
space->size += (size_after_extend - start_page_no);
os_has_said_disk_full = FALSE;
}
goto complete_io;
}
#endif
/* Extend at most 64 pages at a time */
buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
buf2 = mem_alloc(buf_size + page_size);
......@@ -4075,6 +4093,10 @@ fil_extend_space_to_desired_size(
mem_free(buf2);
#ifdef HAVE_POSIX_FALLOCATE
complete_io:
#endif
fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
*actual_size = space->size;
......
......@@ -163,6 +163,8 @@ static my_bool innobase_file_format_check = TRUE;
static my_bool innobase_log_archive = FALSE;
static char* innobase_log_arch_dir = NULL;
#endif /* UNIV_LOG_ARCHIVE */
/* Storage for the use_atomic_writes system variable (initially FALSE).
innobase_init() copies the value into srv_use_atomic_writes. */
static my_bool innobase_use_atomic_writes = FALSE;
/* Storage for the use_fallocate system variable (initially TRUE).
When HAVE_POSIX_FALLOCATE is defined, innobase_init() copies the value
into srv_use_posix_fallocate. */
static my_bool innobase_use_fallocate = TRUE;
static my_bool innobase_use_doublewrite = TRUE;
static my_bool innobase_use_checksums = TRUE;
static my_bool innobase_locks_unsafe_for_binlog = FALSE;
......@@ -2474,6 +2476,38 @@ innobase_init(
innobase_commit_concurrency_init_default();
#ifdef HAVE_POSIX_FALLOCATE
srv_use_posix_fallocate = (ibool) innobase_use_fallocate;
#endif
srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
if (innobase_use_atomic_writes) {
fprintf(stderr, "InnoDB: using atomic writes.\n");
/* Force doublewrite buffer off, atomic writes replace it. */
if (srv_use_doublewrite_buf) {
fprintf(stderr, "InnoDB: Switching off doublewrite buffer "
"because of atomic writes.\n");
innobase_use_doublewrite = srv_use_doublewrite_buf = FALSE;
}
/* Force O_DIRECT on Unixes (on Windows writes are always unbuffered)*/
#ifndef _WIN32
if(!innobase_file_flush_method ||
!strstr(innobase_file_flush_method, "O_DIRECT")) {
innobase_file_flush_method =
srv_file_flush_method_str = (char*)"O_DIRECT";
fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
}
#endif
#ifdef HAVE_POSIX_FALLOCATE
/* Due to a bug in directFS, using atomics needs
* posix_fallocate to extend the file
* pwrite() past end of the file won't work
*/
srv_use_posix_fallocate = TRUE;
#endif
}
#ifdef HAVE_PSI_INTERFACE
/* Register keys with MySQL performance schema */
if (PSI_server) {
......@@ -11374,6 +11408,20 @@ static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
"Disable with --skip-innodb-doublewrite.",
NULL, NULL, TRUE);
/* Read-only boolean system variable use_atomic_writes, stored in
innobase_use_atomic_writes; the last macro argument (FALSE) is its default.
The help text is built from adjacent string literals that the compiler
concatenates, so every fragment that ends a sentence must carry its own
trailing space. */
static MYSQL_SYSVAR_BOOL(use_atomic_writes, innobase_use_atomic_writes,
  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
  "Prevent partial page writes, via atomic writes. "
  "The option is used to prevent partial writes in case of a crash/poweroff, "
  "as faster alternative to doublewrite buffer. "
  "Currently this option works only "
  "on Linux only with FusionIO device, and directFS filesystem.",
  NULL, NULL, FALSE);
/* Read-only boolean system variable use_fallocate, stored in
innobase_use_fallocate; the last macro argument (TRUE) is its default. */
static MYSQL_SYSVAR_BOOL(
  use_fallocate,
  innobase_use_fallocate,
  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
  "Preallocate files fast, using operating system functionality. "
  "On POSIX systems, posix_fallocate system call is used.",
  NULL,
  NULL,
  TRUE);
static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
PLUGIN_VAR_RQCMDARG,
"Number of IOPs the server can do. Tunes the background IO rate",
......@@ -11733,6 +11781,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(data_file_path),
MYSQL_SYSVAR(data_home_dir),
MYSQL_SYSVAR(doublewrite),
MYSQL_SYSVAR(use_atomic_writes),
MYSQL_SYSVAR(use_fallocate),
MYSQL_SYSVAR(fast_shutdown),
MYSQL_SYSVAR(file_io_threads),
MYSQL_SYSVAR(read_io_threads),
......
......@@ -212,6 +212,11 @@ extern ibool srv_innodb_status;
extern unsigned long long srv_stats_sample_pages;
extern ibool srv_use_doublewrite_buf;
/* When set, os_file_create_func() calls os_file_set_atomic_writes() on
every file opened with type OS_DATA_FILE and fails the open if that call
fails. */
extern ibool srv_use_atomic_writes;
#ifdef HAVE_POSIX_FALLOCATE
/* When set, os_file_set_size() sizes the file with posix_fallocate(), and
fil_extend_space_to_desired_size() extends a tablespace file through
os_file_set_size() instead of writing pages in a loop. */
extern ibool srv_use_posix_fallocate;
#endif
extern ibool srv_use_checksums;
extern ulong srv_max_buf_pool_modified_pct;
......
......@@ -1361,6 +1361,43 @@ os_file_set_nocache(
#endif
}
#ifdef __linux__
#include <sys/ioctl.h>
#ifndef DFS_IOCTL_ATOMIC_WRITE_SET
#define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
#endif
/****************************************************************//**
Issues the DFS_IOCTL_ATOMIC_WRITE_SET ioctl on an open file. On failure
a message containing the file name and errno is written to stderr.
@param file	handle of the open file the ioctl is issued on
@param name	file name; used only in the error message
@return the ioctl() result: 0 on success, non-zero on failure */
static int os_file_set_atomic_writes(os_file_t file, const char *name)
{
	/* Argument handed to the ioctl; presumably 1 means "enable" --
	TODO confirm against the directFS ioctl definition. */
	int	atomic_option = 1;
	int	ret = ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option);

	if (ret) {
		/* errno is evaluated as an argument before fprintf() runs,
		so it still holds the value left by the failed ioctl(). */
		fprintf(stderr,
			"InnoDB : can't use atomic write on %s, errno %d\n",
			name, errno);
	}

	return(ret);
}
#else
/****************************************************************//**
Variant compiled when __linux__ is not defined: atomic writes are not
implemented here, so the call always fails. It reports the problem on
stderr and sets the thread's error indicator (SetLastError() when _WIN32
is defined, errno otherwise).
@param file	handle of the open file; not used by this variant
@param name	file name; used only in the error message
@return always -1 */
static int os_file_set_atomic_writes(os_file_t file, const char *name)
{
	(void) file;	/* not needed here; avoids an unused-parameter warning */

	/* The two literals are concatenated by the compiler; the first one
	ends with a space so that the sentences do not run together. */
	fprintf(stderr,
		"InnoDB : can't use atomic writes on %s - not implemented on this platform. "
		"innodb_use_atomic_writes needs to be 0.\n",
		name);
#ifdef _WIN32
	SetLastError(ERROR_INVALID_FUNCTION);
#else
	errno = EINVAL;
#endif
	return -1;
}
#endif
/****************************************************************//**
NOTE! Use the corresponding macro os_file_create(), not directly
this function!
......@@ -1512,6 +1549,13 @@ os_file_create_func(
*success = TRUE;
}
if (srv_use_atomic_writes && type == OS_DATA_FILE &&
os_file_set_atomic_writes(file, name)) {
CloseHandle(file);
*success = FALSE;
file = INVALID_HANDLE_VALUE;
}
return(file);
#else /* __WIN__ */
os_file_t file;
......@@ -1626,6 +1670,12 @@ os_file_create_func(
file = -1;
}
#endif /* USE_FILE_LOCK */
if (srv_use_atomic_writes && type == OS_DATA_FILE
&& os_file_set_atomic_writes(file, name)) {
close(file);
*success = FALSE;
file = -1;
}
return(file);
#endif /* __WIN__ */
......@@ -1970,6 +2020,28 @@ os_file_set_size(
current_size = 0;
desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
#ifdef HAVE_POSIX_FALLOCATE
if (srv_use_posix_fallocate) {
if (posix_fallocate(file, current_size, desired_size) == -1) {
fprintf(stderr,
"InnoDB: Error: preallocating data for"
" file %s failed at\n"
"InnoDB: offset 0 size %lld %lld. Operating system"
" error number %llu.\n"
"InnoDB: Check that the disk is not full"
" or a disk quota exceeded.\n"
"InnoDB: Some operating system error numbers"
" are described at\n"
"InnoDB: "
REFMAN "operating-system-error-codes.html\n",
name, (long long)size_high, (long long)size, errno);
return (FALSE);
}
return (TRUE);
}
#endif
/* Write up to 1 megabyte at a time. */
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
* UNIV_PAGE_SIZE;
......
......@@ -401,6 +401,10 @@ this many index pages */
UNIV_INTERN unsigned long long srv_stats_sample_pages = 8;
UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE;
/* Off by default. innobase_init() sets it from the use_atomic_writes
system variable; os_file_create_func() checks it before calling
os_file_set_atomic_writes() on files of type OS_DATA_FILE. */
UNIV_INTERN ibool srv_use_atomic_writes = FALSE;
#ifdef HAVE_POSIX_FALLOCATE
/* On by default. innobase_init() sets it from the use_fallocate system
variable and forces it to TRUE when atomic writes are requested. When set,
os_file_set_size() calls posix_fallocate() instead of writing the file out.
NOTE(review): os_file_set_size() tests the posix_fallocate() result against
-1 and prints errno; POSIX documents posix_fallocate() as returning an
error number on failure without setting errno -- confirm that failures are
actually detected there. */
UNIV_INTERN ibool srv_use_posix_fallocate = TRUE;
#endif
UNIV_INTERN ibool srv_use_checksums = TRUE;
UNIV_INTERN ulong srv_replication_delay = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment