Commit 24648768 authored by Marko Mäkelä

MDEV-30136: Deprecate innodb_flush_method

We introduce the following settable Boolean global variables:

innodb_log_file_write_through: Whether writes to ib_logfile0 are
write-through (disabling any caching, as in O_SYNC or O_DSYNC).

innodb_data_file_write_through: Whether writes to any InnoDB data files
(including the temporary tablespace) are write-through.

innodb_data_file_buffering: Whether the file system cache is enabled
for InnoDB data files.

All these parameters are OFF by default: the file system cache
will be disabled, but any hardware caching remains enabled, so
explicit calls to fsync(), fdatasync() or similar functions are still needed.

On systems that support FUA it may make sense to enable write-through,
to avoid extra system calls.

If the deprecated read-only start-up parameter innodb_flush_method is
set to one of the following values, then the values of the 4 Boolean
flags (the above 3 plus innodb_log_file_buffering) will be set as follows:

O_DSYNC:
innodb_log_file_write_through=ON, innodb_data_file_write_through=ON,
innodb_data_file_buffering=OFF, and
(if supported) innodb_log_file_buffering=OFF.

fsync, littlesync, nosync, or (Microsoft Windows specific) normal:
innodb_log_file_write_through=OFF, innodb_data_file_write_through=OFF,
and innodb_data_file_buffering=ON.

Note: fsync() or fdatasync() will only be disabled if the separate
parameter debug_no_sync (in the code, my_disable_sync) is set.

In mariadb-backup, the parameter innodb_flush_method will be ignored.

The Boolean parameters can be modified by SET GLOBAL while the
server is running. This will require reopening the ib_logfile0
or all currently open InnoDB data files.

We will open files straight in O_DSYNC or O_SYNC mode when applicable.
We will try to open data files straight in O_DIRECT mode when the
page size is at least 4096 bytes. For atomically creating data files,
we will invoke os_file_set_nocache() to enable O_DIRECT afterwards,
because O_DIRECT is not supported on some file systems. We will also
continue to invoke os_file_set_nocache() on ib_logfile0 when
innodb_log_file_buffering=OFF can be fulfilled.

For reopening the ib_logfile0, we use the same logic that was developed
for online log resizing and reused for updates of
innodb_log_file_buffering.

Reopening all data files is implemented in the new function
fil_space_t::reopen_all().

Reviewed by: Vladislav Vaintroub
Tested by: Matthias Leich
parent e581396b
...@@ -199,12 +199,6 @@ xb_fil_cur_open( ...@@ -199,12 +199,6 @@ xb_fil_cur_open(
return(XB_FIL_CUR_SKIP); return(XB_FIL_CUR_SKIP);
} }
if (srv_file_flush_method == SRV_O_DIRECT
|| srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) {
os_file_set_nocache(cursor->file, node->name, "OPEN");
}
posix_fadvise(cursor->file, 0, 0, POSIX_FADV_SEQUENTIAL); posix_fadvise(cursor->file, 0, 0, POSIX_FADV_SEQUENTIAL);
cursor->page_size = node->space->physical_size(); cursor->page_size = node->space->physical_size();
......
...@@ -311,6 +311,8 @@ extern const char *innodb_checksum_algorithm_names[]; ...@@ -311,6 +311,8 @@ extern const char *innodb_checksum_algorithm_names[];
extern TYPELIB innodb_checksum_algorithm_typelib; extern TYPELIB innodb_checksum_algorithm_typelib;
extern const char *innodb_flush_method_names[]; extern const char *innodb_flush_method_names[];
extern TYPELIB innodb_flush_method_typelib; extern TYPELIB innodb_flush_method_typelib;
/** Ignored option */
static ulong innodb_flush_method;
static const char *binlog_info_values[] = {"off", "lockless", "on", "auto", static const char *binlog_info_values[] = {"off", "lockless", "on", "auto",
NullS}; NullS};
...@@ -1032,6 +1034,8 @@ enum options_xtrabackup ...@@ -1032,6 +1034,8 @@ enum options_xtrabackup
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING, OPT_INNODB_LOG_FILE_BUFFERING,
#endif #endif
OPT_INNODB_DATA_FILE_BUFFERING,
OPT_INNODB_DATA_FILE_WRITE_THROUGH,
OPT_INNODB_LOG_FILE_SIZE, OPT_INNODB_LOG_FILE_SIZE,
OPT_INNODB_OPEN_FILES, OPT_INNODB_OPEN_FILES,
OPT_XTRA_DEBUG_SYNC, OPT_XTRA_DEBUG_SYNC,
...@@ -1583,10 +1587,10 @@ struct my_option xb_server_options[] = ...@@ -1583,10 +1587,10 @@ struct my_option xb_server_options[] =
FALSE, 0, 0, 0, 0, 0}, FALSE, 0, 0, 0, 0, 0},
{"innodb_flush_method", OPT_INNODB_FLUSH_METHOD, {"innodb_flush_method", OPT_INNODB_FLUSH_METHOD,
"With which method to flush data.", "Ignored parameter with no effect",
&srv_file_flush_method, &srv_file_flush_method, &innodb_flush_method, &innodb_flush_method,
&innodb_flush_method_typelib, GET_ENUM, REQUIRED_ARG, &innodb_flush_method_typelib, GET_ENUM, REQUIRED_ARG,
IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), 0, 0, 0, 0, 0}, 4/* O_DIRECT */, 0, 0, 0, 0, 0},
{"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE, {"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE,
"Redo log buffer size in bytes.", "Redo log buffer size in bytes.",
...@@ -1600,6 +1604,16 @@ struct my_option xb_server_options[] = ...@@ -1600,6 +1604,16 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.log_buffered, 0, GET_BOOL, NO_ARG, (G_PTR*) &log_sys.log_buffered, 0, GET_BOOL, NO_ARG,
TRUE, 0, 0, 0, 0, 0}, TRUE, 0, 0, 0, 0, 0},
#endif #endif
{"innodb_data_file_buffering", OPT_INNODB_DATA_FILE_BUFFERING,
"Whether the file system cache for data files is enabled during --backup",
(G_PTR*) &fil_system.buffered,
(G_PTR*) &fil_system.buffered, 0, GET_BOOL, NO_ARG,
FALSE, 0, 0, 0, 0, 0},
{"innodb_data_file_write_through", OPT_INNODB_DATA_FILE_WRITE_THROUGH,
"Whether each write to data files writes through",
(G_PTR*) &fil_system.write_through,
(G_PTR*) &fil_system.write_through, 0, GET_BOOL, NO_ARG,
FALSE, 0, 0, 0, 0, 0},
{"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE, {"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE,
"Ignored for mysqld option compatibility", "Ignored for mysqld option compatibility",
(G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0, (G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0,
...@@ -1917,12 +1931,6 @@ xb_get_one_option(const struct my_option *opt, ...@@ -1917,12 +1931,6 @@ xb_get_one_option(const struct my_option *opt,
ADD_PRINT_PARAM_OPT(srv_log_group_home_dir); ADD_PRINT_PARAM_OPT(srv_log_group_home_dir);
break; break;
case OPT_INNODB_FLUSH_METHOD:
ut_a(srv_file_flush_method
<= IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT_NO_FSYNC));
ADD_PRINT_PARAM_OPT(innodb_flush_method_names[srv_file_flush_method]);
break;
case OPT_INNODB_PAGE_SIZE: case OPT_INNODB_PAGE_SIZE:
ADD_PRINT_PARAM_OPT(innobase_page_size); ADD_PRINT_PARAM_OPT(innobase_page_size);
......
...@@ -78,7 +78,7 @@ my %debuggers = ( ...@@ -78,7 +78,7 @@ my %debuggers = (
options => '-f -o {log} {exe} {args}', options => '-f -o {log} {exe} {args}',
}, },
rr => { rr => {
options => '_RR_TRACE_DIR={log} rr record {exe} {args} --loose-skip-innodb-use-native-aio --loose-innodb-flush-method=fsync', options => '_RR_TRACE_DIR={log} rr record {exe} {args}',
run => 'env', run => 'env',
pre => sub { pre => sub {
::mtr_error('rr requires kernel.perf_event_paranoid <= 1') ::mtr_error('rr requires kernel.perf_event_paranoid <= 1')
......
...@@ -355,6 +355,18 @@ NUMERIC_BLOCK_SIZE 0 ...@@ -355,6 +355,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL ENUM_VALUE_LIST NULL
READ_ONLY NO READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DATA_FILE_BUFFERING
SESSION_VALUE NULL
DEFAULT_VALUE OFF
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Whether the file system cache for data files is enabled
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DATA_FILE_PATH VARIABLE_NAME INNODB_DATA_FILE_PATH
SESSION_VALUE NULL SESSION_VALUE NULL
DEFAULT_VALUE ibdata1:12M:autoextend DEFAULT_VALUE ibdata1:12M:autoextend
...@@ -379,6 +391,18 @@ NUMERIC_BLOCK_SIZE 0 ...@@ -379,6 +391,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL ENUM_VALUE_LIST NULL
READ_ONLY YES READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_DATA_FILE_WRITE_THROUGH
SESSION_VALUE NULL
DEFAULT_VALUE OFF
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Whether each write to data files writes through
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DATA_HOME_DIR VARIABLE_NAME INNODB_DATA_HOME_DIR
SESSION_VALUE NULL SESSION_VALUE NULL
DEFAULT_VALUE DEFAULT_VALUE
...@@ -1015,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 4096 ...@@ -1015,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 4096
ENUM_VALUE_LIST NULL ENUM_VALUE_LIST NULL
READ_ONLY NO READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LOG_FILE_WRITE_THROUGH
SESSION_VALUE NULL
DEFAULT_VALUE OFF
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Whether each write to ib_logfile0 is write through
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_LOG_GROUP_HOME_DIR VARIABLE_NAME INNODB_LOG_GROUP_HOME_DIR
SESSION_VALUE NULL SESSION_VALUE NULL
DEFAULT_VALUE DEFAULT_VALUE
......
...@@ -1724,7 +1724,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept ...@@ -1724,7 +1724,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
resize_log.write(CHECKPOINT_1, {c, get_block_size()}); resize_log.write(CHECKPOINT_1, {c, get_block_size()});
} }
if (srv_file_flush_method != SRV_O_DSYNC) if (!log_write_through)
ut_a(log.flush()); ut_a(log.flush());
latch.wr_lock(SRW_LOCK_CALL); latch.wr_lock(SRW_LOCK_CALL);
ut_ad(checkpoint_pending); ut_ad(checkpoint_pending);
...@@ -1756,7 +1756,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept ...@@ -1756,7 +1756,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
if (!is_pmem()) if (!is_pmem())
{ {
if (srv_file_flush_method != SRV_O_DSYNC) if (!log_write_through)
ut_a(resize_log.flush()); ut_a(resize_log.flush());
IF_WIN(log.close(),); IF_WIN(log.close(),);
} }
...@@ -1902,13 +1902,7 @@ static bool log_checkpoint() ...@@ -1902,13 +1902,7 @@ static bool log_checkpoint()
if (recv_recovery_is_on()) if (recv_recovery_is_on())
recv_sys.apply(true); recv_sys.apply(true);
switch (srv_file_flush_method) { fil_flush_file_spaces();
case SRV_NOSYNC:
case SRV_O_DIRECT_NO_FSYNC:
break;
default:
fil_flush_file_spaces();
}
log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t end_lsn= log_sys.get_lsn(); const lsn_t end_lsn= log_sys.get_lsn();
...@@ -2060,13 +2054,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) ...@@ -2060,13 +2054,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
MONITOR_FLUSH_SYNC_PAGES, n_flushed); MONITOR_FLUSH_SYNC_PAGES, n_flushed);
} }
switch (srv_file_flush_method) { fil_flush_file_spaces();
case SRV_NOSYNC:
case SRV_O_DIRECT_NO_FSYNC:
break;
default:
fil_flush_file_spaces();
}
log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t newest_lsn= log_sys.get_lsn(); const lsn_t newest_lsn= log_sys.get_lsn();
......
...@@ -499,6 +499,9 @@ void fil_space_t::flush_low() ...@@ -499,6 +499,9 @@ void fil_space_t::flush_low()
break; break;
} }
if (fil_system.is_write_through())
goto skip_flush;
fil_n_pending_tablespace_flushes++; fil_n_pending_tablespace_flushes++;
for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node; for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node= UT_LIST_GET_NEXT(chain, node)) node= UT_LIST_GET_NEXT(chain, node))
...@@ -523,8 +526,9 @@ void fil_space_t::flush_low() ...@@ -523,8 +526,9 @@ void fil_space_t::flush_low()
mysql_mutex_unlock(&fil_system.mutex); mysql_mutex_unlock(&fil_system.mutex);
} }
clear_flush();
fil_n_pending_tablespace_flushes--; fil_n_pending_tablespace_flushes--;
skip_flush:
clear_flush();
} }
/** Try to extend a tablespace. /** Try to extend a tablespace.
...@@ -753,7 +757,6 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) ...@@ -753,7 +757,6 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
{ {
if (space->is_in_unflushed_spaces) if (space->is_in_unflushed_spaces)
{ {
ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false; space->is_in_unflushed_spaces= false;
fil_system.unflushed_spaces.remove(*space); fil_system.unflushed_spaces.remove(*space);
} }
...@@ -786,7 +789,6 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle) ...@@ -786,7 +789,6 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
if (space->is_in_unflushed_spaces) if (space->is_in_unflushed_spaces)
{ {
ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false; space->is_in_unflushed_spaces= false;
unflushed_spaces.remove(*space); unflushed_spaces.remove(*space);
} }
...@@ -1320,6 +1322,120 @@ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() ...@@ -1320,6 +1322,120 @@ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
mysql_mutex_unlock(&mutex); mysql_mutex_unlock(&mutex);
} }
/** Close and reopen every currently open data file, so that a changed
fil_system.write_through or fil_system.buffered setting can take effect
on the file handles.
The caller must hold fil_system.mutex. The mutex is temporarily released
while a file is being flushed and while waiting for pending operations. */
ATTRIBUTE_COLD void fil_space_t::reopen_all()
{
mysql_mutex_assert_owner(&fil_system.mutex);
/* Incremented for the duration of the scan and decremented at the end.
NOTE(review): presumably this keeps fil_system.space_list stable while
the mutex is released below -- confirm against the other users of
freeze_space_list. */
fil_system.freeze_space_list++;
for (fil_space_t &space : fil_system.space_list)
{
/* Skip tablespaces that have no open file at all. */
for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
node= UT_LIST_GET_NEXT(chain, node))
if (node->is_open())
goto need_to_close;
continue;
need_to_close:
/* Set the CLOSING flag. If the tablespace was already STOPPING or
CLOSING before we set it, leave the tablespace alone.
NOTE(review): CLOSING is not cleared anywhere in this function;
presumably it is reset when the file is next acquired -- confirm. */
uint32_t p= space.n_pending.fetch_or(CLOSING, std::memory_order_acquire);
if (p & (STOPPING | CLOSING))
continue;
for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
node= UT_LIST_GET_NEXT(chain, node))
{
if (!node->is_open())
continue;
/* Choose the file type to pass to os_file_create(). The values 1 and 2
of FSP_FLAGS_GET_ZIP_SSIZE() select OS_DATA_FILE_NO_O_DIRECT.
NOTE(review): presumably these correspond to compressed page sizes
below 4096 bytes, for which O_DIRECT is not attempted -- confirm. */
ulint type= OS_DATA_FILE;
switch (FSP_FLAGS_GET_ZIP_SSIZE(space.flags)) {
case 1: case 2:
type= OS_DATA_FILE_NO_O_DIRECT;
}
/* Make at most 10000 attempts, sleeping 100 microseconds after each
unsuccessful one. */
for (ulint count= 10000; count--;)
{
p= space.pending();
/* Give up on this file if CLOSING was cleared in the meantime or the
tablespace is STOPPING. */
if (!(p & CLOSING) || (p & STOPPING))
break;
if (!(p & PENDING) && !node->being_extended)
{
/* Nothing is pending: flush the file outside the mutex. The
reacquire() / fetch_sub(1) pair brackets the time during which the
mutex is not held. */
space.reacquire();
mysql_mutex_unlock(&fil_system.mutex);
/* Unconditionally flush the file, because
fil_system.write_through was updated prematurely,
potentially causing some flushes to be lost. */
os_file_flush(node->handle);
mysql_mutex_lock(&fil_system.mutex);
/* Undo the reacquire() above and compute the resulting counter value. */
p= space.n_pending.fetch_sub(1, std::memory_order_relaxed) - 1;
/* The state may have changed while the mutex was released; recheck. */
if (!(p & CLOSING) || (p & STOPPING))
break;
if (!(p & PENDING) && !node->being_extended)
{
/* Still idle: replace the handle. Failure to close or to reopen
the file is treated as fatal (ut_a). */
ut_a(os_file_close(node->handle));
bool success;
node->handle= os_file_create(innodb_data_file_key, node->name,
node->is_raw_disk
? OS_FILE_OPEN_RAW : OS_FILE_OPEN,
OS_FILE_AIO, type,
srv_read_only_mode, &success);
ut_a(success);
goto next_file;
}
}
/* Operations are pending or the file is being extended: wait a little
outside the mutex and retry. */
space.reacquire();
mysql_mutex_unlock(&fil_system.mutex);
std::this_thread::sleep_for(std::chrono::microseconds(100));
mysql_mutex_lock(&fil_system.mutex);
space.release();
/* The file may have been closed while the mutex was released. */
if (!node->is_open())
goto next_file;
}
/* The label is the body of the if statement: both a jump to next_file
and a true condition proceed to the next file without reporting.
Otherwise the attempts ran out with CLOSING still set and the
tablespace not STOPPING, which is reported as an error. */
if (!(p & CLOSING) || (p & STOPPING))
next_file:
continue;
sql_print_error("InnoDB: Failed to reopen file '%s' due to " UINT32PF
" operations", node->name, p & PENDING);
}
}
fil_system.freeze_space_list--;
}
/** Try to enable or disable write-through of data files.
If the requested value differs from the current one, the flag is updated
and all open data files are reopened while holding fil_system_t::mutex.
@param write_through  the requested setting */
void fil_system_t::set_write_through(bool write_through)
{
  mysql_mutex_lock(&mutex);
  const bool changed= write_through != this->write_through;
  if (!changed)
  {
    /* Nothing to do; the open file handles already match the setting. */
    mysql_mutex_unlock(&mutex);
    return;
  }
  /* The flag is assigned before the files are reopened;
  fil_space_t::reopen_all() requires the mutex to be held. */
  this->write_through= write_through;
  fil_space_t::reopen_all();
  mysql_mutex_unlock(&mutex);
}
/** Try to enable or disable file system caching of data files.
If the requested value differs from the current one, the flag is updated
and all open data files are reopened while holding fil_system_t::mutex.
@param buffered  the requested setting */
void fil_system_t::set_buffered(bool buffered)
{
  mysql_mutex_lock(&mutex);
  const bool changed= buffered != this->buffered;
  if (!changed)
  {
    /* Nothing to do; the open file handles already match the setting. */
    mysql_mutex_unlock(&mutex);
    return;
  }
  /* The flag is assigned before the files are reopened;
  fil_space_t::reopen_all() requires the mutex to be held. */
  this->buffered= buffered;
  fil_space_t::reopen_all();
  mysql_mutex_unlock(&mutex);
}
/** Close all tablespace files at shutdown */ /** Close all tablespace files at shutdown */
void fil_space_t::close_all() void fil_space_t::close_all()
{ {
...@@ -1340,12 +1456,9 @@ void fil_space_t::close_all() ...@@ -1340,12 +1456,9 @@ void fil_space_t::close_all()
for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL; for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL;
node= UT_LIST_GET_NEXT(chain, node)) node= UT_LIST_GET_NEXT(chain, node))
{ {
if (!node->is_open()) if (!node->is_open())
{
next: next:
continue; continue;
}
for (ulint count= 10000; count--;) for (ulint count= 10000; count--;)
{ {
...@@ -1361,8 +1474,8 @@ void fil_space_t::close_all() ...@@ -1361,8 +1474,8 @@ void fil_space_t::close_all()
goto next; goto next;
} }
ib::error() << "File '" << node->name << "' has " << space.referenced() sql_print_error("InnoDB: File '%s' has " UINT32PF " operations",
<< " operations"; node->name, space.referenced());
} }
fil_system.detach(&space); fil_system.detach(&space);
...@@ -2598,7 +2711,7 @@ inline void fil_node_t::complete_write() ...@@ -2598,7 +2711,7 @@ inline void fil_node_t::complete_write()
mysql_mutex_assert_not_owner(&fil_system.mutex); mysql_mutex_assert_not_owner(&fil_system.mutex);
if (space->purpose != FIL_TYPE_TEMPORARY && if (space->purpose != FIL_TYPE_TEMPORARY &&
srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC && (!fil_system.is_write_through() && !my_disable_sync) &&
space->set_needs_flush()) space->set_needs_flush())
{ {
mysql_mutex_lock(&fil_system.mutex); mysql_mutex_lock(&fil_system.mutex);
...@@ -2774,14 +2887,6 @@ void fil_aio_callback(const IORequest &request) ...@@ -2774,14 +2887,6 @@ void fil_aio_callback(const IORequest &request)
possibly cached by the OS. */ possibly cached by the OS. */
void fil_flush_file_spaces() void fil_flush_file_spaces()
{ {
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
{
ut_d(mysql_mutex_lock(&fil_system.mutex));
ut_ad(fil_system.unflushed_spaces.empty());
ut_d(mysql_mutex_unlock(&fil_system.mutex));
return;
}
rescan: rescan:
mysql_mutex_lock(&fil_system.mutex); mysql_mutex_lock(&fil_system.mutex);
......
...@@ -366,6 +366,8 @@ const char* innodb_flush_method_names[] = { ...@@ -366,6 +366,8 @@ const char* innodb_flush_method_names[] = {
NullS NullS
}; };
static constexpr ulong innodb_flush_method_default = IF_WIN(6,4);
/** Enumeration of innodb_flush_method */ /** Enumeration of innodb_flush_method */
TYPELIB innodb_flush_method_typelib = { TYPELIB innodb_flush_method_typelib = {
array_elements(innodb_flush_method_names) - 1, array_elements(innodb_flush_method_names) - 1,
...@@ -374,6 +376,9 @@ TYPELIB innodb_flush_method_typelib = { ...@@ -374,6 +376,9 @@ TYPELIB innodb_flush_method_typelib = {
NULL NULL
}; };
/** Deprecated parameter */
static ulong innodb_flush_method;
/** Names of allowed values of innodb_deadlock_report */ /** Names of allowed values of innodb_deadlock_report */
static const char *innodb_deadlock_report_names[]= { static const char *innodb_deadlock_report_names[]= {
"off", /* Do not report any details of deadlocks */ "off", /* Do not report any details of deadlocks */
...@@ -4005,22 +4010,27 @@ static int innodb_init_params() ...@@ -4005,22 +4010,27 @@ static int innodb_init_params()
data_mysql_default_charset_coll = (ulint) default_charset_info->number; data_mysql_default_charset_coll = (ulint) default_charset_info->number;
if (innodb_flush_method == 1 /* O_DSYNC */) {
log_sys.log_write_through = true;
fil_system.write_through = true;
fil_system.buffered = false;
#if defined __linux__ || defined _WIN32
log_sys.log_buffered = false;
goto skip_buffering_tweak;
#endif
} else if (innodb_flush_method >= 4 /* O_DIRECT */
IF_WIN(&& innodb_flush_method < 8 /* normal */,)) {
/* O_DIRECT and similar settings do nothing */
#ifndef _WIN32 #ifndef _WIN32
if (srv_use_atomic_writes && my_may_have_atomic_write) { } else if (srv_use_atomic_writes && my_may_have_atomic_write) {
/* /* If atomic writes are enabled, do the same as with
Force O_DIRECT on Unixes (on Windows writes are always innodb_flush_method=O_DIRECT: retain the default settings */
unbuffered)
*/
switch (srv_file_flush_method) {
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
break;
default:
srv_file_flush_method = SRV_O_DIRECT;
fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
}
}
#endif #endif
} else {
log_sys.log_write_through = false;
fil_system.write_through = false;
fil_system.buffered = true;
}
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
if (srv_flush_log_at_trx_commit == 2) { if (srv_flush_log_at_trx_commit == 2) {
...@@ -4028,6 +4038,7 @@ static int innodb_init_params() ...@@ -4028,6 +4038,7 @@ static int innodb_init_params()
innodb_flush_log_at_trx_commit=2. */ innodb_flush_log_at_trx_commit=2. */
log_sys.log_buffered = true; log_sys.log_buffered = true;
} }
skip_buffering_tweak:
#endif #endif
if (srv_read_only_mode) { if (srv_read_only_mode) {
...@@ -4035,12 +4046,6 @@ static int innodb_init_params() ...@@ -4035,12 +4046,6 @@ static int innodb_init_params()
srv_use_doublewrite_buf = FALSE; srv_use_doublewrite_buf = FALSE;
} }
#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
cases, we ignore the setting of innodb_use_native_aio. */
srv_use_native_aio = FALSE;
#endif
#ifdef HAVE_URING #ifdef HAVE_URING
if (srv_use_native_aio && io_uring_may_be_unsafe) { if (srv_use_native_aio && io_uring_may_be_unsafe) {
sql_print_warning("innodb_use_native_aio may cause " sql_print_warning("innodb_use_native_aio may cause "
...@@ -4048,22 +4053,13 @@ static int innodb_init_params() ...@@ -4048,22 +4053,13 @@ static int innodb_init_params()
"https://jira.mariadb.org/browse/MDEV-26674", "https://jira.mariadb.org/browse/MDEV-26674",
io_uring_may_be_unsafe); io_uring_may_be_unsafe);
} }
#elif !defined LINUX_NATIVE_AIO && !defined _WIN32
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
cases, we ignore the setting of innodb_use_native_aio. */
srv_use_native_aio = FALSE;
#endif #endif
#ifndef _WIN32
ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
#else
switch (srv_file_flush_method) {
case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
break;
case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
srv_file_flush_method = SRV_FSYNC;
break;
default:
ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
}
#endif
innodb_buffer_pool_size_init(); innodb_buffer_pool_size_init();
srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
...@@ -18409,7 +18405,7 @@ buffer_pool_load_abort( ...@@ -18409,7 +18405,7 @@ buffer_pool_load_abort(
} }
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*, static void innodb_log_file_buffering_update(THD *, st_mysql_sys_var*,
void *, const void *save) void *, const void *save)
{ {
mysql_mutex_unlock(&LOCK_global_system_variables); mysql_mutex_unlock(&LOCK_global_system_variables);
...@@ -18418,6 +18414,30 @@ static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*, ...@@ -18418,6 +18414,30 @@ static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
} }
#endif #endif
/** Update callback of the innodb_log_file_write_through system variable.
LOCK_global_system_variables is released around the call to
log_sys.set_write_through() and reacquired before returning.
@param save  pointer to the new value (my_bool) */
static void innodb_log_file_write_through_update(THD *, st_mysql_sys_var*,
                                                 void *, const void *save)
{
  mysql_mutex_unlock(&LOCK_global_system_variables);
  const my_bool *const requested= static_cast<const my_bool*>(save);
  log_sys.set_write_through(*requested);
  mysql_mutex_lock(&LOCK_global_system_variables);
}
/** Update callback of the innodb_data_file_buffering system variable.
LOCK_global_system_variables is released around the call to
fil_system.set_buffered() and reacquired before returning.
@param save  pointer to the new value (my_bool) */
static void innodb_data_file_buffering_update(THD *, st_mysql_sys_var*,
                                              void *, const void *save)
{
  mysql_mutex_unlock(&LOCK_global_system_variables);
  const my_bool *const requested= static_cast<const my_bool*>(save);
  fil_system.set_buffered(*requested);
  mysql_mutex_lock(&LOCK_global_system_variables);
}
/** Update callback of the innodb_data_file_write_through system variable.
LOCK_global_system_variables is released around the call to
fil_system.set_write_through() and reacquired before returning.
@param save  pointer to the new value (my_bool) */
static void innodb_data_file_write_through_update(THD *, st_mysql_sys_var*,
                                                  void *, const void *save)
{
  mysql_mutex_unlock(&LOCK_global_system_variables);
  const my_bool *const requested= static_cast<const my_bool*>(save);
  fil_system.set_write_through(*requested);
  mysql_mutex_lock(&LOCK_global_system_variables);
}
static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
void *var, const void *save) void *var, const void *save)
{ {
...@@ -18876,11 +18896,10 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, ...@@ -18876,11 +18896,10 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
" guarantees in case of crash. 0 and 2 can be faster than 1 or 3.", " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
NULL, NULL, 1, 0, 3, 0); NULL, NULL, 1, 0, 3, 0);
static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method, static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED,
"With which method to flush data.", "With which method to flush data.",
NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), NULL, NULL, innodb_flush_method_default, &innodb_flush_method_typelib);
&innodb_flush_method_typelib);
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
...@@ -19312,6 +19331,21 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, ...@@ -19312,6 +19331,21 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
nullptr, innodb_log_file_buffering_update, FALSE); nullptr, innodb_log_file_buffering_update, FALSE);
#endif #endif
static MYSQL_SYSVAR_BOOL(log_file_write_through, log_sys.log_write_through,
PLUGIN_VAR_OPCMDARG,
"Whether each write to ib_logfile0 is write through",
nullptr, innodb_log_file_write_through_update, FALSE);
static MYSQL_SYSVAR_BOOL(data_file_buffering, fil_system.buffered,
PLUGIN_VAR_OPCMDARG,
"Whether the file system cache for data files is enabled",
nullptr, innodb_data_file_buffering_update, FALSE);
static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through,
PLUGIN_VAR_OPCMDARG,
"Whether each write to data files writes through",
nullptr, innodb_data_file_write_through_update, FALSE);
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG, PLUGIN_VAR_RQCMDARG,
"Redo log size in bytes.", "Redo log size in bytes.",
...@@ -19756,6 +19790,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { ...@@ -19756,6 +19790,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering), MYSQL_SYSVAR(log_file_buffering),
#endif #endif
MYSQL_SYSVAR(log_file_write_through),
MYSQL_SYSVAR(data_file_buffering),
MYSQL_SYSVAR(data_file_write_through),
MYSQL_SYSVAR(log_file_size), MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct), MYSQL_SYSVAR(max_dirty_pages_pct),
......
...@@ -51,35 +51,6 @@ using space_list_t= ilist<fil_space_t, space_list_tag_t>; ...@@ -51,35 +51,6 @@ using space_list_t= ilist<fil_space_t, space_list_tag_t>;
// Forward declaration // Forward declaration
extern my_bool srv_use_doublewrite_buf; extern my_bool srv_use_doublewrite_buf;
/** Possible values of innodb_flush_method */
enum srv_flush_t
{
/** fsync, the default */
SRV_FSYNC= 0,
/** open log files in O_DSYNC mode */
SRV_O_DSYNC,
/** do not call os_file_flush() when writing data files, but do flush
after writing to log files */
SRV_LITTLESYNC,
/** do not flush after writing */
SRV_NOSYNC,
/** invoke os_file_set_nocache() on data files. This implies using
unbuffered I/O but still fdatasync(), because some filesystems might
not flush meta-data on write completion */
SRV_O_DIRECT,
/** Like O_DIRECT, but skip fdatasync(), assuming that the data is
durable on write completion */
SRV_O_DIRECT_NO_FSYNC
#ifdef _WIN32
/** Traditional Windows appoach to open all files without caching,
and do FileFlushBuffers() */
,SRV_ALL_O_DIRECT_FSYNC
#endif
};
/** innodb_flush_method */
extern ulong srv_file_flush_method;
/** Undo tablespaces starts with space_id. */ /** Undo tablespaces starts with space_id. */
extern uint32_t srv_undo_space_id_start; extern uint32_t srv_undo_space_id_start;
/** The number of UNDO tablespaces that are open and ready to use. */ /** The number of UNDO tablespaces that are open and ready to use. */
...@@ -631,6 +602,8 @@ struct fil_space_t final ...@@ -631,6 +602,8 @@ struct fil_space_t final
} }
public: public:
/** Reopen all files on set_write_through() or set_buffered(). */
static void reopen_all();
/** Try to close a file to adhere to the innodb_open_files limit. /** Try to close a file to adhere to the innodb_open_files limit.
@param print_info whether to diagnose why a file cannot be closed @param print_info whether to diagnose why a file cannot be closed
@return whether a file was closed */ @return whether a file was closed */
...@@ -1414,6 +1387,20 @@ struct fil_system_t ...@@ -1414,6 +1387,20 @@ struct fil_system_t
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */ /** Map of fil_space_t::id to fil_space_t* */
hash_table_t spaces; hash_table_t spaces;
/** whether each write to data files is durable (O_DSYNC) */
my_bool write_through;
/** whether data files are buffered (not O_DIRECT) */
my_bool buffered;
/** Try to enable or disable write-through of data files */
void set_write_through(bool write_through);
/** Try to enable or disable file system caching of data files */
void set_buffered(bool buffered);
TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; }
TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; }
/** tablespaces for which fil_space_t::needs_flush() holds */ /** tablespaces for which fil_space_t::needs_flush() holds */
sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces; sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
/** number of currently open files; protected by mutex */ /** number of currently open files; protected by mutex */
...@@ -1527,12 +1514,7 @@ template<bool have_reference> inline void fil_space_t::flush() ...@@ -1527,12 +1514,7 @@ template<bool have_reference> inline void fil_space_t::flush()
mysql_mutex_assert_not_owner(&fil_system.mutex); mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(!have_reference || (pending() & PENDING)); ut_ad(!have_reference || (pending() & PENDING));
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT); ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) if (have_reference)
{
ut_ad(!is_in_unflushed_spaces);
ut_ad(!needs_flush());
}
else if (have_reference)
flush_low(); flush_low();
else else
{ {
......
...@@ -275,6 +275,8 @@ typedef srw_lock log_rwlock_t; ...@@ -275,6 +275,8 @@ typedef srw_lock log_rwlock_t;
bool log_maybe_unbuffered; bool log_maybe_unbuffered;
# endif # endif
#endif #endif
/** whether each write to ib_logfile0 is durable (O_DSYNC) */
my_bool log_write_through;
/** Fields involved in checkpoints @{ */ /** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if lsn_t log_capacity; /*!< capacity of the log; if
...@@ -362,6 +364,8 @@ typedef srw_lock log_rwlock_t; ...@@ -362,6 +364,8 @@ typedef srw_lock log_rwlock_t;
/** Try to enable or disable file system caching (update log_buffered) */ /** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered); void set_buffered(bool buffered);
#endif #endif
/** Try to enable or disable durable writes (update log_write_through) */
void set_write_through(bool write_through);
void attach(log_file_t file, os_offset_t size); void attach(log_file_t file, os_offset_t size);
......
...@@ -401,6 +401,31 @@ void log_t::set_buffered(bool buffered) ...@@ -401,6 +401,31 @@ void log_t::set_buffered(bool buffered)
} }
#endif #endif
/** Try to switch ib_logfile0 between write-through and cached writes.
Nothing is done if the log is in persistent memory or if the server is in
high-level read-only mode. Otherwise, between log_resize_acquire() and
log_resize_release(), the log file is closed and reopened, but only when
no log resizing is in progress, the file is currently open, and the
requested mode differs from log_write_through.
@param write_through  whether each write to the log should be durable */
void log_t::set_write_through(bool write_through)
{
  if (is_pmem() || high_level_read_only)
    return;

  log_resize_acquire();

  const bool reopen= !resize_in_progress() && is_opened() &&
    write_through != bool(log_write_through);

  if (reopen)
  {
    /* Release the current handle before opening a new one. */
    os_file_close_func(log.m_file);
    log.m_file= OS_FILE_CLOSED;

    const std::string log_path{get_log_file_path()};

    /* The flag must be updated before reopening: os_file_create_func()
    consults log_write_through when opening an OS_LOG_FILE. */
    log_write_through= write_through;

    bool opened;
    log.m_file= os_file_create_func(log_path.c_str(), OS_FILE_OPEN,
                                    OS_FILE_NORMAL, OS_LOG_FILE, false,
                                    &opened);
    /* Failing to reopen the log file is treated as fatal. */
    ut_a(log.m_file != OS_FILE_CLOSED);

    sql_print_information(log_write_through
                          ? "InnoDB: Log writes write through"
                          : "InnoDB: Log writes may be cached");
  }

  log_resize_release();
}
/** Start resizing the log and release the exclusive latch. /** Start resizing the log and release the exclusive latch.
@param size requested new file_size @param size requested new file_size
@return whether the resizing was started successfully */ @return whether the resizing was started successfully */
...@@ -852,7 +877,7 @@ bool log_t::flush(lsn_t lsn) noexcept ...@@ -852,7 +877,7 @@ bool log_t::flush(lsn_t lsn) noexcept
{ {
ut_ad(lsn >= get_flushed_lsn()); ut_ad(lsn >= get_flushed_lsn());
flush_lock.set_pending(lsn); flush_lock.set_pending(lsn);
const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()}; const bool success{log_write_through || log.flush()};
if (UNIV_LIKELY(success)) if (UNIV_LIKELY(success))
{ {
flushed_to_disk_lsn.store(lsn, std::memory_order_release); flushed_to_disk_lsn.store(lsn, std::memory_order_release);
......
...@@ -65,7 +65,9 @@ Created 10/21/1995 Heikki Tuuri ...@@ -65,7 +65,9 @@ Created 10/21/1995 Heikki Tuuri
#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
#ifdef _WIN32 #ifdef _WIN32
#include <winioctl.h> # include <winioctl.h>
#elif !defined O_DSYNC
# define O_DSYNC O_SYNC
#endif #endif
// my_test_if_atomic_write() , my_win_secattr() // my_test_if_atomic_write() , my_win_secattr()
...@@ -931,6 +933,8 @@ bool ...@@ -931,6 +933,8 @@ bool
os_file_flush_func( os_file_flush_func(
os_file_t file) os_file_t file)
{ {
if (UNIV_UNLIKELY(my_disable_sync)) return true;
int ret; int ret;
ret = os_file_sync_posix(file); ret = os_file_sync_posix(file);
...@@ -981,40 +985,19 @@ os_file_create_simple_func( ...@@ -981,40 +985,19 @@ os_file_create_simple_func(
*success = false; *success = false;
int create_flag; int create_flag = O_RDONLY;
const char* mode_str = NULL;
ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
if (create_mode == OS_FILE_OPEN) { if (read_only) {
mode_str = "OPEN"; } else if (create_mode == OS_FILE_OPEN) {
if (access_type != OS_FILE_READ_ONLY) {
if (access_type == OS_FILE_READ_ONLY) {
create_flag = O_RDONLY;
} else if (read_only) {
create_flag = O_RDONLY;
} else {
create_flag = O_RDWR; create_flag = O_RDWR;
} }
} else if (read_only) {
mode_str = "OPEN";
create_flag = O_RDONLY;
} else if (create_mode == OS_FILE_CREATE) { } else if (create_mode == OS_FILE_CREATE) {
mode_str = "CREATE";
create_flag = O_RDWR | O_CREAT | O_EXCL; create_flag = O_RDWR | O_CREAT | O_EXCL;
} else if (create_mode == OS_FILE_CREATE_PATH) { } else if (create_mode == OS_FILE_CREATE_PATH) {
mode_str = "CREATE PATH";
/* Create subdirs along the path if needed. */ /* Create subdirs along the path if needed. */
*success = os_file_create_subdirs_if_needed(name); *success = os_file_create_subdirs_if_needed(name);
...@@ -1040,40 +1023,32 @@ os_file_create_simple_func( ...@@ -1040,40 +1023,32 @@ os_file_create_simple_func(
return(OS_FILE_CLOSED); return(OS_FILE_CLOSED);
} }
bool retry; create_flag |= O_CLOEXEC;
if (fil_system.is_write_through()) create_flag |= O_DSYNC;
int direct_flag = fil_system.is_buffered() ? 0 : O_DIRECT;
do { for (;;) {
file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) { if (file == -1) {
if (direct_flag && errno == EINVAL) {
direct_flag = 0;
continue;
}
*success = false; *success = false;
retry = os_file_handle_error( if (!os_file_handle_error(
name, name,
create_mode == OS_FILE_OPEN create_mode == OS_FILE_OPEN
? "open" : "create"); ? "open" : "create")) {
break;
}
} else { } else {
*success = true; *success = true;
retry = false;
}
} while (retry);
/* This function is always called for data files, we should disable
OS caching (O_DIRECT) here as we do in os_file_create_func(), so
we open the same file in the same mode, see man page of open(2). */
if (!srv_read_only_mode && *success) {
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
os_file_set_nocache(file, name, mode_str);
break;
default:
break; break;
} }
} }
#ifndef _WIN32
if (!read_only if (!read_only
&& *success && *success
&& access_type == OS_FILE_READ_WRITE && access_type == OS_FILE_READ_WRITE
...@@ -1084,7 +1059,6 @@ os_file_create_simple_func( ...@@ -1084,7 +1059,6 @@ os_file_create_simple_func(
close(file); close(file);
file = -1; file = -1;
} }
#endif /* !_WIN32 */
return(file); return(file);
} }
...@@ -1156,8 +1130,8 @@ os_file_create_func( ...@@ -1156,8 +1130,8 @@ os_file_create_func(
return(OS_FILE_CLOSED); return(OS_FILE_CLOSED);
); );
int create_flag; int create_flag = O_RDONLY | O_CLOEXEC;
const char* mode_str = NULL; const char* mode_str = "OPEN";
on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
? true : false; ? true : false;
...@@ -1167,30 +1141,17 @@ os_file_create_func( ...@@ -1167,30 +1141,17 @@ os_file_create_func(
create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
| OS_FILE_ON_ERROR_SILENT)); | OS_FILE_ON_ERROR_SILENT));
if (create_mode == OS_FILE_OPEN if (read_only) {
|| create_mode == OS_FILE_OPEN_RAW } else if (create_mode == OS_FILE_OPEN
|| create_mode == OS_FILE_OPEN_RETRY) { || create_mode == OS_FILE_OPEN_RAW
|| create_mode == OS_FILE_OPEN_RETRY) {
mode_str = "OPEN"; create_flag = O_RDWR | O_CLOEXEC;
create_flag = read_only ? O_RDONLY : O_RDWR;
} else if (read_only) {
mode_str = "OPEN";
create_flag = O_RDONLY;
} else if (create_mode == OS_FILE_CREATE) { } else if (create_mode == OS_FILE_CREATE) {
mode_str = "CREATE"; mode_str = "CREATE";
create_flag = O_RDWR | O_CREAT | O_EXCL; create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
} else if (create_mode == OS_FILE_OVERWRITE) { } else if (create_mode == OS_FILE_OVERWRITE) {
mode_str = "OVERWRITE"; mode_str = "OVERWRITE";
create_flag = O_RDWR | O_CREAT | O_TRUNC; create_flag = O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC;
} else { } else {
ib::error() ib::error()
<< "Unknown file create mode (" << create_mode << ")" << "Unknown file create mode (" << create_mode << ")"
...@@ -1205,25 +1166,30 @@ os_file_create_func( ...@@ -1205,25 +1166,30 @@ os_file_create_func(
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
/* We let O_DSYNC only affect log files */ create_flag |= O_CLOEXEC;
if (!read_only int direct_flag = type == OS_DATA_FILE && create_mode != OS_FILE_CREATE
&& type == OS_LOG_FILE && !fil_system.is_buffered()
&& srv_file_flush_method == SRV_O_DSYNC) { ? O_DIRECT : 0;
#ifdef O_DSYNC
if (read_only) {
} else if ((type == OS_LOG_FILE)
? log_sys.log_write_through
: fil_system.is_write_through()) {
create_flag |= O_DSYNC; create_flag |= O_DSYNC;
#else
create_flag |= O_SYNC;
#endif
} }
os_file_t file; os_file_t file;
bool retry;
do { for (;;) {
file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) { if (file == -1) {
if (direct_flag && errno == EINVAL) {
direct_flag = 0;
continue;
}
const char* operation; const char* operation;
operation = (create_mode == OS_FILE_CREATE operation = (create_mode == OS_FILE_CREATE
...@@ -1232,39 +1198,30 @@ os_file_create_func( ...@@ -1232,39 +1198,30 @@ os_file_create_func(
*success = false; *success = false;
if (on_error_no_exit) { if (on_error_no_exit) {
retry = os_file_handle_error_no_exit( if (os_file_handle_error_no_exit(
name, operation, on_error_silent); name, operation, on_error_silent))
continue;
} else { } else {
retry = os_file_handle_error(name, operation); if (os_file_handle_error(name, operation))
continue;
} }
return file;
} else { } else {
*success = true; *success = true;
retry = false; break;
} }
} while (retry);
if (!*success) {
return file;
} }
#if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT #if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT
if (type == OS_DATA_FILE) { if (type == OS_DATA_FILE && create_mode == OS_FILE_CREATE
switch (srv_file_flush_method) { && !fil_system.is_buffered()) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
# ifdef __linux__ # ifdef __linux__
use_o_direct: use_o_direct:
# endif # endif
os_file_set_nocache(file, name, mode_str); os_file_set_nocache(file, name, mode_str);
break;
default:
break;
}
}
# ifdef __linux__ # ifdef __linux__
else if (type == OS_LOG_FILE && !log_sys.is_opened()) { } else if (type == OS_LOG_FILE && !log_sys.is_opened()) {
struct stat st; struct stat st;
char b[20 + sizeof "/sys/dev/block/" ":" char b[20 + sizeof "/sys/dev/block/" ":"
"/../queue/physical_block_size"]; "/../queue/physical_block_size"];
...@@ -1316,11 +1273,10 @@ os_file_create_func( ...@@ -1316,11 +1273,10 @@ os_file_create_func(
log_sys.log_buffered= true; log_sys.log_buffered= true;
log_sys.set_block_size(512); log_sys.set_block_size(512);
} }
}
# endif # endif
}
#endif #endif
#ifndef _WIN32
if (!read_only if (!read_only
&& create_mode != OS_FILE_OPEN_RAW && create_mode != OS_FILE_OPEN_RAW
&& !my_disable_locking && !my_disable_locking
...@@ -1348,7 +1304,6 @@ os_file_create_func( ...@@ -1348,7 +1304,6 @@ os_file_create_func(
close(file); close(file);
file = -1; file = -1;
} }
#endif /* !_WIN32 */
return(file); return(file);
} }
...@@ -1786,6 +1741,9 @@ Flushes the write buffers of a given file to the disk. ...@@ -1786,6 +1741,9 @@ Flushes the write buffers of a given file to the disk.
@return true if success */ @return true if success */
bool os_file_flush_func(os_file_t file) bool os_file_flush_func(os_file_t file)
{ {
if (UNIV_UNLIKELY(my_disable_sync))
return true;
++os_n_fsyncs; ++os_n_fsyncs;
static bool disable_datasync; static bool disable_datasync;
...@@ -2011,6 +1969,11 @@ os_file_create_simple_func( ...@@ -2011,6 +1969,11 @@ os_file_create_simple_func(
return(OS_FILE_CLOSED); return(OS_FILE_CLOSED);
} }
if (fil_system.is_write_through())
attributes |= FILE_FLAG_WRITE_THROUGH;
if (!fil_system.is_buffered())
attributes |= FILE_FLAG_NO_BUFFERING;
bool retry; bool retry;
do { do {
...@@ -2182,27 +2145,16 @@ os_file_create_func( ...@@ -2182,27 +2145,16 @@ os_file_create_func(
if (!log_sys.is_opened() && !log_sys.log_buffered) { if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING; attributes|= FILE_FLAG_NO_BUFFERING;
} }
if (srv_file_flush_method == SRV_O_DSYNC) if (log_sys.log_write_through)
attributes|= FILE_FLAG_WRITE_THROUGH; attributes|= FILE_FLAG_WRITE_THROUGH;
} } else {
else if (type == OS_DATA_FILE) if (type == OS_DATA_FILE && !fil_system.is_buffered())
{
switch (srv_file_flush_method)
{
case SRV_FSYNC:
case SRV_LITTLESYNC:
case SRV_NOSYNC:
break;
default:
attributes|= FILE_FLAG_NO_BUFFERING; attributes|= FILE_FLAG_NO_BUFFERING;
} if (fil_system.is_write_through())
attributes|= FILE_FLAG_WRITE_THROUGH;
} }
DWORD access = GENERIC_READ; DWORD access = read_only ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE;
if (!read_only) {
access |= GENERIC_WRITE;
}
for (;;) { for (;;) {
const char *operation; const char *operation;
......
...@@ -223,9 +223,6 @@ ulong srv_read_ahead_threshold; ...@@ -223,9 +223,6 @@ ulong srv_read_ahead_threshold;
buffer in terms of percentage of the buffer pool. */ buffer in terms of percentage of the buffer pool. */
uint srv_change_buffer_max_size; uint srv_change_buffer_max_size;
ulong srv_file_flush_method;
/** copy of innodb_open_files; @see innodb_init_params() */ /** copy of innodb_open_files; @see innodb_init_params() */
ulint srv_max_n_open_files; ulint srv_max_n_open_files;
......
...@@ -1168,7 +1168,7 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx) ...@@ -1168,7 +1168,7 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx)
callback= &cb; callback= &cb;
} }
log_write_up_to(lsn, srv_file_flush_method != SRV_NOSYNC && log_write_up_to(lsn, !my_disable_sync &&
(srv_flush_log_at_trx_commit & 1), callback); (srv_flush_log_at_trx_commit & 1), callback);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment