Commit 10dd290b authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD

For tablespaces that do not reside on spinning storage, it does
not make sense to attempt to write nearby pages when writing out
dirty pages from the InnoDB buffer pool. It is actually detrimental
to performance and to the life span of flash ROM storage.

With this change, MariaDB will detect whether an InnoDB file resides
on solid-state storage. The detection has been implemented for Linux
and Microsoft Windows. For other systems, we will err on the safe side
and assume that files reside on SSD.

As part of this change, we will reduce the number of fstat() calls
when opening data files on POSIX systems and slightly clean up some
file I/O code.

FIXME: os_is_sparse_file_supported() on POSIX works in a destructive
manner. Thus, we can only invoke it when creating files, not when
opening them.

For diagnostics, we introduce the column ON_SSD to the table
INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table
INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose
is to reflect the contents of the InnoDB system table SYS_TABLESPACES,
which we would like to remove at some point.

On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty
sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION
or ERROR_NOT_SUPPORTED. We will also silently ignore this error
and assume that the file does not reside on SSD.

On Linux, the detection will be based on the files
/sys/block/*/queue/rotational and /sys/block/*/dev.
Especially for USB storage, it is possible that
/sys/block/*/queue/rotational will wrongly report 1 instead of 0.

fil_node_t::on_ssd: Whether the InnoDB data file resides on
solid-state storage.

fil_system_t::ssd: Collection of Linux block devices that reside on
non-rotational storage.

fil_system_t::create(): Detect ssd on Linux based on the contents
of /sys/block/*/queue/rotational and /sys/block/*/dev.

fil_system_t::is_ssd(dev_t): Determine if a Linux block device is
non-rotational. Partitions will be identified with the containing
block device by assuming that the least significant 4 bits of the
minor number identify a partition, and that the "partition number"
of the entire device is 0.
parent 2d825e97
......@@ -184,7 +184,6 @@
#cmakedefine HAVE_PERROR 1
#cmakedefine HAVE_POLL 1
#cmakedefine HAVE_POSIX_FALLOCATE 1
#cmakedefine HAVE_LINUX_FALLOC_H 1
#cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1
#cmakedefine HAVE_PREAD 1
#cmakedefine HAVE_PAUSE_INSTRUCTION 1
......
......@@ -196,7 +196,6 @@ CHECK_INCLUDE_FILES (inttypes.h HAVE_INTTYPES_H)
CHECK_INCLUDE_FILES (langinfo.h HAVE_LANGINFO_H)
CHECK_INCLUDE_FILES (link.h HAVE_LINK_H)
CHECK_INCLUDE_FILES (linux/unistd.h HAVE_LINUX_UNISTD_H)
CHECK_INCLUDE_FILES (linux/falloc.h HAVE_LINUX_FALLOC_H)
CHECK_INCLUDE_FILES (limits.h HAVE_LIMITS_H)
CHECK_INCLUDE_FILES (locale.h HAVE_LOCALE_H)
CHECK_INCLUDE_FILES (malloc.h HAVE_MALLOC_H)
......
......@@ -385,7 +385,7 @@ SPACE NAME ENCRYPTION_SCHEME KEYSERVER_REQUESTS MIN_KEY_VERSION CURRENT_KEY_VERS
Warnings:
Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_encryption but the InnoDB storage engine is not installed
select * from information_schema.innodb_tablespaces_scrubbing;
SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ROTATING_OR_FLUSHING
SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ON_SSD
Warnings:
Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_scrubbing but the InnoDB storage engine is not installed
select * from information_schema.innodb_mutexes;
......
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2018, MariaDB Corporation.
Copyright (c) 2013, 2019, MariaDB Corporation.
Copyright (c) 2013, 2014, Fusion-io
This program is free software; you can redistribute it and/or modify it under
......@@ -1314,9 +1314,13 @@ buf_flush_try_neighbors(
buf_pool_t* buf_pool = buf_pool_get(page_id);
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
fil_space_t* space = fil_space_acquire_for_io(page_id.space());
if (!space) {
return 0;
}
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
|| srv_flush_neighbors == 0) {
|| !srv_flush_neighbors || !space->is_rotational()) {
/* If there is little space or neighbor flushing is
not enabled then just flush the victim. */
low = page_id.page_no();
......@@ -1371,9 +1375,8 @@ buf_flush_try_neighbors(
}
}
const ulint space_size = fil_space_get_size(page_id.space());
if (high > space_size) {
high = space_size;
if (high > space->size) {
high = space->size;
}
DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
......@@ -1450,6 +1453,8 @@ buf_flush_try_neighbors(
buf_pool_mutex_exit(buf_pool);
}
space->release_for_io();
if (count > 1) {
MONITOR_INC_VALUE_CUMULATIVE(
MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
......
This diff is collapsed.
......@@ -8697,7 +8697,7 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] =
#define TABLESPACES_SCRUBBING_COMPRESSED 2
{STRUCT_FLD(field_name, "COMPRESSED"),
STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
STRUCT_FLD(field_length, 1),
STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
STRUCT_FLD(value, 0),
STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
......@@ -8749,9 +8749,9 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] =
STRUCT_FLD(old_name, ""),
STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9
{STRUCT_FLD(field_name, "ROTATING_OR_FLUSHING"),
STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
#define TABLESPACES_SCRUBBING_ON_SSD 8
{STRUCT_FLD(field_name, "ON_SSD"),
STRUCT_FLD(field_length, 1),
STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
STRUCT_FLD(value, 0),
STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
......@@ -8829,6 +8829,8 @@ i_s_dict_fill_tablespaces_scrubbing(
}
}
OK(fields[TABLESPACES_SCRUBBING_ON_SSD]->store(!space->is_rotational(),
true));
OK(schema_table_store_record(thd, table_to_fill));
DBUG_RETURN(0);
......
......@@ -33,6 +33,9 @@ Created 10/25/1995 Heikki Tuuri
#include "log0recv.h"
#include "dict0types.h"
#ifdef UNIV_LINUX
# include <set>
#endif
// Forward declaration
extern my_bool srv_use_doublewrite_buf;
......@@ -234,7 +237,10 @@ struct fil_space_t {
/** Note that the tablespace has been imported.
Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
written while the space ID is being updated in each page. */
void set_imported();
inline void set_imported();
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
/** Open each file. Only invoked on fil_system.temp_space.
@return whether all files were opened */
......@@ -537,6 +543,8 @@ struct fil_node_t {
pfs_os_file_t handle;
/** whether the file actually is a raw device or disk partition */
bool is_raw_disk;
/** whether the file is on non-rotational media (SSD) */
bool on_ssd;
/** size of the file in database pages (0 if not known yet);
the possible last incomplete megabyte may be ignored
if space->id == 0 */
......@@ -579,6 +587,14 @@ struct fil_node_t {
@return whether the page was found valid */
bool read_page0(bool first);
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED
@param statbuf (only when UNIV_LINUX is defined) optional pointer to a
struct stat; defaults to NULL. NOTE(review): presumably the result of an
earlier fstat() call on the file, passed in to avoid repeating the call --
confirm against the definition */
void find_metadata(os_file_t file = OS_FILE_CLOSED
#ifdef UNIV_LINUX
, struct stat* statbuf = NULL
#endif
);
/** Close the file handle. */
void close();
};
......@@ -586,6 +602,24 @@ struct fil_node_t {
/** Value of fil_node_t::magic_n */
#define FIL_NODE_MAGIC_N 89389
inline void fil_space_t::set_imported()
{
ut_ad(purpose == FIL_TYPE_IMPORT);
purpose = FIL_TYPE_TABLESPACE;
UT_LIST_GET_FIRST(chain)->find_metadata();
}
inline bool fil_space_t::is_rotational() const
{
for (const fil_node_t* node = UT_LIST_GET_FIRST(chain);
node != NULL; node = UT_LIST_GET_NEXT(chain, node)) {
if (!node->on_ssd) {
return true;
}
}
return false;
}
/** Common InnoDB file extensions */
enum ib_extention {
NO_EXT = 0,
......@@ -853,6 +887,22 @@ struct fil_system_t {
private:
bool m_initialised;
#ifdef UNIV_LINUX
/** available block devices that reside on non-rotational storage */
std::vector<dev_t> ssd;
public:
/** Determine whether a file system device is on non-rotational storage.
@param dev device number to look up
@return whether dev itself, or dev with its partition number cleared,
is listed in ssd */
bool is_ssd(dev_t dev) const
{
	/* Linux seems to allow up to 15 partitions per block device, so the
	least significant 4 bits of the device number are taken to be the
	partition number, with 0 denoting the whole device. If the detected
	ssd carries "partition number 0", compare the candidate file system
	number without the partition number.

	The mask must have the full width of dev_t: the unsigned int
	constant ~15U would be zero-extended in the bitwise AND and would
	also clear the upper half of a 64-bit dev_t. */
	const dev_t whole_device = dev & ~static_cast<dev_t>(15);

	for (const auto s : ssd)
		if (dev == s || whole_device == s)
			return true;
	return false;
}
#endif
public:
ib_mutex_t mutex; /*!< The mutex protecting the cache */
fil_space_t* sys_space; /*!< The innodb_system tablespace */
......
/***********************************************************************
Copyright (c) 2017, MariaDB Corporation.
Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
......@@ -54,22 +54,4 @@ buf_page_get_trim_length(
ulint write_length)
MY_ATTRIBUTE((warn_unused_result));
/**
Get should we punch hole to tablespace.
@param[in] space Tablespace
@return true, if punch hole should be tried, false if not. */
bool
fil_node_should_punch_hole(
const fil_node_t* node)
MY_ATTRIBUTE((warn_unused_result));
/**
Set punch hole to tablespace to given value.
@param[in] space Tablespace
@param[in] val value to be set. */
void
fil_space_set_punch_hole(
fil_node_t* node,
bool val);
#endif /* OS_API_H */
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2013, 2017, MariaDB Corporation.
Copyright (c) 2013, 2019, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
......@@ -360,17 +360,8 @@ class IORequest {
/** Set the pointer to file node for IO
@param[in] node File node */
void set_fil_node(fil_node_t* node)
{
if (node && !fil_node_should_punch_hole(node)) {
clear_punch_hole();
}
m_fil_node = node;
}
inline void set_fil_node(fil_node_t* node);
/** Compare two requests
@return true if they are equal */
bool operator==(const IORequest& rhs) const
{
return(m_type == rhs.m_type);
......@@ -414,17 +405,7 @@ class IORequest {
: 0);
}
bool should_punch_hole() const {
return (m_fil_node ?
fil_node_should_punch_hole(m_fil_node)
: false);
}
void space_no_punch_hole() const {
if (m_fil_node) {
fil_space_set_punch_hole(m_fil_node, false);
}
}
inline bool should_punch_hole() const;
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
......@@ -1591,19 +1572,6 @@ os_file_change_size_win32(
#endif /*_WIN32 */
/** Check if the file system supports sparse files.
Warning: On POSIX systems we try and punch a hole from offset 0 to
the system configured page size. This should only be called on an empty
file.
@param[in] fh File handle for the file - if opened
@return true if the file system supports sparse files */
bool
os_is_sparse_file_supported(
os_file_t fh)
MY_ATTRIBUTE((warn_unused_result));
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
......@@ -1643,16 +1611,6 @@ is_absolute_path(
return(false);
}
/***********************************************************************//**
Try to get number of bytes per sector from file system.
@return file block size */
UNIV_INTERN
ulint
os_file_get_block_size(
/*===================*/
os_file_t file, /*!< in: handle to a file */
const char* name); /*!< in: file name */
#include "os0file.ic"
#endif /* os0file_h */
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment