Commit 10dd290b authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD

For tablespaces that do not reside on spinning storage, it does
not make sense to attempt to write nearby pages when writing out
dirty pages from the InnoDB buffer pool. It is actually detrimental
to performance and to the life span of flash ROM storage.

With this change, MariaDB will detect whether an InnoDB file resides
on solid-state storage. The detection has been implemented for Linux
and Microsoft Windows. For other systems, we will err on the safe side
and assume that files reside on SSD.

As part of this change, we will reduce the number of fstat() calls
when opening data files on POSIX systems and slightly clean up some
file I/O code.

FIXME: os_is_sparse_file_supported() on POSIX works in a destructive
manner. Thus, we can only invoke it when creating files, not when
opening them.

For diagnostics, we introduce the column ON_SSD to the table
INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table
INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose
is to reflect the contents of the InnoDB system table SYS_TABLESPACES,
which we would like to remove at some point.

On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty
sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION
or ERROR_NOT_SUPPORTED. We will also silently ignore this error,
and assume that the file does not reside on SSD.

On Linux, the detection will be based on the files
/sys/block/*/queue/rotational and /sys/block/*/dev.
Especially for USB storage, it is possible that
/sys/block/*/queue/rotational will wrongly report 1 instead of 0.

fil_node_t::on_ssd: Whether the InnoDB data file resides on
solid-state storage.

fil_system_t::ssd: Collection of Linux block devices that reside on
non-rotational storage.

fil_system_t::create(): Detect ssd on Linux based on the contents
of /sys/block/*/queue/rotational and /sys/block/*/dev.

fil_system_t::is_ssd(dev_t): Determine if a Linux block device is
non-rotational. Partitions will be identified with the containing
block device by assuming that the least significant 4 bits of the
minor number identify a partition, and that the "partition number"
of the entire device is 0.
parent 2d825e97
......@@ -184,7 +184,6 @@
#cmakedefine HAVE_PERROR 1
#cmakedefine HAVE_POLL 1
#cmakedefine HAVE_POSIX_FALLOCATE 1
#cmakedefine HAVE_LINUX_FALLOC_H 1
#cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1
#cmakedefine HAVE_PREAD 1
#cmakedefine HAVE_PAUSE_INSTRUCTION 1
......
......@@ -196,7 +196,6 @@ CHECK_INCLUDE_FILES (inttypes.h HAVE_INTTYPES_H)
CHECK_INCLUDE_FILES (langinfo.h HAVE_LANGINFO_H)
CHECK_INCLUDE_FILES (link.h HAVE_LINK_H)
CHECK_INCLUDE_FILES (linux/unistd.h HAVE_LINUX_UNISTD_H)
CHECK_INCLUDE_FILES (linux/falloc.h HAVE_LINUX_FALLOC_H)
CHECK_INCLUDE_FILES (limits.h HAVE_LIMITS_H)
CHECK_INCLUDE_FILES (locale.h HAVE_LOCALE_H)
CHECK_INCLUDE_FILES (malloc.h HAVE_MALLOC_H)
......
......@@ -385,7 +385,7 @@ SPACE NAME ENCRYPTION_SCHEME KEYSERVER_REQUESTS MIN_KEY_VERSION CURRENT_KEY_VERS
Warnings:
Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_encryption but the InnoDB storage engine is not installed
select * from information_schema.innodb_tablespaces_scrubbing;
SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ROTATING_OR_FLUSHING
SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ON_SSD
Warnings:
Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_scrubbing but the InnoDB storage engine is not installed
select * from information_schema.innodb_mutexes;
......
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2018, MariaDB Corporation.
Copyright (c) 2013, 2019, MariaDB Corporation.
Copyright (c) 2013, 2014, Fusion-io
This program is free software; you can redistribute it and/or modify it under
......@@ -1314,9 +1314,13 @@ buf_flush_try_neighbors(
buf_pool_t* buf_pool = buf_pool_get(page_id);
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
fil_space_t* space = fil_space_acquire_for_io(page_id.space());
if (!space) {
return 0;
}
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
|| srv_flush_neighbors == 0) {
|| !srv_flush_neighbors || !space->is_rotational()) {
/* If there is little space or neighbor flushing is
not enabled then just flush the victim. */
low = page_id.page_no();
......@@ -1371,9 +1375,8 @@ buf_flush_try_neighbors(
}
}
const ulint space_size = fil_space_get_size(page_id.space());
if (high > space_size) {
high = space_size;
if (high > space->size) {
high = space->size;
}
DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
......@@ -1450,6 +1453,8 @@ buf_flush_try_neighbors(
buf_pool_mutex_exit(buf_pool);
}
space->release_for_io();
if (count > 1) {
MONITOR_INC_VALUE_CUMULATIVE(
MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
......
......@@ -50,6 +50,11 @@ Created 10/25/1995 Heikki Tuuri
#include "sync0sync.h"
#include "buf0flu.h"
#include "os0api.h"
#ifdef UNIV_LINUX
# include <sys/types.h>
# include <sys/sysmacros.h>
# include <dirent.h>
#endif
/** Tries to close a file in the LRU list. The caller must hold the fil_sys
mutex.
......@@ -380,19 +385,6 @@ fil_space_get_latch(
return(&(space->latch));
}
/** Note that the tablespace has been imported.
Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
written while the space ID is being updated in each page. */
void fil_space_t::set_imported()
{
ut_ad(purpose == FIL_TYPE_IMPORT);
const fil_node_t* node = UT_LIST_GET_FIRST(chain);
atomic_write_supported = node->atomic_write
&& srv_use_atomic_writes
&& my_test_if_atomic_write(node->handle, physical_size());
purpose = FIL_TYPE_TABLESPACE;
}
/**********************************************************************//**
Checks if all the file nodes in a space are flushed.
@return true if all are flushed */
......@@ -505,108 +497,6 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
return node;
}
/** Read the first page of a data file and validate it against the
in-memory tablespace metadata.
The caller must hold fil_system.mutex (asserted below), and the
tablespace must not be a redo log (purpose != FIL_TYPE_LOG).
@param[in] first whether this is the very first read; if so, the
tablespace flags and the sizes of this file and of the tablespace
are updated as well
@return whether the page was found valid */
bool fil_node_t::read_page0(bool first)
{
ut_ad(mutex_own(&fil_system.mutex));
ut_a(space->purpose != FIL_TYPE_LOG);
const ulint psize = space->physical_size();
os_offset_t size_bytes = os_file_get_size(handle);
ut_a(size_bytes != (os_offset_t) -1);
/* The file must contain at least FIL_IBD_FILE_INITIAL_SIZE pages. */
const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
if (size_bytes < min_size) {
ib::error() << "The size of the file " << name
<< " is only " << size_bytes
<< " bytes, should be at least " << min_size;
return false;
}
/* Allocate two pages so that a psize-aligned page fits in the buffer. */
byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize));
/* Align the memory for file i/o if we might have O_DIRECT set */
byte* page = static_cast<byte*>(ut_align(buf2, psize));
IORequest request(IORequest::READ);
if (!os_file_read(request, handle, page, 0, psize)) {
ib::error() << "Unable to read first page of file " << name;
ut_free(buf2);
return false;
}
/* Copy the tablespace header fields out of the page, so that the
buffer can be freed before they are used further below. */
const ulint space_id = fsp_header_get_space_id(page);
ulint flags = fsp_header_get_flags(page);
const ulint size = fsp_header_get_field(page, FSP_SIZE);
const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ page);
if (!fil_space_t::is_valid_flags(flags, space->id)) {
/* The flags are not valid as such; try converting them
with fsp_flags_convert_from_101(). */
ulint cflags = fsp_flags_convert_from_101(flags);
if (cflags == ULINT_UNDEFINED) {
/* Also reached by goto when the converted flags do not match. */
invalid:
ib::error()
<< "Expected tablespace flags "
<< ib::hex(space->flags)
<< " but found " << ib::hex(flags)
<< " in the file " << name;
ut_free(buf2);
return false;
}
/* Compare the converted flags with the expected ones, ignoring
the FSP_FLAGS_MEM_MASK bits; accept them if is_flags_equal()
holds in either argument order. */
ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
if (!fil_space_t::is_flags_equal(cf, sf)
&& !fil_space_t::is_flags_equal(sf, cf)) {
goto invalid;
}
flags = cflags;
}
ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
/* Try to read crypt_data from page 0 if it is not yet read. */
if (!space->crypt_data) {
space->crypt_data = fil_space_read_crypt_data(
fil_space_t::zip_size(flags), page);
}
/* Nothing below reads the page buffer any more. */
ut_free(buf2);
if (UNIV_UNLIKELY(space_id != space->id)) {
ib::error() << "Expected tablespace id " << space->id
<< " but found " << space_id
<< " in the file " << name;
return false;
}
/* Values that were already set must agree with the page contents. */
ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
ut_ad(space->free_len == 0 || space->free_len == free_len);
space->size_in_header = size;
space->free_limit = free_limit;
space->free_len = free_len;
if (first) {
/* Truncate the size to a multiple of extent size. */
ulint mask = psize * FSP_EXTENT_SIZE - 1;
if (size_bytes <= mask) {
/* .ibd files start smaller than an
extent size. Do not truncate valid data. */
} else {
size_bytes &= ~os_offset_t(mask);
}
/* Keep the FSP_FLAGS_MEM_MASK bits of the in-memory flags and
take all other bits from the page. */
space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
/* Account the size of this file, in pages, in the tablespace. */
this->size = ulint(size_bytes / psize);
space->size += this->size;
}
return true;
}
/** Open a file node of a tablespace.
@param[in,out] node File node
@return false if the file can't be opened, otherwise true */
......@@ -682,28 +572,6 @@ static bool fil_node_open_file(fil_node_t* node)
OS_FILE_AIO, OS_DATA_FILE, read_only_mode, &success);
}
if (space->purpose != FIL_TYPE_LOG) {
/*
For the temporary tablespace and during the
non-redo-logged adjustments in
IMPORT TABLESPACE, we do not care about
the atomicity of writes.
Atomic writes is supported if the file can be used
with atomic_writes (not log file), O_DIRECT is
used (tested in ha_innodb.cc) and the file is
device and file system that supports atomic writes
for the given block size
*/
space->atomic_write_supported
= space->purpose == FIL_TYPE_TEMPORARY
|| space->purpose == FIL_TYPE_IMPORT
|| (node->atomic_write
&& srv_use_atomic_writes
&& my_test_if_atomic_write(
node->handle, space->physical_size()));
}
ut_a(success);
ut_a(node->is_open());
......@@ -967,12 +835,6 @@ fil_space_extend_must_retry(
ulint last_page_no = space->size;
const ulint file_start_page_no = last_page_no - node->size;
/* Determine correct file block size */
if (node->block_size == 0) {
node->block_size = os_file_get_block_size(
node->handle, node->name);
}
const ulint page_size = space->physical_size();
/* fil_read_first_page() expects srv_page_size bytes.
......@@ -1435,8 +1297,8 @@ fil_space_create(
to do */
if (purpose == FIL_TYPE_TABLESPACE
&& !srv_fil_crypt_rotate_key_age && fil_crypt_threads_event &&
(mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF ||
srv_encrypt_tables)) {
(mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
|| srv_encrypt_tables)) {
/* Key rotation is not enabled, need to inform background
encryption threads. */
UT_LIST_ADD_LAST(fil_system.rotation_list, space);
......@@ -1707,6 +1569,66 @@ void fil_system_t::create(ulint hash_size)
spaces = hash_create(hash_size);
fil_space_crypt_init();
#ifdef UNIV_LINUX
ssd.clear();
char fn[sizeof(dirent::d_name)
+ sizeof "/sys/block/" "/queue/rotational"];
const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block";
memcpy(fn, "/sys/block/", sizeof "/sys/block");
char* fnp = &fn[sizeof "/sys/block"];
std::set<std::string> ssd_devices;
if (DIR* d = opendir("/sys/block")) {
while (struct dirent* e = readdir(d)) {
if (e->d_name[0] == '.') {
continue;
}
snprintf(fnp, sizeof_fnp, "%s/queue/rotational",
e->d_name);
int f = open(fn, O_RDONLY);
if (f == -1) {
continue;
}
char b[sizeof "4294967295:4294967295\n"];
ssize_t l = read(f, b, sizeof b);
::close(f);
if (l != 2 || memcmp("0\n", b, 2)) {
continue;
}
snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name);
f = open(fn, O_RDONLY);
if (f == -1) {
continue;
}
l = read(f, b, sizeof b);
::close(f);
if (l <= 0 || b[l - 1] != '\n') {
continue;
}
b[l - 1] = '\0';
char* end = b;
unsigned long dev_major = strtoul(b, &end, 10);
if (b == end || *end != ':'
|| dev_major != unsigned(dev_major)) {
continue;
}
char* c = end + 1;
unsigned long dev_minor = strtoul(c, &end, 10);
if (c == end || *end
|| dev_minor != unsigned(dev_minor)) {
continue;
}
ssd.push_back(makedev(unsigned(dev_major),
unsigned(dev_minor)));
}
closedir(d);
}
/* fil_system_t::is_ssd() assumes the following */
ut_ad(makedev(0, 8) == 8);
ut_ad(makedev(0, 4) == 4);
ut_ad(makedev(0, 2) == 2);
ut_ad(makedev(0, 1) == 1);
#endif
}
void fil_system_t::close()
......@@ -2969,6 +2891,9 @@ fil_rename_tablespace(
return(success);
}
/* FIXME: remove this! */
IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh));
/** Create a tablespace file.
@param[in] space_id Tablespace ID
@param[in] name Tablespace name in dbname/tablename format.
......@@ -3044,6 +2969,7 @@ fil_ibd_create(
}
const bool is_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
bool punch_hole = is_compressed;
#ifdef _WIN32
if (is_compressed) {
......@@ -3061,9 +2987,8 @@ fil_ibd_create(
return NULL;
}
bool punch_hole = os_is_sparse_file_supported(file);
ulint block_size = os_file_get_block_size(file, path);
/* FIXME: remove this */
IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file));
/* We have to write the space id to the file immediately and flush the
file to disk. This is because in crash recovery we must be aware what
......@@ -3149,19 +3074,19 @@ fil_ibd_create(
free(crypt_data);
*err = DB_ERROR;
} else {
fil_node_t* file = space->add(path, OS_FILE_CLOSED, size,
space->punch_hole = punch_hole;
/* FIXME: Keep the file open! */
fil_node_t* node = space->add(path, OS_FILE_CLOSED, size,
false, true);
mtr_t mtr;
mtr.start();
fil_op_write_log(
MLOG_FILE_CREATE2, space_id, 0, file->name,
MLOG_FILE_CREATE2, space_id, 0, node->name,
NULL, space->flags & ~FSP_FLAGS_MEM_MASK, &mtr);
fil_name_write(space, 0, file, &mtr);
fil_name_write(space, 0, node, &mtr);
mtr.commit();
file->block_size = block_size;
space->punch_hole = punch_hole;
node->find_metadata(file);
*err = DB_SUCCESS;
}
......@@ -4154,6 +4079,15 @@ fil_report_invalid_page_access(
: "");
}
/** Set the pointer to file node for IO.
If the tablespace of the node has punch_hole unset, the punch-hole
request of this IORequest is cleared via clear_punch_hole().
@param[in]	node	File node; NULL is accepted (as it was by the
			previous in-class implementation) and merely
			resets m_fil_node */
inline void IORequest::set_fil_node(fil_node_t* node)
{
	/* should_punch_hole() and punch_hole() treat m_fil_node as
	nullable, so do not dereference a NULL node here either. */
	if (node != NULL && !node->space->punch_hole) {
		clear_punch_hole();
	}

	m_fil_node = node;
}
/** Reads or writes data. This operation could be asynchronous (aio).
@param[in,out] type IO context
......@@ -5215,29 +5149,6 @@ fil_space_found_by_id(
return space;
}
/** Tell whether hole punching should be attempted for a file.
@param[in]	node	File node
@return the punch_hole setting of the tablespace that owns the node */
bool fil_node_should_punch_hole(const fil_node_t* node)
{
	const bool	wanted = node->space->punch_hole;
	return wanted;
}
/** Change the punch_hole setting of the tablespace that owns a file.
@param[in,out]	node	File node whose tablespace is updated
@param[in]	val	new value of the punch_hole setting */
void fil_space_set_punch_hole(fil_node_t* node, bool val)
{
	fil_space_t*	owner = node->space;
	owner->punch_hole = val;
}
/** Checks that this tablespace in a list of unflushed tablespaces.
@return true if in a list */
bool fil_space_t::is_in_unflushed_spaces() const {
......
......@@ -8697,7 +8697,7 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] =
#define TABLESPACES_SCRUBBING_COMPRESSED 2
{STRUCT_FLD(field_name, "COMPRESSED"),
STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
STRUCT_FLD(field_length, 1),
STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
STRUCT_FLD(value, 0),
STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
......@@ -8749,9 +8749,9 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] =
STRUCT_FLD(old_name, ""),
STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9
{STRUCT_FLD(field_name, "ROTATING_OR_FLUSHING"),
STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
#define TABLESPACES_SCRUBBING_ON_SSD 8
{STRUCT_FLD(field_name, "ON_SSD"),
STRUCT_FLD(field_length, 1),
STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
STRUCT_FLD(value, 0),
STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
......@@ -8829,6 +8829,8 @@ i_s_dict_fill_tablespaces_scrubbing(
}
}
OK(fields[TABLESPACES_SCRUBBING_ON_SSD]->store(!space->is_rotational(),
true));
OK(schema_table_store_record(thd, table_to_fill));
DBUG_RETURN(0);
......
......@@ -33,6 +33,9 @@ Created 10/25/1995 Heikki Tuuri
#include "log0recv.h"
#include "dict0types.h"
#ifdef UNIV_LINUX
# include <set>
#endif
// Forward declaration
extern my_bool srv_use_doublewrite_buf;
......@@ -234,7 +237,10 @@ struct fil_space_t {
/** Note that the tablespace has been imported.
Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
written while the space ID is being updated in each page. */
void set_imported();
inline void set_imported();
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
/** Open each file. Only invoked on fil_system.temp_space.
@return whether all files were opened */
......@@ -537,6 +543,8 @@ struct fil_node_t {
pfs_os_file_t handle;
/** whether the file actually is a raw device or disk partition */
bool is_raw_disk;
/** whether the file is on non-rotational media (SSD) */
bool on_ssd;
/** size of the file in database pages (0 if not known yet);
the possible last incomplete megabyte may be ignored
if space->id == 0 */
......@@ -579,6 +587,14 @@ struct fil_node_t {
@return whether the page was found valid */
bool read_page0(bool first);
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED */
void find_metadata(os_file_t file = OS_FILE_CLOSED
#ifdef UNIV_LINUX
, struct stat* statbuf = NULL
#endif
);
/** Close the file handle. */
void close();
};
......@@ -586,6 +602,24 @@ struct fil_node_t {
/** Value of fil_node_t::magic_n */
#define FIL_NODE_MAGIC_N 89389
inline void fil_space_t::set_imported()
{
ut_ad(purpose == FIL_TYPE_IMPORT);
purpose = FIL_TYPE_TABLESPACE;
UT_LIST_GET_FIRST(chain)->find_metadata();
}
inline bool fil_space_t::is_rotational() const
{
for (const fil_node_t* node = UT_LIST_GET_FIRST(chain);
node != NULL; node = UT_LIST_GET_NEXT(chain, node)) {
if (!node->on_ssd) {
return true;
}
}
return false;
}
/** Common InnoDB file extensions */
enum ib_extention {
NO_EXT = 0,
......@@ -853,6 +887,22 @@ struct fil_system_t {
private:
bool m_initialised;
#ifdef UNIV_LINUX
/** available block devices that reside on non-rotational storage */
std::vector<dev_t> ssd;
public:
/** Check whether a device number refers to one of the detected
non-rotational block devices, or to a partition of one.
@param dev	device number, such as st_dev of a data file
@return whether a file system device is on non-rotational storage */
bool is_ssd(dev_t dev) const
{
	/* Linux seems to allow up to 15 partitions per block device.
	If the detected ssd carries "partition number 0" (it is the
	whole device), compare the candidate file system number
	without the partition number.

	The mask is built in the width of dev_t. A 32-bit mask such as
	~15U would be zero-extended and would also clear the upper
	bits of a wider dev_t. */
	const dev_t whole_device = dev & ~dev_t(15);

	for (const auto s : ssd)
		if (dev == s || whole_device == s)
			return true;
	return false;
}
#endif
public:
ib_mutex_t mutex; /*!< The mutex protecting the cache */
fil_space_t* sys_space; /*!< The innodb_system tablespace */
......
/***********************************************************************
Copyright (c) 2017, MariaDB Corporation.
Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
......@@ -54,22 +54,4 @@ buf_page_get_trim_length(
ulint write_length)
MY_ATTRIBUTE((warn_unused_result));
/**
Get should we punch hole to tablespace.
@param[in] node File node
@return true, if punch hole should be tried, false if not. */
bool
fil_node_should_punch_hole(
const fil_node_t* node)
MY_ATTRIBUTE((warn_unused_result));
/**
Set punch hole to tablespace to given value.
@param[in] node File node
@param[in] val value to be set. */
void
fil_space_set_punch_hole(
fil_node_t* node,
bool val);
#endif /* OS_API_H */
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2013, 2017, MariaDB Corporation.
Copyright (c) 2013, 2019, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
......@@ -360,17 +360,8 @@ class IORequest {
/** Set the pointer to file node for IO
@param[in] node File node */
void set_fil_node(fil_node_t* node)
{
	/* Hole punching stays requested when there is no node or when
	the node allows it; otherwise the request is cleared. */
	const bool keep_punch_hole = !node
		|| fil_node_should_punch_hole(node);

	if (!keep_punch_hole) {
		clear_punch_hole();
	}

	m_fil_node = node;
}
inline void set_fil_node(fil_node_t* node);
/** Compare two requests
@return true if they are equal */
bool operator==(const IORequest& rhs) const
{
return(m_type == rhs.m_type);
......@@ -414,17 +405,7 @@ class IORequest {
: 0);
}
/** @return whether a file node is attached and hole punching should
be attempted for it */
bool should_punch_hole() const {
	if (!m_fil_node) {
		return false;
	}

	return fil_node_should_punch_hole(m_fil_node);
}
/** Disable hole punching via fil_space_set_punch_hole() for the
attached file node, if there is one. */
void space_no_punch_hole() const {
	if (!m_fil_node) {
		return;
	}

	fil_space_set_punch_hole(m_fil_node, false);
}
inline bool should_punch_hole() const;
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
......@@ -1591,19 +1572,6 @@ os_file_change_size_win32(
#endif /*_WIN32 */
/** Check if the file system supports sparse files.
Warning: On POSIX systems we try and punch a hole from offset 0 to
the system configured page size. This should only be called on an empty
file.
@param[in] fh File handle for the file - if opened
@return true if the file system supports sparse files */
bool
os_is_sparse_file_supported(
os_file_t fh)
MY_ATTRIBUTE((warn_unused_result));
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
......@@ -1643,16 +1611,6 @@ is_absolute_path(
return(false);
}
/***********************************************************************//**
Try to get number of bytes per sector from file system.
@return file block size */
UNIV_INTERN
ulint
os_file_get_block_size(
/*===================*/
os_file_t file, /*!< in: handle to a file */
const char* name); /*!< in: file name */
#include "os0file.ic"
#endif /* os0file_h */
......@@ -38,14 +38,14 @@ Created 10/21/1995 Heikki Tuuri
#include "sql_const.h"
#ifdef UNIV_LINUX
#include <sys/types.h>
#include <sys/stat.h>
# include <sys/types.h>
# include <sys/stat.h>
#endif
#include "srv0srv.h"
#include "srv0start.h"
#include "fil0fil.h"
#include "srv0srv.h"
#include "fsp0fsp.h"
#ifdef HAVE_LINUX_UNISTD_H
#include "unistd.h"
#endif
......@@ -70,14 +70,6 @@ Created 10/21/1995 Heikki Tuuri
# endif
#endif
#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
#include <sys/statvfs.h>
#endif
#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H)
#include <linux/falloc.h>
#endif
#ifdef _WIN32
#include <winioctl.h>
#endif
......@@ -821,108 +813,6 @@ os_win32_device_io_control(
#endif
/***********************************************************************//**
Try to get number of bytes per sector from file system.
On Linux the value is st_blksize as reported by fstat(); on Windows it
is BytesPerPhysicalSector of the volume that contains the file. If the
value cannot be determined, 512 is returned; the result is always
limited to the range 512 to 4096.
@return file block size */
UNIV_INTERN
ulint
os_file_get_block_size(
/*===================*/
os_file_t file, /*!< in: handle to a file */
const char* name) /*!< in: file name */
{
/* Default that is returned when nothing better is found. */
ulint fblock_size = 512;
#if defined(UNIV_LINUX)
struct stat local_stat;
int err;
err = fstat((int)file, &local_stat);
if (err != 0) {
/* Report the failure and keep the default. */
os_file_handle_error_no_exit(name, "fstat()", FALSE);
} else {
fblock_size = local_stat.st_blksize;
}
#endif /* UNIV_LINUX */
#ifdef _WIN32
/* 0 means "unknown" here; it is raised to 512 by the range check
at the end of the function. */
fblock_size = 0;
BOOL result = false;
size_t len = 0;
// Open volume for this file, find out it "physical bytes per sector"
HANDLE volume_handle = INVALID_HANDLE_VALUE;
char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names.
if (!GetVolumePathName(name , volume + 4, MAX_PATH)) {
os_file_handle_error_no_exit(name,
"GetVolumePathName()", FALSE);
goto end;
}
len = strlen(volume);
if (volume[len - 1] == '\\') {
// Trim trailing backslash from volume name.
volume[len - 1] = 0;
}
volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
0, OPEN_EXISTING, 0, 0);
if (volume_handle == INVALID_HANDLE_VALUE) {
/* ERROR_ACCESS_DENIED is not reported. */
if (GetLastError() != ERROR_ACCESS_DENIED) {
os_file_handle_error_no_exit(volume,
"CreateFile()", FALSE);
}
goto end;
}
DWORD tmp;
STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment;
STORAGE_PROPERTY_QUERY storage_query;
memset(&storage_query, 0, sizeof(storage_query));
storage_query.PropertyId = StorageAccessAlignmentProperty;
storage_query.QueryType = PropertyStandardQuery;
result = os_win32_device_io_control(volume_handle,
IOCTL_STORAGE_QUERY_PROPERTY,
&storage_query,
sizeof(storage_query),
&disk_alignment,
sizeof(disk_alignment),
&tmp);
if (!result) {
/* ERROR_INVALID_FUNCTION and ERROR_NOT_SUPPORTED are not
reported; any other error is. */
DWORD err = GetLastError();
if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) {
os_file_handle_error_no_exit(volume,
"DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE);
}
goto end;
}
fblock_size = disk_alignment.BytesPerPhysicalSector;
/* Common exit of the Windows branch: close the volume if it was opened. */
end:
if (volume_handle != INVALID_HANDLE_VALUE) {
CloseHandle(volume_handle);
}
#endif /* _WIN32 */
/* Currently we support file block size up to 4Kb */
if (fblock_size > 4096 || fblock_size < 512) {
if (fblock_size < 512) {
fblock_size = 512;
} else {
fblock_size = 4096;
}
}
return fblock_size;
}
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait the
......@@ -5255,6 +5145,34 @@ os_file_set_nocache(
#endif /* _WIN32 */
/** Check if the file system supports sparse files.
On Windows, this inspects the attributes of the file. On other
systems, this punches a hole of srv_page_size bytes at offset 0 of the
file and reports whether that succeeded.
@param fh	file handle
@return true if the file system supports sparse files */
IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
{
	/* In this debugging mode, we act as if punch hole is supported,
	then we skip any calls to actually punch a hole. In this way,
	Transparent Page Compression is still being tested. */
	DBUG_EXECUTE_IF("ignore_punch_hole",
		return(true);
	);

#ifdef _WIN32
	FILE_ATTRIBUTE_TAG_INFO	tag_info;

	if (!GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
					  &tag_info,
					  (DWORD)sizeof(tag_info))) {
		return false;
	}

	if (tag_info.FileAttributes == INVALID_FILE_ATTRIBUTES) {
		return false;
	}

	return (tag_info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
#else
	/* We don't know the FS block size, use the sector size. The FS
	will do the magic. */
	const dberr_t	punch_result = os_file_punch_hole_posix(
		fh, 0, srv_page_size);

	return punch_result == DB_SUCCESS;
#endif /* _WIN32 */
}
/** Extend a file.
On Windows, extending a file allocates blocks for the file,
......@@ -5482,15 +5400,16 @@ os_file_punch_hole(
os_offset_t off,
os_offset_t len)
{
dberr_t err;
#ifdef _WIN32
err = os_file_punch_hole_win32(fh, off, len);
return os_file_punch_hole_win32(fh, off, len);
#else
err = os_file_punch_hole_posix(fh, off, len);
return os_file_punch_hole_posix(fh, off, len);
#endif /* _WIN32 */
}
return (err);
/** @return whether a file node is attached and its tablespace has
punch_hole set */
inline bool IORequest::should_punch_hole() const
{
	if (!m_fil_node) {
		return false;
	}

	return m_fil_node->space->punch_hole;
}
/** Free storage space associated with a section of the file.
......@@ -5530,7 +5449,9 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
/* If punch hole is not supported,
set space so that it is not used. */
if (err == DB_IO_NO_PUNCH_HOLE) {
space_no_punch_hole();
if (m_fil_node) {
m_fil_node->space->punch_hole = false;
}
err = DB_SUCCESS;
}
}
......@@ -5538,43 +5459,6 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
return (err);
}
/** Check if the file system supports sparse files.
Warning: On POSIX systems this punches a hole from offset 0 to the
system configured page size, so it should only be called on an empty
file.
@param[in]	fh	File handle for the file - if opened
@return true if the file system supports sparse files */
bool
os_is_sparse_file_supported(os_file_t fh)
{
	/* In this debugging mode, we act as if punch hole is supported,
	then we skip any calls to actually punch a hole. In this way,
	Transparent Page Compression is still being tested. */
	DBUG_EXECUTE_IF("ignore_punch_hole",
		return(true);
	);

#ifdef _WIN32
	FILE_ATTRIBUTE_TAG_INFO	tag_info;

	const bool	have_attributes = GetFileInformationByHandleEx(
		fh, FileAttributeTagInfo,
		&tag_info, (DWORD)sizeof(tag_info))
		&& tag_info.FileAttributes != INVALID_FILE_ATTRIBUTES;

	if (!have_attributes) {
		return false;
	}

	return (tag_info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
#else
	/* We don't know the FS block size, use the sector size. The FS
	will do the magic. */
	return(DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size));
#endif /* _WIN32 */
}
/** This function returns information about the specified file
@param[in] path pathname of the file
@param[out] stat_info information of a file in a directory
......@@ -7604,6 +7488,279 @@ os_file_set_umask(ulint umask)
os_innodb_umask = umask;
}
/** Determine some file metadata when creating or reading the file.
This sets on_ssd and space->atomic_write_supported. On Windows it also
sets block_size from the physical sector size of the volume and
re-checks space->punch_hole against the file.
@param file	the file that is being created, or OS_FILE_CLOSED
		to use the open handle of this node
@param statbuf	(Linux only) the result of fstat() on the file,
		or NULL to let this function call fstat() itself */
void fil_node_t::find_metadata(os_file_t file
#ifdef UNIV_LINUX
			       , struct stat* statbuf
#endif
			       )
{
	if (file == OS_FILE_CLOSED) {
		file = handle;
		ut_ad(is_open());
	}

#ifdef _WIN32 /* FIXME: make this unconditional */
	if (space->punch_hole) {
		space->punch_hole = os_is_sparse_file_supported(file);
	}
#endif

	/*
	For the temporary tablespace and during the
	non-redo-logged adjustments in
	IMPORT TABLESPACE, we do not care about
	the atomicity of writes.

	Atomic writes is supported if the file can be used
	with atomic_writes (not log file), O_DIRECT is
	used (tested in ha_innodb.cc) and the file is
	device and file system that supports atomic writes
	for the given block size.
	*/
	space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
		|| space->purpose == FIL_TYPE_IMPORT;
#ifdef _WIN32
	/* Defaults that remain in effect if any query below fails. */
	block_size = 512;
	on_ssd = false;
	// Open volume for this file, find out its "physical bytes per sector"
	char volume[MAX_PATH + 4];
	if (!GetVolumePathName(name, volume + 4, MAX_PATH)) {
		os_file_handle_error_no_exit(name,
			"GetVolumePathName()", FALSE);
		return;
	}
	// Special prefix required for volume names.
	memcpy(volume, "\\\\.\\", 4);

	size_t len = strlen(volume);
	if (volume[len - 1] == '\\') {
		// Trim trailing backslash from volume name.
		volume[len - 1] = 0;
	}

	HANDLE volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES,
		FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
		0, OPEN_EXISTING, 0, 0);

	if (volume_handle != INVALID_HANDLE_VALUE) {
		DWORD tmp;
		/* The two queries are issued one after the other, so
		their results can share the same storage. */
		union {
			STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment;
			DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
		} result;
		STORAGE_PROPERTY_QUERY storage_query;
		memset(&storage_query, 0, sizeof(storage_query));
		storage_query.PropertyId = StorageAccessAlignmentProperty;
		storage_query.QueryType = PropertyStandardQuery;

		if (!os_win32_device_io_control(volume_handle,
			IOCTL_STORAGE_QUERY_PROPERTY,
			&storage_query,
			sizeof storage_query,
			&result.disk_alignment,
			sizeof result.disk_alignment,
			&tmp)
		    || tmp < sizeof result.disk_alignment) {
			switch (GetLastError()) {
			case ERROR_INVALID_FUNCTION:
			case ERROR_NOT_SUPPORTED:
				/* Not reported; keep the defaults. */
				break;
			default:
			ioctl_fail:
				/* Also the target of the goto in the
				seek penalty query below. */
				os_file_handle_error_no_exit(
					volume,
					"DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)",
					FALSE);
			}
			goto end;
		}

		block_size = result.disk_alignment.BytesPerPhysicalSector;

		storage_query.PropertyId = StorageDeviceSeekPenaltyProperty;
		storage_query.QueryType = PropertyStandardQuery;

		if (!os_win32_device_io_control(volume_handle,
			IOCTL_STORAGE_QUERY_PROPERTY,
			&storage_query,
			sizeof storage_query,
			&result.seek_penalty,
			sizeof result.seek_penalty,
			&tmp)
		    || tmp < sizeof result.seek_penalty) {
			switch (GetLastError()) {
			case ERROR_INVALID_FUNCTION:
			case ERROR_NOT_SUPPORTED:
			case ERROR_GEN_FAILURE:
				/* Silently treat the file as not
				residing on SSD. */
				goto end;
			default:
				goto ioctl_fail;
			}
		}

		on_ssd = !result.seek_penalty.IncursSeekPenalty;
end:
		/* The handle is known to be valid in this branch. */
		CloseHandle(volume_handle);
	} else {
		if (GetLastError() != ERROR_ACCESS_DENIED) {
			os_file_handle_error_no_exit(volume,
				"CreateFile()", FALSE);
		}
	}

	/* Currently we support file block size up to 4KiB */
	if (block_size > 4096) {
		block_size = 4096;
	} else if (block_size < 512) {
		block_size = 512;
	}
#else
	on_ssd = space->atomic_write_supported;
# ifdef UNIV_LINUX
	if (!on_ssd) {
		/* Look up the device of the file among the detected
		non-rotational block devices. */
		struct stat sbuf;
		if (!statbuf && !fstat(file, &sbuf)) {
			statbuf = &sbuf;
		}
		if (statbuf && fil_system.is_ssd(statbuf->st_dev)) {
			on_ssd = true;
		}
	}
# endif
#endif
	if (!space->atomic_write_supported) {
		space->atomic_write_supported = atomic_write
			&& srv_use_atomic_writes
#ifdef _WIN32
			/* block_size was queried from the volume above:
			require that a page is one physical sector. */
			&& srv_page_size == block_size
#else
			/* Ask whether the file supports atomic writes
			of our page size, as the code that this function
			replaces did. */
			&& my_test_if_atomic_write(file,
						   space->physical_size())
#endif
			;
	}
}
/** Read the first page of a data file and validate it against the
in-memory tablespace metadata.
The caller must hold fil_system.mutex (asserted below), and the
tablespace must not be a redo log (purpose != FIL_TYPE_LOG).
@param[in] first whether this is the very first read; if so,
find_metadata() is invoked and the tablespace flags and the sizes of
this file and of the tablespace are updated as well
@return whether the page was found valid */
bool fil_node_t::read_page0(bool first)
{
ut_ad(mutex_own(&fil_system.mutex));
ut_a(space->purpose != FIL_TYPE_LOG);
const ulint psize = space->physical_size();
#ifndef _WIN32
/* A single fstat() call provides both the block size and the file
size; on Linux the result is also passed to find_metadata() below. */
struct stat statbuf;
if (fstat(handle, &statbuf)) {
return false;
}
block_size = statbuf.st_blksize;
os_offset_t size_bytes = statbuf.st_size;
#else
os_offset_t size_bytes = os_file_get_size(handle);
ut_a(size_bytes != (os_offset_t) -1);
#endif
/* The file must contain at least FIL_IBD_FILE_INITIAL_SIZE pages. */
const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
if (size_bytes < min_size) {
ib::error() << "The size of the file " << name
<< " is only " << size_bytes
<< " bytes, should be at least " << min_size;
return false;
}
/* Allocate two pages so that a psize-aligned page fits in the buffer. */
byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize));
/* Align the memory for file i/o if we might have O_DIRECT set */
byte* page = static_cast<byte*>(ut_align(buf2, psize));
IORequest request(IORequest::READ);
if (!os_file_read(request, handle, page, 0, psize)) {
ib::error() << "Unable to read first page of file " << name;
ut_free(buf2);
return false;
}
/* Copy the tablespace header fields out of the page, so that the
buffer can be freed before they are used further below. */
const ulint space_id = fsp_header_get_space_id(page);
ulint flags = fsp_header_get_flags(page);
const ulint size = fsp_header_get_field(page, FSP_SIZE);
const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ page);
if (!fil_space_t::is_valid_flags(flags, space->id)) {
/* The flags are not valid as such; try converting them
with fsp_flags_convert_from_101(). */
ulint cflags = fsp_flags_convert_from_101(flags);
if (cflags == ULINT_UNDEFINED) {
/* Also reached by goto when the converted flags do not match. */
invalid:
ib::error()
<< "Expected tablespace flags "
<< ib::hex(space->flags)
<< " but found " << ib::hex(flags)
<< " in the file " << name;
ut_free(buf2);
return false;
}
/* Compare the converted flags with the expected ones, ignoring
the FSP_FLAGS_MEM_MASK bits; accept them if is_flags_equal()
holds in either argument order. */
ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
if (!fil_space_t::is_flags_equal(cf, sf)
&& !fil_space_t::is_flags_equal(sf, cf)) {
goto invalid;
}
flags = cflags;
}
ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
/* Try to read crypt_data from page 0 if it is not yet read. */
if (!space->crypt_data) {
space->crypt_data = fil_space_read_crypt_data(
fil_space_t::zip_size(flags), page);
}
/* Nothing below reads the page buffer any more. */
ut_free(buf2);
if (UNIV_UNLIKELY(space_id != space->id)) {
ib::error() << "Expected tablespace id " << space->id
<< " but found " << space_id
<< " in the file " << name;
return false;
}
/* Values that were already set must agree with the page contents. */
ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
ut_ad(space->free_len == 0 || space->free_len == free_len);
space->size_in_header = size;
space->free_limit = free_limit;
space->free_len = free_len;
if (first) {
/* Determine the remaining file metadata; on Linux, reuse the
fstat() result that was obtained above. */
#ifdef UNIV_LINUX
find_metadata(handle, &statbuf);
#else
find_metadata();
#endif
/* Truncate the size to a multiple of extent size. */
ulint mask = psize * FSP_EXTENT_SIZE - 1;
if (size_bytes <= mask) {
/* .ibd files start smaller than an
extent size. Do not truncate valid data. */
} else {
size_bytes &= ~os_offset_t(mask);
}
/* Keep the FSP_FLAGS_MEM_MASK bits of the in-memory flags and
take all other bits from the page. */
space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
/* Account the size of this file, in pages, in the tablespace. */
this->size = ulint(size_bytes / psize);
space->size += this->size;
}
return true;
}
#else
#include "univ.i"
#endif /* !UNIV_INNOCHECKSUM */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment