Commit cf2a4426 authored by Marko Mäkelä

MDEV-14717 RENAME TABLE in InnoDB is not crash-safe

This is a backport of commit 0bc36758
and commit 9eb3fcc9.

InnoDB in MariaDB 10.2 appears to only write MLOG_FILE_RENAME2
redo log records during table-rebuilding ALGORITHM=INPLACE operations.
We must write the records for any .ibd file renames, so that the
operations are crash-safe.

If InnoDB is killed during a RENAME TABLE operation, it can happen that
the transaction for updating the data dictionary will be rolled back.
But, nothing will roll back the renaming of the .ibd file
(the MLOG_FILE_RENAME2 only guarantees roll-forward), or for that matter,
the renaming of the dict_table_t::name in the dict_sys cache. We introduce
the undo log record TRX_UNDO_RENAME_TABLE to fix this.

fil_space_for_table_exists_in_mem(): Remove the parameters
adjust_space, table_id and some code that was trying to work around
these deficiencies.

fil_name_write_rename(): Write a MLOG_FILE_RENAME2 record.

dict_table_rename_in_cache(): Invoke fil_name_write_rename().

trx_undo_rec_copy(): Set the first 2 bytes to the length of the
copied undo log record.

trx_undo_page_report_rename(), trx_undo_report_rename():
Write a TRX_UNDO_RENAME_TABLE record with the old table name.

row_rename_table_for_mysql(): Invoke trx_undo_report_rename()
before modifying any data dictionary tables.

row_undo_ins_parse_undo_rec(): Roll back TRX_UNDO_RENAME_TABLE
by invoking dict_table_rename_in_cache(), which will take care
of both renaming the table and the file.

ha_innobase::truncate(): Remove a work-around.
parent e67b1070
CREATE TABLE t1 (a INT UNSIGNED PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES(42);
connect con1,localhost,root,,test;
SET DEBUG_SYNC='before_rename_table_commit SIGNAL renamed WAIT_FOR ever';
RENAME TABLE t1 TO t2;
connection default;
SET DEBUG_SYNC='now WAIT_FOR renamed';
disconnect con1;
SELECT * FROM t1;
a
42
DROP TABLE t1;
# Test for MDEV-14717: a RENAME TABLE that has not been committed when the
# server goes down must leave the table available under its old name.
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/not_embedded.inc
CREATE TABLE t1 (a INT UNSIGNED PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES(42);
--connect (con1,localhost,root,,test)
# Make the RENAME in con1 stop at the before_rename_table_commit sync point:
# it signals 'renamed' and then waits for 'ever', which this test never sends.
SET DEBUG_SYNC='before_rename_table_commit SIGNAL renamed WAIT_FOR ever';
# Issue the RENAME asynchronously so that the default connection can go on.
--send
RENAME TABLE t1 TO t2;
--connection default
# Wait until the RENAME has reached the sync point.
SET DEBUG_SYNC='now WAIT_FOR renamed';
# Restart the server while the RENAME is still blocked.
# NOTE(review): presumably a shutdown timeout of 0 makes restart_mysqld.inc
# kill the server instead of shutting it down cleanly; confirm in the include.
--let $shutdown_timeout=0
--source include/restart_mysqld.inc
--disconnect con1
# The table must still be accessible as t1 and contain the inserted row.
SELECT * FROM t1;
DROP TABLE t1;
......@@ -1692,6 +1692,8 @@ dict_table_rename_in_cache(
return(err);
}
fil_name_write_rename(table->space, old_path, new_path);
bool success = fil_rename_tablespace(
table->space, old_path, new_name, new_path);
......
......@@ -1448,7 +1448,7 @@ dict_check_sys_tables(
look to see if it is already in the tablespace cache. */
if (fil_space_for_table_exists_in_mem(
space_id, table_name.m_name,
false, true, NULL, 0, flags)) {
false, NULL, flags)) {
/* Recovery can open a datafile that does not
match SYS_DATAFILES. If they don't match, update
SYS_DATAFILES. */
......@@ -2857,8 +2857,7 @@ dict_load_tablespace(
/* The tablespace may already be open. */
if (fil_space_for_table_exists_in_mem(
table->space, space_name, false,
true, heap, table->id, table->flags)) {
table->space, space_name, false, heap, table->flags)) {
return;
}
......
......@@ -2313,7 +2313,7 @@ fil_op_write_log(
@param[in,out] mtr mini-transaction */
static
void
fil_name_write_rename(
fil_name_write_rename_low(
ulint space_id,
ulint first_page_no,
const char* old_name,
......@@ -2327,6 +2327,23 @@ fil_name_write_rename(
space_id, first_page_no, old_name, new_name, 0, mtr);
}
/** Log the renaming of a tablespace file in its own mini-transaction.
The rename record is written for the first page (page number 0) of the
tablespace, and the log is written out up to the commit LSN of the
mini-transaction before returning.
@param[in]	space_id	tablespace id
@param[in]	old_name	tablespace file name before renaming
@param[in]	new_name	tablespace file name after renaming */
void
fil_name_write_rename(
	ulint		space_id,
	const char*	old_name,
	const char*	new_name)
{
	mtr_t	rename_mtr;

	rename_mtr.start();
	fil_name_write_rename_low(
		space_id, 0, old_name, new_name, &rename_mtr);
	rename_mtr.commit();

	/* The second argument is passed as true, exactly as before;
	the call is made only after the mini-transaction has been
	committed, using its commit LSN. */
	log_write_up_to(rename_mtr.commit_lsn(), true);
}
/** Write MLOG_FILE_NAME for a file.
@param[in] space_id tablespace id
@param[in] first_page_no first page number in the file
......@@ -3394,12 +3411,7 @@ fil_rename_tablespace(
ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL);
if (!recv_recovery_on) {
mtr_t mtr;
mtr.start();
fil_name_write_rename(
id, 0, old_file_name, new_file_name, &mtr);
mtr.commit();
fil_name_write_rename(id, old_file_name, new_file_name);
log_mutex_enter();
}
......@@ -4457,9 +4469,7 @@ startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] print_error_if_does_not_exist
Print detailed error information to the
error log if a matching tablespace is not found from memory.
@param[in] adjust_space Whether to adjust space id on mismatch
@param[in] heap Heap memory
@param[in] table_id table id
@param[in] table_flags table flags
@return true if a matching tablespace exists in the memory cache */
bool
......@@ -4467,9 +4477,7 @@ fil_space_for_table_exists_in_mem(
ulint id,
const char* name,
bool print_error_if_does_not_exist,
bool adjust_space,
mem_heap_t* heap,
table_id_t table_id,
ulint table_flags)
{
fil_space_t* fnamespace;
......@@ -4494,41 +4502,6 @@ fil_space_for_table_exists_in_mem(
} else if (!valid || space == fnamespace) {
/* Found with the same file name, or got a flag mismatch. */
goto func_exit;
} else if (adjust_space
&& row_is_mysql_tmp_table_name(space->name)
&& !row_is_mysql_tmp_table_name(name)) {
/* Info from fnamespace comes from the ibd file
itself, it can be different from data obtained from
System tables since renaming files is not
transactional. We shall adjust the ibd file name
according to system table info. */
mutex_exit(&fil_system->mutex);
DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space",
DBUG_SUICIDE(););
const char* tmp_name = dict_mem_create_temporary_tablename(
heap, name, table_id);
fil_rename_tablespace(
fnamespace->id,
UT_LIST_GET_FIRST(fnamespace->chain)->name,
tmp_name, NULL);
DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space",
DBUG_SUICIDE(););
fil_rename_tablespace(
id, UT_LIST_GET_FIRST(space->chain)->name,
name, NULL);
DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space",
DBUG_SUICIDE(););
mutex_enter(&fil_system->mutex);
fnamespace = fil_space_get_by_name(name);
ut_ad(space == fnamespace);
goto func_exit;
}
if (!print_error_if_does_not_exist) {
......@@ -5576,7 +5549,7 @@ fil_mtr_rename_log(
return(err);
}
fil_name_write_rename(
fil_name_write_rename_low(
old_table->space, 0, old_path, tmp_path, mtr);
ut_free(tmp_path);
......@@ -5607,7 +5580,7 @@ fil_mtr_rename_log(
}
}
fil_name_write_rename(
fil_name_write_rename_low(
new_table->space, 0, new_path, old_path, mtr);
ut_free(new_path);
......
......@@ -13353,6 +13353,7 @@ innobase_rename_table(
DEBUG_SYNC_C("innodb_rename_table_ready");
trx_start_if_not_started(trx, true);
ut_ad(trx->will_lock > 0);
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations. */
......@@ -13504,13 +13505,6 @@ int ha_innobase::truncate()
}
if (err) {
/* Before MDEV-14717, rollback of RENAME TABLE fails
to undo the rename in the file system, so we do it
manually here. In case the server is killed before the
TRUNCATE operation is committed, after recovery in
MariaDB 10.2, the data file could end up "missing"
(remain called temp_name). */
innobase_rename_table(trx, temp_name, name);
trx_rollback_to_savepoint(trx, NULL);
}
......
......@@ -411,7 +411,7 @@ dict_table_rename_in_cache(
/*!< in: in ALTER TABLE we want
to preserve the original table name
in constraints which reference it */
MY_ATTRIBUTE((nonnull, warn_unused_result));
MY_ATTRIBUTE((nonnull));
/** Removes an index from the dictionary cache.
@param[in,out] table table whose index to remove
......
......@@ -869,6 +869,15 @@ fil_create_directory_for_tablename(
/*===============================*/
const char* name); /*!< in: name in the standard
'databasename/tablename' format */
/** Write redo log for renaming a file.
@param[in] space_id tablespace id
@param[in] old_name tablespace file name
@param[in] new_name tablespace file name after renaming */
void
fil_name_write_rename(
ulint space_id,
const char* old_name,
const char* new_name);
/********************************************************//**
Recreates table indexes by applying
TRUNCATE log record during recovery.
......@@ -1128,27 +1137,24 @@ fil_file_readdir_next_file(
os_file_dir_t dir, /*!< in: directory stream */
os_file_stat_t* info); /*!< in/out: buffer where the
info is returned */
/*******************************************************************//**
Returns true if a matching tablespace exists in the InnoDB tablespace memory
cache. Note that if we have not done a crash recovery at the database startup,
there may be many tablespaces which are not yet in the memory cache.
/** Determine if a matching tablespace exists in the InnoDB tablespace
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
@param[in] name Tablespace name used in fil_space_create().
@param[in] print_error_if_does_not_exist
Print detailed error information to the
error log if a matching tablespace is not found from memory.
@param[in] heap Heap memory
@param[in] table_flags table flags
@return true if a matching tablespace exists in the memory cache */
bool
fil_space_for_table_exists_in_mem(
/*==============================*/
ulint id, /*!< in: space id */
const char* name, /*!< in: table name in the standard
'databasename/tablename' format */
ulint id,
const char* name,
bool print_error_if_does_not_exist,
/*!< in: print detailed error
information to the .err log if a
matching tablespace is not found from
memory */
bool adjust_space, /*!< in: whether to adjust space id
when find table space mismatch */
mem_heap_t* heap, /*!< in: heap memory */
table_id_t table_id, /*!< in: table id */
ulint table_flags); /*!< in: table flags */
mem_heap_t* heap,
ulint table_flags);
/** Try to extend a tablespace if it is smaller than the specified size.
@param[in,out] space tablespace
......
......@@ -179,6 +179,13 @@ trx_undo_rec_get_partial_row(
mem_heap_t* heap) /*!< in: memory heap from which the memory
needed is allocated */
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Report a RENAME TABLE operation.
@param[in,out] trx transaction
@param[in] table table that is being renamed
@return DB_SUCCESS or error code */
dberr_t
trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/***********************************************************************//**
Writes information to an undo log about an insert, update, or a delete marking
of a clustered index record. This information is used in a rollback of the
......@@ -322,6 +329,7 @@ trx_undo_read_v_idx(
compilation info multiplied by 16 is ORed to this value in an undo log
record */
#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */
#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
record */
......
......@@ -95,5 +95,8 @@ trx_undo_rec_copy(
len = mach_read_from_2(undo_rec)
- ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
ut_ad(len < UNIV_PAGE_SIZE);
return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len));
trx_undo_rec_t* rec = static_cast<trx_undo_rec_t*>(
mem_heap_dup(heap, undo_rec, len));
mach_write_to_2(rec, len);
return rec;
}
......@@ -3299,7 +3299,7 @@ row_drop_single_table_tablespace(
/* If the tablespace is not in the cache, just delete the file. */
if (!fil_space_for_table_exists_in_mem(
space_id, tablename, true, false, NULL, 0, table_flags)) {
space_id, tablename, true, NULL, table_flags)) {
/* Force a delete of any discarded or temporary files. */
fil_delete_file(filepath);
......@@ -4391,6 +4391,14 @@ row_rename_table_for_mysql(
goto funct_exit;
}
if (!table->is_temporary()) {
err = trx_undo_report_rename(trx, table);
if (err != DB_SUCCESS) {
goto funct_exit;
}
}
/* We use the private SQL parser of Innobase to generate the query
graphs needed in updating the dictionary data from system tables. */
......@@ -4576,7 +4584,8 @@ row_rename_table_for_mysql(
}
}
if ((dict_table_has_fts_index(table)
if (err == DB_SUCCESS
&& (dict_table_has_fts_index(table)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID))
&& !dict_tables_have_same_db(old_name, new_name)) {
err = fts_rename_aux_tables(table, new_name, trx);
......@@ -4734,6 +4743,7 @@ row_rename_table_for_mysql(
}
if (commit) {
DEBUG_SYNC(trx->mysql_thd, "before_rename_table_commit");
trx_commit_for_mysql(trx);
}
......
......@@ -330,16 +330,13 @@ row_undo_ins_parse_undo_rec(
byte* ptr;
undo_no_t undo_no;
table_id_t table_id;
ulint type;
ulint dummy;
bool dummy_extern;
ut_ad(node);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy,
&dummy_extern, &undo_no, &table_id);
ut_ad(type == TRX_UNDO_INSERT_REC);
node->rec_type = type;
node->update = NULL;
node->table = dict_table_open_on_id(
......@@ -350,6 +347,27 @@ row_undo_ins_parse_undo_rec(
return;
}
switch (node->rec_type) {
default:
ut_ad(!"wrong undo record type");
goto close_table;
case TRX_UNDO_INSERT_REC:
break;
case TRX_UNDO_RENAME_TABLE:
dict_table_t* table = node->table;
ut_ad(!table->is_temporary());
ut_ad(dict_table_is_file_per_table(table)
== (table->space != TRX_SYS_SPACE));
size_t len = mach_read_from_2(node->undo_rec)
+ node->undo_rec - ptr - 2;
ptr[len] = 0;
const char* name = reinterpret_cast<char*>(ptr);
if (strcmp(table->name.m_name, name)) {
dict_table_rename_in_cache(table, name, false);
}
goto close_table;
}
if (UNIV_UNLIKELY(!fil_table_accessible(node->table))) {
close_table:
/* Normally, tables should not disappear or become
......
......@@ -1880,6 +1880,119 @@ trx_undo_parse_erase_page_end(
return(ptr);
}
/** Write a TRX_UNDO_RENAME_TABLE record to an undo log page.
The record is laid out as follows, starting at the first free byte
of the page:
  2 bytes: page offset of the end of this record (the next free byte)
  1 byte:  TRX_UNDO_RENAME_TABLE
  trx->undo_no, in much-compressed form
  table->id, in much-compressed form
  the current table name, without a terminating NUL byte
  2 bytes: page offset of the start of this record
@param[in,out]	trx	transaction
@param[in]	table	table that is being renamed
@param[in,out]	block	undo page
@param[in,out]	mtr	mini-transaction
@return byte offset of the undo log record
@retval 0 in case of failure (the record does not fit in the page) */
static
ulint
trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
			    buf_block_t* block, mtr_t* mtr)
{
	byte*	ptr_first_free = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
		+ block->frame;
	ulint	first_free = mach_read_from_2(ptr_first_free);
	ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
	ut_ad(first_free <= UNIV_PAGE_SIZE);
	byte*	start = block->frame + first_free;
	size_t	len = strlen(table->name.m_name);
	/* Space needed in addition to the name: the leading 2-byte
	offset, the type byte, the two much-compressed 64-bit values
	(11 bytes are reserved for each; presumably their maximum
	encoded size), and the trailing 2-byte offset. */
	const size_t fixed = 2 + 1 + 11 + 11 + 2;
	ut_ad(len <= NAME_LEN * 2 + 1);
	/* The longest name accepted by the assertion above must fit
	in an empty undo page, so that a retry on a fresh page cannot
	fail. Use the same bound (NAME_LEN * 2 + 1) as the assertion.
	The -10 is used in trx_undo_left() */
	compile_time_assert(NAME_LEN * 2 + 1 + fixed
			    + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
			    < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);

	if (trx_undo_left(block->frame, start) < fixed + len) {
		/* Not enough room. This is only expected on a page
		that already contains something. */
		ut_ad(first_free > TRX_UNDO_PAGE_HDR
		      + TRX_UNDO_PAGE_HDR_SIZE);
		return 0;
	}

	/* Skip the leading 2-byte offset; it is filled in below,
	once the end of the record is known. */
	byte*	ptr = start + 2;
	*ptr++ = TRX_UNDO_RENAME_TABLE;
	ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
	ptr += mach_u64_write_much_compressed(ptr, table->id);
	memcpy(ptr, table->name.m_name, len);
	ptr += len;
	/* Trailing pointer back to the start of this record */
	mach_write_to_2(ptr, first_free);
	ptr += 2;

	ulint	offset = page_offset(ptr);
	/* Leading pointer to the end of this record */
	mach_write_to_2(start, offset);
	/* Advance the first-free offset in the undo page header */
	mach_write_to_2(ptr_first_free, offset);

	trx_undof_page_add_undo_rec_log(block->frame, first_free, offset, mtr);
	return first_free;
}
/** Report a RENAME TABLE operation.
Writes a TRX_UNDO_RENAME_TABLE record for the table to the insert undo log
of the transaction in the redo rollback segment, assigning an insert undo
log first if the transaction does not have one yet. If the record does not
fit in the last undo page, a new undo page is added and the write is
retried there. trx->undo_mutex is held for the duration of the call.
@param[in,out]	trx	transaction
@param[in]	table	table that is being renamed
@return DB_SUCCESS or error code
@retval DB_OUT_OF_FILE_SPACE if a new undo page could not be added */
dberr_t
trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
{
	ut_ad(!trx->read_only);
	ut_ad(trx->id);
	ut_ad(!table->is_temporary());

	trx_rseg_t*	rseg = trx->rsegs.m_redo.rseg;
	trx_undo_t**	pundo = &trx->rsegs.m_redo.insert_undo;
	mutex_enter(&trx->undo_mutex);
	/* Reuse the insert undo log if there is one; else assign one. */
	dberr_t		err = *pundo
		? DB_SUCCESS
		: trx_undo_assign_undo(trx, rseg, pundo, TRX_UNDO_INSERT);
	ut_ad((err == DB_SUCCESS) == (*pundo != NULL));

	if (trx_undo_t* undo = *pundo) {
		mtr_t	mtr;
		mtr.start(trx);

		/* Latch the last page of the undo log, using the cached
		block as a hint unless it has become obsolete. */
		buf_block_t*	block = buf_page_get_gen(
			page_id_t(undo->space, undo->last_page_no),
			univ_page_size, RW_X_LATCH,
			buf_pool_is_obsolete(undo->withdraw_clock)
			? NULL : undo->guess_block,
			BUF_GET, __FILE__, __LINE__, &mtr, &err);
		ut_ad((err == DB_SUCCESS) == !!block);

		for (ut_d(int loop_count = 0); block;) {
			/* At most two attempts are expected: one on the
			current last page, and one on a freshly added
			page. The counter is post-incremented so that
			the second attempt does not trip the assertion. */
			ut_ad(loop_count++ < 2);
			buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
			ut_ad(undo->last_page_no == block->page.id.page_no());

			if (ulint offset = trx_undo_page_report_rename(
				    trx, table, block, &mtr)) {
				/* The record was written: make it the
				top record of the undo log and consume
				one undo number of the transaction. */
				undo->withdraw_clock = buf_withdraw_clock;
				undo->empty = FALSE;
				undo->top_page_no = undo->last_page_no;
				undo->top_offset = offset;
				undo->top_undo_no = trx->undo_no++;
				undo->guess_block = block;

				trx->undo_rseg_space = rseg->space;
				err = DB_SUCCESS;
				break;
			} else {
				/* The record did not fit: commit the
				mini-transaction and retry on a new undo
				page in a new mini-transaction. */
				mtr.commit();
				mtr.start(trx);
				block = trx_undo_add_page(trx, undo, &mtr);
				if (!block) {
					err = DB_OUT_OF_FILE_SPACE;
					break;
				}
			}
		}

		mtr.commit();
	}

	mutex_exit(&trx->undo_mutex);
	return err;
}
/***********************************************************************//**
Writes information to an undo log about an insert, update, or a delete marking
of a clustered index record. This information is used in a rollback of the
......
......@@ -1058,11 +1058,17 @@ trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap)
trx_undo_rec_t* undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
const undo_no_t undo_no = trx_undo_rec_get_undo_no(undo_rec);
if (trx_undo_rec_get_type(undo_rec) == TRX_UNDO_INSERT_REC) {
switch (trx_undo_rec_get_type(undo_rec)) {
case TRX_UNDO_RENAME_TABLE:
ut_ad(undo == insert);
/* fall through */
case TRX_UNDO_INSERT_REC:
ut_ad(undo == insert || undo == temp);
*roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
} else {
break;
default:
ut_ad(undo == update || undo == temp);
break;
}
ut_ad(trx_roll_check_undo_rec_ordering(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment