Commit 9c2ff270 authored by unknown's avatar unknown

WL#3072 Maria Recovery

* recovery from ha_maria now skips replaying DDLs (too dangerous)
* maria_read_log still replays DDLs, print warning about issues
* fixes to replaying of REDO_RENAME
* don't replay DDLs on corrupted tables (safer)
* print a one-line message when really doing a recovery (applies to
ha_maria, not maria_read_log) i.e. some REDOs or UNDOs are read.


storage/maria/ma_checkpoint.c:
  fix for assertion failure
storage/maria/ma_recovery.c:
  * Recovery from ha_maria now skips replaying DDLs (as the initial
  plan said) as this is unsafe in case of crashes during the DDL;
  applying the records may do harm (destroy important files)
  so we prefer to leave the "mess" of files untouched. A proper
  recovery of DDLs requires very careful thinking, probably testing
  separately the existence of the data and index file instead of
  using maria_open() which tests the existence of both, and maybe
  storing create_rename_lsn in the data file too.
  * maria_read_log still replays DDLs, we print a warning about dangers
  (due to ALTER TABLE not logging insertions into the tmp table; we
  will maybe need an option to have logging of those insertions).
  * fixes to replaying of REDO_RENAME (test create_rename_lsn of 'new_name'
  table if it exists; if that table exists and is more recent than the
  record, remove the 'old_name' table).
  * don't replay DDLs on corrupted tables (play safe)
  * fail also in non-debug builds if table is open when it should not be
  (when creating it for example, it should not be already open).
  * when the trace file is not stdout (i.e. when this is ha_maria),
  if really doing a recovery (reading REDOs or UNDOs), print a one-line
  message to stderr to inform about start and end of recovery
  (useful to know what mysqld is doing, especially if it takes long
  or crashes).
storage/maria/ma_recovery.h:
  parameter to replay DDLs or not
storage/maria/maria_read_log.c:
  replay DDLs in maria_read_log, to be able to recreate tables from
  scratch.
parent 8829fa64
......@@ -123,7 +123,13 @@ int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
int result= 0;
DBUG_ENTER("ma_checkpoint_execute");
DBUG_ASSERT(checkpoint_inited);
if (!checkpoint_inited)
{
/*
If ha_maria failed to start, maria_panic_hton is called, we come here.
*/
DBUG_RETURN(0);
}
DBUG_ASSERT(level > CHECKPOINT_NONE);
/* look for already running checkpoints */
......
......@@ -49,6 +49,7 @@ static LSN current_group_end_lsn,
checkpoint_start= LSN_IMPOSSIBLE;
static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
static FILE *tracef; /**< trace file for debugging */
static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
#define prototype_redo_exec_hook(R) \
static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
......@@ -117,7 +118,23 @@ static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
MYF(MY_WME | MY_ALLOW_ZERO_PTR));
}
}
static my_bool recovery_message_printed;
static inline void print_recovery_message()
{
/*
If we're really doing a recovery (reading REDOs or UNDOs), we print a
one-line message when we start it and when we end it. It goes to stderr,
not tracef, so that it is visible in the error log (soon we should maybe
use sql_print_error). We don't print if if tracef is stdout as stdout will
be seen by the user and thus convey sufficient info already.
*/
if (!recovery_message_printed && (tracef != stdout))
{
recovery_message_printed= TRUE;
/** @todo RECOVERY BUG all prints to stderr should go to error log */
fprintf(stderr, "Maria engine: starting recovery\n");
}
}
#define ALERT_USER() DBUG_ASSERT(0)
......@@ -147,7 +164,7 @@ int maria_recover()
{
fprintf(trace_file, "TRACE of the last MARIA recovery from mysqld\n");
DBUG_ASSERT(maria_pagecache->inited);
res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE);
res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE, TRUE);
if (!res)
fprintf(trace_file, "SUCCESS\n");
fclose(trace_file);
......@@ -164,6 +181,8 @@ int maria_recover()
LSN_IMPOSSIBLE means "use last checkpoint"
@param apply if log records should be applied or not
@param trace_file trace file where progress/debug messages will go
@param skip_DDLs Should DDL records (CREATE/RENAME/DROP/REPAIR)
be skipped by the REDO phase or not
@todo This trace_file thing is primitive; soon we will make it similar to
ma_check_print_warning() etc, and a successful recovery does not need to
......@@ -175,7 +194,7 @@ int maria_recover()
*/
int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
my_bool should_run_undo_phase)
my_bool should_run_undo_phase, my_bool skip_DDLs_arg)
{
int error= 0;
uint unfinished_trans;
......@@ -192,7 +211,33 @@ int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
if (!all_active_trans || !all_tables)
goto err;
recovery_message_printed= FALSE;
tracef= trace_file;
if (!(skip_DDLs= skip_DDLs_arg))
{
/*
Example of what can go wrong when replaying DDLs:
CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged);
ALTER TABLE t ... which does
CREATE a temporary table #sql... (logged)
INSERT data from t into #sql... (not logged)
RENAME #sql TO t (logged)
Removing tables by hand and replaying the log will leave in the
end an empty table "t": missing records. If after the RENAME an INSERT
into t was done, that row had number 1 in its page, executing the
REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion
failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is
created whereas rownr is not 0).
Another issue is that replaying of DDLs is not correct enough to work if
there was a crash during a DDL (see comment in execution of
REDO_RENAME_TABLE ).
*/
fprintf(tracef, "WARNING: MySQL server currently disables log records"
" about insertion of data by ALTER TABLE"
" (copy_data_between_tables()), applying of log records may"
" well not work. Additionally, applying of DDL records will"
" cause damage if there are tables left by a crash of a DDL.\n");
}
if (from_lsn == LSN_IMPOSSIBLE)
{
......@@ -245,6 +290,8 @@ int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
goto err;
/* If inside ha_maria, a checkpoint will soon be taken and save our work */
if (recovery_message_printed && (tracef != stdout))
fprintf(stderr, "Maria engine: finished recovery\n");
goto end;
err:
error= 1;
......@@ -365,6 +412,11 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
uint flags;
int error= 1, create_mode= O_RDWR | O_TRUNC;
MARIA_HA *info= NULL;
if (skip_DDLs)
{
fprintf(tracef, "we skip DDLs\n");
return 0;
}
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
......@@ -382,7 +434,12 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
{
MARIA_SHARE *share= info->s;
/* check that we're not already using it */
DBUG_ASSERT(share->reopen == 1);
if (share->reopen != 1)
{
fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
ALERT_USER();
goto end;
}
DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
if (!share->base.born_transactional)
{
......@@ -391,7 +448,7 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
one was renamed to its name, thus create_rename_lsn is 0 and should
not be trusted.
*/
fprintf(tracef, ", is not transactional\n");
fprintf(tracef, ", is not transactional, ignoring creation\n");
ALERT_USER();
error= 0;
goto end;
......@@ -406,13 +463,16 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
}
if (maria_is_crashed(info))
{
fprintf(tracef, ", is crashed, overwriting it");
fprintf(tracef, ", is crashed, can't recreate it");
ALERT_USER();
goto end;
}
maria_close(info);
info= NULL;
}
/* if does not exist, is older, or its header is corrupted, overwrite it */
else /* one or two files absent, or header corrupted... */
fprintf(tracef, "can't be opened, probably does not exist");
/* if does not exist, or is older, overwrite it */
/** @todo symlinks */
ptr= name + strlen(name) + 1;
if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0))
......@@ -490,6 +550,11 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
char *old_name, *new_name;
int error= 1;
MARIA_HA *info= NULL;
if (skip_DDLs)
{
fprintf(tracef, "we skip DDLs\n");
return 0;
}
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
......@@ -501,7 +566,36 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
}
old_name= log_record_buffer.str;
new_name= old_name + strlen(old_name) + 1;
fprintf(tracef, "Table '%s' to rename to '%s'", old_name, new_name);
fprintf(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
new_name);
/*
Here is why we skip CREATE/DROP/RENAME when doing a recovery from
ha_maria (whereas we do when called from maria_read_log). Consider:
CREATE TABLE t;
RENAME TABLE t to u;
DROP TABLE u;
RENAME TABLE v to u; # crash between index rename and data rename.
And do a Recovery (not removing tables beforehand).
Recovery replays CREATE, then RENAME: the maria_open("t") works,
maria_open("u") does not (no data file) so table "u" is considered
inexistent and so maria_rename() is done which overwrites u's index file,
which is lost. Ok, the data file (v.MAD) is still available, but only a
REPAIR USE_FRM can rebuild the index, which is unsafe and downtime.
So it is preferrable to not execute RENAME, and leave the "mess" of files,
rather than possibly destroy a file. DBA will manually rename files.
A safe recovery method would probably require checking the existence of
the index file and of the data file separately (not via maria_open()), and
maybe also to store a create_rename_lsn in the data file too
For now, all we risk is to leave the mess (half-renamed files) left by the
crash. We however sync files and directories at each file rename. The SQL
layer is anyway not crash-safe for DDLs (except the repartioning-related
ones).
We replay DDLs in maria_read_log to be able to recreate tables from
scratch. It means that "maria_read_log -a" should not be used on a
database which just crashed during a DDL. And also ALTER TABLE does not
log insertions of records into the temporary table, so replaying may
fail (see comment and warning in maria_apply_log()).
*/
info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
if (info)
{
......@@ -512,7 +606,7 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
*/
if (!share->base.born_transactional)
{
fprintf(tracef, ", is not transactional\n");
fprintf(tracef, ", is not transactional, ignoring renaming\n");
ALERT_USER();
error= 0;
goto end;
......@@ -540,7 +634,76 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
maria_close(info))
goto end;
info= NULL;
fprintf(tracef, ", is ok for renaming; new-name table ");
}
else /* one or two files absent, or header corrupted... */
{
fprintf(tracef, ", can't be opened, probably does not exist");
error= 0;
goto end;
}
/*
We must also check the create_rename_lsn of the 'new_name' table if it
exists: otherwise we may, with our rename which overwrites, destroy
another table. For example:
CREATE TABLE t;
RENAME t to u;
DROP TABLE u;
RENAME v to u; # v is an old table, its creation/insertions not in log
And start executing the log (without removing tables beforehand): creates
t, renames it to u (if not testing create_rename_lsn) thus overwriting
old-named v, drops u, and we are stuck, we have lost data.
*/
info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
if (info)
{
MARIA_SHARE *share= info->s;
/* We should not have open instances on this table. */
if (share->reopen != 1)
{
fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
ALERT_USER();
goto end;
}
if (!share->base.born_transactional)
{
fprintf(tracef, ", is not transactional, ignoring renaming\n");
ALERT_USER();
goto drop;
}
if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
{
fprintf(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
" record, ignoring renaming",
LSN_IN_PARTS(share->state.create_rename_lsn));
/*
We have to drop the old_name table. Consider:
CREATE TABLE t;
CREATE TABLE v;
RENAME TABLE t to u;
DROP TABLE u;
RENAME TABLE v to u;
and apply the log without removing tables beforehand. t will be
created, v too; in REDO_RENAME u will be more recent, but we still
have to drop t otherwise it stays.
*/
goto drop;
}
if (maria_is_crashed(info))
{
fprintf(tracef, ", is crashed, can't rename it");
ALERT_USER();
goto end;
}
if (maria_close(info))
goto end;
info= NULL;
/* abnormal situation */
fprintf(tracef, ", exists but is older than record, can't rename it");
goto end;
}
else /* one or two files absent, or header corrupted... */
fprintf(tracef, ", can't be opened, probably does not exist");
fprintf(tracef, ", renaming '%s'", old_name);
if (maria_rename(old_name, new_name))
{
......@@ -559,6 +722,16 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
goto end;
info= NULL;
error= 0;
goto end;
drop:
fprintf(tracef, ", only dropping '%s'", old_name);
if (maria_delete_table(old_name))
{
fprintf(tracef, "Failed to drop table\n");
goto end;
}
error= 0;
goto end;
end:
fprintf(tracef, "\n");
if (info != NULL)
......@@ -573,8 +746,17 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
prototype_redo_exec_hook(REDO_REPAIR_TABLE)
{
int error= 1;
MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
if (info == NULL)
MARIA_HA *info;
if (skip_DDLs)
{
/*
REPAIR is not exactly a DDL, but it manipulates files without logging
insertions into them.
*/
fprintf(tracef, "we skip DDLs\n");
return 0;
}
if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL)
return 0;
/*
Otherwise, the mapping is newer than the table, and our record is newer
......@@ -610,6 +792,11 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
char *name;
int error= 1;
MARIA_HA *info= NULL;
if (skip_DDLs)
{
fprintf(tracef, "we skip DDLs\n");
return 0;
}
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
......@@ -631,7 +818,7 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
*/
if (!share->base.born_transactional)
{
fprintf(tracef, ", is not transactional\n");
fprintf(tracef, ", is not transactional, ignoring removal\n");
ALERT_USER();
error= 0;
goto end;
......@@ -646,8 +833,9 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
}
if (maria_is_crashed(info))
{
fprintf(tracef, ", is crashed, dropping it");
fprintf(tracef, ", is crashed, can't drop it");
ALERT_USER();
goto end;
}
/*
This maria_extra() call serves to signal that old open instances of
......@@ -658,14 +846,16 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
maria_close(info))
goto end;
info= NULL;
/* if it is older, or its header is corrupted, drop it */
fprintf(tracef, ", dropping '%s'", name);
if (maria_delete_table(name))
{
fprintf(tracef, "Failed to drop table\n");
goto end;
}
}
/* if does not exist, is older, or its header is corrupted, drop it */
fprintf(tracef, ", dropping '%s'", name);
if (maria_delete_table(name))
{
fprintf(tracef, "Failed to drop table\n");
goto end;
}
else /* one or two files absent, or header corrupted... */
fprintf(tracef,", can't be opened, probably does not exist");
error= 0;
end:
fprintf(tracef, "\n");
......@@ -753,7 +943,12 @@ static int new_table(uint16 sid, const char *name,
}
MARIA_SHARE *share= info->s;
/* check that we're not already using it */
DBUG_ASSERT(share->reopen == 1);
if (share->reopen != 1)
{
fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
ALERT_USER();
goto end;
}
DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
if (!share->base.born_transactional)
{
......@@ -1294,7 +1489,6 @@ static int run_redo_phase(LSN lsn, my_bool apply)
uint16 sid= rec.short_trid;
const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type];
display_record_position(log_desc, &rec, i);
/*
A complete group is a set of log records with an "end mark" record
(e.g. a set of REDOs for an operation, terminated by an UNDO for this
......@@ -1572,6 +1766,7 @@ static MARIA_HA *get_MARIA_HA_from_REDO_record(const
MARIA_HA *info;
char llbuf[22];
print_recovery_message();
sid= fileid_korr(rec->header);
page= page_korr(rec->header + FILEID_STORE_SIZE);
/**
......@@ -1643,6 +1838,7 @@ static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
uint16 sid;
MARIA_HA *info;
print_recovery_message();
sid= fileid_korr(rec->header + LSN_STORE_SIZE);
fprintf(tracef, " For table of short id %u", sid);
info= all_tables[sid].info;
......
......@@ -26,5 +26,5 @@
C_MODE_START
int maria_recover();
int maria_apply_log(LSN lsn, my_bool apply, FILE *trace_file,
my_bool execute_undo_phase);
my_bool execute_undo_phase, my_bool skip_DDLs);
C_MODE_END
......@@ -101,7 +101,7 @@ int main(int argc, char **argv)
fprintf(stdout, "TRACE of the last maria_read_log\n");
if (maria_apply_log(lsn, opt_display_and_apply, stdout,
opt_display_and_apply))
opt_display_and_apply, FALSE))
goto err;
fprintf(stdout, "%s: SUCCESS\n", my_progname);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment