Commit 86362129 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-6120: When slave stops with error, error message should indicate the failing GTID

If replication breaks in GTID mode, it is not trivial to determine the GTID of
the failing event group. This is a problem, as such GTID is needed eg. to
explicitly set @@gtid_slave_pos to skip to after that event group, or to
compare errors on different servers, etc.

Fix by ensuring that relevant slave errors logged to the error log include the
GTID of the event group containing the problem event.
parent 00467e13
include/master-slave.inc
[connection master]
*** Test MDEV-6120, output of current GTID when a replication error is logged to the errorlog ***
CREATE TABLE t1(a INT PRIMARY KEY);
include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=slave_pos;
INSERT INTO t1 VALUES (1);
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (2);
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
SET sql_log_bin=0;
INSERT INTO t1 VALUES (2);
SET sql_log_bin=1;
START SLAVE;
include/wait_for_slave_sql_error.inc [errno=1062]
include/stop_slave.inc
SET GLOBAL gtid_slave_pos= "0-1-100";
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
SET @dbug_save= @@debug_dbug;
SET debug_dbug= '+d,incident_database_resync_on_replace';
REPLACE INTO t1 VALUES (5);
SET debug_dbug= @dbug_save;
include/wait_for_slave_sql_error.inc [errno=1590]
include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=no;
SET sql_slave_skip_counter=1;
include/start_slave.inc
include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=slave_pos;
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
5
DROP TABLE t1;
include/rpl_end.inc
--source include/have_debug.inc
--source include/master-slave.inc
--echo *** Test MDEV-6120, output of current GTID when a replication error is logged to the errorlog ***
--connection master
CREATE TABLE t1(a INT PRIMARY KEY);
--sync_slave_with_master
--connection slave
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=slave_pos;
--connection master
INSERT INTO t1 VALUES (1);
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (2);
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
--save_master_pos
--connection slave
SET sql_log_bin=0;
INSERT INTO t1 VALUES (2);
SET sql_log_bin=1;
START SLAVE;
--let $slave_sql_errno=1062
--source include/wait_for_slave_sql_error.inc
--source include/stop_slave.inc
# Skip the problem event from the master.
SET GLOBAL gtid_slave_pos= "0-1-100";
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t1 ORDER BY a;
--connection master
SET @dbug_save= @@debug_dbug;
SET debug_dbug= '+d,incident_database_resync_on_replace';
REPLACE INTO t1 VALUES (5);
SET debug_dbug= @dbug_save;
--save_master_pos
--connection slave
--let $slave_sql_errno=1590
--source include/wait_for_slave_sql_error.inc
# ToDo no need to switch off GTID once MDEV-4937 is fixed
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=no;
SET sql_slave_skip_counter=1;
--source include/start_slave.inc
--sync_with_master
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=slave_pos;
--source include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
# Check error log for correct messages.
let $log_error_= `SELECT @@GLOBAL.log_error`;
if(!$log_error_)
{
# MySQL Server on windows is started with --console and thus
# does not know the location of its .err log, use default location
let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.2.err;
}
--let SEARCH_FILE=$log_error_
--let SEARCH_PATTERN=Slave SQL: Error 'Duplicate entry .* on query\. .*Query: '.*', Gtid 0-1-100, Internal MariaDB error code:|Slave SQL: Could not execute Write_rows.*table test.t1; Duplicate entry.*, Gtid 0-1-100, Internal MariaDB error
--source include/search_pattern_in_file.inc
--let SEARCH_PATTERN=Slave SQL: The incident LOST_EVENTS occured on the master\. Message: <none>, Internal MariaDB error code: 1590
--source include/search_pattern_in_file.inc
--connection master
DROP TABLE t1;
--source include/rpl_end.inc
This diff is collapsed.
......@@ -133,8 +133,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, rpl_group_info *rgi)
{
DBUG_ASSERT(ptr->m_tabledef_valid);
TABLE *conv_table;
if (!ptr->m_tabledef.compatible_with(thd, const_cast<Relay_log_info*>(rli),
ptr->table, &conv_table))
if (!ptr->m_tabledef.compatible_with(thd, rgi, ptr->table, &conv_table))
{
ev_thd->is_slave_error= 1;
rgi->slave_close_thread_tables(ev_thd);
......@@ -1530,8 +1529,7 @@ int Old_rows_log_event::do_apply_event(rpl_group_info *rgi)
ptr= static_cast<RPL_TABLE_LIST*>(ptr->next_global), i++)
{
TABLE *conv_table;
if (ptr->m_tabledef.compatible_with(thd, const_cast<Relay_log_info*>(rli),
ptr->table, &conv_table))
if (ptr->m_tabledef.compatible_with(thd, rgi, ptr->table, &conv_table))
{
thd->is_slave_error= 1;
rgi->slave_close_thread_tables(thd);
......
......@@ -388,7 +388,7 @@ handle_rpl_parallel_thread(void *arg)
{
DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior_killed");
thd->send_kill_message();
slave_output_error_info(rgi->rli, thd);
slave_output_error_info(rgi, thd);
signal_error_to_sql_driver_thread(thd, rgi, 1);
/*
Even though we were killed, we need to continue waiting for the
......@@ -467,7 +467,7 @@ handle_rpl_parallel_thread(void *arg)
if (res < 0)
{
/* Error. */
slave_output_error_info(rgi->rli, thd);
slave_output_error_info(rgi, thd);
signal_error_to_sql_driver_thread(thd, rgi, 1);
}
else if (!res)
......@@ -514,7 +514,7 @@ handle_rpl_parallel_thread(void *arg)
if (unlikely(err) && !rgi->worker_error)
{
slave_output_error_info(rgi->rli, thd);
slave_output_error_info(rgi, thd);
signal_error_to_sql_driver_thread(thd, rgi, err);
}
if (end_of_group)
......@@ -1018,10 +1018,11 @@ rpl_parallel_thread_pool::release_thread(rpl_parallel_thread *rpt)
if it is still available. Otherwise a new worker thread is allocated.
*/
rpl_parallel_thread *
rpl_parallel_entry::choose_thread(Relay_log_info *rli, bool *did_enter_cond,
rpl_parallel_entry::choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
PSI_stage_info *old_stage, bool reuse)
{
uint32 idx;
Relay_log_info *rli= rgi->rli;
rpl_parallel_thread *thr;
idx= rpl_thread_idx;
......@@ -1066,7 +1067,7 @@ rpl_parallel_entry::choose_thread(Relay_log_info *rli, bool *did_enter_cond,
debug_sync_set_action(rli->sql_driver_thd,
STRING_WITH_LEN("now SIGNAL wait_queue_killed"));
};);
slave_output_error_info(rli, rli->sql_driver_thd);
slave_output_error_info(rgi, rli->sql_driver_thd);
return NULL;
}
else
......@@ -1417,7 +1418,8 @@ rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
instead re-use a thread that we queued for previously.
*/
cur_thread=
e->choose_thread(rli, &did_enter_cond, &old_stage, typ != GTID_EVENT);
e->choose_thread(serial_rgi, &did_enter_cond, &old_stage,
typ != GTID_EVENT);
if (!cur_thread)
{
/* This means we were killed. The error is already signalled. */
......
......@@ -208,7 +208,7 @@ struct rpl_parallel_entry {
/* The group_commit_orderer object for the events currently being queued. */
group_commit_orderer *current_gco;
rpl_parallel_thread * choose_thread(Relay_log_info *rli, bool *did_enter_cond,
rpl_parallel_thread * choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
PSI_stage_info *old_stage, bool reuse);
group_commit_orderer *get_gco();
void free_gco(group_commit_orderer *gco);
......
......@@ -28,6 +28,7 @@ Slave_reporting_capability::Slave_reporting_capability(char const *thread_name)
void
Slave_reporting_capability::report(loglevel level, int err_code,
const char *extra_info,
const char *msg, ...) const
{
void (*report_function)(const char *, ...);
......@@ -67,9 +68,10 @@ Slave_reporting_capability::report(loglevel level, int err_code,
va_end(args);
/* If the msg string ends with '.', do not add a ',' it would be ugly */
report_function("Slave %s: %s%s Internal MariaDB error code: %d",
report_function("Slave %s: %s%s %s%sInternal MariaDB error code: %d",
m_thread_name, pbuff,
(pbuff[0] && *(strend(pbuff)-1) == '.') ? "" : ",",
(extra_info ? extra_info : ""), (extra_info ? ", " : ""),
err_code);
}
......
......@@ -52,8 +52,9 @@ class Slave_reporting_capability
code, but can contain more information), in
printf() format.
*/
void report(loglevel level, int err_code, const char *msg, ...) const
ATTRIBUTE_FORMAT(printf, 4, 5);
void report(loglevel level, int err_code, const char *extra_info,
const char *msg, ...) const
ATTRIBUTE_FORMAT(printf, 5, 6);
/**
Clear errors. They will not show up under <code>SHOW SLAVE
......
......@@ -1303,7 +1303,7 @@ void Relay_log_info::stmt_done(my_off_t event_master_log_pos,
inc_group_relay_log_pos(event_master_log_pos, rgi);
if (rpl_global_gtid_slave_state.record_and_update_gtid(thd, rgi))
{
report(WARNING_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
report(WARNING_LEVEL, ER_CANNOT_UPDATE_GTID_STATE, rgi->gtid_info(),
"Failed to update GTID state in %s.%s, slave state may become "
"inconsistent: %d: %s",
"mysql", rpl_gtid_slave_state_table_name.str,
......@@ -1805,6 +1805,26 @@ rpl_group_info::mark_start_commit()
}
/*
Format the current GTID as a string suitable for printing in error messages.
The string is stored in a buffer inside rpl_group_info, so remains valid
until next call to gtid_info() or until destruction of rpl_group_info.
If no GTID is available, then NULL is returned.
*/
char *
rpl_group_info::gtid_info()
{
if (!gtid_sub_id || !current_gtid.seq_no)
return NULL;
my_snprintf(gtid_info_buf, sizeof(gtid_info_buf), "Gtid %u-%u-%llu",
current_gtid.domain_id, current_gtid.server_id,
current_gtid.seq_no);
return gtid_info_buf;
}
rpl_sql_thread_info::rpl_sql_thread_info(Rpl_filter *filter)
: rpl_filter(filter)
{
......
......@@ -593,6 +593,8 @@ struct rpl_group_info
*/
time_t row_stmt_start_timestamp;
bool long_find_row_note_printed;
/* Needs room for "Gtid D-S-N\x00". */
char gtid_info_buf[5+10+1+10+1+20+1];
rpl_group_info(Relay_log_info *rli_);
~rpl_group_info();
......@@ -681,6 +683,7 @@ struct rpl_group_info
void slave_close_thread_tables(THD *);
void mark_start_commit_no_lock();
void mark_start_commit();
char *gtid_info();
time_t get_row_stmt_start_timestamp()
{
......
......@@ -826,7 +826,7 @@ can_convert_field_to(Field *field,
@retval false Master table is not compatible with slave table.
*/
bool
table_def::compatible_with(THD *thd, Relay_log_info *rli,
table_def::compatible_with(THD *thd, rpl_group_info *rgi,
TABLE *table, TABLE **conv_table_var)
const
{
......@@ -834,6 +834,7 @@ table_def::compatible_with(THD *thd, Relay_log_info *rli,
We only check the initial columns for the tables.
*/
uint const cols_to_check= MY_MIN(table->s->fields, size());
Relay_log_info *rli= rgi->rli;
TABLE *tmp_table= NULL;
for (uint col= 0 ; col < cols_to_check ; ++col)
......@@ -857,7 +858,7 @@ table_def::compatible_with(THD *thd, Relay_log_info *rli,
This will create the full table with all fields. This is
necessary to ge the correct field lengths for the record.
*/
tmp_table= create_conversion_table(thd, rli, table);
tmp_table= create_conversion_table(thd, rgi, table);
if (tmp_table == NULL)
return false;
/*
......@@ -885,7 +886,7 @@ table_def::compatible_with(THD *thd, Relay_log_info *rli,
String target_type(target_buf, sizeof(target_buf), &my_charset_latin1);
show_sql_type(type(col), field_metadata(col), &source_type, field->charset());
field->sql_type(target_type);
rli->report(ERROR_LEVEL, ER_SLAVE_CONVERSION_FAILED,
rli->report(ERROR_LEVEL, ER_SLAVE_CONVERSION_FAILED, rgi->gtid_info(),
ER(ER_SLAVE_CONVERSION_FAILED),
col, db_name, tbl_name,
source_type.c_ptr_safe(), target_type.c_ptr_safe());
......@@ -927,12 +928,14 @@ table_def::compatible_with(THD *thd, Relay_log_info *rli,
conversion table.
*/
TABLE *table_def::create_conversion_table(THD *thd, Relay_log_info *rli, TABLE *target_table) const
TABLE *table_def::create_conversion_table(THD *thd, rpl_group_info *rgi,
TABLE *target_table) const
{
DBUG_ENTER("table_def::create_conversion_table");
List<Create_field> field_list;
TABLE *conv_table= NULL;
Relay_log_info *rli= rgi->rli;
/*
At slave, columns may differ. So we should create
MY_MIN(columns@master, columns@slave) columns in the
......@@ -1014,7 +1017,7 @@ TABLE *table_def::create_conversion_table(THD *thd, Relay_log_info *rli, TABLE *
err:
if (conv_table == NULL)
rli->report(ERROR_LEVEL, ER_SLAVE_CANT_CREATE_CONVERSION,
rli->report(ERROR_LEVEL, ER_SLAVE_CANT_CREATE_CONVERSION, rgi->gtid_info(),
ER(ER_SLAVE_CANT_CREATE_CONVERSION),
target_table->s->db.str,
target_table->s->table_name.str);
......
......@@ -30,6 +30,7 @@
class Relay_log_info;
class Log_event;
struct rpl_group_info;
/**
A table definition from the master.
......@@ -187,7 +188,7 @@ class table_def
@retval 0 if the table definition is compatible with @c table
*/
#ifndef MYSQL_CLIENT
bool compatible_with(THD *thd, Relay_log_info *rli, TABLE *table,
bool compatible_with(THD *thd, rpl_group_info *rgi, TABLE *table,
TABLE **conv_table_var) const;
/**
......@@ -212,7 +213,8 @@ class table_def
@return A pointer to a temporary table with memory allocated in the
thread's memroot, NULL if the table could not be created
*/
TABLE *create_conversion_table(THD *thd, Relay_log_info *rli, TABLE *target_table) const;
TABLE *create_conversion_table(THD *thd, rpl_group_info *rgi,
TABLE *target_table) const;
#endif
......
This diff is collapsed.
......@@ -234,7 +234,7 @@ int apply_event_and_update_pos(Log_event* ev, THD* thd,
rpl_parallel_thread *rpt);
pthread_handler_t handle_slave_io(void *arg);
void slave_output_error_info(Relay_log_info *rli, THD *thd);
void slave_output_error_info(rpl_group_info *rgi, THD *thd);
pthread_handler_t handle_slave_sql(void *arg);
bool net_request_file(NET* net, const char* fname);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment