Commit 40bbf697 authored by unknown's avatar unknown

MDEV-532: Async InnoDB commit checkpoint.

Make the commit checkpoint inside InnoDB asynchronous.
Implement a background thread in binlog to do the writing and flushing of
binlog checkpoint events to disk.
parent e97d6232
...@@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003' from <binlog_start>; ...@@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003' from <binlog_start>;
Log_name Pos Event_type Server_id End_log_pos Info Log_name Pos Event_type Server_id End_log_pos Info
master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
master-bin.000003 # Binlog_checkpoint # # master-bin.000001 master-bin.000003 # Binlog_checkpoint # # master-bin.000001
SET DEBUG_SYNC= "RESET";
SET @old_dbug= @@global.DEBUG_DBUG;
SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con2_continue"; SET DEBUG_SYNC= "now SIGNAL con2_continue";
con1 is still pending, no new binlog checkpoint should have been logged. con1 is still pending, no new binlog checkpoint should have been logged.
SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
SET GLOBAL debug_dbug= @old_dbug;
SET DEBUG_SYNC= "RESET";
show binlog events in 'master-bin.000003' from <binlog_start>; show binlog events in 'master-bin.000003' from <binlog_start>;
Log_name Pos Event_type Server_id End_log_pos Info Log_name Pos Event_type Server_id End_log_pos Info
master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
......
...@@ -118,7 +118,11 @@ master-bin.00000<binlog_start> # Table_map # # table_id: # (test.t1) ...@@ -118,7 +118,11 @@ master-bin.00000<binlog_start> # Table_map # # table_id: # (test.t1)
master-bin.00000<binlog_start> # Write_rows # # table_id: # flags: STMT_END_F master-bin.00000<binlog_start> # Write_rows # # table_id: # flags: STMT_END_F
master-bin.00000<binlog_start> # Xid # # COMMIT /* XID */ master-bin.00000<binlog_start> # Xid # # COMMIT /* XID */
SET DEBUG_SYNC= "now SIGNAL con10_cont"; SET DEBUG_SYNC= "now SIGNAL con10_cont";
SET @old_dbug= @@global.DEBUG_DBUG;
SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con12_cont"; SET DEBUG_SYNC= "now SIGNAL con12_cont";
SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
SET GLOBAL debug_dbug= @old_dbug;
SET DEBUG_SYNC= "now SIGNAL con11_cont"; SET DEBUG_SYNC= "now SIGNAL con11_cont";
Checking that master-bin.000004 is the last binlog checkpoint Checking that master-bin.000004 is the last binlog checkpoint
show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>; show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
......
...@@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready"; ...@@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
--let $binlog_file= master-bin.000003 --let $binlog_file= master-bin.000003
--source include/show_binlog_events.inc --source include/show_binlog_events.inc
# We need to sync the test case with the background processing of the
# commit checkpoint, otherwise we get nondeterministic results.
SET DEBUG_SYNC= "RESET";
SET @old_dbug= @@global.DEBUG_DBUG;
SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con2_continue"; SET DEBUG_SYNC= "now SIGNAL con2_continue";
connection con2; connection con2;
...@@ -78,6 +84,12 @@ reap; ...@@ -78,6 +84,12 @@ reap;
connection default; connection default;
--echo con1 is still pending, no new binlog checkpoint should have been logged. --echo con1 is still pending, no new binlog checkpoint should have been logged.
# Make sure commit checkpoint is processed before we check that no checkpoint
# event has been binlogged.
SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
SET GLOBAL debug_dbug= @old_dbug;
SET DEBUG_SYNC= "RESET";
--let $binlog_file= master-bin.000003 --let $binlog_file= master-bin.000003
--source include/show_binlog_events.inc --source include/show_binlog_events.inc
......
...@@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb; ...@@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
# Insert some data to force a couple binlog rotations (3), so we get some # Insert some data to force a couple binlog rotations (3), so we get some
# normal binlog checkpoints before starting the test. # normal binlog checkpoints before starting the test.
INSERT INTO t1 VALUES (100, REPEAT("x", 4100)); INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
# Wait for the master-bin.000002 binlog checkpoint to appear.
--let $wait_for_all= 0
--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000002"
--let $field= Info
--let $condition= = "master-bin.000002"
--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (101, REPEAT("x", 4100)); INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
--let $wait_for_all= 0
--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
--let $field= Info
--let $condition= = "master-bin.000003"
--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (102, REPEAT("x", 4100)); INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
--let $wait_for_all= 0
--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
--let $field= Info
--let $condition= = "master-bin.000004"
--source include/wait_show_condition.inc
# Now start a bunch of transactions that span multiple binlog # Now start a bunch of transactions that span multiple binlog
# files. Leave them in the state prepared-but-not-committed in the engine # files. Leave them in the state prepared-but-not-committed in the engine
...@@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont"; ...@@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont";
connection con10; connection con10;
reap; reap;
connection default; connection default;
# We need to sync the test case with the background processing of the
# commit checkpoint, otherwise we get nondeterministic results.
SET @old_dbug= @@global.DEBUG_DBUG;
SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con12_cont"; SET DEBUG_SYNC= "now SIGNAL con12_cont";
connection con12; connection con12;
reap; reap;
connection default; connection default;
SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
SET GLOBAL debug_dbug= @old_dbug;
SET DEBUG_SYNC= "now SIGNAL con11_cont"; SET DEBUG_SYNC= "now SIGNAL con11_cont";
connection con11; connection con11;
reap; reap;
...@@ -210,7 +235,20 @@ RESET MASTER; ...@@ -210,7 +235,20 @@ RESET MASTER;
# crash recovery fails due to the error insert used for previous test. # crash recovery fails due to the error insert used for previous test.
INSERT INTO t1 VALUES (21, REPEAT("x", 4100)); INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
INSERT INTO t1 VALUES (22, REPEAT("x", 4100)); INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
# Wait for the master-bin.000003 binlog checkpoint to appear.
--let $wait_for_all= 0
--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
--let $field= Info
--let $condition= = "master-bin.000003"
--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (23, REPEAT("x", 4100)); INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
# Wait for the last (master-bin.000004) binlog checkpoint to appear.
--let $wait_for_all= 0
--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
--let $field= Info
--let $condition= = "master-bin.000004"
--source include/wait_show_condition.inc
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect --write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait-binlog_xa_recover.test wait-binlog_xa_recover.test
EOF EOF
......
...@@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lock ...@@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lock
wait/synch/mutex/sql/Master_info::sleep_lock wait/synch/mutex/sql/Master_info::sleep_lock
wait/synch/mutex/sql/MDL_map::mutex wait/synch/mutex/sql/MDL_map::mutex
wait/synch/mutex/sql/MDL_wait::LOCK_wait_status wait/synch/mutex/sql/MDL_wait::LOCK_wait_status
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list
wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index
...@@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_cond ...@@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_cond
wait/synch/cond/sql/Master_info::start_cond wait/synch/cond/sql/Master_info::start_cond
wait/synch/cond/sql/Master_info::stop_cond wait/synch/cond/sql/Master_info::stop_cond
wait/synch/cond/sql/MDL_context::COND_wait_status wait/synch/cond/sql/MDL_context::COND_wait_status
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list
wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond
......
...@@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%" ...@@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%"
and event_name not like "%MYSQL_BIN_LOG::update_cond" and event_name not like "%MYSQL_BIN_LOG::update_cond"
order by event_name; order by event_name;
EVENT_NAME COUNT_STAR EVENT_NAME COUNT_STAR
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY
"Expect no slave relay log" "Expect no slave relay log"
...@@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%" ...@@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%"
and event_name not like "%MYSQL_BIN_LOG::update_cond" and event_name not like "%MYSQL_BIN_LOG::update_cond"
order by event_name; order by event_name;
EVENT_NAME COUNT_STAR EVENT_NAME COUNT_STAR
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread MANY
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY
"Expect a slave relay log" "Expect a slave relay log"
......
...@@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str) ...@@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
DBUG_ENTER("debug_sync_eval_action"); DBUG_ENTER("debug_sync_eval_action");
DBUG_ASSERT(thd); DBUG_ASSERT(thd);
DBUG_ASSERT(action_str); DBUG_ASSERT(action_str);
DBUG_PRINT("debug_sync", ("action_str='%s'", action_str));
/* /*
Get debug sync point name. Or a special command. Get debug sync point name. Or a special command.
......
...@@ -54,6 +54,7 @@ ...@@ -54,6 +54,7 @@
#include "rpl_handler.h" #include "rpl_handler.h"
#include "debug_sync.h" #include "debug_sync.h"
#include "sql_show.h" #include "sql_show.h"
#include "my_pthread.h"
/* max size of the log message */ /* max size of the log message */
#define MAX_LOG_BUFFER_SIZE 1024 #define MAX_LOG_BUFFER_SIZE 1024
...@@ -107,6 +108,17 @@ static SHOW_VAR binlog_status_vars_detail[]= ...@@ -107,6 +108,17 @@ static SHOW_VAR binlog_status_vars_detail[]=
{NullS, NullS, SHOW_LONG} {NullS, NullS, SHOW_LONG}
}; };
/*
File-scope state shared between the binlog background thread and the rest
of MYSQL_BIN_LOG.

binlog_background_thread_stop and binlog_background_thread_queue are read
and written under MYSQL_BIN_LOG::LOCK_binlog_background_thread.

NOTE(review): binlog_background_thread_started is also read and written
without that mutex held (in MYSQL_BIN_LOG::open(), in
start_binlog_background_thread(), and around the locked region in
MYSQL_BIN_LOG::cleanup()) - confirm that those paths cannot run
concurrently.
*/
/* Set once the background thread has been created; cleared by cleanup(). */
static bool binlog_background_thread_started= false;
/*
Set to true by cleanup() to request shutdown; the background thread sets it
back to false as its acknowledgement just before returning.
*/
static bool binlog_background_thread_stop= false;
/*
Singly-linked list (through next_in_queue) of pending checkpoint
notifications. commit_checkpoint_notify() pushes at the head; the background
thread detaches the whole list at once.
*/
static MYSQL_BIN_LOG::xid_count_per_binlog *
binlog_background_thread_queue= NULL;
/* Creates the background thread; returns true (non-zero) on failure. */
static bool start_binlog_background_thread();
/** /**
purge logs, master and slave sides both, related error code purge logs, master and slave sides both, related error code
...@@ -2958,12 +2970,28 @@ void MYSQL_BIN_LOG::cleanup() ...@@ -2958,12 +2970,28 @@ void MYSQL_BIN_LOG::cleanup()
my_free(b); my_free(b);
} }
/* Wait for the binlog background thread to stop. */
if (!is_relay_log && binlog_background_thread_started)
{
mysql_mutex_lock(&LOCK_binlog_background_thread);
binlog_background_thread_stop= true;
mysql_cond_signal(&COND_binlog_background_thread);
while (binlog_background_thread_stop)
mysql_cond_wait(&COND_binlog_background_thread_end,
&LOCK_binlog_background_thread);
mysql_mutex_unlock(&LOCK_binlog_background_thread);
binlog_background_thread_started= false;
}
mysql_mutex_destroy(&LOCK_log); mysql_mutex_destroy(&LOCK_log);
mysql_mutex_destroy(&LOCK_index); mysql_mutex_destroy(&LOCK_index);
mysql_mutex_destroy(&LOCK_xid_list); mysql_mutex_destroy(&LOCK_xid_list);
mysql_mutex_destroy(&LOCK_binlog_background_thread);
mysql_cond_destroy(&update_cond); mysql_cond_destroy(&update_cond);
mysql_cond_destroy(&COND_queue_busy); mysql_cond_destroy(&COND_queue_busy);
mysql_cond_destroy(&COND_xid_list); mysql_cond_destroy(&COND_xid_list);
mysql_cond_destroy(&COND_binlog_background_thread);
mysql_cond_destroy(&COND_binlog_background_thread_end);
} }
DBUG_VOID_RETURN; DBUG_VOID_RETURN;
} }
...@@ -2989,6 +3017,13 @@ void MYSQL_BIN_LOG::init_pthread_objects() ...@@ -2989,6 +3017,13 @@ void MYSQL_BIN_LOG::init_pthread_objects()
mysql_cond_init(m_key_update_cond, &update_cond, 0); mysql_cond_init(m_key_update_cond, &update_cond, 0);
mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0); mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0); mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
mysql_mutex_init(key_BINLOG_LOCK_binlog_background_thread,
&LOCK_binlog_background_thread, MY_MUTEX_INIT_FAST);
mysql_cond_init(key_BINLOG_COND_binlog_background_thread,
&COND_binlog_background_thread, 0);
mysql_cond_init(key_BINLOG_COND_binlog_background_thread_end,
&COND_binlog_background_thread_end, 0);
} }
...@@ -3086,6 +3121,10 @@ bool MYSQL_BIN_LOG::open(const char *log_name, ...@@ -3086,6 +3121,10 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
DBUG_ENTER("MYSQL_BIN_LOG::open"); DBUG_ENTER("MYSQL_BIN_LOG::open");
DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg)); DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
if (!is_relay_log && !binlog_background_thread_started &&
start_binlog_background_thread())
DBUG_RETURN(1);
if (init_and_set_log_file_name(log_name, new_name, log_type_arg, if (init_and_set_log_file_name(log_name, new_name, log_type_arg,
io_cache_type_arg)) io_cache_type_arg))
{ {
...@@ -5541,11 +5580,7 @@ bool general_log_write(THD *thd, enum enum_server_command command, ...@@ -5541,11 +5580,7 @@ bool general_log_write(THD *thd, enum enum_server_command command,
} }
/* static void
I would like to make this function static, but this causes compiler warnings
when it is declared as friend function in log.h.
*/
void
binlog_checkpoint_callback(void *cookie) binlog_checkpoint_callback(void *cookie)
{ {
MYSQL_BIN_LOG::xid_count_per_binlog *entry= MYSQL_BIN_LOG::xid_count_per_binlog *entry=
...@@ -8135,9 +8170,129 @@ int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) ...@@ -8135,9 +8170,129 @@ int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
void void
TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie) TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
{ {
mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true); xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
mysql_mutex_lock(&LOCK_binlog_background_thread);
entry->next_in_queue= binlog_background_thread_queue;
binlog_background_thread_queue= entry;
mysql_cond_signal(&COND_binlog_background_thread);
mysql_mutex_unlock(&LOCK_binlog_background_thread);
} }
/*
Binlog background thread.
This thread is used to log binlog checkpoints in the background, rather than
in the context of random storage engine threads that happen to call
commit_checkpoint_notify_ha() and may not like the delays while syncing
binlog to disk or may not be setup with all my_thread_init() and other
necessary stuff.
In the future, this thread could also be used to do log rotation in the
background, which could eliminate all stalls around binlog rotations.

Thread entry point. The argument is unused; the function always returns 0.

Operation:
- Set up a THD for this thread (system_thread=
SYSTEM_THREAD_BINLOG_BACKGROUND, fresh thread_id).
- Loop: sleep on COND_binlog_background_thread until either the request
queue is non-empty or a stop has been requested; detach the whole queue;
call mark_xid_done(binlog_id, true) for every detached entry.
- Entries that were queued at the time stop was observed are still
processed before the loop exits.
- On exit, destroy the THD, then reset binlog_background_thread_stop to
false and signal COND_binlog_background_thread_end, which is what
MYSQL_BIN_LOG::cleanup() waits for.

Since commit_checkpoint_notify() pushes new entries at the head of the
queue, each detached batch is walked newest-first.
*/
pthread_handler_t
binlog_background_thread(void *arg __attribute__((unused)))
{
/* Snapshot of binlog_background_thread_stop taken under the mutex. */
bool stop;
/* queue: detached batch being processed; next: saved successor. */
MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
THD *thd;
my_thread_init();
thd= new THD;
thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
thd->thread_stack= (char*) &thd; /* Set approximate stack start */
/* The global thread_id counter is incremented under LOCK_thread_count. */
mysql_mutex_lock(&LOCK_thread_count);
thd->thread_id= thread_id++;
mysql_mutex_unlock(&LOCK_thread_count);
thd->store_globals();
for (;;)
{
/*
Wait until there is something in the queue to process, or we are asked
to shut down.
*/
thd_proc_info(thd, "Waiting for background binlog tasks");
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
/*
Re-check the predicate after every wakeup; both values are sampled while
holding the mutex so that the stop flag and the queue head are consistent.
*/
for (;;)
{
stop= binlog_background_thread_stop;
queue= binlog_background_thread_queue;
if (stop || queue)
break;
mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread,
&mysql_bin_log.LOCK_binlog_background_thread);
}
/* Grab the queue, if any. */
binlog_background_thread_queue= NULL;
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
/*
Process any incoming commit_checkpoint_notify() calls. This runs without
LOCK_binlog_background_thread held, so new notifications can be queued
while the detached batch is being handled.
*/
while (queue)
{
thd_proc_info(thd, "Processing binlog checkpoint notification");
/* Grab next pointer first, as mark_xid_done() may free the element. */
next= queue->next_in_queue;
mysql_bin_log.mark_xid_done(queue->binlog_id, true);
queue= next;
/*
Test hook: when the debug keyword is enabled, emit a debug_sync signal
after each processed entry so test cases can wait for it.
*/
DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
DBUG_ASSERT(!debug_sync_set_action(
thd,
STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
);
}
/* Checked only after the batch above has been drained. */
if (stop)
break;
}
thd_proc_info(thd, "Stopping binlog background thread");
mysql_mutex_lock(&LOCK_thread_count);
delete thd;
mysql_mutex_unlock(&LOCK_thread_count);
my_thread_end();
/*
Signal that we are (almost) stopped. cleanup() loops on
binlog_background_thread_stop under the same mutex, so clearing the flag
and signalling here releases it.
*/
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
binlog_background_thread_stop= false;
mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
return 0;
}
/*
Performance schema instrumentation for the binlog background thread.
The table is registered under the "sql" category by
start_binlog_background_thread(), and key_thread_binlog is the key passed
to mysql_thread_create() there.
*/
#ifdef HAVE_PSI_INTERFACE
static PSI_thread_key key_thread_binlog;
static PSI_thread_info all_binlog_threads[]=
{
{ &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
};
#endif /* HAVE_PSI_INTERFACE */
static bool
start_binlog_background_thread()
{
pthread_t th;
#ifdef HAVE_PSI_INTERFACE
if (PSI_server)
PSI_server->register_thread("sql", all_binlog_threads,
array_elements(all_binlog_threads));
#endif
if (mysql_thread_create(key_thread_binlog, &th, NULL,
binlog_background_thread, NULL))
return 1;
binlog_background_thread_started= true;
return 0;
}
int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
IO_CACHE *first_log, IO_CACHE *first_log,
Format_description_log_event *fdle) Format_description_log_event *fdle)
......
...@@ -395,8 +395,6 @@ private: ...@@ -395,8 +395,6 @@ private:
#define BINLOG_COOKIE_IS_DUMMY(c) \ #define BINLOG_COOKIE_IS_DUMMY(c) \
( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID ) ( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
void binlog_checkpoint_callback(void *cookie);
class binlog_cache_mngr; class binlog_cache_mngr;
class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
{ {
...@@ -450,27 +448,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG ...@@ -450,27 +448,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
ulong binlog_id; ulong binlog_id;
}; };
/*
A list of struct xid_count_per_binlog is used to keep track of how many
XIDs are in prepared, but not committed, state in each binlog. And how
many commit_checkpoint_request()'s are pending.
When count drops to zero in a binlog after rotation, it means that there
are no more XIDs in prepared state, so that binlog is no longer needed
for XA crash recovery, and we can log a new binlog checkpoint event.
The list is protected against simultaneous access from multiple
threads by LOCK_xid_list.
*/
struct xid_count_per_binlog : public ilink {
char *binlog_name;
uint binlog_name_len;
ulong binlog_id;
/* Total prepared XIDs and pending checkpoint requests in this binlog. */
long xid_count;
xid_count_per_binlog(); /* Give link error if constructor used. */
};
I_List<xid_count_per_binlog> binlog_xid_count_list;
/* /*
When this is set, a RESET MASTER is in progress. When this is set, a RESET MASTER is in progress.
...@@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG ...@@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
checkpoint arrives - when all have arrived, RESET MASTER will complete. checkpoint arrives - when all have arrived, RESET MASTER will complete.
*/ */
bool reset_master_pending; bool reset_master_pending;
friend void binlog_checkpoint_callback(void *cookie);
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */ /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
mysql_mutex_t LOCK_index; mysql_mutex_t LOCK_index;
...@@ -550,10 +526,35 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG ...@@ -550,10 +526,35 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
int write_transaction_or_stmt(group_commit_entry *entry); int write_transaction_or_stmt(group_commit_entry *entry);
bool write_transaction_to_binlog_events(group_commit_entry *entry); bool write_transaction_to_binlog_events(group_commit_entry *entry);
void trx_group_commit_leader(group_commit_entry *leader); void trx_group_commit_leader(group_commit_entry *leader);
void mark_xid_done(ulong cookie, bool write_checkpoint);
void mark_xids_active(ulong cookie, uint xid_count);
public: public:
/*
A list of struct xid_count_per_binlog is used to keep track of how many
XIDs are in prepared, but not committed, state in each binlog. And how
many commit_checkpoint_request()'s are pending.
When count drops to zero in a binlog after rotation, it means that there
are no more XIDs in prepared state, so that binlog is no longer needed
for XA crash recovery, and we can log a new binlog checkpoint event.
The list is protected against simultaneous access from multiple
threads by LOCK_xid_list.

The same struct doubles as the queue element for the binlog background
thread: entries are chained through next_in_queue, independently of the
ilink list membership.
*/
struct xid_count_per_binlog : public ilink {
char *binlog_name;
uint binlog_name_len;
ulong binlog_id;
/* Total prepared XIDs and pending checkpoint requests in this binlog. */
long xid_count;
/* For linking in requests to the binlog background thread. */
xid_count_per_binlog *next_in_queue;
xid_count_per_binlog(); /* Give link error if constructor used. */
};
I_List<xid_count_per_binlog> binlog_xid_count_list;
/*
Guards the background thread's request queue and its stop flag; also the
mutex used with both condition variables below.
*/
mysql_mutex_t LOCK_binlog_background_thread;
/*
Signalled when a checkpoint notification is queued or when cleanup()
requests the background thread to stop.
*/
mysql_cond_t COND_binlog_background_thread;
/*
Signalled by the background thread once it has finished; cleanup() waits
on it.
*/
mysql_cond_t COND_binlog_background_thread_end;
using MYSQL_LOG::generate_name; using MYSQL_LOG::generate_name;
using MYSQL_LOG::is_open; using MYSQL_LOG::is_open;
...@@ -709,6 +710,8 @@ public: ...@@ -709,6 +710,8 @@ public:
bool appendv(const char* buf,uint len,...); bool appendv(const char* buf,uint len,...);
bool append(Log_event* ev); bool append(Log_event* ev);
void mark_xids_active(ulong cookie, uint xid_count);
void mark_xid_done(ulong cookie, bool write_checkpoint);
void make_log_name(char* buf, const char* log_ident); void make_log_name(char* buf, const char* log_ident);
bool is_active(const char* log_file_name); bool is_active(const char* log_file_name);
bool can_purge_log(const char *log_file_name); bool can_purge_log(const char *log_file_name);
......
...@@ -726,6 +726,7 @@ PSI_mutex_key key_LOCK_des_key_file; ...@@ -726,6 +726,7 @@ PSI_mutex_key key_LOCK_des_key_file;
#endif /* HAVE_OPENSSL */ #endif /* HAVE_OPENSSL */
PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_BINLOG_LOCK_binlog_background_thread,
key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
...@@ -768,6 +769,7 @@ static PSI_mutex_info all_server_mutexes[]= ...@@ -768,6 +769,7 @@ static PSI_mutex_info all_server_mutexes[]=
{ &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0}, { &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0},
{ &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0}, { &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0},
{ &key_BINLOG_LOCK_binlog_background_thread, "MYSQL_BIN_LOG::LOCK_binlog_background_thread", 0},
{ &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0}, { &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0},
{ &key_delayed_insert_mutex, "Delayed_insert::mutex", 0}, { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
{ &key_hash_filo_lock, "hash_filo::lock", 0}, { &key_hash_filo_lock, "hash_filo::lock", 0},
...@@ -836,6 +838,8 @@ PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool; ...@@ -836,6 +838,8 @@ PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
#endif /* HAVE_MMAP */ #endif /* HAVE_MMAP */
PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
key_BINLOG_COND_binlog_background_thread,
key_BINLOG_COND_binlog_background_thread_end,
key_COND_cache_status_changed, key_COND_manager, key_COND_cache_status_changed, key_COND_manager,
key_COND_rpl_status, key_COND_server_started, key_COND_rpl_status, key_COND_server_started,
key_delayed_insert_cond, key_delayed_insert_cond_client, key_delayed_insert_cond, key_delayed_insert_cond_client,
...@@ -865,6 +869,8 @@ static PSI_cond_info all_server_conds[]= ...@@ -865,6 +869,8 @@ static PSI_cond_info all_server_conds[]=
#endif /* HAVE_MMAP */ #endif /* HAVE_MMAP */
{ &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0}, { &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0},
{ &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0}, { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0},
{ &key_BINLOG_COND_binlog_background_thread, "MYSQL_BIN_LOG::COND_binlog_background_thread", 0},
{ &key_BINLOG_COND_binlog_background_thread_end, "MYSQL_BIN_LOG::COND_binlog_background_thread_end", 0},
{ &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0}, { &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0},
{ &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0}, { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
{ &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0}, { &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},
......
...@@ -228,6 +228,7 @@ extern PSI_mutex_key key_LOCK_des_key_file; ...@@ -228,6 +228,7 @@ extern PSI_mutex_key key_LOCK_des_key_file;
#endif #endif
extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_BINLOG_LOCK_binlog_background_thread,
key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
...@@ -259,6 +260,8 @@ extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool; ...@@ -259,6 +260,8 @@ extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
#endif /* HAVE_MMAP */ #endif /* HAVE_MMAP */
extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
key_BINLOG_COND_binlog_background_thread,
key_BINLOG_COND_binlog_background_thread_end,
key_COND_cache_status_changed, key_COND_manager, key_COND_cache_status_changed, key_COND_manager,
key_COND_rpl_status, key_COND_server_started, key_COND_rpl_status, key_COND_server_started,
key_delayed_insert_cond, key_delayed_insert_cond_client, key_delayed_insert_cond, key_delayed_insert_cond_client,
......
...@@ -58,6 +58,7 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery) ...@@ -58,6 +58,7 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery)
{ {
DBUG_ENTER("Relay_log_info::Relay_log_info"); DBUG_ENTER("Relay_log_info::Relay_log_info");
relay_log.is_relay_log= TRUE;
#ifdef HAVE_PSI_INTERFACE #ifdef HAVE_PSI_INTERFACE
relay_log.set_psi_keys(key_RELAYLOG_LOCK_index, relay_log.set_psi_keys(key_RELAYLOG_LOCK_index,
key_RELAYLOG_update_cond, key_RELAYLOG_update_cond,
...@@ -216,8 +217,6 @@ a file name for --relay-log-index option", opt_relaylog_index_name); ...@@ -216,8 +217,6 @@ a file name for --relay-log-index option", opt_relaylog_index_name);
&mi->connection_name); &mi->connection_name);
} }
rli->relay_log.is_relay_log= TRUE;
/* /*
note, that if open() fails, we'll still have index file open note, that if open() fails, we'll still have index file open
but a destructor will take care of that but a destructor will take care of that
......
...@@ -1255,7 +1255,8 @@ enum enum_thread_type ...@@ -1255,7 +1255,8 @@ enum enum_thread_type
SYSTEM_THREAD_SLAVE_SQL= 4, SYSTEM_THREAD_SLAVE_SQL= 4,
SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8, SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8,
SYSTEM_THREAD_EVENT_SCHEDULER= 16, SYSTEM_THREAD_EVENT_SCHEDULER= 16,
SYSTEM_THREAD_EVENT_WORKER= 32 SYSTEM_THREAD_EVENT_WORKER= 32,
SYSTEM_THREAD_BINLOG_BACKGROUND= 64
}; };
inline char const * inline char const *
......
...@@ -106,6 +106,7 @@ static ulong commit_threads = 0; ...@@ -106,6 +106,7 @@ static ulong commit_threads = 0;
static mysql_mutex_t commit_threads_m; static mysql_mutex_t commit_threads_m;
static mysql_cond_t commit_cond; static mysql_cond_t commit_cond;
static mysql_mutex_t commit_cond_m; static mysql_mutex_t commit_cond_m;
static mysql_mutex_t pending_checkpoint_mutex;
static bool innodb_inited = 0; static bool innodb_inited = 0;
#define INSIDE_HA_INNOBASE_CC #define INSIDE_HA_INNOBASE_CC
...@@ -222,11 +223,13 @@ static mysql_pfs_key_t innobase_share_mutex_key; ...@@ -222,11 +223,13 @@ static mysql_pfs_key_t innobase_share_mutex_key;
static mysql_pfs_key_t commit_threads_m_key; static mysql_pfs_key_t commit_threads_m_key;
static mysql_pfs_key_t commit_cond_mutex_key; static mysql_pfs_key_t commit_cond_mutex_key;
static mysql_pfs_key_t commit_cond_key; static mysql_pfs_key_t commit_cond_key;
static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = { static PSI_mutex_info all_pthread_mutexes[] = {
{&commit_threads_m_key, "commit_threads_m", 0}, {&commit_threads_m_key, "commit_threads_m", 0},
{&commit_cond_mutex_key, "commit_cond_mutex", 0}, {&commit_cond_mutex_key, "commit_cond_mutex", 0},
{&innobase_share_mutex_key, "innobase_share_mutex", 0} {&innobase_share_mutex_key, "innobase_share_mutex", 0},
{&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
}; };
static PSI_cond_info all_innodb_conds[] = { static PSI_cond_info all_innodb_conds[] = {
...@@ -2601,6 +2604,9 @@ innobase_change_buffering_inited_ok: ...@@ -2601,6 +2604,9 @@ innobase_change_buffering_inited_ok:
mysql_mutex_init(commit_cond_mutex_key, mysql_mutex_init(commit_cond_mutex_key,
&commit_cond_m, MY_MUTEX_INIT_FAST); &commit_cond_m, MY_MUTEX_INIT_FAST);
mysql_cond_init(commit_cond_key, &commit_cond, NULL); mysql_cond_init(commit_cond_key, &commit_cond, NULL);
mysql_mutex_init(pending_checkpoint_mutex_key,
&pending_checkpoint_mutex,
MY_MUTEX_INIT_FAST);
innodb_inited= 1; innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN #ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) { if (innobase_hton != p) {
...@@ -2648,6 +2654,7 @@ innobase_end( ...@@ -2648,6 +2654,7 @@ innobase_end(
mysql_mutex_destroy(&commit_threads_m); mysql_mutex_destroy(&commit_threads_m);
mysql_mutex_destroy(&commit_cond_m); mysql_mutex_destroy(&commit_cond_m);
mysql_cond_destroy(&commit_cond); mysql_cond_destroy(&commit_cond);
mysql_mutex_destroy(&pending_checkpoint_mutex);
} }
DBUG_RETURN(err); DBUG_RETURN(err);
...@@ -3017,17 +3024,145 @@ innobase_rollback_trx( ...@@ -3017,17 +3024,145 @@ innobase_rollback_trx(
DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
} }
/* One commit checkpoint request that is waiting for the redo log to be
flushed. Entries are allocated and enqueued by innobase_checkpoint_request()
and dequeued, notified and freed by innobase_mysql_log_notify(). */
struct pending_checkpoint {
/* Next entry in the singly-linked list; set to NULL when the entry is
created. */
struct pending_checkpoint *next;
/* Handlerton passed in with the request; handed back unchanged to
commit_checkpoint_notify_ha(). */
handlerton *hton;
/* Opaque value passed in with the request; handed back unchanged to
commit_checkpoint_notify_ha(). */
void *cookie;
/* Value of log_get_lsn() at the time the request was enqueued. The entry
becomes ready for notification once the flushed lsn reaches this value. */
ib_uint64_t lsn;
};
/* Head of the list of pending requests, NULL when the list is empty.
Modified only while holding pending_checkpoint_mutex;
innobase_mysql_log_notify() additionally does one unlocked NULL check. */
static struct pending_checkpoint *pending_checkpoint_list;
/* Last entry of the list (where new requests are appended), NULL when the
list is empty. Accessed while holding pending_checkpoint_mutex. */
static struct pending_checkpoint *pending_checkpoint_list_end;
/*****************************************************************//** /*****************************************************************//**
Handle a commit checkpoint request from server layer. Handle a commit checkpoint request from server layer.
We simply flush the redo log immediately and do the notify call.*/ We put the request in a queue, so that we can notify upper layer about
checkpoint complete when we have flushed the redo log.
If we have already flushed all relevant redo log, we notify immediately.*/
static static
void void
innobase_checkpoint_request( innobase_checkpoint_request(
handlerton *hton, handlerton *hton,
void *cookie) void *cookie)
{ {
log_buffer_flush_to_disk(); ib_uint64_t lsn;
commit_checkpoint_notify_ha(hton, cookie); ib_uint64_t flush_lsn;
struct pending_checkpoint * entry;
/* Do the allocation outside of lock to reduce contention. The normal
case is that not everything is flushed, so we will need to enqueue. */
entry = static_cast<struct pending_checkpoint *>
(my_malloc(sizeof(*entry), MYF(MY_WME)));
if (!entry) {
sql_print_error("Failed to allocate %u bytes."
" Commit checkpoint will be skipped.",
static_cast<unsigned>(sizeof(*entry)));
return;
}
entry->next = NULL;
entry->hton = hton;
entry->cookie = cookie;
mysql_mutex_lock(&pending_checkpoint_mutex);
lsn = log_get_lsn();
flush_lsn = log_get_flush_lsn();
if (lsn > flush_lsn) {
/* Put the request in queue.
When the log gets flushed past the lsn, we will remove the
entry from the queue and notify the upper layer. */
entry->lsn = lsn;
if (pending_checkpoint_list_end) {
pending_checkpoint_list_end->next = entry;
/* There is no need to order the entries in the list
by lsn. The upper layer can accept notifications in
any order, and short delays in notifications do not
significantly impact performance. */
} else {
pending_checkpoint_list = entry;
}
pending_checkpoint_list_end = entry;
entry = NULL;
}
mysql_mutex_unlock(&pending_checkpoint_mutex);
if (entry) {
/* We are already flushed. Notify the checkpoint immediately. */
commit_checkpoint_notify_ha(entry->hton, entry->cookie);
my_free(entry);
}
}
/*****************************************************************//**
Log code calls this whenever log has been written and/or flushed up
to a new position. We use this to notify upper layer of a new commit
checkpoint when necessary.

Entries at the head of the pending checkpoint list whose lsn is not
greater than flush_lsn are detached under pending_checkpoint_mutex; the
calls to commit_checkpoint_notify_ha() and the freeing of the entries
happen after the mutex has been released. */
extern "C" UNIV_INTERN
void
innobase_mysql_log_notify(
/*===============*/
	ib_uint64_t	write_lsn,	/*!< in: LSN written to log file
					(not used by this function) */
	ib_uint64_t	flush_lsn)	/*!< in: LSN flushed to disk */
{
	struct pending_checkpoint *	pending;
	struct pending_checkpoint *	entry;
	struct pending_checkpoint *	last_ready;

	/* It is safe to do a quick check for NULL first without lock.
	Even if we should race, we will at most skip one checkpoint and
	take the next one, which is harmless. */
	if (!pending_checkpoint_list)
		return;

	mysql_mutex_lock(&pending_checkpoint_mutex);
	pending = pending_checkpoint_list;
	if (!pending)
	{
		mysql_mutex_unlock(&pending_checkpoint_mutex);
		return;
	}

	last_ready = NULL;
	for (entry = pending; entry != NULL; entry = entry->next)
	{
		/* Notify checkpoints up until the first entry that has not
		been fully flushed to the redo log. Since we do not maintain
		the list ordered, in principle there could be more entries
		later that were also flushed. But there is no harm in
		delaying notifications for those a bit. And in practice, the
		list is unlikely to have more than one element anyway, as we
		flush the redo log at least once every second. */
		if (entry->lsn > flush_lsn)
			break;
		last_ready = entry;
	}

	if (last_ready)
	{
		/* We found some pending checkpoints that are now flushed to
		disk. So remove them from the list. Here entry is the first
		element that is not yet ready, or NULL if all were ready. */
		pending_checkpoint_list = entry;
		if (!entry)
			pending_checkpoint_list_end = NULL;
	}

	mysql_mutex_unlock(&pending_checkpoint_mutex);

	if (!last_ready)
		return;

	/* Now that we have released the lock, notify upper layer about all
	commit checkpoints that have now completed. */
	for (;;) {
		bool	is_last;

		entry = pending;
		pending = pending->next;
		/* Decide whether this is the final ready entry before the
		entry is freed, so that the pointer is never inspected after
		my_free(). */
		is_last = (entry == last_ready);
		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
		my_free(entry);
		if (is_last)
			break;
	}
}
/*****************************************************************//** /*****************************************************************//**
......
...@@ -136,6 +136,17 @@ innobase_mysql_print_thd( ...@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
uint max_query_len); /*!< in: max query length to print, or 0 to uint max_query_len); /*!< in: max query length to print, or 0 to
use the default max length */ use the default max length */
/*****************************************************************//**
Log code calls this whenever log has been written and/or flushed up
to a new position. We use this to notify upper layer of a new commit
checkpoint when necessary.*/
UNIV_INTERN
void
innobase_mysql_log_notify(
/*===============*/
ib_uint64_t write_lsn, /*!< in: LSN written to log file */
ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */
/**************************************************************//** /**************************************************************//**
Converts a MySQL type to an InnoDB type. Note that this function returns Converts a MySQL type to an InnoDB type. Note that this function returns
the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
......
...@@ -151,6 +151,13 @@ UNIV_INLINE ...@@ -151,6 +151,13 @@ UNIV_INLINE
ib_uint64_t ib_uint64_t
log_get_lsn(void); log_get_lsn(void);
/*=============*/ /*=============*/
/************************************************************//**
Gets the last lsn that is fully flushed to disk.
@return last flushed lsn */
UNIV_INLINE
ib_uint64_t
log_get_flush_lsn(void);
/*=============*/
/**************************************************************** /****************************************************************
Gets the log group capacity. It is OK to read the value without Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant. holding log_sys->mutex because it is constant.
......
...@@ -411,6 +411,25 @@ log_get_lsn(void) ...@@ -411,6 +411,25 @@ log_get_lsn(void)
return(lsn); return(lsn);
} }
/************************************************************//**
Returns the lsn up to which the log has been flushed to disk. The value
is read from log_sys->flushed_to_disk_lsn while holding log_sys->mutex.
@return	last flushed lsn */
UNIV_INLINE
ib_uint64_t
log_get_flush_lsn(void)
/*=============*/
{
	ib_uint64_t	flushed;

	mutex_enter(&log_sys->mutex);
	flushed = log_sys->flushed_to_disk_lsn;
	mutex_exit(&log_sys->mutex);

	return(flushed);
}
/**************************************************************** /****************************************************************
Gets the log group capacity. It is OK to read the value without Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant. holding log_sys->mutex because it is constant.
......
...@@ -1353,6 +1353,8 @@ log_write_up_to( ...@@ -1353,6 +1353,8 @@ log_write_up_to(
ulint loop_count = 0; ulint loop_count = 0;
#endif /* UNIV_DEBUG */ #endif /* UNIV_DEBUG */
ulint unlock; ulint unlock;
ib_uint64_t write_lsn;
ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) { if (recv_no_ibuf_operations) {
/* Recovery is running and no operations on the log files are /* Recovery is running and no operations on the log files are
...@@ -1530,8 +1532,13 @@ loop: ...@@ -1530,8 +1532,13 @@ loop:
log_flush_do_unlocks(unlock); log_flush_do_unlocks(unlock);
write_lsn = log_sys->write_lsn;
flush_lsn = log_sys->flushed_to_disk_lsn;
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
innobase_mysql_log_notify(write_lsn, flush_lsn);
return; return;
do_waits: do_waits:
......
...@@ -121,6 +121,7 @@ static ulong commit_threads = 0; ...@@ -121,6 +121,7 @@ static ulong commit_threads = 0;
static mysql_mutex_t commit_threads_m; static mysql_mutex_t commit_threads_m;
static mysql_cond_t commit_cond; static mysql_cond_t commit_cond;
static mysql_mutex_t commit_cond_m; static mysql_mutex_t commit_cond_m;
static mysql_mutex_t pending_checkpoint_mutex;
static bool innodb_inited = 0; static bool innodb_inited = 0;
...@@ -254,11 +255,13 @@ static mysql_pfs_key_t innobase_share_mutex_key; ...@@ -254,11 +255,13 @@ static mysql_pfs_key_t innobase_share_mutex_key;
static mysql_pfs_key_t commit_threads_m_key; static mysql_pfs_key_t commit_threads_m_key;
static mysql_pfs_key_t commit_cond_mutex_key; static mysql_pfs_key_t commit_cond_mutex_key;
static mysql_pfs_key_t commit_cond_key; static mysql_pfs_key_t commit_cond_key;
static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = { static PSI_mutex_info all_pthread_mutexes[] = {
{&commit_threads_m_key, "commit_threads_m", 0}, {&commit_threads_m_key, "commit_threads_m", 0},
{&commit_cond_mutex_key, "commit_cond_mutex", 0}, {&commit_cond_mutex_key, "commit_cond_mutex", 0},
{&innobase_share_mutex_key, "innobase_share_mutex", 0} {&innobase_share_mutex_key, "innobase_share_mutex", 0},
{&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
}; };
static PSI_cond_info all_innodb_conds[] = { static PSI_cond_info all_innodb_conds[] = {
...@@ -3088,6 +3091,9 @@ skip_overwrite: ...@@ -3088,6 +3091,9 @@ skip_overwrite:
mysql_mutex_init(commit_cond_mutex_key, mysql_mutex_init(commit_cond_mutex_key,
&commit_cond_m, MY_MUTEX_INIT_FAST); &commit_cond_m, MY_MUTEX_INIT_FAST);
mysql_cond_init(commit_cond_key, &commit_cond, NULL); mysql_cond_init(commit_cond_key, &commit_cond, NULL);
mysql_mutex_init(pending_checkpoint_mutex_key,
&pending_checkpoint_mutex,
MY_MUTEX_INIT_FAST);
innodb_inited= 1; innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN #ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) { if (innobase_hton != p) {
...@@ -3135,6 +3141,7 @@ innobase_end( ...@@ -3135,6 +3141,7 @@ innobase_end(
mysql_mutex_destroy(&commit_threads_m); mysql_mutex_destroy(&commit_threads_m);
mysql_mutex_destroy(&commit_cond_m); mysql_mutex_destroy(&commit_cond_m);
mysql_cond_destroy(&commit_cond); mysql_cond_destroy(&commit_cond);
mysql_mutex_destroy(&pending_checkpoint_mutex);
} }
DBUG_RETURN(err); DBUG_RETURN(err);
...@@ -3530,17 +3537,145 @@ innobase_rollback_trx( ...@@ -3530,17 +3537,145 @@ innobase_rollback_trx(
DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
} }
/* One commit checkpoint request that is waiting for the redo log to be
flushed. Entries are allocated and enqueued by innobase_checkpoint_request()
and dequeued, notified and freed by innobase_mysql_log_notify(). */
struct pending_checkpoint {
/* Next entry in the singly-linked list; set to NULL when the entry is
created. */
struct pending_checkpoint *next;
/* Handlerton passed in with the request; handed back unchanged to
commit_checkpoint_notify_ha(). */
handlerton *hton;
/* Opaque value passed in with the request; handed back unchanged to
commit_checkpoint_notify_ha(). */
void *cookie;
/* Value of log_get_lsn() at the time the request was enqueued. The entry
becomes ready for notification once the flushed lsn reaches this value. */
ib_uint64_t lsn;
};
/* Head of the list of pending requests, NULL when the list is empty.
Modified only while holding pending_checkpoint_mutex;
innobase_mysql_log_notify() additionally does one unlocked NULL check. */
static struct pending_checkpoint *pending_checkpoint_list;
/* Last entry of the list (where new requests are appended), NULL when the
list is empty. Accessed while holding pending_checkpoint_mutex. */
static struct pending_checkpoint *pending_checkpoint_list_end;
/*****************************************************************//** /*****************************************************************//**
Handle a commit checkpoint request from server layer. Handle a commit checkpoint request from server layer.
We simply flush the redo log immediately and do the notify call.*/ We put the request in a queue, so that we can notify upper layer about
checkpoint complete when we have flushed the redo log.
If we have already flushed all relevant redo log, we notify immediately.*/
static static
void void
innobase_checkpoint_request( innobase_checkpoint_request(
handlerton *hton, handlerton *hton,
void *cookie) void *cookie)
{ {
log_buffer_flush_to_disk(); ib_uint64_t lsn;
commit_checkpoint_notify_ha(hton, cookie); ib_uint64_t flush_lsn;
struct pending_checkpoint * entry;
/* Do the allocation outside of lock to reduce contention. The normal
case is that not everything is flushed, so we will need to enqueue. */
entry = static_cast<struct pending_checkpoint *>
(my_malloc(sizeof(*entry), MYF(MY_WME)));
if (!entry) {
sql_print_error("Failed to allocate %u bytes."
" Commit checkpoint will be skipped.",
static_cast<unsigned>(sizeof(*entry)));
return;
}
entry->next = NULL;
entry->hton = hton;
entry->cookie = cookie;
mysql_mutex_lock(&pending_checkpoint_mutex);
lsn = log_get_lsn();
flush_lsn = log_get_flush_lsn();
if (lsn > flush_lsn) {
/* Put the request in queue.
When the log gets flushed past the lsn, we will remove the
entry from the queue and notify the upper layer. */
entry->lsn = lsn;
if (pending_checkpoint_list_end) {
pending_checkpoint_list_end->next = entry;
/* There is no need to order the entries in the list
by lsn. The upper layer can accept notifications in
any order, and short delays in notifications do not
significantly impact performance. */
} else {
pending_checkpoint_list = entry;
}
pending_checkpoint_list_end = entry;
entry = NULL;
}
mysql_mutex_unlock(&pending_checkpoint_mutex);
if (entry) {
/* We are already flushed. Notify the checkpoint immediately. */
commit_checkpoint_notify_ha(entry->hton, entry->cookie);
my_free(entry);
}
}
/*****************************************************************//**
Log code calls this whenever log has been written and/or flushed up
to a new position. We use this to notify upper layer of a new commit
checkpoint when necessary.

Entries at the head of the pending checkpoint list whose lsn is not
greater than flush_lsn are detached under pending_checkpoint_mutex; the
calls to commit_checkpoint_notify_ha() and the freeing of the entries
happen after the mutex has been released. */
extern "C" UNIV_INTERN
void
innobase_mysql_log_notify(
/*===============*/
	ib_uint64_t	write_lsn,	/*!< in: LSN written to log file
					(not used by this function) */
	ib_uint64_t	flush_lsn)	/*!< in: LSN flushed to disk */
{
	struct pending_checkpoint *	pending;
	struct pending_checkpoint *	entry;
	struct pending_checkpoint *	last_ready;

	/* It is safe to do a quick check for NULL first without lock.
	Even if we should race, we will at most skip one checkpoint and
	take the next one, which is harmless. */
	if (!pending_checkpoint_list)
		return;

	mysql_mutex_lock(&pending_checkpoint_mutex);
	pending = pending_checkpoint_list;
	if (!pending)
	{
		mysql_mutex_unlock(&pending_checkpoint_mutex);
		return;
	}

	last_ready = NULL;
	for (entry = pending; entry != NULL; entry = entry->next)
	{
		/* Notify checkpoints up until the first entry that has not
		been fully flushed to the redo log. Since we do not maintain
		the list ordered, in principle there could be more entries
		later that were also flushed. But there is no harm in
		delaying notifications for those a bit. And in practice, the
		list is unlikely to have more than one element anyway, as we
		flush the redo log at least once every second. */
		if (entry->lsn > flush_lsn)
			break;
		last_ready = entry;
	}

	if (last_ready)
	{
		/* We found some pending checkpoints that are now flushed to
		disk. So remove them from the list. Here entry is the first
		element that is not yet ready, or NULL if all were ready. */
		pending_checkpoint_list = entry;
		if (!entry)
			pending_checkpoint_list_end = NULL;
	}

	mysql_mutex_unlock(&pending_checkpoint_mutex);

	if (!last_ready)
		return;

	/* Now that we have released the lock, notify upper layer about all
	commit checkpoints that have now completed. */
	for (;;) {
		bool	is_last;

		entry = pending;
		pending = pending->next;
		/* Decide whether this is the final ready entry before the
		entry is freed, so that the pointer is never inspected after
		my_free(). */
		is_last = (entry == last_ready);
		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
		my_free(entry);
		if (is_last)
			break;
	}
}
/*****************************************************************//** /*****************************************************************//**
......
...@@ -136,6 +136,17 @@ innobase_mysql_print_thd( ...@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
uint max_query_len); /*!< in: max query length to print, or 0 to uint max_query_len); /*!< in: max query length to print, or 0 to
use the default max length */ use the default max length */
/*****************************************************************//**
Log code calls this whenever log has been written and/or flushed up
to a new position. We use this to notify upper layer of a new commit
checkpoint when necessary.*/
UNIV_INTERN
void
innobase_mysql_log_notify(
/*===============*/
ib_uint64_t write_lsn, /*!< in: LSN written to log file */
ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */
/**************************************************************//** /**************************************************************//**
Converts a MySQL type to an InnoDB type. Note that this function returns Converts a MySQL type to an InnoDB type. Note that this function returns
the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
......
...@@ -151,6 +151,13 @@ UNIV_INLINE ...@@ -151,6 +151,13 @@ UNIV_INLINE
ib_uint64_t ib_uint64_t
log_get_lsn(void); log_get_lsn(void);
/*=============*/ /*=============*/
/************************************************************//**
Gets the last lsn that is fully flushed to disk.
@return last flushed lsn */
UNIV_INLINE
ib_uint64_t
log_get_flush_lsn(void);
/*=============*/
/**************************************************************** /****************************************************************
Gets the log group capacity. It is OK to read the value without Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant. holding log_sys->mutex because it is constant.
......
...@@ -411,6 +411,25 @@ log_get_lsn(void) ...@@ -411,6 +411,25 @@ log_get_lsn(void)
return(lsn); return(lsn);
} }
/************************************************************//**
Returns the lsn up to which the log has been flushed to disk. The value
is read from log_sys->flushed_to_disk_lsn while holding log_sys->mutex.
@return	last flushed lsn */
UNIV_INLINE
ib_uint64_t
log_get_flush_lsn(void)
/*=============*/
{
	ib_uint64_t	flushed;

	mutex_enter(&log_sys->mutex);
	flushed = log_sys->flushed_to_disk_lsn;
	mutex_exit(&log_sys->mutex);

	return(flushed);
}
/**************************************************************** /****************************************************************
Gets the log group capacity. It is OK to read the value without Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant. holding log_sys->mutex because it is constant.
......
...@@ -1390,6 +1390,8 @@ log_write_up_to( ...@@ -1390,6 +1390,8 @@ log_write_up_to(
ulint loop_count = 0; ulint loop_count = 0;
#endif /* UNIV_DEBUG */ #endif /* UNIV_DEBUG */
ulint unlock; ulint unlock;
ib_uint64_t write_lsn;
ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) { if (recv_no_ibuf_operations) {
/* Recovery is running and no operations on the log files are /* Recovery is running and no operations on the log files are
...@@ -1568,8 +1570,13 @@ loop: ...@@ -1568,8 +1570,13 @@ loop:
log_flush_do_unlocks(unlock); log_flush_do_unlocks(unlock);
write_lsn = log_sys->write_lsn;
flush_lsn = log_sys->flushed_to_disk_lsn;
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
innobase_mysql_log_notify(write_lsn, flush_lsn);
return; return;
do_waits: do_waits:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment