Commit 8d8f52e9 authored by unknown's avatar unknown

Many files:

  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released


sql/log.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/handler.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/handler.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/ha_innodb.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/ha_innodb.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/include/log0log.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/include/trx0trx.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/os/os0file.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/buf/buf0flu.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/trx/trx0trx.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/log/log0log.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/srv/srv0srv.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/row/row0mysql.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
parent 87039789
...@@ -398,7 +398,7 @@ buf_flush_write_block_low( ...@@ -398,7 +398,7 @@ buf_flush_write_block_low(
"Warning: cannot force log to disk in the log debug version!\n"); "Warning: cannot force log to disk in the log debug version!\n");
#else #else
/* Force the log to the disk before writing the modified block */ /* Force the log to the disk before writing the modified block */
log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS); log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif #endif
buf_flush_init_for_writing(block->frame, block->newest_modification, buf_flush_init_for_writing(block->frame, block->newest_modification,
block->space, block->offset); block->space, block->offset);
......
...@@ -20,7 +20,7 @@ typedef struct log_group_struct log_group_t; ...@@ -20,7 +20,7 @@ typedef struct log_group_struct log_group_t;
extern ibool log_do_write; extern ibool log_do_write;
extern ibool log_debug_writes; extern ibool log_debug_writes;
/* Wait modes for log_flush_up_to */ /* Wait modes for log_write_up_to */
#define LOG_NO_WAIT 91 #define LOG_NO_WAIT 91
#define LOG_WAIT_ONE_GROUP 92 #define LOG_WAIT_ONE_GROUP 92
#define LOG_WAIT_ALL_GROUPS 93 #define LOG_WAIT_ALL_GROUPS 93
...@@ -157,26 +157,21 @@ log_io_complete( ...@@ -157,26 +157,21 @@ log_io_complete(
/*============*/ /*============*/
log_group_t* group); /* in: log group */ log_group_t* group); /* in: log group */
/********************************************************** /**********************************************************
Flushes the log files to the disk, using, for example, the Unix fsync.
This function does the flush even if the user has set
srv_flush_log_at_trx_commit = FALSE. */
void
log_flush_to_disk(void);
/*===================*/
/**********************************************************
This function is called, e.g., when a transaction wants to commit. It checks This function is called, e.g., when a transaction wants to commit. It checks
that the log has been flushed to disk up to the last log entry written by the that the log has been written to the log file up to the last log entry written
transaction. If there is a flush running, it waits and checks if the flush by the transaction. If there is a flush running, it waits and checks if the
flushed enough. If not, starts a new flush. */ flush flushed enough. If not, starts a new flush. */
void void
log_flush_up_to( log_write_up_to(
/*============*/ /*============*/
dulint lsn, /* in: log sequence number up to which the log should dulint lsn, /* in: log sequence number up to which the log should
be flushed, ut_dulint_max if not specified */ be written, ut_dulint_max if not specified */
ulint wait); /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */ or LOG_WAIT_ALL_GROUPS */
ibool flush_to_disk);
/* in: TRUE if we want the written log also to be
flushed to disk */
/******************************************************************** /********************************************************************
Advances the smallest lsn for which there are unflushed dirty blocks in the Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool and also may make a new checkpoint. NOTE: this function may only buffer pool and also may make a new checkpoint. NOTE: this function may only
...@@ -741,27 +736,37 @@ struct log_struct{ ...@@ -741,27 +736,37 @@ struct log_struct{
be advanced, it is enough that the be advanced, it is enough that the
write i/o has been completed for all write i/o has been completed for all
log groups */ log groups */
dulint flush_lsn; /* end lsn for the current flush */ dulint write_lsn; /* end lsn for the current running
ulint flush_end_offset;/* the data in buffer has been flushed write */
ulint write_end_offset;/* the data in buffer has been written
up to this offset when the current up to this offset when the current
flush ends: this field will then write ends: this field will then
be copied to buf_next_to_write */ be copied to buf_next_to_write */
ulint n_pending_writes;/* number of currently pending flush dulint current_flush_lsn;/* end lsn for the current running
writes */ write + flush operation */
dulint flushed_to_disk_lsn;
/* how far we have written the log
AND flushed to disk */
ulint n_pending_writes;/* number of currently pending flushes
or writes */
/* NOTE on the 'flush' in names of the fields below: starting from
4.0.14, we separate the write of the log file and the actual fsync()
or other method to flush it to disk. The names below should really
be 'flush_or_write'! */
os_event_t no_flush_event; /* this event is in the reset state os_event_t no_flush_event; /* this event is in the reset state
when a flush is running; a thread when a flush or a write is running;
should wait for this without owning a thread should wait for this without
the log mutex, but NOTE that to set or owning the log mutex, but NOTE that
reset this event, the thread MUST own to set or reset this event, the
the log mutex! */ thread MUST own the log mutex! */
ibool one_flushed; /* during a flush, this is first FALSE ibool one_flushed; /* during a flush, this is first FALSE
and becomes TRUE when one log group and becomes TRUE when one log group
has been flushed */ has been written or flushed */
os_event_t one_flushed_event;/* this event is reset when the os_event_t one_flushed_event;/* this event is reset when the
flush has not yet completed for any flush or write has not yet completed
log group; e.g., this means that a for any log group; e.g., this means
transaction has been committed when that a transaction has been committed
this is set; a thread should wait when this is set; a thread should wait
for this without owning the log mutex, for this without owning the log mutex,
but NOTE that to set or reset this but NOTE that to set or reset this
event, the thread MUST own the log event, the thread MUST own the log
......
...@@ -157,6 +157,15 @@ trx_commit_for_mysql( ...@@ -157,6 +157,15 @@ trx_commit_for_mysql(
/* out: 0 or error number */ /* out: 0 or error number */
trx_t* trx); /* in: trx handle */ trx_t* trx); /* in: trx handle */
/************************************************************************** /**************************************************************************
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE. */
ulint
trx_commit_complete_for_mysql(
/*==========================*/
/* out: 0 or error number */
trx_t* trx); /* in: trx handle */
/**************************************************************************
Marks the latest SQL statement ended. */ Marks the latest SQL statement ended. */
void void
...@@ -343,6 +352,11 @@ struct trx_struct{ ...@@ -343,6 +352,11 @@ struct trx_struct{
dulint no; /* transaction serialization number == dulint no; /* transaction serialization number ==
max trx id when the transaction is max trx id when the transaction is
moved to COMMITTED_IN_MEMORY state */ moved to COMMITTED_IN_MEMORY state */
ibool flush_log_later;/* when we commit the transaction
in MySQL's binlog write, we will
flush the log to disk later in
a separate call */
dulint commit_lsn; /* lsn at the time of the commit */
ibool dict_operation; /* TRUE if the trx is used to create ibool dict_operation; /* TRUE if the trx is used to create
a table, create an index, or drop a a table, create an index, or drop a
table */ table */
......
...@@ -178,7 +178,7 @@ loop: ...@@ -178,7 +178,7 @@ loop:
/* Not enough free space, do a synchronous flush of the log /* Not enough free space, do a synchronous flush of the log
buffer */ buffer */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS); log_write_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS, TRUE);
count++; count++;
...@@ -675,7 +675,9 @@ log_init(void) ...@@ -675,7 +675,9 @@ log_init(void)
log_sys->buf_next_to_write = 0; log_sys->buf_next_to_write = 0;
log_sys->flush_lsn = ut_dulint_zero; log_sys->write_lsn = ut_dulint_zero;
log_sys->current_flush_lsn = ut_dulint_zero;
log_sys->flushed_to_disk_lsn = ut_dulint_zero;
log_sys->written_to_some_lsn = log_sys->lsn; log_sys->written_to_some_lsn = log_sys->lsn;
log_sys->written_to_all_lsn = log_sys->lsn; log_sys->written_to_all_lsn = log_sys->lsn;
...@@ -867,7 +869,7 @@ log_group_check_flush_completion( ...@@ -867,7 +869,7 @@ log_group_check_flush_completion(
printf("Log flushed first to group %lu\n", group->id); printf("Log flushed first to group %lu\n", group->id);
} }
log_sys->written_to_some_lsn = log_sys->flush_lsn; log_sys->written_to_some_lsn = log_sys->write_lsn;
log_sys->one_flushed = TRUE; log_sys->one_flushed = TRUE;
return(LOG_UNLOCK_NONE_FLUSHED_LOCK); return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
...@@ -896,15 +898,15 @@ log_sys_check_flush_completion(void) ...@@ -896,15 +898,15 @@ log_sys_check_flush_completion(void)
if (log_sys->n_pending_writes == 0) { if (log_sys->n_pending_writes == 0) {
log_sys->written_to_all_lsn = log_sys->flush_lsn; log_sys->written_to_all_lsn = log_sys->write_lsn;
log_sys->buf_next_to_write = log_sys->flush_end_offset; log_sys->buf_next_to_write = log_sys->write_end_offset;
if (log_sys->flush_end_offset > log_sys->max_buf_free / 2) { if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
/* Move the log buffer content to the start of the /* Move the log buffer content to the start of the
buffer */ buffer */
move_start = ut_calc_align_down( move_start = ut_calc_align_down(
log_sys->flush_end_offset, log_sys->write_end_offset,
OS_FILE_LOG_BLOCK_SIZE); OS_FILE_LOG_BLOCK_SIZE);
move_end = ut_calc_align(log_sys->buf_free, move_end = ut_calc_align(log_sys->buf_free,
OS_FILE_LOG_BLOCK_SIZE); OS_FILE_LOG_BLOCK_SIZE);
...@@ -981,57 +983,6 @@ log_io_complete( ...@@ -981,57 +983,6 @@ log_io_complete(
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
} }
/**********************************************************
Flushes the log files to the disk, using, for example, the Unix fsync.
This function does the flush even if the user has set
srv_flush_log_at_trx_commit = FALSE.

The function takes no arguments and returns nothing. It blocks until no
other log write is pending, marks itself as the single pending write,
calls fil_flush() on the first log group's space, and then signals the
waiters again. */
void
log_flush_to_disk(void)
/*===================*/
{
log_group_t* group;
loop:
mutex_enter(&(log_sys->mutex));
if (log_sys->n_pending_writes > 0) {
/* A log file write is running */
mutex_exit(&(log_sys->mutex));
/* Wait for the log file write to complete and try again */
/* The wait is done after releasing the log mutex; the condition is
re-checked from the top once the event has been signalled */
os_event_wait(log_sys->no_flush_event);
goto loop;
}
/* NOTE(review): only the first log group in the list is flushed here;
confirm whether further groups need the same treatment */
group = UT_LIST_GET_FIRST(log_sys->log_groups);
/* Register this flush as a pending write, both system-wide and on the
group, so that the check at the top of the function makes other callers
wait */
log_sys->n_pending_writes++;
group->n_pending_writes++;
/* Put both events in the reset state while still owning the log mutex */
os_event_reset(log_sys->no_flush_event);
os_event_reset(log_sys->one_flushed_event);
mutex_exit(&(log_sys->mutex));
/* The actual flush is performed without holding the log mutex */
fil_flush(group->space_id);
mutex_enter(&(log_sys->mutex));
/* This flush must still be the only pending write */
ut_a(group->n_pending_writes == 1);
ut_a(log_sys->n_pending_writes == 1);
group->n_pending_writes--;
log_sys->n_pending_writes--;
/* Release any threads waiting for the flush to finish */
os_event_set(log_sys->no_flush_event);
os_event_set(log_sys->one_flushed_event);
mutex_exit(&(log_sys->mutex));
}
/********************************************************** /**********************************************************
Writes a log file header to a log file space. */ Writes a log file header to a log file space. */
static static
...@@ -1205,12 +1156,15 @@ by the transaction. If there is a flush running, it waits and checks if the ...@@ -1205,12 +1156,15 @@ by the transaction. If there is a flush running, it waits and checks if the
flush flushed enough. If not, starts a new flush. */ flush flushed enough. If not, starts a new flush. */
void void
log_flush_up_to( log_write_up_to(
/*============*/ /*============*/
dulint lsn, /* in: log sequence number up to which the log should dulint lsn, /* in: log sequence number up to which the log should
be written, ut_dulint_max if not specified */ be written, ut_dulint_max if not specified */
ulint wait) /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */ or LOG_WAIT_ALL_GROUPS */
ibool flush_to_disk)
/* in: TRUE if we want the written log also to be
flushed to disk */
{ {
log_group_t* group; log_group_t* group;
ulint start_offset; ulint start_offset;
...@@ -1239,9 +1193,18 @@ loop: ...@@ -1239,9 +1193,18 @@ loop:
mutex_enter(&(log_sys->mutex)); mutex_enter(&(log_sys->mutex));
if ((ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0) if (flush_to_disk
|| ((ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0) && ut_dulint_cmp(log_sys->flushed_to_disk_lsn, lsn) >= 0) {
&& (wait != LOG_WAIT_ALL_GROUPS))) {
mutex_exit(&(log_sys->mutex));
return;
}
if (!flush_to_disk
&& (ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0
|| (ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0
&& wait != LOG_WAIT_ALL_GROUPS))) {
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
...@@ -1249,10 +1212,19 @@ loop: ...@@ -1249,10 +1212,19 @@ loop:
} }
if (log_sys->n_pending_writes > 0) { if (log_sys->n_pending_writes > 0) {
/* A flush is running */ /* A write (+ possibly flush to disk) is running */
if (flush_to_disk
&& ut_dulint_cmp(log_sys->current_flush_lsn, lsn) >= 0) {
/* The write + flush will write enough: wait for it to
complete */
if (ut_dulint_cmp(log_sys->flush_lsn, lsn) >= 0) { goto do_waits;
/* The flush will flush enough: wait for it to }
if (!flush_to_disk
&& ut_dulint_cmp(log_sys->write_lsn, lsn) >= 0) {
/* The write will write enough: wait for it to
complete */ complete */
goto do_waits; goto do_waits;
...@@ -1260,16 +1232,17 @@ loop: ...@@ -1260,16 +1232,17 @@ loop:
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
/* Wait for the flush to complete and try to start a new /* Wait for the write to complete and try to start a new
flush */ write */
os_event_wait(log_sys->no_flush_event); os_event_wait(log_sys->no_flush_event);
goto loop; goto loop;
} }
if (log_sys->buf_free == log_sys->buf_next_to_write) { if (!flush_to_disk
/* Nothing to flush */ && log_sys->buf_free == log_sys->buf_next_to_write) {
/* Nothing to write and no flush to disk requested */
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
...@@ -1277,7 +1250,7 @@ loop: ...@@ -1277,7 +1250,7 @@ loop:
} }
if (log_debug_writes) { if (log_debug_writes) {
printf("Flushing log from %lu %lu up to lsn %lu %lu\n", printf("Writing log from %lu %lu up to lsn %lu %lu\n",
ut_dulint_get_high(log_sys->written_to_all_lsn), ut_dulint_get_high(log_sys->written_to_all_lsn),
ut_dulint_get_low(log_sys->written_to_all_lsn), ut_dulint_get_low(log_sys->written_to_all_lsn),
ut_dulint_get_high(log_sys->lsn), ut_dulint_get_high(log_sys->lsn),
...@@ -1301,7 +1274,12 @@ loop: ...@@ -1301,7 +1274,12 @@ loop:
ut_ad(area_end - area_start > 0); ut_ad(area_end - area_start > 0);
log_sys->flush_lsn = log_sys->lsn; log_sys->write_lsn = log_sys->lsn;
if (flush_to_disk) {
log_sys->current_flush_lsn = log_sys->lsn;
}
log_sys->one_flushed = FALSE; log_sys->one_flushed = FALSE;
log_block_set_flush_bit(log_sys->buf + area_start, TRUE); log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
...@@ -1318,10 +1296,12 @@ loop: ...@@ -1318,10 +1296,12 @@ loop:
OS_FILE_LOG_BLOCK_SIZE); OS_FILE_LOG_BLOCK_SIZE);
log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE; log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
log_sys->flush_end_offset = log_sys->buf_free; log_sys->write_end_offset = log_sys->buf_free;
group = UT_LIST_GET_FIRST(log_sys->log_groups); group = UT_LIST_GET_FIRST(log_sys->log_groups);
/* Do the write to the log files */
while (group) { while (group) {
log_group_write_buf(LOG_FLUSH, group, log_group_write_buf(LOG_FLUSH, group,
log_sys->buf + area_start, log_sys->buf + area_start,
...@@ -1330,20 +1310,25 @@ loop: ...@@ -1330,20 +1310,25 @@ loop:
OS_FILE_LOG_BLOCK_SIZE), OS_FILE_LOG_BLOCK_SIZE),
start_offset - area_start); start_offset - area_start);
log_group_set_fields(group, log_sys->flush_lsn); log_group_set_fields(group, log_sys->write_lsn);
group = UT_LIST_GET_NEXT(log_groups, group); group = UT_LIST_GET_NEXT(log_groups, group);
} }
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC /* O_DSYNC means the OS did not buffer the log file at all:
&& srv_flush_log_at_trx_commit != 2) { so we have also flushed to disk what we have written */
log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
} else if (flush_to_disk) {
group = UT_LIST_GET_FIRST(log_sys->log_groups); group = UT_LIST_GET_FIRST(log_sys->log_groups);
fil_flush(group->space_id); fil_flush(group->space_id);
log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
} }
mutex_enter(&(log_sys->mutex)); mutex_enter(&(log_sys->mutex));
...@@ -1403,7 +1388,7 @@ log_flush_margin(void) ...@@ -1403,7 +1388,7 @@ log_flush_margin(void)
mutex_exit(&(log->mutex)); mutex_exit(&(log->mutex));
if (do_flush) { if (do_flush) {
log_flush_up_to(ut_dulint_max, LOG_NO_WAIT); log_write_up_to(ut_dulint_max, LOG_NO_WAIT, FALSE);
} }
} }
...@@ -1555,7 +1540,8 @@ log_group_checkpoint( ...@@ -1555,7 +1540,8 @@ log_group_checkpoint(
buf = group->checkpoint_buf; buf = group->checkpoint_buf;
mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no); mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); mach_write_to_8(buf + LOG_CHECKPOINT_LSN,
log_sys->next_checkpoint_lsn);
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET, mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
log_group_calc_lsn_offset( log_group_calc_lsn_offset(
...@@ -1664,8 +1650,10 @@ log_reset_first_header_and_checkpoint( ...@@ -1664,8 +1650,10 @@ log_reset_first_header_and_checkpoint(
lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE); lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE);
/* Write the label of ibbackup --restore */ /* Write the label of ibbackup --restore */
sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "ibbackup "); sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
ut_sprintf_timestamp((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP "ibbackup ");
ut_sprintf_timestamp(
(char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ strlen("ibbackup ")); + strlen("ibbackup "));
buf = hdr_buf + LOG_CHECKPOINT_1; buf = hdr_buf + LOG_CHECKPOINT_1;
...@@ -1773,7 +1761,7 @@ log_checkpoint( ...@@ -1773,7 +1761,7 @@ log_checkpoint(
write-ahead-logging algorithm ensures that the log has been flushed write-ahead-logging algorithm ensures that the log has been flushed
up to oldest_lsn. */ up to oldest_lsn. */
log_flush_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS); log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
mutex_enter(&(log_sys->mutex)); mutex_enter(&(log_sys->mutex));
...@@ -2466,7 +2454,7 @@ loop: ...@@ -2466,7 +2454,7 @@ loop:
mutex_exit(&(log_sys->mutex)); mutex_exit(&(log_sys->mutex));
log_flush_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS); log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
calc_new_limit = FALSE; calc_new_limit = FALSE;
...@@ -3104,8 +3092,8 @@ log_print( ...@@ -3104,8 +3092,8 @@ log_print(
"Last checkpoint at %lu %lu\n", "Last checkpoint at %lu %lu\n",
ut_dulint_get_high(log_sys->lsn), ut_dulint_get_high(log_sys->lsn),
ut_dulint_get_low(log_sys->lsn), ut_dulint_get_low(log_sys->lsn),
ut_dulint_get_high(log_sys->written_to_some_lsn), ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
ut_dulint_get_low(log_sys->written_to_some_lsn), ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
ut_dulint_get_high(log_sys->last_checkpoint_lsn), ut_dulint_get_high(log_sys->last_checkpoint_lsn),
ut_dulint_get_low(log_sys->last_checkpoint_lsn)); ut_dulint_get_low(log_sys->last_checkpoint_lsn));
......
...@@ -521,10 +521,11 @@ try_again: ...@@ -521,10 +521,11 @@ try_again:
} }
#endif #endif
#ifdef UNIV_NON_BUFFERED_IO #ifdef UNIV_NON_BUFFERED_IO
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { if (type == OS_LOG_FILE) {
/* Do not use unbuffered i/o to log files because /* Do not use unbuffered i/o to log files because
value 2 denotes that we do not flush the log at every to allow group commit to work when MySQL binlogging
commit, but only once per second */ is used we must separate log file write and log
file flush to disk. */
} else { } else {
if (srv_win_file_flush_method == if (srv_win_file_flush_method ==
SRV_WIN_IO_UNBUFFERED) { SRV_WIN_IO_UNBUFFERED) {
......
...@@ -1664,7 +1664,7 @@ row_drop_table_for_mysql_in_background( ...@@ -1664,7 +1664,7 @@ row_drop_table_for_mysql_in_background(
the InnoDB data dictionary get out-of-sync if the user runs the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */ with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
trx_commit_for_mysql(trx); trx_commit_for_mysql(trx);
......
...@@ -2812,8 +2812,7 @@ loop: ...@@ -2812,8 +2812,7 @@ loop:
at transaction commit */ at transaction commit */
srv_main_thread_op_info = (char*)"flushing log"; srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
log_flush_to_disk();
/* If there were less than 10 i/os during the /* If there were less than 10 i/os during the
one second sleep, we assume that there is free one second sleep, we assume that there is free
...@@ -2831,8 +2830,8 @@ loop: ...@@ -2831,8 +2830,8 @@ loop:
srv_main_thread_op_info = srv_main_thread_op_info =
(char*)"flushing log"; (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
log_flush_to_disk(); TRUE);
} }
if (srv_activity_count == old_activity_count) { if (srv_activity_count == old_activity_count) {
...@@ -2867,8 +2866,7 @@ loop: ...@@ -2867,8 +2866,7 @@ loop:
buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
srv_main_thread_op_info = (char*) "flushing log"; srv_main_thread_op_info = (char*) "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
log_flush_to_disk();
} }
/* We run a batch of insert buffer merge every 10 seconds, /* We run a batch of insert buffer merge every 10 seconds,
...@@ -2878,8 +2876,7 @@ loop: ...@@ -2878,8 +2876,7 @@ loop:
ibuf_contract_for_n_pages(TRUE, 5); ibuf_contract_for_n_pages(TRUE, 5);
srv_main_thread_op_info = (char*)"flushing log"; srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
log_flush_to_disk();
/* We run a full purge every 10 seconds, even if the server /* We run a full purge every 10 seconds, even if the server
were active */ were active */
...@@ -2903,8 +2900,8 @@ loop: ...@@ -2903,8 +2900,8 @@ loop:
if (difftime(current_time, last_flush_time) > 1) { if (difftime(current_time, last_flush_time) > 1) {
srv_main_thread_op_info = (char*) "flushing log"; srv_main_thread_op_info = (char*) "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
log_flush_to_disk(); TRUE);
last_flush_time = current_time; last_flush_time = current_time;
} }
} }
......
...@@ -89,6 +89,8 @@ trx_create( ...@@ -89,6 +89,8 @@ trx_create(
trx->check_foreigns = TRUE; trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE; trx->check_unique_secondary = TRUE;
trx->flush_log_later = FALSE;
trx->dict_operation = FALSE; trx->dict_operation = FALSE;
trx->mysql_thd = NULL; trx->mysql_thd = NULL;
...@@ -780,13 +782,26 @@ trx_commit_off_kernel( ...@@ -780,13 +782,26 @@ trx_commit_off_kernel(
/*-------------------------------------*/ /*-------------------------------------*/
/* Most MySQL users run with srv_flush_.. set to FALSE: */ /* Most MySQL users run with srv_flush_.. set to 0: */
if (srv_flush_log_at_trx_commit != 0) {
if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& srv_flush_log_at_trx_commit != 2
&& !trx->flush_log_later) {
/* Write the log to the log files AND flush
them to disk */
if (srv_flush_log_at_trx_commit) { log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
} else {
/* Write the log but do not flush it to disk */
log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP); log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
}
} }
trx->commit_lsn = lsn;
/*-------------------------------------*/ /*-------------------------------------*/
mutex_enter(&kernel_mutex); mutex_enter(&kernel_mutex);
...@@ -1467,6 +1482,31 @@ trx_commit_for_mysql( ...@@ -1467,6 +1482,31 @@ trx_commit_for_mysql(
return(0); return(0);
} }
/**************************************************************************
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE.

The flush is issued only when srv_flush_log_at_trx_commit == 1 and the
Unix flush method is not SRV_UNIX_NOSYNC; with any other setting this
function does nothing. The log is written and flushed up to
trx->commit_lsn, the lsn that was stored when the transaction was
committed. */
ulint
trx_commit_complete_for_mysql(
/*==========================*/
				/* out: 0 or error number */
	trx_t*	trx)		/* in: trx handle */
{
	ut_a(trx);

	if (srv_flush_log_at_trx_commit == 1
	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {

		/* Expose what this transaction is doing while it may be
		blocked in the flush */
		trx->op_info = (char *) "flushing log";

		/* Flush the log files to disk */

		log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);

		trx->op_info = (char *) "";
	}

	/* The function is declared to return ulint and documented as
	'0 or error number': always return a defined value instead of
	falling off the end of the function */

	return(0);
}
/************************************************************************** /**************************************************************************
Marks the latest SQL statement ended. */ Marks the latest SQL statement ended. */
......
...@@ -872,8 +872,7 @@ innobase_flush_logs(void) ...@@ -872,8 +872,7 @@ innobase_flush_logs(void)
DBUG_ENTER("innobase_flush_logs"); DBUG_ENTER("innobase_flush_logs");
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
log_flush_to_disk();
DBUG_RETURN(result); DBUG_RETURN(result);
} }
...@@ -920,7 +919,7 @@ Commits a transaction in an InnoDB database. */ ...@@ -920,7 +919,7 @@ Commits a transaction in an InnoDB database. */
int int
innobase_commit( innobase_commit(
/*============*/ /*============*/
/* out: 0 or error number */ /* out: 0 */
THD* thd, /* in: MySQL thread handle of the user for whom THD* thd, /* in: MySQL thread handle of the user for whom
the transaction should be committed */ the transaction should be committed */
void* trx_handle)/* in: InnoDB trx handle or void* trx_handle)/* in: InnoDB trx handle or
...@@ -928,7 +927,6 @@ innobase_commit( ...@@ -928,7 +927,6 @@ innobase_commit(
that the current SQL statement ended, and we should that the current SQL statement ended, and we should
mark the start of a new statement with a savepoint */ mark the start of a new statement with a savepoint */
{ {
int error = 0;
trx_t* trx; trx_t* trx;
DBUG_ENTER("innobase_commit"); DBUG_ENTER("innobase_commit");
...@@ -955,29 +953,27 @@ innobase_commit( ...@@ -955,29 +953,27 @@ innobase_commit(
innobase_release_stat_resources(trx); innobase_release_stat_resources(trx);
trx_mark_sql_stat_end(trx); trx_mark_sql_stat_end(trx);
#ifndef DBUG_OFF
if (error) {
DBUG_PRINT("error", ("error: %d", error));
}
#endif
/* Tell InnoDB server that there might be work for /* Tell InnoDB server that there might be work for
utility threads: */ utility threads: */
srv_active_wake_master_thread(); srv_active_wake_master_thread();
DBUG_RETURN(error); DBUG_RETURN(0);
} }
/********************************************************************* /*********************************************************************
This is called when MySQL writes the binlog entry for the current This is called when MySQL writes the binlog entry for the current
transaction. Writes to the InnoDB tablespace info which tells where the transaction. Writes to the InnoDB tablespace info which tells where the
MySQL binlog entry for the current transaction ended. Also commits the MySQL binlog entry for the current transaction ended. Also commits the
transaction inside InnoDB. */ transaction inside InnoDB but does NOT flush InnoDB log files to disk.
To flush you have to call innobase_flush_log_to_disk. We have separated
flushing to eliminate the bottleneck of LOCK_log in log.cc which disabled
InnoDB's group commit capability. */
int int
innobase_report_binlog_offset_and_commit( innobase_report_binlog_offset_and_commit(
/*=====================================*/ /*=====================================*/
/* out: 0 or error code */ /* out: 0 */
THD* thd, /* in: user thread */ THD* thd, /* in: user thread */
void* trx_handle, /* in: InnoDB trx handle */ void* trx_handle, /* in: InnoDB trx handle */
char* log_file_name, /* in: latest binlog file name */ char* log_file_name, /* in: latest binlog file name */
...@@ -993,7 +989,39 @@ innobase_report_binlog_offset_and_commit( ...@@ -993,7 +989,39 @@ innobase_report_binlog_offset_and_commit(
trx->mysql_log_file_name = log_file_name; trx->mysql_log_file_name = log_file_name;
trx->mysql_log_offset = (ib_longlong)end_offset; trx->mysql_log_offset = (ib_longlong)end_offset;
return(innobase_commit(thd, trx_handle)); trx->flush_log_later = TRUE;
innobase_commit(thd, trx_handle);
trx->flush_log_later = FALSE;
return(0);
}
/*********************************************************************
This is called after MySQL has written the binlog entry for the current
transaction. Flushes the InnoDB log files to disk if required. */
int
innobase_commit_complete(
/*=====================*/
/* out: 0 */
void* trx_handle) /* in: InnoDB trx handle */
{
trx_t* trx;
if (srv_flush_log_at_trx_commit == 0) {
return(0);
}
trx = (trx_t*)trx_handle;
ut_a(trx != NULL);
trx_commit_complete_for_mysql(trx);
return(0);
} }
/********************************************************************* /*********************************************************************
...@@ -3202,7 +3230,7 @@ ha_innobase::create( ...@@ -3202,7 +3230,7 @@ ha_innobase::create(
the InnoDB data dictionary get out-of-sync if the user runs the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */ with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
innobase_table = dict_table_get(norm_name, NULL); innobase_table = dict_table_get(norm_name, NULL);
...@@ -3277,7 +3305,7 @@ ha_innobase::delete_table( ...@@ -3277,7 +3305,7 @@ ha_innobase::delete_table(
the InnoDB data dictionary get out-of-sync if the user runs the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */ with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for /* Tell the InnoDB server that there might be work for
utility threads: */ utility threads: */
...@@ -3347,7 +3375,7 @@ innobase_drop_database( ...@@ -3347,7 +3375,7 @@ innobase_drop_database(
the InnoDB data dictionary get out-of-sync if the user runs the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */ with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for /* Tell the InnoDB server that there might be work for
utility threads: */ utility threads: */
...@@ -3419,7 +3447,7 @@ ha_innobase::rename_table( ...@@ -3419,7 +3447,7 @@ ha_innobase::rename_table(
the InnoDB data dictionary get out-of-sync if the user runs the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */ with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for /* Tell the InnoDB server that there might be work for
utility threads: */ utility threads: */
......
...@@ -211,6 +211,8 @@ int innobase_report_binlog_offset_and_commit( ...@@ -211,6 +211,8 @@ int innobase_report_binlog_offset_and_commit(
void* trx_handle, void* trx_handle,
char* log_file_name, char* log_file_name,
my_off_t end_offset); my_off_t end_offset);
int innobase_commit_complete(
void* trx_handle);
int innobase_rollback(THD *thd, void* trx_handle); int innobase_rollback(THD *thd, void* trx_handle);
int innobase_close_connection(THD *thd); int innobase_close_connection(THD *thd);
int innobase_drop_database(char *path); int innobase_drop_database(char *path);
......
...@@ -243,6 +243,9 @@ int ha_autocommit_or_rollback(THD *thd, int error) ...@@ -243,6 +243,9 @@ int ha_autocommit_or_rollback(THD *thd, int error)
replication. This function also calls the commit of the table replication. This function also calls the commit of the table
handler, because the order of transactions in the log of the table handler, because the order of transactions in the log of the table
handler must be the same as in the binlog. handler must be the same as in the binlog.
NOTE that to eliminate the bottleneck of the group commit, we do not
flush the handler log files here, but only later in a call of
ha_commit_complete().
arguments: arguments:
thd: the thread handle of the current connection thd: the thread handle of the current connection
...@@ -269,12 +272,37 @@ int ha_report_binlog_offset_and_commit(THD *thd, ...@@ -269,12 +272,37 @@ int ha_report_binlog_offset_and_commit(THD *thd,
my_error(ER_ERROR_DURING_COMMIT, MYF(0), error); my_error(ER_ERROR_DURING_COMMIT, MYF(0), error);
error=1; error=1;
} }
trans->innodb_active_trans=0;
} }
#endif #endif
return error; return error;
} }
/*
  Second step of a commit that was started with
  ha_report_binlog_offset_and_commit(): this is where the handler log
  files get flushed, unless the my.cnf settings make the flush unnecessary.
  Call it only after LOCK_log has been released in log.cc, so that the
  flush does not become a bottleneck for the group commit.

  arguments:
  thd:           the thread handle of the current connection
  return value:  always 0
*/

int ha_commit_complete(THD *thd)
{
#ifdef HAVE_INNOBASE_DB
  THD_TRANS *all_trans= &thd->transaction.all;

  if (all_trans->innobase_tid)
  {
    /* The connection has an InnoDB transaction handle: let InnoDB finish
    the commit (innobase_commit_complete() is expected to do the actual
    log flush; its return value is deliberately not propagated) */
    innobase_commit_complete(all_trans->innobase_tid);

    /* The transaction is no longer marked as active in InnoDB */
    all_trans->innodb_active_trans= 0;
  }
#endif
  return 0;
}
/* /*
This function should be called when MySQL sends rows of a SELECT result set This function should be called when MySQL sends rows of a SELECT result set
or the EOF mark to the client. It releases a possible adaptive hash index or the EOF mark to the client. It releases a possible adaptive hash index
......
...@@ -372,6 +372,7 @@ void ha_resize_key_cache(void); ...@@ -372,6 +372,7 @@ void ha_resize_key_cache(void);
int ha_start_stmt(THD *thd); int ha_start_stmt(THD *thd);
int ha_report_binlog_offset_and_commit(THD *thd, char *log_file_name, int ha_report_binlog_offset_and_commit(THD *thd, char *log_file_name,
my_off_t end_offset); my_off_t end_offset);
int ha_commit_complete(THD *thd);
int ha_release_temporary_latches(THD *thd); int ha_release_temporary_latches(THD *thd);
int ha_commit_trans(THD *thd, THD_TRANS *trans); int ha_commit_trans(THD *thd, THD_TRANS *trans);
int ha_rollback_trans(THD *thd, THD_TRANS *trans); int ha_rollback_trans(THD *thd, THD_TRANS *trans);
......
...@@ -1033,6 +1033,8 @@ bool MYSQL_LOG::write(THD *thd,enum enum_server_command command, ...@@ -1033,6 +1033,8 @@ bool MYSQL_LOG::write(THD *thd,enum enum_server_command command,
bool MYSQL_LOG::write(Log_event* event_info) bool MYSQL_LOG::write(Log_event* event_info)
{ {
THD *thd=event_info->thd;
bool called_handler_commit=0;
bool error=0; bool error=0;
DBUG_ENTER("MYSQL_LOG::write(event)"); DBUG_ENTER("MYSQL_LOG::write(event)");
...@@ -1047,7 +1049,6 @@ bool MYSQL_LOG::write(Log_event* event_info) ...@@ -1047,7 +1049,6 @@ bool MYSQL_LOG::write(Log_event* event_info)
if (is_open()) if (is_open())
{ {
bool should_rotate = 0; bool should_rotate = 0;
THD *thd=event_info->thd;
const char *local_db = event_info->get_db(); const char *local_db = event_info->get_db();
#ifdef USING_TRANSACTIONS #ifdef USING_TRANSACTIONS
IO_CACHE *file = ((event_info->get_cache_stmt()) ? IO_CACHE *file = ((event_info->get_cache_stmt()) ?
...@@ -1147,6 +1148,7 @@ bool MYSQL_LOG::write(Log_event* event_info) ...@@ -1147,6 +1148,7 @@ bool MYSQL_LOG::write(Log_event* event_info)
{ {
error = ha_report_binlog_offset_and_commit(thd, log_file_name, error = ha_report_binlog_offset_and_commit(thd, log_file_name,
file->pos_in_file); file->pos_in_file);
called_handler_commit=1;
} }
should_rotate= (my_b_tell(file) >= (my_off_t) max_binlog_size); should_rotate= (my_b_tell(file) >= (my_off_t) max_binlog_size);
...@@ -1172,6 +1174,15 @@ err: ...@@ -1172,6 +1174,15 @@ err:
} }
pthread_mutex_unlock(&LOCK_log); pthread_mutex_unlock(&LOCK_log);
/* Flush the transactional handler log file now that we have released
LOCK_log; the flush is placed here to eliminate the bottleneck on the
group commit */
if (called_handler_commit) {
ha_commit_complete(thd);
}
DBUG_RETURN(error); DBUG_RETURN(error);
} }
...@@ -1277,6 +1288,13 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache) ...@@ -1277,6 +1288,13 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache)
} }
VOID(pthread_mutex_unlock(&LOCK_log)); VOID(pthread_mutex_unlock(&LOCK_log));
/* Flush the transactional handler log file now that we have released
LOCK_log; the flush is placed here to eliminate the bottleneck on the
group commit */
ha_commit_complete(thd);
DBUG_RETURN(0); DBUG_RETURN(0);
err: err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment