Commit 626d26f3 authored by inaam's avatar inaam

branches/innodb+ rb://210

Introduce a new mutex to protect flush_list.
Redesign mtr_commit() in a way that log_sys mutex is not held while all
mtr_memos are popped and is released just after the modified blocks are
inserted into the flush_list. This should reduce contention on log_sys
mutex.

Approved by: Heikki
parent d47d8c55
......@@ -153,12 +153,12 @@ list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would noramlly be the case.
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for read-ahead mechanism
of pages, and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the
of the LRU list, we make sure that most of the buf_pool stays in the
main memory, undisturbed.
end of the LRU list, we make sure that most of the buf_pool stays
in the main memory, undisturbed.
The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
......@@ -172,6 +172,7 @@ The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by flush_list_mutex.
The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
......@@ -981,6 +982,7 @@ buf_pool_init(void)
/* 2. Initialize flushing fields
-------------------------------- */
mutex_create(&buf_pool->flush_list_mutex, SYNC_BUF_FLUSH_LIST);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
buf_pool->no_flush[i] = os_event_create(NULL);
}
......@@ -1407,6 +1409,7 @@ buf_pool_page_hash_rebuild(void)
buf_page_address_fold(b->space, b->offset), b);
}
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
......@@ -1434,6 +1437,7 @@ buf_pool_page_hash_rebuild(void)
}
}
buf_flush_list_mutex_exit();
buf_pool_mutex_exit();
}
......@@ -3534,11 +3538,6 @@ buf_validate(void)
}
n_lru++;
if (block->page.oldest_modification > 0) {
n_flush++;
}
break;
case BUF_BLOCK_NOT_USED:
......@@ -3577,6 +3576,10 @@ buf_validate(void)
ut_error;
break;
}
/* It is OK to read oldest_modification here because
we have acquired buf_pool_zip_mutex above which acts
as the 'block->mutex' for these bpages. */
ut_a(!b->oldest_modification);
ut_a(buf_page_hash_get(b->space, b->offset) == b);
......@@ -3584,23 +3587,23 @@ buf_validate(void)
n_zip++;
}
/* Check dirty compressed-only blocks. */
/* Check dirty blocks. */
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
ut_a(b->oldest_modification);
n_flush++;
switch (buf_page_get_state(b)) {
case BUF_BLOCK_ZIP_DIRTY:
ut_a(b->oldest_modification);
n_lru++;
n_flush++;
n_zip++;
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
case BUF_FLUSH_LRU:
......@@ -3633,6 +3636,10 @@ buf_validate(void)
ut_a(buf_page_hash_get(b->space, b->offset) == b);
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
buf_flush_list_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
if (n_lru + n_free > buf_pool->curr_size + n_zip) {
......@@ -3649,7 +3656,6 @@ buf_validate(void)
(ulong) n_free);
ut_error;
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
......@@ -3690,6 +3696,7 @@ buf_print(void)
counts = mem_alloc(sizeof(ulint) * size);
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
fprintf(stderr,
"buf_pool size %lu\n"
......@@ -3716,6 +3723,8 @@ buf_print(void)
(ulong) buf_pool->stat.n_pages_created,
(ulong) buf_pool->stat.n_pages_written);
buf_flush_list_mutex_exit();
/* Count the number of blocks belonging to each index in the buffer */
n_found = 0;
......@@ -3839,6 +3848,7 @@ buf_get_latched_pages_number(void)
}
}
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
......@@ -3864,6 +3874,7 @@ buf_get_latched_pages_number(void)
}
}
buf_flush_list_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
buf_pool_mutex_exit();
......@@ -3896,16 +3907,13 @@ buf_get_modified_ratio_pct(void)
{
ulint ratio;
buf_pool_mutex_enter();
/* This is for heuristics. No need to grab any mutex here. */
ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+ UT_LIST_GET_LEN(buf_pool->free));
/* 1 + is there to avoid division by zero */
buf_pool_mutex_exit();
return(ratio);
}
......@@ -3924,6 +3932,7 @@ buf_print_io(
ut_ad(buf_pool);
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
fprintf(file,
"Buffer pool size %lu\n"
......@@ -3945,6 +3954,8 @@ buf_print_io(
+ buf_pool->init_flush[BUF_FLUSH_LIST],
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
buf_flush_list_mutex_exit();
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
buf_pool->last_printout_time);
......
This diff is collapsed.
......@@ -2018,6 +2018,7 @@ buf_LRU_print(void)
while (bpage != NULL) {
mutex_enter(buf_page_get_mutex(bpage));
fprintf(stderr, "BLOCK space %lu page %lu ",
(ulong) buf_page_get_space(bpage),
(ulong) buf_page_get_page_no(bpage));
......@@ -2066,6 +2067,7 @@ buf_LRU_print(void)
break;
}
mutex_exit(buf_page_get_mutex(bpage));
bpage = UT_LIST_GET_NEXT(LRU, bpage);
}
......
......@@ -347,9 +347,8 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
mtr_t* mtr); /*!< in: mtr */
/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
......@@ -1102,8 +1101,9 @@ struct buf_page_struct{
UT_LIST_NODE_T(buf_page_t) list;
/*!< based on state, this is a
list node, protected only by
buf_pool_mutex, in one of the
list node, protected either by
buf_pool_mutex or by
flush_list_mutex, in one of the
following lists in buf_pool:
- BUF_BLOCK_NOT_USED: free
......@@ -1112,6 +1112,12 @@ struct buf_page_struct{
- BUF_BLOCK_ZIP_PAGE: zip_clean
- BUF_BLOCK_ZIP_FREE: zip_free[]
If bpage is part of flush_list
then the node pointers are
covered by flush_list_mutex.
Otherwise these pointers are
protected by buf_pool_mutex.
The contents of the list node
are undefined if !in_flush_list
&& state == BUF_BLOCK_FILE_PAGE,
......@@ -1122,10 +1128,15 @@ struct buf_page_struct{
#ifdef UNIV_DEBUG
ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
when buf_pool_mutex is free, the
when flush_list_mutex is free, the
following should hold: in_flush_list
== (state == BUF_BLOCK_FILE_PAGE
|| state == BUF_BLOCK_ZIP_DIRTY) */
|| state == BUF_BLOCK_ZIP_DIRTY)
Writes to this field must be
covered by both block->mutex
and flush_list_mutex. Hence
reads can happen while holding
any one of the two mutexes */
ibool in_free_list; /*!< TRUE if in buf_pool->free; when
buf_pool_mutex is free, the following
should hold: in_free_list
......@@ -1135,7 +1146,8 @@ struct buf_page_struct{
/*!< log sequence number of
the youngest modification to
this block, zero if not
modified */
modified. Protected by block
mutex */
ib_uint64_t oldest_modification;
/*!< log sequence number of
the START of the log entry
......@@ -1143,7 +1155,12 @@ struct buf_page_struct{
modification to this block
which has not yet been flushed
on disk; zero if all
modifications are on disk */
modifications are on disk.
Writes to this field must be
covered by both block->mutex
and flush_list_mutex. Hence
reads can happen while holding
any one of the two mutexes */
/* @} */
/** @name LRU replacement algorithm fields
These fields are protected by buf_pool_mutex only (not
......@@ -1375,6 +1392,13 @@ struct buf_pool_struct{
/* @{ */
mutex_t flush_list_mutex;/*!< mutex protecting the
flush list access. This mutex
protects flush_list, flush_rbt
and bpage::list pointers when
the bpage is on flush_list. It
also protects writes to
bpage::oldest_modification */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
/*!< base node of the modified block
list */
......@@ -1400,7 +1424,8 @@ struct buf_pool_struct{
also be on the flush_list.
This tree is relevant only in
recovery and is set to NULL
once the recovery is over. */
once the recovery is over.
Protected by flush_list_mutex */
ulint freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
......@@ -1492,6 +1517,18 @@ Use these instead of accessing buf_pool_mutex directly. */
mutex_enter(&buf_pool_mutex); \
} while (0)
/** Test whether buf_pool->flush_list_mutex is owned; evaluates to the
result of mutex_own() on that mutex. */
#define buf_flush_list_mutex_own()				\
	mutex_own(&buf_pool->flush_list_mutex)

/** Acquire buf_pool->flush_list_mutex. Wrapped in do/while (0) so the
macro behaves as one statement wherever it is used. */
#define buf_flush_list_mutex_enter()				\
	do {							\
		mutex_enter(&buf_pool->flush_list_mutex);	\
	} while (0)

/** Release buf_pool->flush_list_mutex. Wrapped in do/while (0) so the
macro behaves as one statement wherever it is used. */
#define buf_flush_list_mutex_exit()				\
	do {							\
		mutex_exit(&buf_pool->flush_list_mutex);	\
	} while (0)
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/** Flag to forbid the release of the buffer pool mutex.
Protected by buf_pool_mutex. */
......
......@@ -121,7 +121,7 @@ buf_pool_get_oldest_modification(void)
buf_page_t* bpage;
ib_uint64_t lsn;
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
......@@ -132,7 +132,7 @@ buf_pool_get_oldest_modification(void)
lsn = bpage->oldest_modification;
}
buf_pool_mutex_exit();
buf_flush_list_mutex_exit();
/* The returned answer may be out of date: the flush_list can
change after the mutex has been released. */
......@@ -1018,21 +1018,14 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
mtr_t* mtr) /*!< in: mtr */
{
ut_ad(block);
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_a(block->page.buf_fix_count > 0);
if (rw_latch == RW_X_LATCH && mtr->modifications) {
buf_pool_mutex_enter();
buf_flush_note_modification(block, mtr);
buf_pool_mutex_exit();
}
mutex_enter(&block->mutex);
#ifdef UNIV_SYNC_DEBUG
......
......@@ -33,7 +33,8 @@ UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
buf_block_t* block); /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
......@@ -42,7 +43,8 @@ UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
buf_block_t* block); /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
This function should be called at a mini-transaction commit, if a page was
......@@ -61,24 +63,26 @@ buf_flush_note_modification(
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(buf_pool_mutex_own());
ut_ad(!buf_pool_mutex_own());
ut_ad(!buf_flush_list_mutex_own());
ut_ad(mtr->start_lsn != 0);
ut_ad(mtr->modifications);
mutex_enter(&block->mutex);
ut_ad(block->page.newest_modification <= mtr->end_lsn);
block->page.newest_modification = mtr->end_lsn;
if (!block->page.oldest_modification) {
block->page.oldest_modification = mtr->start_lsn;
ut_ad(block->page.oldest_modification != 0);
buf_flush_insert_into_flush_list(block);
buf_flush_insert_into_flush_list(block, mtr->start_lsn);
} else {
ut_ad(block->page.oldest_modification <= mtr->start_lsn);
}
mutex_exit(&block->mutex);
++srv_buf_pool_write_requests;
}
......@@ -101,23 +105,22 @@ buf_flush_recv_note_modification(
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
buf_pool_mutex_enter();
ut_ad(!buf_pool_mutex_own());
ut_ad(!buf_flush_list_mutex_own());
ut_ad(start_lsn != 0);
ut_ad(block->page.newest_modification <= end_lsn);
mutex_enter(&block->mutex);
block->page.newest_modification = end_lsn;
if (!block->page.oldest_modification) {
block->page.oldest_modification = start_lsn;
ut_ad(block->page.oldest_modification != 0);
buf_flush_insert_sorted_into_flush_list(block);
buf_flush_insert_sorted_into_flush_list(block, start_lsn);
} else {
ut_ad(block->page.oldest_modification <= start_lsn);
}
buf_pool_mutex_exit();
mutex_exit(&block->mutex);
}
#endif /* !UNIV_HOTBACKUP */
......@@ -475,8 +475,9 @@ or row lock! */
SYNC_SEARCH_SYS, as memory allocation
can call routines there! Otherwise
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
#define SYNC_BUF_POOL 150 /* Buffer pool mutex */
#define SYNC_BUF_BLOCK 149 /* Block mutex */
#define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_THR_LOCAL 133
......
......@@ -30,6 +30,7 @@ Created 11/26/1995 Heikki Tuuri
#endif
#include "buf0buf.h"
#include "buf0flu.h"
#include "page0types.h"
#include "mtr0log.h"
#include "log0log.h"
......@@ -38,7 +39,7 @@ Created 11/26/1995 Heikki Tuuri
# include "log0recv.h"
/*****************************************************************//**
Releases the item in the slot given. */
UNIV_INLINE
static
void
mtr_memo_slot_release(
/*==================*/
......@@ -48,14 +49,19 @@ mtr_memo_slot_release(
void* object;
ulint type;
ut_ad(mtr && slot);
ut_ad(mtr);
ut_ad(slot);
#ifndef UNIV_DEBUG
UT_NOT_USED(mtr);
#endif /* UNIV_DEBUG */
object = slot->object;
type = slot->type;
if (UNIV_LIKELY(object != NULL)) {
if (type <= MTR_MEMO_BUF_FIX) {
buf_page_release((buf_block_t*)object, type, mtr);
buf_page_release((buf_block_t*)object, type);
} else if (type == MTR_MEMO_S_LOCK) {
rw_lock_s_unlock((rw_lock_t*)object);
#ifdef UNIV_DEBUG
......@@ -73,13 +79,10 @@ mtr_memo_slot_release(
}
/**********************************************************//**
Releases the mlocks and other objects stored in an mtr memo. They are released
in the order opposite to which they were pushed to the memo. NOTE! It is
essential that the x-rw-lock on a modified buffer page is not released before
buf_page_note_modification is called for that page! Otherwise, some thread
might race to modify it, and the flush list sort order on lsn would be
destroyed. */
UNIV_INLINE
Releases the mlocks and other objects stored in an mtr memo.
They are released in the order opposite to which they were pushed
to the memo. */
static
void
mtr_memo_pop_all(
/*=============*/
......@@ -105,6 +108,58 @@ mtr_memo_pop_all(
}
}
/*****************************************************************//**
Notes the modification made by a mini-transaction on the buffer block
stored in the given memo slot. Only a slot that holds a non-NULL object
of type MTR_MEMO_PAGE_X_FIX is acted upon: for such a slot
buf_flush_note_modification() is called on the block. Any other slot is
left untouched. Nothing is released by this function. */
static
void
mtr_memo_slot_note_modification(
/*============================*/
	mtr_t*			mtr,	/*!< in: mtr; must have
					mtr->modifications set */
	mtr_memo_slot_t*	slot)	/*!< in: memo slot */
{
	ut_ad(mtr);
	/* Same sanity check as in mtr_memo_slot_release(): the slot is
	dereferenced below. */
	ut_ad(slot);
	ut_ad(mtr->magic_n == MTR_MAGIC_N);
	ut_ad(mtr->modifications);

	if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) {
		buf_flush_note_modification((buf_block_t*) slot->object, mtr);
	}
}
/**********************************************************//**
Notes the modifications of a committing mini-transaction on the buffer
pages recorded in its memo. The memo is walked from the most recently
pushed slot back to the first one, and
mtr_memo_slot_note_modification() is invoked on every slot. No latch
or buffer fix is released here. NOTE! It is essential that the
x-rw-lock on a modified buffer page is not released before
buf_flush_note_modification is called for that page! Otherwise, some
thread might race to modify it, and the flush list sort order on lsn
would be destroyed. */
static
void
mtr_memo_note_modifications(
/*========================*/
	mtr_t*	mtr)	/*!< in: mtr */
{
	dyn_array_t*	slots;
	ulint		pos;

	ut_ad(mtr);
	ut_ad(mtr->magic_n == MTR_MAGIC_N);
	/* Currently only used in commit */
	ut_ad(mtr->state == MTR_COMMITTING);

	slots = &mtr->memo;

	/* Start one past the last slot and step backwards one slot at
	a time until the beginning of the memo is reached. */
	for (pos = dyn_array_get_data_size(slots); pos > 0; ) {
		mtr_memo_slot_t*	cur;

		pos -= sizeof(mtr_memo_slot_t);
		cur = dyn_array_get_element(slots, pos);

		mtr_memo_slot_note_modification(mtr, cur);
	}
}
/************************************************************//**
Writes the contents of a mini-transaction log, if any, to the database log. */
static
......@@ -137,7 +192,9 @@ mtr_log_reserve_and_write(
&mtr->start_lsn);
if (mtr->end_lsn) {
return;
/* Success. We have the log mutex.
Add pages to flush list and exit */
goto func_exit;
}
}
......@@ -161,6 +218,13 @@ mtr_log_reserve_and_write(
}
mtr->end_lsn = log_close();
func_exit:
if (mtr->modifications) {
mtr_memo_note_modifications(mtr);
}
log_release();
}
#endif /* !UNIV_HOTBACKUP */
......@@ -172,10 +236,6 @@ mtr_commit(
/*=======*/
mtr_t* mtr) /*!< in: mini-transaction */
{
#ifndef UNIV_HOTBACKUP
ibool write_log;
#endif /* !UNIV_HOTBACKUP */
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
ut_ad(mtr->state == MTR_ACTIVE);
......@@ -184,25 +244,12 @@ mtr_commit(
#ifndef UNIV_HOTBACKUP
/* This is a dirty read, for debugging. */
ut_ad(!recv_no_log_write);
write_log = mtr->modifications && mtr->n_log_recs;
if (write_log) {
if (mtr->modifications && mtr->n_log_recs) {
mtr_log_reserve_and_write(mtr);
}
/* We first update the modification info to buffer pages, and only
after that release the log mutex: this guarantees that when the log
mutex is free, all buffer pages contain an up-to-date info of their
modifications. This fact is used in making a checkpoint when we look
at the oldest modification of any page in the buffer pool. It is also
required when we insert modified buffer pages in to the flush list
which must be sorted on oldest_modification. */
mtr_memo_pop_all(mtr);
if (write_log) {
log_release();
}
#endif /* !UNIV_HOTBACKUP */
ut_d(mtr->state = MTR_COMMITTED);
......@@ -241,6 +288,10 @@ mtr_rollback_to_savepoint(
slot = dyn_array_get_element(memo, offset);
ut_ad(slot->type != MTR_MEMO_MODIFY);
/* We do not call mtr_memo_slot_note_modification()
because there MUST be no changes made to the buffer
pages after the savepoint */
mtr_memo_slot_release(mtr, slot);
}
}
......@@ -272,7 +323,10 @@ mtr_memo_release(
slot = dyn_array_get_element(memo, offset);
if ((object == slot->object) && (type == slot->type)) {
if (object == slot->object && type == slot->type) {
if (mtr->modifications) {
mtr_memo_slot_note_modification(mtr, slot);
}
mtr_memo_slot_release(mtr, slot);
......
......@@ -1092,6 +1092,7 @@ sync_thread_add_level(
case SYNC_TRX_SYS_HEADER:
case SYNC_FILE_FORMAT_TAG:
case SYNC_DOUBLEWRITE:
case SYNC_BUF_FLUSH_LIST:
case SYNC_BUF_POOL:
case SYNC_SEARCH_SYS:
case SYNC_SEARCH_SYS_CONF:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment