Commit 90c00c9e authored by inaam's avatar inaam

branches/innodb+ rb://48

This patch is to improve recovery performance in InnoDB+.
It introduces a red-black tree for sorted insertion into
the flush_list and a couple of other tweaks. More can be found
at: https://svn.innodb.com/innobase/Recovery_Performance_Improvements

Reviewed by: Marko
parent c1d46655
...@@ -37,6 +37,142 @@ buf_flush_validate_low(void); ...@@ -37,6 +37,142 @@ buf_flush_validate_low(void);
/* out: TRUE if ok */ /* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/**********************************************************************
Adds a block to buf_pool->flush_rbt and reports which block precedes it
in tree order. The tree is ordered by the
<oldest_modification, space, offset> key. The caller must hold the
buffer pool mutex. */
static
buf_page_t*
buf_flush_insert_in_flush_rbt(
/*==========================*/
					/* out: the block immediately
					before bpage in tree order, or
					NULL if bpage is the first
					node */
	buf_page_t*	bpage)		/* in: bpage to be inserted. */
{
	const ib_rbt_node_t*	new_node;
	const ib_rbt_node_t*	pred_node;
	buf_page_t*		pred;

	ut_ad(buf_pool_mutex_own());

	/* The address of the block pointer is passed as both the key
	and the value of the new node. */
	new_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
	ut_a(new_node != NULL);

	/* Look up the node that sorts just before the new one. */
	pred_node = rbt_prev(buf_pool->flush_rbt, new_node);

	if (pred_node == NULL) {
		/* The new node has no predecessor. */
		return(NULL);
	}

	pred = *rbt_value(buf_page_t*, pred_node);
	ut_a(pred != NULL);

	return(pred);
}
/*************************************************************
Delete a bpage from the flush_rbt. The caller must hold the buffer
pool mutex. In debug builds it is asserted that the block was actually
found in the tree and removed. */
static
void
buf_flush_delete_from_flush_rbt(
/*============================*/
	buf_page_t*	bpage)	/* in: bpage to be removed. */
{
	/* The result of rbt_delete() is only inspected by the debug
	assertion below, so the variable exists only in debug builds.
	This avoids a set-but-unused variable in non-debug builds. */
#ifdef UNIV_DEBUG
	ibool	ret = FALSE;
#endif /* UNIV_DEBUG */

	ut_ad(buf_pool_mutex_own());

#ifdef UNIV_DEBUG
	ret =
#endif /* UNIV_DEBUG */
	rbt_delete(buf_pool->flush_rbt, &bpage);

	ut_ad(ret);
}
/*********************************************************************
Compare two modified blocks in the buffer pool. The key for comparison
is:
key = <oldest_modification, space, offset>
This comparison is used to maintain ordering of blocks in the
buf_pool->flush_rbt.
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
identify the blocks.
Both arguments point to a buf_page_t* stored in the tree, not to the
buf_page_t itself. */
static
int
buf_flush_block_cmp(
/*================*/
				/* out:
				 < 0 if b2 < b1,
				 0 if b2 == b1,
				 > 0 if b2 > b1 */
	const void*	p1,	/* in: block1 */
	const void*	p2)	/* in: block2 */
{
	const buf_page_t*	b1;
	const buf_page_t*	b2;

	ut_ad(p1 != NULL);
	ut_ad(p2 != NULL);

	b1 = *(const buf_page_t* const*) p1;
	b2 = *(const buf_page_t* const*) p2;

	ut_ad(b1 != NULL);
	ut_ad(b2 != NULL);

	ut_ad(b1->in_flush_list);
	ut_ad(b2->in_flush_list);

	if (b2->oldest_modification
	    > b1->oldest_modification) {
		return(1);
	}

	if (b2->oldest_modification
	    < b1->oldest_modification) {
		return(-1);
	}

	/* If oldest_modification is same then decide on the space.
	The fields are compared explicitly rather than subtracted:
	casting the difference of two unsigned values to int yields
	the wrong sign once the difference exceeds INT_MAX, which
	would make the ordering inconsistent. */
	if (b2->space != b1->space) {
		return(b2->space > b1->space ? 1 : -1);
	}

	/* Or else decide ordering on the offset field. */
	if (b2->offset != b1->offset) {
		return(b2->offset > b1->offset ? 1 : -1);
	}

	return(0);
}
/************************************************************************
Initialize the red-black tree to speed up insertions into the flush_list
during recovery process. Should be called at the start of recovery
process before any page has been read/written. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
buf_pool_mutex_enter();
/* Create red black tree for speedy insertions in flush list. */
buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
buf_flush_block_cmp);
buf_pool_mutex_exit();
}
/************************************************************************
Frees up the red-black tree. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
buf_pool_mutex_enter();
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
rbt_free(buf_pool->flush_rbt);
buf_pool->flush_rbt = NULL;
buf_pool_mutex_exit();
}
/************************************************************************ /************************************************************************
Inserts a modified block into the flush list. */ Inserts a modified block into the flush list. */
UNIV_INTERN UNIV_INTERN
...@@ -50,6 +186,13 @@ buf_flush_insert_into_flush_list( ...@@ -50,6 +186,13 @@ buf_flush_insert_into_flush_list(
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
<= bpage->oldest_modification)); <= bpage->oldest_modification));
/* If we are in the recovery then we need to update the flush
red-black tree as well. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
buf_flush_insert_sorted_into_flush_list(bpage);
return;
}
switch (buf_page_get_state(bpage)) { switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_PAGE:
mutex_enter(&buf_pool_zip_mutex); mutex_enter(&buf_pool_zip_mutex);
...@@ -120,13 +263,28 @@ buf_flush_insert_sorted_into_flush_list( ...@@ -120,13 +263,28 @@ buf_flush_insert_sorted_into_flush_list(
} }
prev_b = NULL; prev_b = NULL;
/* For the most part when this function is called the flush_rbt
should not be NULL. In a very rare boundary case it is possible
that the flush_rbt has already been freed by the recovery thread
before the last page was hooked up in the flush_list by the
io-handler thread. In that case we'll just do a simple
linear search in the else block. */
if (buf_pool->flush_rbt) {
prev_b = buf_flush_insert_in_flush_rbt(bpage);
} else {
b = UT_LIST_GET_FIRST(buf_pool->flush_list); b = UT_LIST_GET_FIRST(buf_pool->flush_list);
while (b && b->oldest_modification > bpage->oldest_modification) { while (b && b->oldest_modification
> bpage->oldest_modification) {
ut_ad(b->in_flush_list); ut_ad(b->in_flush_list);
prev_b = b; prev_b = b;
b = UT_LIST_GET_NEXT(list, b); b = UT_LIST_GET_NEXT(list, b);
} }
}
if (prev_b == NULL) { if (prev_b == NULL) {
UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage); UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
...@@ -242,6 +400,11 @@ buf_flush_remove( ...@@ -242,6 +400,11 @@ buf_flush_remove(
break; break;
} }
/* If the flush_rbt is active then delete from it as well. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
buf_flush_delete_from_flush_rbt(bpage);
}
bpage->oldest_modification = 0; bpage->oldest_modification = 0;
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list)); ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
...@@ -1275,6 +1438,15 @@ buf_flush_validate_low(void) ...@@ -1275,6 +1438,15 @@ buf_flush_validate_low(void)
ut_a(buf_page_in_file(bpage)); ut_a(buf_page_in_file(bpage));
ut_a(om > 0); ut_a(om > 0);
/* If we are in recovery mode i.e.: flush_rbt != NULL
then each block in the flush_list must also be present
in the flush_rbt. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
ut_a(*rbt_value(buf_page_t*,
rbt_lookup(buf_pool->flush_rbt, &bpage))
== bpage);
}
bpage = UT_LIST_GET_NEXT(list, bpage); bpage = UT_LIST_GET_NEXT(list, bpage);
ut_a(!bpage || om >= bpage->oldest_modification); ut_a(!bpage || om >= bpage->oldest_modification);
......
...@@ -745,14 +745,14 @@ buf_read_recv_pages( ...@@ -745,14 +745,14 @@ buf_read_recv_pages(
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads(); os_aio_simulated_wake_handler_threads();
os_thread_sleep(500000); os_thread_sleep(10000);
count++; count++;
if (count > 100) { if (count > 1000) {
fprintf(stderr, fprintf(stderr,
"InnoDB: Error: InnoDB has waited for" "InnoDB: Error: InnoDB has waited for"
" 50 seconds for pending\n" " 10 seconds for pending\n"
"InnoDB: reads to the buffer pool to" "InnoDB: reads to the buffer pool to"
" be finished.\n" " be finished.\n"
"InnoDB: Number of pending reads %lu," "InnoDB: Number of pending reads %lu,"
......
...@@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri ...@@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
#include "sync0rw.h" #include "sync0rw.h"
#include "hash0hash.h" #include "hash0hash.h"
#include "ut0byte.h" #include "ut0byte.h"
#include "ut0rbt.h"
#include "os0proc.h" #include "os0proc.h"
#include "page0types.h" #include "page0types.h"
...@@ -1285,6 +1286,19 @@ struct buf_pool_struct{ ...@@ -1285,6 +1286,19 @@ struct buf_pool_struct{
/* this is in the set state when there /* this is in the set state when there
is no flush batch of the given type is no flush batch of the given type
running */ running */
ib_rbt_t* flush_rbt; /* a red-black tree is used
exclusively during recovery to
speed up insertions in the
flush_list. This tree contains
blocks in order of
oldest_modification LSN and is
kept in sync with the
flush_list.
Each member of the tree MUST
also be on the flush_list.
This tree is relevant only in
recovery and is set to NULL
once the recovery is over. */
ulint ulint_clock; /* a sequence number used to count ulint ulint_clock; /* a sequence number used to count
time. NOTE! This counter wraps time. NOTE! This counter wraps
around at 4 billion (if ulint == around at 4 billion (if ulint ==
......
...@@ -126,6 +126,22 @@ buf_flush_validate(void); ...@@ -126,6 +126,22 @@ buf_flush_validate(void);
/* out: TRUE if ok */ /* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/************************************************************************
Initialize the red-black tree to speed up insertions into the flush_list
during recovery process. Should be called at the start of recovery
process before any page has been read/written. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void);
/*==========================*/
/************************************************************************
Frees up the red-black tree. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void);
/*==========================*/
/* When buf_flush_free_margin is called, it tries to make this many blocks /* When buf_flush_free_margin is called, it tries to make this many blocks
available to replacement in the free list and at the end of the LRU list (to available to replacement in the free list and at the end of the LRU list (to
make sure that a read-ahead batch can be read efficiently in a single make sure that a read-ahead batch can be read efficiently in a single
......
...@@ -101,7 +101,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no = 0; ...@@ -101,7 +101,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no = 0;
/* This many frames must be left free in the buffer pool when we scan /* This many frames must be left free in the buffer pool when we scan
the log and store the scanned log records in the buffer pool: we will the log and store the scanned log records in the buffer pool: we will
use these free frames to read in pages when we start applying the use these free frames to read in pages when we start applying the
log records to the database. */ log records to the database.
This is the default value. If the actual size of the buffer pool is
larger than 10 MB we'll set this value to 512. */
UNIV_INTERN ulint recv_n_pool_free_frames = 256; UNIV_INTERN ulint recv_n_pool_free_frames = 256;
...@@ -156,6 +158,12 @@ recv_sys_init( ...@@ -156,6 +158,12 @@ recv_sys_init(
return; return;
} }
/* Initialize red-black tree for fast insertions into the
flush_list during recovery process.
As this initialization is done while holding the buffer pool
mutex we perform it before acquiring recv_sys->mutex. */
buf_flush_init_flush_rbt();
mutex_enter(&(recv_sys->mutex)); mutex_enter(&(recv_sys->mutex));
if (!recover_from_backup) { if (!recover_from_backup) {
...@@ -165,6 +173,12 @@ recv_sys_init( ...@@ -165,6 +173,12 @@ recv_sys_init(
recv_is_from_backup = TRUE; recv_is_from_backup = TRUE;
} }
/* Set appropriate value of recv_n_pool_free_frames. */
if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
/* Buffer pool of size greater than 10 MB. */
recv_n_pool_free_frames = 512;
}
recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE); recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
recv_sys->len = 0; recv_sys->len = 0;
recv_sys->recovered_offset = 0; recv_sys->recovered_offset = 0;
...@@ -231,6 +245,9 @@ recv_sys_free(void) ...@@ -231,6 +245,9 @@ recv_sys_free(void)
recv_sys->heap = NULL; recv_sys->heap = NULL;
mutex_exit(&(recv_sys->mutex)); mutex_exit(&(recv_sys->mutex));
/* Free up the flush_rbt. */
buf_flush_free_flush_rbt();
} }
#endif /* UNIV_LOG_DEBUG */ #endif /* UNIV_LOG_DEBUG */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment