Commit 60ed4797 authored by Marko Mäkelä

MDEV-26004 Excessive wait times in buf_LRU_get_free_block()

buf_LRU_get_free_block(): Initially wait for a single block to be
freed, signaled via buf_pool.done_free. Only if that fails and no
LRU eviction flushing batch is already running do we initiate a
flushing batch, which should serve all threads that are currently
waiting in buf_LRU_get_free_block().
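
In outline, this is the classic condition-variable pattern sketched
below. This is a minimal standalone sketch, not the actual InnoDB
code: free_list, n_flush_LRU, get_free_block(), free_block() and
start_flush_batch() are illustrative stand-ins for buf_pool.free,
buf_pool.n_flush_LRU_, buf_LRU_get_free_block(), the block-freeing
path, and the LRU eviction flushing batch.

#include <pthread.h>
#include <list>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_free = PTHREAD_COND_INITIALIZER;
static std::list<void*> free_list;   // freed blocks; protected by mutex
static unsigned n_flush_LRU;         // nonzero while an eviction batch runs

static void start_flush_batch() { /* would evict pages from the LRU */ }

// Sketch of the new wait logic in buf_LRU_get_free_block().
void* get_free_block()
{
  pthread_mutex_lock(&mutex);
  for (;;) {
    if (!free_list.empty()) {        // a block is available: take it
      void* block = free_list.front();
      free_list.pop_front();
      pthread_mutex_unlock(&mutex);
      return block;
    }
    if (!n_flush_LRU)                // no batch is running: stop waiting
      break;                         // and initiate one ourselves
    // A batch is running: sleep until a single block has been freed,
    // instead of waiting for the whole batch to complete.
    pthread_cond_wait(&done_free, &mutex);
  }
  pthread_mutex_unlock(&mutex);
  start_flush_batch();               // serves every thread waiting here
  return nullptr;                    // the real code would retry instead
}

// Sketch of the freeing side: each freed block wakes one waiter.
void free_block(void* block)
{
  pthread_mutex_lock(&mutex);
  free_list.push_front(block);
  pthread_cond_signal(&done_free);   // free list grew: wake a single waiter
  pthread_mutex_unlock(&mutex);
}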

Note: In extreme cases, this may introduce a performance regression
at high connection counts. We observed this in sysbench
oltp_update_index with a 512MiB buffer pool, 4GiB of data on a fast
NVMe device, and 1000 concurrent connections, on a 20-thread CPU. The
contention point appears to be buf_pool.mutex, and the improvement
turns into a regression somewhere beyond 32 concurrent connections.

On slower storage, no such regression was observed; instead,
throughput improved and maximum latency was reduced.

The excessive waits were pointed out by Vladislav Vaintroub.
parent 6441bc61
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -1467,6 +1467,7 @@ bool buf_pool_t::create()
   pthread_cond_init(&done_flush_LRU, nullptr);
   pthread_cond_init(&done_flush_list, nullptr);
   pthread_cond_init(&do_flush_list, nullptr);
+  pthread_cond_init(&done_free, nullptr);
   try_LRU_scan= true;
@@ -1532,6 +1533,7 @@ void buf_pool_t::close()
   pthread_cond_destroy(&done_flush_LRU);
   pthread_cond_destroy(&done_flush_list);
   pthread_cond_destroy(&do_flush_list);
+  pthread_cond_destroy(&done_free);
   ut_free(chunks);
   chunks= nullptr;
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -392,18 +392,19 @@ void buf_page_write_complete(const IORequest &request)
     rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
 
   if (request.is_LRU())
+  {
     buf_LRU_free_page(bpage, true);
-  else
-    ut_ad(!temp);
 
-  if (request.is_LRU())
-  {
     ut_ad(buf_pool.n_flush_LRU_);
     if (!--buf_pool.n_flush_LRU_)
+    {
       pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+      pthread_cond_signal(&buf_pool.done_free);
+    }
   }
   else
   {
+    ut_ad(!temp);
     ut_ad(buf_pool.n_flush_list_);
     if (!--buf_pool.n_flush_list_)
       pthread_cond_broadcast(&buf_pool.done_flush_list);
@@ -1717,7 +1718,10 @@ ulint buf_flush_LRU(ulint max_n)
   mysql_mutex_unlock(&buf_pool.mutex);
 
   if (!n_flushing)
+  {
     pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+    pthread_cond_signal(&buf_pool.done_free);
+  }
 
   buf_dblwr.flush_buffered_writes();
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -401,7 +401,7 @@ we put it to free list to be used.
 @param have_mutex whether buf_pool.mutex is already being held
 @return the free control block, in state BUF_BLOCK_MEMORY */
-buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
 {
 	ulint	n_iterations	= 0;
 	ulint	flush_failures	= 0;
@@ -413,6 +413,7 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
 	mysql_mutex_lock(&buf_pool.mutex);
 got_mutex:
 	buf_LRU_check_size_of_non_data_objects();
+	buf_block_t* block;
 	DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
 			if (!buf_lru_free_blocks_error_printed) {
@@ -421,7 +422,8 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
 retry:
 	/* If there is a block in the free list, take it */
-	if (buf_block_t* block = buf_LRU_get_free_only()) {
+	if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
 		if (!have_mutex) {
 			mysql_mutex_unlock(&buf_pool.mutex);
 		}
@@ -446,10 +448,19 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
 		buf_pool.try_LRU_scan = false;
 	}
 
+	for (;;) {
+		if ((block = buf_LRU_get_free_only()) != nullptr) {
+			goto got_block;
+		}
+		if (!buf_pool.n_flush_LRU_) {
+			break;
+		}
+		my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
+	}
+
 #ifndef DBUG_OFF
 not_found:
 #endif
-	buf_flush_wait_batch_end(true);
 	mysql_mutex_unlock(&buf_pool.mutex);
 
 	if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
@@ -477,13 +488,11 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
 	}
 
 	/* No free block was found: try to flush the LRU list.
-	This call will flush one page from the LRU and put it on the
-	free list. That means that the free block is up for grabs for
-	all user threads.
+	The freed blocks will be up for grabs for all threads.
 
-	TODO: A more elegant way would have been to return the freed
+	TODO: A more elegant way would have been to return one freed
 	up block to the caller here but the code that deals with
-	removing the block from page_hash and LRU_list is fairly
+	removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
 	involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
 	can do that in a separate patch sometime in future. */
@@ -1027,6 +1036,7 @@ buf_LRU_block_free_non_file_page(
 	} else {
 		UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
 		ut_d(block->page.in_free_list = true);
+		pthread_cond_signal(&buf_pool.done_free);
 	}
 
 	MEM_NOACCESS(block->frame, srv_page_size);
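
A design note, as far as one can tell from the hunks above:
buf_pool.done_free is woken with pthread_cond_signal() when a single
block is freed, since one block can satisfy only one waiter in
buf_LRU_get_free_block(), while the batch-completion event
buf_pool.done_flush_LRU remains a pthread_cond_broadcast(). The
companion signal on done_free at batch end lets a waiter re-check
buf_pool.n_flush_LRU_ and fall back to initiating a new batch.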
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -2020,6 +2020,8 @@ class buf_pool_t
   UT_LIST_BASE_NODE_T(buf_page_t) free;
 					/*!< base node of the free
 					block list */
+  /** signaled each time when the free list grows; protected by mutex */
+  pthread_cond_t done_free;
   UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
 					/*!< base node of the withdraw