Commit 6e3bd663 authored by Marko Mäkelä

MDEV-25776 Race conditions in buf_pool.page_hash around buf_pool.watch

Any modification of buf_pool.page_hash is supposed to be protected
by buf_pool.mutex and page_hash_latch::write_lock(). The buffer pool
watch mechanism of the InnoDB change buffer was violating that
ever since commit b1ab211d (MDEV-15053).
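
Concretely, any writer must hold both locks at once; as an illustrative sketch only (the helpers are the ones used in the diff below; id and bpage are placeholder variables, not names introduced by this commit):

  const ulint fold= id.fold();
  mysql_mutex_lock(&buf_pool.mutex);
  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
  hash_lock->write_lock();
  /* Only now may buf_pool.page_hash be modified. */
  HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
  hash_lock->write_unlock();
  mysql_mutex_unlock(&buf_pool.mutex);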

buf_pool_t::watch_set(): Extend the critical section of buf_pool.mutex.

buf_pool_t::watch_unset(): Define non-inline, because
calls are infrequent and this function became larger.
If we have to detach a sentinel from page_hash,
do it while holding both the mutex and the exclusive hash latch.
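Condensed to its locking skeleton, the detach path reads roughly as follows (a simplified sketch of the watch_unset() hunk below, with assertions omitted; old is the sentinel pointer that was remembered before the hash latch was released):

  mysql_mutex_lock(&mutex);
  /* While no latch was held, buf_page_init_for_read() may have invoked
  watch_remove() and reused the watch[] slot for another page; hence
  the fresh lookup and the comparison against old. */
  w= page_hash_get_low(id, fold);
  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
  hash_lock->write_lock();
  if (w->unfix() == 0 && w == old)
  {
    HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
    /* Return the sentinel to the watch[] array. */
    w->set_state(BUF_BLOCK_NOT_USED);
  }
  hash_lock->write_unlock();
  mysql_mutex_unlock(&mutex);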

buf_pool_t::watch_remove(): Assert that the mutex is being held.

buf_page_init_for_read(): Remove some work-arounds for
previously encountered race conditions related to buf_pool.watch.
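
For context, the watch life cycle that these functions implement is approximately the following (a usage sketch, not verbatim change buffer code; page_id and fold are placeholders, and the latch handling is simplified):

  page_hash_latch *hash_lock= buf_pool.page_hash.lock<true>(fold);
  if (buf_page_t *bpage= buf_pool.watch_set(page_id, &hash_lock))
  {
    /* The page is already in the buffer pool; no sentinel was set. */
    hash_lock->write_unlock();
  }
  else
  {
    /* A buf-fixed sentinel for page_id was inserted into page_hash. */
    hash_lock->write_unlock();
    /* ... attempt to buffer a change for the page ... */
    if (buf_pool.watch_occurred(page_id))
    {
      /* The page was read in meanwhile; the change must not be buffered. */
    }
    buf_pool.watch_unset(page_id);
  }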
parent eb2f2c1e
@@ -2393,14 +2393,10 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
     w->id_= id;
     *hash_lock= page_hash.lock_get(fold);
-    (*hash_lock)->write_lock();
-    mysql_mutex_unlock(&mutex);
 
     buf_page_t *bpage= page_hash_get_low(id, fold);
     if (UNIV_LIKELY_NULL(bpage))
     {
-      (*hash_lock)->write_unlock();
-      mysql_mutex_lock(&mutex);
       w->set_state(BUF_BLOCK_NOT_USED);
       *hash_lock= page_hash.lock_get(fold);
       (*hash_lock)->write_lock();
@@ -2408,11 +2404,13 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
       goto retry;
     }
 
+    (*hash_lock)->write_lock();
     ut_ad(!w->buf_fix_count_);
     w->buf_fix_count_= 1;
     ut_ad(!w->in_page_hash);
-    ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+    ut_d(w->in_page_hash= true);
     HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+    mysql_mutex_unlock(&mutex);
     return nullptr;
   }
@@ -2421,6 +2419,48 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
   return nullptr;
 }
 
+/** Stop watching whether a page has been read in.
+watch_set(id) must have returned nullptr before.
+@param id   page identifier */
+void buf_pool_t::watch_unset(const page_id_t id)
+{
+  mysql_mutex_assert_not_owner(&mutex);
+  const ulint fold= id.fold();
+  page_hash_latch *hash_lock= page_hash.lock<true>(fold);
+  /* The page must exist because watch_set() increments buf_fix_count. */
+  buf_page_t *w= page_hash_get_low(id, fold);
+  const auto buf_fix_count= w->buf_fix_count();
+  ut_ad(buf_fix_count);
+  const bool must_remove= buf_fix_count == 1 && watch_is_sentinel(*w);
+  ut_ad(w->in_page_hash);
+  if (!must_remove)
+    w->unfix();
+  hash_lock->write_unlock();
+
+  if (must_remove)
+  {
+    const auto old= w;
+    /* The following is based on buf_pool_t::watch_remove(). */
+    mysql_mutex_lock(&mutex);
+    w= page_hash_get_low(id, fold);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+    if (w->unfix() == 0 && w == old)
+    {
+      ut_ad(w->in_page_hash);
+      ut_d(w->in_page_hash= false);
+      HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
+      // Now that the watch is detached from page_hash, release it to watch[].
+      ut_ad(w->id_ == id);
+      ut_ad(!w->buf_fix_count());
+      ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
+      w->set_state(BUF_BLOCK_NOT_USED);
+    }
+    hash_lock->write_unlock();
+    mysql_mutex_unlock(&mutex);
+  }
+}
+
 /** Mark the page status as FREED for the given tablespace id and
 page number. If the page is not in buffer pool then ignore it.
 @param[in,out] space tablespace
@@ -53,6 +53,7 @@ that the block has been replaced with the real block.
 @param watch sentinel */
 inline void buf_pool_t::watch_remove(buf_page_t *watch)
 {
+  mysql_mutex_assert_owner(&buf_pool.mutex);
   ut_ad(hash_lock_get(watch->id())->is_write_locked());
   ut_a(watch_is_sentinel(*watch));
   if (watch->buf_fix_count())
@@ -123,16 +124,10 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
   mysql_mutex_lock(&buf_pool.mutex);
 
-  /* We must acquire hash_lock this early to prevent
-  a race condition with buf_pool_t::watch_remove() */
-  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
-  hash_lock->write_lock();
-
   buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
   if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
   {
     /* The page is already in the buffer pool. */
-    hash_lock->write_unlock();
     if (block)
     {
       rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
@@ -146,6 +141,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bpage= &block->page;
 
     /* Insert into the hash table of file pages */
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
     if (hash_page)
     {
       /* Preserve the reference count. */
@@ -184,8 +182,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
   }
   else
   {
-    hash_lock->write_unlock();
-
     /* The compressed page must be allocated before the
     control block (bpage), in order to avoid the
     invocation of buf_buddy_relocate_block() on
@@ -193,8 +189,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bool lru= false;
     void *data= buf_buddy_alloc(zip_size, &lru);
 
-    hash_lock->write_lock();
-
     /* If buf_buddy_alloc() allocated storage from the LRU list,
     it released and reacquired buf_pool.mutex. Thus, we must
     check the page_hash again, as it may have been modified. */
@@ -205,7 +199,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
     {
       /* The block was added by some other thread. */
-      hash_lock->write_unlock();
       buf_buddy_free(data, zip_size);
       goto func_exit;
     }
@@ -219,6 +212,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
 
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
     if (hash_page)
     {
       /* Preserve the reference count. It can be 0 if
@@ -1739,33 +1739,7 @@ class buf_pool_t
   /** Stop watching whether a page has been read in.
   watch_set(id) must have returned nullptr before.
   @param id page identifier */
-  void watch_unset(const page_id_t id)
-  {
-    const ulint fold= id.fold();
-    page_hash_latch *hash_lock= page_hash.lock<true>(fold);
-    /* The page must exist because watch_set() increments buf_fix_count. */
-    buf_page_t *watch= page_hash_get_low(id, fold);
-    if (watch->unfix() == 0 && watch_is_sentinel(*watch))
-    {
-      /* The following is based on watch_remove(). */
-      ut_ad(watch->in_page_hash);
-      ut_d(watch->in_page_hash= false);
-      HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
-      hash_lock->write_unlock();
-      // Now that the watch is detached from page_hash, release it to watch[].
-      mysql_mutex_lock(&mutex);
-      /* It is possible that watch_remove() already removed the watch. */
-      if (watch->id_ == id)
-      {
-        ut_ad(!watch->buf_fix_count());
-        ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE);
-        watch->set_state(BUF_BLOCK_NOT_USED);
-      }
-      mysql_mutex_unlock(&mutex);
-    }
-    else
-      hash_lock->write_unlock();
-  }
+  void watch_unset(const page_id_t id);
 
   /** Remove the sentinel block for the watch before replacing it with a
   real block. watch_unset() or watch_occurred() will notice