Commit 6e3bd663 authored by Marko Mäkelä

MDEV-25776 Race conditions in buf_pool.page_hash around buf_pool.watch

Any modification of buf_pool.page_hash is supposed to be protected
by buf_pool.mutex and page_hash_latch::write_lock(). The buffer pool
watch mechanism of the InnoDB change buffer had been violating that
protocol ever since commit b1ab211d (MDEV-15053).

buf_pool_t::watch_set(): Extend the critical section of buf_pool.mutex.

buf_pool_t::watch_unset(): Define non-inline, because
calls are infrequent and this function became larger.
If we have to detach a sentinel from page_hash,
do it while holding both the mutex and the exclusive hash latch.

buf_pool_t::watch_remove(): Assert that the mutex is being held.

buf_page_init_for_read(): Remove some work-arounds for
previously encountered race conditions related to buf_pool.watch.
parent eb2f2c1e
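
To make the invariant concrete, here is a minimal illustrative sketch of the locking order that every page_hash modification must follow after this fix: acquire buf_pool.mutex first, then the page_hash_latch of the affected hash cell in exclusive mode, perform the insert or delete, and release in reverse order. This is not MariaDB source; std::mutex, std::shared_mutex and the unordered_map are stand-ins for buf_pool.mutex, page_hash_latch and a page_hash cell, and the helper names are hypothetical.

// Illustrative stand-ins only: std::mutex plays the role of buf_pool.mutex,
// std::shared_mutex the role of one page_hash_latch, and an unordered_map the
// role of one buf_pool.page_hash cell. None of this is MariaDB code.
#include <cstdint>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>

struct page_stub { uint64_t id; bool is_sentinel; };

static std::mutex pool_mutex;                               // ~ buf_pool.mutex
static std::shared_mutex cell_latch;                        // ~ page_hash_latch of the cell
static std::unordered_map<uint64_t, page_stub*> hash_cell;  // ~ one page_hash cell

// Insert a sentinel the way watch_set() does after the fix: both the pool
// mutex and the exclusive cell latch are held across the page_hash change.
bool insert_watch(page_stub *w)
{
  std::lock_guard<std::mutex> pool_guard(pool_mutex);    // 1st: pool mutex
  std::unique_lock<std::shared_mutex> latch(cell_latch); // 2nd: hash latch, exclusive
  if (hash_cell.count(w->id))
    return false;                // somebody read the page in; the caller retries
  hash_cell.emplace(w->id, w);   // the protected modification
  return true;
}                                // latch, then mutex, released in reverse order

// Detach a sentinel the way the fixed watch_unset() does when it holds the
// last reference: re-acquire both protections before touching page_hash.
void remove_watch(uint64_t id)
{
  std::lock_guard<std::mutex> pool_guard(pool_mutex);
  std::unique_lock<std::shared_mutex> latch(cell_latch);
  auto it= hash_cell.find(id);
  if (it != hash_cell.end() && it->second->is_sentinel)
    hash_cell.erase(it);
}
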
@@ -2393,14 +2393,10 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
     w->id_= id;
     *hash_lock= page_hash.lock_get(fold);
-    (*hash_lock)->write_lock();
-    mysql_mutex_unlock(&mutex);
     buf_page_t *bpage= page_hash_get_low(id, fold);
     if (UNIV_LIKELY_NULL(bpage))
     {
-      (*hash_lock)->write_unlock();
-      mysql_mutex_lock(&mutex);
       w->set_state(BUF_BLOCK_NOT_USED);
       *hash_lock= page_hash.lock_get(fold);
       (*hash_lock)->write_lock();
@@ -2408,11 +2404,13 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
       goto retry;
     }
+    (*hash_lock)->write_lock();
     ut_ad(!w->buf_fix_count_);
     w->buf_fix_count_= 1;
     ut_ad(!w->in_page_hash);
-    ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+    ut_d(w->in_page_hash= true);
     HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+    mysql_mutex_unlock(&mutex);
     return nullptr;
   }
@@ -2421,6 +2419,48 @@ inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
   return nullptr;
 }
+
+/** Stop watching whether a page has been read in.
+watch_set(id) must have returned nullptr before.
+@param id page identifier */
+void buf_pool_t::watch_unset(const page_id_t id)
+{
+  mysql_mutex_assert_not_owner(&mutex);
+  const ulint fold= id.fold();
+  page_hash_latch *hash_lock= page_hash.lock<true>(fold);
+  /* The page must exist because watch_set() increments buf_fix_count. */
+  buf_page_t *w= page_hash_get_low(id, fold);
+  const auto buf_fix_count= w->buf_fix_count();
+  ut_ad(buf_fix_count);
+  const bool must_remove= buf_fix_count == 1 && watch_is_sentinel(*w);
+  ut_ad(w->in_page_hash);
+  if (!must_remove)
+    w->unfix();
+  hash_lock->write_unlock();
+
+  if (must_remove)
+  {
+    const auto old= w;
+    /* The following is based on buf_pool_t::watch_remove(). */
+    mysql_mutex_lock(&mutex);
+    w= page_hash_get_low(id, fold);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+    if (w->unfix() == 0 && w == old)
+    {
+      ut_ad(w->in_page_hash);
+      ut_d(w->in_page_hash= false);
+      HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
+      // Now that the watch is detached from page_hash, release it to watch[].
+      ut_ad(w->id_ == id);
+      ut_ad(!w->buf_fix_count());
+      ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
+      w->set_state(BUF_BLOCK_NOT_USED);
+    }
+    hash_lock->write_unlock();
+    mysql_mutex_unlock(&mutex);
+  }
+}
+
 /** Mark the page status as FREED for the given tablespace id and
 page number. If the page is not in buffer pool then ignore it.
 @param[in,out] space tablespace
...
@@ -53,6 +53,7 @@ that the block has been replaced with the real block.
 @param watch sentinel */
 inline void buf_pool_t::watch_remove(buf_page_t *watch)
 {
+  mysql_mutex_assert_owner(&buf_pool.mutex);
   ut_ad(hash_lock_get(watch->id())->is_write_locked());
   ut_a(watch_is_sentinel(*watch));
   if (watch->buf_fix_count())
@@ -123,16 +124,10 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
   mysql_mutex_lock(&buf_pool.mutex);
-  /* We must acquire hash_lock this early to prevent
-  a race condition with buf_pool_t::watch_remove() */
-  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
-  hash_lock->write_lock();
-
   buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
   if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
   {
     /* The page is already in the buffer pool. */
-    hash_lock->write_unlock();
     if (block)
     {
       rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
@@ -146,6 +141,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bpage= &block->page;
     /* Insert into the hash table of file pages */
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
     if (hash_page)
     {
       /* Preserve the reference count. */
@@ -184,8 +182,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
   }
   else
   {
-    hash_lock->write_unlock();
-
     /* The compressed page must be allocated before the
     control block (bpage), in order to avoid the
     invocation of buf_buddy_relocate_block() on
@@ -193,8 +189,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bool lru= false;
     void *data= buf_buddy_alloc(zip_size, &lru);
-    hash_lock->write_lock();
-
     /* If buf_buddy_alloc() allocated storage from the LRU list,
     it released and reacquired buf_pool.mutex. Thus, we must
     check the page_hash again, as it may have been modified. */
@@ -205,7 +199,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
     {
       /* The block was added by some other thread. */
-      hash_lock->write_unlock();
       buf_buddy_free(data, zip_size);
       goto func_exit;
     }
@@ -219,6 +212,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
     if (hash_page)
     {
       /* Preserve the reference count. It can be 0 if
...
@@ -1739,33 +1739,7 @@ class buf_pool_t
   /** Stop watching whether a page has been read in.
   watch_set(id) must have returned nullptr before.
   @param id page identifier */
-  void watch_unset(const page_id_t id)
-  {
-    const ulint fold= id.fold();
-    page_hash_latch *hash_lock= page_hash.lock<true>(fold);
-    /* The page must exist because watch_set() increments buf_fix_count. */
-    buf_page_t *watch= page_hash_get_low(id, fold);
-    if (watch->unfix() == 0 && watch_is_sentinel(*watch))
-    {
-      /* The following is based on watch_remove(). */
-      ut_ad(watch->in_page_hash);
-      ut_d(watch->in_page_hash= false);
-      HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
-      hash_lock->write_unlock();
-      // Now that the watch is detached from page_hash, release it to watch[].
-      mysql_mutex_lock(&mutex);
-      /* It is possible that watch_remove() already removed the watch. */
-      if (watch->id_ == id)
-      {
-        ut_ad(!watch->buf_fix_count());
-        ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE);
-        watch->set_state(BUF_BLOCK_NOT_USED);
-      }
-      mysql_mutex_unlock(&mutex);
-    }
-    else
-      hash_lock->write_unlock();
-  }
+  void watch_unset(const page_id_t id);
   /** Remove the sentinel block for the watch before replacing it with a
   real block. watch_unset() or watch_occurred() will notice
...