Commit bf435a3f authored by Marko Mäkelä

MDEV-26200 buf_pool.flush_list corrupted by buffer pool resizing or ROW_FORMAT=COMPRESSED

The lazy deletion of clean blocks from buf_pool.flush_list,
introduced in commit 6441bc61 (MDEV-25113),
created a race condition around the function
buf_flush_relocate_on_flush_list().

The test innodb_zip.wl5522_debug_zip as well as the buffer pool
resizing tests would occasionally fail in debug builds due to
buf_pool.flush_list.count disagreeing with the actual length of the
doubly-linked list.

The safe procedure for relocating a block in buf_pool.flush_list should be
as follows, now that we implement lazy deletion from buf_pool.flush_list:

1. Acquire buf_pool.mutex.
2. Acquire the exclusive buf_pool.page_hash.latch.
3. Acquire buf_pool.flush_list_mutex.
4. Copy the block descriptor.
5. Invoke buf_flush_relocate_on_flush_list().
6. Release buf_pool.flush_list_mutex.

buf_flush_relocate_on_flush_list(): Assert that
buf_pool.flush_list_mutex is being held. Invoke
buf_page_t::oldest_modification() only once, using
std::memory_order_relaxed, now that the mutex protects us.

buf_LRU_free_page(), buf_LRU_block_remove_hashed(): Avoid
an unlock-lock cycle on hash_lock. (We must not acquire hash_lock
while already holding buf_pool.flush_list_mutex, because that
could lead to a deadlock due to latching order violation.)
parent 316a8ceb
...@@ -1566,6 +1566,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block) ...@@ -1566,6 +1566,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
if (block->page.can_relocate()) { if (block->page.can_relocate()) {
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>( memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
new_block->frame, block->frame, srv_page_size); new_block->frame, block->frame, srv_page_size);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (&new_block->page) buf_page_t(block->page); new (&new_block->page) buf_page_t(block->page);
/* relocate LRU list */ /* relocate LRU list */
...@@ -1625,6 +1626,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block) ...@@ -1625,6 +1626,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
buf_flush_relocate_on_flush_list(&block->page, buf_flush_relocate_on_flush_list(&block->page,
&new_block->page); &new_block->page);
} }
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
block->page.set_corrupt_id(); block->page.set_corrupt_id();
/* set other flags of buf_block_t */ /* set other flags of buf_block_t */
...@@ -3131,12 +3133,14 @@ buf_page_get_low( ...@@ -3131,12 +3133,14 @@ buf_page_get_low(
/* Note: this is the uncompressed block and it is not /* Note: this is the uncompressed block and it is not
accessible by other threads yet because it is not in accessible by other threads yet because it is not in
any list or hash table */ any list or hash table */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_relocate(bpage, &block->page); buf_relocate(bpage, &block->page);
/* Set after buf_relocate(). */ /* Set after buf_relocate(). */
block->page.set_buf_fix_count(1); block->page.set_buf_fix_count(1);
buf_flush_relocate_on_flush_list(bpage, &block->page); buf_flush_relocate_on_flush_list(bpage, &block->page);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
/* Buffer-fix, I/O-fix, and X-latch the block /* Buffer-fix, I/O-fix, and X-latch the block
for the duration of the decompression. for the duration of the decompression.
...@@ -3646,8 +3650,10 @@ buf_page_create(fil_space_t *space, uint32_t offset, ...@@ -3646,8 +3650,10 @@ buf_page_create(fil_space_t *space, uint32_t offset,
} }
rw_lock_x_lock(&free_block->lock); rw_lock_x_lock(&free_block->lock);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_relocate(&block->page, &free_block->page); buf_relocate(&block->page, &free_block->page);
buf_flush_relocate_on_flush_list(&block->page, &free_block->page); buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
free_block->page.set_state(BUF_BLOCK_FILE_PAGE); free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
buf_unzip_LRU_add_block(free_block, FALSE); buf_unzip_LRU_add_block(free_block, FALSE);
......
...@@ -286,43 +286,29 @@ buf_flush_relocate_on_flush_list( ...@@ -286,43 +286,29 @@ buf_flush_relocate_on_flush_list(
{ {
buf_page_t* prev; buf_page_t* prev;
mysql_mutex_assert_owner(&buf_pool.mutex); mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
ut_ad(!fsp_is_system_temporary(bpage->id().space())); ut_ad(!fsp_is_system_temporary(bpage->id().space()));
const lsn_t lsn = bpage->oldest_modification_acquire(); const lsn_t lsn = bpage->oldest_modification();
if (!lsn) { if (!lsn) {
return; return;
} }
ut_ad(lsn == 1 || lsn > 2); ut_ad(lsn == 1 || lsn > 2);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
/* FIXME: Can we avoid holding buf_pool.mutex here? */
ut_ad(dpage->oldest_modification() == lsn); ut_ad(dpage->oldest_modification() == lsn);
if (ut_d(const lsn_t o_lsn =) bpage->oldest_modification()) { /* Important that we adjust the hazard pointer before removing
ut_ad(o_lsn == lsn); the bpage from the flush list. */
buf_pool.flush_hp.adjust(bpage);
/* Important that we adjust the hazard pointer before removing prev = UT_LIST_GET_PREV(list, bpage);
the bpage from the flush list. */ UT_LIST_REMOVE(buf_pool.flush_list, bpage);
buf_pool.flush_hp.adjust(bpage);
prev = UT_LIST_GET_PREV(list, bpage); bpage->clear_oldest_modification();
UT_LIST_REMOVE(buf_pool.flush_list, bpage);
bpage->clear_oldest_modification();
} else {
/* bpage was removed from buf_pool.flush_list
since we last checked, and before we acquired
buf_pool.flush_list_mutex. */
goto was_clean;
}
if (lsn == 1) { if (lsn == 1) {
buf_pool.stat.flush_list_bytes -= dpage->physical_size(); buf_pool.stat.flush_list_bytes -= dpage->physical_size();
was_clean:
dpage->list.prev = nullptr; dpage->list.prev = nullptr;
dpage->list.next = nullptr; dpage->list.next = nullptr;
dpage->clear_oldest_modification(); dpage->clear_oldest_modification();
...@@ -334,7 +320,6 @@ buf_flush_relocate_on_flush_list( ...@@ -334,7 +320,6 @@ buf_flush_relocate_on_flush_list(
} }
ut_d(buf_flush_validate_low()); ut_d(buf_flush_validate_low());
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
} }
/** Complete write of a file page from buf_pool. /** Complete write of a file page from buf_pool.
......
...@@ -845,6 +845,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -845,6 +845,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) { } else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
b = buf_page_alloc_descriptor(); b = buf_page_alloc_descriptor();
ut_a(b); ut_a(b);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage); new (b) buf_page_t(*bpage);
b->set_state(BUF_BLOCK_ZIP_PAGE); b->set_state(BUF_BLOCK_ZIP_PAGE);
} }
...@@ -859,6 +860,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -859,6 +860,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
ut_ad(bpage->can_relocate()); ut_ad(bpage->can_relocate());
if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) { if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
ut_ad(!b);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
return(true); return(true);
} }
...@@ -872,8 +875,6 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -872,8 +875,6 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
if (UNIV_LIKELY_NULL(b)) { if (UNIV_LIKELY_NULL(b)) {
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
hash_lock->write_lock();
ut_ad(!buf_pool.page_hash_get_low(id, fold)); ut_ad(!buf_pool.page_hash_get_low(id, fold));
ut_ad(b->zip_size()); ut_ad(b->zip_size());
...@@ -940,6 +941,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -940,6 +941,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
} }
buf_flush_relocate_on_flush_list(bpage, b); buf_flush_relocate_on_flush_list(bpage, b);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
bpage->zip.data = nullptr; bpage->zip.data = nullptr;
...@@ -950,6 +952,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -950,6 +952,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
hash_lock. */ hash_lock. */
b->set_io_fix(BUF_IO_PIN); b->set_io_fix(BUF_IO_PIN);
hash_lock->write_unlock(); hash_lock->write_unlock();
} else if (!zip) {
hash_lock->write_unlock();
} }
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
...@@ -1182,6 +1186,10 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, ...@@ -1182,6 +1186,10 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size); MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
bpage->set_state(BUF_BLOCK_REMOVE_HASH); bpage->set_state(BUF_BLOCK_REMOVE_HASH);
if (!zip) {
return true;
}
/* Question: If we release hash_lock here /* Question: If we release hash_lock here
then what protects us against: then what protects us against:
1) Some other thread buffer fixing this page 1) Some other thread buffer fixing this page
...@@ -1203,7 +1211,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, ...@@ -1203,7 +1211,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
page_hash. */ page_hash. */
hash_lock->write_unlock(); hash_lock->write_unlock();
if (zip && bpage->zip.data) { if (bpage->zip.data) {
/* Free the compressed page. */ /* Free the compressed page. */
void* data = bpage->zip.data; void* data = bpage->zip.data;
bpage->zip.data = NULL; bpage->zip.data = NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment