Commit bd5a6403 authored by Marko Mäkelä

MDEV-26033: Race condition between buf_pool.page_hash and resize()

The replacement of buf_pool.page_hash with a different type of
hash table in commit 5155a300 (MDEV-22871)
introduced a race condition with buffer pool resizing.

We have an execution trace where buf_pool.page_hash.array is changed
to point to something else while page_hash_latch::read_lock() is
executing. The same should also affect page_hash_latch::write_lock().
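As an illustration only (a deliberately racy miniature with invented names,
not the InnoDB code), the unsafe window is: the latch address is computed
from the shared array pointer, and a concurrent reallocation can invalidate
that pointer before the latch is acquired; re-checking n_cells afterwards,
as the old retry loop did, cannot undo the stale dereference.

  #include <atomic>
  #include <shared_mutex>
  #include <cstddef>

  /* Hypothetical miniature of the race; all names here are invented. */
  struct cell { std::shared_mutex latch; /* ... bucket contents ... */ };

  struct tiny_page_hash
  {
    std::atomic<std::size_t> n_cells;
    cell *array;                       /* repointed by a concurrent resize */

    std::shared_mutex &read_lock(std::size_t fold)
    {
      const std::size_t n= n_cells.load(std::memory_order_relaxed);
      cell *a= array;                  /* (1) load the array pointer */
      /* (2) a resize may now install a new array and free the old one */
      std::shared_mutex &m= a[fold % n].latch;
      m.lock_shared();                 /* (3) may latch freed memory */
      return m;
    }
  };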

We fix the race condition by never resizing (and reallocating) the
buf_pool.page_hash. We assume that resizing the buffer pool is
a rare operation. Yes, there might be a performance regression if a
server is first started up with a tiny buffer pool, which is later
enlarged. In that case, the tiny buf_pool.page_hash.array could cause
increased use of the hash bucket lists. That problem can be worked
around by initially starting up the server with a larger buffer pool
and then shrinking it: because buf_pool.page_hash is never reallocated,
it will remain sized for the larger buffer pool when the pool is
enlarged again.
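For a rough sense of the magnitude (assuming the default 16 KiB page size
and a page_hash of roughly 2 * curr_size cells, as in the removed
resize_hash() below): a pool started at 128 MiB holds about 8,192 pages,
giving on the order of 16,384 cells; if the pool is later grown to 64 GiB
(about 4 million pages), an average lookup would walk a bucket list of
roughly 4,000,000 / 16,384, i.e. about 250 nodes, instead of the usual
one or two.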

buf_pool_t::resize_hash(): Remove.

buf_pool_t::page_hash_table::lock(): Do not attempt to deal with
hash table resizing. If we really wanted that in a safe manner,
we would probably have to introduce a global rw-lock around the
operation, or at the very least, poll buf_pool.resizing, both of
which would be detrimental to performance.
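For comparison, a sketch (with invented, simplified names; not part of this
patch) of what such a global rw-lock would imply: every page_hash lookup
would take one extra shared latch that all threads contend on.

  #include <shared_mutex>
  #include <cstddef>

  /* Hypothetical sketch of the rejected design; names are invented. */
  struct bucket { std::shared_mutex latch; };

  std::shared_mutex resize_guard;      /* the global rw-lock in question */

  std::shared_mutex &guarded_read_lock(bucket *array, std::size_t n_cells,
                                       std::size_t fold)
  {
    /* Every lookup pays this shared acquisition, and all readers touch the
       same global lock word: the overhead the message above wants to avoid. */
    std::shared_lock<std::shared_mutex> guard(resize_guard);
    std::shared_mutex &m= array[fold % n_cells].latch;
    m.lock_shared();   /* a resize would hold resize_guard exclusively, so
                          array cannot be repointed within this window */
    return m;
  }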
parent 77926284
@@ -1536,13 +1536,6 @@ void buf_pool_t::close()
   ut_free(chunks);
   chunks= nullptr;
   page_hash.free();
-  while (page_hash_table *old_page_hash= freed_page_hash)
-  {
-    freed_page_hash= static_cast<page_hash_table*>
-      (old_page_hash->array[1].node);
-    old_page_hash->free();
-    UT_DELETE(old_page_hash);
-  }
   zip_hash.free();
 
   io_buf.close();
@@ -1829,57 +1822,6 @@ inline bool buf_pool_t::withdraw_blocks()
 	return(false);
 }
 
-/** resize page_hash and zip_hash */
-inline void buf_pool_t::resize_hash()
-{
-  page_hash_table *new_page_hash= UT_NEW_NOKEY(page_hash_table());
-  new_page_hash->create(2 * buf_pool.curr_size);
-  new_page_hash->write_lock_all();
-
-  for (auto i= page_hash.pad(page_hash.n_cells); i--; )
-  {
-    static_assert(!((page_hash_table::ELEMENTS_PER_LATCH + 1) &
-                    page_hash_table::ELEMENTS_PER_LATCH),
-                  "must be one less than a power of 2");
-    if (!(i & page_hash_table::ELEMENTS_PER_LATCH))
-    {
-      ut_ad(reinterpret_cast<page_hash_latch*>
-            (&page_hash.array[i])->is_write_locked());
-      continue;
-    }
-
-    while (buf_page_t *bpage= static_cast<buf_page_t*>
-           (page_hash.array[i].node))
-    {
-      ut_ad(bpage->in_page_hash);
-      const ulint fold= bpage->id().fold();
-      HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
-      HASH_INSERT(buf_page_t, hash, new_page_hash, fold, bpage);
-    }
-  }
-
-  buf_pool.page_hash.array[1].node= freed_page_hash;
-  std::swap(buf_pool.page_hash, *new_page_hash);
-  freed_page_hash= new_page_hash;
-
-  /* recreate zip_hash */
-  hash_table_t new_hash;
-  new_hash.create(2 * buf_pool.curr_size);
-
-  for (ulint i= 0; i < buf_pool.zip_hash.n_cells; i++)
-  {
-    while (buf_page_t *bpage= static_cast<buf_page_t*>
-           (HASH_GET_FIRST(&buf_pool.zip_hash, i)))
-    {
-      const ulint fold= BUF_POOL_ZIP_FOLD_BPAGE(bpage);
-      HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
-      HASH_INSERT(buf_page_t, hash, &new_hash, fold, bpage);
-    }
-  }
-
-  std::swap(buf_pool.zip_hash.array, new_hash.array);
-  buf_pool.zip_hash.n_cells= new_hash.n_cells;
-  new_hash.free();
-}
-
 inline void buf_pool_t::page_hash_table::write_lock_all()
@@ -1904,26 +1846,6 @@ inline void buf_pool_t::page_hash_table::write_unlock_all()
 }
 
-inline void buf_pool_t::write_lock_all_page_hash()
-{
-  mysql_mutex_assert_owner(&mutex);
-  page_hash.write_lock_all();
-  for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
-       old_page_hash= static_cast<page_hash_table*>
-       (old_page_hash->array[1].node))
-    old_page_hash->write_lock_all();
-}
-
-inline void buf_pool_t::write_unlock_all_page_hash()
-{
-  page_hash.write_unlock_all();
-  for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
-       old_page_hash= static_cast<page_hash_table*>
-       (old_page_hash->array[1].node))
-    old_page_hash->write_unlock_all();
-}
-
 namespace
 {
@@ -2097,7 +2019,7 @@ inline void buf_pool_t::resize()
 	resizing.store(true, std::memory_order_relaxed);
 
 	mysql_mutex_lock(&mutex);
-	write_lock_all_page_hash();
+	page_hash.write_lock_all();
 
 	chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
@@ -2252,16 +2174,8 @@ inline void buf_pool_t::resize()
 		= srv_buf_pool_base_size > srv_buf_pool_size * 2
 		|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
 
-	/* Normalize page_hash and zip_hash,
-	if the new size is too different */
-	if (!warning && new_size_too_diff) {
-		buf_resize_status("Resizing hash table");
-		resize_hash();
-		ib::info() << "hash tables were resized";
-	}
-
 	mysql_mutex_unlock(&mutex);
-	write_unlock_all_page_hash();
+	page_hash.write_unlock_all();
 
 	UT_DELETE(chunk_map_old);
@@ -1895,22 +1895,14 @@ class buf_pool_t
     page_hash_latch *lock_get(ulint fold) const
     { return lock_get(fold, n_cells); }
 
-    /** Acquire an array latch, tolerating concurrent buf_pool_t::resize()
+    /** Acquire an array latch.
     @tparam exclusive whether the latch is to be acquired exclusively
     @param fold hash bucket key */
     template<bool exclusive> page_hash_latch *lock(ulint fold)
     {
-      for (;;)
-      {
-        auto n= n_cells;
-        page_hash_latch *latch= lock_get(fold, n);
-        latch->acquire<exclusive>();
-        /* Our latch prevents n_cells from changing. */
-        if (UNIV_LIKELY(n == n_cells))
-          return latch;
-        /* Retry, because buf_pool_t::resize_hash() affected us. */
-        latch->release<exclusive>();
-      }
+      page_hash_latch *latch= lock_get(fold, n_cells);
+      latch->acquire<exclusive>();
+      return latch;
     }
 
     /** Exclusively aqcuire all latches */
@@ -1920,19 +1912,6 @@ class buf_pool_t
     inline void write_unlock_all();
   };
 
-private:
-  /** Former page_hash that has been deleted during resize();
-  singly-linked list via freed_page_hash->array[1] */
-  page_hash_table *freed_page_hash;
-
-  /** Lock all page_hash, also freed_page_hash. */
-  inline void write_lock_all_page_hash();
-  /** Release all page_hash, also freed_page_hash. */
-  inline void write_unlock_all_page_hash();
-  /** Resize page_hash and zip_hash. */
-  inline void resize_hash();
-
-public:
   /** Hash table of file pages (buf_page_t::in_file() holds),
   indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
   page_hash_table page_hash;