Commit 0a7d85c9 authored by Marko Mäkelä

MDEV-30148 Race condition between non-persistent statistics and purge

btr_cur_t::open_random_leaf(): Replaces btr_cur_open_at_rnd_pos().
Acquire a shared latch on each page, and finally release all
latches except the one on the leaf page.

This fixes a race condition between the purge of history and
btr_estimate_number_of_different_key_vals(), which turned out
to only hold a buffer-fix on the randomly chosen leaf page.
Typically, an assertion would fail in page_rec_is_supremum().

ibuf_contract(): Start from the beginning of the change buffer,
to simplify the logic. Starting with
commit b42294bc
it does not matter much where the change buffer merge is being initiated.

The race condition may have been introduced as early as
mysql/mysql-server@ac74632293bea967b352d1b472abedeeaa921b98
from where it was copied to
commit 2e814d47.

Reviewed by: Vladislav Lesin
Tested by: Matthias Leich
parent 95d71272
......@@ -2629,288 +2629,6 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
return err;
}
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree.
Starting from the root page, a random user record is chosen on every
visited page (page_cur_open_on_rnd_user_rec()); on non-leaf levels the
child page number stored in that node pointer is followed until a page
of level 0 is reached.
@return true if the index is available and we have put the cursor, false
if the index is unavailable (index->page == FIL_NULL) or a page could not
be fetched by buf_page_get_gen() */
bool
btr_cur_open_at_rnd_pos(
dict_index_t* index, /*!< in: index */
btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
/* Default node pointer size bound; replaced by
btr_node_ptr_max_size(index) below when the root would be X-latched. */
ulint node_ptr_max_size = srv_page_size / 2;
/* Level of the page being visited; ULINT_UNDEFINED until the root
page has been read. */
ulint height;
rec_t* node_ptr;
btr_intention_t lock_intention;
/* Blocks fetched on the way down and the mtr savepoints taken just
before fetching each of them; used to release or upgrade latches on
individual blocks later. */
buf_block_t* tree_blocks[BTR_MAX_LEVELS];
ulint tree_savepoints[BTR_MAX_LEVELS];
/* Index of the current block in tree_blocks[]. */
ulint n_blocks = 0;
/* Number of leading entries of tree_blocks[] that were handled by the
release loops below. */
ulint n_releases = 0;
mem_heap_t* heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
ut_ad(!index->is_spatial());
/* NOTE(review): judging by the name, this extracts the intention flags
from latch_mode and clears them in place - confirm against the helper. */
lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
/* Savepoint taken before the index latch is acquired; it is passed to
mtr_release_s_latch_at_savepoint() once a leaf page has been reached. */
ulint savepoint = mtr_set_savepoint(mtr);
/* Latch type requested for the non-leaf pages (see rw_latch below). */
rw_lock_type_t upper_rw_latch;
switch (latch_mode) {
case BTR_MODIFY_TREE:
/* Most of delete-intended operations are purging.
Free blocks and read IO bandwidth should be prioritized
for them, when the history list is growing huge. */
if (lock_intention == BTR_INTENTION_DELETE
&& buf_pool.n_pend_reads
&& trx_sys.history_size_approx()
> BTR_CUR_FINE_HISTORY_LENGTH) {
mtr_x_lock_index(index, mtr);
} else {
mtr_sx_lock_index(index, mtr);
}
upper_rw_latch = RW_X_LATCH;
break;
case BTR_SEARCH_PREV:
case BTR_MODIFY_PREV:
/* This function doesn't support left uncle
page lock for left leaf page lock, when
needed. */
case BTR_SEARCH_TREE:
case BTR_CONT_MODIFY_TREE:
case BTR_CONT_SEARCH_TREE:
/* None of the modes listed above is expected here: fail the debug
assertion, and otherwise treat them like the default case. */
ut_ad(0);
/* fall through */
default:
/* In read-only mode neither the index latch nor latches on the
upper pages are taken. */
if (!srv_read_only_mode) {
mtr_s_lock_index(index, mtr);
upper_rw_latch = RW_S_LATCH;
} else {
upper_rw_latch = RW_NO_LATCH;
}
}
/* Debug-only fault injection: pretend that the index is unavailable. */
DBUG_EXECUTE_IF("test_index_is_unavailable",
return(false););
if (index->page == FIL_NULL) {
/* Since we don't hold index lock until just now, the index
could be modified by others, for example, if this is a
statistics updater for referenced table, it could be marked
as unavailable by 'DROP TABLE' in the mean time, since
we don't hold lock for statistics updater */
return(false);
}
/* Latch type that applies to the root page when it is also a leaf. */
const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
latch_mode);
page_cursor = btr_cur_get_page_cur(cursor);
page_cursor->index = index;
/* Start the descent at the root page of the index. */
page_id_t page_id(index->table->space_id, index->page);
const ulint zip_size = index->table->space->zip_size();
dberr_t err;
if (root_leaf_rw_latch == RW_X_LATCH) {
node_ptr_max_size = btr_node_ptr_max_size(index);
}
height = ULINT_UNDEFINED;
/* One iteration per visited page, unless an iteration ends with
"continue" in order to retry. */
for (;;) {
page_t* page;
ut_ad(n_blocks < BTR_MAX_LEVELS);
tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
/* A page at height 0, and every page in BTR_MODIFY_TREE mode, is
requested with RW_NO_LATCH here; any other page is requested with
upper_rw_latch. For the root page, height is still ULINT_UNDEFINED
at this point (assumed to be nonzero). */
const rw_lock_type_t rw_latch = height
&& latch_mode != BTR_MODIFY_TREE
? upper_rw_latch : RW_NO_LATCH;
/* The last argument is true only for leaf pages of non-clustered
indexes. */
buf_block_t* block = buf_page_get_gen(page_id, zip_size,
rw_latch, NULL, BUF_GET,
mtr, &err,
height == 0
&& !index->is_clust());
tree_blocks[n_blocks] = block;
ut_ad((block != NULL) == (err == DB_SUCCESS));
if (!block) {
if (err == DB_DECRYPTION_FAILED) {
btr_decryption_failed(*index);
}
/* Give up; the function returns false because
err != DB_SUCCESS. */
break;
}
page = buf_block_get_frame(block);
if (height == ULINT_UNDEFINED
&& page_is_leaf(page)
&& rw_latch != RW_NO_LATCH
&& rw_latch != root_leaf_rw_latch) {
/* We should retry to get the page, because the root page
is latched with different level as a leaf page. */
ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
ut_ad(rw_latch == RW_S_LATCH);
ut_ad(n_blocks == 0);
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_blocks],
tree_blocks[n_blocks]);
upper_rw_latch = root_leaf_rw_latch;
continue;
}
ut_ad(fil_page_index_page_check(page));
ut_ad(index->id == btr_page_get_index_id(page));
if (height == ULINT_UNDEFINED) {
/* We are in the root node */
height = btr_page_get_level(page);
}
if (height == 0) {
/* The leaf page was requested without a page latch above (or we
are in read-only mode).
NOTE(review): btr_cur_latch_leaves() presumably acquires the leaf
latches required by latch_mode - confirm against the helper. */
if (rw_latch == RW_NO_LATCH
|| srv_read_only_mode) {
btr_cur_latch_leaves(block, latch_mode, cursor,
mtr);
}
/* btr_cur_t::open_leaf() and
btr_cur_search_to_nth_level() release
tree s-latch here.*/
switch (latch_mode) {
case BTR_MODIFY_TREE:
case BTR_CONT_MODIFY_TREE:
case BTR_CONT_SEARCH_TREE:
break;
default:
/* Release the tree s-latch */
if (!srv_read_only_mode) {
mtr_release_s_latch_at_savepoint(
mtr, savepoint,
&index->lock);
}
/* release upper blocks */
for (; n_releases < n_blocks; n_releases++) {
mtr_release_block_at_savepoint(
mtr,
tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
}
}
/* Choose a random user record on the current page. */
page_cursor->block = block;
page_cur_open_on_rnd_user_rec(page_cursor);
if (height == 0) {
/* The cursor is positioned on a leaf page: done. */
break;
}
ut_ad(height > 0);
height--;
/* On a non-leaf page the chosen record is a node pointer. */
node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, page_cursor->index,
offsets, 0, ULINT_UNDEFINED, &heap);
/* If the rec is the first or last in the page for
pessimistic delete intention, it might cause node_ptr insert
for the upper level. We should change the intention and retry.
*/
if (latch_mode == BTR_MODIFY_TREE
&& btr_cur_need_opposite_intention(
page, lock_intention, node_ptr)) {
ut_ad(upper_rw_latch == RW_X_LATCH);
/* release all blocks (including the current one, hence <=) */
for (; n_releases <= n_blocks; n_releases++) {
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
/* Restart the descent from the root with the widened intention. */
lock_intention = BTR_INTENTION_BOTH;
page_id.set_page_no(dict_index_get_page(index));
height = ULINT_UNDEFINED;
n_blocks = 0;
n_releases = 0;
continue;
}
if (latch_mode == BTR_MODIFY_TREE
&& !btr_cur_will_modify_tree(
page_cursor->index, page, lock_intention,
node_ptr, node_ptr_max_size, zip_size, mtr)) {
ut_ad(upper_rw_latch == RW_X_LATCH);
ut_ad(n_releases <= n_blocks);
/* we can release upper blocks */
for (; n_releases < n_blocks; n_releases++) {
if (n_releases == 0) {
/* we should not release root page
to pin to same block. */
continue;
}
/* release unused blocks to unpin */
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
}
/* The next page to be fetched is a leaf (height was decremented
above): latch the blocks that are still held. */
if (height == 0
&& latch_mode == BTR_MODIFY_TREE) {
ut_ad(upper_rw_latch == RW_X_LATCH);
/* we should sx-latch root page, if released already.
It contains seg_header. */
if (n_releases > 0) {
mtr->sx_latch_at_savepoint(
tree_savepoints[0],
tree_blocks[0]);
}
/* x-latch the branch blocks not released yet. */
for (ulint i = n_releases; i <= n_blocks; i++) {
mtr->x_latch_at_savepoint(
tree_savepoints[i],
tree_blocks[i]);
}
}
/* Go to the child node */
page_id.set_page_no(
btr_node_ptr_get_child_page_no(node_ptr, offsets));
n_blocks++;
}
/* rec_get_offsets() may have allocated a heap for the offsets. */
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
return err == DB_SUCCESS;
}
/*==================== B-TREE INSERT =========================*/
/*************************************************************//**
......
......@@ -1079,6 +1079,60 @@ btr_record_not_null_field_in_rec(
}
}
/** Descend from the root page to a randomly chosen leaf page and leave the
cursor on the infimum record of that leaf.
The index latch and every visited page are acquired in shared mode. When the
leaf is not the root, everything registered in the mini-transaction before
the leaf is released again; when the root itself is the leaf, nothing is
released.
@param offsets  temporary memory for rec_get_offsets()
@param heap     memory heap for rec_get_offsets()
@param mtr      mini-transaction; must not contain anything yet
@return DB_SUCCESS when the cursor is on a leaf page
@retval DB_CORRUPTION if the index has no root page, or the level of the
root page exceeds BTR_MAX_LEVELS
@return otherwise, the error reported by btr_block_get() */
inline dberr_t
btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
{
  ut_ad(!index()->is_spatial());
  ut_ad(!mtr.get_savepoint());

  mtr_s_lock_index(index(), &mtr);

  if (index()->page == FIL_NULL)
    return DB_CORRUPTION;

  dberr_t err;
  auto page_no= index()->page;
  /* Passed to btr_block_get(); becomes true only when the next page to be
  fetched is a leaf of a non-clustered index. */
  bool leaf_merge= false;
  /* Level of the page being visited; unknown until the root is read. */
  ulint level= ULINT_UNDEFINED;

  for (;;)
  {
    buf_block_t *const cur_block=
      btr_block_get(*index(), page_no, RW_S_LATCH, leaf_merge, &mtr, &err);
    if (!cur_block)
      return err;

    page_cur.block= cur_block;

    const bool at_root= level == ULINT_UNDEFINED;
    if (at_root)
    {
      level= btr_page_get_level(cur_block->page.frame);
      if (level > BTR_MAX_LEVELS)
        return DB_CORRUPTION;
    }

    if (level == 0)
    {
      /* Keep only the most recently acquired entry (the leaf page).
      A root page that is also the leaf is returned without releasing
      anything. */
      if (!at_root)
        mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1);
      page_cur.rec= page_get_infimum_rec(cur_block->page.frame);
      return DB_SUCCESS;
    }

    level--;
    if (level == 0)
      leaf_merge= !index()->is_clust();

    /* Pick a random node pointer and follow it to the child page. */
    page_cur_open_on_rnd_user_rec(&page_cur);
    offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0,
                             ULINT_UNDEFINED, &heap);
    page_no= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
  }
}
/** Estimated table level stats from sampled value.
@param value sampled stats
@param index index being sampled
......@@ -1107,7 +1161,6 @@ std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index,
trx_id_t bulk_trx_id)
{
btr_cur_t cursor;
page_t* page;
rec_t* rec;
ulint n_cols;
......@@ -1222,14 +1275,15 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index,
ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
/* We sample some pages in the index to get an estimate */
btr_cur_t cursor;
cursor.page_cur.index = index;
for (ulint i = 0; i < n_sample_pages; i++) {
mtr.start();
if (!btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
&cursor, &mtr)
|| index->table->bulk_trx_id != bulk_trx_id
|| !index->is_readable()) {
if (cursor.open_random_leaf(offsets_rec, heap, mtr) !=
DB_SUCCESS
|| index->table->bulk_trx_id != bulk_trx_id) {
mtr.commit();
goto exit_loop;
}
......@@ -1242,9 +1296,8 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index,
page = btr_cur_get_page(&cursor);
rec = page_rec_get_next(page_get_infimum_rec(page));
const ulint n_core = page_is_leaf(page)
? index->n_core_fields : 0;
rec = page_rec_get_next(cursor.page_cur.rec);
const ulint n_core = index->n_core_fields;
if (rec && !page_rec_is_supremum(rec)) {
not_empty_flag = 1;
......
......@@ -2416,36 +2416,26 @@ will be merged from ibuf trees to the pages read
ulint ibuf_contract()
{
mtr_t mtr;
btr_pcur_t pcur;
btr_cur_t cur;
ulint sum_sizes;
uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
ibuf_mtr_start(&mtr);
/* Open a cursor to a randomly chosen leaf of the tree, at a random
position within the leaf */
pcur.pos_state = BTR_PCUR_IS_POSITIONED;
pcur.old_rec = nullptr;
pcur.trx_if_known = nullptr;
pcur.search_mode = PAGE_CUR_G;
pcur.latch_mode = BTR_SEARCH_LEAF;
btr_pcur_init(&pcur);
if (!btr_cur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF,
btr_pcur_get_btr_cur(&pcur), &mtr)) {
if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) !=
DB_SUCCESS) {
return 0;
}
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index));
if (page_is_empty(btr_pcur_get_page(&pcur))) {
if (page_is_empty(btr_cur_get_page(&cur))) {
/* If a B-tree page is empty, it must be the root page
and the whole B-tree must be empty. InnoDB does not
allow empty B-tree pages other than the root. */
ut_ad(ibuf.empty);
ut_ad(btr_pcur_get_block(&pcur)->page.id()
ut_ad(btr_cur_get_block(&cur)->page.id()
== page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
ibuf_mtr_commit(&mtr);
......@@ -2455,7 +2445,7 @@ ulint ibuf_contract()
ulint n_pages = 0;
sum_sizes = ibuf_get_merge_page_nos(TRUE,
btr_pcur_get_rec(&pcur), &mtr,
btr_cur_get_rec(&cur), &mtr,
space_ids,
page_nos, &n_pages);
ibuf_mtr_commit(&mtr);
......
......@@ -172,17 +172,6 @@ dberr_t btr_cur_search_to_nth_level(ulint level,
btr_cur_t *cursor, mtr_t *mtr,
ib_uint64_t autoinc= 0);
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree.
@return true if the index is available and we have put the cursor, false
if the index is unavailable (its root page number is FIL_NULL) or a page
could not be fetched from the buffer pool */
bool
btr_cur_open_at_rnd_pos(
dict_index_t* index, /*!< in: index */
btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((nonnull,warn_unused_result));
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
......@@ -813,6 +802,14 @@ struct btr_cur_t {
@return error code */
dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
mtr_t *mtr);
/** Open the cursor on a randomly chosen leaf page.
On success the cursor is positioned on the infimum record of that leaf
page, not on a random record within it.
@param offsets temporary memory for rec_get_offsets()
@param heap memory heap for rec_get_offsets()
@param mtr mini-transaction
@return error code
@retval DB_SUCCESS if the cursor was positioned on a leaf page
@retval DB_CORRUPTION if the index has no root page or the root page
level is out of bounds */
inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
mtr_t &mtr);
};
/** Modify the delete-mark flag of a record.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment