Commit f99cace7 authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-22924 Corruption in MVCC read via secondary index

An unsafe optimization was introduced by
commit 2347ffd8 (MDEV-20301)
which is based on
mysql/mysql-server@3f3136188f1bd383f77f97823cf6ebd72d5e4d7e or
mysql/mysql-server@647a3814a91c3d3bffc70ddff5513398e3f37bd4
in MySQL 8.0.12 or MySQL 8.0.13
(which in turn is based on the contribution in MySQL Bug #84958).

Row_sel_get_clust_rec_for_mysql::operator(): In addition to checking
that the pointer to the record matches, also check the latest
modification of the page (FIL_PAGE_LSN) as well as the page identifier.
Only if all three match, it is safe to reuse cached_old_vers.

Row_sel_get_clust_rec_for_mysql::check_eq(): Assert that the PRIMARY KEY
of the cached old version of the record corresponds to the latest version.

We got a test case where CHECK TABLE, UPDATE and purge would be
hammering on the same table (with only 6 rows) and a pointer that
was originally pointing to a record pk=2 would match a cached_clust_rec
that was pointing to a record pk=1. In the diagnosed `rr replay` trace,
we would wrongly return an old cached version of the pk=1 record,
instead of retrieving the correct version of the pk=2 record. Because
of this, CHECK TABLE would fail to count one of the records in a
secondary index, and report failure.

This bug appears to affect MVCC reads via secondary indexes only.
The purge of history in secondary indexes uses a different code path,
and so do checks for implicit record locks.
parent 9dedba16
......@@ -1014,6 +1014,25 @@ struct dict_index_t{
/** @return whether the index includes virtual columns */
bool has_virtual() const { return type & DICT_VIRTUAL; }
/** @return the position of DB_TRX_ID */
uint16_t db_trx_id() const {
DBUG_ASSERT(is_primary());
DBUG_ASSERT(n_uniq);
return n_uniq;
}
/** @return the position of DB_ROLL_PTR */
uint16_t db_roll_ptr() const
{
return static_cast<uint16_t>(db_trx_id() + 1);
}
/** @return the offset of the metadata BLOB field,
or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
uint16_t first_user_field() const
{
return static_cast<uint16_t>(db_trx_id() + 2);
}
/** @return whether the index is corrupted */
inline bool is_corrupted() const;
......
......@@ -3303,20 +3303,46 @@ row_sel_build_prev_vers_for_mysql(
return(err);
}
/** Helper class to cache clust_rec and old_ver */
/** Helper class to cache clust_rec and old_vers */
class Row_sel_get_clust_rec_for_mysql
{
const rec_t *cached_clust_rec;
rec_t *cached_old_vers;
const rec_t *cached_clust_rec;
rec_t *cached_old_vers;
lsn_t cached_lsn;
page_id_t cached_page_id;
public:
Row_sel_get_clust_rec_for_mysql() :
cached_clust_rec(NULL), cached_old_vers(NULL) {}
#ifdef UNIV_DEBUG
void check_eq(const dict_index_t *index, const rec_offs *offsets) const
{
rec_offs vers_offs[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS];
rec_offs_init(vers_offs);
mem_heap_t *heap= nullptr;
ut_ad(rec_offs_validate(cached_clust_rec, index, offsets));
ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets));
ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs, true,
index->db_trx_id(), &heap));
ut_ad(!heap);
for (auto n= index->db_trx_id(); n--; )
{
const dict_col_t *col= dict_index_get_nth_col(index, n);
ulint len1, len2;
const byte *b1= rec_get_nth_field(cached_clust_rec, offsets, n, &len1);
const byte *b2= rec_get_nth_field(cached_old_vers, vers_offs, n, &len2);
ut_ad(!cmp_data_data(col->mtype, col->prtype, b1, len1, b2, len2));
}
}
#endif
dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index,
const rec_t *rec, que_thr_t *thr, const rec_t **out_rec,
rec_offs **offsets, mem_heap_t **offset_heap,
dtuple_t **vrow, mtr_t *mtr);
public:
Row_sel_get_clust_rec_for_mysql() :
cached_clust_rec(NULL), cached_old_vers(NULL), cached_lsn(0),
cached_page_id(page_id_t(0,0)) {}
dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index,
const rec_t *rec, que_thr_t *thr, const rec_t **out_rec,
rec_offs **offsets, mem_heap_t **offset_heap,
dtuple_t **vrow, mtr_t *mtr);
};
/*********************************************************************//**
......@@ -3516,8 +3542,18 @@ Row_sel_get_clust_rec_for_mysql::operator()(
&& !lock_clust_rec_cons_read_sees(
clust_rec, clust_index, *offsets,
trx_get_read_view(trx))) {
const buf_page_t& bpage = btr_pcur_get_block(
prebuilt->clust_pcur)->page;
lsn_t lsn = bpage.newest_modification;
if (!lsn) {
lsn = mach_read_from_8(
page_align(clust_rec) + FIL_PAGE_LSN);
}
if (clust_rec != cached_clust_rec) {
if (lsn != cached_lsn
|| bpage.id != cached_page_id
|| clust_rec != cached_clust_rec) {
/* The following call returns 'offsets' associated with
'old_vers' */
err = row_sel_build_prev_vers_for_mysql(
......@@ -3529,6 +3565,8 @@ Row_sel_get_clust_rec_for_mysql::operator()(
goto err_exit;
}
cached_lsn = lsn;
cached_page_id = bpage.id;
cached_clust_rec = clust_rec;
cached_old_vers = old_vers;
} else {
......@@ -3539,7 +3577,8 @@ Row_sel_get_clust_rec_for_mysql::operator()(
version of clust_rec and its old version
old_vers. Re-calculate the offsets for old_vers. */
if (old_vers != NULL) {
if (old_vers) {
ut_d(check_eq(clust_index, *offsets));
*offsets = rec_get_offsets(
old_vers, clust_index, *offsets,
true, ULINT_UNDEFINED, offset_heap);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment