Commit 40630774 authored by marko

branches/zip: Write PAGE_MAX_TRX_ID to the redo log. Otherwise,
transactions that are started before the rollback of incomplete
transactions has finished may have an inconsistent view of the
secondary indexes.

dict_index_is_sec_or_ibuf(): Auxiliary function for controlling
updates and checks of PAGE_MAX_TRX_ID: check whether an index is a
secondary index or the insert buffer tree.

page_set_max_trx_id(), page_update_max_trx_id(),
lock_rec_insert_check_and_lock(),
lock_sec_rec_modify_check_and_lock(), btr_cur_ins_lock_and_undo(),
btr_cur_upd_lock_and_undo(): Add the parameter mtr.

page_set_max_trx_id(): Allow mtr to be NULL.  When mtr==NULL, do not
attempt to write to the redo log.  This only occurs when creating a
page or reorganizing a compressed page.  In these cases, the
PAGE_MAX_TRX_ID will be set correctly during the application of redo
log records, even though there is no explicit log record about it.

btr_discard_only_page_on_level(): Preserve PAGE_MAX_TRX_ID.  This
function should be unreachable, though.

btr_cur_pessimistic_update(): Update PAGE_MAX_TRX_ID.

Add some assertions for checking that PAGE_MAX_TRX_ID is set on all
secondary index leaf pages.

rb://115 tested by Michael, fixes Issue #211
parent 91cd92c7
2009-05-19 The InnoDB Team
* btr/btr0btr.c, btr/btr0cur.c, lock/lock0lock.c,
include/page0page.ic, include/lock0lock.h, include/dict0dict.h,
include/page0page.h, include/dict0dict.ic, ibuf/ibuf0ibuf.c,
page/page0zip.c, page/page0page.c:
Write updates of PAGE_MAX_TRX_ID to the redo log and add debug
assertions for checking that PAGE_MAX_TRX_ID is valid on leaf
pages of secondary indexes and the insert buffer B-tree. This bug
could cause failures in secondary index lookups in consistent
reads right after crash recovery.
2009-05-18 The InnoDB Team
* btr/btr0cur.c:
......
......@@ -998,8 +998,16 @@ btr_page_reorganize_low(
page_copy_rec_list_end_no_locks(block, temp_block,
page_get_infimum_rec(temp_page),
index, mtr);
if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
/* Copy max trx id to recreated page */
page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page));
trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
page_set_max_trx_id(block, NULL, max_trx_id, mtr);
/* In crash recovery, dict_index_is_sec_or_ibuf() always
returns TRUE, even for clustered indexes. max_trx_id is
unused in clustered index pages. */
ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
}
if (UNIV_LIKELY_NULL(page_zip)
&& UNIV_UNLIKELY
......@@ -2758,6 +2766,10 @@ btr_discard_only_page_on_level(
mtr_t* mtr) /* in: mtr */
{
ulint page_level = 0;
trx_id_t max_trx_id;
/* Save the PAGE_MAX_TRX_ID from the leaf page. */
max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
btr_cur_t cursor;
......@@ -2800,9 +2812,16 @@ btr_discard_only_page_on_level(
btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
/* We play it safe and reset the free bits for the root */
if (!dict_index_is_clust(index)) {
/* We play it safe and reset the free bits for the root */
ibuf_reset_free_bits(block);
if (page_is_leaf(buf_block_get_frame(block))) {
ut_a(!ut_dulint_is_zero(max_trx_id));
page_set_max_trx_id(block,
buf_block_get_page_zip(block),
max_trx_id, mtr);
}
}
}
......
......@@ -939,6 +939,7 @@ btr_cur_ins_lock_and_undo(
btr_cur_t* cursor, /* in: cursor on page after which to insert */
const dtuple_t* entry, /* in: entry to insert */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr, /* in/out: mini-transaction */
ibool* inherit)/* out: TRUE if the inserted new record maybe
should inherit LOCK_GAP type locks from the
successor record */
......@@ -956,7 +957,7 @@ btr_cur_ins_lock_and_undo(
err = lock_rec_insert_check_and_lock(flags, rec,
btr_cur_get_block(cursor),
index, thr, inherit);
index, thr, mtr, inherit);
if (err != DB_SUCCESS) {
......@@ -1170,7 +1171,8 @@ fail_err:
}
/* Check locks and write to the undo log, if specified */
err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &inherit);
err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
thr, mtr, &inherit);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
......@@ -1344,7 +1346,8 @@ btr_cur_pessimistic_insert(
/* Retry with a pessimistic insert. Check locks and write to undo log,
if specified */
err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &dummy_inh);
err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
thr, mtr, &dummy_inh);
if (err != DB_SUCCESS) {
......@@ -1439,6 +1442,7 @@ btr_cur_upd_lock_and_undo(
ulint cmpl_info,/* in: compiler info on secondary index
updates */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr, /* in/out: mini-transaction */
roll_ptr_t* roll_ptr)/* out: roll pointer */
{
dict_index_t* index;
......@@ -1455,7 +1459,7 @@ btr_cur_upd_lock_and_undo(
record */
return(lock_sec_rec_modify_check_and_lock(
flags, btr_cur_get_block(cursor), rec,
index, thr));
index, thr, mtr));
}
/* Check if we have to wait for a lock: enqueue an explicit lock
......@@ -1736,7 +1740,7 @@ btr_cur_update_in_place(
/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, &roll_ptr);
thr, mtr, &roll_ptr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
if (UNIV_LIKELY_NULL(heap)) {
......@@ -1953,8 +1957,8 @@ any_extern:
}
/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr,
&roll_ptr);
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, mtr, &roll_ptr);
if (err != DB_SUCCESS) {
err_exit:
mem_heap_free(heap);
......@@ -2133,7 +2137,7 @@ btr_cur_pessimistic_update(
/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, &roll_ptr);
thr, mtr, &roll_ptr);
if (err != DB_SUCCESS) {
return(err);
......@@ -2308,6 +2312,19 @@ make_external:
ut_a(err == DB_SUCCESS);
ut_a(dummy_big_rec == NULL);
if (dict_index_is_sec_or_ibuf(index)) {
/* Update PAGE_MAX_TRX_ID in the index page header.
It was not updated by btr_cur_pessimistic_insert()
because of BTR_NO_LOCKING_FLAG. */
buf_block_t* rec_block;
rec_block = btr_cur_get_block(cursor);
page_update_max_trx_id(rec_block,
buf_block_get_page_zip(rec_block),
trx->id, mtr);
}
if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
/* The new inserted record owns its possible externally
stored fields */
......@@ -2683,7 +2700,7 @@ btr_cur_del_mark_set_sec_rec(
err = lock_sec_rec_modify_check_and_lock(flags,
btr_cur_get_block(cursor),
rec, cursor->index, thr);
rec, cursor->index, thr, mtr);
if (err != DB_SUCCESS) {
return(err);
......
......@@ -2692,7 +2692,7 @@ ibuf_insert_low(
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
thr_get_trx(thr)->id);
thr_get_trx(thr)->id, &mtr);
}
} else {
ut_ad(mode == BTR_MODIFY_TREE);
......@@ -2712,7 +2712,7 @@ ibuf_insert_low(
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
thr_get_trx(thr)->id);
thr_get_trx(thr)->id, &mtr);
}
ibuf_size_update(root, &mtr);
......@@ -3318,7 +3318,8 @@ loop:
dict_index_t* dummy_index;
max_trx_id = page_get_max_trx_id(page_align(rec));
page_update_max_trx_id(block, page_zip, max_trx_id);
page_update_max_trx_id(block, page_zip, max_trx_id,
&mtr);
entry = ibuf_build_entry_from_ibuf_rec(
rec, heap, &dummy_index);
......
......@@ -568,6 +568,16 @@ dict_index_is_ibuf(
zero for other indexes */
const dict_index_t* index) /* in: index */
__attribute__((pure));
/************************************************************************
Check whether the index is a secondary index or the insert buffer tree. */
UNIV_INLINE
ulint
dict_index_is_sec_or_ibuf(
/*======================*/
/* out: nonzero for a secondary index
or the insert buffer tree,
zero for other indexes */
const dict_index_t* index) /* in: index */
__attribute__((pure));
/************************************************************************
Gets the number of user-defined columns in a table in the dictionary
......
......@@ -245,6 +245,26 @@ dict_index_is_ibuf(
return(UNIV_UNLIKELY(index->type & DICT_IBUF));
}
/************************************************************************
Tell whether an index is a secondary index or the insert buffer tree,
that is, anything other than a clustered index that is not the insert
buffer tree. */
UNIV_INLINE
ulint
dict_index_is_sec_or_ibuf(
/*======================*/
					/* out: nonzero for a secondary index
					or the insert buffer tree, zero for
					other indexes */
	const dict_index_t*	index)	/* in: index */
{
	ulint	idx_type;
	ulint	clustered_flag;
	ulint	ibuf_flag;

	ut_ad(index);
	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);

	/* Read the type bits once and test the two relevant flags. */
	idx_type = index->type;
	clustered_flag = idx_type & DICT_CLUSTERED;
	ibuf_flag = idx_type & DICT_IBUF;

	/* Zero is returned only when DICT_CLUSTERED is set and DICT_IBUF
	is not; a nonzero result is hinted as the likely outcome. */
	return(UNIV_LIKELY(!clustered_flag || ibuf_flag));
}
/************************************************************************
Gets the number of user-defined columns in a table in the dictionary
cache. */
......
......@@ -28,6 +28,7 @@ Created 5/7/1996 Heikki Tuuri
#include "univ.i"
#include "buf0types.h"
#include "trx0types.h"
#include "mtr0types.h"
#include "rem0types.h"
#include "dict0types.h"
#include "que0types.h"
......@@ -288,10 +289,11 @@ lock_rec_insert_check_and_lock(
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is
set, does nothing */
rec_t* rec, /* in: record after which to insert */
const rec_t* rec, /* in: record after which to insert */
buf_block_t* block, /* in/out: buffer block of rec */
dict_index_t* index, /* in: index */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr, /* in/out: mini-transaction */
ibool* inherit);/* out: set to TRUE if the new
inserted record maybe should inherit
LOCK_GAP type locks from the successor
......@@ -330,13 +332,14 @@ lock_sec_rec_modify_check_and_lock(
ulint flags, /* in: if BTR_NO_LOCKING_FLAG
bit is set, does nothing */
buf_block_t* block, /* in/out: buffer block of rec */
rec_t* rec, /* in: record which should be
const rec_t* rec, /* in: record which should be
modified; NOTE: as this is a secondary
index, we always have to modify the
clustered index record first: see the
comment below */
dict_index_t* index, /* in: secondary index */
que_thr_t* thr); /* in: query thread */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr); /* in/out: mini-transaction */
/*************************************************************************
Like the counterpart for a clustered index below, but now we read a
secondary index record. */
......
......@@ -189,7 +189,8 @@ page_set_max_trx_id(
/*================*/
buf_block_t* block, /* in/out: page */
page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */
trx_id_t trx_id);/* in: transaction id */
trx_id_t trx_id, /* in: transaction id */
mtr_t* mtr); /* in/out: mini-transaction, or NULL */
/*****************************************************************
Sets the max trx id field value if trx_id is bigger than the previous
value. */
......@@ -200,7 +201,8 @@ page_update_max_trx_id(
buf_block_t* block, /* in/out: page */
page_zip_des_t* page_zip,/* in/out: compressed page whose
uncompressed part will be updated, or NULL */
trx_id_t trx_id);/* in: transaction id */
trx_id_t trx_id, /* in: transaction id */
mtr_t* mtr); /* in/out: mini-transaction */
/*****************************************************************
Reads the given header field. */
UNIV_INLINE
......
......@@ -23,6 +23,9 @@ Created 2/2/1994 Heikki Tuuri
*******************************************************/
#include "mach0data.h"
#ifdef UNIV_DEBUG
# include "log0recv.h"
#endif /* UNIV_DEBUG */
#ifndef UNIV_HOTBACKUP
# include "rem0cmp.h"
#endif /* !UNIV_HOTBACKUP */
......@@ -79,14 +82,24 @@ page_update_max_trx_id(
buf_block_t* block, /* in/out: page */
page_zip_des_t* page_zip,/* in/out: compressed page whose
uncompressed part will be updated, or NULL */
trx_id_t trx_id) /* in: transaction id */
trx_id_t trx_id, /* in: transaction id */
mtr_t* mtr) /* in/out: mini-transaction */
{
ut_ad(block);
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
/* During crash recovery, this function may be called on
something else than a leaf page of a secondary index or the
insert buffer index tree (dict_index_is_sec_or_ibuf() returns
TRUE for the dummy indexes constructed during redo log
application). In that case, PAGE_MAX_TRX_ID is unused,
and trx_id is usually zero. */
ut_ad(!ut_dulint_is_zero(trx_id) || recv_recovery_is_on());
ut_ad(page_is_leaf(buf_block_get_frame(block)));
if (ut_dulint_cmp(page_get_max_trx_id(buf_block_get_frame(block)),
trx_id) < 0) {
page_set_max_trx_id(block, page_zip, trx_id);
page_set_max_trx_id(block, page_zip, trx_id, mtr);
}
}
......
......@@ -563,6 +563,7 @@ lock_sec_rec_cons_read_sees(
}
max_trx_id = page_get_max_trx_id(page_align(rec));
ut_ad(!ut_dulint_is_zero(max_trx_id));
return(ut_dulint_cmp(max_trx_id, view->up_limit_id) < 0);
}
......@@ -4923,10 +4924,11 @@ lock_rec_insert_check_and_lock(
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is
set, does nothing */
rec_t* rec, /* in: record after which to insert */
const rec_t* rec, /* in: record after which to insert */
buf_block_t* block, /* in/out: buffer block of rec */
dict_index_t* index, /* in: index */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr, /* in/out: mini-transaction */
ibool* inherit)/* out: set to TRUE if the new
inserted record maybe should inherit
LOCK_GAP type locks from the successor
......@@ -4946,7 +4948,7 @@ lock_rec_insert_check_and_lock(
}
trx = thr_get_trx(thr);
next_rec = page_rec_get_next(rec);
next_rec = page_rec_get_next((rec_t*) rec);
next_rec_heap_no = page_rec_get_heap_no(next_rec);
lock_mutex_enter_kernel();
......@@ -4969,7 +4971,7 @@ lock_rec_insert_check_and_lock(
/* Update the page max trx id field */
page_update_max_trx_id(block,
buf_block_get_page_zip(block),
trx->id);
trx->id, mtr);
}
*inherit = FALSE;
......@@ -5008,7 +5010,7 @@ lock_rec_insert_check_and_lock(
/* Update the page max trx id field */
page_update_max_trx_id(block,
buf_block_get_page_zip(block),
trx->id);
trx->id, mtr);
}
#ifdef UNIV_DEBUG
......@@ -5144,13 +5146,14 @@ lock_sec_rec_modify_check_and_lock(
ulint flags, /* in: if BTR_NO_LOCKING_FLAG
bit is set, does nothing */
buf_block_t* block, /* in/out: buffer block of rec */
rec_t* rec, /* in: record which should be
const rec_t* rec, /* in: record which should be
modified; NOTE: as this is a secondary
index, we always have to modify the
clustered index record first: see the
comment below */
dict_index_t* index, /* in: secondary index */
que_thr_t* thr) /* in: query thread */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in/out: mini-transaction */
{
ulint err;
ulint heap_no;
......@@ -5199,7 +5202,7 @@ lock_sec_rec_modify_check_and_lock(
/* Update the page max trx id field */
page_update_max_trx_id(block,
buf_block_get_page_zip(block),
thr_get_trx(thr)->id);
thr_get_trx(thr)->id, mtr);
}
return(err);
......
......@@ -209,7 +209,8 @@ page_set_max_trx_id(
/*================*/
buf_block_t* block, /* in/out: page */
page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */
trx_id_t trx_id) /* in: transaction id */
trx_id_t trx_id, /* in: transaction id */
mtr_t* mtr) /* in/out: mini-transaction, or NULL */
{
page_t* page = buf_block_get_frame(block);
#ifndef UNIV_HOTBACKUP
......@@ -218,17 +219,24 @@ page_set_max_trx_id(
if (is_hashed) {
rw_lock_x_lock(&btr_search_latch);
}
ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
#endif /* !UNIV_HOTBACKUP */
/* It is not necessary to write this change to the redo log, as
during a database recovery we assume that the max trx id of every
page is the maximum trx id assigned before the crash. */
mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
if (UNIV_LIKELY_NULL(page_zip)) {
mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
page_zip_write_header(page_zip,
page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
8, NULL);
8, mtr);
} else if (mtr) {
mlog_write_dulint(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
trx_id, mtr);
} else {
mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
}
#ifndef UNIV_HOTBACKUP
......@@ -447,7 +455,7 @@ page_create_low(
page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION);
page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
page_header_set_field(page, NULL, PAGE_N_RECS, 0);
page_set_max_trx_id(block, NULL, ut_dulint_zero);
page_set_max_trx_id(block, NULL, ut_dulint_zero, NULL);
memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START
- page_offset(heap_top));
......@@ -692,8 +700,10 @@ page_copy_rec_list_end(
lock_move_rec_list_end(new_block, block, rec);
if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
page_update_max_trx_id(new_block, new_page_zip,
page_get_max_trx_id(page));
page_get_max_trx_id(page), mtr);
}
btr_search_move_or_delete_hash_entries(new_block, block, index);
......@@ -803,8 +813,12 @@ page_copy_rec_list_start(
/* Update MAX_TRX_ID, the lock table, and possible hash index */
if (dict_index_is_sec_or_ibuf(index)
&& page_is_leaf(page_align(rec))) {
page_update_max_trx_id(new_block, new_page_zip,
page_get_max_trx_id(page_align(rec)));
page_get_max_trx_id(page_align(rec)),
mtr);
}
lock_move_rec_list_start(new_block, block, rec, ret);
......
......@@ -273,6 +273,8 @@ page_zip_compress_write_log(
byte* log_ptr;
ulint trailer_size;
ut_ad(!dict_index_is_ibuf(index));
log_ptr = mlog_open(mtr, 11 + 2 + 2);
if (!log_ptr) {
......@@ -346,6 +348,7 @@ page_zip_get_n_prev_extern(
ut_ad(page_is_comp(page));
ut_ad(dict_table_is_comp(index->table));
ut_ad(dict_index_is_clust(index));
ut_ad(!dict_index_is_ibuf(index));
heap_no = rec_get_heap_no_new(rec);
ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
......@@ -1137,6 +1140,8 @@ page_zip_compress(
ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
ut_ad(page_simple_validate_new((page_t*) page));
ut_ad(page_zip_simple_validate(page_zip));
ut_ad(dict_table_is_comp(index->table));
ut_ad(!dict_index_is_ibuf(index));
UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
......@@ -4369,6 +4374,7 @@ page_zip_reorganize(
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
ut_ad(page_is_comp(page));
ut_ad(!dict_index_is_ibuf(index));
/* Note that page_zip_validate(page_zip, page) may fail here. */
UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
......@@ -4400,8 +4406,13 @@ page_zip_reorganize(
page_copy_rec_list_end_no_locks(block, temp_block,
page_get_infimum_rec(temp_page),
index, mtr);
if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) {
/* Copy max trx id to recreated page */
page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page));
trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
page_set_max_trx_id(block, NULL, max_trx_id, NULL);
ut_ad(!ut_dulint_is_zero(max_trx_id));
}
/* Restore logging. */
mtr_set_log_mode(mtr, log_mode);
......@@ -4446,6 +4457,7 @@ page_zip_copy_recs(
{
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
ut_ad(!dict_index_is_ibuf(index));
#ifdef UNIV_ZIP_DEBUG
/* The B-tree operations that call this function may set
FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
......@@ -4459,6 +4471,11 @@ page_zip_copy_recs(
ut_a(dict_index_is_clust(index));
}
/* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary
indexes. It does not matter on other pages. */
ut_a(dict_index_is_clust(index) || !page_is_leaf(src)
|| !ut_dulint_is_zero(page_get_max_trx_id(src)));
UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip));
UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment