Commit 94db78ce authored by heikki@donna.mysql.fi

srv0srv.h Support raw disk partitions as data files

srv0start.c	Support raw disk partitions as data files
srv0srv.c	Support raw disk partitions as data files
row0purge.c	< 4 GB rows, doublewrite, hang fixes
row0row.c	< 4 GB rows, doublewrite, hang fixes
row0sel.c	< 4 GB rows, doublewrite, hang fixes
row0uins.c	< 4 GB rows, doublewrite, hang fixes
row0umod.c	< 4 GB rows, doublewrite, hang fixes
row0undo.c	< 4 GB rows, doublewrite, hang fixes
row0upd.c	< 4 GB rows, doublewrite, hang fixes
srv0srv.c	< 4 GB rows, doublewrite, hang fixes
srv0start.c	< 4 GB rows, doublewrite, hang fixes
sync0rw.c	< 4 GB rows, doublewrite, hang fixes
sync0sync.c	< 4 GB rows, doublewrite, hang fixes
trx0purge.c	< 4 GB rows, doublewrite, hang fixes
trx0rec.c	< 4 GB rows, doublewrite, hang fixes
trx0sys.c	< 4 GB rows, doublewrite, hang fixes
btr0btr.c	< 4 GB rows, doublewrite, hang fixes
btr0cur.c	< 4 GB rows, doublewrite, hang fixes
buf0buf.c	< 4 GB rows, doublewrite, hang fixes
buf0flu.c	< 4 GB rows, doublewrite, hang fixes
buf0rea.c	< 4 GB rows, doublewrite, hang fixes
data0data.c	< 4 GB rows, doublewrite, hang fixes
fil0fil.c	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.c	< 4 GB rows, doublewrite, hang fixes
ibuf0ibuf.c	< 4 GB rows, doublewrite, hang fixes
lock0lock.c	< 4 GB rows, doublewrite, hang fixes
log0log.c	< 4 GB rows, doublewrite, hang fixes
log0recv.c	< 4 GB rows, doublewrite, hang fixes
os0file.c	< 4 GB rows, doublewrite, hang fixes
page0cur.c	< 4 GB rows, doublewrite, hang fixes
pars0pars.c	< 4 GB rows, doublewrite, hang fixes
rem0cmp.c	< 4 GB rows, doublewrite, hang fixes
rem0rec.c	< 4 GB rows, doublewrite, hang fixes
row0ins.c	< 4 GB rows, doublewrite, hang fixes
row0mysql.c	< 4 GB rows, doublewrite, hang fixes
univ.i  	< 4 GB rows, doublewrite, hang fixes
data0data.ic	< 4 GB rows, doublewrite, hang fixes
mach0data.ic	< 4 GB rows, doublewrite, hang fixes
rem0rec.ic	< 4 GB rows, doublewrite, hang fixes
row0upd.ic	< 4 GB rows, doublewrite, hang fixes
trx0rec.ic	< 4 GB rows, doublewrite, hang fixes
rem0cmp.h	< 4 GB rows, doublewrite, hang fixes
rem0rec.h	< 4 GB rows, doublewrite, hang fixes
row0ins.h	< 4 GB rows, doublewrite, hang fixes
row0mysql.h	< 4 GB rows, doublewrite, hang fixes
row0row.h	< 4 GB rows, doublewrite, hang fixes
row0upd.h	< 4 GB rows, doublewrite, hang fixes
srv0srv.h	< 4 GB rows, doublewrite, hang fixes
sync0sync.h	< 4 GB rows, doublewrite, hang fixes
trx0rec.h	< 4 GB rows, doublewrite, hang fixes
trx0sys.h	< 4 GB rows, doublewrite, hang fixes
trx0types.h	< 4 GB rows, doublewrite, hang fixes
trx0undo.h	< 4 GB rows, doublewrite, hang fixes
ut0dbg.h	< 4 GB rows, doublewrite, hang fixes
ut0ut.h 	< 4 GB rows, doublewrite, hang fixes
btr0btr.h	< 4 GB rows, doublewrite, hang fixes
btr0cur.h	< 4 GB rows, doublewrite, hang fixes
buf0buf.h	< 4 GB rows, doublewrite, hang fixes
buf0flu.h	< 4 GB rows, doublewrite, hang fixes
data0data.h	< 4 GB rows, doublewrite, hang fixes
dict0mem.h	< 4 GB rows, doublewrite, hang fixes
fil0fil.h	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.h	< 4 GB rows, doublewrite, hang fixes
os0file.h	< 4 GB rows, doublewrite, hang fixes
parent 596d69b5
......@@ -71,30 +71,6 @@ btr_page_create(
dict_tree_t* tree, /* in: index tree */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Allocates a new file page to be used in an index tree. */
static
page_t*
btr_page_alloc(
/*===========*/
/* out: new allocated page,
x-latched */
dict_tree_t* tree, /* in: index tree */
ulint hint_page_no, /* in: hint of a good page */
byte file_direction, /* in: direction where a possible
page split is made */
ulint level, /* in: level where the page is placed
in the tree */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Frees a file page used in an index tree. */
static
void
btr_page_free(
/*==========*/
dict_tree_t* tree, /* in: index tree */
page_t* page, /* in, own: page to be freed */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
......@@ -319,11 +295,12 @@ btr_page_alloc_for_ibuf(
/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! */
static
page_t*
btr_page_alloc(
/*===========*/
/* out: new allocated page, x-latched */
/* out: new allocated page, x-latched;
NULL if out of space */
dict_tree_t* tree, /* in: index tree */
ulint hint_page_no, /* in: hint of a good page */
byte file_direction, /* in: direction where a possible
......@@ -358,7 +335,10 @@ btr_page_alloc(
new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
file_direction, TRUE, mtr);
ut_a(new_page_no != FIL_NULL);
if (new_page_no == FIL_NULL) {
return(NULL);
}
new_page = buf_page_get(dict_tree_get_space(tree), new_page_no,
RW_X_LATCH, mtr);
......@@ -435,20 +415,22 @@ btr_page_free_for_ibuf(
}
/******************************************************************
Frees a file page used in an index tree. */
static
Frees a file page used in an index tree. Can be used also to (BLOB)
external storage pages, because the page level 0 can be given as an
argument. */
void
btr_page_free(
/*==========*/
btr_page_free_low(
/*==============*/
dict_tree_t* tree, /* in: index tree */
page_t* page, /* in: page to be freed, x-latched */
ulint level, /* in: page level */
mtr_t* mtr) /* in: mtr */
{
fseg_header_t* seg_header;
page_t* root;
ulint space;
ulint page_no;
ulint level;
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
......@@ -465,8 +447,6 @@ btr_page_free(
}
root = btr_root_get(tree, mtr);
level = btr_page_get_level(page, mtr);
if (level == 0) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
......@@ -480,6 +460,26 @@ btr_page_free(
fseg_free_page(seg_header, space, page_no, mtr);
}
/******************************************************************
Releases a file page that belongs to an index tree. The page level is
looked up from the page itself, and therefore this function cannot be used
to free field external storage pages: the page must contain info on its
level. */
void
btr_page_free(
/*==========*/
	dict_tree_t*	tree,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */
	mtr_t*		mtr)	/* in: mtr */
{
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
						MTR_MEMO_PAGE_X_FIX));

	/* The low-level routine takes the level as an explicit argument;
	here it is read from the page */

	btr_page_free_low(tree, page, btr_page_get_level(page, mtr), mtr);
}
/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
......@@ -1276,6 +1276,7 @@ btr_insert_on_non_leaf_level(
dtuple_t* tuple, /* in: the record to be inserted */
mtr_t* mtr) /* in: mtr */
{
big_rec_t* dummy_big_rec;
btr_cur_t cursor;
ulint err;
rec_t* rec;
......@@ -1294,7 +1295,7 @@ btr_insert_on_non_leaf_level(
| BTR_KEEP_SYS_FLAG
| BTR_NO_UNDO_LOG_FLAG,
&cursor, tuple,
&rec, NULL, mtr);
&rec, &dummy_big_rec, NULL, mtr);
ut_a(err == DB_SUCCESS);
}
......
This diff is collapsed.
......@@ -216,14 +216,44 @@ buf_calc_page_checksum(
/* out: checksum */
byte* page) /* in: buffer page */
{
ulint checksum;
ulint checksum;
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN);
checksum = checksum & 0xFFFFFFFF;
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ ut_fold_binary(page + FIL_PAGE_DATA,
UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN);
checksum = checksum & 0xFFFFFFFF;
return(checksum);
return(checksum);
}
/************************************************************************
Tells whether a page image looks corrupt, judging by the lsn stamps in the
page header and in the end lsn field, and by the page checksum. */
ibool
buf_page_is_corrupted(
/*==================*/
				/* out: TRUE if corrupted */
	byte*	read_buf)	/* in: a database page */
{
	byte*	end_lsn_field;
	ulint	trailer_word;

	end_lsn_field = read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN;

	/* The latter 4 bytes of the lsn in the page header must always
	agree with the latter 4 bytes of the end lsn field */

	if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
	    != mach_read_from_4(end_lsn_field + 4)) {

		return(TRUE);
	}

	/* The first 4 bytes of the end lsn field are accepted if they
	hold either the calculated page checksum or the first 4 bytes of
	the lsn stored in the page header */

	trailer_word = mach_read_from_4(end_lsn_field);

	if (trailer_word == buf_calc_page_checksum(read_buf)) {

		return(FALSE);
	}

	if (trailer_word == mach_read_from_4(read_buf + FIL_PAGE_LSN)) {

		return(FALSE);
	}

	return(TRUE);
}
/************************************************************************
......@@ -1265,34 +1295,22 @@ buf_page_io_complete(
dulint id;
dict_index_t* index;
ulint io_type;
ulint checksum;
ut_ad(block);
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
checksum = buf_calc_page_checksum(block->frame);
/* From version 3.23.38 up we store the page checksum
to the 4 upper bytes of the page end lsn field */
if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
!= mach_read_from_4(block->frame + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN + 4))
|| (checksum != mach_read_from_4(block->frame
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN)
&& mach_read_from_4(block->frame + FIL_PAGE_LSN)
!= mach_read_from_4(block->frame
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN))) {
fprintf(stderr,
if (buf_page_is_corrupted(block->frame)) {
fprintf(stderr,
"InnoDB: Database page corruption or a failed\n"
"InnoDB: file read of page %lu.\n", block->offset);
fprintf(stderr,
fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
exit(1);
exit(1);
}
if (recv_recovery_is_on()) {
......@@ -1601,11 +1619,28 @@ void
buf_print_io(void)
/*==============*/
{
ulint size;
ut_ad(buf_pool);
size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
mutex_enter(&(buf_pool->mutex));
printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free));
printf("Flush list length %lu \n",
UT_LIST_GET_LEN(buf_pool->flush_list));
printf("Buffer pool size in pages %lu\n", size);
printf("pages read %lu, created %lu, written %lu\n",
printf("Pending reads %lu \n", buf_pool->n_pend_reads);
printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n",
buf_pool->n_flush[BUF_FLUSH_LRU],
buf_pool->n_flush[BUF_FLUSH_LIST],
buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
printf("Pages read %lu, created %lu, written %lu\n",
buf_pool->n_pages_read, buf_pool->n_pages_created,
buf_pool->n_pages_written);
mutex_exit(&(buf_pool->mutex));
......
/******************************************************
The database buffer buf_pool flush algorithm
(c) 1995 Innobase Oy
(c) 1995-2001 Innobase Oy
Created 11/11/1995 Heikki Tuuri
*******************************************************/
......@@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
......@@ -195,9 +194,145 @@ buf_flush_write_complete(
}
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio we must
call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
buf_block_t* block;
ulint len;
ulint i;
/* If no doublewrite buffer is in use, the only thing to do is to wake
the simulated aio handler threads */
if (trx_doublewrite == NULL) {
os_aio_simulated_wake_handler_threads();
return;
}
/* The doublewrite mutex is held from here until the end of the function,
also over the file writes, the flushes and the wait below */
mutex_enter(&(trx_doublewrite->mutex));
/* Write first to doublewrite buffer blocks. We use synchronous
aio and thus know that file write has been completed when the
control returns. */
/* Nothing has been posted to the memory buffer: return. Note that the
simulated aio handler threads are not woken up on this path. */
if (trx_doublewrite->first_free == 0) {
mutex_exit(&(trx_doublewrite->mutex));
return;
}
/* first_free is the number of pages in the memory buffer. At most
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages from the start of write_buf go to
block1; len is the length of that write in bytes. */
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
} else {
len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
}
fil_io(OS_FILE_WRITE,
TRUE, TRX_SYS_SPACE,
trx_doublewrite->block1, 0, len,
(void*)trx_doublewrite->write_buf, NULL);
/* The pages beyond the first TRX_SYS_DOUBLEWRITE_BLOCK_SIZE ones, if
any, go to block2 */
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = (trx_doublewrite->first_free
- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
fil_io(OS_FILE_WRITE,
TRUE, TRX_SYS_SPACE,
trx_doublewrite->block2, 0, len,
(void*)(trx_doublewrite->write_buf
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
NULL);
}
/* Now flush the doublewrite buffer data to disk */
fil_flush(TRX_SYS_SPACE);
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
blocks. Next do the writes to the intended positions. */
/* Each page is written from its own buffer frame, not from the copy
in write_buf. NOTE(review): the block pointer given as the last
argument is presumably the message handed to the i/o completion
routine - confirm against fil_io. */
for (i = 0; i < trx_doublewrite->first_free; i++) {
block = trx_doublewrite->buf_block_arr[i];
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
}
/* Wake possible simulated aio thread to actually post the
writes to the operating system */
os_aio_simulated_wake_handler_threads();
/* Wait that all async writes to tablespaces have been posted to
the OS */
os_aio_wait_until_no_pending_writes();
/* Now we flush the data to disk (for example, with fsync) */
fil_flush_file_spaces(FIL_TABLESPACE);
/* We can now reuse the doublewrite memory buffer: */
trx_doublewrite->first_free = 0;
mutex_exit(&(trx_doublewrite->mutex));
}
/************************************************************************
Copies a buffer page to the doublewrite memory buffer and records the block
there for writing. If the memory buffer is full, buf_flush_buffered_writes
is called to make free space appear before the page is added; it is called
also right after adding the page which fills the memory buffer. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
	mutex_enter(&(trx_doublewrite->mutex));

	/* As long as there is no free slot, release the mutex, write
	out the buffered pages, and look again */

	while (trx_doublewrite->first_free
				>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {

		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		mutex_enter(&(trx_doublewrite->mutex));
	}

	/* Take a copy of the page to the next free slot and remember
	which block the copy came from */

	ut_memcpy(trx_doublewrite->write_buf
				+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
			block->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
				< 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	/* The page above filled the memory buffer: write it out now */

	mutex_exit(&(trx_doublewrite->mutex));

	buf_flush_buffered_writes();
}
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
......@@ -222,15 +357,24 @@ buf_flush_write_block_low(
mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
block->newest_modification);
/* Write to the page the space id and page number */
mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space);
mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset);
/* We overwrite the first 4 bytes of the end lsn field to store
a page checksum */
mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
buf_calc_page_checksum(block->frame));
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
if (!trx_doublewrite) {
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
} else {
buf_flush_post_to_doublewrite_buf(block);
}
}
/************************************************************************
......@@ -251,14 +395,14 @@ buf_flush_try_page(
buf_block_t* block;
ibool locked;
ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)
|| (flush_type == BUF_FLUSH_SINGLE_PAGE));
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
|| flush_type == BUF_FLUSH_SINGLE_PAGE);
mutex_enter(&(buf_pool->mutex));
block = buf_page_hash_get(space, offset);
if ((flush_type == BUF_FLUSH_LIST)
if (flush_type == BUF_FLUSH_LIST
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
......@@ -286,7 +430,7 @@ buf_flush_try_page(
mutex_exit(&(buf_pool->mutex));
if (!locked) {
os_aio_simulated_wake_handler_threads();
buf_flush_buffered_writes();
rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
}
......@@ -300,7 +444,7 @@ buf_flush_try_page(
return(1);
} else if ((flush_type == BUF_FLUSH_LRU) && block
} else if (flush_type == BUF_FLUSH_LRU && block
&& buf_flush_ready_for_flush(block, flush_type)) {
/* VERY IMPORTANT:
......@@ -328,7 +472,7 @@ buf_flush_try_page(
return(1);
} else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block
} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
......@@ -385,6 +529,14 @@ buf_flush_try_neighbors(
/* If there is little space, it is better not to flush any
block except from the end of the LRU list */
low = offset;
high = offset + 1;
} else if (flush_type == BUF_FLUSH_LIST) {
/* Since semaphore waits require us to flush the
doublewrite buffer to disk, it is best that the
search area is just the page itself, to minimize
chances for semaphore waits */
low = offset;
high = offset + 1;
}
......@@ -418,13 +570,6 @@ buf_flush_try_neighbors(
mutex_exit(&(buf_pool->mutex));
/* In simulated aio we wake up the i/o-handler threads now that
we have posted a batch of writes: */
/* printf("Flush count %lu ; Waking i/o handlers\n", count); */
os_aio_simulated_wake_handler_threads();
return(count);
}
......@@ -565,13 +710,15 @@ buf_flush_batch(
mutex_exit(&(buf_pool->mutex));
if (buf_debug_prints && (page_count > 0)) {
buf_flush_buffered_writes();
if (buf_debug_prints && page_count > 0) {
if (flush_type == BUF_FLUSH_LRU) {
printf("To flush %lu pages in LRU flush\n",
printf("Flushed %lu pages in LRU flush\n",
page_count);
} else if (flush_type == BUF_FLUSH_LIST) {
printf("To flush %lu pages in flush list flush\n",
page_count, flush_type);
printf("Flushed %lu pages in flush list flush\n",
page_count);
} else {
ut_error;
}
......
......@@ -49,7 +49,9 @@ ulint
buf_read_page_low(
/*==============*/
/* out: 1 if a read request was queued, 0 if the page
already resided in buf_pool */
already resided in buf_pool or if the page is in
the doublewrite buffer blocks in which case it is never
read into the pool */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
......@@ -63,6 +65,16 @@ buf_read_page_low(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
if (trx_doublewrite && space == TRX_SYS_SPACE
&& ( (offset >= trx_doublewrite->block1
&& offset < trx_doublewrite->block1
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
|| (offset >= trx_doublewrite->block2
&& offset < trx_doublewrite->block2
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
return(0);
}
#ifdef UNIV_LOG_DEBUG
if (space % 2 == 1) {
/* We are updating a replicate space while holding the
......
......@@ -13,7 +13,10 @@ Created 5/30/1994 Heikki Tuuri
#endif
#include "ut0rnd.h"
#include "rem0rec.h"
#include "page0page.h"
#include "dict0dict.h"
#include "btr0cur.h"
byte data_error; /* data pointers of tuple fields are initialized
to point here for error checking */
......@@ -378,6 +381,172 @@ dtuple_sprintf(
return(len);
}
/******************************************************************
Moves parts of long fields in entry to the big record vector so that
the size of tuple drops below the maximum record size allowed in the
database. Moves data only from those fields which are not necessary
to determine uniquely the insertion place of the tuple in the index.
NOTE that entry is modified in place: the length of each shortened field
is decreased and the end of the data which remains in the field is
overwritten with a zero-filled extern field reference. The returned
vector is allocated from a memory heap stored in vector->heap; the vector
is released by freeing that heap (see dtuple_convert_back_big_rec and
dtuple_big_rec_free). */
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
/* out, own: created big record vector,
NULL if we are not able to shorten
the entry enough, i.e., if there are
too many short fields in entry */
dict_index_t* index, /* in: index */
dtuple_t* entry) /* in: index entry */
{
mem_heap_t* heap;
big_rec_t* vector;
dfield_t* dfield;
ulint size;
ulint n_fields;
ulint longest;
ulint longest_i;
ulint i;
/* size is used only in choosing the initial size of the heap */
size = rec_get_converted_size(entry);
heap = mem_heap_create(size + dtuple_get_n_fields(entry)
* sizeof(big_rec_field_t) + 1000);
/* The vector struct and its field array both live in heap */
vector = mem_heap_alloc(heap, sizeof(big_rec_t));
vector->heap = heap;
vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
* sizeof(big_rec_field_t));
/* Decide which fields to shorten: the algorithm is to look for
the longest field which does not occur in the ordering part
of any index on the table */
n_fields = 0;
/* Each round of the loop moves data from one field to the vector;
the loop ends when the converted size of entry is below both limits */
while ((rec_get_converted_size(entry)
>= page_get_free_space_of_empty() / 2)
|| rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) {
longest = 0;
/* The search starts from the field at position
dict_index_get_n_unique_in_tree(index): the fields before it are
never shortened */
for (i = dict_index_get_n_unique_in_tree(index);
i < dtuple_get_n_fields(entry); i++) {
/* Skip over fields which are ordering in some index */
if (dict_field_get_col(
dict_index_get_nth_field(index, i))
->ord_part == 0) {
dfield = dtuple_get_nth_field(entry, i);
if (dfield->len != UNIV_SQL_NULL &&
dfield->len > longest) {
longest = dfield->len;
longest_i = i;
}
}
}
/* If no candidate field was found, longest is still 0 and we return
here; thus longest_i is never read below without having been set */
if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10) {
/* Cannot shorten more */
mem_heap_free(heap);
return(NULL);
}
/* Move data from field longest_i to big rec vector,
but do not let data size of the remaining entry
drop below 128 which is the limit for the 2-byte
offset storage format in a physical record */
dfield = dtuple_get_nth_field(entry, longest_i);
vector->fields[n_fields].field_no = longest_i;
if (dtuple_get_data_size(entry) - dfield->len
<= REC_1BYTE_OFFS_LIMIT) {
/* Moving the whole field would leave too little data in entry:
move only the part exceeding REC_1BYTE_OFFS_LIMIT */
vector->fields[n_fields].len =
dtuple_get_data_size(entry)
- REC_1BYTE_OFFS_LIMIT;
/* Since dfield will contain at least
a 20-byte reference to the extern storage,
we know that the data size of entry will be
> REC_1BYTE_OFFS_LIMIT */
} else {
vector->fields[n_fields].len = dfield->len;
}
vector->fields[n_fields].data = mem_heap_alloc(heap,
vector->fields[n_fields].len);
/* Copy data (from the end of field) to big rec vector */
ut_memcpy(vector->fields[n_fields].data,
((byte*)dfield->data) + dfield->len
- vector->fields[n_fields].len,
vector->fields[n_fields].len);
/* The new length of the field is the length of the data which stays
in it plus the length of the extern field reference */
dfield->len = dfield->len - vector->fields[n_fields].len
+ BTR_EXTERN_FIELD_REF_SIZE;
/* Set the extern field reference in dfield to zero */
/* NOTE(review): the reference is written to the old data buffer of
the field, on top of the beginning of the data which was copied away
above; this stays within the buffer presumably because the moved
length is at least BTR_EXTERN_FIELD_REF_SIZE - confirm */
memset(((byte*)dfield->data)
+ dfield->len - BTR_EXTERN_FIELD_REF_SIZE,
0, BTR_EXTERN_FIELD_REF_SIZE);
n_fields++;
}
vector->n_fields = n_fields;
return(vector);
}
/******************************************************************
Restores to entry the field data which was moved to vector, and frees the
vector. The vector must have been created from this entry with
dtuple_convert_big_rec: only then we know that the fields in entry have
room for the data which is copied back. The parameter index is not
referenced in the function body. */
void
dtuple_convert_back_big_rec(
/*========================*/
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry,	/* in: entry whose data was put to vector */
	big_rec_t*	vector)	/* in, own: big rec vector; it is
				freed in this function */
{
	dfield_t*	field;
	byte*		ref_start;
	ulint		moved_len;
	ulint		k;

	for (k = 0; k < vector->n_fields; k++) {

		field = dtuple_get_nth_field(entry,
						vector->fields[k].field_no);
		moved_len = vector->fields[k].len;

		/* The data is copied back on top of the extern field
		reference at the end of the field */

		ref_start = ((byte*)field->data)
				+ field->len - BTR_EXTERN_FIELD_REF_SIZE;

		ut_memcpy(ref_start, vector->fields[k].data, moved_len);

		field->len = field->len + moved_len
						- BTR_EXTERN_FIELD_REF_SIZE;
	}

	mem_heap_free(vector->heap);
}
/******************************************************************
Frees the memory in a big rec vector. This is done by freeing the memory
heap stored in vector->heap. A vector created by dtuple_convert_big_rec
is itself allocated from that heap, and in that case the vector must not
be accessed after this call. */
void
dtuple_big_rec_free(
/*================*/
big_rec_t* vector) /* in, own: big rec vector; it is
freed in this function */
{
mem_heap_free(vector->heap);
}
#ifdef notdefined
/******************************************************************
......
......@@ -90,6 +90,9 @@ struct fil_node_struct {
is ignored) */
ulint n_pending;
/* count of pending i/o-ops on this file */
ibool is_modified; /* this is set to TRUE when we write
to the file and FALSE when we call fil_flush
for this file space */
UT_LIST_NODE_T(fil_node_t) chain;
/* link field for the file chain */
UT_LIST_NODE_T(fil_node_t) LRU;
......@@ -301,6 +304,8 @@ fil_node_create(
node->size = size;
node->magic_n = FIL_NODE_MAGIC_N;
node->n_pending = 0;
node->is_modified = FALSE;
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
......@@ -720,6 +725,47 @@ fil_space_get_size(
return(size);
}
/***********************************************************************
Checks if the pair space, page_no refers to an existing page in a
tablespace file space. The size of the space is taken to be its number of
pages, so that the existing pages are numbered 0 ... size - 1: also a page
number equal to the size is outside the space and is rejected. */
ibool
fil_check_adress_in_tablespace(
/*===========================*/
			/* out: TRUE if the address is meaningful;
			FALSE if the space does not exist, is not a
			tablespace, or page_no is past its end */
	ulint	id,	/* in: space id */
	ulint	page_no)/* in: page number */
{
	fil_space_t*	space;
	fil_system_t*	system		= fil_system;
	ulint		size;
	ibool		ret;

	ut_ad(system);

	/* The space is looked up and its fields are read under the
	file system mutex */

	mutex_enter(&(system->mutex));

	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);

	if (space == NULL) {
		ret = FALSE;
	} else {
		size = space->size;

		if (page_no >= size) {
			/* The last existing page is size - 1 */
			ret = FALSE;
		} else if (space->purpose != FIL_TABLESPACE) {
			ret = FALSE;
		} else {
			ret = TRUE;
		}
	}

	mutex_exit(&(system->mutex));

	return(ret);
}
/***********************************************************************
Tries to reserve free extents in a file space. */
......@@ -812,8 +858,14 @@ fil_node_prepare_for_io(
fil_node_close(last_node, system);
}
node->handle = os_file_create(node->name, OS_FILE_OPEN,
OS_FILE_AIO, &ret);
if (space->purpose == FIL_LOG) {
node->handle = os_file_create(node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_LOG_FILE, &ret);
} else {
node->handle = os_file_create(node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_DATA_FILE, &ret);
}
ut_a(ret);
node->open = TRUE;
......@@ -851,7 +903,8 @@ void
fil_node_complete_io(
/*=================*/
fil_node_t* node, /* in: file node */
fil_system_t* system) /* in: file system */
fil_system_t* system, /* in: file system */
ulint type) /* in: OS_FILE_WRITE or ..._READ */
{
ut_ad(node);
ut_ad(system);
......@@ -860,6 +913,10 @@ fil_node_complete_io(
node->n_pending--;
if (type != OS_FILE_READ) {
node->is_modified = TRUE;
}
if (node->n_pending == 0) {
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
......@@ -1016,7 +1073,7 @@ loop:
mutex_enter(&(system->mutex));
fil_node_complete_io(node, system);
fil_node_complete_io(node, system, type);
mutex_exit(&(system->mutex));
......@@ -1090,12 +1147,14 @@ fil_aio_wait(
fil_node_t* fil_node;
fil_system_t* system = fil_system;
void* message;
ulint type;
ut_ad(fil_validate());
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
ret = os_aio_windows_handle(segment, 0, &fil_node, &message);
ret = os_aio_windows_handle(segment, 0, &fil_node, &message,
&type);
#elif defined(POSIX_ASYNC_IO)
ret = os_aio_posix_handle(segment, &fil_node, &message);
#else
......@@ -1103,14 +1162,14 @@ fil_aio_wait(
#endif
} else {
ret = os_aio_simulated_handle(segment, (void**) &fil_node,
&message);
&message, &type);
}
ut_a(ret);
mutex_enter(&(system->mutex));
fil_node_complete_io(fil_node, fil_system);
fil_node_complete_io(fil_node, fil_system, type);
mutex_exit(&(system->mutex));
......@@ -1149,8 +1208,10 @@ fil_flush(
node = UT_LIST_GET_FIRST(space->chain);
while (node) {
if (node->open) {
if (node->open && node->is_modified) {
file = node->handle;
node->is_modified = FALSE;
mutex_exit(&(system->mutex));
......@@ -1159,9 +1220,11 @@ fil_flush(
handle is still open: we assume that the OS
will not crash or trap even if we pass a handle
to a closed file below in os_file_flush! */
/* printf("Flushing to file %s\n", node->name); */
os_file_flush(file);
mutex_enter(&(system->mutex));
}
......
......@@ -3239,8 +3239,8 @@ fsp_validate(
ut_a(descr_count * FSP_EXTENT_SIZE == free_limit);
ut_a(n_used + n_full_frag_pages
== n_used2 + (free_limit + XDES_DESCRIBED_PER_PAGE - 1)
/ XDES_DESCRIBED_PER_PAGE
== n_used2 + 2* ((free_limit + XDES_DESCRIBED_PER_PAGE - 1)
/ XDES_DESCRIBED_PER_PAGE)
+ seg_inode_len_full + seg_inode_len_free);
ut_a(frag_n_used == n_used);
......
......@@ -1946,6 +1946,7 @@ ibuf_insert_low(
ulint page_no,/* in: page number where to insert */
que_thr_t* thr) /* in: query thread */
{
big_rec_t* dummy_big_rec;
ulint entry_size;
btr_pcur_t pcur;
btr_cur_t* cursor;
......@@ -2101,7 +2102,8 @@ ibuf_insert_low(
if (mode == BTR_MODIFY_PREV) {
err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
ibuf_entry, &ins_rec, thr,
ibuf_entry, &ins_rec,
&dummy_big_rec, thr,
&mtr);
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
......@@ -2121,7 +2123,8 @@ ibuf_insert_low(
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG,
cursor,
ibuf_entry, &ins_rec, thr,
ibuf_entry, &ins_rec,
&dummy_big_rec, thr,
&mtr);
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
......
......@@ -357,6 +357,44 @@ btr_get_size(
/* out: number of pages */
dict_index_t* index, /* in: index */
ulint flag); /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! The function
returns NULL if no free page could be allocated, and the caller must
check for that. */
page_t*
btr_page_alloc(
/*===========*/
/* out: new allocated page, x-latched;
NULL if out of space */
dict_tree_t* tree, /* in: index tree */
ulint hint_page_no, /* in: hint of a good page */
byte file_direction, /* in: direction where a possible
page split is made */
ulint level, /* in: level where the page is placed
in the tree */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Frees a file page used in an index tree. The level of the page is read
from the page with btr_page_get_level, and the page is then freed with
btr_page_free_low. NOTE: cannot free field external storage pages because
the page must contain info on its level. */
void
btr_page_free(
/*==========*/
dict_tree_t* tree, /* in: index tree */
page_t* page, /* in: page to be freed, x-latched */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Frees a file page used in an index tree. Can also be used to free BLOB
external storage pages, because the page level is not read from the page:
it is given as an argument, and the level 0 can be given for such pages. */
void
btr_page_free_low(
/*==============*/
dict_tree_t* tree, /* in: index tree */
page_t* page, /* in: page to be freed, x-latched */
ulint level, /* in: page level */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
Prints size info of a B-tree. */
......
......@@ -151,11 +151,14 @@ btr_cur_optimistic_insert(
ulint flags, /* in: undo logging and locking flags: if not
zero, the parameters index and thr should be
specified */
btr_cur_t* cursor, /* in: cursor on page after which
to insert; cursor stays valid */
btr_cur_t* cursor, /* in: cursor on page after which to insert;
cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or
NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
......@@ -169,13 +172,19 @@ btr_cur_pessimistic_insert(
/*=======================*/
/* out: DB_SUCCESS or error number */
ulint flags, /* in: undo logging and locking flags: if not
zero, the parameters index and thr should be
specified */
zero, the parameter thr should be
specified; if no undo logging is specified,
then the caller must have reserved enough
free extents in the file space so that the
insertion will certainly succeed */
btr_cur_t* cursor, /* in: cursor after which to insert;
cursor does not stay valid */
cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or
NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
......@@ -228,8 +237,9 @@ btr_cur_pessimistic_update(
/* out: DB_SUCCESS or error code */
ulint flags, /* in: undo logging, locking, and rollback
flags */
btr_cur_t* cursor, /* in: cursor on the record to update;
cursor does not stay valid */
btr_cur_t* cursor, /* in: cursor on the record to update */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or NULL */
upd_t* update, /* in: update vector; this is allowed also
contain trx id and roll ptr fields, but
the values in update vector have no effect */
......@@ -407,6 +417,92 @@ btr_estimate_number_of_different_key_vals(
/*======================================*/
/* out: estimated number of key values */
dict_index_t* index); /* in: index */
/***********************************************************************
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec. The fields are stored on pages allocated from leaf node
file segment of the index tree. */
ulint
btr_store_big_rec_extern_fields(
/*============================*/
/* out: DB_SUCCESS or error */
dict_index_t* index, /* in: index of rec; the index tree
MUST be X-latched */
rec_t* rec, /* in: record */
big_rec_t* big_rec_vec, /* in: vector containing fields
to be stored externally */
mtr_t* local_mtr); /* in: mtr containing the latch to
rec and to the tree */
/***********************************************************************
Frees the space in an externally stored field to the file space
management. */
void
btr_free_externally_stored_field(
/*=============================*/
dict_index_t* index, /* in: index of the data, the index
tree MUST be X-latched */
byte* data, /* in: internally stored data
+ reference to the externally
stored part */
ulint local_len, /* in: length of data */
mtr_t* local_mtr); /* in: mtr containing the latch to
data and an X-latch to the index
tree */
/***************************************************************
Frees the externally stored fields for a record. */
void
btr_rec_free_externally_stored_fields(
/*==================================*/
dict_index_t* index, /* in: index of the data, the index
tree MUST be X-latched */
rec_t* rec, /* in: record */
mtr_t* mtr); /* in: mini-transaction handle which contains
an X-latch to record page and to the index
tree */
/***********************************************************************
Copies an externally stored field of a record to mem heap. */
byte*
btr_rec_copy_externally_stored_field(
/*=================================*/
/* out: the field copied to heap */
rec_t* rec, /* in: record */
ulint no, /* in: field number */
ulint* len, /* out: length of the field */
mem_heap_t* heap); /* in: mem heap */
/***********************************************************************
Copies an externally stored field of a record to mem heap. Parameter
data contains a pointer to 'internally' stored part of the field:
possibly some data, and the reference to the externally stored part in
the last 20 bytes of data. */
byte*
btr_copy_externally_stored_field(
/*=============================*/
/* out: the whole field copied to heap */
ulint* len, /* out: length of the whole field */
byte* data, /* in: 'internally' stored part of the
field containing also the reference to
the external part */
ulint local_len,/* in: length of data */
mem_heap_t* heap); /* in: mem heap */
/***********************************************************************
Stores the positions of the fields marked as extern storage in the update
vector, and also those fields which are marked as extern storage in rec
and not mentioned in the updated fields. We use this function to remember
which fields we must mark as extern storage in a record inserted for an
update. */
ulint
btr_push_update_extern_fields(
/*==========================*/
/* out: number of values stored in ext_vect */
ulint* ext_vect, /* in/out: array of ulints where the field
positions are stored; must be preallocated
to have place for all fields in rec */
rec_t* rec, /* in: record */
upd_t* update); /* in: update vector */
/*######################################################################*/
......@@ -516,6 +612,19 @@ and sleep this many microseconds in between */
#define BTR_CUR_RETRY_DELETE_N_TIMES 100
#define BTR_CUR_RETRY_SLEEP_TIME 50000
/* The reference in a field of which data is stored on a different page */
/*--------------------------------------*/
#define BTR_EXTERN_SPACE_ID 0 /* space id where stored */
#define BTR_EXTERN_PAGE_NO 4 /* page no where stored */
#define BTR_EXTERN_OFFSET 8 /* offset of BLOB header
on that page */
#define BTR_EXTERN_LEN 12 /* 8 bytes containing the
length of the externally
stored part of the BLOB */
/*--------------------------------------*/
#define BTR_EXTERN_FIELD_REF_SIZE 20
extern ulint btr_cur_n_non_sea;
#ifndef UNIV_NONINL
......
......@@ -378,6 +378,14 @@ buf_calc_page_checksum(
/*===================*/
/* out: checksum */
byte* page); /* in: buffer page */
/************************************************************************
Checks if a page is corrupt. */
ibool
buf_page_is_corrupted(
/*==================*/
/* out: TRUE if corrupted */
byte* read_buf); /* in: a database page */
/**************************************************************************
Gets the page number of a pointer pointing within a buffer frame containing
a file page. */
......
......@@ -101,7 +101,7 @@ make sure that a read-ahead batch can be read efficiently in a single
sweep). */
#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA)
#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4)
#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100)
#ifndef UNIV_NONINL
#include "buf0flu.ic"
......
......@@ -14,6 +14,9 @@ Created 5/30/1994 Heikki Tuuri
#include "data0types.h"
#include "data0type.h"
#include "mem0mem.h"
#include "dict0types.h"
typedef struct big_rec_struct big_rec_t;
/* Some non-inlined functions used in the MySQL interface: */
void
......@@ -312,6 +315,41 @@ dtuple_sprintf(
char* buf, /* in: print buffer */
ulint buf_len,/* in: buf length in bytes */
dtuple_t* tuple); /* in: tuple */
/******************************************************************
Moves parts of long fields in entry to the big record vector so that
the size of tuple drops below the maximum record size allowed in the
database. Moves data only from those fields which are not necessary
to determine uniquely the insertion place of the tuple in the index. */
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
/* out, own: created big record vector,
NULL if we are not able to shorten
the entry enough, i.e., if there are
too many short fields in entry */
dict_index_t* index, /* in: index */
dtuple_t* entry); /* in: index entry */
/******************************************************************
Puts back to entry the data stored in vector. Note that to ensure the
fields in entry can accommodate the data, vector must have been created
from entry with dtuple_convert_big_rec. */
void
dtuple_convert_back_big_rec(
/*========================*/
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: entry whose data was put to vector */
big_rec_t* vector);/* in, own: big rec vector; it is
freed in this function */
/******************************************************************
Frees the memory in a big rec vector. */
void
dtuple_big_rec_free(
/*================*/
big_rec_t* vector); /* in, own: big rec vector; it is
freed in this function */
/***************************************************************
Generates a random tuple. */
......@@ -396,7 +434,7 @@ dtuple_gen_search_tuple_TPC_C(
/* Structure for an SQL data field */
struct dfield_struct{
void* data; /* pointer to data */
ulint len; /* data length; UNIV_SQL_NULL if SQL null */
ulint len; /* data length; UNIV_SQL_NULL if SQL null; */
dtype_t type; /* type of data */
ulint col_no; /* when building index entries, the column
number can be stored here */
......@@ -423,6 +461,24 @@ struct dtuple_struct {
};
#define DATA_TUPLE_MAGIC_N 65478679
/* A slot for a field in a big rec vector: it describes the part of one
long field which has been moved out of an index entry (see
dtuple_convert_big_rec) */
typedef struct big_rec_field_struct big_rec_field_t;
struct big_rec_field_struct {
ulint field_no; /* field number in record */
ulint len; /* length of the stored data
(NOTE(review): presumably in bytes; confirm) */
byte* data; /* pointer to the stored data */
};
/* Storage format for overflow data in a big record, that is, a record
which needs external storage of data fields. A vector of this type is
created by dtuple_convert_big_rec and freed with dtuple_big_rec_free. */
struct big_rec_struct {
mem_heap_t* heap; /* memory heap from which allocated */
ulint n_fields; /* number of stored fields */
big_rec_field_t* fields; /* stored fields (NOTE(review): presumably
an array of n_fields slots; confirm) */
};
#ifndef UNIV_NONINL
#include "data0data.ic"
#endif
......
......@@ -307,12 +307,13 @@ dtuple_create(
/**************************************************************
The following function returns the sum of data lengths of a tuple. The space
occupied by the field structs or the tuple struct is not counted. */
occupied by the field structs or the tuple struct is not counted. Neither
is possible space in externally stored parts of the field. */
UNIV_INLINE
ulint
dtuple_get_data_size(
/*=================*/
/* out: sum of data lens */
/* out: sum of data lengths */
dtuple_t* tuple) /* in: typed data tuple */
{
dfield_t* field;
......@@ -382,7 +383,7 @@ dtuple_datas_are_equal(
field2 = dtuple_get_nth_field(tuple2, i);
data2 = (byte*) dfield_get_data(field2);
len2 = dfield_get_len(field2);
len2 = dfield_get_len(field2);
if (len1 != len2) {
......
......@@ -143,7 +143,7 @@ struct dict_col_struct{
ulint clust_pos;/* position of the column in the
clustered index */
ulint ord_part;/* count of how many times this column
appears in an ordering fields of an index */
appears in ordering fields of an index */
char* name; /* name */
dtype_t type; /* data type */
dict_table_t* table; /* back pointer to table of this column */
......
......@@ -196,6 +196,16 @@ fil_space_get_size(
/* out: space size */
ulint id); /* in: space id */
/***********************************************************************
Checks if the pair space, page_no refers to an existing page in a
tablespace file space. */
ibool
fil_check_adress_in_tablespace(
/*===========================*/
/* out: TRUE if the address is meaningful */
ulint id, /* in: space id */
ulint page_no);/* in: page number */
/***********************************************************************
Appends a new file to the chain of files of a space.
File must be closed. */
......
......@@ -70,7 +70,7 @@ page_t*
fseg_create(
/*========*/
/* out: the page where the segment header is placed,
x-latched, FIL_NULL if could not create segment
x-latched, NULL if could not create segment
because of lack of space */
ulint space, /* in: space id */
ulint page, /* in: page where the segment header is placed: if
......
......@@ -115,7 +115,7 @@ mach_write_to_4(
{
ut_ad(b);
#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
/* We do not use this even on Intel, because unaligned accesses may
be slow */
......@@ -143,7 +143,7 @@ mach_read_from_4(
/* out: ulint integer */
byte* b) /* in: pointer to four bytes */
{
#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
/* We do not use this even on Intel, because unaligned accesses may
be slow */
......
......@@ -59,6 +59,10 @@ log. */
#define OS_FILE_AIO 61
#define OS_FILE_NORMAL 62
/* Types for file create */
#define OS_DATA_FILE 100
#define OS_LOG_FILE 101
/* Error codes from os_file_get_last_error */
#define OS_FILE_NOT_FOUND 71
#define OS_FILE_DISK_FULL 72
......@@ -125,6 +129,7 @@ os_file_create(
if a new file is created or an old overwritten */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file */
ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success);/* out: TRUE if succeed, FALSE if error */
/***************************************************************************
Closes a file handle. In case of error, error number can be retrieved with
......@@ -263,6 +268,13 @@ os_aio(
operation); if mode is OS_AIO_SYNC, these
are ignored */
void* message2);
/****************************************************************************
Waits until there are no pending writes in os_aio_write_array. There can
be other, synchronous, pending writes. */
void
os_aio_wait_until_no_pending_writes(void);
/*=====================================*/
/**************************************************************************
Wakes up simulated aio i/o-handler threads if they have something to do. */
......@@ -298,7 +310,8 @@ os_aio_windows_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
void** message2);
void** message2,
ulint* type); /* out: OS_FILE_WRITE or ..._READ */
#endif
#ifdef POSIX_ASYNC_IO
/**************************************************************************
......@@ -335,7 +348,8 @@ os_aio_simulated_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
void** message2);
void** message2,
ulint* type); /* out: OS_FILE_WRITE or ..._READ */
/**************************************************************************
Validates the consistency of the aio system. */
......
/***********************************************************************
Comparison services for records
(c) 1994-1996 Innobase Oy
(c) 1994-2001 Innobase Oy
Created 7/1/1994 Heikki Tuuri
************************************************************************/
......@@ -31,14 +31,18 @@ This function is used to compare a data tuple to a physical record.
Only dtuple->n_fields_cmp first fields are taken into account for
the data tuple! If we denote by n = n_fields_cmp, then rec must
have either m >= n fields, or it must differ from dtuple in some of
the m fields rec has. */
the m fields rec has. If rec has an externally stored field we do not
compare it but return with value 0 if such a comparison should be
made. */
int
cmp_dtuple_rec_with_match(
/*======================*/
/* out: 1, 0, -1, if dtuple is greater, equal,
less than rec, respectively, when only the
common first fields are compared */
common first fields are compared, or
until the first externally stored field in
rec */
dtuple_t* dtuple, /* in: data tuple */
rec_t* rec, /* in: physical record which differs from
dtuple in some of the common fields, or which
......@@ -89,7 +93,8 @@ cmp_dtuple_rec_prefix_equal(
fields in dtuple */
/*****************************************************************
This function is used to compare two physical records. Only the common
first fields are compared. */
first fields are compared, and if an externally stored field is
encountered, then 0 is returned. */
int
cmp_rec_rec_with_match(
......
......@@ -12,6 +12,7 @@ Created 5/30/1994 Heikki Tuuri
#include "univ.i"
#include "data0data.h"
#include "rem0types.h"
#include "mtr0types.h"
/* Maximum values for various fields (for non-blob tuples) */
#define REC_MAX_N_FIELDS (1024 - 1)
......@@ -162,6 +163,49 @@ rec_get_nth_field_size(
/* out: field size in bytes */
rec_t* rec, /* in: record */
ulint n); /* in: index of the field */
/***************************************************************
Gets the value of the ith field extern storage bit. If it is TRUE
it means that the field is stored on another page. */
UNIV_INLINE
ibool
rec_get_nth_field_extern_bit(
/*=========================*/
/* out: TRUE or FALSE */
rec_t* rec, /* in: record */
ulint i); /* in: ith field */
/**********************************************************
Returns TRUE if the extern bit is set in any of the fields
of rec. */
UNIV_INLINE
ibool
rec_contains_externally_stored_field(
/*=================================*/
/* out: TRUE if a field is stored externally */
rec_t* rec); /* in: record */
/***************************************************************
Sets the value of the ith field extern storage bit. */
void
rec_set_nth_field_extern_bit(
/*=========================*/
rec_t* rec, /* in: record */
ulint i, /* in: ith field */
ibool val, /* in: value to set */
mtr_t* mtr); /* in: mtr holding an X-latch to the page where
rec is, or NULL; in the NULL case we do not
write to log about the change */
/***************************************************************
Sets to TRUE the extern storage bits of the fields mentioned in an array. */
void
rec_set_field_extern_bits(
/*======================*/
rec_t* rec, /* in: record */
ulint* vec, /* in: array of field numbers */
ulint n_fields, /* in: number of field numbers in vec */
mtr_t* mtr); /* in: mtr holding an X-latch to the page
where rec is, or NULL; in the NULL case we
do not write to log about the change */
/****************************************************************
The following function is used to get a copy of the nth
data field in the record to a buffer. */
......@@ -350,6 +394,15 @@ rec_sprintf(
#define REC_INFO_BITS 6 /* This is single byte bit-field */
/* Maximum lengths for the data in a physical record if the offsets
are given in one byte (resp. two byte) format. */
#define REC_1BYTE_OFFS_LIMIT 0x7F
#define REC_2BYTE_OFFS_LIMIT 0x7FFF
/* The data size of record must be smaller than this because we reserve
two upmost bits in a two byte offset for special purposes */
#define REC_MAX_DATA_SIZE (16 * 1024)
#ifndef UNIV_NONINL
#include "rem0rec.ic"
#endif
......
......@@ -25,12 +25,6 @@ significant bytes and bits are written below less significant.
4 bits info bits
*/
/* Maximum lengths for the data in a physical record if the offsets
are given as one byte (resp. two byte) format. */
#define REC_1BYTE_OFFS_LIMIT 0x7F
#define REC_2BYTE_OFFS_LIMIT 0x7FFF
/* We list the byte offsets from the origin of the record, the mask,
and the shift needed to obtain each bit-field of the record. */
......@@ -66,6 +60,11 @@ one-byte and two-byte offsets */
#define REC_1BYTE_SQL_NULL_MASK 0x80
#define REC_2BYTE_SQL_NULL_MASK 0x8000
/* In a 2-byte offset the second most significant bit denotes
a field stored to another page: */
#define REC_2BYTE_EXTERN_MASK 0x4000
/***************************************************************
Sets the value of the ith field SQL null bit. */
......@@ -489,7 +488,7 @@ ulint
rec_2_get_field_end_info(
/*=====================*/
/* out: offset of the start of the field, SQL null
flag ORed */
flag and extern storage flag ORed */
rec_t* rec, /* in: record */
ulint n) /* in: field index */
{
......@@ -499,6 +498,63 @@ rec_2_get_field_end_info(
return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2)));
}
/***************************************************************
Tells whether the extern storage bit of the ith field of a record is
set, that is, whether the field is stored on another page. For a record
in the 1-byte offsets format the answer is always FALSE. */
UNIV_INLINE
ibool
rec_get_nth_field_extern_bit(
/*=========================*/
			/* out: TRUE if the bit is set, else FALSE */
	rec_t*	rec,	/* in: record */
	ulint	i)	/* in: ith field */
{
	/* The end info of the field is read only for a record which is
	not in the 1-byte offsets format */

	if (!rec_get_1byte_offs_flag(rec)
	    && (rec_2_get_field_end_info(rec, i) & REC_2BYTE_EXTERN_MASK)) {

		return(TRUE);
	}

	return(FALSE);
}
/**********************************************************
Checks whether at least one field of rec has its extern storage
bit set. */
UNIV_INLINE
ibool
rec_contains_externally_stored_field(
/*=================================*/
			/* out: TRUE if a field is stored externally */
	rec_t*	rec)	/* in: record */
{
	ulint	n_fields;
	ulint	field_no;

	if (rec_get_1byte_offs_flag(rec)) {
		/* For the 1-byte offsets format we answer FALSE without
		looking at the fields */

		return(FALSE);
	}

	n_fields = rec_get_n_fields(rec);
	field_no = 0;

	while (field_no < n_fields) {
		if (rec_get_nth_field_extern_bit(rec, field_no)) {

			return(TRUE);
		}

		field_no++;
	}

	return(FALSE);
}
/**********************************************************
Returns the offset of n - 1th field end if the record is stored in the 1-byte
offsets form. If the field is SQL null, the flag is ORed in the returned
......@@ -616,7 +672,7 @@ rec_2_get_field_start_offs(
}
return(rec_2_get_prev_field_end_info(rec, n)
& ~REC_2BYTE_SQL_NULL_MASK);
& ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
}
/**********************************************************
......
......@@ -56,6 +56,9 @@ row_ins_index_entry_low(
pessimistic descent down the index tree */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr); /* in: query thread */
/*******************************************************************
Inserts an index entry to index. Tries first optimistic, then pessimistic
......@@ -70,6 +73,9 @@ row_ins_index_entry(
DB_DUPLICATE_KEY, or some other error code */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr); /* in: query thread */
/***************************************************************
Inserts a row to a table. */
......
......@@ -189,7 +189,9 @@ row_update_for_mysql(
row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
handle */
/*************************************************************************
Does a table creation operation for MySQL. */
Does a table creation operation for MySQL. If the name of the created
table ends in the characters INNODB_MONITOR, then this also starts
printing of monitor output by the master thread. */
int
row_create_table_for_mysql(
......@@ -209,7 +211,9 @@ row_create_index_for_mysql(
dict_index_t* index, /* in: index definition */
trx_t* trx); /* in: transaction handle */
/*************************************************************************
Drops a table for MySQL. */
Drops a table for MySQL. If the name of the dropped table ends in the
characters INNODB_MONITOR, then this also stops printing of monitor
output by the master thread. */
int
row_drop_table_for_mysql(
......
......@@ -250,6 +250,7 @@ row_search_index_entry(
#define ROW_COPY_DATA 1
#define ROW_COPY_POINTERS 2
#define ROW_COPY_ALSO_EXTERNALS 3
/* The allowed latching order of index records is the following:
(1) a secondary index record ->
......
......@@ -147,6 +147,9 @@ row_upd_build_difference(
fields, excluding roll ptr and trx id */
dict_index_t* index, /* in: clustered index */
dtuple_t* entry, /* in: entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
rec_t* rec, /* in: clustered index record */
mem_heap_t* heap); /* in: memory heap from which allocated */
/***************************************************************
......@@ -262,6 +265,9 @@ struct upd_field_struct{
constants in the symbol table of the
query graph */
dfield_t new_val; /* new value for the column */
ibool extern_storage; /* this is set to TRUE if dfield
actually contains a reference to
an externally stored field */
};
/* Update vector structure */
......@@ -318,6 +324,10 @@ struct upd_node_struct{
dtuple_t* row; /* NULL, or a copy (also fields copied to
heap) of the row to update; this must be reset
to NULL after a successful update */
ulint* ext_vec;/* array describing which fields are stored
externally in the clustered index record of
row */
ulint n_ext_vec;/* number of fields in ext_vec */
mem_heap_t* heap; /* memory heap used as auxiliary storage for
row; this must be emptied after a successful
update if node->row != NULL */
......@@ -349,7 +359,7 @@ struct upd_node_struct{
looked at and updated if an ordering
field changed */
/* Compilation info flags: these must fit within one byte */
/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
changed in the update and no ordering
field of the clustered index */
......
......@@ -23,6 +23,7 @@ upd_create(
mem_heap_t* heap) /* in: heap from which memory allocated */
{
upd_t* update;
ulint i;
update = mem_heap_alloc(heap, sizeof(upd_t));
......@@ -30,6 +31,10 @@ upd_create(
update->n_fields = n;
update->fields = mem_heap_alloc(heap, sizeof(upd_field_t) * n);
for (i = 0; i < n; i++) {
update->fields[i].extern_storage = 0;
}
return(update);
}
......
......@@ -27,6 +27,9 @@ extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
extern ulint* srv_data_file_is_raw_partition;
#define SRV_NEW_RAW 1
#define SRV_OLD_RAW 2
extern char** srv_log_group_home_dirs;
extern ulint srv_n_log_groups;
......@@ -52,10 +55,14 @@ extern ulint srv_lock_wait_timeout;
extern char* srv_unix_file_flush_method_str;
extern ulint srv_unix_file_flush_method;
extern ibool srv_use_doublewrite_buf;
extern ibool srv_set_thread_priorities;
extern int srv_query_thread_priority;
/*-------------------------------------------*/
extern ibool srv_print_innodb_monitor;
extern ulint srv_n_spin_wait_rounds;
extern ulint srv_spin_wait_delay;
extern ibool srv_priority_boost;
......@@ -104,26 +111,13 @@ typedef struct srv_sys_struct srv_sys_t;
/* The server system */
extern srv_sys_t* srv_sys;
/* Alternatives for file flush option in Unix; see the InnoDB manual about
/* Alternatives for file flush option in Unix; see the InnoDB manual about
what these mean */
#define SRV_UNIX_FDATASYNC 1
#define SRV_UNIX_O_DSYNC 2
#define SRV_UNIX_LITTLESYNC 3
#define SRV_UNIX_NOSYNC 4
/* Raw partition flags */
#define SRV_OLD_RAW 1
#define SRV_NEW_RAW 2
void
srv_mysql_thread_release(void);
/*==========================*/
os_event_t
srv_mysql_thread_event_get(void);
void
srv_mysql_thread_slot_free(
/*==========================*/
os_event_t event);
/*************************************************************************
Boots Innobase server. */
......
......@@ -393,6 +393,7 @@ Memory pool mutex */
#define SYNC_RSEG_HEADER_NEW 591
#define SYNC_RSEG_HEADER 590
#define SYNC_TRX_UNDO_PAGE 570
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
/*------------------------------------- Insert buffer headers */
......@@ -415,6 +416,7 @@ Memory pool mutex */
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_MEM_HASH 131
#define SYNC_MEM_POOL 130
......
......@@ -45,6 +45,14 @@ trx_undo_rec_get_cmpl_info(
/* out: compiler info */
trx_undo_rec_t* undo_rec); /* in: undo log record */
/**************************************************************************
Returns TRUE if an undo log record contains an extern storage field. */
UNIV_INLINE
ibool
trx_undo_rec_get_extern_storage(
/*============================*/
/* out: TRUE if extern */
trx_undo_rec_t* undo_rec); /* in: undo log record */
/**************************************************************************
Reads the undo log record number. */
UNIV_INLINE
dulint
......@@ -65,6 +73,8 @@ trx_undo_rec_get_pars(
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /* out: compiler info, relevant only
for update type records */
ibool* updated_extern, /* out: TRUE if we updated an
externally stored field */
dulint* undo_no, /* out: undo log record number */
dulint* table_id); /* out: table id */
/***********************************************************************
......@@ -272,7 +282,11 @@ record */
do not change */
#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
this and ORed to the type above */
#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl
to denote that we updated external
storage fields: used by purge to
free the external storage */
/* Operation type flags used in trx_undo_report_row_operation */
#define TRX_UNDO_INSERT_OP 1
#define TRX_UNDO_MODIFY_OP 2
......
......@@ -30,6 +30,23 @@ trx_undo_rec_get_cmpl_info(
return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
}
/**************************************************************************
Tests the TRX_UNDO_UPD_EXTERN bit in the byte at offset 2 of an undo log
record; the bit tells that the record contains an extern storage field. */
UNIV_INLINE
ibool
trx_undo_rec_get_extern_storage(
/*============================*/
					/* out: TRUE if extern */
	trx_undo_rec_t*	undo_rec)	/* in: undo log record */
{
	return((mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN)
	       ? TRUE : FALSE);
}
/**************************************************************************
Reads the undo log record number. */
UNIV_INLINE
......
......@@ -27,6 +27,23 @@ Created 3/26/1996 Heikki Tuuri
/* The transaction system */
extern trx_sys_t* trx_sys;
/* Doublewrite system */
extern trx_doublewrite_t* trx_doublewrite;
/********************************************************************
Creates the doublewrite buffer at a database start. The header of the
doublewrite buffer is placed on the trx system header page. */
void
trx_sys_create_doublewrite_buf(void);
/*================================*/
/********************************************************************
At a database startup uses a possible doublewrite buffer to restore
half-written pages in the data files. */
void
trx_sys_doublewrite_restore_corrupt_pages(void);
/*===========================================*/
/*******************************************************************
Checks if a page address is the trx sys header page. */
UNIV_INLINE
......@@ -235,6 +252,59 @@ therefore 256 */
segment specification slots */
/*-------------------------------------------------------------*/
/* The offset of the doublewrite buffer header on the trx system header page */
#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
/*-------------------------------------------------------------*/
#define TRX_SYS_DOUBLEWRITE_FSEG 0 /* fseg header of the fseg
containing the doublewrite
buffer */
#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
/* 4-byte magic number which
shows if we already have
created the doublewrite
buffer */
#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
/* page number of the
first page in the first
sequence of 64
(= FSP_EXTENT_SIZE) consecutive
pages in the doublewrite
buffer */
#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
/* page number of the
first page in the second
sequence of 64 consecutive
pages in the doublewrite
buffer */
#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /* we repeat the above 3
numbers so that if the trx
sys header is half-written
to disk, we still may be able
to recover the information */
/*-------------------------------------------------------------*/
#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
/* Doublewrite control struct: the in-memory bookkeeping of the doublewrite
buffer. The two blocks referred to below are the two sequences of
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE (= FSP_EXTENT_SIZE) consecutive pages whose
first page numbers are stored in the doublewrite header on the trx system
header page. */
struct trx_doublewrite_struct{
mutex_t mutex; /* mutex protecting the first_free field and
write_buf */
ulint block1; /* the page number of the first page in
the first doublewrite block (64 pages) */
ulint block2; /* the page number of the first page in
the second doublewrite block */
ulint first_free; /* first free position in write_buf measured
in units of UNIV_PAGE_SIZE */
byte* write_buf; /* write buffer used in writing to the
doublewrite buffer, aligned to an
address divisible by UNIV_PAGE_SIZE
(which is required by Windows aio) */
byte* write_buf_unaligned; /* the unaligned pointer to the memory
of write_buf (NOTE(review): presumably kept
so that the allocation can be freed;
confirm) */
buf_block_t**
buf_block_arr; /* array to store pointers to the buffer
blocks which have been cached to write_buf */
};
/* The transaction system central memory data structure; protected by the
kernel mutex */
struct trx_sys_struct{
......
......@@ -15,6 +15,7 @@ Created 3/26/1996 Heikki Tuuri
/* Memory objects */
typedef struct trx_struct trx_t;
typedef struct trx_sys_struct trx_sys_t;
typedef struct trx_doublewrite_struct trx_doublewrite_t;
typedef struct trx_sig_struct trx_sig_t;
typedef struct trx_rseg_struct trx_rseg_t;
typedef struct trx_undo_struct trx_undo_t;
......
......@@ -341,7 +341,9 @@ struct trx_undo_struct{
have delete marked records, because of
a delete of a row or an update of an
indexed field; purge is then
necessary. */
necessary; also TRUE if the transaction
has updated an externally stored
field */
dulint trx_id; /* id of the trx assigned to the undo
log */
ibool dict_operation; /* TRUE if a dict operation trx */
......
......@@ -9,11 +9,12 @@ Created 1/20/1994 Heikki Tuuri
#ifndef univ_i
#define univ_i
#undef UNIV_INTEL_X86
#if (defined(_WIN32) || defined(_WIN64)) && !defined(MYSQL_SERVER)
#if (defined(_WIN32) || defined(_WIN64))
#define __WIN__
#ifndef MYSQL_SERVER
#include <windows.h>
#endif
/* If you want to check for errors with compiler level -W4,
comment out the above include of windows.h and let the following defines
......@@ -40,10 +41,8 @@ subdirectory of 'mysql'. */
#include <global.h>
#include <my_pthread.h>
#ifndef __WIN__
/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
#include <sys/stat.h>
#endif
#undef PACKAGE
#undef VERSION
......@@ -63,19 +62,21 @@ subdirectory of 'mysql'. */
/* DEBUG VERSION CONTROL
===================== */
/*
#define UNIV_SYNC_DEBUG
*/
/* Make a non-inline debug version */
/*
#define UNIV_DEBUG
#define UNIV_MEM_DEBUG
#define UNIV_SYNC_DEBUG
#define UNIV_SEARCH_DEBUG
#define UNIV_IBUF_DEBUG
#define UNIV_SYNC_PERF_STAT
#define UNIV_SEARCH_PERF_STAT
#define UNIV_DEBUG_FILE_ACCESSES
*/
#define UNIV_LIGHT_MEM_DEBUG
......@@ -192,6 +193,13 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
has the SQL NULL as its value. */
#define UNIV_SQL_NULL ULINT_UNDEFINED
/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
number indicate that a field contains a reference to an externally
stored part of the field in the tablespace. The length field then
contains the sum of the following flag and the locally stored len. */
#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE)
/* The following definition of __FILE__ removes compiler warnings
associated with const char* / char* mismatches with __FILE__ */
......
......@@ -41,7 +41,7 @@ extern ulint* ut_dbg_null_ptr;
}\
if (ut_dbg_stop_threads) {\
fprintf(stderr,\
"Innobase: Thread %lu stopped in file %s line %lu\n",\
"InnoDB: Thread %lu stopped in file %s line %lu\n",\
os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\
os_thread_sleep(1000000000);\
}\
......@@ -50,19 +50,17 @@ extern ulint* ut_dbg_null_ptr;
#define ut_error {\
ulint dbg_i;\
fprintf(stderr,\
"Innobase: Assertion failure in thread %lu in file %s line %lu\n",\
"InnoDB: Assertion failure in thread %lu in file %s line %lu\n",\
os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\
fprintf(stderr,\
"Innobase: we intentionally generate a memory trap.\n");\
"InnoDB: We intentionally generate a memory trap.\n");\
fprintf(stderr,\
"Innobase: Send a bug report to mysql@lists.mysql.com\n");\
"InnoDB: Send a detailed bug report to mysql@lists.mysql.com\n");\
ut_dbg_stop_threads = TRUE;\
dbg_i = *(ut_dbg_null_ptr);\
printf("%lu", dbg_i);\
}
#ifdef UNIV_DEBUG
#define ut_ad(EXPR) ut_a(EXPR)
#define ut_d(EXPR) {EXPR;}
......
......@@ -11,8 +11,7 @@ Created 1/20/1994 Heikki Tuuri
#include "univ.i"
#include <time.h>
#include <m_ctype.h>
#include <ctype.h>
typedef time_t ib_time_t;
......
......@@ -3219,6 +3219,7 @@ lock_rec_print(
ulint space;
ulint page_no;
ulint i;
ulint count = 0;
mtr_t mtr;
ut_ad(mutex_own(&kernel_mutex));
......@@ -3230,7 +3231,8 @@ lock_rec_print(
printf("\nRECORD LOCKS space id %lu page no %lu n bits %lu",
space, page_no, lock_rec_get_n_bits(lock));
printf(" index %s trx id %lu %lu", (lock->index)->name,
printf(" table %s index %s trx id %lu %lu",
lock->index->table->name, lock->index->name,
(lock->trx)->id.high, (lock->trx)->id.low);
if (lock_get_mode(lock) == LOCK_S) {
......@@ -3281,10 +3283,18 @@ lock_rec_print(
rec_print(page_find_rec_with_heap_no(page, i));
}
count++;
printf("\n");
}
}
if (count >= 3) {
printf(
"3 LOCKS PRINTED FOR THIS TRX AND PAGE: SUPPRESSING FURTHER PRINTS\n");
goto end_prints;
}
}
end_prints:
mtr_commit(&mtr);
}
......@@ -3335,7 +3345,6 @@ lock_print_info(void)
lock_mutex_enter_kernel();
printf("------------------------------------\n");
printf("LOCK INFO:\n");
printf("Number of locks in the record hash table %lu\n",
lock_get_n_rec_locks());
......@@ -3352,7 +3361,7 @@ loop:
if (trx == NULL) {
lock_mutex_exit_kernel();
lock_validate();
/* lock_validate(); */
return;
}
......@@ -3360,6 +3369,19 @@ loop:
if (nth_lock == 0) {
printf("\nLOCKS FOR TRANSACTION ID %lu %lu\n", trx->id.high,
trx->id.low);
if (trx->que_state == TRX_QUE_LOCK_WAIT) {
printf(
"################# TRX IS WAITING FOR THE LOCK: ###\n");
if (lock_get_type(trx->wait_lock) == LOCK_REC) {
lock_rec_print(trx->wait_lock);
} else {
lock_table_print(trx->wait_lock);
}
printf(
"##################################################\n");
}
}
i = 0;
......@@ -3409,6 +3431,16 @@ loop:
nth_lock++;
if (nth_lock >= 25) {
printf(
"25 LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n");
nth_trx++;
nth_lock = 0;
goto loop;
}
goto loop;
}
......
......@@ -838,7 +838,9 @@ log_io_complete(
/* It was a checkpoint write */
group = (log_group_t*)((ulint)group - 1);
if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
fil_flush(group->space_id);
}
......@@ -847,7 +849,9 @@ log_io_complete(
return;
}
if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
fil_flush(group->space_id);
}
......@@ -1478,7 +1482,7 @@ log_checkpoint(
recv_apply_hashed_log_recs(TRUE);
}
if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
fil_flush_file_spaces(FIL_TABLESPACE);
}
......@@ -1885,10 +1889,11 @@ loop:
fil_reserve_right_to_open();
file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
&ret);
OS_DATA_FILE, &ret);
if (!ret && (open_mode == OS_FILE_CREATE)) {
file_handle = os_file_create(name, OS_FILE_OPEN,
OS_FILE_AIO, &ret);
OS_FILE_AIO, OS_DATA_FILE, &ret);
}
if (!ret) {
......
......@@ -2234,7 +2234,8 @@ try_open_again:
fil_reserve_right_to_open();
file_handle = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO, &ret);
file_handle = os_file_create(name, OS_FILE_OPEN,
OS_FILE_LOG, OS_FILE_AIO, &ret);
if (ret == FALSE) {
fil_release_right_to_open();
......
......@@ -10,6 +10,7 @@ Created 10/21/1995 Heikki Tuuri
#include "os0sync.h"
#include "ut0mem.h"
#include "srv0srv.h"
#include "trx0sys.h"
#undef HAVE_FDATASYNC
......@@ -74,9 +75,12 @@ typedef struct os_aio_array_struct os_aio_array_t;
struct os_aio_array_struct{
os_mutex_t mutex; /* the mutex protecting the aio array */
os_event_t not_full; /* The event which is set to signaled
os_event_t not_full; /* The event which is set to the signaled
state when there is space in the aio
outside the ibuf segment */
os_event_t is_empty; /* The event which is set to the signaled
state when there are no pending i/os
in this array */
ulint n_slots; /* Total number of slots in the aio array.
This must be divisible by n_threads. */
ulint n_segments;/* Number of segments in the aio array of
......@@ -254,6 +258,7 @@ os_file_create(
if a new is created or an old overwritten */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file */
ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success)/* out: TRUE if succeed, FALSE if error */
{
#ifdef __WIN__
......@@ -347,11 +352,10 @@ try_again:
UT_NOT_USED(purpose);
/* Currently use only O_SYNC because there may be a bug in
Linux O_DSYNC! */
#ifdef O_SYNC
if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
if ((!srv_use_doublewrite_buf || type != OS_DATA_FILE)
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
create_flag = create_flag | O_SYNC;
}
#endif
......@@ -551,12 +555,6 @@ os_file_flush(
#else
int ret;
#ifdef O_DSYNC
if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
return(TRUE);
}
#endif
#ifdef HAVE_FDATASYNC
ret = fdatasync(file);
#else
......@@ -637,7 +635,8 @@ os_file_pwrite(
ret = pwrite(file, buf, n, offs);
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& !trx_doublewrite) {
/* Always do fsync to reduce the probability that when
the OS crashes, a database page is only partially
......@@ -666,7 +665,8 @@ os_file_pwrite(
ret = write(file, buf, n);
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& !trx_doublewrite) {
/* Always do fsync to reduce the probability that when
the OS crashes, a database page is only partially
......@@ -825,7 +825,9 @@ try_again:
/* Always do fsync to reduce the probability that when the OS crashes,
a database page is only partially physically written to disk. */
ut_a(TRUE == os_file_flush(file));
if (!trx_doublewrite) {
ut_a(TRUE == os_file_flush(file));
}
os_mutex_exit(os_file_seek_mutexes[i]);
......@@ -900,6 +902,10 @@ os_aio_array_create(
array->mutex = os_mutex_create(NULL);
array->not_full = os_event_create(NULL);
array->is_empty = os_event_create(NULL);
os_event_set(array->is_empty);
array->n_slots = n;
array->n_segments = n_segments;
array->n_reserved = 0;
......@@ -999,6 +1005,17 @@ os_aio_init(
#endif
}
/****************************************************************************
Waits until there are no pending writes in os_aio_write_array. There can
be other, synchronous, pending writes. */
void
os_aio_wait_until_no_pending_writes(void)
/*=====================================*/
{
os_event_wait(os_aio_write_array->is_empty);
}
/**************************************************************************
Calculates segment number for a slot. */
static
......@@ -1191,6 +1208,10 @@ loop:
array->n_reserved++;
if (array->n_reserved == 1) {
os_event_reset(array->is_empty);
}
if (array->n_reserved == array->n_slots) {
os_event_reset(array->not_full);
}
......@@ -1264,6 +1285,10 @@ os_aio_array_free_slot(
os_event_set(array->not_full);
}
if (array->n_reserved == 0) {
os_event_set(array->is_empty);
}
#ifdef WIN_ASYNC_IO
os_event_reset(slot->control.hEvent);
#endif
......@@ -1377,6 +1402,7 @@ os_aio(
DWORD len = n;
void* dummy_mess1;
void* dummy_mess2;
ulint dummy_type;
#endif
ulint err = 0;
ibool retry;
......@@ -1489,8 +1515,9 @@ try_again:
use the same wait mechanism as for async i/o */
return(os_aio_windows_handle(ULINT_UNDEFINED,
slot->pos,
&dummy_mess1, &dummy_mess2));
slot->pos,
&dummy_mess1, &dummy_mess2,
&dummy_type));
}
return(TRUE);
......@@ -1547,7 +1574,8 @@ os_aio_windows_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
void** message2)
void** message2,
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
os_aio_array_t* array;
os_aio_slot_t* slot;
......@@ -1592,10 +1620,12 @@ os_aio_windows_handle(
*message1 = slot->message1;
*message2 = slot->message2;
*type = slot->type;
if (ret && len == slot->len) {
ret_val = TRUE;
if (slot->type == OS_FILE_WRITE) {
if (slot->type == OS_FILE_WRITE && !trx_doublewrite) {
ut_a(TRUE == os_file_flush(slot->file));
}
} else {
......@@ -1679,7 +1709,7 @@ os_aio_posix_handle(
*message1 = slot->message1;
*message2 = slot->message2;
if (slot->type == OS_FILE_WRITE) {
if (slot->type == OS_FILE_WRITE && !trx_doublewrite) {
ut_a(TRUE == os_file_flush(slot->file));
}
......@@ -1709,7 +1739,8 @@ os_aio_simulated_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
void** message2)
void** message2,
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
os_aio_array_t* array;
ulint segment;
......@@ -1906,6 +1937,8 @@ slot_io_done:
*message1 = slot->message1;
*message2 = slot->message2;
*type = slot->type;
os_mutex_exit(array->mutex);
os_aio_array_free_slot(array, slot);
......@@ -1989,13 +2022,13 @@ os_aio_print(void)
os_aio_slot_t* slot;
ulint n_reserved;
ulint i;
printf("Pending normal aio reads:\n");
array = os_aio_read_array;
loop:
ut_a(array);
printf("INFO OF AN AIO ARRAY\n");
os_mutex_enter(array->mutex);
ut_a(array->n_slots > 0);
......@@ -2022,24 +2055,29 @@ loop:
os_mutex_exit(array->mutex);
if (array == os_aio_read_array) {
printf("Pending aio writes:\n");
array = os_aio_write_array;
goto loop;
}
if (array == os_aio_write_array) {
printf("Pending insert buffer aio reads:\n");
array = os_aio_ibuf_array;
goto loop;
}
if (array == os_aio_ibuf_array) {
printf("Pending log writes or reads:\n");
array = os_aio_log_array;
goto loop;
}
if (array == os_aio_log_array) {
printf("Pending synchronous reads or writes:\n");
array = os_aio_sync_array;
goto loop;
......
......@@ -1019,16 +1019,16 @@ page_cur_delete_rec(
page_cur_t* cursor, /* in: a page cursor */
mtr_t* mtr) /* in: mini-transaction handle */
{
page_dir_slot_t* cur_dir_slot;
page_dir_slot_t* prev_slot;
page_t* page;
rec_t* current_rec;
rec_t* prev_rec = NULL;
rec_t* next_rec;
ulint cur_slot_no;
page_dir_slot_t* cur_dir_slot;
page_dir_slot_t* prev_slot;
ulint cur_n_owned;
rec_t* rec;
ut_ad(cursor && mtr);
page = page_cur_get_page(cursor);
......@@ -1037,7 +1037,7 @@ page_cur_delete_rec(
/* The record must not be the supremum or infimum record. */
ut_ad(current_rec != page_get_supremum_rec(page));
ut_ad(current_rec != page_get_infimum_rec(page));
/* Save to local variables some data associated with current_rec */
cur_slot_no = page_dir_find_owner_slot(current_rec);
cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
......
......@@ -2028,11 +2028,7 @@ pars_complete_graph_for_exec(
que_node_set_parent(node, thr);
mutex_enter(&kernel_mutex);
trx->graph = NULL;
mutex_exit(&kernel_mutex);
return(thr);
}
......@@ -295,14 +295,18 @@ This function is used to compare a data tuple to a physical record.
Only dtuple->n_fields_cmp first fields are taken into account for
the data tuple! If we denote by n = n_fields_cmp, then rec must
have either m >= n fields, or it must differ from dtuple in some of
the m fields rec has. */
the m fields rec has. If rec has an externally stored field we do not
compare it but return with value 0 if such a comparison should be
made. */
int
cmp_dtuple_rec_with_match(
/*======================*/
/* out: 1, 0, -1, if dtuple is greater, equal,
less than rec, respectively, when only the
common first fields are compared */
common first fields are compared, or
until the first externally stored field in
rec */
dtuple_t* dtuple, /* in: data tuple */
rec_t* rec, /* in: physical record which differs from
dtuple in some of the common fields, or which
......@@ -344,7 +348,8 @@ cmp_dtuple_rec_with_match(
ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
ut_ad(cur_field <= rec_get_n_fields(rec));
/* Match fields in a loop; stop if we run out of fields in dtuple */
/* Match fields in a loop; stop if we run out of fields in dtuple
or find an externally stored field */
while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
......@@ -357,7 +362,8 @@ cmp_dtuple_rec_with_match(
/* If we have matched yet 0 bytes, it may be that one or
both the fields are SQL null, or the record or dtuple may be
the predefined minimum record */
the predefined minimum record, or the field is externally
stored */
if (cur_bytes == 0) {
if (cur_field == 0) {
......@@ -384,6 +390,15 @@ cmp_dtuple_rec_with_match(
}
}
if (rec_get_nth_field_extern_bit(rec, cur_field)) {
/* We do not compare to an externally
stored field */
ret = 0;
goto order_resolved;
}
if (dtuple_f_len == UNIV_SQL_NULL
|| rec_f_len == UNIV_SQL_NULL) {
......@@ -604,7 +619,8 @@ cmp_dtuple_rec_prefix_equal(
/*****************************************************************
This function is used to compare two physical records. Only the common
first fields are compared. */
first fields are compared, and if an externally stored field is
encountered, then 0 is returned. */
int
cmp_rec_rec_with_match(
......@@ -688,8 +704,18 @@ cmp_rec_rec_with_match(
goto order_resolved;
}
}
}
if (rec_get_nth_field_extern_bit(rec1, cur_field)
|| rec_get_nth_field_extern_bit(rec2, cur_field)) {
/* We do not compare to an externally
stored field */
ret = 0;
goto order_resolved;
}
if (rec1_f_len == UNIV_SQL_NULL
|| rec2_f_len == UNIV_SQL_NULL) {
......@@ -812,7 +838,8 @@ order_resolved:
Used in debug checking of cmp_dtuple_... .
This function is used to compare a data tuple to a physical record. If
dtuple has n fields then rec must have either m >= n fields, or it must
differ from dtuple in some of the m fields rec has. */
differ from dtuple in some of the m fields rec has. If it encounters an
externally stored field, it returns 0. */
static
int
cmp_debug_dtuple_rec_with_match(
......@@ -882,6 +909,14 @@ cmp_debug_dtuple_rec_with_match(
rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len);
if (rec_get_nth_field_extern_bit(rec, cur_field)) {
/* We do not compare to an externally stored field */
ret = 0;
goto order_resolved;
}
ret = cmp_data_data(cur_type, dtuple_f_data, dtuple_f_len,
rec_f_data, rec_f_len);
if (ret != 0) {
......
/************************************************************************
Record manager
(c) 1994-1996 Innobase Oy
(c) 1994-2001 Innobase Oy
Created 5/30/1994 Heikki Tuuri
*************************************************************************/
......@@ -12,6 +12,9 @@ Created 5/30/1994 Heikki Tuuri
#include "rem0rec.ic"
#endif
#include "mtr0mtr.h"
#include "mtr0log.h"
/* PHYSICAL RECORD
===============
......@@ -21,7 +24,10 @@ found in index pages of the database, has the following format
represented on a higher text line):
| offset of the end of the last field of data, the most significant
bit is set to 1 if and only if the field is SQL-null |
bit is set to 1 if and only if the field is SQL-null,
if the offset is 2-byte, then the second most significant
bit is set to 1 if the field is stored on another page:
mostly this will occur in the case of big BLOB fields |
...
| offset of the end of the first field of data + the SQL-null bit |
| 4 bits used to delete mark a record, and mark a predefined
......@@ -122,7 +128,8 @@ rec_get_nth_field(
return(rec + os);
}
next_os = next_os & ~REC_2BYTE_SQL_NULL_MASK;
next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
| REC_2BYTE_EXTERN_MASK);
}
*len = next_os - os;
......@@ -170,6 +177,60 @@ rec_set_nth_field_null_bit(
rec_2_set_field_end_info(rec, i, info);
}
/***************************************************************
Sets or clears the extern storage bit in the end info of the ith field
of a record. The record must use 2-byte field end offsets. */

void
rec_set_nth_field_extern_bit(
/*=========================*/
	rec_t*	rec,	/* in: record; asserted not to have the 1-byte
			offsets flag set */
	ulint	i,	/* in: ith field; asserted to be less than the
			number of fields in rec */
	ibool	val,	/* in: value to set: nonzero sets the bit,
			zero clears it */
	mtr_t*	mtr)	/* in: mtr holding an X-latch to the page where
			rec is, or NULL; in the NULL case we do not
			write to log about the change */
{
	ulint	end_info;

	ut_a(!rec_get_1byte_offs_flag(rec));
	ut_a(i < rec_get_n_fields(rec));

	end_info = rec_2_get_field_end_info(rec, i);

	end_info = val	? (end_info | REC_2BYTE_EXTERN_MASK)
			: (end_info & ~REC_2BYTE_EXTERN_MASK);

	if (mtr == NULL) {
		/* No mtr given: update the record in place, no logging */

		rec_2_set_field_end_info(rec, i, end_info);

		return;
	}

	/* Write the new 2-byte end info of field i through the mtr */

	mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), end_info,
							MLOG_2BYTES, mtr);
}
/***************************************************************
Sets to TRUE the extern storage bit of every field whose number is
listed in an array. The fields are processed in array order. */

void
rec_set_field_extern_bits(
/*======================*/
	rec_t*	rec,		/* in: record */
	ulint*	vec,		/* in: array of field numbers */
	ulint	n_fields,	/* in: number of field numbers in vec */
	mtr_t*	mtr)		/* in: mtr holding an X-latch to the page
				where rec is, or NULL; in the NULL case we
				do not write to log about the change */
{
	ulint	nth = 0;

	while (nth < n_fields) {
		ulint	field_no = vec[nth];

		rec_set_nth_field_extern_bit(rec, field_no, TRUE, mtr);

		nth++;
	}
}
/***************************************************************
Sets a record field to SQL null. The physical size of the field is not
changed. */
......
......@@ -234,7 +234,13 @@ row_ins_clust_index_entry_by_modify(
depending on whether mtr holds just a leaf
latch or also a tree latch */
btr_cur_t* cursor, /* in: B-tree cursor */
big_rec_t** big_rec,/* out: possible big rec vector of fields
which have to be stored externally by the
caller */
dtuple_t* entry, /* in: index entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */
{
......@@ -243,8 +249,10 @@ row_ins_clust_index_entry_by_modify(
upd_t* update;
ulint err;
ut_ad((cursor->index)->type & DICT_CLUSTERED);
ut_ad(cursor->index->type & DICT_CLUSTERED);
*big_rec = NULL;
rec = btr_cur_get_rec(cursor);
ut_ad(rec_get_deleted_flag(rec));
......@@ -254,21 +262,21 @@ row_ins_clust_index_entry_by_modify(
/* Build an update vector containing all the fields to be modified;
NOTE that this vector may contain also system columns! */
update = row_upd_build_difference(cursor->index, entry, rec, heap);
update = row_upd_build_difference(cursor->index, entry, ext_vec,
n_ext_vec, rec, heap);
if (mode == BTR_MODIFY_LEAF) {
/* Try optimistic updating of the record, keeping changes
within the page */
err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
mtr);
if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) {
err = btr_cur_optimistic_update(0, cursor, update, 0, thr, mtr);
if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
err = DB_FAIL;
}
} else {
ut_ad(mode == BTR_MODIFY_TREE);
err = btr_cur_pessimistic_update(0, cursor, update, 0, thr,
mtr);
ut_a(mode == BTR_MODIFY_TREE);
err = btr_cur_pessimistic_update(0, cursor, big_rec, update,
0, thr, mtr);
}
mem_heap_free(heap);
......@@ -597,14 +605,18 @@ row_ins_index_entry_low(
pessimistic descent down the index tree */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr) /* in: query thread */
{
btr_cur_t cursor;
ulint modify;
rec_t* dummy_rec;
rec_t* insert_rec;
rec_t* rec;
ulint err;
ulint n_unique;
big_rec_t* big_rec = NULL;
mtr_t mtr;
log_free_check();
......@@ -682,24 +694,54 @@ row_ins_index_entry_low(
if (index->type & DICT_CLUSTERED) {
err = row_ins_clust_index_entry_by_modify(mode,
&cursor, entry,
thr, &mtr);
&cursor, &big_rec,
entry,
ext_vec, n_ext_vec,
thr, &mtr);
} else {
err = row_ins_sec_index_entry_by_modify(&cursor,
thr, &mtr);
}
} else if (mode == BTR_MODIFY_LEAF) {
err = btr_cur_optimistic_insert(0, &cursor, entry,
&dummy_rec, thr, &mtr);
} else {
ut_ad(mode == BTR_MODIFY_TREE);
err = btr_cur_pessimistic_insert(0, &cursor, entry,
&dummy_rec, thr, &mtr);
if (mode == BTR_MODIFY_LEAF) {
err = btr_cur_optimistic_insert(0, &cursor, entry,
&insert_rec, &big_rec, thr, &mtr);
} else {
ut_a(mode == BTR_MODIFY_TREE);
err = btr_cur_pessimistic_insert(0, &cursor, entry,
&insert_rec, &big_rec, thr, &mtr);
}
if (err == DB_SUCCESS) {
if (ext_vec) {
rec_set_field_extern_bits(insert_rec,
ext_vec, n_ext_vec, &mtr);
}
}
}
function_exit:
mtr_commit(&mtr);
if (big_rec) {
mtr_start(&mtr);
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
BTR_MODIFY_TREE, &cursor, 0, &mtr);
err = btr_store_big_rec_extern_fields(index,
btr_cur_get_rec(&cursor),
big_rec, &mtr);
if (modify) {
dtuple_big_rec_free(big_rec);
} else {
dtuple_convert_back_big_rec(index, entry, big_rec);
}
mtr_commit(&mtr);
}
return(err);
}
......@@ -716,14 +758,17 @@ row_ins_index_entry(
DB_DUPLICATE_KEY, or some other error code */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr) /* in: query thread */
{
ulint err;
/* Try first optimistic descent to the B-tree */
err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr);
err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
ext_vec, n_ext_vec, thr);
if (err != DB_FAIL) {
return(err);
......@@ -731,8 +776,8 @@ row_ins_index_entry(
/* Try then pessimistic descent to the B-tree */
err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr);
err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
ext_vec, n_ext_vec, thr);
return(err);
}
......@@ -784,7 +829,7 @@ row_ins_index_entry_step(
ut_ad(dtuple_check_typed(node->entry));
err = row_ins_index_entry(node->index, node->entry, thr);
err = row_ins_index_entry(node->index, node->entry, NULL, 0, thr);
return(err);
}
......
......@@ -625,7 +625,8 @@ row_update_for_mysql(
ut_ad(prebuilt && trx);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
UT_NOT_USED(mysql_rec);
node = prebuilt->upd_node;
clust_index = dict_table_get_first_index(table);
......@@ -777,7 +778,9 @@ row_get_mysql_key_number_for_index(
}
/*************************************************************************
Does a table creation operation for MySQL. */
Does a table creation operation for MySQL. If the name of the created
table ends in the characters innodb_monitor, then this also starts
printing of monitor output by the master thread. */
int
row_create_table_for_mysql(
......@@ -789,6 +792,8 @@ row_create_table_for_mysql(
tab_node_t* node;
mem_heap_t* heap;
que_thr_t* thr;
ulint namelen;
ulint keywordlen;
ulint err;
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
......@@ -833,6 +838,20 @@ row_create_table_for_mysql(
}
trx->error_state = DB_SUCCESS;
} else {
namelen = ut_strlen(table->name);
keywordlen = ut_strlen("innodb_monitor");
if (namelen >= keywordlen
&& 0 == ut_memcmp(table->name + namelen - keywordlen,
"innodb_monitor", keywordlen)) {
/* Table name ends to characters innodb_monitor:
start monitor prints */
srv_print_innodb_monitor = TRUE;
}
}
mutex_exit(&(dict_sys->mutex));
......@@ -900,7 +919,9 @@ row_create_index_for_mysql(
}
/*************************************************************************
Drops a table for MySQL. */
Drops a table for MySQL. If the name of the dropped table ends in the
characters innodb_monitor, then this also stops printing of monitor
output by the master thread. */
int
row_drop_table_for_mysql(
......@@ -918,11 +939,26 @@ row_drop_table_for_mysql(
char* str1;
char* str2;
ulint len;
ulint namelen;
ulint keywordlen;
char buf[10000];
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_a(name != NULL);
namelen = ut_strlen(name);
keywordlen = ut_strlen("innodb_monitor");
if (namelen >= keywordlen
&& 0 == ut_memcmp(name + namelen - keywordlen,
"innodb_monitor", keywordlen)) {
/* Table name ends to characters innodb_monitor:
stop monitor prints */
srv_print_innodb_monitor = FALSE;
}
/* We use the private SQL parser of Innobase to generate the
query graphs needed in deleting the dictionary data from system
tables in Innobase. Deleting a row from SYS_INDEXES table also
......
......@@ -347,20 +347,36 @@ row_purge_del_mark(
}
/***************************************************************
Purges an update of an existing record. */
Purges an update of an existing record. Also purges an update of a delete
marked record if that record contained an externally stored field. */
static
void
row_purge_upd_exist(
/*================*/
row_purge_upd_exist_or_extern(
/*==========================*/
purge_node_t* node, /* in: row purge node */
que_thr_t* thr) /* in: query thread */
{
mem_heap_t* heap;
dtuple_t* entry;
dict_index_t* index;
upd_field_t* ufield;
ibool is_insert;
ulint rseg_id;
ulint page_no;
ulint offset;
ulint internal_offset;
byte* data_field;
ulint data_field_len;
ulint i;
mtr_t mtr;
ut_ad(node && thr);
if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
goto skip_secondaries;
}
heap = mem_heap_create(1024);
while (node->index != NULL) {
......@@ -378,6 +394,53 @@ row_purge_upd_exist(
}
mem_heap_free(heap);
skip_secondaries:
/* Free possible externally stored fields */
for (i = 0; i < upd_get_n_fields(node->update); i++) {
ufield = upd_get_nth_field(node->update, i);
if (ufield->extern_storage) {
/* We use the fact that new_val points to
node->undo_rec and get thus the offset of
dfield data inside the undo record. Then we
can calculate from node->roll_ptr the file
address of the new_val data */
internal_offset = ((byte*)ufield->new_val.data)
- node->undo_rec;
ut_a(internal_offset < UNIV_PAGE_SIZE);
trx_undo_decode_roll_ptr(node->roll_ptr,
&is_insert, &rseg_id,
&page_no, &offset);
mtr_start(&mtr);
/* We have to acquire an X-latch to the clustered
index tree */
index = dict_table_get_first_index(node->table);
mtr_x_lock(dict_tree_get_lock(index->tree), &mtr);
/* We assume in purge of externally stored fields
that the space id of the undo log record is 0! */
data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr)
+ offset + internal_offset;
buf_page_dbg_add_level(buf_frame_align(data_field),
SYNC_TRX_UNDO_PAGE);
data_field_len = ufield->new_val.len;
btr_free_externally_stored_field(index, data_field,
data_field_len, &mtr);
mtr_commit(&mtr);
}
}
}
/***************************************************************
......@@ -388,6 +451,9 @@ row_purge_parse_undo_rec(
/*=====================*/
/* out: TRUE if purge operation required */
purge_node_t* node, /* in: row undo node */
ibool* updated_extern,
/* out: TRUE if an externally stored field
was updated */
que_thr_t* thr) /* in: query thread */
{
dict_index_t* clust_index;
......@@ -403,10 +469,10 @@ row_purge_parse_undo_rec(
ut_ad(node && thr);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
&undo_no, &table_id);
updated_extern, &undo_no, &table_id);
node->rec_type = type;
if (type == TRX_UNDO_UPD_DEL_REC) {
if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
return(FALSE);
}
......@@ -416,7 +482,7 @@ row_purge_parse_undo_rec(
node->table = NULL;
if (type == TRX_UNDO_UPD_EXIST_REC
&& cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
&& cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
/* Purge requires no changes to indexes: we may return */
......@@ -455,8 +521,11 @@ row_purge_parse_undo_rec(
/* Read to the partial row the fields that occur in indexes */
ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row),
node->heap);
/* Read to the partial row the fields that occur in indexes, but only
when the UPD_NODE_NO_ORD_CHANGE flag is not set in cmpl_info. The flag
must be masked out before negating: '!' binds tighter than '&', so
'!cmpl_info & FLAG' would test (!cmpl_info) & FLAG, which can be
nonzero only when cmpl_info == 0. */
if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
	ptr = trx_undo_rec_get_partial_row(ptr, clust_index,
					&(node->row), node->heap);
}
return(TRUE);
}
......@@ -475,6 +544,7 @@ row_purge(
{
dulint roll_ptr;
ibool purge_needed;
ibool updated_extern;
ut_ad(node && thr);
......@@ -494,7 +564,8 @@ row_purge(
if (node->undo_rec == &trx_purge_dummy_rec) {
purge_needed = FALSE;
} else {
purge_needed = row_purge_parse_undo_rec(node, thr);
purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
thr);
}
if (purge_needed) {
......@@ -503,11 +574,13 @@ row_purge(
node->index = dict_table_get_next_index(
dict_table_get_first_index(node->table));
if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
row_purge_upd_exist(node, thr);
} else {
ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC);
if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
row_purge_del_mark(node, thr);
} else if (updated_extern
|| node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
row_purge_upd_exist_or_extern(node, thr);
}
if (node->found_clust) {
......
......@@ -146,15 +146,17 @@ row_build_index_entry(
/***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a
record in a clustered index. */
record in a clustered index. NOTE that externally stored (often big)
fields are always copied to heap. */
dtuple_t*
row_build(
/*======*/
/* out, own: row built; see the NOTE below! */
ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
the former copies also the data fields to
heap as the latter only places pointers to
ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or
ROW_COPY_ALSO_EXTERNALS,
the two last copy also the data fields to
heap as the first only places pointers to
data fields on the index page, and thus is
more efficient */
dict_index_t* index, /* in: clustered index */
......@@ -170,19 +172,19 @@ row_build(
{
dtuple_t* row;
dict_table_t* table;
ulint n_fields;
ulint i;
dict_col_t* col;
dfield_t* dfield;
ulint n_fields;
byte* field;
ulint len;
ulint row_len;
dict_col_t* col;
byte* buf;
ulint i;
ut_ad(index && rec && heap);
ut_ad(index->type & DICT_CLUSTERED);
if (type == ROW_COPY_DATA) {
if (type != ROW_COPY_POINTERS) {
/* Take a copy of rec to heap */
buf = mem_heap_alloc(heap, rec_get_size(rec));
rec = rec_copy(buf, rec);
......@@ -207,6 +209,13 @@ row_build(
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
field = rec_get_nth_field(rec, i, &len);
if (type == ROW_COPY_ALSO_EXTERNALS
&& rec_get_nth_field_extern_bit(rec, i)) {
field = btr_rec_copy_externally_stored_field(rec,
i, &len, heap);
}
dfield_set_data(dfield, field, len);
}
......@@ -215,6 +224,7 @@ row_build(
return(row);
}
#ifdef notdefined
/***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a
record in a clustered index. */
......@@ -229,7 +239,9 @@ row_build_to_tuple(
directly into this record, therefore,
the buffer page of this record must be
at least s-latched and the latch held
as long as the row dtuple is used! */
as long as the row dtuple is used!
NOTE 2: does not work with externally
stored fields! */
{
dict_table_t* table;
ulint n_fields;
......@@ -265,9 +277,11 @@ row_build_to_tuple(
ut_ad(dtuple_check_typed(row));
}
#endif
/***********************************************************************
Converts an index record to a typed data tuple. */
Converts an index record to a typed data tuple. NOTE that externally
stored (often big) fields are NOT copied to heap. */
dtuple_t*
row_rec_to_index_entry(
......
......@@ -2036,7 +2036,8 @@ row_sel_store_mysql_rec(
which was described in prebuilt's
template */
{
mysql_row_templ_t* templ;
mysql_row_templ_t* templ;
mem_heap_t* extern_field_heap = NULL;
byte* data;
ulint len;
byte* blob_buf;
......@@ -2059,6 +2060,24 @@ row_sel_store_mysql_rec(
data = rec_get_nth_field(rec, templ->rec_field_no, &len);
if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) {
/* Copy an externally stored field to the temporary
heap */
if (prebuilt->trx->has_search_latch) {
rw_lock_s_unlock(&btr_search_latch);
prebuilt->trx->has_search_latch = FALSE;
}
extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE);
data = btr_rec_copy_externally_stored_field(rec,
templ->rec_field_no, &len,
extern_field_heap);
ut_a(len != UNIV_SQL_NULL);
}
if (len != UNIV_SQL_NULL) {
if (templ->type == DATA_BLOB) {
......@@ -2081,6 +2100,10 @@ row_sel_store_mysql_rec(
mysql_rec + templ->mysql_col_offset,
templ->mysql_col_len, data, len,
templ->type, templ->is_unsigned);
if (extern_field_heap) {
mem_heap_free(extern_field_heap);
}
} else {
mysql_rec[templ->mysql_null_byte_offset] |=
(byte) (templ->mysql_null_bit_mask);
......@@ -2450,6 +2473,7 @@ row_search_for_mysql(
ibool unique_search_from_clust_index = FALSE;
ibool mtr_has_extra_clust_latch = FALSE;
ibool moves_up = FALSE;
ulint cnt = 0;
mtr_t mtr;
ut_ad(index && pcur && search_tuple);
......@@ -2457,6 +2481,11 @@ row_search_for_mysql(
ut_ad(sync_thread_levels_empty_gen(FALSE));
/* printf("Match mode %lu\n search tuple ", match_mode);
dtuple_print(search_tuple);
printf("N tables locked %lu\n", trx->mysql_n_tables_locked);
*/
if (direction == 0) {
prebuilt->n_rows_fetched = 0;
prebuilt->n_fetch_cached = 0;
......@@ -2528,6 +2557,8 @@ row_search_for_mysql(
mtr_commit(&mtr);
/* printf("%s record not found 1\n", index->name); */
return(DB_RECORD_NOT_FOUND);
}
......@@ -2565,17 +2596,18 @@ row_search_for_mysql(
mtr_commit(&mtr);
/* printf("%s shortcut\n", index->name); */
return(DB_SUCCESS);
} else if (shortcut == SEL_EXHAUSTED) {
mtr_commit(&mtr);
/* printf("%s record not found 2\n",
index->name); */
return(DB_RECORD_NOT_FOUND);
}
/* Commit the mini-transaction since it can
hold latches */
mtr_commit(&mtr);
mtr_start(&mtr);
......@@ -2659,7 +2691,12 @@ rec_loop:
cons_read_requires_clust_rec = FALSE;
rec = btr_pcur_get_rec(pcur);
/*
printf("Using index %s cnt %lu ", index->name, cnt);
printf("; Page no %lu\n",
buf_frame_get_page_no(buf_frame_align(rec)));
rec_print(rec);
*/
if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
/* The infimum record on a page cannot be in the result set,
......@@ -2700,12 +2737,15 @@ rec_loop:
/* Test if the index record matches completely to search_tuple
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
/* printf("Comparing rec and search tuple\n"); */
if (0 != cmp_dtuple_rec(search_tuple, rec)) {
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
/* printf("%s record not found 3\n", index->name); */
goto normal_return;
}
......@@ -2716,6 +2756,7 @@ rec_loop:
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
/* printf("%s record not found 4\n", index->name); */
goto normal_return;
}
......@@ -2884,6 +2925,8 @@ next_rec:
moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
moves_up, &mtr);
if (moved) {
cnt++;
goto rec_loop;
}
}
......@@ -2906,6 +2949,8 @@ next_rec:
goto normal_return;
}
cnt++;
goto rec_loop;
/*-------------------------------------------------------------*/
lock_wait_or_error:
......@@ -2931,7 +2976,9 @@ lock_wait_or_error:
goto rec_loop;
}
/* printf("Using index %s cnt %lu ret value %lu err\n", index->name,
cnt, err); */
return(err);
normal_return:
......@@ -2945,5 +2992,7 @@ normal_return:
ret = DB_SUCCESS;
}
/* printf("Using index %s cnt %lu ret value %lu\n", index->name,
cnt, err); */
return(ret);
}
......@@ -242,11 +242,12 @@ row_undo_ins_parse_undo_rec(
dulint table_id;
ulint type;
ulint dummy;
ibool dummy_extern;
ut_ad(node && thr);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no,
&table_id);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
&dummy_extern, &undo_no, &table_id);
ut_ad(type == TRX_UNDO_INSERT_REC);
node->rec_type = type;
......@@ -284,9 +285,9 @@ row_undo_ins(
row_undo_ins_parse_undo_rec(node, thr);
if (node->table == NULL) {
found = FALSE;
found = FALSE;
} else {
found = row_undo_search_clust_to_pcur(node, thr);
found = row_undo_search_clust_to_pcur(node, thr);
}
if (!found) {
......
......@@ -94,12 +94,12 @@ row_undo_mod_clust_low(
mtr_t* mtr, /* in: mtr */
ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
big_rec_t* dummy_big_rec;
dict_index_t* index;
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
ibool success;
ibool do_remove;
index = dict_table_get_first_index(node->table);
......@@ -110,49 +110,80 @@ row_undo_mod_clust_low(
ut_ad(success);
if (mode == BTR_MODIFY_LEAF) {
err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG
| BTR_KEEP_SYS_FLAG,
btr_cur, node->update,
node->cmpl_info, thr, mtr);
} else {
ut_ad(mode == BTR_MODIFY_TREE);
err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG
| BTR_KEEP_SYS_FLAG,
btr_cur, &dummy_big_rec, node->update,
node->cmpl_info, thr, mtr);
}
return(err);
}
/***************************************************************
Removes a clustered index record after undo if possible. */
static
ulint
row_undo_mod_remove_clust_low(
/*==========================*/
/* out: DB_SUCCESS, DB_FAIL, or error code:
we may run out of file space */
undo_node_t* node, /* in: row undo node */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr, /* in: mtr */
ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
ibool success;
pcur = &(node->pcur);
btr_cur = btr_pcur_get_btr_cur(pcur);
success = btr_pcur_restore_position(mode, pcur, mtr);
if (!success) {
return(DB_SUCCESS);
}
/* Find out if we can remove the whole clustered index record */
if (node->rec_type == TRX_UNDO_UPD_DEL_REC
&& !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
do_remove = TRUE;
/* Ok, we can remove */
} else {
do_remove = FALSE;
return(DB_SUCCESS);
}
if (mode == BTR_MODIFY_LEAF) {
success = btr_cur_optimistic_delete(btr_cur, mtr);
if (do_remove) {
success = btr_cur_optimistic_delete(btr_cur, mtr);
if (success) {
err = DB_SUCCESS;
} else {
err = DB_FAIL;
}
if (success) {
err = DB_SUCCESS;
} else {
err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG
| BTR_KEEP_SYS_FLAG,
btr_cur, node->update,
node->cmpl_info, thr, mtr);
err = DB_FAIL;
}
} else {
ut_ad(mode == BTR_MODIFY_TREE);
if (do_remove) {
btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
and restart with more file space */
} else {
err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG
| BTR_KEEP_SYS_FLAG,
btr_cur, node->update,
node->cmpl_info, thr, mtr);
}
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
and restart with more file space */
}
return(err);
......@@ -204,10 +235,31 @@ row_undo_mod_clust(
err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
}
node->state = UNDO_NODE_FETCH_NEXT;
btr_pcur_commit_specify_mtr(pcur, &mtr);
if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
mtr_start(&mtr);
err = row_undo_mod_remove_clust_low(node, thr, &mtr,
BTR_MODIFY_LEAF);
if (err != DB_SUCCESS) {
btr_pcur_commit_specify_mtr(pcur, &mtr);
/* We may have to modify tree structure: do a
pessimistic descent down the index tree */
mtr_start(&mtr);
err = row_undo_mod_remove_clust_low(node, thr, &mtr,
BTR_MODIFY_TREE);
}
btr_pcur_commit_specify_mtr(pcur, &mtr);
}
node->state = UNDO_NODE_FETCH_NEXT;
trx_undo_rec_release(node->trx, node->undo_no);
if (more_vers && err == DB_SUCCESS) {
......@@ -388,7 +440,6 @@ row_undo_mod_del_unmark_sec(
mem_free(err_buf);
} else {
btr_cur = btr_pcur_get_btr_cur(&pcur);
err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
......@@ -546,11 +597,12 @@ row_undo_mod_parse_undo_rec(
ulint info_bits;
ulint type;
ulint cmpl_info;
ibool dummy_extern;
ut_ad(node && thr);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
&undo_no, &table_id);
&dummy_extern, &undo_no, &table_id);
node->rec_type = type;
node->table = dict_table_get_on_id(table_id, thr_get_trx(thr));
......@@ -598,10 +650,9 @@ row_undo_mod(
row_undo_mod_parse_undo_rec(node, thr);
if (node->table == NULL) {
found = FALSE;
found = FALSE;
} else {
found = row_undo_search_clust_to_pcur(node, thr);
found = row_undo_search_clust_to_pcur(node, thr);
}
if (!found) {
......
......@@ -124,6 +124,8 @@ row_undo_node_create(
undo->state = UNDO_NODE_FETCH_NEXT;
undo->trx = trx;
btr_pcur_init(&(undo->pcur));
undo->heap = mem_heap_create(256);
return(undo);
......@@ -303,6 +305,16 @@ row_undo_step(
if (err != DB_SUCCESS) {
/* SQL error detected */
fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", err);
if (err == DB_OUT_OF_FILE_SPACE) {
fprintf(stderr,
"InnoDB: Error 13 means out of tablespace.\n"
"InnoDB: Consider increasing your tablespace.\n");
exit(1);
}
ut_a(0);
return(NULL);
......
......@@ -90,8 +90,10 @@ upd_node_create(
node->in_mysql_interface = FALSE;
node->row = NULL;
node->ext_vec = NULL;
node->index = NULL;
node->update = NULL;
node->select = NULL;
node->heap = mem_heap_create(128);
......@@ -160,7 +162,8 @@ row_upd_index_entry_sys_field(
}
/***************************************************************
Returns TRUE if row update changes size of some field in index. */
Returns TRUE if row update changes size of some field in index
or if some field to be updated is stored externally in rec or update. */
ibool
row_upd_changes_field_size(
......@@ -199,6 +202,16 @@ row_upd_changes_field_size(
return(TRUE);
}
if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) {
return(TRUE);
}
if (upd_field->extern_storage) {
return(TRUE);
}
}
return(FALSE);
......@@ -441,6 +454,34 @@ row_upd_index_parse(
return(ptr);
}
/*******************************************************************
Searches the first n_ext_vec elements of ext_vec for the number i. A NULL
ext_vec is treated as containing nothing. */
UNIV_INLINE
ibool
upd_ext_vec_contains(
/*=================*/
				/* out: TRUE if i is in ext_vec, FALSE
				otherwise or if ext_vec is NULL */
	ulint*	ext_vec,	/* in: array of indexes or NULL */
	ulint	n_ext_vec,	/* in: number of numbers in ext_vec */
	ulint	i)		/* in: a number */
{
	ulint	pos	= 0;

	if (ext_vec != NULL) {
		/* Linear scan from the start of the array; stop at the
		first element equal to i */
		while (pos < n_ext_vec) {
			if (i == ext_vec[pos]) {

				return(TRUE);
			}

			pos++;
		}
	}

	return(FALSE);
}
/*******************************************************************
Builds an update vector from those fields, excluding the roll ptr and
......@@ -454,6 +495,9 @@ row_upd_build_difference(
fields, excluding roll ptr and trx id */
dict_index_t* index, /* in: clustered index */
dtuple_t* entry, /* in: entry to insert */
ulint* ext_vec,/* in: array containing field numbers of
externally stored fields in entry, or NULL */
ulint n_ext_vec,/* in: number of fields in ext_vec */
rec_t* rec, /* in: clustered index record */
mem_heap_t* heap) /* in: memory heap from which allocated */
{
......@@ -480,16 +524,25 @@ row_upd_build_difference(
for (i = 0; i < dtuple_get_n_fields(entry); i++) {
data = rec_get_nth_field(rec, i, &len);
dfield = dtuple_get_nth_field(entry, i);
if ((i != trx_id_pos) && (i != roll_ptr_pos)
&& !dfield_data_is_equal(dfield, len, data)) {
if ((rec_get_nth_field_extern_bit(rec, i)
!= upd_ext_vec_contains(ext_vec, n_ext_vec, i))
|| ((i != trx_id_pos) && (i != roll_ptr_pos)
&& !dfield_data_is_equal(dfield, len, data))) {
upd_field = upd_get_nth_field(update, n_diff);
dfield_copy(&(upd_field->new_val), dfield);
upd_field_set_field_no(upd_field, i, index);
if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) {
upd_field->extern_storage = TRUE;
} else {
upd_field->extern_storage = FALSE;
}
n_diff++;
}
......@@ -630,9 +683,7 @@ row_upd_changes_ord_field(
}
/***************************************************************
Checks if an update vector changes an ordering field of an index record.
This function is fast if the update vector is short or the number of ordering
fields in the index is small. Otherwise, this can be quadratic. */
Checks if an update vector changes an ordering field of an index record. */
ibool
row_upd_changes_some_index_ord_field(
......@@ -642,19 +693,24 @@ row_upd_changes_some_index_ord_field(
dict_table_t* table, /* in: table */
upd_t* update) /* in: update vector for the row */
{
upd_field_t* upd_field;
dict_index_t* index;
ulint i;
index = dict_table_get_first_index(table);
while (index) {
if (row_upd_changes_ord_field(NULL, index, update)) {
for (i = 0; i < upd_get_n_fields(update); i++) {
return(TRUE);
}
upd_field = upd_get_nth_field(update, i);
index = dict_table_get_next_index(index);
}
if (dict_field_get_col(dict_index_get_nth_field(index,
upd_field->field_no))
->ord_part) {
return(TRUE);
}
}
return(FALSE);
}
......@@ -710,15 +766,17 @@ row_upd_eval_new_vals(
/***************************************************************
Stores to the heap the row on which the node->pcur is positioned. */
UNIV_INLINE
static
void
row_upd_store_row(
/*==============*/
upd_node_t* node) /* in: row update node */
{
dict_index_t* clust_index;
upd_t* update;
rec_t* rec;
ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES);
ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
if (node->row != NULL) {
mem_heap_empty(node->heap);
......@@ -727,8 +785,20 @@ row_upd_store_row(
clust_index = dict_table_get_first_index(node->table);
node->row = row_build(ROW_COPY_DATA, clust_index,
btr_pcur_get_rec(node->pcur), node->heap);
rec = btr_pcur_get_rec(node->pcur);
node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap);
node->ext_vec = mem_heap_alloc(node->heap, rec_get_n_fields(rec));
if (node->is_delete) {
update = NULL;
} else {
update = node->update;
}
node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec,
rec, update);
}
/***************************************************************
......@@ -812,7 +882,7 @@ row_upd_sec_index_entry(
row_upd_index_replace_new_col_vals(entry, index, node->update);
/* Insert new index entry */
err = row_ins_index_entry(index, entry, thr);
err = row_ins_index_entry(index, entry, NULL, 0, thr);
mem_heap_free(heap);
......@@ -870,6 +940,8 @@ row_upd_clust_rec_by_insert(
dict_table_t* table;
mem_heap_t* heap;
dtuple_t* entry;
ulint* ext_vec;
ulint n_ext_vec;
ulint err;
ut_ad(node);
......@@ -897,14 +969,18 @@ row_upd_clust_rec_by_insert(
heap = mem_heap_create(1024);
ext_vec = mem_heap_alloc(heap,
sizeof(ulint) * dtuple_get_n_fields(node->row));
n_ext_vec = 0;
entry = row_build_index_entry(node->row, index, heap);
row_upd_clust_index_replace_new_col_vals(entry, node->update);
row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
err = row_ins_index_entry(index, entry, thr);
err = row_ins_index_entry(index, entry, node->ext_vec,
node->n_ext_vec, thr);
mem_heap_free(heap);
return(err);
......@@ -924,6 +1000,7 @@ row_upd_clust_rec(
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr; gets committed here */
{
big_rec_t* big_rec = NULL;
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
......@@ -973,9 +1050,24 @@ row_upd_clust_rec(
ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
node->update, node->cmpl_info, thr, mtr);
&big_rec, node->update,
node->cmpl_info, thr, mtr);
mtr_commit(mtr);
if (err == DB_SUCCESS && big_rec) {
mtr_start(mtr);
ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
err = btr_store_big_rec_extern_fields(index,
btr_cur_get_rec(btr_cur),
big_rec, mtr);
mtr_commit(mtr);
}
if (big_rec) {
dtuple_big_rec_free(big_rec);
}
return(err);
}
......@@ -1194,10 +1286,12 @@ row_upd(
ut_ad(node && thr);
if (node->in_mysql_interface) {
/* We do not get the cmpl_info value from the MySQL
interpreter: we must calculate it on the fly: */
if (row_upd_changes_some_index_ord_field(node->table,
if (node->is_delete ||
row_upd_changes_some_index_ord_field(node->table,
node->update)) {
node->cmpl_info = 0;
} else {
......@@ -1239,6 +1333,7 @@ function_exit:
if (node->row != NULL) {
mem_heap_empty(node->heap);
node->row = NULL;
node->n_ext_vec = 0;
}
node->state = UPD_NODE_UPDATE_CLUSTERED;
......
This diff is collapsed.
This diff is collapsed.
......@@ -810,11 +810,10 @@ rw_lock_print(
ulint count = 0;
rw_lock_debug_t* info;
printf("----------------------------------------------\n");
printf("-------------------------------------------------\n");
printf("RW-LOCK INFO\n");
printf("RW-LOCK: %lx ", (ulint)lock);
mutex_enter(&(lock->mutex));
if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
|| (rw_lock_get_reader_count(lock) != 0)
|| (rw_lock_get_waiters(lock) != 0)) {
......@@ -831,8 +830,6 @@ rw_lock_print(
info = UT_LIST_GET_NEXT(list, info);
}
}
mutex_exit(&(lock->mutex));
#endif
}
......
......@@ -158,7 +158,7 @@ struct sync_thread_struct{
};
/* Number of slots reserved for each OS thread in the sync level array */
#define SYNC_THREAD_N_LEVELS 256
#define SYNC_THREAD_N_LEVELS 10000
struct sync_level_struct{
void* latch; /* pointer to a mutex or an rw-lock; NULL means that
......@@ -768,6 +768,9 @@ sync_thread_levels_g(
thread */
ulint limit) /* in: level limit */
{
char* file_name;
ulint line;
ulint thread_id;
sync_level_t* slot;
rw_lock_t* lock;
mutex_t* mutex;
......@@ -783,8 +786,29 @@ sync_thread_levels_g(
lock = slot->latch;
mutex = slot->latch;
ut_error;
printf(
"InnoDB error: sync levels should be > %lu but a level is %lu\n",
limit, slot->level);
if (mutex->magic_n == MUTEX_MAGIC_N) {
printf("Mutex created at %s %lu\n", &(mutex->cfile_name),
mutex->cline);
if (mutex_get_lock_word(mutex) != 0) {
mutex_get_debug_info(mutex,
&file_name, &line, &thread_id);
printf("InnoDB: Locked mutex: addr %lx thread %ld file %s line %ld\n",
(ulint)mutex, thread_id,
file_name, line);
} else {
printf("Not locked\n");
}
} else {
rw_lock_print(lock);
}
return(FALSE);
}
}
......@@ -973,6 +997,8 @@ sync_thread_add_level(
ut_a(sync_thread_levels_g(array, SYNC_ANY_LATCH));
} else if (level == SYNC_TRX_SYS_HEADER) {
ut_a(sync_thread_levels_contain(array, SYNC_KERNEL));
} else if (level == SYNC_DOUBLEWRITE) {
ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE));
} else if (level == SYNC_BUF_BLOCK) {
ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL)
&& sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1))
......@@ -1000,6 +1026,8 @@ sync_thread_add_level(
} else if (level == SYNC_FSP) {
ut_a(sync_thread_levels_contain(array, SYNC_FSP)
|| sync_thread_levels_g(array, SYNC_FSP));
} else if (level == SYNC_EXTERN_STORAGE) {
ut_a(TRUE);
} else if (level == SYNC_TRX_UNDO_PAGE) {
ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
|| sync_thread_levels_contain(array, SYNC_RSEG)
......@@ -1221,10 +1249,10 @@ void
sync_print(void)
/*============*/
{
printf("SYNC INFO:------------------------------------------\n");
printf("SYNC INFO:\n");
mutex_list_print_info();
rw_lock_list_print_info();
sync_array_print_info(sync_primary_wait_array);
sync_print_wait_info();
printf("----------------------------------------------------\n");
printf("-----------------------------------------------------\n");
}
......@@ -692,6 +692,9 @@ trx_purge_choose_next_log(void)
min_rseg = rseg;
min_trx_no = rseg->last_trx_no;
space = rseg->space;
ut_a(space == 0); /* We assume in purge of
externally stored fields
that space id == 0 */
page_no = rseg->last_page_no;
offset = rseg->last_offset;
}
......@@ -820,6 +823,10 @@ trx_purge_get_next_rec(
}
cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
if (trx_undo_rec_get_extern_storage(rec2)) {
break;
}
if ((type == TRX_UNDO_UPD_EXIST_REC)
&& !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment