buf0flu.c 29.7 KB
Newer Older
osku's avatar
osku committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
19
#include "page0zip.h"
osku's avatar
osku committed
20 21 22 23 24 25 26 27 28 29 30 31 32 33
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

/* When flushed, dirty blocks are searched in neighborhoods of this size, and
flushed along with the original page. */

#define BUF_FLUSH_AREA		ut_min(BUF_READ_AHEAD_AREA,\
34
		buf_pool->curr_size / 16)
osku's avatar
osku committed
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
		/* out: TRUE if ok */

/************************************************************************
Inserts a modified block into the flush list. The caller must hold the
buf_pool mutex. The flush list is kept sorted in descending order of
oldest_modification, so a newly modified block (largest lsn) goes to
the front. */

void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in: block which is modified */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* Check the sort invariant: the current list head must not have
	a newer oldest_modification than the block being inserted. */
	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
		|| (ut_dulint_cmp(
			(UT_LIST_GET_FIRST(buf_pool->flush_list))
			->oldest_modification,
			block->oldest_modification) <= 0));

	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. The caller must hold the buf_pool
mutex. */

void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	buf_block_t*	prev_b;
	buf_block_t*	b;

#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */

	prev_b = NULL;
	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* Linear scan from the head (newest lsn's) to find the first
	block whose oldest_modification is <= that of the new block. */
	while (b && (ut_dulint_cmp(b->oldest_modification,
					block->oldest_modification) > 0)) {
		prev_b = b;
		b = UT_LIST_GET_NEXT(flush_list, b);
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
	} else {
		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
								block);
	}

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED allowed. A block is replaceable
only if it is clean (no pending modification), not buffer-fixed, and has no
pending i/o. The caller must hold the buf_pool mutex. */

ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace immediately */
	buf_block_t*	block)	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE and in the LRU list */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	if (block->state != BUF_BLOCK_FILE_PAGE) {
		/* A corrupted control block in the LRU list: report it
		but do not crash; the block is simply not replaceable. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
"  InnoDB: Error: buffer block state %lu in the LRU list!\n",
			(ulong)block->state);
		ut_print_buf(stderr, block, sizeof(buf_block_t));

		return(FALSE);
	}

	/* Dirty (oldest_modification > 0), buffer-fixed, or i/o-fixed
	blocks cannot be relocated out of the pool yet. */
	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
		|| (block->buf_fix_count != 0)
		|| (block->io_fix != 0)) {

		return(FALSE);
	}

	return(TRUE);
}

/************************************************************************
Returns TRUE if the block is modified and ready for flushing. The caller
must hold the buf_pool mutex. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
				/* out: TRUE if can flush immediately */
	buf_block_t*	block,	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE */
	ulint		flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* The block must be dirty and must not already have i/o pending. */
	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
						&& (block->io_fix == 0)) {
		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (block->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence
			not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}

/************************************************************************
Updates the flush system data structures when a write is completed: marks
the block clean, removes it from the flush list, decrements the pending
flush count, and signals the batch-end event if this was the last pending
write of its type. The caller must hold the buf_pool mutex. */

void
buf_flush_write_complete(
/*=====================*/
	buf_block_t*	block)	/* in: pointer to the block in question */
{
	ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* Zero oldest_modification marks the block as clean. */
	block->oldest_modification = ut_dulint_zero;

	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);

	ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));

	(buf_pool->n_flush[block->flush_type])--;

	if (block->flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(block);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
		buf_pool->n_flush[block->flush_type]); */

	if ((buf_pool->n_flush[block->flush_type] == 0)
		&& (buf_pool->init_flush[block->flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[block->flush_type]);
	}
}

/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur.

The sequence is: (1) synchronously write all buffered pages to the two
doublewrite blocks in the system tablespace, (2) fsync the system
tablespace, (3) post async writes of the same pages to their real
locations, (4) wait for those writes to be posted and fsync the data
files. Only then is the doublewrite buffer reused. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	buf_block_t*	block;
	byte*		write_buf;
	ulint		len;
	ulint		len2;
	ulint		i;

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		/* No doublewrite: just make sure previously posted async
		writes actually get handled. */
		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	/* Sanity-check every buffered page before writing anything. */
	for (i = 0; i < trx_doublewrite->first_free; i++) {

		block = trx_doublewrite->buf_block_arr[i];
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		/* TODO: page_zip */

		/* The low 4 bytes of the header lsn and the trailer lsn
		must agree on an intact page. */
		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
			!= mach_read_from_4(block->frame + UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the buffer pool\n"
"InnoDB: before posting to the doublewrite buffer.\n");
		}

		if (!block->check_index_page_at_flush) {
		} else if (page_is_comp(block->frame)) {
			if (UNIV_UNLIKELY(!page_simple_validate_new(
						block->frame))) {
corrupted_page:
				buf_page_print(block->frame, 0);

				ut_print_timestamp(stderr);
				fprintf(stderr,
	"  InnoDB: Apparent corruption of an index page n:o %lu in space %lu\n"
	"InnoDB: to be written to data file. We intentionally crash server\n"
	"InnoDB: to prevent corrupt data from ending up in data\n"
	"InnoDB: files.\n",
					(ulong) block->offset,
					(ulong) block->space);

				ut_error;
			}
		} else if (UNIV_UNLIKELY(!page_simple_validate_old(
						block->frame))) {

			goto corrupted_page;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written+= trx_doublewrite->first_free;
	srv_dblwr_writes++;

	/* The first doublewrite block holds at most
	TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages; the rest go to block2. */
	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	} else {
		len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
	}

	fil_io(OS_FILE_WRITE,
		TRUE, TRX_SYS_SPACE,
		trx_doublewrite->block1, 0, len,
			(void*)trx_doublewrite->write_buf, NULL);

	write_buf = trx_doublewrite->write_buf;

	/* Re-verify the lsn fields in the copy we actually wrote. */
	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
		if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4)
			!= mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* Write the overflow pages to the second doublewrite
		block. */
		len = (trx_doublewrite->first_free
			- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;

		fil_io(OS_FILE_WRITE,
			TRUE, TRX_SYS_SPACE,
			trx_doublewrite->block2, 0, len,
			(void*)(trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
			NULL);

		write_buf = trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
		for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
						len2 += UNIV_PAGE_SIZE) {
			if (mach_read_from_4(write_buf + len2
					+ FIL_PAGE_LSN + 4)
				!= mach_read_from_4(write_buf + len2
					+ UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the doublewrite block2.\n");
			}
		}
	}

	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		block = trx_doublewrite->buf_block_arr[i];

		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
			!= mach_read_from_4(block->frame + UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the buffer pool\n"
"InnoDB: after posting and flushing the doublewrite buffer.\n"
"InnoDB: Page buf fix count %lu, io fix %lu, state %lu\n",
				(ulong)block->buf_fix_count,
				(ulong)block->io_fix,
				(ulong)block->state);
		}
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		/* Async write to the real page location; the aio handler
		is woken later, in one batch, below. */
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
					(void*)block->frame, (void*)block);
	}

	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
	the OS */

	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
try_again:
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* Buffer full: flush it out (must release the mutex first, as
	buf_flush_buffered_writes acquires it) and retry. */
	if (trx_doublewrite->first_free
				>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		goto try_again;
	}

	/* Copy the page image into the next free slot and remember which
	block it came from, for the write-back phase. */
	ut_memcpy(trx_doublewrite->write_buf
				+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
			block->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

	trx_doublewrite->first_free++;

	/* If this post filled the buffer, flush it eagerly. */
	if (trx_doublewrite->first_free
				>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Initializes a page for writing to the tablespace: stamps the lsn, page
number and space id into the page, and computes the checksums. For
compressed tablespaces the fields are written into the compressed image
(or directly into the page for ZBLOB/XDES pages). */

void
buf_flush_init_for_writing(
/*=======================*/
	byte*	page,		/* in/out: page */
	void*	page_zip_,	/* in/out: compressed page, or NULL */
	dulint	newest_lsn,	/* in: newest modification lsn to the page */
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
{
	page_zip_des_t*	page_zip = page_zip_;
	ulint		zip_size = fil_space_get_zip_size(space);

	if (zip_size && zip_size != ULINT_UNDEFINED) {
		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
		case FIL_PAGE_TYPE_ZBLOB:
			/* ZBLOB pages have no uncompressed counterpart;
			stamp the fields directly into the page. */
			ut_ad(!page_zip);
			mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
			mach_write_to_4(page + FIL_PAGE_ZBLOB_SPACE_ID, space);
			mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
							page, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		case FIL_PAGE_TYPE_XDES:
			/* This is essentially an uncompressed page. */
			break;
		case FIL_PAGE_INDEX:
			/* Stamp the fields into the compressed image,
			which is what will be written to disk. */
			ut_a(zip_size == page_zip->size);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_OFFSET, page_no);
			mach_write_to_8(page_zip->data
					+ FIL_PAGE_LSN, newest_lsn);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
					space);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
						page_zip->data, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		default:
			ut_error;
		}
	}

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
								newest_lsn);
	/* Write the page number and the space id */

	mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums ?
		buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
					srv_use_checksums ?
		buf_calc_page_old_checksum(page) : BUF_NO_CHECKSUM_MAGIC);
}

/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes!
Enforces write-ahead logging: the redo log is forced to disk up to the
page's newest_modification lsn before the page itself is written. */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	ut_ad(!ut_dulint_is_zero(block->newest_modification));

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs(
	"Warning: cannot force log to disk if UNIV_LOG_DEBUG is defined!\n"
	"Crash recovery will not work!\n",
			stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	/* Stamp lsn, page no, space id and checksums into the frame. */
	buf_flush_init_for_writing(block->frame,
			buf_block_get_page_zip(block),
			block->newest_modification,
			block->space, block->offset);
	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
					(void*)block->frame, (void*)block);
	} else {
		buf_flush_post_to_doublewrite_buf(block);
	}
}

/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes!

The three flush types differ only in how the page s-latch is acquired
relative to the buf_pool mutex, to avoid deadlocks; see the comments on
each branch below. */
static
ulint
buf_flush_try_page(
/*===============*/
				/* out: 1 if a page was flushed, 0 otherwise */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or
				BUF_FLUSH_SINGLE_PAGE */
{
	buf_block_t*	block;
	ibool		locked;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
				|| flush_type == BUF_FLUSH_SINGLE_PAGE);

	mutex_enter(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

	if (flush_type == BUF_FLUSH_LIST
		&& block && buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		/* First pending write of this type: reset the batch-end
		event so waiters block until the batch completes. */
		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		locked = FALSE;

		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		if (block->buf_fix_count == 0) {
			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

			locked = TRUE;
		}

		mutex_exit(&(buf_pool->mutex));

		if (!locked) {
			/* Flush buffered writes before a possibly blocking
			latch wait, to avoid deadlocking the aio system. */
			buf_flush_buffered_writes();

			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
		}

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
				"Flushing page space %lu, page no %lu \n",
				(ulong) block->space, (ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_LRU && block
			&& buf_flush_ready_for_flush(block, flush_type)) {

		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because in the if-condition above we require
		the page not to be bufferfixed (in function
		..._ready_for_flush). */

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(&(buf_pool->mutex));

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
			&& buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[block->flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[block->flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		/* For a single-page flush the latch may be waited for
		after releasing the buf_pool mutex. */
		mutex_exit(&(buf_pool->mutex));

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
			"Flushing single page space %lu, page no %lu \n",
						(ulong) block->space,
						(ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);
	} else {
		mutex_exit(&(buf_pool->mutex));

		return(0);
	}
}

/***************************************************************
Flushes to disk all flushable pages within the flush area, i.e., the
BUF_FLUSH_AREA-aligned neighborhood of the given page, to make the i/o
more sequential. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
				/* out: number of pages flushed */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	buf_block_t*	block;
	ulint		low, high;
	ulint		count		= 0;
	ulint		i;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	/* Align the scanned range to BUF_FLUSH_AREA boundaries. */
	low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA;
	high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA;

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	}

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	mutex_enter(&(buf_pool->mutex));

	for (i = low; i < high; i++) {

		block = buf_page_hash_get(space, i);
		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

		if (block && flush_type == BUF_FLUSH_LRU && i != offset
			&& !block->old) {

			/* We avoid flushing 'non-old' blocks in an LRU flush,
			because the flushed blocks are soon freed */

			continue;
		}

		if (block && buf_flush_ready_for_flush(block, flush_type)
		    && (i == offset || block->buf_fix_count == 0)) {
			/* We only try to flush those neighbors != offset
			where the buf fix count is zero, as we then know that
			we probably can latch the page without a semaphore
			wait. Semaphore waits are expensive because we must
			flush the doublewrite buffer before we start
			waiting. */

			mutex_exit(&(buf_pool->mutex));

			/* Note: as we release the buf_pool mutex above, in
			buf_flush_try_page we cannot be sure the page is still
			in a flushable state: therefore we check it again
			inside that function. */

			count += buf_flush_try_page(space, i, flush_type);

			mutex_enter(&(buf_pool->mutex));
		}
	}

	mutex_exit(&(buf_pool->mutex));

	return(count);
}

/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */

ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum mumber of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in the case BUF_FLUSH_LIST all blocks whose
				oldest_modification is smaller than this
				should be flushed (if their number does not
				exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	ulint		old_page_count;
	ulint		space;
	ulint		offset;
	ibool		found;

	ut_ad((flush_type == BUF_FLUSH_LRU)
					|| (flush_type == BUF_FLUSH_LIST));
	ut_ad((flush_type != BUF_FLUSH_LIST)
					|| sync_thread_levels_empty_gen(TRUE));
	mutex_enter(&(buf_pool->mutex));

	if ((buf_pool->n_flush[flush_type] > 0)
		|| (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	/* Mark the batch as being initialized, so that concurrent callers
	back off and batch-end is not signalled prematurely. */
	(buf_pool->init_flush)[flush_type] = TRUE;

	for (;;) {
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			/* The flush list is sorted by oldest_modification
			descending, so its last block has the smallest lsn. */
			block = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!block
				|| (ut_dulint_cmp(block->oldest_modification,
						lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		found = FALSE;

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		while ((block != NULL) && !found) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				found = TRUE;
				space = block->space;
				offset = block->offset;

				mutex_exit(&(buf_pool->mutex));

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count +=
					buf_flush_try_neighbors(space, offset,
								flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				mutex_enter(&(buf_pool->mutex));

			} else if (flush_type == BUF_FLUSH_LRU) {

				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!found) {
			break;
		}
	}

	(buf_pool->init_flush)[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
		&& (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	/* Push the posted writes through the doublewrite buffer / aio. */
	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
			|| flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	if (page_count != ULINT_UNDEFINED) {
		srv_buf_pool_flushed += page_count;
	}

	return(page_count);
}

/**********************************************************************
Waits until a flush batch of the given type ends */

void
buf_flush_wait_batch_end(
/*=====================*/
	ulint	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
996

osku's avatar
osku committed
997
	os_event_wait(buf_pool->no_flush[type]);
998
}
osku's avatar
osku committed
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013

/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
			/* out: number of blocks which should be flushed
			from the end of the LRU list */
{
	buf_block_t*	block;
	ulint		n_replaceable;
	ulint		distance	= 0;
1014

osku's avatar
osku committed
1015 1016 1017 1018 1019 1020 1021
	mutex_enter(&(buf_pool->mutex));

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	block = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((block != NULL)
1022 1023 1024
		&& (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
			+ BUF_FLUSH_EXTRA_MARGIN)
		&& (distance < BUF_LRU_FREE_SEARCH_LEN)) {
osku's avatar
osku committed
1025 1026 1027 1028 1029 1030

		if (buf_flush_ready_for_replace(block)) {
			n_replaceable++;
		}

		distance++;
1031

osku's avatar
osku committed
1032 1033
		block = UT_LIST_GET_PREV(LRU, block);
	}
1034

osku's avatar
osku committed
1035 1036 1037 1038 1039 1040
	mutex_exit(&(buf_pool->mutex));

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

		return(0);
	}
1041

osku's avatar
osku committed
1042 1043 1044 1045 1046 1047 1048 1049 1050
	return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
							- n_replaceable);
}

/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
1051
immediately, without waiting. */
osku's avatar
osku committed
1052 1053 1054 1055 1056 1057 1058 1059 1060

void
buf_flush_free_margin(void)
/*=======================*/
{
	ulint	n_to_flush;
	ulint	n_flushed;

	n_to_flush = buf_flush_LRU_recommendation();
1061

osku's avatar
osku committed
1062 1063 1064 1065 1066 1067
	if (n_to_flush > 0) {
		n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
							ut_dulint_zero);
		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */
1068 1069

			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
osku's avatar
osku committed
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083
		}
	}
}

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
		/* out: TRUE if ok */
{
	buf_block_t*	block;
	dulint		om;
1084

osku's avatar
osku committed
1085 1086 1087 1088 1089 1090 1091 1092
	UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);

	block = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (block != NULL) {
		om = block->oldest_modification;
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
		ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
1093

osku's avatar
osku committed
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113
		block = UT_LIST_GET_NEXT(flush_list, block);

		if (block) {
			ut_a(ut_dulint_cmp(om, block->oldest_modification)
									>= 0);
		}
	}

	return(TRUE);
}

/**********************************************************************
Validates the flush list. */

ibool
buf_flush_validate(void)
/*====================*/
		/* out: TRUE if ok */
{
	ibool	ret;
1114

osku's avatar
osku committed
1115 1116 1117
	mutex_enter(&(buf_pool->mutex));

	ret = buf_flush_validate_low();
1118

osku's avatar
osku committed
1119 1120 1121 1122
	mutex_exit(&(buf_pool->mutex));

	return(ret);
}