/*****************************************************************************

Copyright (c) 1994, 2018, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2015, 2019, MariaDB Corporation.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or to the records
there must go through this module! Undo log records are written here
for every modification or insert of a clustered index record.

			NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve as many pages in the tablespace
as 2 x the height of the index tree before we start the operation,
because once leaf splitting has been started, it is difficult to undo,
except by crashing the database and doing a roll-forward.
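
For example, if the height of the index tree is 3, then up to 2 x 3 = 6
pages must be reservable in the tablespace before a pessimistic insert
or update may begin.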

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "btr0cur.h"
#include "row0upd.h"
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"
#include "mysql_com.h"
#include "dict0stats.h"

/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};

/** Modification types for the B-tree operation. */
enum btr_intention_t {
	BTR_INTENTION_DELETE,
	BTR_INTENTION_BOTH,
	BTR_INTENTION_INSERT
};
#if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
#error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
#endif
#if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
#error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
#endif

/** For the index->lock scalability improvement, the only possibility of a
clear performance regression observed was caused by a hugely grown history
list length. That is because the previous exclusive use of index->lock also
worked as reserving free blocks and read IO bandwidth with priority for
purge. To keep the history list from growing as huge as under the previous
implementation, we likewise prioritize pessimistic tree operations done by
purge when the list seems to be growing too long.

Experimentally, the history list length starts to clearly affect
throughput from about 100000 entries. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000
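
/* Sketch of how this threshold is applied (see the BTR_MODIFY_TREE case in
btr_cur_search_to_nth_level() below): a delete-intended (purge) operation
takes the stronger tree x-latch instead of the usual sx-latch once
trx_sys->rseg_history_len exceeds BTR_CUR_FINE_HISTORY_LENGTH while read
IOs are pending, so that purge is not starved of free blocks and IO
bandwidth. */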

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old;
#ifdef BTR_CUR_HASH_ADAPT
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_sea;
/** Old value of btr_cur_n_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old;
#endif /* BTR_CUR_HASH_ADAPT */

#ifdef UNIV_DEBUG
/* Debug flag to limit the number of records in a page for optimistic
inserts */
uint	btr_cur_limit_optimistic_insert_debug;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */
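
/* Layout implied by the definitions above: each BLOB part page carries an
8-byte header followed by the part data:
	offset 0: BTR_BLOB_HDR_PART_LEN      (4 bytes) data length on this page
	offset 4: BTR_BLOB_HDR_NEXT_PAGE_NO  (4 bytes) next part page number,
	                                     FIL_NULL if this is the last part
	offset 8: the BLOB part data itself */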

/** Estimated table level stats from sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size external stored data size
@param not_empty table not empty
@return estimated table wide stats from sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
	(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
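
/* Worked illustration with hypothetical numbers (not from the original
sources): with value = 100 rows counted on the sampled pages, sample = 10
pages, index->stat_n_leaf_pages = 1000, ext_size = 0 and not_empty = 1,
the macro yields (100 * 1000 + 10 - 1 + 0 + 1) / (10 + 0) = 10001, i.e.
roughly value * stat_n_leaf_pages / sample rounded upwards, and kept
nonzero for a non-empty table. */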

/* @} */

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */

/*==================== B-TREE SEARCH =========================*/

#if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
#error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
#endif
#if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
#error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
#endif
#if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
#error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
#endif

/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return	blocks and savepoints that were actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
	buf_block_t*		block,
	const page_id_t		page_id,
	const page_size_t&	page_size,
	ulint			latch_mode,
	btr_cur_t*		cursor,
	mtr_t*			mtr)
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;
	page_t*		page = buf_block_get_frame(block);
	bool		spatial;
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
	ut_ad(buf_page_in_file(&block->page));

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
	case BTR_SEARCH_TREE:
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
				= mtr_set_savepoint(mtr);
		}

		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
				= get_block;
		}

		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* This latch is exclusive with respect to other
		operations that call btr_page_set_prev() */
		ut_ad(mtr_memo_contains_flagged(
			      mtr,
			      dict_index_get_lock(cursor->index),
			      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
		/* x-latch also siblings from left to right */
		left_page_no = btr_page_get_prev(page, mtr);
		mode = latch_mode;

		if (left_page_no != FIL_NULL) {

			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
			}

			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;

			if (spatial) {
				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
					= get_block;
			}
		}

		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
				= mtr_set_savepoint(mtr);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(
			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;

#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
				== page_is_comp(page));
			ut_a(btr_page_get_next(
				latch_leaves.blocks[0]->frame, mtr)
				== page_get_page_no(page));
		}
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
				= get_block;
		}

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
								mtr);
			}
			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), right_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[2] = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			if (spatial) {
				cursor->rtr_info->tree_blocks[
					RTR_MAX_LEVELS + 2] = get_block;
			}
		}

		return(latch_leaves);

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left sibling */
		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(page, mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, mode, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		return(latch_leaves);
	case BTR_CONT_MODIFY_TREE:
		ut_ad(dict_index_is_spatial(cursor->index));
		return(latch_leaves);
	}

	ut_error;
	return(latch_leaves);
}

/** Optimistically latches the leaf page or pages requested.
@param[in]	block		guessed buffer block
@param[in]	modify_clock	modify clock value
@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in,out]	cursor		cursor
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@return true if success */
bool
btr_cur_optimistic_latch_leaves(
	buf_block_t*	block,
	ib_uint64_t	modify_clock,
	ulint*		latch_mode,
	btr_cur_t*	cursor,
	const char*	file,
	unsigned	line,
	mtr_t*		mtr)
{
	ulint		mode;
	ulint		left_page_no;

	switch (*latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		return(buf_page_optimistic_get(*latch_mode, block,
				modify_clock, file, line, mtr));
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = *latch_mode == BTR_SEARCH_PREV
			? RW_S_LATCH : RW_X_LATCH;

		buf_page_mutex_enter(block);
		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
			buf_page_mutex_exit(block);
			return(false);
		}
		/* pin the block not to be relocated */
		buf_block_buf_fix_inc(block, file, line);
		buf_page_mutex_exit(block);

		rw_lock_s_lock(&block->lock);
		if (block->modify_clock != modify_clock) {
			rw_lock_s_unlock(&block->lock);

			goto unpin_failed;
		}
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			const page_id_t	page_id(
				dict_index_get_space(cursor->index),
				left_page_no);

			cursor->left_block = btr_block_get(
				page_id,
				dict_table_page_size(cursor->index->table),
				mode, cursor->index, mtr);
		} else {
			cursor->left_block = NULL;
		}

		if (buf_page_optimistic_get(mode, block, modify_clock,
					    file, line, mtr)) {
			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
			    == left_page_no) {
				buf_block_buf_fix_dec(block);
				*latch_mode = mode;
				return(true);
			} else {
				/* release the block */
				btr_leaf_page_release(block, mode, mtr);
			}
		}

		/* release the left block */
		if (cursor->left_block != NULL) {
			btr_leaf_page_release(cursor->left_block,
					      mode, mtr);
		}
unpin_failed:
		/* unpin the block */
		buf_block_buf_fix_dec(block);
		return(false);

	default:
		ut_error;
		return(false);
	}
}

/**
Gets the intention from latch_mode as a btr_intention_t, and clears the
intention flags in latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching tree */
static
btr_intention_t
btr_cur_get_and_clear_intention(
	ulint	*latch_mode)
{
	btr_intention_t	intention;

	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
	case BTR_LATCH_FOR_INSERT:
		intention = BTR_INTENTION_INSERT;
		break;
	case BTR_LATCH_FOR_DELETE:
		intention = BTR_INTENTION_DELETE;
		break;
	default:
		/* both or unknown */
		intention = BTR_INTENTION_BOTH;
	}
	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);

	return(intention);
}
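
/* Illustrative usage, with hypothetical values (not from the original
sources):

	ulint		latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE;
	btr_intention_t	intention
		= btr_cur_get_and_clear_intention(&latch_mode);

After the call, intention == BTR_INTENTION_DELETE and latch_mode has been
reduced to plain BTR_MODIFY_TREE. */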

/**
Gets the desired latch type for the root leaf (the root page is also a
leaf page) for the given latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
static
rw_lock_type_t
btr_cur_latch_for_root_leaf(
	ulint	latch_mode)
{
	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_SEARCH_TREE:
	case BTR_SEARCH_PREV:
		return(RW_S_LATCH);
	case BTR_MODIFY_LEAF:
	case BTR_MODIFY_TREE:
	case BTR_MODIFY_PREV:
		return(RW_X_LATCH);
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* A root page should be latched already,
		and does not need to be latched here.
		fall through (RW_NO_LATCH) */
	case BTR_NO_LATCHES:
		return(RW_NO_LATCH);
	}

	ut_error;
	return(RW_NO_LATCH); /* avoid compiler warnings */
}

/** Detects whether modifying the record might require modifying the tree structure.
@param[in]	index		index
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	page_size	page size
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*	index,
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec,
	ulint		rec_size,
	const page_size_t&	page_size,
	mtr_t*		mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));

	/* A pessimistic delete of the first record causes a delete & insert
	of the node_ptr at the upper level. A subsequent page shrink is also
	possible, which causes another delete of a node_ptr at the upper
	level. So we should pay attention not only to the first and last
	records but also to the 2nd record: if the "delete & insert" are
	done on a different page, the 2nd record becomes the first record,
	and a following compress might delete that record, causing yet
	another upper-level node_ptr modification. */

	const ulint n_recs = page_get_n_recs(page);

	if (lock_intention <= BTR_INTENTION_BOTH) {
		ulint	margin;

		/* Check what a delete could cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_DELETE). */
		/* The first, 2nd, 2nd-last and last records make 4 records. */
		if (n_recs < 5) {
			return(true);
		}

		/* is first, 2nd or last record */
		if (page_rec_is_first(rec, page)
		    || (page_has_next(page)
			&& (page_rec_is_last(rec, page)
			    || page_rec_is_second_last(rec, page)))
		    || (page_has_prev(page)
			&& page_rec_is_second(rec, page))) {
			return(true);
		}

		if (lock_intention == BTR_INTENTION_BOTH) {
			/* Delete at leftmost record in a page causes delete
			& insert at its parent page. After that, the delete
			might cause btr_compress() and delete record at its
			parent page. Thus we should consider max 2 deletes. */

			margin = rec_size * 2;
		} else {
			ut_ad(lock_intention == BTR_INTENTION_DELETE);

			margin = rec_size;
		}
		/* NOTE: call mach_read_from_4() directly to avoid an
		assertion failure. It is safe because we already hold an
		SX latch on the index tree. */
		if (page_get_data_size(page)
			< margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
		    || (mach_read_from_4(page + FIL_PAGE_NEXT)
				== FIL_NULL
			&& mach_read_from_4(page + FIL_PAGE_PREV)
				== FIL_NULL)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert could cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */

		/* Once btr_cur_limit_optimistic_insert_debug is in effect,
		we should check it here in advance, since the maximum
		number of records allowed in a page is limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);

		/* We need space for 2 records, for the case where a single
		split plus the insert would not fit.
		page_get_max_insert_size_after_reorganize() already includes
		space for the page directory. */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}

		/* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
		This is based on the worst case, and we could invoke
		page_zip_available() on the block->page.zip. */
		/* We also need space for 2 records at the worst compression
		rate. */
		if (page_size.is_compressed()
		    && page_zip_empty_size(index->n_fields,
					   page_size.physical())
		    <= rec_size * 2 + page_get_data_size(page)
		    + page_dir_calc_reserved_space(n_recs + 2)) {
			return(true);
		}
	}

	return(false);
}

/** Detects whether modifying the record might require a modification
opposite to the intention.
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@return	true if tree modification is needed */
static
bool
btr_cur_need_opposite_intention(
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec)
{
	switch (lock_intention) {
	case BTR_INTENTION_DELETE:
		return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
			(page_has_next(page) && page_rec_is_last(rec, page));
	case BTR_INTENTION_INSERT:
		return page_has_next(page) && page_rec_is_last(rec, page);
	case BTR_INTENTION_BOTH:
		return(false);
	}

	ut_error;
	return(false);
}
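
/* Example (added for illustration): under BTR_INTENTION_DELETE, deleting
the first user record of a page that has a left sibling changes the page's
minimum key, which requires deleting and re-inserting the node pointer at
the upper level -- an insert-like modification that is the opposite of the
stated intention, so the caller must latch as if both intentions applied. */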

/**
@param[in]	index b-tree
@return maximum size of a node pointer record in bytes */
static ulint btr_node_ptr_max_size(const dict_index_t* index)
{
	if (dict_index_is_ibuf(index)) {
		/* cannot estimate accurately */
		/* This is the universal index for the change buffer.
		The max size of an entry is about max key length * 2
		(index key + primary key to be inserted to the index).
		(The max key length is UNIV_PAGE_SIZE / 16 * 3 at
		 ha_innobase::max_supported_key_length(),
		 considering that MAX_KEY_LENGTH = 3072 at the MySQL level
		 imposes the historical InnoDB value of 3500 for the 16K
		 page size case.)
		For the universal index, node_ptr contains most of the entry.
		And 512 is enough to contain the ibuf columns and metadata */
		return srv_page_size / 8 * 3 + 512;
	}

	/* Each record has page_no, length of page_no and header. */
	ulint comp = dict_table_is_comp(index->table);
	ulint rec_max_size = comp
		? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
		+ UT_BITS_IN_BYTES(index->n_nullable)
		: REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
		+ 2 * index->n_fields;

	/* Compute the maximum possible record size. */
	for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
		const dict_field_t*	field
			= dict_index_get_nth_field(index, i);
		const dict_col_t*	col
			= dict_field_get_col(field);
		ulint			field_max_size;
		ulint			field_ext_max_size;

		/* Determine the maximum length of the index field. */

		field_max_size = dict_col_get_fixed_size(col, comp);
		if (field_max_size) {
			/* dict_index_add_col() should guarantee this */
			ut_ad(!field->prefix_len
			      || field->fixed_len == field->prefix_len);
			/* Fixed lengths are not encoded
			in ROW_FORMAT=COMPACT. */
			rec_max_size += field_max_size;
			continue;
		}

		field_max_size = dict_col_get_max_size(col);
		if (UNIV_UNLIKELY(!field_max_size)) {
			switch (col->mtype) {
			case DATA_VARCHAR:
				if (!comp
				    && (!strcmp(index->table->name.m_name,
						"SYS_FOREIGN")
					|| !strcmp(index->table->name.m_name,
						   "SYS_FOREIGN_COLS"))) {
					break;
				}
				/* fall through */
			case DATA_VARMYSQL:
			case DATA_CHAR:
			case DATA_MYSQL:
				/* CHAR(0) and VARCHAR(0) are possible
				data type definitions in MariaDB.
				The InnoDB internal SQL parser maps
				CHAR to DATA_VARCHAR, so DATA_CHAR (or
				DATA_MYSQL) is only coming from the
				MariaDB SQL layer. */
				if (comp) {
					/* Add a length byte, because
					fixed-length empty fields are
					encoded as variable-length.
					For ROW_FORMAT=REDUNDANT,
					these bytes were added to
					rec_max_size before this loop. */
					rec_max_size++;
				}
				continue;
			}

			/* SYS_FOREIGN.ID is defined as CHAR in the
			InnoDB internal SQL parser, which translates
			into the incorrect VARCHAR(0).  InnoDB does
			not enforce maximum lengths of columns, so
			that is why any data can be inserted in the
			first place.

			Likewise, SYS_FOREIGN.FOR_NAME,
			SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
			defined as CHAR, and also they are part of a key. */

			ut_ad(!strcmp(index->table->name.m_name,
				      "SYS_FOREIGN")
			      || !strcmp(index->table->name.m_name,
					 "SYS_FOREIGN_COLS"));
			ut_ad(!comp);
			ut_ad(col->mtype == DATA_VARCHAR);

			rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
				? REDUNDANT_REC_MAX_DATA_SIZE
				: page_get_free_space_of_empty(FALSE) / 2;
		} else if (field_max_size == NAME_LEN && i == 1
			   && (!strcmp(index->table->name.m_name,
				       TABLE_STATS_NAME)
			       || !strcmp(index->table->name.m_name,
					  INDEX_STATS_NAME))) {
			ut_ad(!strcmp(field->name, "table_name"));
			/* Interpret "table_name" as VARCHAR(199) even
			if it was incorrectly defined as VARCHAR(64).
			While the caller of ha_innobase enforces the
			maximum length on any data written, the InnoDB
			internal SQL parser will happily write as much
			data as is provided. The purpose of this hack
			is to avoid InnoDB hangs after persistent
			statistics on partitioned tables are
			deleted. */
			field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
		}
		field_ext_max_size = field_max_size < 256 ? 1 : 2;

		if (field->prefix_len
		    && field->prefix_len < field_max_size) {
			field_max_size = field->prefix_len;
		}

		if (comp) {
			/* Add the extra size for ROW_FORMAT=COMPACT.
			For ROW_FORMAT=REDUNDANT, these bytes were
			added to rec_max_size before this loop. */
			rec_max_size += field_ext_max_size;
		}

		rec_max_size += field_max_size;
	}

	return rec_max_size;
}

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
dberr_t
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
				at most one of BTR_INSERT, BTR_DELETE_MARK,
				BTR_DELETE, or BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page; we assume that
				the caller uses the search latch
				to protect the record! */
	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,
				/*!< in: info on the latch mode the
				caller currently has on search system:
				RW_S_LATCH, or 0 */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr,	/*!< in: mtr */
	ib_uint64_t	autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written
				(0 if none) */
{
	page_t*		page = NULL; /* remove warning */
	buf_block_t*	block;
	buf_block_t*	guess;
	ulint		height;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		savepoint;
	ulint		rw_latch;
	page_cur_mode_t	page_mode;
	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
	ulint		buf_mode;
	ulint		estimate;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	page_cur_t*	page_cursor;
	btr_op_t	btr_op;
	ulint		root_height = 0; /* remove warning */
	dberr_t		err = DB_SUCCESS;

	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	bool		modify_external;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	bool		detected_same_key_root = false;

	bool		retrying_for_search_prev = false;
	ulint		leftmost_from_level = 0;
	buf_block_t**	prev_tree_blocks = NULL;
	ulint*		prev_tree_savepoints = NULL;
	ulint		prev_n_blocks = 0;
	ulint		prev_n_releases = 0;
	bool		need_path = true;
	bool		rtree_parent_modified = false;
	bool		mbr_adj = false;
	bool		found = false;

	DBUG_ENTER("btr_cur_search_to_nth_level");

#ifdef BTR_CUR_ADAPT
	btr_search_t*	info;
#endif /* BTR_CUR_ADAPT */
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets2	= offsets2_;
	rec_offs_init(offsets_);
	rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending on the upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE
	      || RTREE_SEARCH_MODE(mode));
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(!(index->type & DICT_FTS));
	ut_ad(index->page != FIL_NULL);

	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif /* UNIV_DEBUG */

	ibool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;

	ut_ad(!s_latch_by_caller
	      || srv_read_only_mode
	      || mtr_memo_contains_flagged(mtr,
					   dict_index_get_lock(index),
					   MTR_MEMO_S_LOCK
					   | MTR_MEMO_SX_LOCK));

	/* These flags are mutually exclusive, they are lumped together
	with the latch mode for historical reasons. It's possible for
	none of the flags to be set. */
	switch (UNIV_EXPECT(latch_mode
			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
			    0)) {
	case 0:
		btr_op = BTR_NO_OP;
		break;
	case BTR_INSERT:
		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
			? BTR_INSERT_IGNORE_UNIQUE_OP
			: BTR_INSERT_OP;
		break;
	case BTR_DELETE:
		btr_op = BTR_DELETE_OP;
		ut_a(cursor->purge_node);
		break;
	case BTR_DELETE_MARK:
		btr_op = BTR_DELMARK_OP;
		break;
	default:
		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
		should be specified at a time */
		ut_error;
	}

	/* Operations on the insert buffer tree cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
	/* Operations on the clustered index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
	/* Operations on temporary table indexes cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
	/* Operations on a spatial index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));

	estimate = latch_mode & BTR_ESTIMATE;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;

	/* Turn the flags unrelated to the latch mode off. */
	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);

	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(!s_latch_by_caller
	      || latch_mode == BTR_SEARCH_LEAF
	      || latch_mode == BTR_SEARCH_TREE
	      || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(autoinc == 0 || dict_index_is_clust(index));
	ut_ad(autoinc == 0
	      || latch_mode == BTR_MODIFY_TREE
	      || latch_mode == BTR_MODIFY_LEAF);
	ut_ad(autoinc == 0 || level == 0);

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

#ifndef BTR_CUR_ADAPT
	guess = NULL;
#else
	info = btr_search_get_info(index);

	if (!buf_pool_is_obsolete(info->withdraw_clock)) {
		guess = info->root_guess;
	} else {
		guess = NULL;
	}

#ifdef BTR_CUR_HASH_ADAPT

# ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
# endif
	if (autoinc == 0
	    && latch_mode <= BTR_MODIFY_LEAF
	    && info->last_hash_succ
# ifdef MYSQL_INDEX_DISABLE_AHI
	    && !index->disable_ahi
# endif
	    && !estimate
# ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
# endif /* PAGE_CUR_LE_OR_EXTENDS */
	    && !dict_index_is_spatial(index)
	    /* If !has_search_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && btr_search_enabled
	    && !modify_external
	    && rw_lock_get_writer(btr_get_search_latch(index))
	    == RW_LOCK_NOT_LOCKED
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		DBUG_RETURN(err);
	}
# endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		btr_search_s_unlock(index);
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read IO bandwidth should be prioritized
		for them when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
			&& buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else if (dict_index_is_spatial(index)
			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-lock the tree if there is a possibility of a
			pessimistic delete on the spatial index, as we
			could latch upward in the tree */

			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* Do nothing */
		ut_ad(srv_read_only_mode
		      || mtr_memo_contains_flagged(mtr,
						   dict_index_get_lock(index),
						   MTR_MEMO_X_LOCK
						   | MTR_MEMO_SX_LOCK));
		if (dict_index_is_spatial(index)
		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-tree index,
			X-latch the parent */
			upper_rw_latch = RW_X_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
		break;
	default:
		if (!srv_read_only_mode) {
			if (s_latch_by_caller) {
				ut_ad(rw_lock_own(dict_index_get_lock(index),
				              RW_LOCK_S));
			} else if (!modify_external) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock(dict_index_get_lock(index), mtr);
			} else {
				/* BTR_MODIFY_EXTERNAL needs to be excluded */
				mtr_sx_lock(dict_index_get_lock(index), mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);

	const ulint		space = dict_index_get_space(index);
	const page_size_t	page_size(dict_table_page_size(index->table));

	/* Start with the root page. */
	page_id_t		page_id(space, dict_index_get_page(index));

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = btr_node_ptr_max_size(index);
	}

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode)
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode));
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
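
	/* Illustration (added for clarity): for mode == PAGE_CUR_GE the
	non-leaf levels are searched with PAGE_CUR_L, i.e. the descent
	follows the child whose node pointer key is the greatest one
	strictly less than the search tuple. This guarantees that the
	first record >= the tuple cannot lie to the left of the chosen
	subtree; the original mode is then applied on the leaf level. */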

	/* Loop and search until we arrive at the desired level */
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

search_loop:
	buf_mode = BUF_GET;
	rw_latch = RW_NO_LATCH;
	rtree_parent_modified = false;

	if (height != 0) {
		/* We are about to fetch the root or a non-leaf page. */
		if ((latch_mode != BTR_MODIFY_TREE || height == level)
		    && !retrying_for_search_prev) {
			/* If we do not have an SX or X latch on the index,
			each page should be latched before reading. */
			if (height == ULINT_UNDEFINED
			    && upper_rw_latch == RW_S_LATCH
			    && (modify_external || autoinc)) {
				/* We need an sx-latch on the root page
				for an fseg operation or for writing
				PAGE_ROOT_AUTO_INC */
				rw_latch = RW_SX_LATCH;
			} else {
				rw_latch = upper_rw_latch;
			}
		}
	} else if (latch_mode <= BTR_MODIFY_LEAF) {
		rw_latch = latch_mode;

		if (btr_op != BTR_NO_OP
		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {

			/* Try to buffer the operation if the leaf
			page is not in the buffer pool. */

			buf_mode = btr_op == BTR_DELETE_OP
				? BUF_GET_IF_IN_POOL_OR_WATCH
				: BUF_GET_IF_IN_POOL;
		}
	}

retry_page_get:
	ut_ad(n_blocks < BTR_MAX_LEVELS);
	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
	block = buf_page_get_gen(page_id, page_size, rw_latch, guess,
				 buf_mode, file, line, mtr, &err);
	tree_blocks[n_blocks] = block;

	/* Note that block==NULL signifies either an error or change
	buffering. */

	if (err != DB_SUCCESS) {
		ut_ad(block == NULL);
		if (err == DB_DECRYPTION_FAILED) {
			ib_push_warning((void *)NULL,
				DB_DECRYPTION_FAILED,
				"Table %s is encrypted but encryption service or"
				" used key_id is not available. "
				" Can't continue reading table.",
				index->table->name.m_name);
			index->table->file_unreadable = true;
		}

		goto func_exit;
	}

	if (block == NULL) {
		/* This must be a search to perform an insert, delete-mark,
		or delete; try using the insert/delete buffer */

		ut_ad(height == 0);
		ut_ad(cursor->thr);

		switch (btr_op) {
		case BTR_INSERT_OP:
		case BTR_INSERT_IGNORE_UNIQUE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
					page_id, page_size, cursor->thr)) {

				cursor->flag = BTR_CUR_INSERT_TO_IBUF;

				goto func_exit;
			}
			break;

		case BTR_DELMARK_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
					index, page_id, page_size,
					cursor->thr)) {

				cursor->flag = BTR_CUR_DEL_MARK_IBUF;

				goto func_exit;
			}

			break;

		case BTR_DELETE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
			ut_ad(!dict_index_is_spatial(index));

			if (!row_purge_poss_sec(cursor->purge_node,
						index, tuple)) {

				/* The record cannot be purged yet. */
				cursor->flag = BTR_CUR_DELETE_REF;
			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
					       index, page_id, page_size,
					       cursor->thr)) {

				/* The purge was buffered. */
				cursor->flag = BTR_CUR_DELETE_IBUF;
			} else {
				/* The purge could not be buffered. */
				buf_pool_watch_unset(page_id);
				break;
			}

			buf_pool_watch_unset(page_id);
			goto func_exit;

		default:
			ut_error;
		}

		/* Inserting into the insert/delete buffer did not succeed;
		we must read the page from disk. */

		buf_mode = BUF_GET;

		goto retry_page_get;
	}

	if (retrying_for_search_prev && height != 0) {
		/* also latch left sibling */
		ulint		left_page_no;
		buf_block_t*	get_block;

		ut_ad(rw_latch == RW_NO_LATCH);

		rw_latch = upper_rw_latch;

		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			ut_ad(prev_n_blocks < leftmost_from_level);

			prev_tree_savepoints[prev_n_blocks]
				= mtr_set_savepoint(mtr);
			get_block = buf_page_get_gen(
				page_id_t(page_id.space(), left_page_no),
				page_size, rw_latch, NULL, buf_mode,
				file, line, mtr, &err);
			prev_tree_blocks[prev_n_blocks] = get_block;
			prev_n_blocks++;

			if (err != DB_SUCCESS) {
				if (err == DB_DECRYPTION_FAILED) {
					ib_push_warning((void *)NULL,
						DB_DECRYPTION_FAILED,
						"Table %s is encrypted but encryption service or"
						" used key_id is not available. "
						" Can't continue reading table.",
						index->table->name.m_name);
					index->table->file_unreadable = true;
				}

				goto func_exit;
			}

			/* BTR_MODIFY_TREE does not update prev/next_page_no
			without holding the parent page's lock. So there is
			no need to retry here, because we hold that lock. */
		}

		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 buf_mode, file, line, mtr, &err);
		tree_blocks[n_blocks] = block;

		if (err != DB_SUCCESS) {
			if (err == DB_DECRYPTION_FAILED) {
				ib_push_warning((void *)NULL,
					DB_DECRYPTION_FAILED,
					"Table %s is encrypted but encryption service or"
					" used key_id is not available. "
					" Can't continue reading table.",
					index->table->name.m_name);
				index->table->file_unreadable = true;
			}

			goto func_exit;
		}
	}

	page = buf_block_get_frame(block);

	if (height == ULINT_UNDEFINED
	    && page_is_leaf(page)
	    && rw_latch != RW_NO_LATCH
	    && rw_latch != root_leaf_rw_latch) {
		/* The root page is also a leaf page (root_leaf).
		We should reacquire the page, because the root page
		is latched differently from leaf pages. */
		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
		ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);

		ut_ad(n_blocks == 0);
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		upper_rw_latch = root_leaf_rw_latch;
		goto search_loop;
	}

	if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		buf_block_dbg_add_level(
			block, dict_index_is_ibuf(index)
			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
	}

	ut_ad(fil_page_index_page_check(page));
	ut_ad(index->id == btr_page_get_index_id(page));

	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
		/* We are in the root node */

		height = btr_page_get_level(page, mtr);
		root_height = height;
		cursor->tree_height = root_height + 1;

		if (dict_index_is_spatial(index)) {
			ut_ad(cursor->rtr_info);

			node_seq_t      seq_no = rtr_get_current_ssn_id(index);

			/* If SSN in memory is not initialized, fetch
			it from root page */
			if (seq_no < 1) {
				node_seq_t      root_seq_no;

				root_seq_no = page_get_ssn_id(page);

				mutex_enter(&(index->rtr_ssn.mutex));
				index->rtr_ssn.seq_no = root_seq_no + 1;
				mutex_exit(&(index->rtr_ssn.mutex));
			}

			/* Save the MBR */
			cursor->rtr_info->thr = cursor->thr;
			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
		}

#ifdef BTR_CUR_ADAPT
		if (block != guess) {
			info->root_guess = block;
			info->withdraw_clock = buf_withdraw_clock;
		}
#endif
	}

	if (height == 0) {
		if (rw_latch == RW_NO_LATCH) {

			latch_leaves = btr_cur_latch_leaves(
				block, page_id, page_size, latch_mode,
				cursor, mtr);
		}

		switch (latch_mode) {
		case BTR_MODIFY_TREE:
		case BTR_CONT_MODIFY_TREE:
		case BTR_CONT_SEARCH_TREE:
			break;
		default:
			if (!s_latch_by_caller
			    && !srv_read_only_mode
			    && !modify_external) {
				/* Release the tree s-latch */
				/* NOTE: BTR_MODIFY_EXTERNAL
				needs to keep tree sx-latch */
				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}

			/* release upper blocks */
			if (retrying_for_search_prev) {
				ut_ad(!autoinc);
				for (;
				     prev_n_releases < prev_n_blocks;
				     prev_n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						prev_tree_savepoints[
							prev_n_releases],
						prev_tree_blocks[
							prev_n_releases]);
				}
			}

			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0
				    && (modify_external || autoinc)) {
					/* keep the root page latch */
					ut_ad(mtr_memo_contains_flagged(
						mtr, tree_blocks[n_releases],
						MTR_MEMO_PAGE_SX_FIX
						| MTR_MEMO_PAGE_X_FIX));
					continue;
				}

				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		page_mode = mode;
	}

	if (dict_index_is_spatial(index)) {
		/* Remember the page search mode */
		search_mode = page_mode;

		/* Some adjustment on search mode, when the
		page search mode is PAGE_CUR_RTREE_LOCATE
		or PAGE_CUR_RTREE_INSERT, as we are searching
		with MBRs. When it is not the target level, we
		should search all sub-trees that "CONTAIN" the
		search range/MBR. When it is at the target
		level, the search becomes PAGE_CUR_LE */
		if (page_mode == PAGE_CUR_RTREE_LOCATE
		    && level == height) {
			if (level == 0) {
				page_mode = PAGE_CUR_LE;
			} else {
				page_mode = PAGE_CUR_RTREE_GET_FATHER;
			}
		}

		if (page_mode == PAGE_CUR_RTREE_INSERT) {
			page_mode = (level == height)
					? PAGE_CUR_LE
					: PAGE_CUR_RTREE_INSERT;

			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
		}

		/* "need_path" indicates whether we need to track the parent
		pages; if this is not a spatial comparison, there is no need
		to track them */
		if (page_mode < PAGE_CUR_CONTAIN) {
			need_path = false;
		}

		up_match = 0;
		low_match = 0;

		if (latch_mode == BTR_MODIFY_TREE
		    || latch_mode == BTR_CONT_MODIFY_TREE
		    || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is locked; no need for a page lock to
			protect the "path" */
			cursor->rtr_info->need_page_lock = false;
		}
	}

	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
		ut_ad(need_path);
		found = rtr_cur_search_with_match(
			block, index, tuple, page_mode, page_cursor,
			cursor->rtr_info);

		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
		if (search_mode == PAGE_CUR_RTREE_INSERT
		    && cursor->rtr_info->mbr_adj) {
			if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated;
				retry with BTR_MODIFY_TREE */
				goto func_exit;
			} else if (latch_mode & BTR_MODIFY_TREE) {
				rtree_parent_modified = true;
				cursor->rtr_info->mbr_adj = false;
				mbr_adj = true;
			} else {
				ut_ad(0);
			}
		}

		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
			cursor->low_match =
				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
		}
#ifdef BTR_CUR_HASH_ADAPT
	} else if (height == 0 && btr_search_enabled
		   && !dict_index_is_spatial(index)) {
		/* The adaptive hash index is only used when searching
		for leaf pages (height==0), but not in r-trees.
		We only need the byte prefix comparison for the purpose
		of updating the adaptive hash index. */
		page_cur_search_with_match_bytes(
			block, index, tuple, page_mode, &up_match, &up_bytes,
			&low_match, &low_bytes, page_cursor);
#endif /* BTR_CUR_HASH_ADAPT */
	} else {
		/* Search for complete index fields. */
		up_bytes = low_bytes = 0;
		page_cur_search_with_match(
			block, index, tuple, page_mode, &up_match,
			&low_match, page_cursor,
			need_path ? cursor->rtr_info : NULL);
	}

	if (estimate) {
		btr_cur_add_path_info(cursor, height, root_height);
	}

	/* If this is the desired level, leave the loop */

	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
					   mtr));

	/* Add a predicate lock under serializable isolation,
	and only in the search case */
	if (dict_index_is_spatial(index)
	    && cursor->rtr_info->need_prdt_lock
	    && mode != PAGE_CUR_RTREE_INSERT
	    && mode != PAGE_CUR_RTREE_LOCATE
	    && mode >= PAGE_CUR_CONTAIN) {
		trx_t*		trx = thr_get_trx(cursor->thr);
		lock_prdt_t	prdt;

		lock_mutex_enter();
		lock_init_prdt_from_mbr(
			&prdt, &cursor->rtr_info->mbr, mode,
			trx->lock.lock_heap);
		lock_mutex_exit();

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_lock(&(block->lock));
		}

		lock_prdt_lock(block, &prdt, index, LOCK_S,
			       LOCK_PREDICATE, cursor->thr, mtr);

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_unlock(&(block->lock));
		}
	}

	if (level != height) {

		const rec_t*	node_ptr;
		ut_ad(height > 0);

		height--;
		guess = NULL;

		node_ptr = page_cur_get_rec(page_cursor);

		offsets = rec_get_offsets(node_ptr, index, offsets, false,
					  ULINT_UNDEFINED, &heap);

		/* If the record is the first or the last on the page and
		the intention is pessimistic delete, the operation might
		require a node_ptr insert at the upper level. We should
		change the intention to BTR_INTENTION_BOTH and retry. */
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

need_opposite_intention:
			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (n_releases > 0) {
				/* release root block */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			page_id = page_id_t(space, dict_index_get_page(index));
			up_match = 0;
			low_match = 0;
			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;
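			/* Restart the traversal from the root page,
			this time latching with BTR_INTENTION_BOTH. */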

			goto search_loop;
		}

		if (dict_index_is_spatial(index)) {
			if (page_rec_is_supremum(node_ptr)) {
				cursor->low_match = 0;
				cursor->up_match = 0;
				goto func_exit;
			}

			/* If we are doing insertion or record locating,
			remember the tree nodes we visited */
			if (page_mode == PAGE_CUR_RTREE_INSERT
			    || (search_mode == PAGE_CUR_RTREE_LOCATE
			        && (latch_mode != BTR_MODIFY_LEAF))) {
				bool		add_latch = false;

				if (latch_mode == BTR_MODIFY_TREE
				    && rw_latch == RW_NO_LATCH) {
					ut_ad(mtr_memo_contains_flagged(
						mtr, dict_index_get_lock(index),
						MTR_MEMO_X_LOCK
						| MTR_MEMO_SX_LOCK));
					rw_lock_s_lock(&block->lock);
					add_latch = true;
				}

				/* Store the parent cursor location */
#ifdef UNIV_DEBUG
				ulint	num_stored = rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#else
				rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#endif

				if (page_mode == PAGE_CUR_RTREE_INSERT) {
					btr_pcur_t*     r_cursor =
						rtr_get_parent_cursor(
							cursor, height + 1,
							true);
					/* For an insertion, there should
					be only one parent for each level
					traversed */
#ifdef UNIV_DEBUG
					ut_ad(num_stored == 1);
#endif

					node_ptr = btr_pcur_get_rec(r_cursor);

				}

				if (add_latch) {
					rw_lock_s_unlock(&block->lock);
				}

				ut_ad(!page_rec_is_supremum(node_ptr));
			}

			ut_ad(page_mode == search_mode
			      || (page_mode == PAGE_CUR_WITHIN
				  && search_mode == PAGE_CUR_RTREE_LOCATE));

			page_mode = search_mode;
		}

		/* If the node pointer is the first or the last record
		of the page, or shares its key value with the first or
		the last record, another page might be chosen under
		BTR_CONT_MODIFY_TREE. So the parent page must not be
		released, to avoid a deadlock from blocking another
		search with the same key value. */
		if (!detected_same_key_root
		    && lock_intention == BTR_INTENTION_BOTH
		    && !dict_index_is_unique(index)
		    && latch_mode == BTR_MODIFY_TREE
		    && (up_match >= rec_offs_n_fields(offsets) - 1
			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
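			/* A node pointer record carries the child page
			number as its last field, so only
			rec_offs_n_fields(offsets) - 1 key fields take
			part in the comparison. */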
			const rec_t*	first_rec = page_rec_get_next_const(
				page_get_infimum_rec(page));
			ulint		matched_fields;

			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (node_ptr == first_rec
			    || page_rec_is_last(node_ptr, page)) {
				detected_same_key_root = true;
			} else {
				matched_fields = 0;

				offsets2 = rec_get_offsets(
					first_rec, index, offsets2,
					false, ULINT_UNDEFINED, &heap);
				cmp_rec_rec_with_match(node_ptr, first_rec,
					offsets, offsets2, index, FALSE,
					&matched_fields);

				if (matched_fields
				    >= rec_offs_n_fields(offsets) - 1) {
					detected_same_key_root = true;
				} else {
					const rec_t*	last_rec;

					last_rec = page_rec_get_prev_const(
						page_get_supremum_rec(page));

					matched_fields = 0;

					offsets2 = rec_get_offsets(
						last_rec, index, offsets2,
						false, ULINT_UNDEFINED, &heap);
					cmp_rec_rec_with_match(
						node_ptr, last_rec,
						offsets, offsets2, index,
						FALSE, &matched_fields);
					if (matched_fields
					    >= rec_offs_n_fields(offsets) - 1) {
						detected_same_key_root = true;
					}
				}
			}
		}

		/* If the page might cause a tree modification,
		we should not release the parent page's latch. */
		if (!detected_same_key_root
		    && latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
				index, page, lock_intention, node_ptr,
				node_ptr_max_size, page_size, mtr)
		    && !rtree_parent_modified) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* do not release the root page,
					so that it stays pinned to the
					same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == level
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* sx-latch the root page if its latch was already
			released; it contains the file segment headers. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* We should consider the prev_page of the parent page
		if the node_ptr is the leftmost record of the page,
		because BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the
		prev_page of the leaf page. */
		if ((latch_mode == BTR_SEARCH_PREV
		     || latch_mode == BTR_MODIFY_PREV)
		    && !retrying_for_search_prev) {
			/* block should be latched for consistent
			   btr_page_get_prev() */
			ut_ad(mtr_memo_contains_flagged(mtr, block,
				MTR_MEMO_PAGE_S_FIX
				| MTR_MEMO_PAGE_X_FIX));

			if (page_has_prev(page)
			    && page_rec_is_first(node_ptr, page)) {

				if (leftmost_from_level == 0) {
					leftmost_from_level = height + 1;
				}
			} else {
				leftmost_from_level = 0;
			}
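			/* A nonzero leftmost_from_level is one more than
			the height at which the current chain of leftmost
			node pointers began; if it is still set at the
			leaf level, the branch below retries the search so
			that the previous page can be latched safely. */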

			if (height == 0 && leftmost_from_level > 0) {
				/* retry, so that prev_page is also
				latched, from level==leftmost_from_level
				downwards. */
				retrying_for_search_prev = true;

				prev_tree_blocks = static_cast<buf_block_t**>(
					ut_malloc_nokey(sizeof(buf_block_t*)
							* leftmost_from_level));

				prev_tree_savepoints = static_cast<ulint*>(
					ut_malloc_nokey(sizeof(ulint)
							* leftmost_from_level));

				/* back to the level (leftmost_from_level+1) */
				ulint	idx = n_blocks
					- (leftmost_from_level - 1);

				page_id = page_id_t(
					space,
					tree_blocks[idx]->page.id.page_no());

				for (ulint i = n_blocks
					       - (leftmost_from_level - 1);
				     i <= n_blocks; i++) {
					mtr_release_block_at_savepoint(
						mtr, tree_savepoints[i],
						tree_blocks[i]);
				}

				n_blocks -= (leftmost_from_level - 1);
				height = leftmost_from_level;
				ut_ad(n_releases == 0);

				/* replay up_match, low_match */
				up_match = 0;
				low_match = 0;
				rtr_info_t*	rtr_info	= need_path
					? cursor->rtr_info : NULL;

				for (ulint i = 0; i < n_blocks; i++) {
					page_cur_search_with_match(
						tree_blocks[i], index, tuple,
						page_mode, &up_match,
						&low_match, page_cursor,
						rtr_info);
				}

				goto search_loop;
			}
		}

		/* Go to the child node */
		page_id = page_id_t(
			space,
			btr_node_ptr_get_child_page_no(node_ptr, offsets));

		n_blocks++;

		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
			/* We're doing a search on an ibuf tree and we're one
			level above the leaf page. */

			ut_ad(level == 0);

			buf_mode = BUF_GET;
			rw_latch = RW_NO_LATCH;
			goto retry_page_get;
		}

		if (dict_index_is_spatial(index)
		    && page_mode >= PAGE_CUR_CONTAIN
		    && page_mode != PAGE_CUR_RTREE_INSERT) {
			ut_ad(need_path);
			rtr_node_path_t* path =
				cursor->rtr_info->path;

			if (!path->empty() && found) {
				ut_ad(path->back().page_no
				      == page_id.page_no());
				path->pop_back();
#ifdef UNIV_DEBUG
				if (page_mode == PAGE_CUR_RTREE_LOCATE
				    && (latch_mode != BTR_MODIFY_LEAF)) {
					btr_pcur_t*	cur
					= cursor->rtr_info->parent_path->back(
					  ).cursor;
					rec_t*	my_node_ptr
						= btr_pcur_get_rec(cur);

					offsets = rec_get_offsets(
						my_node_ptr, index, offsets,
						false, ULINT_UNDEFINED, &heap);

					ulint	my_page_no
					= btr_node_ptr_get_child_page_no(
						my_node_ptr, offsets);

					ut_ad(page_id.page_no() == my_page_no);
				}
#endif
			}
		}

		goto search_loop;
	} else if (!dict_index_is_spatial(index)
		   && latch_mode == BTR_MODIFY_TREE
		   && lock_intention == BTR_INTENTION_INSERT
		   && page_has_next(page)
		   && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {

		/* btr_insert_into_right_sibling() might delete
		the node_ptr at the upper level */

		guess = NULL;

		if (height == 0) {
			/* release the leaf pages if latched */
			for (uint i = 0; i < 3; i++) {
				if (latch_leaves.blocks[i] != NULL) {
					mtr_release_block_at_savepoint(
						mtr, latch_leaves.savepoints[i],
						latch_leaves.blocks[i]);
					latch_leaves.blocks[i] = NULL;
				}
			}
		}

		goto need_opposite_intention;
	}

	if (level != 0) {
		ut_ad(!autoinc);

		if (upper_rw_latch == RW_NO_LATCH) {
			/* latch the page */
			buf_block_t*	child_block;

			if (latch_mode == BTR_CONT_MODIFY_TREE) {
				child_block = btr_block_get(
					page_id, page_size, RW_X_LATCH,
					index, mtr);
			} else {
				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
				child_block = btr_block_get(
					page_id, page_size, RW_SX_LATCH,
					index, mtr);
			}

			btr_assert_not_corrupted(child_block, index);
		} else {
			ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
			btr_assert_not_corrupted(block, index);

			if (s_latch_by_caller) {
				ut_ad(latch_mode == BTR_SEARCH_TREE);
				/* the index must be sx-latched in order
				to exclude tree-modifying operations. */
				ut_ad(mtr_memo_contains(
					mtr, dict_index_get_lock(index),
					MTR_MEMO_SX_LOCK));
				/* because we hold the index sx-latch,
				we can release the upper blocks. */
				for (; n_releases < n_blocks; n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						tree_savepoints[n_releases],
						tree_blocks[n_releases]);
				}
			}
		}

		if (page_mode <= PAGE_CUR_LE) {
			cursor->low_match = low_match;
			cursor->up_match = up_match;
		}
	} else {
		cursor->low_match = low_match;
		cursor->low_bytes = low_bytes;
		cursor->up_match = up_match;
		cursor->up_bytes = up_bytes;

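		/* Publish the AUTO_INCREMENT value seen during the
		descent on the root page. The last argument (false)
		means the stored PAGE_ROOT_AUTO_INC value is only ever
		increased here, never reset to a smaller value. */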
		if (autoinc) {
			page_set_autoinc(tree_blocks[0],
					 index, autoinc, mtr, false);
		}

#ifdef BTR_CUR_HASH_ADAPT
		/* We do a dirty read of btr_search_enabled here.  We
		will properly check btr_search_enabled again in
		btr_search_build_page_hash_index() before building a
		page hash index, while holding search latch. */
		if (btr_search_enabled
# ifdef MYSQL_INDEX_DISABLE_AHI
		    && !index->disable_ahi
# endif
		    ) {
			btr_search_info_update(index, cursor);
		}
#endif /* BTR_CUR_HASH_ADAPT */
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
	}

	/* For a spatial index, remember which blocks are still latched */
	if (dict_index_is_spatial(index)
	    && (latch_mode == BTR_MODIFY_TREE
		|| latch_mode == BTR_MODIFY_LEAF)) {
		for (ulint i = 0; i < n_releases; i++) {
			cursor->rtr_info->tree_blocks[i] = NULL;
			cursor->rtr_info->tree_savepoints[i] = 0;
		}

		for (ulint i = n_releases; i <= n_blocks; i++) {
			cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
			cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
		}
	}

func_exit:

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (retrying_for_search_prev) {
		ut_free(prev_tree_blocks);
		ut_free(prev_tree_savepoints);
	}

	if (has_search_latch) {
		btr_search_s_lock(index);
	}

	if (mbr_adj) {
		/* remember that we will need to adjust parent MBR */
		cursor->rtr_info->mbr_adj = true;
	}

	DBUG_RETURN(err);
}

/*****************************************************************//**
Opens a cursor at either end of an index. */
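/* Callers normally go through the btr_cur_open_at_index_side() wrapper
macro, which supplies __FILE__ and __LINE__. A sketch of positioning
before the first user record of an index (illustrative only; the caller
owns the mini-transaction):

	btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
				   cursor, 0, mtr);
*/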
dberr_t
btr_cur_open_at_index_side_func(
/*============================*/
	bool		from_left,	/*!< in: true if open to the low end,
					false if to the high end */
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: latch mode */
	btr_cur_t*	cursor,		/*!< in/out: cursor */
	ulint		level,		/*!< in: level to search for
					(0=leaf). */
	const char*	file,		/*!< in: file name */
	unsigned	line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	ulint		height;
	ulint		root_height = 0; /* remove warning */
	rec_t*		node_ptr;
	ulint		estimate;
	ulint		savepoint;
	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	dberr_t		err = DB_SUCCESS;

	rec_offs_init(offsets_);

	estimate = latch_mode & BTR_ESTIMATE;
	latch_mode &= ~BTR_ESTIMATE;

	ut_ad(level != ULINT_UNDEFINED);

	bool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
	latch_mode &= ~BTR_ALREADY_S_LATCHED;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));

	/* This function does not need to latch the left sibling
	of the leaf page */
	if (latch_mode == BTR_SEARCH_PREV) {
		latch_mode = BTR_SEARCH_LEAF;
	} else if (latch_mode == BTR_MODIFY_PREV) {
		latch_mode = BTR_MODIFY_LEAF;
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched the leaf node */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		upper_rw_latch = RW_NO_LATCH;
		break;
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read I/O bandwidth should be
		prioritized for them when the history list is
		growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	default:
		ut_ad(!s_latch_by_caller
		      || mtr_memo_contains_flagged(mtr,
						 dict_index_get_lock(index),
						 MTR_MEMO_SX_LOCK
						 | MTR_MEMO_S_LOCK));
		if (!srv_read_only_mode) {
			if (!s_latch_by_caller) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock(dict_index_get_lock(index), mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	page_id_t		page_id(dict_index_get_space(index),
					dict_index_get_page(index));
	const page_size_t&	page_size = dict_table_page_size(index->table);

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = btr_node_ptr_max_size(index);
	}

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		ulint		rw_latch;

		ut_ad(n_blocks < BTR_MAX_LEVELS);

		if (height != 0
		    && (latch_mode != BTR_MODIFY_TREE
			|| height == level)) {
			rw_latch = upper_rw_latch;
		} else {
			rw_latch = RW_NO_LATCH;
		}

		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 BUF_GET, file, line, mtr, &err);
		ut_ad((block != NULL) == (err == DB_SUCCESS));
		tree_blocks[n_blocks] = block;

		if (err != DB_SUCCESS) {
			if (err == DB_DECRYPTION_FAILED) {
				ib_push_warning((void *)NULL,
					DB_DECRYPTION_FAILED,
					"Table %s is encrypted but encryption service or"
					" used key_id is not available. "
					" Can't continue reading table.",
					index->table->name.m_name);
				index->table->file_unreadable = true;
			}

			goto exit_loop;
		}

		const page_t* page = buf_block_get_frame(block);

		if (height == ULINT_UNDEFINED
		    && page_is_leaf(page)
		    && rw_latch != RW_NO_LATCH
		    && rw_latch != root_leaf_rw_latch) {
			/* We should retry to get the page, because the
			root page is latched in a different mode than a
			leaf page requires. */
			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
			ut_ad(rw_latch == RW_S_LATCH);

			ut_ad(n_blocks == 0);
			mtr_release_block_at_savepoint(
				mtr, tree_savepoints[n_blocks],
				tree_blocks[n_blocks]);

			upper_rw_latch = root_leaf_rw_latch;
			continue;
		}

		ut_ad(fil_page_index_page_check(page));
		ut_ad(index->id == btr_page_get_index_id(page));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			ut_a(height >= level);
		} else {
			/* TODO: flag the index corrupted if this fails */
			ut_ad(height == btr_page_get_level(page, mtr));
		}

		if (height == level) {
			if (srv_read_only_mode) {
				btr_cur_latch_leaves(
					block, page_id, page_size,
					latch_mode, cursor, mtr);
			} else if (height == 0) {
				if (rw_latch == RW_NO_LATCH) {
					btr_cur_latch_leaves(
						block, page_id, page_size,
						latch_mode, cursor, mtr);
				}
				/* In versions <= 3.23.52 we had
				forgotten to release the tree latch
				here. If in an index scan we had to
				scan far to find a record visible to
				the current transaction, that could
				starve others waiting for the tree
				latch. */

				switch (latch_mode) {
				case BTR_MODIFY_TREE:
				case BTR_CONT_MODIFY_TREE:
				case BTR_CONT_SEARCH_TREE:
					break;
				default:
					if (!s_latch_by_caller) {
						/* Release the tree s-latch */
						mtr_release_s_latch_at_savepoint(
							mtr, savepoint,
							dict_index_get_lock(
								index));
					}

					/* release upper blocks */
					for (; n_releases < n_blocks;
					     n_releases++) {
						mtr_release_block_at_savepoint(
							mtr,
							tree_savepoints[
								n_releases],
							tree_blocks[
								n_releases]);
					}
				}
			} else { /* height != 0 */
				/* We already have the block latched. */
				ut_ad(latch_mode == BTR_SEARCH_TREE);
				ut_ad(s_latch_by_caller);
				ut_ad(upper_rw_latch == RW_S_LATCH);

				ut_ad(mtr_memo_contains(mtr, block,
							upper_rw_latch));

				if (s_latch_by_caller) {
					/* the index must be sx-latched in
					order to exclude tree-modifying
					operations. */
					ut_ad(mtr_memo_contains(
						mtr,
						dict_index_get_lock(index),
						MTR_MEMO_SX_LOCK));
					/* because we hold the index
					sx-latch, we can release the
					upper blocks. */
					for (; n_releases < n_blocks;
					     n_releases++) {
						mtr_release_block_at_savepoint(
							mtr,
							tree_savepoints[
								n_releases],
							tree_blocks[
								n_releases]);
					}
				}
			}
		}

		if (from_left) {
			page_cur_set_before_first(block, page_cursor);
		} else {
			page_cur_set_after_last(block, page_cursor);
		}

		if (height == level) {
			if (estimate) {
				btr_cur_add_path_info(cursor, height,
						      root_height);
			}

			break;
		}

		ut_ad(height > 0);

		if (from_left) {
			page_cur_move_to_next(page_cursor);
		} else {
			page_cur_move_to_prev(page_cursor);
		}

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  false, ULINT_UNDEFINED, &heap);

		/* If the record is the first or the last on the page and
		the intention is pessimistic delete, the operation might
		require a node_ptr insert at the upper level. We should
		change the intention to BTR_INTENTION_BOTH and retry. */
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			page_id.set_page_no(dict_index_get_page(index));

			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			continue;
		}

		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
				cursor->index, page, lock_intention, node_ptr,
				node_ptr_max_size, page_size, mtr)) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* do not release the root page,
					so that it stays pinned to the
					same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == level
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* sx-latch the root page if its latch was already
			released; it contains the file segment headers. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* Go to the child node */
		page_id.set_page_no(
			btr_node_ptr_get_child_page_no(node_ptr, offsets));

		n_blocks++;
	}

 exit_loop:
	if (heap) {
		mem_heap_free(heap);
	}

	return err;
}

/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree.
@return true if the index is available and we have put the cursor, false
if the index is unavailable */
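/* Callers normally go through the btr_cur_open_at_rnd_pos() wrapper
macro, which supplies __FILE__ and __LINE__; random leaf dives of this
kind are used e.g. when sampling pages for index statistics. */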
bool
btr_cur_open_at_rnd_pos_func(
/*=========================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
	const char*	file,		/*!< in: file name */
	unsigned	line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	ulint		height;
	rec_t*		node_ptr;
	ulint		savepoint;
	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	ut_ad(!dict_index_is_spatial(index));

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read I/O bandwidth should be
		prioritized for them when the history list is
		growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		/* This function does not support latching the
		   left uncle page, which latching the left leaf
		   page would require. */
	case BTR_SEARCH_TREE:
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		ut_ad(0);
		/* fall through */
	default:
		if (!srv_read_only_mode) {
			mtr_s_lock(dict_index_get_lock(index), mtr);
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}

	DBUG_EXECUTE_IF("test_index_is_unavailable",
			return(false););

	if (index->page == FIL_NULL) {
		/* Since we did not hold the index lock until just now,
		the index could have been modified by others; for
		example, if this is a statistics updater for a
		referenced table, the index could have been marked
		unavailable by DROP TABLE in the meantime, because
		no lock is held for the statistics updater. */
		return(false);
	}

	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	page_id_t		page_id(dict_index_get_space(index),
					dict_index_get_page(index));
	const page_size_t&	page_size = dict_table_page_size(index->table);
	dberr_t			err = DB_SUCCESS;

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = btr_node_ptr_max_size(index);
	}

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		ulint		rw_latch;

		ut_ad(n_blocks < BTR_MAX_LEVELS);

		if (height != 0
		    && latch_mode != BTR_MODIFY_TREE) {
			rw_latch = upper_rw_latch;
		} else {
			rw_latch = RW_NO_LATCH;
		}

		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
			BUF_GET, file, line, mtr, &err);
		tree_blocks[n_blocks] = block;

		ut_ad((block != NULL) == (err == DB_SUCCESS));

		if (err != DB_SUCCESS) {
			if (err == DB_DECRYPTION_FAILED) {
				ib_push_warning((void *)NULL,
					DB_DECRYPTION_FAILED,
					"Table %s is encrypted but encryption service or"
					" used key_id is not available. "
					" Can't continue reading table.",
					index->table->name.m_name);
				index->table->file_unreadable = true;
			}

			break;
		}

		page = buf_block_get_frame(block);

		if (height == ULINT_UNDEFINED
		    && page_is_leaf(page)
		    && rw_latch != RW_NO_LATCH
		    && rw_latch != root_leaf_rw_latch) {
			/* We should retry to get the page, because the
			root page is latched in a different mode than a
			leaf page requires. */
			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
			ut_ad(rw_latch == RW_S_LATCH);

			ut_ad(n_blocks == 0);
			mtr_release_block_at_savepoint(
				mtr, tree_savepoints[n_blocks],
				tree_blocks[n_blocks]);

			upper_rw_latch = root_leaf_rw_latch;
			continue;
		}

		ut_ad(fil_page_index_page_check(page));
		ut_ad(index->id == btr_page_get_index_id(page));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
		}

		if (height == 0) {
			if (rw_latch == RW_NO_LATCH
			    || srv_read_only_mode) {
				btr_cur_latch_leaves(
					block, page_id, page_size,
					latch_mode, cursor, mtr);
			}

			/* btr_cur_open_at_index_side_func() and
			btr_cur_search_to_nth_level() release
			tree s-latch here. */
			switch (latch_mode) {
			case BTR_MODIFY_TREE:
			case BTR_CONT_MODIFY_TREE:
			case BTR_CONT_SEARCH_TREE:
				break;
			default:
				/* Release the tree s-latch */
				if (!srv_read_only_mode) {
					mtr_release_s_latch_at_savepoint(
						mtr, savepoint,
						dict_index_get_lock(index));
				}

				/* release upper blocks */
				for (; n_releases < n_blocks; n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						tree_savepoints[n_releases],
						tree_blocks[n_releases]);
				}
			}
		}

		page_cur_open_on_rnd_user_rec(block, page_cursor);

		if (height == 0) {

			break;
		}

		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  false, ULINT_UNDEFINED, &heap);

		/* If the record is the first or the last on the page and
		the intention is pessimistic delete, the operation might
		require a node_ptr insert at the upper level. We should
		change the intention to BTR_INTENTION_BOTH and retry. */
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			page_id.set_page_no(dict_index_get_page(index));

			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			continue;
		}

		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
				cursor->index, page, lock_intention, node_ptr,
				node_ptr_max_size, page_size, mtr)) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* do not release the root page,
					so that it stays pinned to the
					same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == 0
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* sx-latch the root page if its latch was already
			released; it contains the file segment headers. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* Go to the child node */
		page_id.set_page_no(
			btr_node_ptr_get_child_page_no(node_ptr, offsets));

		n_blocks++;
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return err == DB_SUCCESS;
}

/*==================== B-TREE INSERT =========================*/

/*************************************************************//**
Inserts a record if there is enough space, or if enough space can
be freed by reorganizing. Differs from btr_cur_optimistic_insert because
no heuristic is applied to whether it pays to use CPU time for
reorganizing the page.

IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().

@return pointer to the inserted record on success, else NULL */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
rec_t*
btr_cur_insert_if_possible(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
				have been stored to tuple */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	rec_t*		rec;

	ut_ad(dtuple_check_typed(tuple));

	ut_ad(mtr_is_block_fix(
		mtr, btr_cur_get_block(cursor),
		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */
	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

	/* If the record did not fit, reorganize.
	For compressed pages, page_cur_tuple_insert()
	attempted this already. */
	if (!rec && !page_cur_get_page_zip(page_cursor)
	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
		rec = page_cur_tuple_insert(
			page_cursor, tuple, cursor->index,
			offsets, heap, n_ext, mtr);
	}

	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
	return(rec);
}

/*************************************************************//**
For an insert, checks the locks and does the undo logging if desired.
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
dberr_t
btr_cur_ins_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if
				not zero, the parameters index and thr
				should be specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
				should inherit LOCK_GAP type locks from the
				successor record */
{
	dict_index_t*	index;
	dberr_t		err = DB_SUCCESS;
	rec_t*		rec;
	roll_ptr_t	roll_ptr;

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(mtr->is_named_space(index->space));

	/* Check if there is predicate or GAP lock preventing the insertion */
	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		if (dict_index_is_spatial(index)) {
			lock_prdt_t	prdt;
			rtr_mbr_t	mbr;

			rtr_get_mbr_from_tuple(entry, &mbr);

			/* Use an on-stack MBR variable to test whether a
			lock is needed. If so, the predicate (MBR) will be
			allocated from the lock heap in
			lock_prdt_insert_check_and_lock() */
			lock_init_prdt_from_mbr(
				&prdt, &mbr, 0, NULL);

			err = lock_prdt_insert_check_and_lock(
				flags, rec, btr_cur_get_block(cursor),
				index, thr, mtr, &prdt);
			*inherit = false;
		} else {
			err = lock_rec_insert_check_and_lock(
				flags, rec, btr_cur_get_block(cursor),
				index, thr, mtr, inherit);
		}
	}

	if (err != DB_SUCCESS
	    || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {

		return(err);
	}

	if (flags & BTR_NO_UNDO_LOG_FLAG) {
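		/* No undo logging was requested: synthesize a dummy
		roll pointer that has only the insert flag set. */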
		roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
	} else {
		err = trx_undo_report_row_operation(thr, index, entry,
						    NULL, 0, NULL, NULL,
						    &roll_ptr);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	/* Now we can fill in the roll ptr field in entry */
	if (!(flags & BTR_KEEP_SYS_FLAG)) {

		row_upd_index_entry_sys_field(entry, index,
					      DATA_ROLL_PTR, roll_ptr);
	}

	return(DB_SUCCESS);
}

/**
Prefetch siblings of the leaf for the pessimistic operation.
@param block	leaf page */
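/* This is called on the failure path of an optimistic operation; the
siblings are read ahead because a pessimistic retry will probably need
them for a page split or merge. */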
static
void
btr_cur_prefetch_siblings(
	buf_block_t*	block)
{
	page_t*	page = buf_block_get_frame(block);

	ut_ad(page_is_leaf(page));

	ulint left_page_no = fil_page_get_prev(page);
	ulint right_page_no = fil_page_get_next(page);

	if (left_page_no != FIL_NULL) {
		buf_read_page_background(
			page_id_t(block->page.id.space(), left_page_no),
			block->page.size, false);
	}
	if (right_page_no != FIL_NULL) {
		buf_read_page_background(
			page_id_t(block->page.id.space(), right_page_no),
			block->page.size, false);
	}
	if (left_page_no != FIL_NULL
	    || right_page_no != FIL_NULL) {
		os_aio_simulated_wake_handler_threads();
	}
}

/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
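/* A sketch of the usual call pattern (see e.g.
row_ins_clust_index_entry_low() in row0ins.cc; the argument values here
are illustrative only):

	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
					entry, &rec, &big_rec, n_ext,
					thr, &mtr);
	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
						 &heap, entry, &rec,
						 &big_rec, n_ext, thr, &mtr);
	}
*/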
dberr_t
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
				!(~flags
				& (BTR_NO_LOCKING_FLAG
				| BTR_NO_UNDO_LOG_FLAG)) */
	mtr_t*		mtr)	/*!< in/out: mini-transaction;
				if this function returns DB_SUCCESS on
				a leaf page of a secondary index in a
				compressed tablespace, the caller must
				mtr_commit(mtr) before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		dummy;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit = TRUE;
	ulint		rec_size;
	dberr_t		err;

	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;

	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(dtuple_check_typed(entry));

	const page_size_t&	page_size = block->page.size;

#ifdef UNIV_DEBUG_VALGRIND
	if (page_size.is_compressed()) {
		UNIV_MEM_ASSERT_RW(page, page_size.logical());
		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
	}
#endif /* UNIV_DEBUG_VALGRIND */

	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), page_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
		if (big_rec_vec != NULL) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(DB_TOO_BIG_RECORD);
	}

	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
				      goto fail);

	if (leaf && page_size.is_compressed()
	    && (page_get_data_size(page) + rec_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
		/* If the compression padding heuristic indicates that
		the insertion would pack the page too tightly, making a
		compression failure likely, do not attempt an optimistic
		insertion. */
fail:
		err = DB_FAIL;

		/* prefetch siblings of the leaf for the pessimistic
		operation, if the page is a leaf. */
		if (page_is_leaf(page)) {
			btr_cur_prefetch_siblings(block);
		}
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);

	if (page_has_garbage(page)) {
		if ((max_size < rec_size
		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
		    && page_get_n_recs(page) > 1
		    && page_get_max_insert_size(page, 1) < rec_size) {

			goto fail;
		}
	} else if (max_size < rec_size) {
		goto fail;
	}

	/* If there have been many consecutive inserts to the
	clustered index leaf page of an uncompressed table, check if
	we have to split the page to reserve enough free space for
	future updates of records. */

	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
	    && page_get_n_recs(page) >= 2
	    && dict_index_get_space_reserve() + rec_size > max_size
	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
		|| btr_page_get_split_rec_to_left(cursor))) {
		goto fail;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	DBUG_LOG("ib_cur",
		 "insert " << index->name << " (" << index->id << ") by "
		 << ib::hex(thr ? thr->graph->trx->id : 0)
		 << ' ' << rec_printer(entry).str());
	DBUG_EXECUTE_IF("do_page_reorganize",
			btr_page_reorganize(page_cursor, index, mtr););

	/* Now, try the insert */
	{
		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);

		/* Check locks and write to the undo log,
		if specified */
		err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
						thr, mtr, &inherit);
		if (err != DB_SUCCESS) {
			goto fail_err;
		}

#ifdef UNIV_DEBUG
		if (!(flags & BTR_CREATE_FLAG)
		    && index->is_primary() && page_is_leaf(page)) {
			const dfield_t* trx_id = dtuple_get_nth_field(
				entry, dict_col_get_clust_pos(
					dict_table_get_sys_col(index->table,
							       DATA_TRX_ID),
					index));
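			/* DATA_TRX_ID and DATA_ROLL_PTR are adjacent
			system columns, so trx_id[1] addresses the roll
			pointer; the most significant bit of its first
			byte (0x80) is the insert flag. */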

			ut_ad(trx_id->len == DATA_TRX_ID_LEN);
			ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
			ut_ad(*static_cast<const byte*>
			      (trx_id[1].data) & 0x80);
			if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
				ut_ad(thr->graph->trx->id);
				ut_ad(thr->graph->trx->id
				      == trx_read_trx_id(
					      static_cast<const byte*>(
						      trx_id->data)));
			}
		}
#endif

		*rec = page_cur_tuple_insert(
			page_cursor, entry, index, offsets, heap,
			n_ext, mtr);

		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
	}

	if (*rec) {
	} else if (page_size.is_compressed()) {
		ut_ad(!dict_table_is_temporary(index->table));
		/* Reset the IBUF_BITMAP_FREE bits, because
		page_cur_tuple_insert() will have attempted page
		reorganize before failing. */
		if (leaf
		    && !dict_index_is_clust(index)) {
			ibuf_reset_free_bits(block);
		}

		goto fail;
	} else {
		ut_ad(!reorg);

		/* If the record did not fit, reorganize */
		if (!btr_page_reorganize(page_cursor, index, mtr)) {
			ut_ad(0);
			goto fail;
		}

		ut_ad(page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     offsets, heap, n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			ib::fatal() << "Cannot insert tuple " << *entry
				<< " into index " << index->name
				<< " of table " << index->table->name
				<< ". Max size: " << max_size;
		}
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!leaf) {
# ifdef MYSQL_INDEX_DISABLE_AHI
	} else if (index->disable_ahi) {
# endif
	} else if (!reorg && cursor->flag == BTR_CUR_HASH) {
		btr_search_update_hash_node_on_insert(cursor);
	} else {
		btr_search_update_hash_on_insert(cursor);
	}
#endif /* BTR_CUR_HASH_ADAPT */

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

	if (leaf
	    && !dict_index_is_clust(index)
	    && !dict_table_is_temporary(index->table)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (page_size.is_compressed()) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}

/*************************************************************//**
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
on the brothers of the page, if those brothers exist.
@return DB_SUCCESS or error number */
dberr_t
btr_cur_pessimistic_insert(
/*=======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameter thr should be
				specified; if no undo logging is specified,
				then the caller must have reserved enough
				free extents in the file space so that the
				insertion will certainly succeed */
	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
				that can be emptied */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				successful */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
				!(~flags
				& (BTR_NO_LOCKING_FLAG
				| BTR_NO_UNDO_LOG_FLAG)) */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index		= cursor->index;
	big_rec_t*	big_rec_vec	= NULL;
	dberr_t		err;
	ibool		inherit = FALSE;
	bool		success;
	ulint		n_reserved	= 0;

	ut_ad(dtuple_check_typed(entry));
	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));

	*big_rec = NULL;

	ut_ad(mtr_memo_contains_flagged(
		      mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
		      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr_is_block_fix(
		mtr, btr_cur_get_block(cursor),
		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));

	cursor->flag = BTR_CUR_BINARY;

	/* Check locks and write to undo log, if specified */

	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &inherit);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the insert will not fail because
		of lack of space */

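		/* Heuristic: a pessimistic insert can split one page
		on each level of the tree, so reserve a few extents,
		growing with the tree height. */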
		ulint	n_extents = cursor->tree_height / 16 + 3;

		success = fsp_reserve_free_extents(&n_reserved, index->space,
						   n_extents, FSP_NORMAL, mtr);
		if (!success) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   dict_table_is_comp(index->table),
				   dtuple_get_n_fields(entry),
				   dict_table_page_size(index->table))) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (big_rec_vec == NULL) {

			if (n_reserved > 0) {
				fil_space_release_free_extents(index->space,
							       n_reserved);
			}
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (dict_index_get_page(index)
	    == btr_cur_get_block(cursor)->page.id.page_no()) {

		/* The page is the root page */
		*rec = btr_root_raise_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	} else {
		*rec = btr_page_split_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	}

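	/* The split or root raise above can fail to insert only when
	the tablespace could not be extended; report out of space. */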
	if (*rec == NULL && os_has_said_disk_full) {
		return(DB_OUT_OF_FILE_SPACE);
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
	      || dict_index_is_spatial(index));

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		ut_ad(!dict_table_is_temporary(index->table));
		if (dict_index_is_spatial(index)) {
			/* Do nothing */
		} else {
			/* The cursor might be moved to the other page
			and the max trx id field should be updated after
			the cursor was fixed. */
			if (!dict_index_is_clust(index)) {
				page_update_max_trx_id(
					btr_cur_get_block(cursor),
					btr_cur_get_page_zip(cursor),
					thr_get_trx(thr)->id, mtr);
			}

			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
			    || btr_page_get_prev(
				buf_block_get_frame(
					btr_cur_get_block(cursor)), mtr)
			       == FIL_NULL) {
				/* After a split, the inserted record
				always needs lock_update_insert() to
				be called. */
				inherit = TRUE;
			}
		}
	}

	if (!page_is_leaf(btr_cur_get_page(cursor))) {
		ut_ad(!big_rec_vec);
	} else {
#ifdef BTR_CUR_HASH_ADAPT
# ifdef MYSQL_INDEX_DISABLE_AHI
		if (index->disable_ahi); else
# endif
			btr_search_update_hash_on_insert(cursor);
#endif /* BTR_CUR_HASH_ADAPT */
		if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {

			lock_update_insert(btr_cur_get_block(cursor), *rec);
		}
	}

	if (n_reserved > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}

/*==================== B-TREE UPDATE =========================*/

/*************************************************************//**
For an update, checks the locks and does the undo logging.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
dberr_t
btr_cur_upd_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread
				(can be NULL if BTR_NO_LOCKING_FLAG) */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
{
	dict_index_t*	index;
	const rec_t*	rec;
	dberr_t		err;

	ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr->is_named_space(index->space));

	if (!dict_index_is_clust(index)) {
		ut_ad(dict_index_is_online_ddl(index)
		      == !!(flags & BTR_CREATE_FLAG));

		/* We do undo logging only when we update a clustered index
		record */
		return(lock_sec_rec_modify_check_and_lock(
			       flags, btr_cur_get_block(cursor), rec,
			       index, thr, mtr));
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		err = lock_clust_rec_modify_check_and_lock(
			flags, btr_cur_get_block(cursor), rec, index,
			offsets, thr);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	/* Append the info about the update in the undo log */

	return((flags & BTR_NO_UNDO_LOG_FLAG)
	       ? DB_SUCCESS
	       : trx_undo_report_row_operation(
		       thr, index, NULL, update,
		       cmpl_info, rec, offsets, roll_ptr));
}

/***********************************************************//**
Writes a redo log record of updating a record in-place. */
void
btr_cur_update_in_place_log(
/*========================*/
	ulint		flags,		/*!< in: flags */
	const rec_t*	rec,		/*!< in: record */
	dict_index_t*	index,		/*!< in: index of the record */
	const upd_t*	update,		/*!< in: update vector */
	trx_id_t	trx_id,		/*!< in: transaction id */
	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
	mtr_t*		mtr)		/*!< in: mtr */
{
	byte*		log_ptr;
	const page_t*	page	= page_align(rec);
	ut_ad(flags < 256);
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
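
	/* Reserve redo log space for the one-byte flags, the system
	fields written by row_upd_write_sys_vals_to_log(), the 2-byte
	record offset, and a margin for the update vector. */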
	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
					    : MLOG_REC_UPDATE_IN_PLACE,
					    1 + DATA_ROLL_PTR_LEN + 14 + 2
					    + MLOG_BUF_MARGIN);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* For secondary indexes, we could skip writing the dummy system
	fields to the redo log, but that would require changing the redo
	log parsing of MLOG_REC_UPDATE_IN_PLACE and
	MLOG_COMP_REC_UPDATE_IN_PLACE, or adding a new redo log record
	type. For now, just write dummy system fields to the redo log
	if we are updating a secondary index record. */
	mach_write_to_1(log_ptr, flags);
	log_ptr++;

	if (dict_index_is_clust(index)) {
		log_ptr = row_upd_write_sys_vals_to_log(
				index, trx_id, roll_ptr, log_ptr, mtr);
	} else {
		/* Dummy system fields for a secondary index */
		/* TRX_ID Position */
		log_ptr += mach_write_compressed(log_ptr, 0);
		/* ROLL_PTR */
		trx_write_roll_ptr(log_ptr, 0);
		log_ptr += DATA_ROLL_PTR_LEN;
		/* TRX_ID */
		log_ptr += mach_u64_write_compressed(log_ptr, 0);
	}

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	row_upd_index_write_log(update, log_ptr, mtr);
}

/***********************************************************//**
Parses a redo log record of updating a record in-place.
@return end of log record or NULL */
byte*
btr_cur_parse_update_in_place(
/*==========================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index)	/*!< in: index corresponding to page */
{
	ulint		flags;
	rec_t*		rec;
	upd_t*		update;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		rec_offset;
	mem_heap_t*	heap;
	ulint*		offsets;

	if (end_ptr < ptr + 1) {

		return(NULL);
	}
	flags = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	rec_offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(rec_offset <= UNIV_PAGE_SIZE);

	heap = mem_heap_create(256);
	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	if (!ptr || !page) {

		goto func_exit;
	}

	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
	rec = page + rec_offset;

	/* We do not need to reserve search latch, as the page is only
	being recovered, and there cannot be a hash index to it. */

	/* The function rtr_update_mbr_field_in_place() is generating
	these records on node pointer pages; therefore we have to
	check if this is a leaf page. */

	offsets = rec_get_offsets(rec, index, NULL,
				  flags != (BTR_NO_UNDO_LOG_FLAG
					    | BTR_NO_LOCKING_FLAG
					    | BTR_KEEP_SYS_FLAG)
				  || page_is_leaf(page),
				  ULINT_UNDEFINED, &heap);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
						   pos, trx_id, roll_ptr);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

func_exit:
	mem_heap_free(heap);

	return(ptr);
}

/*************************************************************//**
See if there is enough space in the page modification log to log
an update-in-place.

@retval false if out of space; IBUF_BITMAP_FREE will be reset
outside mtr if the page was recompressed
@retval true if enough space

IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
a secondary index leaf page. This has to be done either within the
same mini-transaction, or by invoking ibuf_reset_free_bits() before
mtr_commit(mtr). */
bool
btr_cur_update_alloc_zip_func(
/*==========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
#ifdef UNIV_DEBUG
	ulint*		offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
	ulint		length,	/*!< in: size needed */
	bool		create,	/*!< in: true=delete-and-insert,
				false=update-in-place */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{

	/* Have a local copy of the variables as these can change
	dynamically. */
	const page_t*	page = page_cur_get_page(cursor);

	ut_ad(page_zip == page_cur_get_page_zip(cursor));
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));

	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(true);
	}

	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
		/* The page has been freshly compressed, so
		reorganizing it will not help. */
		return(false);
	}

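	/* A delete-and-insert on a leaf page must also leave room for
	the compression padding that dict_index_zip_pad_optimal_page_size()
	maintains in order to keep page compression failures rare. */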
	if (create && page_is_leaf(page)
	    && (length + page_get_data_size(page)
		>= dict_index_zip_pad_optimal_page_size(index))) {
		return(false);
	}

	if (!btr_page_reorganize(cursor, index, mtr)) {
		goto out_of_space;
	}

	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);

	/* After recompressing a page, we must make sure that the free
	bits in the insert buffer bitmap will not exceed the free
	space on the page.  Because this function will not attempt
	recompression unless page_zip_available() fails above, it is
	safe to reset the free bits if page_zip_available() fails
	again, below.  The free bits can safely be reset in a separate
	mini-transaction.  If page_zip_available() succeeds below, we
	can be sure that the btr_page_reorganize() above did not reduce
	the free space available on the page. */

	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(true);
	}

out_of_space:
	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));

	/* Out of space: reset the free bits. */
	if (!dict_index_is_clust(index)
	    && !dict_table_is_temporary(index->table)
	    && page_is_leaf(page)) {
		ibuf_reset_free_bits(page_cur_get_block(cursor));
	}

	return(false);
}

/*************************************************************//**
Updates a record when the update causes no size changes in its fields.
We assume here that the ordering fields of the record do not change.
@return locking or undo log related error code, or
@retval DB_SUCCESS on success
@retval DB_ZIP_OVERFLOW if there is not enough space left
on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
dberr_t
btr_cur_update_in_place(
/*====================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
				is a secondary index, the caller must
				mtr_commit(mtr) before latching any
				further pages */
{
	dict_index_t*	index;
	buf_block_t*	block;
	page_zip_des_t*	page_zip;
	dberr_t		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr	= 0;
	ulint		was_delete_marked;

	ut_ad(page_is_leaf(cursor->page_cur.block->frame));
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);

	DBUG_LOG("ib_cur",
		 "update-in-place " << index->name << " (" << index->id
		 << ") by " << ib::hex(trx_id) << ": "
		 << rec_printer(rec, offsets).str());

	block = btr_cur_get_block(cursor);
	page_zip = buf_block_get_page_zip(block);

	/* Check that enough space is available on the compressed page. */
	if (page_zip) {
		ut_ad(!dict_table_is_temporary(index->table));

		if (!btr_cur_update_alloc_zip(
			    page_zip, btr_cur_get_page_cur(cursor),
			    index, offsets, rec_offs_size(offsets),
			    false, mtr)) {
			return(DB_ZIP_OVERFLOW);
		}

		rec = btr_cur_get_rec(cursor);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */
		goto func_exit;
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields(rec, NULL, index, offsets,
				       thr_get_trx(thr), roll_ptr);
	}
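
	/* Remember the delete-mark state: if this update clears the
	mark, the record must take over the ownership of its
	externally stored fields below. */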
	was_delete_marked = rec_get_deleted_flag(
		rec, page_is_comp(buf_block_get_frame(block)));
	/* In delete-marked records, DB_TRX_ID must always refer to an
	existing undo log record. */
	ut_ad(!was_delete_marked
	      || !dict_index_is_clust(index)
	      || row_get_rec_trx_id(rec, index, offsets));

#ifdef BTR_CUR_HASH_ADAPT
	{
		rw_lock_t* ahi_latch = block->index
			? btr_get_search_latch(index) : NULL;
		if (ahi_latch) {
			/* TO DO: Can we skip this if none of the fields
			index->search_info->curr_n_fields
			are being updated? */

			/* The function row_upd_changes_ord_field_binary
			does not work on a secondary index. */

			if (!dict_index_is_clust(index)
			    || row_upd_changes_ord_field_binary(
				    index, update, thr, NULL, NULL)) {

				/* Remove possible hash index pointer
				to this record */
				btr_search_update_hash_on_delete(cursor);
			}

			rw_lock_x_lock(ahi_latch);
		}

		assert_block_ahi_valid(block);
#endif /* BTR_CUR_HASH_ADAPT */

		row_upd_rec_in_place(rec, index, offsets, update, page_zip);

#ifdef BTR_CUR_HASH_ADAPT
		if (ahi_latch) {
			rw_lock_x_unlock(ahi_latch);
		}
	}
#endif /* BTR_CUR_HASH_ADAPT */

	btr_cur_update_in_place_log(flags, rec, index, update,
				    trx_id, roll_ptr, mtr);

	if (was_delete_marked
	    && !rec_get_deleted_flag(
		    rec, page_is_comp(buf_block_get_frame(block)))) {
		/* The new updated record owns its possible externally
		stored fields */

		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	ut_ad(err == DB_SUCCESS);

func_exit:
	if (page_zip
	    && !(flags & BTR_KEEP_IBUF_BITMAP)
	    && !dict_index_is_clust(index)
	    && page_is_leaf(buf_block_get_frame(block))) {
		/* Update the free bits in the insert buffer. */
		ut_ad(!dict_table_is_temporary(index->table));
		ibuf_update_free_bits_zip(block, mtr);
	}

	return(err);
}

/*************************************************************//**
Tries to update a record on a page in an index tree. It is assumed that mtr
holds an x-latch on the page. The operation does not succeed if there is too
little space on the page or if the update would result in too empty a page,
so that tree compression is recommended. We assume here that the ordering
fields of the record do not change.
@return error code, including
@retval DB_SUCCESS on success
@retval DB_OVERFLOW if the updated record does not fit
@retval DB_UNDERFLOW if the page would become too empty
@retval DB_ZIP_OVERFLOW if there is not enough space left
on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
dberr_t
btr_cur_optimistic_update(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
	const upd_t*	update,	/*!< in: update vector; this must also
				contain trx id and roll ptr fields */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
				is a secondary index, the caller must
				mtr_commit(mtr) before latching any
				further pages */
{
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	dberr_t		err;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	ulint		max_size;
	ulint		new_rec_size;
	ulint		old_rec_size;
	ulint		max_ins_size = 0;
	dtuple_t*	new_entry;
	roll_ptr_t	roll_ptr;
	ulint		i;
	ulint		n_ext;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
	/* This is intended only for leaf page updates */
	ut_ad(page_is_leaf(page));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
	ut_ad(fil_page_index_page_check(page));
	ut_ad(btr_page_get_index_id(page) == index->id);

	*offsets = rec_get_offsets(rec, index, *offsets, true,
				   ULINT_UNDEFINED, heap);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	ut_a(!rec_offs_any_null_extern(rec, *offsets)
	     || trx_is_recv(thr_get_trx(thr)));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {

		/* The simplest and the most common case: the update does not
		change the size of any field and none of the updated fields is
		externally stored in rec or update, and there is enough space
		on the compressed page to log the update. */

		return(btr_cur_update_in_place(
			       flags, cursor, *offsets, update,
			       cmpl_info, thr, trx_id, mtr));
	}

	if (rec_offs_any_extern(*offsets)) {
any_extern:
		/* Externally stored fields are treated in pessimistic
		update */

		/* prefetch siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block);

		return(DB_OVERFLOW);
	}

	for (i = 0; i < upd_get_n_fields(update); i++) {
		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

			goto any_extern;
		}
	}

	DBUG_LOG("ib_cur",
		 "update " << index->name << " (" << index->id << ") by "
		 << ib::hex(trx_id) << ": "
		 << rec_printer(rec, *offsets).str());

	page_cursor = btr_cur_get_page_cur(cursor);

	if (!*heap) {
		*heap = mem_heap_create(
			rec_offs_size(*offsets)
			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
	}

	new_entry = row_rec_to_index_entry(rec, index, *offsets,
					   &n_ext, *heap);
	/* We checked above that there are no externally stored fields. */
	ut_a(!n_ext);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.
	Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, *heap);
	old_rec_size = rec_offs_size(*offsets);
	new_rec_size = rec_get_converted_size(index, new_entry, 0);

	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	if (page_zip) {
		ut_ad(!dict_table_is_temporary(index->table));

		if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
					   dict_index_get_n_fields(index),
					   dict_table_page_size(index->table))) {
			goto any_extern;
		}

		if (!btr_cur_update_alloc_zip(
			    page_zip, page_cursor, index, *offsets,
			    new_rec_size, true, mtr)) {
			return(DB_ZIP_OVERFLOW);
		}

		rec = page_cur_get_rec(page_cursor);
	}

	/* We limit max record size to 16k even for 64k page size. */
	if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE
	    || (!dict_table_is_comp(index->table)
		&& new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
		err = DB_OVERFLOW;

		goto func_exit;
	}

	if (UNIV_UNLIKELY(new_rec_size
			  >= (page_get_free_space_of_empty(page_is_comp(page))
			      / 2))) {
		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */
		err = DB_OVERFLOW;
		goto func_exit;
	}

	if (UNIV_UNLIKELY(page_get_data_size(page)
			  - old_rec_size + new_rec_size
			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */

		/* The page would become too empty */
		err = DB_UNDERFLOW;
		goto func_exit;
	}

	/* We do not attempt to reorganize if the page is compressed.
	This is because the page may fail to compress after reorganization. */
	max_size = page_zip
		? page_get_max_insert_size(page, 1)
		: (old_rec_size
		   + page_get_max_insert_size_after_reorganize(page, 1));

	if (!page_zip) {
		max_ins_size = page_get_max_insert_size_after_reorganize(
				page, 1);
	}

	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
	       && (max_size >= new_rec_size))
	      || (page_get_n_recs(page) <= 1))) {

		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */

		/* There was not enough space, or it did not pay to
		reorganize: for simplicity, we decide what to do assuming a
		reorganization is needed, though it might not be necessary */

		err = DB_OVERFLOW;
		goto func_exit;
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {
		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */
		goto func_exit;
	}

	/* Ok, we may do the replacement. Store on the page infimum the
	explicit locks on rec, before deleting rec (see the comment in
	btr_cur_pessimistic_update). */
	if (!dict_table_is_locking_disabled(index->table)) {
		lock_rec_store_on_page_infimum(block, rec);
	}

	btr_search_update_hash_on_delete(cursor);

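	/* Delete the old version of the record and leave the page
	cursor on its predecessor, so that the new version can be
	inserted in its place. */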
	page_cur_delete_rec(page_cursor, index, *offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx_id);
	}

	/* There are no externally stored columns in new_entry */
	rec = btr_cur_insert_if_possible(
		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
	ut_a(rec); /* <- We calculated above the insert would fit */

	/* Restore the old explicit lock state on the record */
	if (!dict_table_is_locking_disabled(index->table)) {
		lock_rec_restore_from_page_infimum(block, rec, block);
	}

	page_cur_move_to_next(page_cursor);
	ut_ad(err == DB_SUCCESS);

func_exit:
	if (!(flags & BTR_KEEP_IBUF_BITMAP)
	    && !dict_index_is_clust(index)) {
		/* Update the free bits in the insert buffer. */
		if (page_zip) {
			ut_ad(!dict_table_is_temporary(index->table));
			ibuf_update_free_bits_zip(block, mtr);
		} else if (!dict_table_is_temporary(index->table)) {
			ibuf_update_free_bits_low(block, max_ins_size, mtr);
		}
	}

	if (err != DB_SUCCESS) {
		/* prefetch siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block);
	}

	return(err);
}

/*************************************************************//**
If, in a split, a new supremum record was created as the predecessor of the
updated record, the supremum record must inherit exactly the locks on the
updated record. In the split it may have inherited locks from the successor
of the updated record, which is not correct. This function restores the
right locks for the new supremum. */
static
void
btr_cur_pess_upd_restore_supremum(
/*==============================*/
	buf_block_t*	block,	/*!< in: buffer block of rec */
	const rec_t*	rec,	/*!< in: updated record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page;
	buf_block_t*	prev_block;

	page = buf_block_get_frame(block);

	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
		/* Updated record is not the first user record on its page */

		return;
	}

	const ulint	prev_page_no = btr_page_get_prev(page, mtr);

	const page_id_t	page_id(block->page.id.space(), prev_page_no);

	ut_ad(prev_page_no != FIL_NULL);
	prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_page_get_next(prev_block->frame, mtr)
	     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

	/* We must already have an x-latch on prev_block! */
	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));

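	/* Make the supremum of the previous page inherit exactly the
	gap locks that were set on the updated record. */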
	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
					     PAGE_HEAP_NO_SUPREMUM,
					     page_rec_get_heap_no(rec));
}

/*************************************************************//**
Performs an update of a record on a page of a tree. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. If the
update is made on the leaf level, to avoid deadlocks, mtr must also
own x-latches on the brothers of the page, if those brothers exist.
We assume here that the ordering fields of the record do not change.
@return DB_SUCCESS or error code */
dberr_t
btr_cur_pessimistic_update(
/*=======================*/
	ulint		flags,	/*!< in: undo logging, locking, and rollback
				flags */
	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
				cursor may become invalid if *big_rec == NULL
				|| !(flags & BTR_KEEP_POS_FLAG) */
	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
	mem_heap_t**	offsets_heap,
				/*!< in/out: pointer to memory heap
				that can be emptied */
	mem_heap_t*	entry_heap,
				/*!< in/out: memory heap for allocating
				big_rec and the index tuple */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller */
	upd_t*		update,	/*!< in/out: update vector; this is allowed to
				also contain trx id and roll ptr fields.
				Non-updated columns that are moved offpage will
				be appended to this. */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
				committed before latching any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	big_rec_t*	dummy_big_rec;
	dict_index_t*	index;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	page_cur_t*	page_cursor;
	dberr_t		err;
	dberr_t		optim_err;
	roll_ptr_t	roll_ptr;
	ibool		was_first;
	ulint		n_reserved	= 0;
	ulint		n_ext;
	ulint		max_ins_size	= 0;

	*offsets = NULL;
	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);
	index = cursor->index;

	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK |
					MTR_MEMO_SX_LOCK));
	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	ut_ad(!page_zip || !dict_table_is_temporary(index->table));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(trx_id > 0
	      || (flags & BTR_KEEP_SYS_FLAG));
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~BTR_KEEP_POS_FLAG))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));

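	/* First try a modification that fits on the page. The flag
	BTR_KEEP_IBUF_BITMAP makes the callee leave the insert buffer
	bitmap alone; the bits are updated below, once the final
	outcome is known. */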
	err = optim_err = btr_cur_optimistic_update(
		flags | BTR_KEEP_IBUF_BITMAP,
		cursor, offsets, offsets_heap, update,
		cmpl_info, thr, trx_id, mtr);

	switch (err) {
	case DB_ZIP_OVERFLOW:
	case DB_UNDERFLOW:
	case DB_OVERFLOW:
		break;
	default:
	err_exit:
		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
		already reset by btr_cur_update_alloc_zip() if the
		page was recompressed. */
		if (page_zip
		    && optim_err != DB_ZIP_OVERFLOW
		    && !dict_index_is_clust(index)
		    && page_is_leaf(page)) {
			ut_ad(!dict_table_is_temporary(index->table));
			ibuf_update_free_bits_zip(block, mtr);
		}

		if (big_rec_vec != NULL) {
			dtuple_big_rec_free(big_rec_vec);
		}

		return(err);
	}

	rec = btr_cur_get_rec(cursor);
	ut_ad(rec_offs_validate(rec, index, *offsets));

	dtuple_t*	new_entry = row_rec_to_index_entry(
		rec, index, *offsets, &n_ext, entry_heap);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.  If the
	clustered index record is delete-marked, then its externally
	stored fields cannot have been purged yet, because then the
	purge would also have removed the clustered index record
	itself.  Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, entry_heap);

	/* We have to set appropriate extern storage bits in the new
	record to be inserted: we have to remember which fields were such */

	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_validate(rec, index, *offsets));
	n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);

	if ((flags & BTR_NO_UNDO_LOG_FLAG)
	    && rec_offs_any_extern(*offsets)) {
		/* We are in a transaction rollback undoing a row
		update: we must free possible externally stored fields
		which got new values in the update, if they are not
		inherited values. They can be inherited if we have
		updated the primary key to another value, and then
		update it back again. */

		ut_ad(big_rec_vec == NULL);
		ut_ad(dict_index_is_clust(index));
		ut_ad(thr_get_trx(thr)->in_rollback);

		DEBUG_SYNC_C("blob_rollback_middle");

		btr_rec_free_updated_extern_fields(
			index, rec, page_zip, *offsets, update, true, mtr);
	}

	if (page_zip_rec_needs_ext(
			rec_get_converted_size(index, new_entry, n_ext),
			page_is_comp(page),
			dict_index_get_n_fields(index),
			block->page.size)) {

		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			/* We cannot goto return_after_reservations,
			because we may need to update the
			IBUF_BITMAP_FREE bits, which was suppressed by
			BTR_KEEP_IBUF_BITMAP. */
#ifdef UNIV_ZIP_DEBUG
			ut_a(!page_zip
			     || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
			if (n_reserved > 0) {
				fil_space_release_free_extents(
					index->space, n_reserved);
			}

			err = DB_TOO_BIG_RECORD;
			goto err_exit;
		}

		ut_ad(page_is_leaf(page));
		ut_ad(dict_index_is_clust(index));
		ut_ad(flags & BTR_KEEP_POS_FLAG);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {
		goto err_exit;
	}

	if (optim_err == DB_OVERFLOW) {

		/* First reserve enough free space for the file segments
		of the index tree, so that the update will not fail because
		of lack of space */

		ulint	n_extents = cursor->tree_height / 16 + 3;

		if (!fsp_reserve_free_extents(
		            &n_reserved, index->space, n_extents,
		            flags & BTR_NO_UNDO_LOG_FLAG
		            ? FSP_CLEANING : FSP_NORMAL,
		            mtr)) {
			err = DB_OUT_OF_FILE_SPACE;
			goto err_exit;
		}
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx_id);
	}

	if (!page_zip) {
		max_ins_size = page_get_max_insert_size_after_reorganize(
				page, 1);
	}

	/* Store state of explicit locks on rec on the page infimum record,
	before deleting rec. The page infimum acts as a dummy carrier of the
	locks, taking care also of lock releases, before we can move the locks
	back on the actual record. There is a special case: if we are
	inserting on the root page and the insert causes a call of
	btr_root_raise_and_insert. Therefore we cannot in the lock system
	delete the lock structs set on the root page even if the root
	page carries just node pointers. */
	if (!dict_table_is_locking_disabled(index->table)) {
		lock_rec_store_on_page_infimum(block, rec);
	}

	btr_search_update_hash_on_delete(cursor);

#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_delete_rec(page_cursor, index, *offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	rec = btr_cur_insert_if_possible(cursor, new_entry,
					 offsets, offsets_heap, n_ext, mtr);

	if (rec) {
		page_cursor->rec = rec;

		if (!dict_table_is_locking_disabled(index->table)) {
			lock_rec_restore_from_page_infimum(
				btr_cur_get_block(cursor), rec, block);
		}

		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
			/* The new inserted record owns its possible externally
			stored fields */
			btr_cur_unmark_extern_fields(
				page_zip, rec, index, *offsets, mtr);
		} else {
			/* In delete-marked records, DB_TRX_ID must
			always refer to an existing undo log record. */
			ut_ad(row_get_rec_trx_id(rec, index, *offsets));
		}

		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);

		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
			if (adjust) {
				rec_offs_make_valid(
					page_cursor->rec, index, *offsets);
			}
		} else if (!dict_index_is_clust(index)
			   && page_is_leaf(page)) {
			/* Update the free bits in the insert buffer.
			This is the same block which was skipped by
			BTR_KEEP_IBUF_BITMAP. */
			if (page_zip) {
				ut_ad(!dict_table_is_temporary(index->table));
				ibuf_update_free_bits_zip(block, mtr);
			} else if (!dict_table_is_temporary(index->table)) {
				ibuf_update_free_bits_low(block, max_ins_size,
							  mtr);
			}
		}

		if (!srv_read_only_mode
		    && !big_rec_vec
		    && page_is_leaf(page)
		    && !dict_index_is_online_ddl(index)) {

			mtr_memo_release(mtr, dict_index_get_lock(index),
					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);

			/* NOTE: We cannot release the root block latch here,
			because it contains the segment headers and has
			usually been modified already. */
		}

		goto return_after_reservations;
4533
	} else {
4534 4535 4536 4537 4538 4539
		/* If the page is compressed and it initially
		compresses very well, and there is a subsequent insert
		of a badly-compressing record, it is possible for
		btr_cur_optimistic_update() to return DB_UNDERFLOW and
		btr_cur_insert_if_possible() to return FALSE. */
		ut_a(page_zip || optim_err != DB_UNDERFLOW);
4540

4541 4542 4543
		/* Out of space: reset the free bits.
		This is the same block which was skipped by
		BTR_KEEP_IBUF_BITMAP. */
4544 4545 4546
		if (!dict_index_is_clust(index)
		    && !dict_table_is_temporary(index->table)
		    && page_is_leaf(page)) {
4547 4548
			ibuf_reset_free_bits(block);
		}
osku's avatar
osku committed
4549 4550
	}

4551
	if (big_rec_vec != NULL) {
4552 4553 4554
		ut_ad(page_is_leaf(page));
		ut_ad(dict_index_is_clust(index));
		ut_ad(flags & BTR_KEEP_POS_FLAG);
4555

4556 4557
		/* btr_page_split_and_insert() in
		btr_cur_pessimistic_insert() invokes
4558
		mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4559 4560 4561 4562
		We must keep the index->lock when we created a
		big_rec, so that row_upd_clust_rec() can store the
		big_rec in the same mini-transaction. */

4563 4564 4565 4566 4567 4568
		ut_ad(mtr_memo_contains_flagged(mtr,
						dict_index_get_lock(index),
						MTR_MEMO_X_LOCK |
						MTR_MEMO_SX_LOCK));

		mtr_sx_lock(dict_index_get_lock(index), mtr);
4569 4570
	}

4571 4572 4573
	/* Was the record to be updated positioned as the first user
	record on its page? */
	was_first = page_cur_is_before_first(page_cursor);
osku's avatar
osku committed
4574

Marko Mäkelä's avatar
Marko Mäkelä committed
4575
	/* Lock checks and undo logging were already performed by
4576 4577 4578
	btr_cur_upd_lock_and_undo(). We do not try
	btr_cur_optimistic_insert() because
	btr_cur_insert_if_possible() already failed above. */
osku's avatar
osku committed
4579 4580

	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4581 4582
					 | BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG,
4583 4584
					 cursor, offsets, offsets_heap,
					 new_entry, &rec,
4585
					 &dummy_big_rec, n_ext, NULL, mtr);
osku's avatar
osku committed
4586 4587 4588
	ut_a(rec);
	ut_a(err == DB_SUCCESS);
	ut_a(dummy_big_rec == NULL);
	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
	page_cursor->rec = rec;

	/* Multiple transactions cannot operate on the same temp-table
	in parallel. max_trx_id is ignored for temp tables because it
	is not required for MVCC. */
	if (dict_index_is_sec_or_ibuf(index)
	    && !dict_table_is_temporary(index->table)) {
		/* Update PAGE_MAX_TRX_ID in the index page header.
		It was not updated by btr_cur_pessimistic_insert()
		because of BTR_NO_LOCKING_FLAG. */
		buf_block_t*	rec_block;

		rec_block = btr_cur_get_block(cursor);

		page_update_max_trx_id(rec_block,
				       buf_block_get_page_zip(rec_block),
				       trx_id, mtr);
	}

	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
		/* The new inserted record owns its possible externally
		stored fields */
		buf_block_t*	rec_block = btr_cur_get_block(cursor);

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
		page = buf_block_get_frame(rec_block);
#endif /* UNIV_ZIP_DEBUG */
		page_zip = buf_block_get_page_zip(rec_block);

		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, *offsets, mtr);
	} else {
		/* In delete-marked records, DB_TRX_ID must
		always refer to an existing undo log record. */
		ut_ad(row_get_rec_trx_id(rec, index, *offsets));
	}

	if (!dict_table_is_locking_disabled(index->table)) {
		lock_rec_restore_from_page_infimum(
			btr_cur_get_block(cursor), rec, block);
	}

	/* If necessary, restore also the correct lock state for a new,
	preceding supremum record created in a page split. While the old
	record was nonexistent, the supremum might have inherited its locks
	from a wrong record. */

	if (!was_first && !dict_table_is_locking_disabled(index->table)) {
		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
						  rec, mtr);
	}

return_after_reservations:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	if (n_reserved > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(err);
}

/*==================== B-TREE DELETE MARK AND UNMARK ===============*/

/****************************************************************//**
Writes the redo log record for delete marking or unmarking of an index
record. */
UNIV_INLINE
void
btr_cur_del_mark_set_clust_rec_log(
/*===============================*/
	rec_t*		rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in: index of the record */
	trx_id_t	trx_id,	/*!< in: transaction id */
	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr->is_named_space(index->space));

	log_ptr = mlog_open_and_write_index(mtr, rec, index,
					    page_rec_is_comp(rec)
					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
					    : MLOG_REC_CLUST_DELETE_MARK,
					    1 + 1 + DATA_ROLL_PTR_LEN
					    + 14 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	*log_ptr++ = 0;
	*log_ptr++ = 1;

	log_ptr = row_upd_write_sys_vals_to_log(
		index, trx_id, roll_ptr, log_ptr, mtr);
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
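
/* Illustrative sketch (not part of the original source): after the
initial MLOG_(COMP_)REC_CLUST_DELETE_MARK header written by
mlog_open_and_write_index(), the record body laid out above is

	flags		1 byte	(0 above: BTR_KEEP_SYS_FLAG not set)
	val		1 byte	(1 above: set the delete mark)
	sys fields	the position of DB_TRX_ID, the transaction id
			and the roll pointer, as written by
			row_upd_write_sys_vals_to_log()
	offset		2 bytes	(page offset of the record)

btr_cur_parse_del_mark_set_clust_rec() below consumes the fields in
exactly this order. */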

/****************************************************************//**
Parses the redo log record for delete marking or unmarking of a clustered
index record.
@return end of log record or NULL */
byte*
btr_cur_parse_del_mark_set_clust_rec(
/*=================================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index)	/*!< in: index corresponding to page */
{
	ulint		flags;
	ulint		val;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		offset;
	rec_t*		rec;

	ut_ad(!page
	      || !!page_is_comp(page) == dict_table_is_comp(index->table));

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;
	val = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	/* In delete-marked records, DB_TRX_ID must
	always refer to an existing undo log record. */
	ut_ad(trx_id || (flags & BTR_KEEP_SYS_FLAG));

	if (page) {
		rec = page + offset;

		/* We do not need to reserve the search latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. Besides, these fields are being updated in place
		and the adaptive hash index does not depend on them. */

		btr_rec_set_deleted_flag(rec, page_zip, val);

		/* pos is the offset of DB_TRX_ID in the clustered index.
		Debug assertions may also access DB_ROLL_PTR at pos+1.
		Therefore, we must compute offsets for the first pos+2
		clustered index fields. */
		ut_ad(pos <= MAX_REF_PARTS);

		ulint offsets[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
		rec_offs_init(offsets);
		mem_heap_t*	heap	= NULL;

		if (!(flags & BTR_KEEP_SYS_FLAG)) {
			row_upd_rec_sys_fields_in_recovery(
				rec, page_zip,
				rec_get_offsets(rec, index, offsets, true,
						pos + 2, &heap),
				pos, trx_id, roll_ptr);
		} else {
			/* In delete-marked records, DB_TRX_ID must
			always refer to an existing undo log record. */
			ut_ad(memcmp(rec_get_nth_field(
					     rec,
					     rec_get_offsets(rec, index,
							     offsets, true,
							     pos, &heap),
					     pos, &offset),
				     field_ref_zero, DATA_TRX_ID_LEN));
			ut_ad(offset == DATA_TRX_ID_LEN);
		}

		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}
	}

	return(ptr);
}

/***********************************************************//**
Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
of the deleting transaction, and in the roll ptr field a pointer to the
undo log record created.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
dberr_t
btr_cur_del_mark_set_clust_rec(
/*===========================*/
	buf_block_t*	block,	/*!< in/out: buffer block of the record */
	rec_t*		rec,	/*!< in/out: record */
	dict_index_t*	index,	/*!< in: clustered index of the record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
	que_thr_t*	thr,	/*!< in: query thread */
	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
				contains the virtual cols if there are any */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	roll_ptr_t	roll_ptr;
	dberr_t		err;
	page_zip_des_t*	page_zip;
	trx_t*		trx;

	ut_ad(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(buf_block_get_frame(block) == page_align(rec));
	ut_ad(page_rec_is_leaf(rec));
	ut_ad(mtr->is_named_space(index->space));

	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
		/* We may already have delete-marked this record
		when executing an ON DELETE CASCADE operation. */
		ut_ad(row_get_rec_trx_id(rec, index, offsets)
		      == thr_get_trx(thr)->id);
		return(DB_SUCCESS);
	}

	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
						   rec, index, offsets, thr);

	if (err != DB_SUCCESS) {

		return(err);
	}

	err = trx_undo_report_row_operation(thr, index,
					    entry, NULL, 0, rec, offsets,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* The search latch is not needed here, because
	the adaptive hash index does not depend on the delete-mark
	and the delete-mark is being updated in place. */

	page_zip = buf_block_get_page_zip(block);

	btr_rec_set_deleted_flag(rec, page_zip, TRUE);

	trx = thr_get_trx(thr);

	DBUG_LOG("ib_cur",
		 "delete-mark clust " << index->table->name
		 << " (" << index->id << ") by "
		 << ib::hex(trx_get_id_for_print(trx)) << ": "
		 << rec_printer(rec, offsets).str());

	if (dict_index_is_online_ddl(index)) {
		row_log_table_delete(rec, index, offsets, NULL);
	}

	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);

	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
					   roll_ptr, mtr);

	return(err);
}

/****************************************************************//**
Writes the redo log record for a delete mark setting of a secondary
index record. */
UNIV_INLINE
void
btr_cur_del_mark_set_sec_rec_log(
/*=============================*/
	rec_t*		rec,	/*!< in: record */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;
	ut_ad(val <= 1);

	log_ptr = mlog_open(mtr, 11 + 1 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery:
		in that case mlog_open returns NULL */
		return;
	}

	log_ptr = mlog_write_initial_log_record_fast(
		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
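
/* Sizing note (illustrative, not from the original source): the
"11 + 1 + 2" passed to mlog_open() above reserves up to 11 bytes for
the initial log record written by mlog_write_initial_log_record_fast()
(one type byte plus two compressed 32-bit numbers of at most 5 bytes
each), 1 byte for val, and 2 bytes for the page offset. */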

/****************************************************************//**
Parses the redo log record for delete marking or unmarking of a secondary
index record.
@return end of log record or NULL */
byte*
btr_cur_parse_del_mark_set_sec_rec(
/*===============================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
{
	ulint	val;
	ulint	offset;
	rec_t*	rec;

	if (end_ptr < ptr + 3) {

		return(NULL);
	}

	val = mach_read_from_1(ptr);
	ptr++;

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	if (page) {
		rec = page + offset;

		/* We do not need to reserve the search latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. Besides, the delete-mark flag is being updated in place
		and the adaptive hash index does not depend on it. */

		btr_rec_set_deleted_flag(rec, page_zip, val);
	}

	return(ptr);
}

/***********************************************************//**
Sets a secondary index record delete mark to TRUE or FALSE.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
dberr_t
btr_cur_del_mark_set_sec_rec(
/*=========================*/
	ulint		flags,	/*!< in: locking flag */
	btr_cur_t*	cursor,	/*!< in: cursor */
	ibool		val,	/*!< in: value to set */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	buf_block_t*	block;
	rec_t*		rec;
	dberr_t		err;

	block = btr_cur_get_block(cursor);
	rec = btr_cur_get_rec(cursor);

	err = lock_sec_rec_modify_check_and_lock(flags,
						 btr_cur_get_block(cursor),
						 rec, cursor->index, thr, mtr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	ut_ad(!!page_rec_is_comp(rec)
	      == dict_table_is_comp(cursor->index->table));

	DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
			      IB_ID_FMT ") by " TRX_ID_FMT,
			      unsigned(val),
			      block->page.id.space(), block->page.id.page_no(),
			      unsigned(page_rec_get_heap_no(rec)),
			      cursor->index->name(), cursor->index->id,
			      trx_get_id_for_print(thr_get_trx(thr))));

	/* We do not need to reserve the search latch, as the
	delete-mark flag is being updated in place and the adaptive
	hash index does not depend on it. */
	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);

	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);

	return(DB_SUCCESS);
}

/***********************************************************//**
Sets a secondary index record's delete mark to the given value. This
function is only used by the insert buffer merge mechanism. */
void
btr_cur_set_deleted_flag_for_ibuf(
/*==============================*/
	rec_t*		rec,		/*!< in/out: record */
	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
					corresponding to rec, or NULL
					when the tablespace is
					uncompressed */
	ibool		val,		/*!< in: value to set */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	/* We do not need to reserve the search latch, as the page
	has just been read to the buffer pool and there cannot be
	a hash index to it.  Besides, the delete-mark flag is being
	updated in place and the adaptive hash index does not depend
	on it. */

	btr_rec_set_deleted_flag(rec, page_zip, val);

	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
}

/*==================== B-TREE RECORD REMOVE =========================*/

/*************************************************************//**
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done!
@return TRUE if compression occurred */
ibool
btr_cur_compress_if_useful(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
				cursor does not stay valid if !adjust and
				compression occurs */
	ibool		adjust,	/*!< in: TRUE if should adjust the
				cursor position even if compression occurs */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ut_ad(mtr_memo_contains_flagged(
		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr_is_block_fix(
		mtr, btr_cur_get_block(cursor),
		MTR_MEMO_PAGE_X_FIX, cursor->index->table));

	if (dict_index_is_spatial(cursor->index)) {
		const page_t*   page = btr_cur_get_page(cursor);
		const trx_t*	trx = NULL;

		if (cursor->rtr_info->thr != NULL) {
			trx = thr_get_trx(cursor->rtr_info->thr);
		}

		/* Check whether a page lock prevents the compression */
		if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
					      page_get_page_no(page))) {
			return(false);
		}
	}

	return(btr_cur_compress_recommendation(cursor, mtr)
	       && btr_compress(cursor, adjust, mtr));
}

/*******************************************************//**
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree.
@return TRUE if success, i.e., the page did not become too empty */
ibool
btr_cur_optimistic_delete_func(
/*===========================*/
	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
#ifdef UNIV_DEBUG
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
#endif /* UNIV_DEBUG */
	mtr_t*		mtr)	/*!< in: mtr; if this function returns
				TRUE on a leaf page of a secondary
				index, the mtr must be committed
				before latching any further pages */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ibool		no_compress_needed;
	rec_offs_init(offsets_);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(cursor->index->space));
	ut_ad(!cursor->index->is_dummy);

	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(block->page.id.space() == cursor->index->space);
	ut_ad(page_is_leaf(buf_block_get_frame(block)));
	ut_ad(!dict_index_is_online_ddl(cursor->index)
	      || dict_index_is_clust(cursor->index)
	      || (flags & BTR_CREATE_FLAG));

	rec = btr_cur_get_rec(cursor);
	offsets = rec_get_offsets(rec, cursor->index, offsets, true,
				  ULINT_UNDEFINED, &heap);

	no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (no_compress_needed) {

		page_t*		page	= buf_block_get_frame(block);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);

		lock_update_delete(block, rec);

		btr_search_update_hash_on_delete(cursor);

		if (page_zip) {
#ifdef UNIV_ZIP_DEBUG
			ut_a(page_zip_validate(page_zip, page, cursor->index));
#endif /* UNIV_ZIP_DEBUG */
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    cursor->index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
			ut_a(page_zip_validate(page_zip, page, cursor->index));
#endif /* UNIV_ZIP_DEBUG */

			/* On compressed pages, the IBUF_BITMAP_FREE
			space is not affected by deleting (purging)
			records, because it is defined as the minimum
			of space available *without* reorganize, and
			space available in the modification log. */
		} else {
			const ulint	max_ins
				= page_get_max_insert_size_after_reorganize(
					page, 1);

			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    cursor->index, offsets, mtr);

			/* The change buffer does not handle inserts
			into non-leaf pages, into clustered indexes,
			or into the change buffer. */
			if (!dict_index_is_clust(cursor->index)
			    && !dict_table_is_temporary(cursor->index->table)
			    && !dict_index_is_ibuf(cursor->index)) {
				ibuf_update_free_bits_low(block, max_ins, mtr);
			}
		}
	} else {
		/* Prefetch siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(no_compress_needed);
}

/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches to brothers of page, if those brothers
exist.
@return TRUE if compression occurred and FALSE if not or something
went wrong. */
ibool
btr_cur_pessimistic_delete(
/*=======================*/
	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /*!< in: TRUE if the
				caller has already reserved enough free
				extents so that he knows that the operation
				will succeed */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to successor of
				deleted record on function exit */
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	ulint		n_reserved	= 0;
	bool		success;
	ibool		ret		= FALSE;
	mem_heap_t*	heap;
	ulint*		offsets;
#ifdef UNIV_DEBUG
	bool		parent_latched	= false;
#endif /* UNIV_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(index->space));
	ut_ad(!index->is_dummy);
	ut_ad(block->page.id.space() == index->space);

	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		ulint	n_extents = cursor->tree_height / 32 + 1;

		success = fsp_reserve_free_extents(&n_reserved,
						   index->space,
						   n_extents,
						   FSP_CLEANING, mtr);
		if (!success) {
			*err = DB_OUT_OF_FILE_SPACE;

			return(FALSE);
		}
	}
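
	/* Worked example (illustrative, not from the original source):
	with the formula above, a tree of height 3 reserves
	3 / 32 + 1 = 1 extent, and even a 31-level tree still reserves
	just one; the "+ 1" guarantees a nonzero reservation for small
	trees. */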

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page),
				  ULINT_UNDEFINED, &heap);

	if (rec_offs_any_extern(offsets)) {
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, page_zip,
						      rollback, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	}

	rec_t* next_rec = NULL;
	bool min_mark_next_rec = false;

	if (page_is_leaf(page)) {
		ut_ad(!(rec_get_info_bits(rec, page_rec_is_comp(rec))
			& REC_INFO_MIN_REC_FLAG));
		if (flags == 0) {
			lock_update_delete(block, rec);
		}
	}

	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
	    && UNIV_UNLIKELY(dict_index_get_page(index)
			     != block->page.id.page_no())) {

		/* If there is only one record, drop the whole page in
		btr_discard_page, if this is not the root page */

		btr_discard_page(cursor, mtr);

		ret = TRUE;
		goto return_after_reservations;
	}

	if (page_is_leaf(page)) {
		btr_search_update_hash_on_delete(cursor);
	} else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
		next_rec = page_rec_get_next(rec);

		if (!page_has_prev(page)) {
			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			min_mark_next_rec = true;
		} else if (dict_index_is_spatial(index)) {
			/* For an rtree, if we delete the leftmost node
			pointer, we need to update the parent page. */
			rtr_mbr_t	father_mbr;
			rec_t*		father_rec;
			btr_cur_t	father_cursor;
			ulint*		offsets;
			bool		upd_ret;
			ulint		len;

			rtr_page_get_father_block(NULL, heap, index,
						  block, mtr, NULL,
						  &father_cursor);
			offsets = rec_get_offsets(
				btr_cur_get_rec(&father_cursor), index, NULL,
				false, ULINT_UNDEFINED, &heap);

			father_rec = btr_cur_get_rec(&father_cursor);
			rtr_read_mbr(rec_get_nth_field(
				father_rec, offsets, 0, &len), &father_mbr);

			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
						       NULL, page, &father_mbr,
						       next_rec, mtr);

			if (!upd_ret) {
				*err = DB_ERROR;

				mem_heap_free(heap);
				return(FALSE);
			}

			ut_d(parent_latched = true);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the parent node pointer
			so that it is equal to the new leftmost node pointer
			on the page */
			ulint level = btr_page_get_level(page, mtr);

			btr_cur_t cursor;
			btr_page_get_father(index, block, mtr, &cursor);
			btr_cur_node_ptr_delete(&cursor, mtr);
			// FIXME: reuse the node_ptr from above
			dtuple_t*	node_ptr = dict_index_build_node_ptr(
				index, next_rec, block->page.id.page_no(),
				heap, level);

			btr_insert_on_non_leaf_level(
				flags, index, level + 1, node_ptr, mtr);

			ut_d(parent_latched = true);
		}
	}

	/* A SPATIAL INDEX never uses SX locks; we can allow page merges
	while holding X lock on the spatial index tree.
	Do not allow merges of non-leaf B-tree pages unless it is
	safe to do so. */
	{
		const bool allow_merge = page_is_leaf(page)
			|| dict_index_is_spatial(index)
			|| btr_cur_will_modify_tree(
				index, page, BTR_INTENTION_DELETE, rec,
				btr_node_ptr_max_size(index),
				block->page.size, mtr);
		page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
				    offsets, mtr);

		if (min_mark_next_rec) {
			btr_set_min_rec_mark(next_rec, mtr);
		}

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		ut_ad(!parent_latched
		      || btr_check_node_ptr(index, block, mtr));

		if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
			if (UNIV_LIKELY(allow_merge)) {
				ret = btr_cur_compress_if_useful(
					cursor, FALSE, mtr);
			} else {
				ib::warn() << "Not merging page "
					   << block->page.id
					   << " in index " << index->name
					   << " of " << index->table->name;
				ut_ad(!"MDEV-14637");
			}
		}
	}

return_after_reservations:
	*err = DB_SUCCESS;

	mem_heap_free(heap);

	if (!srv_read_only_mode
	    && page_is_leaf(page)
	    && !dict_index_is_online_ddl(index)) {

		mtr_memo_release(mtr, dict_index_get_lock(index),
				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);

		/* NOTE: We cannot release the root block latch here,
		because it has a segment header and has already been
		modified in most cases. */
	}

	if (n_reserved > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	return(ret);
}

/** Delete the node pointer in a parent page.
@param[in,out]	parent	cursor pointing to parent record
@param[in,out]	mtr	mini-transaction */
void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
{
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(parent),
				MTR_MEMO_PAGE_X_FIX));
	dberr_t err;
	ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
						      BTR_CREATE_FLAG, false,
						      mtr);
	ut_a(err == DB_SUCCESS);
	if (!compressed) {
		btr_cur_compress_if_useful(parent, FALSE, mtr);
	}
}

/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height)	/*!< in: root node height in tree */
{
	btr_path_t*	slot;
	const rec_t*	rec;
	const page_t*	page;

	ut_a(cursor->path_arr);

	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
		/* Do nothing; return empty path */

		slot = cursor->path_arr;
		slot->nth_rec = ULINT_UNDEFINED;

		return;
	}

	if (height == 0) {
		/* Mark end of slots for path */
		slot = cursor->path_arr + root_height + 1;
		slot->nth_rec = ULINT_UNDEFINED;
	}

	rec = btr_cur_get_rec(cursor);

	slot = cursor->path_arr + (root_height - height);

	page = page_align(rec);

	slot->nth_rec = page_rec_get_n_recs_before(rec);
	slot->n_recs = page_get_n_recs(page);
	slot->page_no = page_get_page_no(page);
	slot->page_level = btr_page_get_level_low(page);
}

/*******************************************************************//**
Estimate the number of rows between slot1 and slot2 for any level on a
B-tree. This function starts from slot1->page and reads a few pages to
the right, counting their records. If we reach slot2->page quickly then
we know exactly how many records there are between slot1 and slot2 and
we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
then we calculate the average number of records in the pages scanned
so far and assume that all pages that we did not scan up to slot2->page
contain the same number of records, then we multiply that average by
the number of pages between slot1->page and slot2->page (which is
n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
@return number of rows, not including the borders (exact or estimated) */
static
int64_t
btr_estimate_n_rows_in_range_on_level(
/*==================================*/
	dict_index_t*	index,			/*!< in: index */
	btr_path_t*	slot1,			/*!< in: left border */
	btr_path_t*	slot2,			/*!< in: right border */
	int64_t		n_rows_on_prev_level,	/*!< in: number of rows
						on the previous level for the
						same descend paths; used to
						determine the number of pages
						on this level */
	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
						value is exact i.e. not an
						estimation */
{
	int64_t		n_rows;
	ulint		n_pages_read;
	ulint		level;

	n_rows = 0;
	n_pages_read = 0;

	/* Assume by default that we will scan all pages between
	slot1->page_no and slot2->page_no. */
	*is_n_rows_exact = TRUE;

	/* Add records from slot1->page_no which are to the right of
	the record which serves as a left border of the range, if any
	(we don't include the record itself in this count). */
	if (slot1->nth_rec <= slot1->n_recs) {
		n_rows += slot1->n_recs - slot1->nth_rec;
	}

	/* Add records from slot2->page_no which are to the left of
	the record which serves as a right border of the range, if any
	(we don't include the record itself in this count). */
	if (slot2->nth_rec > 1) {
		n_rows += slot2->nth_rec - 1;
	}

	/* Count the records in the pages between slot1->page_no and
	slot2->page_no (non-inclusive), if any. */

	/* Do not read more than this number of pages in order not to hurt
	performance with this code which is just an estimation. If we read
	this many pages before reaching slot2->page_no then we estimate the
	average from the pages scanned so far. */
#	define N_PAGES_READ_LIMIT	10

	page_id_t		page_id(
		dict_index_get_space(index), slot1->page_no);
	const fil_space_t*	space = fil_space_get(index->space);
	ut_ad(space);
	const page_size_t	page_size(space->flags);

	level = slot1->page_level;

	do {
		mtr_t		mtr;
		page_t*		page;
		buf_block_t*	block;
		dberr_t		err = DB_SUCCESS;

		mtr_start(&mtr);

		/* Fetch the page. Because we are not holding the
		index->lock, the tree may have changed and we may be
		attempting to read a page that is no longer part of
		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
		silence a debug assertion about this. */
		block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
					 NULL, BUF_GET_POSSIBLY_FREED,
					 __FILE__, __LINE__, &mtr, &err);

		ut_ad((block != NULL) == (err == DB_SUCCESS));

		if (err != DB_SUCCESS) {
			if (err == DB_DECRYPTION_FAILED) {
				ib_push_warning((void *)NULL,
					DB_DECRYPTION_FAILED,
					"Table %s is encrypted but encryption service or"
					" used key_id is not available. "
					" Can't continue reading table.",
					index->table->name.m_name);
				index->table->file_unreadable = true;
			}

			mtr_commit(&mtr);
			goto inexact;
		}

		page = buf_block_get_frame(block);

		/* It is possible that the tree has been reorganized in the
		meantime and this is a different page. If this happens the
		calculated estimate will be bogus, which is not fatal as
		this is only an estimate. We are sure that a page with
		page_no exists because InnoDB never frees pages, only
		reuses them. */
		if (!fil_page_index_page_check(page)
		    || btr_page_get_index_id(page) != index->id
		    || btr_page_get_level_low(page) != level) {

			/* The page got reused for something else */
			mtr_commit(&mtr);
			goto inexact;
		}

		/* It is possible but highly unlikely that the page was
		originally written by an old version of InnoDB that did
		not initialize FIL_PAGE_TYPE on other than B-tree pages.
		For example, this could be an almost-empty BLOB page
		that happens to contain the magic values in the fields
		that we checked above. */

		n_pages_read++;

		if (page_id.page_no() != slot1->page_no) {
			/* Do not count the records on slot1->page_no,
			we already counted them before this loop. */
			n_rows += page_get_n_recs(page);
		}

		page_id.set_page_no(btr_page_get_next(page, &mtr));

		mtr_commit(&mtr);

		if (n_pages_read == N_PAGES_READ_LIMIT
		    || page_id.page_no() == FIL_NULL) {
			/* Either we read too many pages or
			we reached the end of the level without passing
			through slot2->page_no; the tree must have changed
			in the meantime */
			goto inexact;
		}

	} while (page_id.page_no() != slot2->page_no);

	return(n_rows);

inexact:

	*is_n_rows_exact = FALSE;

	/* We did interrupt before reaching slot2->page */

	if (n_pages_read > 0) {
		/* The number of pages on this level is
		n_rows_on_prev_level, multiply it by the
		average number of recs per page so far */
		n_rows = n_rows_on_prev_level
			* n_rows / n_pages_read;
	} else {
		/* The tree changed before we could even
		start with slot1->page_no */
		n_rows = 10;
	}

	return(n_rows);
}
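
/* Worked example (illustrative, not from the original source): suppose
the previous level indicated n_rows_on_prev_level = 100 pages on this
level, and the loop above gave up after reading N_PAGES_READ_LIMIT = 10
pages holding n_rows = 500 records in total. The extrapolation in the
"inexact" branch then estimates 100 * 500 / 10 = 5000 rows for the
whole range on this level. */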

/** If the tree gets changed too much between the two dives for the left
and right boundary then btr_estimate_n_rows_in_range_low() will retry
that many times before giving up and returning the value stored in
rows_in_range_arbitrary_ret_val. */
static const unsigned	rows_in_range_max_retries = 4;

/** We pretend that a range has that many records if the tree keeps changing
for rows_in_range_max_retries retries while we try to estimate the records
in a given range. */
static const int64_t	rows_in_range_arbitrary_ret_val = 10;

/** Estimates the number of rows in a given index range.
@param[in]	index		index
@param[in]	tuple1		range start, may also be empty tuple
@param[in]	mode1		search mode for range start
@param[in]	tuple2		range end, may also be empty tuple
@param[in]	mode2		search mode for range end
@param[in]	nth_attempt	if the tree gets modified too much while
we are trying to analyze it, then we will retry (this function will call
itself, incrementing this parameter)
@return estimated number of rows; if after rows_in_range_max_retries
retries the tree keeps changing, then we will just return
rows_in_range_arbitrary_ret_val as a result (if
nth_attempt >= rows_in_range_max_retries and the tree is modified between
the two dives). */
static
int64_t
btr_estimate_n_rows_in_range_low(
	dict_index_t*	index,
	const dtuple_t*	tuple1,
	page_cur_mode_t	mode1,
	const dtuple_t*	tuple2,
	page_cur_mode_t	mode2,
	unsigned	nth_attempt)
{
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	btr_cur_t	cursor;
	btr_path_t*	slot1;
	btr_path_t*	slot2;
	ibool		diverged;
	ibool		diverged_lot;
	ulint		divergence_level;
	int64_t		n_rows;
	ibool		is_n_rows_exact;
	ulint		i;
	mtr_t		mtr;
	int64_t		table_n_rows;

	table_n_rows = dict_table_get_n_rows(index->table);

	/* Below we dive to the two records specified by tuple1 and tuple2 and
	we remember the entire dive paths from the tree root. The place where
	the tuple1 path ends on the leaf level we call "left border" of our
	interval and the place where the tuple2 path ends on the leaf level -
	"right border". We take care to either include or exclude the interval
	boundaries depending on whether <, <=, > or >= was specified. For
	example if "5 < x AND x <= 10" then we should not include the left
	boundary, but should include the right one. */

	mtr_start(&mtr);

	cursor.path_arr = path1;

	bool	should_count_the_left_border;

	if (dtuple_get_n_fields(tuple1) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);

		ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));

		/* We should count the border if there are any records to
		match the criteria, i.e. if the maximum record on the tree is
		5 and x > 3 is specified then the cursor will be positioned at
		5 and we should count the border, but if x > 7 is specified,
		then the cursor will be positioned at 'sup' on the rightmost
		leaf page in the tree and we should not count the border. */
		should_count_the_left_border
			= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
	} else {
		dberr_t err = DB_SUCCESS;

		err = btr_cur_open_at_index_side(true, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, 0, &mtr);

		if (err != DB_SUCCESS) {
			ib::warn() << " Error code: " << err
				   << " btr_estimate_n_rows_in_range_low "
				   << " called from file: "
				   << __FILE__ << " line: " << __LINE__
				   << " table: " << index->table->name
				   << " index: " << index->name;
		}

		ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));

		/* The range specified is without a left border, just
		'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
		positioned the cursor on the infimum record on the leftmost
		page, which must not be counted. */
		should_count_the_left_border = false;
	}

	mtr_commit(&mtr);

	if (!index->is_readable()) {
		return 0;
	}

	mtr_start(&mtr);

	cursor.path_arr = path2;

	bool	should_count_the_right_border;

	if (dtuple_get_n_fields(tuple2) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);

		const rec_t*	rec = btr_cur_get_rec(&cursor);

		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));

		should_count_the_right_border
			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
			   /* and the record was found */
			   && cursor.low_match >= dtuple_get_n_fields(tuple2))
			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
			    /* and there are any records to match the criteria,
			    i.e. if the minimum record on the tree is 5 and
			    x < 7 is specified then the cursor will be
			    positioned at 5 and we should count the border, but
			    if x < 2 is specified, then the cursor will be
			    positioned at 'inf' and we should not count the
			    border */
			    && !page_rec_is_infimum(rec));
		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
		ha_innobase::records_in_range():
		min_key=NULL (left-unbounded) which is expected
		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
		unexpected - one would expect
		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
		cursor will be positioned on the first record to the right of
		the requested one (can also be positioned on the 'sup') and
		we should not count the right border. */
	} else {
		dberr_t err = DB_SUCCESS;

		err = btr_cur_open_at_index_side(false, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, 0, &mtr);

		if (err != DB_SUCCESS) {
			ib::warn() << " Error code: " << err
				   << " btr_estimate_n_rows_in_range_low "
				   << " called from file: "
				   << __FILE__ << " line: " << __LINE__
				   << " table: " << index->table->name
				   << " index: " << index->name;
		}

		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));

		/* The range specified is without a right border, just
		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
		positioned the cursor on the supremum record on the rightmost
		page, which must not be counted. */
		should_count_the_right_border = false;
	}

	mtr_commit(&mtr);

	/* We have the path information for the range in path1 and path2 */

	n_rows = 0;
	is_n_rows_exact = TRUE;

	/* This becomes true when the two paths do not pass through the
	same pages anymore. */
	diverged = FALSE;

	/* This becomes true when the paths are no longer the same or
	adjacent, i.e. when they stop passing through the same or
	neighboring (on the same level) pages. */
	diverged_lot = FALSE;

	/* This is the level where paths diverged a lot. */
	divergence_level = 1000000;

	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		slot1 = path1 + i;
		slot2 = path2 + i;

		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			/* Here none of the borders were counted. For example,
			if on the leaf level we descended to:
			(inf, a, b, c, d, e, f, sup)
			         ^        ^
			       path1    path2
			then n_rows will be 2 (c and d). */

			if (is_n_rows_exact) {
				/* Only fiddle to adjust this off-by-one
				if the number is exact, otherwise we make
				much grosser adjustments below. */

				btr_path_t*	last1 = &path1[i - 1];
				btr_path_t*	last2 = &path2[i - 1];

				/* If both paths end up on the same record on
				the leaf level. */
				if (last1->page_no == last2->page_no
				    && last1->nth_rec == last2->nth_rec) {

					/* n_rows can be > 0 here if the paths
					were first different and then converged
					to the same record on the leaf level.
					For example:
					SELECT ... LIKE 'wait/synch/rwlock%'
					mode1=PAGE_CUR_GE,
					tuple1="wait/synch/rwlock"
					path1[0]={nth_rec=58, n_recs=58,
						  page_no=3, page_level=1}
					path1[1]={nth_rec=56, n_recs=55,
						  page_no=119, page_level=0}

					mode2=PAGE_CUR_G
					tuple2="wait/synch/rwlock"
					path2[0]={nth_rec=57, n_recs=57,
						  page_no=3, page_level=1}
					path2[1]={nth_rec=56, n_recs=55,
						  page_no=119, page_level=0} */

					/* If the range is such that we should
					count both borders, then avoid
					counting that record twice - once as a
					left border and once as a right
					border. */
					if (should_count_the_left_border
					    && should_count_the_right_border) {

						n_rows = 1;
					} else {
						/* Some of the borders should
						not be counted, e.g. [3,3). */
						n_rows = 0;
					}
				} else {
					if (should_count_the_left_border) {
						n_rows++;
					}

					if (should_count_the_right_border) {
						n_rows++;
					}
				}
			}

			if (i > divergence_level + 1 && !is_n_rows_exact) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;
			}

			DBUG_EXECUTE_IF("bug14007649", return(n_rows););

			/* Do not estimate the number of rows in the range
			to over 1 / 2 of the estimated rows in the whole
			table */

			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {

				n_rows = table_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				if (n_rows == 0) {
					n_rows = table_n_rows;
				}
			}

			return(n_rows);
		}

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			/* If both slots do not point to the same page,
			this means that the tree must have changed between
			the dive for slot1 and the dive for slot2 at the
			beginning of this function. */
			if (slot1->page_no != slot2->page_no
			    || slot1->page_level != slot2->page_level) {

				/* If the tree keeps changing even after a
				few attempts, then just return some arbitrary
				number. */
				if (nth_attempt >= rows_in_range_max_retries) {
					return(rows_in_range_arbitrary_ret_val);
				}

				const int64_t	ret =
					btr_estimate_n_rows_in_range_low(
						index, tuple1, mode1,
						tuple2, mode2, nth_attempt + 1);

				return(ret);
			}

			diverged = TRUE;

			if (slot1->nth_rec < slot2->nth_rec) {
				/* We count neither the left nor the right
				border, thus "- 1". */
				n_rows = slot2->nth_rec - slot1->nth_rec - 1;

				if (n_rows > 0) {
					/* There is at least one row between
					the two borders pointed to by slot1
					and slot2, so on the level below the
					slots will point to non-adjacent
					pages. */
					diverged_lot = TRUE;
					divergence_level = i;
				}
			} else {
				/* It is possible that
				slot1->nth_rec >= slot2->nth_rec
				if, for example, we have a single page
				tree which contains (inf, 5, 6, supr)
				and we select where x > 20 and x < 30;
				in this case slot1->nth_rec will point
				to the supr record and slot2->nth_rec
				will point to 6. */
				n_rows = 0;
				should_count_the_left_border = false;
				should_count_the_right_border = false;
			}

		} else if (diverged && !diverged_lot) {

			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = TRUE;
				divergence_level = i;

				n_rows = 0;

				if (slot1->nth_rec < slot1->n_recs) {
					n_rows += slot1->n_recs
						- slot1->nth_rec;
				}

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;
				}
			}
		} else if (diverged_lot) {

			n_rows = btr_estimate_n_rows_in_range_on_level(
				index, slot1, slot2, n_rows,
				&is_n_rows_exact);
		}
	}
}

/** Estimates the number of rows in a given index range.
@param[in]	index	index
@param[in]	tuple1	range start, may also be empty tuple
@param[in]	mode1	search mode for range start
@param[in]	tuple2	range end, may also be empty tuple
@param[in]	mode2	search mode for range end
@return estimated number of rows */
int64_t
btr_estimate_n_rows_in_range(
	dict_index_t*	index,
	const dtuple_t*	tuple1,
	page_cur_mode_t	mode1,
	const dtuple_t*	tuple2,
	page_cur_mode_t	mode2)
{
	const int64_t	ret = btr_estimate_n_rows_in_range_low(
		index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);

	return(ret);
}
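
/* Illustrative call (hypothetical tuples; not part of the original
source): to estimate the rows matching "5 < x AND x <= 10" one would
build range tuples for the two constants and pass the matching search
modes, mirroring the border rules documented above:

	int64_t	n = btr_estimate_n_rows_in_range(
		index,
		tuple_5, PAGE_CUR_G,	// "x > 5": open left border
		tuple_10, PAGE_CUR_LE);	// "x <= 10": closed right border
*/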

/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
					number of columns that uniquely
					determine an index entry */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					its size could be for all fields or
					that of "n_unique" */
	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
					not null rows for n-column prefix */
{
	ulint	i;

	ut_ad(rec_offs_n_fields(offsets) >= n_unique);

	if (n_not_null == NULL) {
		return;
	}

	for (i = 0; i < n_unique; i++) {
		if (rec_offs_nth_sql_null(offsets, i)) {
			break;
		}

		n_not_null[i]++;
	}
}
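
/* Illustrative example (not from the original source): with
n_unique == 3, a record whose unique prefix is (1, NULL, 7) increments
n_not_null[0] only, because the loop above stops at the first SQL NULL,
whereas a record (1, 2, 7) increments n_not_null[0], n_not_null[1] and
n_not_null[2]. */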

/*******************************************************************//**
Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
index->stat_n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and store the estimates in the
array index->stat_n_non_null_key_vals[].
@return true if the index is available and we get the estimated numbers,
false if the index is unavailable. */
bool
btr_estimate_number_of_different_key_vals(
/*======================================*/
	dict_index_t*	index)	/*!< in: index */
{
	btr_cur_t	cursor;
	page_t*		page;
	rec_t*		rec;
	ulint		n_cols;
	ib_uint64_t*	n_diff;
	ib_uint64_t*	n_not_null;
	ibool		stats_null_not_equal;
	uintmax_t	n_sample_pages = 1; /* number of pages to sample */
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	ulint		i;
	ulint		j;
	uintmax_t	add_on;
	mtr_t		mtr;
	mem_heap_t*	heap		= NULL;
	ulint*		offsets_rec	= NULL;
	ulint*		offsets_next_rec = NULL;

	/* For a spatial index, no such statistics can be fetched. */
	if (dict_index_is_spatial(index)) {
		return(false);
	}

	n_cols = dict_index_get_n_unique(index);

	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
			       * n_cols
			       + dict_index_get_n_fields(index)
			       * (sizeof *offsets_rec
				  + sizeof *offsets_next_rec));

	n_diff = (ib_uint64_t*) mem_heap_zalloc(
		heap, n_cols * sizeof(n_diff[0]));

	n_not_null = NULL;

	/* Check the srv_innodb_stats_method setting, and decide whether we
	need to record non-null values, and whether NULLs are considered
	equal (by setting stats_null_not_equal) */
	switch (srv_innodb_stats_method) {
	case SRV_STATS_NULLS_IGNORED:
		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
			heap, n_cols * sizeof *n_not_null);
		/* fall through */

	case SRV_STATS_NULLS_UNEQUAL:
		/* in both the SRV_STATS_NULLS_IGNORED and
		SRV_STATS_NULLS_UNEQUAL cases, we treat NULLs as
		unequal values */
		stats_null_not_equal = TRUE;
		break;

	case SRV_STATS_NULLS_EQUAL:
		stats_null_not_equal = FALSE;
		break;

	default:
		ut_error;
	}

	if (srv_stats_sample_traditional) {
		/* It makes no sense to test more pages than are contained
		in the index, thus we lower the number if it is too high */
		if (srv_stats_transient_sample_pages > index->stat_index_size) {
			if (index->stat_index_size > 0) {
				n_sample_pages = index->stat_index_size;
			}
		} else {
			n_sample_pages = srv_stats_transient_sample_pages;
		}
	} else {
		/* New logarithmic number of pages that are estimated.
		The number of pages estimated should be between 1 and
		index->stat_index_size.

		If we have only 0 or 1 index pages then we can only take 1
		sample. We have already initialized n_sample_pages to 1.

		So taking the index size as I, the sample size as S and
		log2(I)*S as L:

		requirement 1) the upper limit of the expression must not
		exceed I;
		requirement 2) the ideal number of pages should be at
		least S;
		so the current expression is min(I, max(min(S, I), L)).

		Looking for simplifications:

		case 1: assume S < I
		min(I, max(min(S, I), L)) -> min(I, max(S, L))

		but since L = log2(I)*S and log2(I) >= 1, L > S always,
		so max(S, L) = L and we have: min(I, L)

		case 2: assume I < S
		min(I, max(min(S, I), L)) -> min(I, max(I, L))

		case 2a: L > I
		min(I, max(I, L)) -> min(I, L) -> I

		case 2b: L < I
		min(I, max(I, L)) -> min(I, I) -> I

		so all case 2 paths yield I, and our expression is:
		n_pages = S < I ? min(I, L) : I
		*/
		if (index->stat_index_size > 1) {
			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
				? ut_min(static_cast<ulint>(index->stat_index_size),
					 static_cast<ulint>(log2(index->stat_index_size)
							    * srv_stats_transient_sample_pages))
				: index->stat_index_size;
		}
	}

	/* Sanity check */
	ut_ad(n_sample_pages > 0
	      && n_sample_pages <= (index->stat_index_size <= 1
				    ? 1 : index->stat_index_size));
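
	/* Worked example of the logarithmic formula above (illustrative
	numbers): with I = index->stat_index_size = 1024 and
	S = srv_stats_transient_sample_pages = 8, L = log2(1024) * 8 = 80,
	so n_sample_pages = min(1024, 80) = 80. For a small index with
	I = 4 and S = 8, S >= I, so all 4 pages are sampled. */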

	/* We sample some pages in the index to get an estimate */

	for (i = 0; i < n_sample_pages; i++) {
		mtr_start(&mtr);

		bool	available;

		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
						    &cursor, &mtr);

		if (!available) {
			mtr_commit(&mtr);
			mem_heap_free(heap);

			return(false);
		}

		/* Count the number of different key values for each prefix of
		the key on this index page. If the prefix does not determine
		the index record uniquely in the B-tree, then we subtract one
		because otherwise our algorithm would give a wrong estimate
		for an index where there is just one key value. */

		if (!index->is_readable()) {
			mtr_commit(&mtr);
			goto exit_loop;
		}

		page = btr_cur_get_page(&cursor);

		rec = page_rec_get_next(page_get_infimum_rec(page));
		ut_d(const bool is_leaf = page_is_leaf(page));

		if (!page_rec_is_supremum(rec)) {
			not_empty_flag = 1;
			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
						      is_leaf,
						      ULINT_UNDEFINED, &heap);

			if (n_not_null != NULL) {
				btr_record_not_null_field_in_rec(
					n_cols, offsets_rec, n_not_null);
			}
		}

		while (!page_rec_is_supremum(rec)) {
			ulint	matched_fields;
			rec_t*	next_rec = page_rec_get_next(rec);
			if (page_rec_is_supremum(next_rec)) {
				total_external_size +=
					btr_rec_get_externally_stored_len(
						rec, offsets_rec);
				break;
			}

			offsets_next_rec = rec_get_offsets(next_rec, index,
							   offsets_next_rec,
							   is_leaf,
							   ULINT_UNDEFINED,
							   &heap);

			cmp_rec_rec_with_match(rec, next_rec,
					       offsets_rec, offsets_next_rec,
					       index, stats_null_not_equal,
					       &matched_fields);

			for (j = matched_fields; j < n_cols; j++) {
				/* We add one if this index record has
				a different prefix from the previous */

				n_diff[j]++;
			}

			if (n_not_null != NULL) {
				btr_record_not_null_field_in_rec(
					n_cols, offsets_next_rec, n_not_null);
			}

			total_external_size
				+= btr_rec_get_externally_stored_len(
					rec, offsets_rec);

			rec = next_rec;
			/* Initialize offsets_rec for the next round
			and assign the old offsets_rec buffer to
			offsets_next_rec. */
			{
				ulint*	offsets_tmp = offsets_rec;
				offsets_rec = offsets_next_rec;
				offsets_next_rec = offsets_tmp;
			}
		}
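
		/* Illustration: if rec and next_rec first differ at
		column k, cmp_rec_rec_with_match() reports
		matched_fields == k, and the loop above added one
		distinct-value border for every prefix length in
		[k + 1, n_cols]. */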

		if (n_cols == dict_index_get_n_unique_in_tree(index)
		    && page_has_siblings(page)) {

			/* If there is more than one leaf page in the tree,
			we add one because we know that the first record
			on the page certainly had a different prefix than the
			last record on the previous index page in the
			alphabetical order. Before this fix, if there was
			just one big record on each clustered index page, the
			algorithm grossly underestimated the number of rows
			in the table. */

			n_diff[n_cols - 1]++;
		}

		mtr_commit(&mtr);
	}

exit_loop:
	/* If we saw k borders between different key values on
	n_sample_pages leaf pages, we can estimate how many
	there will be in index->stat_n_leaf_pages */

	/* We must take into account that our sample actually represents
	also the pages used for external storage of fields (those pages are
	included in index->stat_n_leaf_pages) */

	for (j = 0; j < n_cols; j++) {
		index->stat_n_diff_key_vals[j]
			= BTR_TABLE_STATS_FROM_SAMPLE(
				n_diff[j], index, n_sample_pages,
				total_external_size, not_empty_flag);

		/* If the tree is small, smaller than
		10 * n_sample_pages + total_external_size, then
		the above estimate is ok. For bigger trees it is common that we
		do not see any borders between key values in the few pages
		we pick. But still there may be n_sample_pages
		different key values, or even more. Let us try to approximate
		that: */

		add_on = index->stat_n_leaf_pages
			/ (10 * (n_sample_pages
				 + total_external_size));

		if (add_on > n_sample_pages) {
			add_on = n_sample_pages;
		}
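
		/* Illustration (hypothetical numbers): with
		index->stat_n_leaf_pages == 1000, n_sample_pages == 20 and
		total_external_size == 0, add_on = 1000 / (10 * 20) = 5,
		below the n_sample_pages cap, so 5 is added to the
		extrapolated estimate below. */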

		index->stat_n_diff_key_vals[j] += add_on;

		index->stat_n_sample_sizes[j] = n_sample_pages;

		/* Update stat_n_non_null_key_vals[] with our
		sampled result. stat_n_non_null_key_vals[] is created
		and initialized to zero in dict_index_add_to_cache(),
		along with the stat_n_diff_key_vals[] array */
		if (n_not_null != NULL) {
			index->stat_n_non_null_key_vals[j] =
				 BTR_TABLE_STATS_FROM_SAMPLE(
					n_not_null[j], index, n_sample_pages,
					total_external_size, not_empty_flag);
		}
	}

	mem_heap_free(heap);

	return(true);
}

/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***********************************************************//**
Gets the offset of the pointer to the externally stored part of a field.
@return offset of the pointer to the externally stored part */
static
ulint
btr_rec_get_field_ref_offs(
/*=======================*/
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		n)	/*!< in: index of the external field */
{
	ulint	field_ref_offs;
	ulint	local_len;

	ut_a(rec_offs_nth_extern(offsets, n));
	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
	ut_a(local_len != UNIV_SQL_NULL);
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
}

/** Gets a pointer to the externally stored part of a field.
@param rec record
@param offsets rec_get_offsets(rec)
@param n index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)			\
	((rec) + btr_rec_get_field_ref_offs(offsets, n))
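
/* Layout note, derived from the accessors above and the writers below:
the field reference occupies the last BTR_EXTERN_FIELD_REF_SIZE bytes
of the locally stored part of the column and holds the BLOB's space id
(BTR_EXTERN_SPACE_ID), first page number (BTR_EXTERN_PAGE_NO), byte
offset on that page (BTR_EXTERN_OFFSET), and an 8-byte length
(BTR_EXTERN_LEN) whose most significant byte also carries the
BTR_EXTERN_OWNER_FLAG and BTR_EXTERN_INHERITED_FLAG bits. */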

/** Gets the externally stored size of a record, in units of a database page.
@param[in]	rec	record
@param[in]	offsets	array returned by rec_get_offsets()
@return externally stored part, in units of a database page */
ulint
btr_rec_get_externally_stored_len(
	const rec_t*	rec,
	const ulint*	offsets)
{
	ulint	n_fields;
	ulint	total_extern_len = 0;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			ulint	extern_len = mach_read_from_4(
				btr_rec_get_field_ref(rec, offsets, i)
				+ BTR_EXTERN_LEN + 4);

			total_extern_len += ut_calc_align(extern_len,
							  UNIV_PAGE_SIZE);
		}
	}

	return(total_extern_len / UNIV_PAGE_SIZE);
}
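
/* For example, with the default 16KiB UNIV_PAGE_SIZE, a column whose
externally stored part is 20000 bytes is rounded up by ut_calc_align()
to 32768 bytes, so it contributes 2 pages to the returned total. */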

/*******************************************************************//**
Sets the ownership bit of an externally stored field in a record. */
static
void
btr_cur_set_ownership_of_extern_field(
/*==================================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: clustered index record */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		i,	/*!< in: field number */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	byte*	data;
	ulint	local_len;
	ulint	byte_val;

	data = rec_get_nth_field(rec, offsets, i, &local_len);
	ut_ad(rec_offs_nth_extern(offsets, i));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

	if (val) {
		byte_val &= ~BTR_EXTERN_OWNER_FLAG;
	} else {
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		byte_val |= BTR_EXTERN_OWNER_FLAG;
	}

	if (page_zip) {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
	} else if (mtr != NULL) {

		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
				 MLOG_1BYTE, mtr);
	} else {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
	}
}

/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record which is inserted elsewhere in the
index tree. In purge only the owner of an externally stored field is allowed
to free the field. */
void
btr_cur_disown_inherited_fields(
/*============================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	const upd_t*	update,	/*!< in: update vector */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_any_extern(offsets));

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)
		    && !upd_get_field_by_field_no(update, i, false)) {
			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, FALSE, mtr);
		}
	}
}

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a not delete
marked record always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	ulint	n;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n = rec_offs_n_fields(offsets);

	if (!rec_offs_any_extern(offsets)) {

		return;
	}

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, TRUE, mtr);
		}
	}
}

/*******************************************************************//**
Flags the data tuple fields that are marked as extern storage in the
update vector.  We use this function to remember which fields we must
mark as extern storage in a record inserted for an update.
@return number of flagged external columns */
ulint
btr_push_update_extern_fields(
/*==========================*/
	dtuple_t*	tuple,	/*!< in/out: data tuple */
	const upd_t*	update,	/*!< in: update vector */
	mem_heap_t*	heap)	/*!< in: memory heap */
{
	ulint			n_pushed	= 0;
	ulint			n;
	const upd_field_t*	uf;

	uf = update->fields;
	n = upd_get_n_fields(update);

	for (; n--; uf++) {
		if (dfield_is_ext(&uf->new_val)) {
			dfield_t*	field
				= dtuple_get_nth_field(tuple, uf->field_no);

			if (!dfield_is_ext(field)) {
				dfield_set_ext(field);
				n_pushed++;
			}

			switch (uf->orig_len) {
				byte*	data;
				ulint	len;
				byte*	buf;
			case 0:
				break;
			case BTR_EXTERN_FIELD_REF_SIZE:
				/* Restore the original locally stored
				part of the column.  In the undo log,
				InnoDB writes a longer prefix of externally
				stored columns, so that column prefixes
				in secondary indexes can be reconstructed. */
				dfield_set_data(field,
						(byte*) dfield_get_data(field)
						+ dfield_get_len(field)
						- BTR_EXTERN_FIELD_REF_SIZE,
						BTR_EXTERN_FIELD_REF_SIZE);
				dfield_set_ext(field);
				break;
			default:
				/* Reconstruct the original locally
				stored part of the column.  The data
				will have to be copied. */
				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);

				data = (byte*) dfield_get_data(field);
				len = dfield_get_len(field);

				buf = (byte*) mem_heap_alloc(heap,
							     uf->orig_len);
				/* Copy the locally stored prefix. */
				memcpy(buf, data,
				       uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE);
				/* Copy the BLOB pointer. */
				memcpy(buf + uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE,
				       data + len - BTR_EXTERN_FIELD_REF_SIZE,
				       BTR_EXTERN_FIELD_REF_SIZE);

				dfield_set_data(field, buf, uf->orig_len);
				dfield_set_ext(field);
			}
		}
	}

	return(n_pushed);
}

/*******************************************************************//**
Returns the length of a BLOB part stored on the header page.
@return part length */
static
ulint
btr_blob_get_part_len(
/*==================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
}

/*******************************************************************//**
Returns the page number where the next BLOB part is stored.
@return page number or FIL_NULL if no more pages */
static
ulint
btr_blob_get_next_page_no(
/*======================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
}
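
/* Layout note: on each BLOB page the part header follows FIL_PAGE_DATA
and consists of the 4-byte part length (BTR_BLOB_HDR_PART_LEN) followed
by the 4-byte next-page number (BTR_BLOB_HDR_NEXT_PAGE_NO); the BLOB
bytes themselves start at FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE, as can be
seen in btr_store_big_rec_extern_fields() below. */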

/*******************************************************************//**
Deallocate a buffer block that was reserved for a BLOB part. */
static
void
btr_blob_free(
/*==========*/
	dict_index_t*	index,	/*!< in: index */
	buf_block_t*	block,	/*!< in: buffer block */
	ibool		all,	/*!< in: TRUE=remove also the compressed page
				if there is one */
	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
{
	buf_pool_t*	buf_pool = buf_pool_from_block(block);
	ulint		space = block->page.id.space();
	ulint		page_no	= block->page.id.page_no();

	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));

	mtr_commit(mtr);

	buf_pool_mutex_enter(buf_pool);

	/* Only free the block if it is still allocated to
	the same file page. */

	if (buf_block_get_state(block)
	    == BUF_BLOCK_FILE_PAGE
	    && block->page.id.space() == space
	    && block->page.id.page_no() == page_no) {

		if (!buf_LRU_free_page(&block->page, all)
		    && all && block->page.zip.data) {
			/* Attempt to deallocate the uncompressed page
			if the whole block cannot be deallocated. */

			buf_LRU_free_page(&block->page, false);
		}
	}

	buf_pool_mutex_exit(buf_pool);
}

/** Helper class used while writing blob pages, during insert or update. */
struct btr_blob_log_check_t {
	/** Persistent cursor on a clustered index record with blobs. */
	btr_pcur_t*	m_pcur;
	/** Mini-transaction holding the latches for m_pcur */
	mtr_t*		m_mtr;
	/** rec_get_offsets(rec, index); offset of clust_rec */
	const ulint*	m_offsets;
	/** The block containing clustered record */
	buf_block_t**	m_block;
	/** The clustered record pointer */
	rec_t**		m_rec;
	/** The blob operation code */
	enum blob_op	m_op;

	/** Constructor
	@param[in]	pcur		persistent cursor on a clustered
					index record with blobs.
	@param[in]	mtr		mini-transaction holding latches for
					pcur.
	@param[in]	offsets		offsets of the clust_rec
	@param[in,out]	block		record block containing pcur record
	@param[in,out]	rec		the clustered record pointer
	@param[in]	op		the blob operation code */
	btr_blob_log_check_t(
		btr_pcur_t*	pcur,
		mtr_t*		mtr,
		const ulint*	offsets,
		buf_block_t**	block,
		rec_t**		rec,
		enum blob_op	op)
		: m_pcur(pcur),
		  m_mtr(mtr),
		  m_offsets(offsets),
		  m_block(block),
		  m_rec(rec),
		  m_op(op)
	{
		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
		ut_ad((*m_block)->frame == page_align(*m_rec));
		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
	}

	/** Check if there is enough space in the redo log. Commit and
	restart the mini-transaction. */
	void check()
	{
		dict_index_t*	index = m_pcur->index();
		ulint		offs = 0;
		ulint		page_no = ULINT_UNDEFINED;
		FlushObserver*	observer = m_mtr->get_flush_observer();

		if (m_op == BTR_STORE_INSERT_BULK) {
			offs = page_offset(*m_rec);
			page_no = page_get_page_no(
				buf_block_get_frame(*m_block));

			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
		} else {
			btr_pcur_store_position(m_pcur, m_mtr);
		}
		m_mtr->commit();

		DEBUG_SYNC_C("blob_write_middle");

		log_free_check();

		DEBUG_SYNC_C("blob_write_middle_after_check");

		const mtr_log_t log_mode = m_mtr->get_log_mode();
		m_mtr->start();
		m_mtr->set_log_mode(log_mode);
		m_mtr->set_named_space(index->space);
		m_mtr->set_flush_observer(observer);

		if (m_op == BTR_STORE_INSERT_BULK) {
			page_id_t       page_id(dict_index_get_space(index),
						page_no);
			page_size_t     page_size(dict_table_page_size(
						index->table));
			page_cur_t*	page_cur = &m_pcur->btr_cur.page_cur;

			mtr_x_lock(dict_index_get_lock(index), m_mtr);
			page_cur->block = btr_block_get(
				page_id, page_size, RW_X_LATCH, index, m_mtr);
			page_cur->rec = buf_block_get_frame(page_cur->block)
				+ offs;

			buf_block_buf_fix_dec(page_cur->block);
		} else {
			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
			bool ret = btr_pcur_restore_position(
				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
				m_pcur, m_mtr);

			ut_a(ret);
		}

		*m_block	= btr_pcur_get_block(m_pcur);
		*m_rec		= btr_pcur_get_rec(m_pcur);

		ut_d(rec_offs_make_valid(
			*m_rec, index, const_cast<ulint*>(m_offsets)));

		ut_ad(m_mtr->memo_contains_page_flagged(
		      *m_rec,
6832
		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));

		ut_ad(mtr_memo_contains_flagged(m_mtr,
		      dict_index_get_lock(index),
		      MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK));
	}
};
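
/* Usage note: btr_store_big_rec_extern_fields() below constructs one
btr_blob_log_check_t and invokes check() once every commit_freq (= 4)
BLOB pages written, so that log_free_check() can run and a single
mini-transaction never accumulates more redo than the log can hold. */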

/*******************************************************************//**
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec.  The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from the leaf node
file segment of the index tree.

TODO: If the allocation extends the tablespace, it will not be redo logged, in
any mini-transaction.  Tablespace extension should be redo-logged, so that
recovery will not fail when the big_rec was written to the extended portion of
the file, in case the file was somehow truncated in the crash.

@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
dberr_t
btr_store_big_rec_extern_fields(
/*============================*/
	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. if
					btr_mtr is restarted, then this can
					be repositioned. */
	ulint*		offsets,	/*!< in/out: rec_get_offsets() on
					pcur. the "external storage" flags
					in offsets will correctly correspond
					to rec when this function returns */
	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
					to be stored externally */
	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
					latches to the clustered index. can be
					committed and restarted. */
	enum blob_op	op)		/*!< in: operation code */
{
	ulint		rec_page_no;
	byte*		field_ref;
	ulint		extern_len;
	ulint		store_len;
	ulint		page_no;
	ulint		space_id;
	ulint		prev_page_no;
	ulint		hint_page_no;
	ulint		i;
	mtr_t		mtr;
	mtr_t		mtr_bulk;
	mem_heap_t*	heap = NULL;
	page_zip_des_t*	page_zip;
	z_stream	c_stream;
	dberr_t		error		= DB_SUCCESS;
	dict_index_t*	index		= pcur->index();
	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
	rec_t*		rec		= btr_pcur_get_rec(pcur);

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr_is_block_fix(
		btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
	ut_a(dict_index_is_clust(index));

	ut_a(dict_table_page_size(index->table)
		.equals_to(rec_block->page.size));

	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
				      &rec, op);
	page_zip = buf_block_get_page_zip(rec_block);
	space_id = rec_block->page.id.space();
	rec_page_no = rec_block->page.id.page_no();
	ut_a(fil_page_index_page_check(page_align(rec))
	     || op == BTR_STORE_INSERT_BULK);

	if (page_zip) {
		int	err;

		/* Zlib deflate needs 128 kilobytes for the default
		window size, plus 512 << memLevel, plus a few
		kilobytes for small objects.  We use reduced memLevel
		to limit the memory consumption, and preallocate the
		heap, hoping to avoid memory fragmentation. */
		heap = mem_heap_create(250000);
		page_zip_set_alloc(&c_stream, heap);

		err = deflateInit2(&c_stream, page_zip_level,
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must either be zero or they must be pointers to inherited
	columns, owned by this record or an earlier record version. */
	for (i = 0; i < big_rec_vec->n_fields; i++) {
		field_ref = btr_rec_get_field_ref(
			rec, offsets, big_rec_vec->fields[i].field_no);

		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
		/* Either this must be an update in place,
		or the BLOB must be inherited, or the BLOB pointer
		must be zero (will be written in this function). */
		ut_a(op == BTR_STORE_UPDATE
		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		     || !memcmp(field_ref, field_ref_zero,
				BTR_EXTERN_FIELD_REF_SIZE));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	const page_size_t	page_size(dict_table_page_size(index->table));

	/* Space available in compressed page to carry blob data */
	const ulint	payload_size_zip = page_size.physical()
		- FIL_PAGE_DATA;

	/* Space available in uncompressed page to carry blob data */
	const ulint	payload_size = page_size.physical()
		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
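
	/* Illustration, assuming the conventional header sizes
	(FIL_PAGE_DATA = 38, BTR_BLOB_HDR_SIZE = 8, FIL_PAGE_DATA_END = 8):
	with a 16KiB page, payload_size = 16384 - 38 - 8 - 8 = 16330 bytes
	of BLOB data fit on each uncompressed BLOB page, while a compressed
	BLOB page carries payload_size_zip = 16384 - 38 = 16346 bytes of
	deflate output. */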

	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {
		const ulint field_no = big_rec_vec->fields[i].field_no;

		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		/* A zero BLOB pointer should have been initially inserted. */
		ut_a(!memcmp(field_ref, field_ref_zero,
			     BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		extern_len = big_rec_vec->fields[i].len;
		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
				   extern_len);

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		if (page_zip) {
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (Bytef*)
				big_rec_vec->fields[i].data;
			c_stream.avail_in = static_cast<uInt>(extern_len);
		}

		for (ulint blob_npages = 0;; ++blob_npages) {
			buf_block_t*	block;
			page_t*		page;
			const ulint	commit_freq = 4;
			ulint		r_extents;

			ut_ad(page_align(field_ref) == page_align(rec));

			if (!(blob_npages % commit_freq)) {

				redo_log.check();

				field_ref = btr_rec_get_field_ref(
					rec, offsets, field_no);

				page_zip = buf_block_get_page_zip(rec_block);
				rec_page_no = rec_block->page.id.page_no();
			}

			mtr_start(&mtr);
			mtr.set_named_space(index->space);
			mtr.set_log_mode(btr_mtr->get_log_mode());
			mtr.set_flush_observer(btr_mtr->get_flush_observer());

			buf_page_get(rec_block->page.id,
				     rec_block->page.size, RW_X_LATCH, &mtr);

			if (prev_page_no == FIL_NULL) {
				hint_page_no = 1 + rec_page_no;
			} else {
				hint_page_no = prev_page_no + 1;
			}

			mtr_t	*alloc_mtr;

			if (op == BTR_STORE_INSERT_BULK) {
				mtr_start(&mtr_bulk);
				mtr_bulk.set_spaces(mtr);
				alloc_mtr = &mtr_bulk;
			} else {
				alloc_mtr = &mtr;
			}

			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
						      FSP_BLOB, alloc_mtr,
						      1)) {

				mtr_commit(alloc_mtr);
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;
			}

			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
					       0, alloc_mtr, &mtr);

			alloc_mtr->release_free_extents(r_extents);

			if (op == BTR_STORE_INSERT_BULK) {
				mtr_commit(&mtr_bulk);
			}

			ut_a(block != NULL);

			page_no = block->page.id.page_no();
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(
					page_id_t(space_id, prev_page_no),
					rec_block->page.size,
					RW_X_LATCH, &mtr);

				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (page_zip) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}

			} else if (dict_index_is_online_ddl(index)) {
				row_log_table_blob_alloc(index, page_no);
			}

			if (page_zip) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				/* Write FIL_PAGE_TYPE to the redo log
				separately, before logging any other
				changes to the page, so that the debug
				assertions in
				recv_parse_or_apply_log_rec_body() can
				be made simpler.  Before InnoDB Plugin
				1.0.4, the initialization of
				FIL_PAGE_TYPE was logged as part of
				the mlog_log_string() below. */

				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 prev_page_no == FIL_NULL
						 ? FIL_PAGE_TYPE_ZBLOB
						 : FIL_PAGE_TYPE_ZBLOB2,
						 MLOG_2BYTES, &mtr);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out = static_cast<uInt>(
					payload_size_zip);

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area.  This
				information could be useful in
				debugging.  Later, we might want to
				implement the possibility to relocate
				BLOB pages.  Then, we would need to be
				able to adjust the BLOB pointer in the
				record.  We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize().  However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged.
				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
				used by R-tree index for a Split Sequence
				Number */
				ut_ad(!dict_index_is_spatial(index));

				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page
						+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
						page_zip_get_size(page_zip)
						- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				/* We compress a page when finishing
				a bulk insert. */
				if (op != BTR_STORE_INSERT_BULK) {
					page_zip_write_blob_ptr(
						page_zip, rec, index, offsets,
						field_no, &mtr);
				}

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(index, block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				if (extern_len > payload_size) {
					store_len = payload_size;
				} else {
					store_len = extern_len;
				}

				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, &mtr);

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES,
							 &mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}

		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;);
7278 7279

		rec_offs_make_nth_extern(offsets, field_no);
osku's avatar
osku committed
7280 7281
	}

7282 7283
func_exit:
	if (page_zip) {
7284
		deflateEnd(&c_stream);
7285
	}
7286

7287
	if (heap != NULL) {
7288
		mem_heap_free(heap);
7289 7290
	}

7291 7292 7293 7294 7295 7296
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
7297 7298
		}

7299 7300
		field_ref = btr_rec_get_field_ref(rec, offsets, i);

7301 7302
		/* The pointer must not be zero if the operation
		succeeded. */
7303
		ut_a(0 != memcmp(field_ref, field_ref_zero,
7304 7305
				 BTR_EXTERN_FIELD_REF_SIZE)
		     || error != DB_SUCCESS);
7306 7307 7308 7309
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7310
	return(error);
osku's avatar
osku committed
7311 7312
}

/*******************************************************************//**
Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
static
void
btr_check_blob_fil_page_type(
/*=========================*/
	ulint		space_id,	/*!< in: space id */
	ulint		page_no,	/*!< in: page number */
	const page_t*	page,		/*!< in: page */
	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
{
	ulint	type = fil_page_get_type(page);

	ut_a(space_id == page_get_space_id(page));
	ut_a(page_no == page_get_page_no(page));

	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
		ulint	flags = fil_space_get_flags(space_id);

#ifndef UNIV_DEBUG /* Improve debug test coverage */
		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
			/* Old versions of InnoDB did not initialize
			FIL_PAGE_TYPE on BLOB pages.  Do not print
			anything about the type mismatch when reading
			a BLOB page that is in Antelope format. */
			return;
		}
#endif /* !UNIV_DEBUG */

		ib::fatal() << "FIL_PAGE_TYPE=" << type
			<< " on BLOB " << (read ? "read" : "purge")
			<< " space " << space_id << " page " << page_no
			<< " flags " << flags;
	}
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field;
in a rollback we may have the additional condition that the field must
not be inherited. */
void
btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/*!< in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
					page) */
	byte*		field_ref,	/*!< in/out: field reference */
	const rec_t*	rec,		/*!< in: record containing field_ref, for
					page_zip_write_blob_ptr(), or NULL */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					or NULL */
	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
					to rec, or NULL if rec == NULL */
	ulint		i,		/*!< in: field number of field_ref;
					ignored if rec == NULL */
	bool		rollback,	/*!< in: performing rollback? */
	mtr_t*		local_mtr)	/*!< in: mtr
					containing the latch to data and an
					X-latch to the index tree */
{
	page_t*		page;
	const ulint	space_id	= mach_read_from_4(
		field_ref + BTR_EXTERN_SPACE_ID);
	const ulint	start_page	= mach_read_from_4(
		field_ref + BTR_EXTERN_PAGE_NO);
	ulint		page_no;
	ulint		next_page_no;
	mtr_t		mtr;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr_is_page_fix(
		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
	ut_ad(local_mtr->is_named_space(
		      page_get_space_id(page_align(field_ref))));

	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* In the rollback, we may encounter a clustered index
		record with some unwritten off-page columns. There is
		nothing to free then. */
		ut_a(rollback);
		return;
	}

	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
	        & ~((BTR_EXTERN_OWNER_FLAG
	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
	ut_ad(space_id == index->space);

	const page_size_t	ext_page_size(dict_table_page_size(index->table));
	const page_size_t&	rec_page_size(rec == NULL
					      ? univ_page_size
					      : ext_page_size);
	if (rec == NULL) {
		/* This is a call from row_purge_upd_exist_or_extern(). */
		ut_ad(!page_zip);
	}

	for (;;) {
#ifdef UNIV_DEBUG
		buf_block_t*	rec_block;
#endif /* UNIV_DEBUG */
		buf_block_t*	ext_block;

		mtr_start(&mtr);
		mtr.set_spaces(*local_mtr);
		mtr.set_log_mode(local_mtr->get_log_mode());

		ut_ad(!dict_table_is_temporary(index->table)
		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);

		const page_t*	p = page_align(field_ref);

		const page_id_t	page_id(page_get_space_id(p),
					page_get_page_no(p));

#ifdef UNIV_DEBUG
		rec_block =
#endif /* UNIV_DEBUG */
		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || (rollback
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {

			/* Do not free */
			mtr_commit(&mtr);

			return;
		}

		if (page_no == start_page && dict_index_is_online_ddl(index)) {
			row_log_table_blob_free(index, start_page);
		}

		ext_block = buf_page_get(
			page_id_t(space_id, page_no), ext_page_size,
			RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(ext_block);

		if (ext_page_size.is_compressed()) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				ut_error;
			}
			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

			btr_page_free(index, ext_block, &mtr, true);

			if (page_zip != NULL) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
						0);
				page_zip_write_blob_ptr(page_zip, rec, index,
							offsets, i, &mtr);
			} else {
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_PAGE_NO,
						 next_page_no,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4, 0,
						 MLOG_4BYTES, &mtr);
			}
		} else {
			ut_a(!page_zip);
			btr_check_blob_fil_page_type(space_id, page_no, page,
						     FALSE);

			next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
			btr_page_free(index, ext_block, &mtr, true);

			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
					 next_page_no,
					 MLOG_4BYTES, &mtr);
			/* Zero out the BLOB length.  If the server
			crashes during the execution of this function,
			trx_rollback_or_clean_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
					 0,
					 MLOG_4BYTES, &mtr);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(index, ext_block, TRUE, &mtr);
	}
}
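
/* Note: each iteration of the loop above frees the head page of the
BLOB chain and rewrites BTR_EXTERN_PAGE_NO in the field reference to
point to the next page, so the reference always describes a consistent
(shorter) chain even if freeing is interrupted part-way through. */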

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(dict_index_is_clust(index));
	ut_ad(page_rec_is_leaf(rec));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, page_zip, i, rollback, mtr);
		}
	}
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, page_zip,
				ufield->field_no, rollback, mtr);
		}
	}
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB.  The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len	= 0;

	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(page_id_t(space_id, page_no),
				     univ_page_size, RW_S_LATCH, &mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(block);

		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		page_no = btr_blob_get_next_page_no(blob_header);

		mtr_commit(&mtr);

		if (page_no == FIL_NULL || copy_len != part_len) {
			UNIV_MEM_ASSERT_RW(buf, copied_len);
			return(copied_len);
		}

		/* On other BLOB pages except the first the BLOB header
		always is at the page data start: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}

/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the field,
or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	compressed BLOB page size
@param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = static_cast<uInt>(len);
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(page_size.is_compressed());
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);

	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly.  Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
					 page_size);

		if (UNIV_UNLIKELY(!bpage)) {
			ib::error() << "Cannot load compressed BLOB "
				<< page_id_t(space_id, page_no);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {

			ib::error() << "Unexpected type "
				<< fil_page_get_type(bpage->zip.data)
				<< " of compressed BLOB page "
				<< page_id_t(space_id, page_no);

			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at the page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = static_cast<uInt>(page_size.physical()
						      - offset);

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ib::error() << "inflate() of compressed BLOB page "
				<< page_id_t(space_id, page_no)
				<< " returned " << err
				<< " (" << d_stream.msg << ")";

		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ib::error()
					<< "Unexpected end of compressed "
					<< "BLOB page "
					<< page_id_t(space_id, page_no);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On BLOB pages other than the first, the next-page
		pointer is at FIL_PAGE_NEXT in the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}
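
/* For reference, btr_copy_zblob_prefix() is the standard zlib
streaming-inflate pattern, feeding the payload of one compressed BLOB
page per iteration.  A condensed sketch of that pattern (error
handling and page latching omitted; see zlib.h for the API):

	z_stream	strm;
	strm.next_out = buf;
	strm.avail_out = static_cast<uInt>(len);
	strm.next_in = Z_NULL;
	strm.avail_in = 0;
	inflateInit(&strm);
	while (another page remains) {
		strm.next_in = page_payload;
		strm.avail_in = payload_len;
		inflate(&strm, Z_NO_FLUSH);
	}
	inflate(&strm, Z_FINISH);
	inflateEnd(&strm);

The first page of the chain has type FIL_PAGE_TYPE_ZBLOB and every
later page FIL_PAGE_TYPE_ZBLOB2; the pages are linked by a 4-byte
next-page number, read at FIL_PAGE_NEXT on the later pages. */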

/** Copies the prefix of an externally stored field of a record.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the
field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	space_id	space id of the first BLOB page
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	if (len == 0) {
		return(0);
	}

	if (page_size.is_compressed()) {
		return(btr_copy_zblob_prefix(buf, len, page_size,
					     space_id, page_no, offset));
	} else {
		ut_ad(page_size.equals_to(univ_page_size));
		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
	}
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record must be protected by a lock or a page latch.
@param[out]	buf		the field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	local_len	length of data, in bytes
@return the length of the copied field, or 0 if the column was being
or has been deleted */
ulint
btr_copy_externally_stored_field_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	const byte*		data,
	ulint			local_len)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted.  Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
							     page_size,
							     space_id, page_no,
							     offset));
}
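
/* Hypothetical caller sketch for the function above (the buffer size
and the names 'prefix' and 'n' are local to this illustration): copy
at most 1024 bytes of a column whose locally stored part is
(data, local_len):

	byte	prefix[1024];
	ulint	n = btr_copy_externally_stored_field_prefix(
		prefix, sizeof prefix, page_size, data, local_len);

A return value of 0 means the BLOB is half-deleted and the prefix is
unavailable to the caller. */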

/** Copies an externally stored field of a record to mem heap.
The clustered index record must be protected by a lock or a page latch.
@param[out]	len		length of the whole field
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	page_size	BLOB page size
@param[in]	local_len	length of data
@param[in,out]	heap		mem heap
@return the whole field copied to heap */
byte*
btr_copy_externally_stored_field(
	ulint*			len,
	const byte*		data,
	const page_size_t&	page_size,
	ulint			local_len,
	mem_heap_t*		heap)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	extern_len;
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      page_size,
							      space_id,
							      page_no, offset);

	return(buf);
}
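
/* Note: BTR_EXTERN_LEN above addresses an 8-byte big-endian length
field, of which only the low 4 bytes carry the length, hence the
"+ 4" in the mach_read_from_4() call.  The first byte of that field
is reserved for the BTR_EXTERN_OWNER_FLAG and
BTR_EXTERN_INHERITED_FLAG bits (see btr0cur.h). */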

/** Copies an externally stored field of a record to mem heap.
@param[in]	rec		record in a clustered index; must be
protected by a lock or a page latch
@param[in]	offsets		array returned by rec_get_offsets()
@param[in]	page_size	BLOB page size
@param[in]	no		field number
@param[out]	len		length of the field
@param[in,out]	heap		mem heap
@return the field copied to heap, or NULL if the field is incomplete */
byte*
btr_rec_copy_externally_stored_field(
	const rec_t*		rec,
	const ulint*		offsets,
	const page_size_t&	page_size,
	ulint			no,
	ulint*			len,
	mem_heap_t*		heap)
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		recv_recovery_rollback_active() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						page_size, local_len, heap));
}
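
/* Hypothetical usage sketch for the function above (names local to
this illustration): fetch externally stored column 'no' of a
clustered index record into a caller-owned heap:

	mem_heap_t*	heap = mem_heap_create(1024);
	ulint		blob_len;
	byte*		blob = btr_rec_copy_externally_stored_field(
		rec, offsets, page_size, no, &blob_len, heap);

	... use blob[0 .. blob_len - 1] ...

	mem_heap_free(heap);

A non-NULL result points to blob_len bytes that stay valid until
mem_heap_free(heap); NULL means the BLOB had not been written yet and
the record must be treated as incomplete. */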