fsp0fsp.cc 159 KB
Newer Older
1 2
/*****************************************************************************

Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2023, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

20
/******************************************************************//**
@file fsp/fsp0fsp.cc
File space management

Created 11/29/1995 Heikki Tuuri
***********************************************************************/

27 28 29
#include <memory>
#include <cctype>
#include <cstdlib>
30 31
#include <thread>

osku's avatar
osku committed
32 33
#include "fsp0fsp.h"
#include "buf0buf.h"
34
#include "buf0flu.h"
osku's avatar
osku committed
35
#include "fil0fil.h"
36
#include "fil0crypt.h"
osku's avatar
osku committed
37 38
#include "mtr0log.h"
#include "ut0byte.h"
39
#include "page0page.h"
40 41 42 43 44 45 46
#include "srv0srv.h"
#include "srv0start.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "dict0boot.h"
#include "log0log.h"
#include "dict0mem.h"
47
#include "fsp0types.h"
48
#include "log.h"
49
#include "trx0trx.h"
50

51 52 53 54
/** Returns the first extent descriptor for a segment.
We think of the extent lists of the segment catenated in the order
FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@param[in]	inode		segment inode
55
@param[in]	space		tablespace
56
@param[in,out]	mtr		mini-transaction
57
@param[out]	err		error code
58
@return the first extent descriptor, or NULL if none */
59
MY_ATTRIBUTE((nonnull, warn_unused_result))
osku's avatar
osku committed
60 61 62
static
xdes_t*
fseg_get_first_extent(
63
	fseg_inode_t*		inode,
64
	const fil_space_t*	space,
65 66
	mtr_t*			mtr,
	dberr_t*		err);
67

68
ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Add extents that lie above the free limit to the free list.
An extent that happens to contain an extent descriptor page is put to the
FSP_FREE_FRAG list instead, with that page marked as used.
@param[in]	init_space	true if this is a single-table tablespace
and we are only initializing the first extent and the first bitmap pages;
then we will not allocate more extents
@param[in,out]	space		tablespace
@param[in,out]	header		tablespace header
@param[in,out]	mtr		mini-transaction
@return error code */
static dberr_t fsp_fill_free_list(bool init_space, fil_space_t *space,
                                  buf_block_t *header, mtr_t *mtr);

/** Allocates a single free page from a segment.
87 88
This function implements the intelligent allocation strategy which tries to
minimize file space fragmentation.
89 90
@param[in,out]	space			tablespace
@param[in,out]	seg_inode		segment inode
91
@param[in,out]	iblock			segment inode page
92 93 94 95 96 97
@param[in]	hint			hint of which page would be desirable
@param[in]	direction		if the new page is needed because of
an index page split, and records are inserted there in order, into which
direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
@param[in,out]	mtr			mini-transaction
@param[in,out]	init_mtr		mtr or another mini-transaction in
98
which the page should be initialized.
99 100 101
@param[out]	err			error code
@return the allocated page
@retval nullptr	if no page could be allocated */
osku's avatar
osku committed
102
static
103
buf_block_t*
osku's avatar
osku committed
104
fseg_alloc_free_page_low(
105 106
	fil_space_t*		space,
	fseg_inode_t*		seg_inode,
107
	buf_block_t*		iblock,
108
	uint32_t		hint,
109 110
	byte			direction,
#ifdef UNIV_DEBUG
111 112
	bool			has_done_reservation,
	/*!< whether the space has already been reserved */
113
#endif /* UNIV_DEBUG */
114
	mtr_t*			mtr,
115 116 117
	mtr_t*			init_mtr,
	dberr_t*		err)
	MY_ATTRIBUTE((nonnull, warn_unused_result));
118

119
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Get the tablespace header block (page 0), SX-latched.
@param[in]      space           tablespace
@param[in,out]  mtr             mini-transaction
@param[out]     err             error code; DB_SUCCESS if the page was
                                already latched by mtr, DB_CORRUPTION if the
                                tablespace ID stored in the page does not match
@return pointer to the space header, page SX-latched
@retval nullptr if the page cannot be retrieved or is corrupted */
static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr,
                                   dberr_t *err)
{
  const page_id_t header_id{space->id, 0};

  /* If the mini-transaction reports the page as already latched,
  hand out that block without any further checks. */
  if (buf_block_t *latched=
      mtr->get_already_latched(header_id, MTR_MEMO_PAGE_SX_FIX))
  {
    *err= DB_SUCCESS;
    return latched;
  }

  buf_block_t *header= buf_page_get_gen(header_id, space->zip_size(),
                                        RW_SX_LATCH, nullptr,
                                        BUF_GET_POSSIBLY_FREED, mtr, err);
  if (!header)
    return nullptr;

  /* The tablespace ID stored in the header must be that of the tablespace. */
  if (mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
                       header->page.frame) == space->id)
    return header;

  *err= DB_CORRUPTION;
  return nullptr;
}

149 150
/** Set the XDES_FREE_BIT of a page.
@tparam         free    desired value of XDES_FREE_BIT
151
@param[in]      block   extent descriptor block
152 153 154 155
@param[in,out]  descr   extent descriptor
@param[in]      offset  page offset within the extent
@param[in,out]  mtr     mini-transaction */
template<bool free>
156 157
inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
                          ulint offset, mtr_t *mtr)
osku's avatar
osku committed
158
{
Marko Mäkelä's avatar
Marko Mäkelä committed
159 160
  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
                                   MTR_MEMO_PAGE_X_FIX));
161
  ut_ad(offset < FSP_EXTENT_SIZE);
162
  ut_ad(page_align(descr) == block.page.frame);
163 164 165 166 167 168 169 170 171 172
  compile_time_assert(XDES_BITS_PER_PAGE == 2);
  compile_time_assert(XDES_FREE_BIT == 0);
  compile_time_assert(XDES_CLEAN_BIT == 1);

  ulint index= XDES_BITS_PER_PAGE * offset;
  byte *b= &descr[XDES_BITMAP + (index >> 3)];
  /* xdes_init() should have set all XDES_CLEAN_BIT. */
  ut_ad(!(~*b & 0xaa));
  /* Clear or set XDES_FREE_BIT. */
  byte val= free
173 174
    ? static_cast<byte>(*b | 1 << (index & 7))
    : static_cast<byte>(*b & ~(1 << (index & 7)));
175
  mtr->write<1>(block, b, val);
176
}
osku's avatar
osku committed
177

178 179 180 181 182
/**
Find a free page.
@param descr   extent descriptor
@param hint    page offset to start searching from (towards larger pages)
@return free page offset
183 184
@retval FIL_NULL if no page is free */
inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
osku's avatar
osku committed
185
{
186 187 188
  const uint32_t extent_size= FSP_EXTENT_SIZE;
  ut_ad(hint < extent_size);
  for (uint32_t i= hint; i < extent_size; i++)
189 190
    if (xdes_is_free(descr, i))
      return i;
191
  for (uint32_t i= 0; i < hint; i++)
192 193
    if (xdes_is_free(descr, i))
      return i;
194
  return FIL_NULL;
195
}
osku's avatar
osku committed
196

197 198 199
/**
Determine the number of used pages in a descriptor.
@param descr  file descriptor
200
@return number of pages used */
201
inline uint32_t xdes_get_n_used(const xdes_t *descr)
osku's avatar
osku committed
202
{
203
  uint32_t count= 0;
osku's avatar
osku committed
204

205
  for (uint32_t i= FSP_EXTENT_SIZE; i--; )
206 207
    if (!xdes_is_free(descr, i))
      count++;
osku's avatar
osku committed
208

209
  return count;
osku's avatar
osku committed
210 211
}

212 213 214 215 216
/**
Determine whether a file extent is full.
@param descr  file descriptor
@return whether all pages have been allocated */
inline bool xdes_is_full(const xdes_t *descr)
osku's avatar
osku committed
217
{
218
  return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
osku's avatar
osku committed
219 220
}

221 222 223 224 225 226
/** Set the state of an extent descriptor.
@param[in]      block   extent descriptor block
@param[in,out]  descr   extent descriptor
@param[in]      state   the state
@param[in,out]  mtr     mini-transaction */
inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
227
                           byte state, mtr_t *mtr)
osku's avatar
osku committed
228
{
229 230 231
  ut_ad(descr && mtr);
  ut_ad(state >= XDES_FREE);
  ut_ad(state <= XDES_FSEG);
Marko Mäkelä's avatar
Marko Mäkelä committed
232 233
  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
                                   MTR_MEMO_PAGE_X_FIX));
234
  ut_ad(page_align(descr) == block.page.frame);
235 236
  ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
  mtr->write<1>(block, XDES_STATE + 3 + descr, state);
osku's avatar
osku committed
237 238
}

239
/**********************************************************************//**
240
Gets the state of an xdes.
241
@return state */
osku's avatar
osku committed
242 243 244 245
UNIV_INLINE
ulint
xdes_get_state(
/*===========*/
246
	const xdes_t*	descr)	/*!< in: descriptor */
osku's avatar
osku committed
247
{
marko's avatar
marko committed
248 249
	ulint	state;

250
	ut_ad(descr);
251
	state = mach_read_from_4(descr + XDES_STATE);
marko's avatar
marko committed
252 253
	ut_ad(state - 1 < XDES_FSEG);
	return(state);
osku's avatar
osku committed
254 255
}

256
/**********************************************************************//**
osku's avatar
osku committed
257
Inits an extent descriptor to the free and clean state. */
258 259
inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
{
Marko Mäkelä's avatar
Marko Mäkelä committed
260 261
  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
                                   MTR_MEMO_PAGE_X_FIX));
262
  mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP,
263
              XDES_SIZE - XDES_BITMAP, 0xff);
264 265 266 267 268 269 270 271 272
  xdes_set_state(block, descr, XDES_FREE, mtr);
}

/** Mark a page used in an extent descriptor of a segment, and keep the
segment's extent lists and FSEG_NOT_FULL_N_USED in step with it.
@param[in,out]  seg_inode       segment inode
@param[in,out]  iblock          segment inode page
@param[in]      page            page number
@param[in,out]  descr           extent descriptor
@param[in,out]  xdes            extent descriptor page
@param[in,out]  mtr             mini-transaction
@return error code
@retval DB_CORRUPTION if the page was not marked free in the descriptor */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
                    ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
{
  ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
  ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));

  const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);

  /* Unlink the extent from one list of the segment inode and
  append it to another one. */
  const auto move_extent= [&](ulint from_list, ulint to_list) -> dberr_t
  {
    if (dberr_t err= flst_remove(iblock, uint16_t(from_list + ioffset),
                                 xdes, xoffset, mtr))
      return err;
    return flst_add_last(iblock, uint16_t(to_list + ioffset),
                         xdes, xoffset, mtr);
  };

  if (!xdes_get_n_used(descr))
  {
    /* The extent gets its first used page: FSEG_FREE -> FSEG_NOT_FULL */
    if (dberr_t err= move_extent(FSEG_FREE, FSEG_NOT_FULL))
      return err;
  }

  const ulint page_in_extent= page % FSP_EXTENT_SIZE;

  if (UNIV_UNLIKELY(!xdes_is_free(descr, page_in_extent)))
    return DB_CORRUPTION;

  /* Clear XDES_FREE_BIT of the page and count it in the segment. */
  xdes_set_free<false>(*xdes, descr, page_in_extent, mtr);

  byte *const n_used_field= seg_inode + FSEG_NOT_FULL_N_USED;
  const uint32_t not_full_n_used= mach_read_from_4(n_used_field) + 1;
  mtr->write<4>(*iblock, n_used_field, not_full_n_used);

  if (!xdes_is_full(descr))
    return DB_SUCCESS;

  /* The extent became full: FSEG_NOT_FULL -> FSEG_FULL */
  if (dberr_t err= move_extent(FSEG_NOT_FULL, FSEG_FULL))
    return err;

  /* The pages of this extent no longer count as
  "used pages in not-full extents". */
  mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
                not_full_n_used - FSP_EXTENT_SIZE);
  return DB_SUCCESS;
}
osku's avatar
osku committed
323

324 325
/** Get pointer to a the extent descriptor of a page.
@param[in,out]	sp_header	tablespace header page, x-latched
326
@param[in]	space		tablespace
327 328
@param[in]	offset		page offset
@param[in,out]	mtr		mini-transaction
329
@param[out]	err		error code
330
@param[out]	desc_block	descriptor block
331
@param[in]	init_space	whether the tablespace is being initialized
332
@return pointer to the extent descriptor, NULL if the page does not
333
exist in the space or if the offset exceeds free limit */
334
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
osku's avatar
osku committed
335 336
xdes_t*
xdes_get_descriptor_with_space_hdr(
337
	buf_block_t*		header,
338
	const fil_space_t*	space,
339
	uint32_t		offset,
340
	mtr_t*			mtr,
341
	dberr_t*		err = nullptr,
342
	buf_block_t**		desc_block = nullptr,
343
	bool			init_space = false)
osku's avatar
osku committed
344
{
345
	ut_ad(space->is_owner());
Marko Mäkelä's avatar
Marko Mäkelä committed
346 347
	ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX
					 | MTR_MEMO_PAGE_X_FIX));
osku's avatar
osku committed
348
	/* Read free limit and space size */
349
	uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
350
					  + header->page.frame);
351
	uint32_t size  = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
352
					  + header->page.frame);
353 354
	ut_ad(limit == space->free_limit
	      || (space->free_limit == 0
355
		  && (init_space
356
		      || space->purpose == FIL_TYPE_TEMPORARY
357
		      || (srv_startup_is_before_trx_rollback_phase
358 359
			  && (space->id == TRX_SYS_SPACE
			      || srv_is_undo_tablespace(space->id))))));
360
	ut_ad(size == space->size_in_header);
osku's avatar
osku committed
361

362 363
	if (offset >= size || offset >= limit) {
		return nullptr;
osku's avatar
osku committed
364 365
	}

366
	const unsigned zip_size = space->zip_size();
367

368
	uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
369

370
	buf_block_t* block = header;
osku's avatar
osku committed
371

372
	if (descr_page_no) {
373
		block = buf_page_get_gen(page_id_t(space->id, descr_page_no),
374 375
					 zip_size, RW_SX_LATCH, nullptr,
					 BUF_GET_POSSIBLY_FREED, mtr, err);
376
	}
osku's avatar
osku committed
377

378
	if (desc_block) {
379 380 381
		*desc_block = block;
	}

382 383
	return block
		? XDES_ARR_OFFSET + XDES_SIZE
384
		* xdes_calc_descriptor_index(zip_size, offset)
385
		+ block->page.frame
386
		: nullptr;
osku's avatar
osku committed
387 388
}

389
MY_ATTRIBUTE((nonnull(1,3), warn_unused_result))
/** Get the extent descriptor of a page.
The tablespace header page (page 0) is fetched with an SX-latch, and the
lookup is then delegated to xdes_get_descriptor_with_space_hdr(), which
returns no descriptor for an offset at or above FSP_SIZE or FSP_FREE_LIMIT.
@param[in]	space		tablespace
@param[in]	offset		page offset
@param[in,out]	mtr		mini-transaction
@param[out]	err		error code
@param[out]	xdes		extent descriptor page
@return the extent descriptor
@retval nullptr if the header page or the descriptor is not available */
static xdes_t *xdes_get_descriptor(const fil_space_t *space, uint32_t offset,
                                   mtr_t *mtr, dberr_t *err= nullptr,
                                   buf_block_t **xdes= nullptr)
{
  buf_block_t *header=
    buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH,
                     nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
  if (!header)
    return nullptr;

  return xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
                                            err, xdes);
}

415
MY_ATTRIBUTE((nonnull(3), warn_unused_result))
/** Get a pointer to the extent descriptor that contains a list node.
The page where the extent descriptor resides is fetched with an SX-latch.
If the page cannot be fetched, the tablespace is flagged as corrupted.
@param space    tablespace
@param lst_node file address of the list node contained in the descriptor
@param mtr      mini-transaction
@param block    extent descriptor block (optional output)
@param err      error code (optional output)
@return pointer to the extent descriptor
@retval nullptr if the descriptor page could not be fetched */
static inline
xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node,
                                mtr_t *mtr, buf_block_t **block= nullptr,
                                dberr_t *err= nullptr)
{
  ut_ad(mtr->memo_contains(space));
  ut_ad(lst_node.boffset < space.physical_size());

  /* Store the block in the caller's variable if one was supplied,
  or else in a local one. */
  buf_block_t *local_block;
  buf_block_t *&xdes_block= block ? *block : local_block;

  xdes_block= buf_page_get_gen(page_id_t{space.id, lst_node.page},
                               space.zip_size(), RW_SX_LATCH,
                               nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
  if (!xdes_block)
  {
    space.set_corrupted();
    return nullptr;
  }

  /* The list node is embedded in the descriptor at XDES_FLST_NODE. */
  return xdes_block->page.frame + lst_node.boffset - XDES_FLST_NODE;
}

444
/********************************************************************//**
445
Returns page offset of the first page in extent described by a descriptor.
446
@return offset of the first page in extent */
447
static uint32_t xdes_get_offset(const xdes_t *descr)
osku's avatar
osku committed
448
{
449 450 451 452
  ut_ad(descr);
  return page_get_page_no(page_align(descr)) +
    uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
             FSP_EXTENT_SIZE);
osku's avatar
osku committed
453 454
}

455 456
/** Initialize a file page whose prior contents should be ignored.
@param[in,out]	block	buffer pool block */
457
void fsp_apply_init_file_page(buf_block_t *block)
osku's avatar
osku committed
458
{
459
  memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size);
460
  const page_id_t id(block->page.id());
461

462
  mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no());
463
  memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8);
464 465
  mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
                  id.space());
466 467 468 469 470 471
  if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
  {
    memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
                                      page_zip_get_size(page_zip));
    static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
    memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
472
                      block->page.frame + FIL_PAGE_OFFSET, 4);
473
    memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
474 475 476
    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
                  "not perfect alignment");
    memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
477
                      block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
478
  }
479 480
}

481
#ifdef UNIV_DEBUG
/** Assert that the mini-transaction is compatible with
updating an allocation bitmap page.
@param[in]	mtr	mini-transaction */
void fil_space_t::modify_check(const mtr_t& mtr) const
{
  const auto log_mode= mtr.get_log_mode();

  if (log_mode == MTR_LOG_NONE)
  {
    /* This mode is only allowed within a non-bitmap page
       when there is a higher-level redo log record written. */
    ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
    return;
  }

  if (log_mode == MTR_LOG_NO_REDO)
  {
    ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
    return;
  }

  /* Any other log mode: we may only write redo log for a persistent
  tablespace, and the mini-transaction must have been associated with
  the tablespace, unless it is one of the two binlog tablespaces. */
  ut_ad(purpose == FIL_TYPE_TABLESPACE);
  ut_ad(mtr.is_named_space(id) ||
        id == SRV_SPACE_ID_BINLOG0 || id == SRV_SPACE_ID_BINLOG1);
}
#endif

505
/** Initialize a tablespace header.
Page 0 of the tablespace is created and initialized as FIL_PAGE_TYPE_FSP_HDR,
the header fields and the file-space list bases are written, and
fsp_fill_free_list() is invoked. Encryption metadata is written to the
page when the tablespace has crypt_data that requests encryption or that
explicitly disables it.
@param[in,out]	space	tablespace
@param[in]	size	current size in blocks
@param[in,out]	mtr	mini-transaction
@return error code */
dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
{
	const ulint zip_size = space->zip_size();

	/* A spare block is acquired before the tablespace is X-locked. */
	buf_block_t *free_block = buf_LRU_get_free_block(have_no_mutex);

	mtr->x_lock_space(space);

	buf_block_t* block = buf_page_create(space, 0, zip_size, mtr,
					     free_block);
	if (UNIV_UNLIKELY(block != free_block)) {
		/* The spare block is not the one that was returned;
		give it back to the buffer pool. */
		buf_pool.free_block(free_block);
	}

	space->size_in_header = size;
	space->free_len = 0;
	space->free_limit = 0;

	/* The prior contents of the file page should be ignored */

	fsp_init_file_page(space, block, mtr);

	mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
		      FIL_PAGE_TYPE_FSP_HDR);

	mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID
				       + block->page.frame, space->id);
	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
				    + block->page.frame));
	/* recv_sys_t::parse() expects to find a WRITE record that
	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
	in order to avoid optimizing away any unchanged most
	significant bytes of FSP_SIZE. */
	mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
				    + block->page.frame, size);
	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
				    + block->page.frame));
	/* Only the flag bits outside FSP_FLAGS_MEM_MASK are written,
	and only if any of them is set. */
	if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) {
		mtr->write<4,mtr_t::FORCED>(*block,
					    FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
					    + block->page.frame, f);
	}
	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
				    + block->page.frame));

	/* Initialize the bases of the extent lists and inode page lists */
	flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr);
	flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr);
	flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr);
	flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr);
	flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr);

	/* The 8-byte FSP_SEG_ID field starts at 1 */
	mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID
		      + block->page.frame,
		      1U);

	if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id),
					     space, block, mtr)) {
		return err;
	}

	/* Write encryption metadata to page 0 if tablespace is
	encrypted or encryption is disabled by table option. */
	if (space->crypt_data &&
	    (space->crypt_data->should_encrypt() ||
	     space->crypt_data->not_encrypted())) {
		space->crypt_data->write_page0(block, mtr);
	}

	return DB_SUCCESS;
}
osku's avatar
osku committed
581

582
/** Try to extend a single-table tablespace so that a page would fit in the
583
data file.
584 585 586 587 588
@param[in,out]	space	tablespace
@param[in]	page_no	page number
@param[in,out]	header	tablespace header
@param[in,out]	mtr	mini-transaction
@return true if success */
589
static ATTRIBUTE_COLD __attribute__((warn_unused_result))
590
bool
osku's avatar
osku committed
591
fsp_try_extend_data_file_with_pages(
592
	fil_space_t*	space,
593
	uint32_t	page_no,
594
	buf_block_t*	header,
595
	mtr_t*		mtr)
osku's avatar
osku committed
596
{
597
	bool	success;
osku's avatar
osku committed
598 599
	ulint	size;

600
	ut_ad(!is_system_tablespace(space->id));
601
	ut_d(space->modify_check(*mtr));
osku's avatar
osku committed
602

603 604
	size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
				+ header->page.frame);
605
	ut_ad(size == space->size_in_header);
606

osku's avatar
osku committed
607 608
	ut_a(page_no >= size);

609 610
	success = fil_space_extend(space, page_no + 1);
	/* The size may be less than we wanted if we ran out of disk space. */
611 612 613 614 615
	/* recv_sys_t::parse() expects to find a WRITE record that
	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
	in order to avoid optimizing away any unchanged most
	significant bytes of FSP_SIZE. */
	mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
616
				    + header->page.frame, space->size);
617
	space->size_in_header = space->size;
osku's avatar
osku committed
618 619 620 621

	return(success);
}

622 623 624
/** Calculate the number of physical pages in an extent for this file.
@param[in]	physical_size	page_size of the datafile
@return number of pages in an extent for this file */
625
inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size)
626
{
627
  return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size);
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
}


/** Calculate the number of pages to extend a datafile.
We extend single-table tablespaces first one extent at a time,
but 4 at a time for bigger tablespaces. It is not enough to extend always
by one extent, because we need to add at least one extent to FSP_FREE.
A single extent descriptor page will track many extents. And the extent
that uses its extent descriptor page is put onto the FSP_FREE_FRAG list.
Extents that do not use their extent descriptor page are added to FSP_FREE.
The physical page size is used to determine how many extents are tracked
on one extent descriptor page. See xdes_calc_descriptor_page().
@param[in]	physical_size	page size in data file
@param[in]	size		current number of pages in the datafile
@return number of pages to extend the file. */
643 644
static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size,
					    uint32_t size)
645
{
646
	uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size);
647 648
	/* The threshold is set at 32MiB except when the physical page
	size is small enough that it must be done sooner. */
649
	uint32_t threshold = std::min(32 * extent_size, physical_size);
650 651 652 653 654 655 656 657 658 659 660

	if (size >= threshold) {
		/* Below in fsp_fill_free_list() we assume
		that we add at most FSP_FREE_ADD extents at
		a time */
		extent_size *= FSP_FREE_ADD;
	}

	return extent_size;
}

661 662 663 664
/** Try to extend the last data file of a tablespace if it is auto-extending.
@param[in,out]	space	tablespace
@param[in,out]	header	tablespace header
@param[in,out]	mtr	mini-transaction
Marko Mäkelä's avatar
Marko Mäkelä committed
665 666
@return	number of pages added
@retval	0 if the tablespace was not extended */
667
ATTRIBUTE_COLD __attribute__((nonnull))
Marko Mäkelä's avatar
Marko Mäkelä committed
668
static
669
ulint
670
fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
osku's avatar
osku committed
671
{
672 673 674
	const char* OUT_OF_SPACE_MSG =
		"ran out of space. Please add another file or use"
		" 'autoextend' for the last file in setting";
osku's avatar
osku committed
675

676
	ut_d(space->modify_check(*mtr));
osku's avatar
osku committed
677

678
	if (space->id == TRX_SYS_SPACE
679
	    && !srv_sys_space.can_auto_extend_last_file()) {
osku's avatar
osku committed
680

681 682
		/* We print the error message only once to avoid
		spamming the error log. Note that we don't need
683
		to reset the flag to false as dealing with this
684
		error requires server restart. */
685
		if (!srv_sys_space.get_tablespace_full_status()) {
686 687 688
			sql_print_error("InnoDB: The InnoDB system tablespace "
                                        "%s" " innodb_data_file_path.",
                                        OUT_OF_SPACE_MSG);
689
			srv_sys_space.set_tablespace_full_status(true);
690
		}
Marko Mäkelä's avatar
Marko Mäkelä committed
691
		return(0);
692
	} else if (space->id == SRV_TMP_SPACE_ID
693 694 695 696 697 698 699
		   && !srv_tmp_space.can_auto_extend_last_file()) {

		/* We print the error message only once to avoid
		spamming the error log. Note that we don't need
		to reset the flag to false as dealing with this
		error requires server restart. */
		if (!srv_tmp_space.get_tablespace_full_status()) {
700 701 702 703
			sql_print_error("InnoDB: The InnoDB temporary"
                                        " tablespace %s"
                                        " innodb_temp_data_file_path.",
                                        OUT_OF_SPACE_MSG);
704 705
			srv_tmp_space.set_tablespace_full_status(true);
		}
Marko Mäkelä's avatar
Marko Mäkelä committed
706
		return(0);
osku's avatar
osku committed
707 708
	}

709
	uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
710
					 + header->page.frame);
711
	ut_ad(size == space->size_in_header);
712
	uint32_t size_increase;
713

714
	const unsigned ps = space->physical_size();
osku's avatar
osku committed
715

716 717
	switch (space->id) {
	case TRX_SYS_SPACE:
718
		size_increase = srv_sys_space.get_increment();
719 720
		break;
	case SRV_TMP_SPACE_ID:
721
		size_increase = srv_tmp_space.get_increment();
722 723
		break;
	default:
724
		uint32_t extent_pages = fsp_get_extent_size_in_pages(ps);
725
		if (size < extent_pages) {
726
			/* Let us first extend the file to extent_size */
727 728
			if (!fsp_try_extend_data_file_with_pages(
				    space, extent_pages - 1, header, mtr)) {
Marko Mäkelä's avatar
Marko Mäkelä committed
729
				return(0);
osku's avatar
osku committed
730 731
			}

732
			size = extent_pages;
733 734
		}

735
		size_increase = fsp_get_pages_to_extend_ibd(ps, size);
osku's avatar
osku committed
736
	}
737

osku's avatar
osku committed
738
	if (size_increase == 0) {
Marko Mäkelä's avatar
Marko Mäkelä committed
739
		return(0);
osku's avatar
osku committed
740
	}
741

742
	if (!fil_space_extend(space, size + size_increase)) {
Marko Mäkelä's avatar
Marko Mäkelä committed
743
		return(0);
744 745
	}

746 747
	/* For the system tablespace, we ignore any fragments of a
	full megabyte when storing the size to the space header */
osku's avatar
osku committed
748

749 750 751
	space->size_in_header = space->id
		? space->size
		: ut_2pow_round(space->size, (1024 * 1024) / ps);
752

753 754 755 756 757
	/* recv_sys_t::parse() expects to find a WRITE record that
	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
	in order to avoid optimizing away any unchanged most
	significant bytes of FSP_SIZE. */
	mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
758 759
				    + header->page.frame,
				    space->size_in_header);
760

Marko Mäkelä's avatar
Marko Mäkelä committed
761
	return(size_increase);
762 763
}

Marko Mäkelä's avatar
Marko Mäkelä committed
764 765 766 767 768 769 770 771 772 773
/** Reset the page type.
Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
In MySQL 3.23.53, only undo log pages and index pages were tagged.
Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
The old and the new type are reported in an informational message before
the 2-byte FIL_PAGE_TYPE field is overwritten.
@param[in]	block	block with invalid FIL_PAGE_TYPE
@param[in]	type	expected page type
@param[in,out]	mtr	mini-transaction */
ATTRIBUTE_COLD
void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr)
{
  const auto old_type= fil_page_get_type(block.page.frame);

  ib::info() << "Resetting invalid page " << block.page.id() << " type "
             << old_type << " to " << type << ".";

  mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type);
}

779
/** Put new extents to the free list if there are free extents above the free
osku's avatar
osku committed
780
limit. If an extent happens to contain an extent descriptor page, the extent
781 782 783 784 785 786
is put to the FSP_FREE_FRAG list with the page marked as used.
@param[in]	init_space	true if this is a single-table tablespace
and we are only initializing the first extent and the first bitmap pages;
then we will not allocate more extents
@param[in,out]	space		tablespace
@param[in,out]	header		tablespace header
787 788
@param[in,out]	mtr		mini-transaction
@return error code */
osku's avatar
osku committed
789
static
790
dberr_t
osku's avatar
osku committed
791
fsp_fill_free_list(
792 793
	bool		init_space,
	fil_space_t*	space,
794
	buf_block_t*	header,
795
	mtr_t*		mtr)
osku's avatar
osku committed
796
{
797
  ut_d(space->modify_check(*mtr));
798

799 800 801 802 803
  /* Check if we can fill free list from above the free list limit */
  uint32_t size=
    mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame);
  uint32_t limit=
    mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame);
osku's avatar
osku committed
804

805 806
  ut_ad(size == space->size_in_header);
  ut_ad(limit == space->free_limit);
osku's avatar
osku committed
807

808
  const auto zip_size= space->zip_size();
osku's avatar
osku committed
809

810 811 812 813 814 815 816 817 818 819 820
  if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD)
  {
    bool skip_resize= init_space;
    switch (space->id) {
    case TRX_SYS_SPACE:
      skip_resize= !srv_sys_space.can_auto_extend_last_file();
      break;
    case SRV_TMP_SPACE_ID:
      skip_resize= !srv_tmp_space.can_auto_extend_last_file();
      break;
    }
821

822 823 824 825 826 827
    if (!skip_resize)
    {
      fsp_try_extend_data_file(space, header, mtr);
      size= space->size_in_header;
    }
  }
osku's avatar
osku committed
828

829 830 831 832 833 834 835 836 837 838 839
  uint32_t count= 0;
  for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE,
         physical_size= space->physical_size();
       (init_space && i < 1) ||
         (i + extent_size <= size && count < FSP_FREE_ADD);
       i += extent_size)
  {
    const bool init_xdes= !ut_2pow_remainder(i, physical_size);
    space->free_limit= i + extent_size;
    mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT +
                  header->page.frame, i + extent_size);
osku's avatar
osku committed
840

841 842 843 844 845 846 847 848
    if (init_xdes)
    {
      /* We are going to initialize a new descriptor page
      and a new ibuf bitmap page: the prior contents of the
      pages should be ignored. */

      if (i)
      {
849
        buf_block_t *f= buf_LRU_get_free_block(have_no_mutex);
850
        buf_block_t *block= buf_page_create(space, i, zip_size, mtr, f);
851 852 853 854 855 856 857 858 859
        if (UNIV_UNLIKELY(block != f))
          buf_pool.free_block(f);
        fsp_init_file_page(space, block, mtr);
        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
                      FIL_PAGE_TYPE_XDES);
      }

      if (space->purpose != FIL_TYPE_TEMPORARY)
      {
860
        buf_block_t *f= buf_LRU_get_free_block(have_no_mutex);
861
        buf_block_t *block=
Marko Mäkelä's avatar
Marko Mäkelä committed
862
          buf_page_create(space, i + 1, zip_size, mtr, f);
863 864
        if (UNIV_UNLIKELY(block != f))
          buf_pool.free_block(f);
865 866 867 868 869 870 871
        /* The zero-initialization will reset the change buffer bitmap bits
        to safe values for possible import to an earlier version that
        supports change buffering:

        IBUF_BITMAP_FREE     = 0 (no space left for buffering inserts)
        IBUF_BITMAP_BUFFERED = 0 (no changes have been buffered)
        IBUF_BITMAP_IBUF     = 0 (not part of the change buffer) */
872 873 874 875 876
        fsp_init_file_page(space, block, mtr);
        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
                      FIL_PAGE_IBUF_BITMAP);
      }
    }
osku's avatar
osku committed
877

878 879 880 881 882 883 884 885 886
    buf_block_t *xdes= nullptr;
    xdes_t *descr;
    {
      dberr_t err= DB_SUCCESS;
      descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr,
                                                &err, &xdes, init_space);
      if (!descr)
        return err;
    }
887

888 889 890 891 892 893 894 895
    if (xdes != header && !space->full_crc32())
      fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
    xdes_init(*xdes, descr, mtr);
    const uint16_t xoffset=
      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
    if (UNIV_UNLIKELY(init_xdes))
    {
      /* The first page in the extent is a descriptor page and the
896
      second was reserved for change buffer bitmap: mark them used */
897
      xdes_set_free<false>(*xdes, descr, 0, mtr);
898
      xdes_set_free<false>(*xdes, descr, 1, mtr);
899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914
      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
      if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
                                     xdes, xoffset, mtr))
        return err;
      byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
      mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
    }
    else
    {
      if (dberr_t err=
          flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
                        xdes, xoffset, mtr))
        return err;
      count++;
    }
  }
915

916 917
  space->free_len+= count;
  return DB_SUCCESS;
918
}
osku's avatar
osku committed
919

920
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Take one extent off the FSP_FREE list of a tablespace.
The extent that contains the hinted page is used if it is in state
XDES_FREE; otherwise the first extent of the FSP_FREE list is used, after
refilling that list with fsp_fill_free_list() if it is empty.
@param[in,out]	space		tablespace
@param[in]	hint		hint of which extent would be desirable: any
page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
@param[out]	xdes		extent descriptor page
@param[in,out]	mtr		mini-transaction
@param[out]	err		error code
@return extent descriptor
@retval nullptr if cannot be allocated */
static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint,
                                     buf_block_t **xdes, mtr_t *mtr,
                                     dberr_t *err)
{
  buf_block_t *desc_block;
  buf_block_t *const header= fsp_get_header(space, mtr, err);

  /* A missing header page or a missing descriptor for the hinted
  page both flag the tablespace as corrupted. */
  xdes_t *descr= header
    ? xdes_get_descriptor_with_space_hdr(header, space, hint, mtr, err,
                                         &desc_block)
    : nullptr;
  if (!descr)
  {
    space->set_corrupted();
    return nullptr;
  }

  if (desc_block != header && !space->full_crc32())
    fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);

  if (xdes_get_state(descr) != XDES_FREE)
  {
    /* The hinted extent is not free: take the first extent in the
    free list instead */
    fil_addr_t first= flst_get_first(FSP_HEADER_OFFSET + FSP_FREE +
                                     header->page.frame);

    if (first.page == FIL_NULL)
    {
      /* The free list is empty: try to refill it */
      *err= fsp_fill_free_list(false, space, header, mtr);
      if (UNIV_UNLIKELY(*err != DB_SUCCESS))
      {
        space->set_corrupted();
        return nullptr;
      }

      first= flst_get_first(FSP_HEADER_OFFSET + FSP_FREE +
                            header->page.frame);
      if (first.page == FIL_NULL)
        return nullptr; /* No free extents left */
    }

    descr= xdes_lst_get_descriptor(*space, first, mtr, &desc_block, err);
    if (!descr)
      return nullptr;
  }

  /* Unlink the chosen extent from FSP_FREE */
  const uint16_t node_offset=
    static_cast<uint16_t>(descr - desc_block->page.frame + XDES_FLST_NODE);
  *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
                    node_offset, mtr);
  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
    return nullptr;

  space->free_len--;
  *xdes= desc_block;
  return descr;
}

995
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Mark one page of an XDES_FREE_FRAG extent as allocated.
The FSP_FRAG_N_USED counter in the tablespace header is updated, and the
extent is moved from FSP_FREE_FRAG to FSP_FULL_FRAG if it became full.
@param[in,out]	header	tablespace header
@param[in,out]	xdes	extent descriptor page
@param[in,out]	descr	extent descriptor
@param[in]	bit	slot to allocate in the extent
@param[in,out]	mtr	mini-transaction
@return error code
@retval DB_CORRUPTION if the extent is not in state XDES_FREE_FRAG
or the slot is not free */
static dberr_t
fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
			 ulint bit, mtr_t *mtr)
{
  if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG))
    return DB_CORRUPTION;
  if (UNIV_UNLIKELY(!xdes_is_free(descr, bit)))
    return DB_CORRUPTION;

  xdes_set_free<false>(*xdes, descr, bit, mtr);

  /* One more fragment page is in use */
  byte *const frag_n_used=
    FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
  uint32_t new_n_used= mach_read_from_4(frag_n_used) + 1;

  if (xdes_is_full(descr))
  {
    /* No free page is left in the extent: relink it from
    FSP_FREE_FRAG to FSP_FULL_FRAG */
    const uint16_t node_offset=
      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
    if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
                                 xdes, node_offset, mtr))
      return err;
    if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
                                   xdes, node_offset, mtr))
      return err;
    xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
    /* The counter is reduced by a whole extent when the extent
    moves to FSP_FULL_FRAG */
    new_n_used-= FSP_EXTENT_SIZE;
  }

  mtr->write<4>(*header, frag_n_used, new_n_used);
  return DB_SUCCESS;
}

1035
/** Gets a buffer block for an allocated page.
1036 1037
@param[in,out]	space		tablespace
@param[in]	offset		page number of the allocated page
1038
@param[in,out]	mtr		mini-transaction
1039
@return block, initialized */
1040 1041
static buf_block_t* fsp_page_create(fil_space_t *space, uint32_t offset,
                                    mtr_t *mtr)
1042
{
Marko Mäkelä's avatar
Marko Mäkelä committed
1043
  buf_block_t *free_block= buf_LRU_get_free_block(have_no_mutex),
1044
    *block= buf_page_create(space, offset, space->zip_size(), mtr, free_block);
1045 1046
  if (UNIV_UNLIKELY(block != free_block))
    buf_pool.free_block(free_block);
Marko Mäkelä's avatar
Marko Mäkelä committed
1047 1048
  fsp_init_file_page(space, block, mtr);
  return block;
1049 1050
}

1051 1052
/** Allocates a single free page from a space.
The page is marked as used.
1053
@param[in,out]	space		tablespace
1054 1055 1056 1057
@param[in]	hint		hint of which page would be desirable
@param[in,out]	mtr		mini-transaction
@param[in,out]	init_mtr	mini-transaction in which the page should be
initialized (may be the same as mtr)
1058 1059 1060
@param[out]	err		error code
@return allocated block
@retval nullptr	if no page could be allocated */
1061
static MY_ATTRIBUTE((warn_unused_result, nonnull))
1062 1063
buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
                                 mtr_t *mtr, mtr_t *init_mtr, dberr_t *err)
osku's avatar
osku committed
1064
{
1065 1066 1067 1068
  ut_d(space->modify_check(*mtr));
  buf_block_t *block= fsp_get_header(space, mtr, err);
  if (!block)
    return block;
osku's avatar
osku committed
1069

1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
  buf_block_t *xdes;
  /* Get the hinted descriptor */
  xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr,
                                                    err, &xdes);
  if (descr && xdes_get_state(descr) == XDES_FREE_FRAG)
    /* Ok, we can take this extent */;
  else if (*err != DB_SUCCESS)
  {
  err_exit:
    space->set_corrupted();
    return nullptr;
  }
  else
  {
    /* Else take the first extent in free_frag list */
    fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
                                      block->page.frame);
    if (first.page == FIL_NULL)
    {
      /* There are no partially full fragments: allocate a free extent
      and add it to the FREE_FRAG list. NOTE that the allocation may
      have as a side-effect that an extent containing a descriptor
      page is added to the FREE_FRAG list. But we will allocate our
      page from the the free extent anyway. */
      descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err);
      if (!descr)
        return nullptr;
      *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
                          static_cast<uint16_t>(descr - xdes->page.frame +
                                                XDES_FLST_NODE), mtr);
      if (UNIV_UNLIKELY(*err != DB_SUCCESS))
        return nullptr;
      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
    }
    else
    {
      descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
      if (!descr)
        return nullptr;
      /* Reset the hint */
      hint= 0;
    }
  }
osku's avatar
osku committed
1113

1114 1115 1116 1117 1118 1119 1120 1121 1122
  /* Now we have in descr an extent with at least one free page. Look
  for a free page in the extent. */
  uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
  if (free == FIL_NULL)
  {
  corrupted:
    *err= DB_CORRUPTION;
    goto err_exit;
  }
osku's avatar
osku committed
1123

1124 1125 1126 1127 1128 1129
  uint32_t page_no= xdes_get_offset(descr) + free;
  uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE +
                                         block->page.frame);
  ut_ad(space_size == space->size_in_header ||
        (space->id == TRX_SYS_SPACE &&
         srv_startup_is_before_trx_rollback_phase));
osku's avatar
osku committed
1130

1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143
  if (space_size <= page_no)
  {
    /* It must be that we are extending a single-table tablespace
    whose size is still < 64 pages */
    ut_ad(!is_system_tablespace(space->id));
    if (page_no >= FSP_EXTENT_SIZE)
    {
      sql_print_error("InnoDB: Trying to extend %s"
                      " by single page(s) though the size is " UINT32PF "."
                      " Page no " UINT32PF ".",
                      space->chain.start->name, space_size, page_no);
      goto corrupted;
    }
osku's avatar
osku committed
1144

1145 1146 1147 1148 1149 1150
    if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr))
    {
      *err= DB_OUT_OF_FILE_SPACE;
      return nullptr;
    }
  }
osku's avatar
osku committed
1151

1152 1153 1154 1155 1156
  *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
    goto corrupted;
  return fsp_page_create(space, page_no, init_mtr);
}
osku's avatar
osku committed
1157

1158 1159 1160 1161 1162 1163
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Put an extent back on the FSP_FREE list of a tablespace.
The extent descriptor is reinitialized before it is linked to the list.
@param[in,out]  space   tablespace
@param[in]      offset  page number in the extent
@param[in,out]  mtr     mini-transaction
@return error code
@retval DB_CORRUPTION if the extent is already in state XDES_FREE */
static dberr_t fsp_free_extent(fil_space_t* space, uint32_t offset,
                               mtr_t* mtr)
{
  ut_ad(space->is_owner());

  dberr_t err;
  buf_block_t *const header= fsp_get_header(space, mtr, &err);
  if (!header)
    return err;

  buf_block_t *xdes;
  xdes_t *const descr=
    xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
                                       &err, &xdes);
  if (!descr)
  {
    ut_ad(err || space->is_stopping());
    return err;
  }

  /* An extent that is already free must not be freed again */
  if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE))
  {
    space->set_corrupted();
    return DB_CORRUPTION;
  }

  xdes_init(*xdes, descr, mtr);
  space->free_len++;

  const uint16_t node_offset=
    static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
  return flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
                       xdes, node_offset, mtr);
}

1194
MY_ATTRIBUTE((nonnull))
/** Free a single page of a tablespace.
The page is marked as free in its extent descriptor and reported to the
mini-transaction via mtr_t::free(). FSP_FRAG_N_USED is adjusted, and the
extent is moved between the FSP_FULL_FRAG, FSP_FREE_FRAG and FSP_FREE
lists as its usage changes.
@param[in,out]	space		tablespace
@param[in]	offset		page number
@param[in,out]	mtr		mini-transaction
@return error code */
static dberr_t fsp_free_page(fil_space_t *space, uint32_t offset, mtr_t *mtr)
{
  ut_ad(mtr);
  ut_d(space->modify_check(*mtr));

  dberr_t err;
  buf_block_t *const header= fsp_get_header(space, mtr, &err);
  if (!header)
  {
    ut_ad(space->is_stopping());
    return err;
  }

  buf_block_t *xdes;
  xdes_t *const descr=
    xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
                                       &err, &xdes);
  if (!descr)
  {
    ut_ad(err || space->is_stopping());
    return err;
  }

  /* The page must belong to a fragment extent and must currently be
  marked as used; anything else is treated as corruption. */
  const auto state= xdes_get_state(descr);
  const bool in_frag_extent=
    state == XDES_FREE_FRAG || state == XDES_FULL_FRAG;
  if (!in_frag_extent || xdes_is_free(descr, offset % FSP_EXTENT_SIZE))
  {
    space->set_corrupted();
    return DB_CORRUPTION;
  }

  byte *const n_used_field=
    FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
  const ulint frag_n_used= mach_read_from_4(n_used_field);

  const uint16_t xoffset=
    static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);

  if (state == XDES_FULL_FRAG)
  {
    /* The fragment was full: move it to another list */
    err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
                     xdes, xoffset, mtr);
    if (UNIV_UNLIKELY(err != DB_SUCCESS))
      return err;
    err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
                       xdes, xoffset, mtr);
    if (UNIV_UNLIKELY(err != DB_SUCCESS))
      return err;
    xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
    /* The extent is counted again in FSP_FRAG_N_USED, minus the
    page that is being freed */
    mtr->write<4>(*header, n_used_field, frag_n_used + FSP_EXTENT_SIZE - 1);
  }
  else if (UNIV_UNLIKELY(!frag_n_used))
    /* The counter cannot be decremented below zero */
    return DB_CORRUPTION;
  else
    mtr->write<4>(*header, n_used_field, frag_n_used - 1);

  mtr->free(*space, offset);
  xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr);
  ut_ad(err == DB_SUCCESS);

  if (!xdes_get_n_used(descr))
  {
    /* The extent has become free: move it to another list */
    err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
                     xdes, xoffset, mtr);
    if (err == DB_SUCCESS)
      err= fsp_free_extent(space, offset, mtr);
  }

  return err;
}

1285 1286 1287 1288 1289 1290
/** Compute how many segment inodes fit on one inode page.
The bytes before FSEG_ARR_OFFSET and a further 10 bytes are excluded
from the space that is divided into FSEG_INODE_SIZE slots.
@param[in]	physical_size	page size in bytes
@return number of segment inodes which fit on a single page */
inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
{
	const ulint usable_bytes = physical_size - FSEG_ARR_OFFSET - 10;

	return usable_bytes / FSEG_INODE_SIZE;
}

1291 1292 1293 1294
/** Returns the nth inode slot on an inode page.
@param[in]	page		segment inode page
@param[in]	i		inode index on page
@return segment inode */
1295 1296
#define fsp_seg_inode_page_get_nth_inode(page, i)	\
	FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
osku's avatar
osku committed
1297

1298
/** Looks for a used segment inode on a segment inode page.
1299 1300 1301 1302
@param page             segment inode page
@param physical_size    page size
@return segment inode index
@retval ULINT_UNDEFINED if not found */
osku's avatar
osku committed
1303 1304
static
ulint
1305
fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size)
osku's avatar
osku committed
1306
{
1307 1308 1309 1310 1311 1312 1313 1314 1315
  for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
  {
    const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
    if (mach_read_from_8(FSEG_ID + inode))
    {
      ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
      return i;
    }
  }
osku's avatar
osku committed
1316

1317
  return ULINT_UNDEFINED;
osku's avatar
osku committed
1318 1319
}

1320 1321 1322
/** Looks for an unused segment inode on a segment inode page.
@param[in]	page		segment inode page
@param[in]	i		search forward starting from this index
1323
@param[in]	physical_size	page size
1324 1325
@return segment inode index
@retval ULINT_UNDEFINED if not found */
osku's avatar
osku committed
1326 1327
static
ulint
1328
fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size)
osku's avatar
osku committed
1329
{
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339
  for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
  {
    const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
    if (mach_read_from_8(FSEG_ID + inode))
      ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
    else
      /* This is unused */
      return i;
  }
  return ULINT_UNDEFINED;
osku's avatar
osku committed
1340 1341
}

1342
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocate a new file segment inode page.
The page is tagged FIL_PAGE_INODE and appended to the
FSP_SEG_INODES_FREE list of the tablespace header.
@param[in,out]  space   tablespace
@param[in,out]  header  tablespace header
@param[in,out]  mtr     mini-transaction
@return error code */
static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space,
                                        buf_block_t *header, mtr_t *mtr)
{
  ut_ad(header->page.id().space() == space->id);

  dberr_t err;
  buf_block_t *const inode_page=
    fsp_alloc_free_page(space, 0, mtr, mtr, &err);
  if (!inode_page)
    return err;

  ut_ad(inode_page->page.lock.not_recursive());

  mtr->write<2>(*inode_page, inode_page->page.frame + FIL_PAGE_TYPE,
                FIL_PAGE_INODE);

#ifdef UNIV_DEBUG
  /* Every inode slot of the new page must have a zero FSEG_ID */
  const byte *const first_id=
    FSEG_ID + FSEG_ARR_OFFSET + inode_page->page.frame;
  for (ulint slot= 0,
         n_slots= FSP_SEG_INODES_PER_PAGE(space->physical_size());
       slot < n_slots; slot++)
    ut_ad(!mach_read_from_8(first_id + slot * FSEG_INODE_SIZE));
#endif

  return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
                       inode_page, FSEG_INODE_PAGE_NODE, mtr);
}

1373
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocate a file segment inode.
The inode is taken from the first page of the FSP_SEG_INODES_FREE list;
a new inode page is allocated first if that list is empty. If the chosen
slot was the last unused one on its page, the page is moved to the
FSP_SEG_INODES_FULL list.
@param[in,out]  space   tablespace
@param[in,out]  header  tablespace header
@param[out]     iblock  segment inode page
@param[in,out]  mtr     mini-transaction
@param[out]     err     error code
@return segment inode
@retval nullptr on failure */
static fseg_inode_t*
fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
                    buf_block_t **iblock, mtr_t *mtr, dberr_t *err)
{
  byte *const free_list=
    FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + header->page.frame;

  /* Allocate a new segment inode page if needed. */
  if (!flst_get_len(free_list))
  {
    *err= fsp_alloc_seg_inode_page(space, header, mtr);
    if (*err != DB_SUCCESS)
      return nullptr;
  }

  /* Page number of the first page in FSP_SEG_INODES_FREE; it must be
  read only after the list has possibly been extended above. */
  const uint32_t first_page_no=
    mach_read_from_4(free_list + FLST_FIRST + FIL_ADDR_PAGE);
  const page_id_t page_id{space->id, first_page_no};

  buf_block_t *const inode_page=
    buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH,
                     nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
  if (!inode_page)
    return nullptr;

  if (!space->full_crc32())
    fil_block_check_type(*inode_page, FIL_PAGE_INODE, mtr);

  const ulint physical_size= space->physical_size();
  const ulint n=
    fsp_seg_inode_page_find_free(inode_page->page.frame, 0, physical_size);

  /* A page on the free list must have an unused slot */
  if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size)))
  {
    *err= DB_CORRUPTION;
    return nullptr;
  }

  fseg_inode_t *const inode=
    fsp_seg_inode_page_get_nth_inode(inode_page->page.frame, n);

  const ulint next_free=
    fsp_seg_inode_page_find_free(inode_page->page.frame, n + 1,
                                 physical_size);
  if (next_free == ULINT_UNDEFINED)
  {
    /* There are no other unused headers left on the page: move it
    to another list */
    *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
                      inode_page, FSEG_INODE_PAGE_NODE, mtr);
    if (UNIV_UNLIKELY(*err != DB_SUCCESS))
      return nullptr;
    *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
                        inode_page, FSEG_INODE_PAGE_NODE, mtr);
    if (UNIV_UNLIKELY(*err != DB_SUCCESS))
      return nullptr;
  }

  ut_ad(!mach_read_from_8(inode + FSEG_ID) ||
        !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));

  *iblock= inode_page;
  return inode;
}

1442
MY_ATTRIBUTE((nonnull))
/** Free a file segment inode.
The inode is zero-filled. Its page is moved from FSP_SEG_INODES_FULL to
FSP_SEG_INODES_FREE if it had no unused slot before, and the page itself
is freed once no used inode remains on it. Failures are not reported to
the caller: the function returns early instead.
@param[in,out]	space		tablespace
@param[in,out]	inode		segment inode
@param[in,out]	iblock		segment inode page
@param[in,out]	mtr		mini-transaction */
static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode,
                               buf_block_t *iblock, mtr_t *mtr)
{
  ut_d(space->modify_check(*mtr));

  dberr_t err;
  buf_block_t *const header= fsp_get_header(space, mtr, &err);
  if (!header)
    return;

  /* An inode without the magic number is treated as corruption */
  if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
  {
    space->set_corrupted();
    return;
  }

  const ulint physical_size= space->physical_size();
  const bool page_was_full=
    fsp_seg_inode_page_find_free(iblock->page.frame, 0, physical_size) ==
    ULINT_UNDEFINED;

  if (page_was_full)
  {
    /* Move the page to another list */
    if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
                    iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
      return;
    if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
                      iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
      return;
  }

  /* Zero-fill the inode, which makes its slot unused */
  mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);

  const bool page_still_used=
    fsp_seg_inode_page_find_used(iblock->page.frame, physical_size) !=
    ULINT_UNDEFINED;
  if (page_still_used)
    return;

  /* There are no other used headers left on the page: free it.
  The result of fsp_free_page() is discarded. */
  if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
                  iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS)
    fsp_free_page(space, iblock->page.id().page_no(), mtr);
}

1489
MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result))
/** Look up the file segment inode that a segment header refers to.
The inode page is requested with RW_SX_LATCH and BUF_GET_POSSIBLY_FREED.
The lookup fails with DB_CORRUPTION if the space id in the header does
not match, if the inode would not fit entirely on the page at the stored
offset, if the inode's FSEG_ID is zero, or if its magic number is wrong.
@param[in]	header		segment header
@param[in]	space		space id
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out]	mtr		mini-transaction
@param[out]	block		inode block
@param[out]	err		error code; may be nullptr
@return segment inode, page SX-latched
@retval nullptr if the inode is free or corruption was noticed */
static
fseg_inode_t*
fseg_inode_try_get(
	const fseg_header_t*	header,
	uint32_t		space,
	ulint			zip_size,
	mtr_t*			mtr,
	buf_block_t**		block,
        dberr_t*		err = nullptr)
{
  if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE)))
  {
  corrupted:
    if (err)
      *err= DB_CORRUPTION;
    return nullptr;
  }

  *block=
    buf_page_get_gen(page_id_t(space,
                               mach_read_from_4(header + FSEG_HDR_PAGE_NO)),
                     zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
                     mtr, err);
  if (!*block)
    return nullptr;

  const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET);
  /* The offset comes from a page that may be corrupted. Require the
  whole inode to lie within the page, so that the FSEG_ID and
  FSEG_MAGIC_N reads below cannot run past the end of the page. */
  if (UNIV_UNLIKELY(ulint{offset} + FSEG_INODE_SIZE >
                    (*block)->physical_size()))
    goto corrupted;

  fseg_inode_t *inode= (*block)->page.frame + offset;
  /* A zero FSEG_ID means the slot is unused; a used slot must carry
  the magic number. */
  if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) ||
                    memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
    goto corrupted;

  return inode;
}

1537 1538 1539 1540 1541
/** Get the page number from the nth fragment page slot.
@param inode  file segment findex
@param n      slot index
@return page number
@retval FIL_NULL if not in use */
1542
static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
osku's avatar
osku committed
1543
{
1544
	ut_ad(inode);
osku's avatar
osku committed
1545
	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
1546
	ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
osku's avatar
osku committed
1547
	return(mach_read_from_4(inode + FSEG_FRAG_ARR
1548
				+ n * FSEG_FRAG_SLOT_SIZE));
osku's avatar
osku committed
1549 1550
}

1551 1552 1553 1554 1555 1556 1557 1558
/** Store a page number in the nth fragment page slot.
@param[in,out]  inode   segment inode
@param[in,out]  iblock  segment inode page
@param[in]      n       slot index
@param[in]      page_no page number to set
@param[in,out]  mtr     mini-transaction */
inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
                                      ulint n, ulint page_no, mtr_t *mtr)
{
  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
  ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));

  fseg_inode_t *const slot= inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE;
  mtr->write<4>(*iblock, slot, page_no);
}

1568
/**********************************************************************//**
1569
Finds a fragment page slot which is free.
1570
@return slot index; ULINT_UNDEFINED if none found */
osku's avatar
osku committed
1571 1572 1573 1574
static
ulint
fseg_find_free_frag_page_slot(
/*==========================*/
1575
	fseg_inode_t*	inode)	/*!< in: segment inode */
osku's avatar
osku committed
1576 1577 1578 1579 1580
{
	ulint	i;
	ulint	page_no;

	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
1581
		page_no = fseg_get_nth_frag_page_no(inode, i);
osku's avatar
osku committed
1582 1583 1584 1585 1586 1587 1588 1589 1590 1591

		if (page_no == FIL_NULL) {

			return(i);
		}
	}

	return(ULINT_UNDEFINED);
}

1592
/**********************************************************************//**
1593
Finds a fragment page slot which is used and last in the array.
1594
@return slot index; ULINT_UNDEFINED if none found */
osku's avatar
osku committed
1595 1596 1597 1598
static
ulint
fseg_find_last_used_frag_page_slot(
/*===============================*/
1599
	fseg_inode_t*	inode)	/*!< in: segment inode */
osku's avatar
osku committed
1600 1601 1602 1603 1604
{
	ulint	i;
	ulint	page_no;

	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
1605
		page_no = fseg_get_nth_frag_page_no(
1606
			inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
osku's avatar
osku committed
1607 1608 1609 1610 1611 1612 1613 1614 1615 1616

		if (page_no != FIL_NULL) {

			return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
		}
	}

	return(ULINT_UNDEFINED);
}

1617 1618
/** Calculate reserved fragment page slots.
@param inode  file segment index
1619
@return number of fragment pages */
1620
static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode)
osku's avatar
osku committed
1621 1622 1623 1624 1625
{
	ulint	i;
	ulint	count	= 0;

	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
1626
		if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) {
osku's avatar
osku committed
1627 1628 1629 1630 1631 1632 1633
			count++;
		}
	}

	return(count);
}

Marko Mäkelä's avatar
Marko Mäkelä committed
1634 1635 1636 1637
/** Create a new segment.
@param space                tablespace
@param byte_offset          byte offset of the created segment header
@param mtr                  mini-transaction
1638
@param err                  error code
Marko Mäkelä's avatar
Marko Mäkelä committed
1639 1640 1641 1642
@param has_done_reservation whether fsp_reserve_free_extents() was invoked
@param block                block where segment header is placed,
                            or NULL to allocate an additional page for that
@return the block where the segment header is placed, x-latched
1643
@retval nullptr if could not create segment */
1644
buf_block_t*
1645
fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
Marko Mäkelä's avatar
Marko Mäkelä committed
1646
            bool has_done_reservation, buf_block_t *block)
osku's avatar
osku committed
1647 1648
{
	fseg_inode_t*	inode;
1649
	ib_id_t		seg_id;
1650
	uint32_t	n_reserved;
1651
	bool		reserved_extent = false;
osku's avatar
osku committed
1652

1653
	DBUG_ENTER("fseg_create");
1654

osku's avatar
osku committed
1655
	ut_ad(mtr);
Marko Mäkelä's avatar
Marko Mäkelä committed
1656
	ut_ad(byte_offset >= FIL_PAGE_DATA);
1657
	ut_ad(byte_offset + FSEG_HEADER_SIZE
1658
	      <= srv_page_size - FIL_PAGE_DATA_END);
Monty's avatar
Monty committed
1659
	buf_block_t* iblock= 0;
osku's avatar
osku committed
1660

1661
	mtr->x_lock_space(space);
1662
	ut_d(space->modify_check(*mtr));
1663

1664
	ut_ad(!block || block->page.id().space() == space->id);
osku's avatar
osku committed
1665

1666
	buf_block_t* header = fsp_get_header(space, mtr, err);
1667
	if (!header) {
1668
		block = nullptr;
1669 1670 1671
		goto funct_exit;
	}

1672
inode_alloc:
1673
	inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err);
osku's avatar
osku committed
1674

Marko Mäkelä's avatar
Marko Mäkelä committed
1675
	if (!inode) {
1676
		block = nullptr;
1677 1678
reserve_extent:
		if (!has_done_reservation && !reserved_extent) {
Marko Mäkelä's avatar
Marko Mäkelä committed
1679 1680 1681 1682
			*err = fsp_reserve_free_extents(&n_reserved, space, 2,
							FSP_NORMAL, mtr);
			if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
				DBUG_RETURN(nullptr);
1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695
			}

			/* Extents reserved successfully. So
			try allocating the page or inode */
			reserved_extent = true;
			if (inode) {
				goto page_alloc;
			}

			goto inode_alloc;
		}

		if (inode) {
Marko Mäkelä's avatar
Marko Mäkelä committed
1696
			fsp_free_seg_inode(space, inode, iblock, mtr);
1697
		}
osku's avatar
osku committed
1698 1699 1700 1701 1702 1703
		goto funct_exit;
	}

	/* Read the next segment id from space header and increment the
	value in space header */

1704
	seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID
1705
				  + header->page.frame);
osku's avatar
osku committed
1706

1707 1708
	mtr->write<8>(*header,
		      FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame,
1709 1710
		      seg_id + 1);
	mtr->write<8>(*iblock, inode + FSEG_ID, seg_id);
1711
	ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED));
osku's avatar
osku committed
1712

1713 1714 1715
	flst_init(*iblock, inode + FSEG_FREE, mtr);
	flst_init(*iblock, inode + FSEG_NOT_FULL, mtr);
	flst_init(*iblock, inode + FSEG_FULL, mtr);
osku's avatar
osku committed
1716

1717
	mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4);
1718 1719
	compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4);
	compile_time_assert(FIL_NULL == 0xffffffff);
1720 1721
	mtr->memset(iblock,
		    uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR,
1722
		    FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff);
osku's avatar
osku committed
1723

1724
	if (!block) {
1725
page_alloc:
1726
		block = fseg_alloc_free_page_low(space,
1727
						 inode, iblock, 0, FSP_UP,
1728
#ifdef UNIV_DEBUG
1729
						 has_done_reservation,
1730
#endif /* UNIV_DEBUG */
1731
						 mtr, mtr, err);
1732

1733
		if (!block) {
Marko Mäkelä's avatar
Marko Mäkelä committed
1734
			ut_ad(!has_done_reservation);
1735
			goto reserve_extent;
osku's avatar
osku committed
1736 1737
		}

1738 1739
		ut_d(const auto x = block->page.lock.x_lock_count());
		ut_ad(x || block->page.lock.not_recursive());
1740
		ut_ad(x <= 2);
1741 1742
		ut_ad(!fil_page_get_type(block->page.frame));
		mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
1743
			      FIL_PAGE_TYPE_SYS);
1744
	}
osku's avatar
osku committed
1745

1746
	mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET
1747
		      + block->page.frame, page_offset(inode));
osku's avatar
osku committed
1748

1749
	mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO
1750
		      + block->page.frame, iblock->page.id().page_no());
osku's avatar
osku committed
1751

1752
	mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE
1753
				       + block->page.frame, space->id);
osku's avatar
osku committed
1754 1755

funct_exit:
1756
	if (!has_done_reservation && reserved_extent) {
1757
		space->release_free_extents(n_reserved);
osku's avatar
osku committed
1758
	}
1759

1760
	DBUG_RETURN(block);
osku's avatar
osku committed
1761 1762
}

1763
/**********************************************************************//**
osku's avatar
osku committed
1764
Calculates the number of pages reserved by a segment, and how many pages are
1765
currently used.
1766
@return number of reserved pages */
osku's avatar
osku committed
1767 1768 1769 1770
static
ulint
fseg_n_reserved_pages_low(
/*======================*/
1771 1772
	const fseg_inode_t*	inode,	/*!< in: segment inode */
	ulint*		used)	/*!< out: number of pages used (not
1773
				more than reserved) */
osku's avatar
osku committed
1774
{
1775 1776
	*used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)
		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL)
1777
		+ fseg_get_n_frag_pages(inode);
osku's avatar
osku committed
1778

1779
	return fseg_get_n_frag_pages(inode)
1780 1781 1782
		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE)
		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL)
		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL);
osku's avatar
osku committed
1783 1784
}

1785 1786 1787 1788 1789 1790
/** Calculate the number of pages reserved by a segment,
and how many pages are currently used.
@param[in]      block   buffer block containing the file segment header
@param[in]      header  file segment header
@param[out]     used    number of pages that are used (not more than reserved)
@param[in,out]  mtr     mini-transaction
1791
@return number of reserved pages */
1792 1793 1794
ulint fseg_n_reserved_pages(const buf_block_t &block,
                            const fseg_header_t *header, ulint *used,
                            mtr_t *mtr)
osku's avatar
osku committed
1795
{
1796
  ut_ad(page_align(header) == block.page.frame);
1797 1798 1799 1800 1801 1802
  buf_block_t *iblock;
  if (fseg_inode_t *inode=
      fseg_inode_try_get(header, block.page.id().space(), block.zip_size(),
                         mtr, &iblock))
    return fseg_n_reserved_pages_low(inode, used);
  return *used= 0;
osku's avatar
osku committed
1803 1804
}

1805
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Tries to fill the free list of a segment with consecutive free extents.
This happens if the segment is big enough to allow extents in the free list,
the free list is empty, and the extents can be allocated consecutively from
the hint onward. At most FSEG_FREE_LIST_MAX_LEN extents are added; the loop
stops at the first extent that is not in the XDES_FREE state.
@param[in]	inode	segment inode
@param[in,out]	iblock	segment inode page
@param[in]	space	tablespace
@param[in]	hint	hint which extent would be good as the first extent
@param[in,out]	mtr	mini-transaction
@return error code
@retval DB_CORRUPTION if the inode does not carry the FSEG magic number */
static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
                                   buf_block_t *iblock, fil_space_t *space,
                                   uint32_t hint, mtr_t *mtr)
{
  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
  ut_d(space->modify_check(*mtr));

  ulint n_used;
  const ulint n_reserved= fseg_n_reserved_pages_low(inode, &n_used);

  if (n_reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE)
  {
    /* The segment is too small to allow extents in free list */
    return DB_SUCCESS;
  }

  if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
  {
    space->set_corrupted();
    return DB_CORRUPTION;
  }

  if (flst_get_len(inode + FSEG_FREE) > 0)
  {
    /* Free list is not empty */
    return DB_SUCCESS;
  }

  for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE)
  {
    buf_block_t *xdes;
    dberr_t err;
    xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes);

    if (!descr || XDES_FREE != xdes_get_state(descr))
    {
      /* We cannot allocate the desired extent: stop */
      return err;
    }

    descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err);
    if (UNIV_UNLIKELY(!descr))
      return err;

    /* Offsets of the list base node within the inode page and of the
    list node within the extent descriptor page */
    const uint16_t base_offset=
      static_cast<uint16_t>(inode - iblock->page.frame + FSEG_FREE);
    const uint16_t node_offset=
      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);

    err= flst_add_last(iblock, base_offset, xdes, node_offset, mtr);
    if (err)
      return err;

    /* Hand the extent over to this segment. */
    xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
    mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
  }

  return DB_SUCCESS;
}

1866
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocates a free extent for the segment: looks first in the free list of
the segment, then tries to allocate from the space free list.
NOTE that the extent returned still resides in the segment free list, it is
not yet taken off it!
When the extent had to be taken from the space free list, it is marked
XDES_FSEG, tagged with the segment id, appended to the segment free list,
and fseg_fill_free_list() is invoked for the extents that follow it.
@param[in]	inode		segment inode
@param[in,out]	iblock		segment inode page
@param[out]	xdes		extent descriptor page
@param[in,out]	space		tablespace
@param[in,out]	mtr		mini-transaction
@param[out]	err		error code
@return the extent descriptor
@retval nullptr	if no page could be allocated */
static
xdes_t*
fseg_alloc_free_extent(
	const fseg_inode_t*	inode,
	buf_block_t*		iblock,
	buf_block_t**		xdes,
	fil_space_t*		space,
	mtr_t*			mtr,
	dberr_t*		err)
{
  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
  ut_d(space->modify_check(*mtr));

  if (flst_get_len(inode + FSEG_FREE))
  {
    /* Segment free list is not empty, allocate from it */
    return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
                                   mtr, xdes, err);
  }

  xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
  if (UNIV_UNLIKELY(!descr))
    return descr;
  xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
  mtr->memcpy<mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID, inode + FSEG_ID, 8);
  *err= flst_add_last(iblock,
                      static_cast<uint16_t>(inode - iblock->page.frame +
                                            FSEG_FREE), *xdes,
                      static_cast<uint16_t>(descr - (*xdes)->page.frame +
                                            XDES_FLST_NODE), mtr);
  /* A failure is the exceptional outcome; hint the branch accordingly. */
  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
    return nullptr;
  /* Try to fill the segment free list */
  *err= fseg_fill_free_list(inode, iblock, space,
                            xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr);
  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
    return nullptr;

  return descr;
}

1920 1921 1922 1923 1924
/** Allocates a single free page from a segment.
This function implements the intelligent allocation strategy which tries to
minimize file space fragmentation.
@param[in,out]	space			tablespace
@param[in,out]	seg_inode		segment inode
1925
@param[in,out]	iblock			segment inode page
1926 1927 1928 1929 1930 1931
@param[in]	hint			hint of which page would be desirable
@param[in]	direction		if the new page is needed because of
an index page split, and records are inserted there in order, into which
direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
@param[in,out]	mtr			mini-transaction
@param[in,out]	init_mtr		mtr or another mini-transaction in
1932
which the page should be initialized.
1933 1934 1935
@param[out]	err			error code
@return the allocated page
@retval nullptr	if no page could be allocated */
osku's avatar
osku committed
1936
static
1937
buf_block_t*
osku's avatar
osku committed
1938
fseg_alloc_free_page_low(
1939 1940
	fil_space_t*		space,
	fseg_inode_t*		seg_inode,
1941
	buf_block_t*		iblock,
1942
	uint32_t		hint,
1943 1944
	byte			direction,
#ifdef UNIV_DEBUG
1945 1946
	bool			has_done_reservation,
	/*!< whether the space has already been reserved */
1947
#endif /* UNIV_DEBUG */
1948
	mtr_t*			mtr,
1949 1950
	mtr_t*			init_mtr,
	dberr_t*		err)
osku's avatar
osku committed
1951
{
1952
	ib_id_t		seg_id;
osku's avatar
osku committed
1953 1954
	ulint		used;
	ulint		reserved;
1955
	xdes_t*		descr;		/*!< extent of the hinted page */
1956
	uint32_t	ret_page;	/*!< the allocated page offset, FIL_NULL
osku's avatar
osku committed
1957
					if could not be allocated */
1958
	xdes_t*		ret_descr;	/*!< the extent of the allocated page */
1959
	buf_block_t*	xdes;
osku's avatar
osku committed
1960
	ulint		n;
1961

osku's avatar
osku committed
1962
	ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
1963
	ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
1964
	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
1965
	seg_id = mach_read_from_8(seg_inode + FSEG_ID);
osku's avatar
osku committed
1966

1967
	ut_ad(seg_id);
1968
	ut_d(space->modify_check(*mtr));
1969
	ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE);
1970

1971
	reserved = fseg_n_reserved_pages_low(seg_inode, &used);
1972

1973
	buf_block_t* header = fsp_get_header(space, mtr, err);
1974
	if (!header) {
1975
		return header;
1976
	}
osku's avatar
osku committed
1977

1978
	descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr,
1979
						   err, &xdes);
1980
	if (!descr) {
1981 1982 1983
		if (*err != DB_SUCCESS) {
			return nullptr;
		}
osku's avatar
osku committed
1984 1985
		/* Hint outside space or too high above free limit: reset
		hint */
1986
		/* The file space header page is always allocated. */
osku's avatar
osku committed
1987
		hint = 0;
1988
		descr = xdes_get_descriptor(space, hint, mtr, err, &xdes);
1989 1990 1991
		if (!descr) {
			return nullptr;
		}
osku's avatar
osku committed
1992
	}
1993

osku's avatar
osku committed
1994
	/* In the big if-else below we look for ret_page and ret_descr */
1995
	/*-------------------------------------------------------------*/
1996
	if ((xdes_get_state(descr) == XDES_FSEG)
1997
	    && mach_read_from_8(descr + XDES_ID) == seg_id
1998
	    && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) {
1999
take_hinted_page:
osku's avatar
osku committed
2000 2001 2002 2003
		/* 1. We can take the hinted page
		=================================*/
		ret_descr = descr;
		ret_page = hint;
2004 2005 2006 2007
		/* Skip the check for extending the tablespace. If the
		page hint were not within the size of the tablespace,
		we would have got (descr == NULL) above and reset the hint. */
		goto got_hinted_page;
2008
		/*-----------------------------------------------------------*/
2009
	} else if (xdes_get_state(descr) == XDES_FREE
2010 2011
		   && reserved - used < reserved / FSEG_FILLFACTOR
		   && used >= FSEG_FRAG_LIMIT) {
osku's avatar
osku committed
2012 2013 2014 2015 2016

		/* 2. We allocate the free extent from space and can take
		=========================================================
		the hinted page
		===============*/
2017 2018
		ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
						  mtr, err);
osku's avatar
osku committed
2019

2020 2021 2022 2023 2024 2025
		if (UNIV_UNLIKELY(ret_descr != descr)) {
			if (*err != DB_SUCCESS) {
				*err = DB_CORRUPTION;
			}
			return nullptr;
		}
2026

2027
		xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
2028 2029
		mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
					       seg_id);
2030 2031 2032 2033 2034 2035 2036 2037 2038 2039
		*err = flst_add_last(
			iblock,
			static_cast<uint16_t>(seg_inode - iblock->page.frame
					      + FSEG_FREE), xdes,
			static_cast<uint16_t>(ret_descr
					      - xdes->page.frame
					      + XDES_FLST_NODE), mtr);
		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
			return nullptr;
		}
osku's avatar
osku committed
2040 2041

		/* Try to fill the segment free list */
2042 2043 2044 2045 2046
		*err = fseg_fill_free_list(seg_inode, iblock, space,
					   hint + FSP_EXTENT_SIZE, mtr);
		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
			return nullptr;
		}
2047
		goto take_hinted_page;
2048
		/*-----------------------------------------------------------*/
osku's avatar
osku committed
2049 2050 2051
	} else if ((direction != FSP_NO_DIR)
		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
		   && (used >= FSEG_FRAG_LIMIT)
2052 2053 2054
		   && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
							  &xdes, space,
							  mtr, err))) {
osku's avatar
osku committed
2055 2056 2057 2058 2059 2060
		/* 3. We take any free extent (which was already assigned above
		===============================================================
		in the if-condition to ret_descr) and take the lowest or
		========================================================
		highest page in it, depending on the direction
		==============================================*/
2061
		ret_page = xdes_get_offset(ret_descr);
osku's avatar
osku committed
2062 2063 2064 2065

		if (direction == FSP_DOWN) {
			ret_page += FSP_EXTENT_SIZE - 1;
		}
2066
		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
2067
		/*-----------------------------------------------------------*/
2068 2069
	} else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
		return nullptr;
2070
	} else if ((xdes_get_state(descr) == XDES_FSEG)
2071
		   && mach_read_from_8(descr + XDES_ID) == seg_id
2072
		   && (!xdes_is_full(descr))) {
osku's avatar
osku committed
2073 2074 2075 2076 2077 2078 2079 2080

		/* 4. We can take the page from the same extent as the
		======================================================
		hinted page (and the extent already belongs to the
		==================================================
		segment)
		========*/
		ret_descr = descr;
2081 2082 2083 2084 2085 2086
		ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
		if (ret_page == FIL_NULL) {
			ut_ad(!has_done_reservation);
		} else {
			ret_page += xdes_get_offset(ret_descr);
		}
2087
		/*-----------------------------------------------------------*/
osku's avatar
osku committed
2088 2089 2090 2091 2092
	} else if (reserved - used > 0) {
		/* 5. We take any unused page from the segment
		==============================================*/
		fil_addr_t	first;

2093
		if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
2094
			first = flst_get_first(seg_inode + FSEG_NOT_FULL);
2095
		} else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
2096
			first = flst_get_first(seg_inode + FSEG_FREE);
osku's avatar
osku committed
2097
		} else {
2098
			ut_ad(!has_done_reservation);
2099
			return(NULL);
osku's avatar
osku committed
2100 2101
		}

2102 2103 2104 2105 2106
		ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
		if (!ret_descr) {
			return nullptr;
		}

2107 2108 2109 2110 2111 2112
		ret_page = xdes_find_free(ret_descr);
		if (ret_page == FIL_NULL) {
			ut_ad(!has_done_reservation);
		} else {
			ret_page += xdes_get_offset(ret_descr);
		}
2113
		/*-----------------------------------------------------------*/
osku's avatar
osku committed
2114 2115 2116
	} else if (used < FSEG_FRAG_LIMIT) {
		/* 6. We allocate an individual page from the space
		===================================================*/
2117
		buf_block_t* block = fsp_alloc_free_page(
2118
			space, hint, mtr, init_mtr, err);
2119

2120
		ut_ad(block || !has_done_reservation || *err);
2121

Marko Mäkelä's avatar
Marko Mäkelä committed
2122
		if (block) {
osku's avatar
osku committed
2123 2124
			/* Put the page in the fragment page array of the
			segment */
2125
			n = fseg_find_free_frag_page_slot(seg_inode);
2126 2127 2128 2129
			if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
				*err = DB_CORRUPTION;
				return nullptr;
			}
osku's avatar
osku committed
2130

2131
			fseg_set_nth_frag_page_no(
2132 2133
				seg_inode, iblock, n,
				block->page.id().page_no(), mtr);
osku's avatar
osku committed
2134
		}
2135 2136 2137 2138

		/* fsp_alloc_free_page() invoked fsp_init_file_page()
		already. */
		return(block);
2139
		/*-----------------------------------------------------------*/
osku's avatar
osku committed
2140 2141 2142
	} else {
		/* 7. We allocate a new extent and take its first page
		======================================================*/
2143
		ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
2144
						   space, mtr, err);
osku's avatar
osku committed
2145

2146 2147 2148
		if (!ret_descr) {
			ut_ad(!has_done_reservation || *err);
			return nullptr;
osku's avatar
osku committed
2149 2150
		} else {
			ret_page = xdes_get_offset(ret_descr);
2151
		}
osku's avatar
osku committed
2152
	}
2153

osku's avatar
osku committed
2154 2155
	if (ret_page == FIL_NULL) {
		/* Page could not be allocated */
2156

2157
		ut_ad(!has_done_reservation);
2158
		return(NULL);
osku's avatar
osku committed
2159 2160
	}

2161
	if (space->size <= ret_page && !is_predefined_tablespace(space->id)) {
2162 2163
		/* It must be that we are extending a single-table
		tablespace whose size is still < 64 pages */
2164

2165
		if (ret_page >= FSP_EXTENT_SIZE) {
2166 2167 2168 2169 2170 2171
			sql_print_error("InnoDB: Trying to extend '%s'"
					" by single page(s) though the"
					" space size " UINT32PF "."
					" Page no " UINT32PF ".",
					space->chain.start->name, space->size,
					ret_page);
2172 2173 2174 2175 2176
			ut_ad(!has_done_reservation);
			return(NULL);
		}

		if (!fsp_try_extend_data_file_with_pages(
2177
			    space, ret_page, header, mtr)) {
2178 2179 2180
			/* No disk space left */
			ut_ad(!has_done_reservation);
			return(NULL);
osku's avatar
osku committed
2181 2182 2183
		}
	}

2184 2185 2186 2187
got_hinted_page:
	/* ret_descr == NULL if the block was allocated from free_frag
	(XDES_FREE_FRAG) */
	if (ret_descr != NULL) {
osku's avatar
osku committed
2188 2189 2190
		/* At this point we know the extent and the page offset.
		The extent is still in the appropriate list (FSEG_NOT_FULL
		or FSEG_FREE), and the page is not yet marked as used. */
2191

2192
		ut_d(buf_block_t* xxdes);
2193
		ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes)
2194 2195
		      == ret_descr);
		ut_ad(xdes == xxdes);
2196
		ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
2197

2198 2199 2200 2201 2202
		*err = fseg_mark_page_used(seg_inode, iblock, ret_page,
                                           ret_descr, xdes, mtr);
		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
			return nullptr;
		}
osku's avatar
osku committed
2203 2204
	}

Marko Mäkelä's avatar
Marko Mäkelä committed
2205
	return fsp_page_create(space, ret_page, init_mtr);
osku's avatar
osku committed
2206 2207
}

2208
/**********************************************************************//**
osku's avatar
osku committed
2209 2210
Allocates a single free page from a segment. This function implements
the intelligent allocation strategy which tries to minimize file space
2211
fragmentation.
2212
@retval NULL if no page could be allocated */
2213
buf_block_t*
osku's avatar
osku committed
2214 2215
fseg_alloc_free_page_general(
/*=========================*/
2216
	fseg_header_t*	seg_header,/*!< in/out: segment header */
2217
	uint32_t	hint,	/*!< in: hint of which page would be
2218
				desirable */
2219
	byte		direction,/*!< in: if the new page is needed because
osku's avatar
osku committed
2220 2221 2222 2223
				of an index page split, and records are
				inserted there in order, into which
				direction they go alphabetically: FSP_DOWN,
				FSP_UP, FSP_NO_DIR */
2224
	bool		has_done_reservation, /*!< in: true if the caller has
osku's avatar
osku committed
2225 2226 2227 2228
				already done the reservation for the page
				with fsp_reserve_free_extents, then there
				is no need to do the check for this individual
				page */
2229
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
2230
	mtr_t*		init_mtr,/*!< in/out: mtr or another mini-transaction
2231
				in which the page should be initialized. */
2232
	dberr_t*	err)	/*!< out: error code */
osku's avatar
osku committed
2233 2234
{
	fseg_inode_t*	inode;
2235 2236
	fil_space_t*	space;
	buf_block_t*	iblock;
2237
	buf_block_t*	block;
2238
	uint32_t	n_reserved;
osku's avatar
osku committed
2239

2240
	const uint32_t space_id = page_get_space_id(page_align(seg_header));
2241
	space = mtr->x_lock_space(space_id);
2242 2243 2244 2245 2246
	inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(),
				   mtr, &iblock, err);
	if (!inode) {
		return nullptr;
	}
2247 2248 2249
	if (!space->full_crc32()) {
		fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
	}
osku's avatar
osku committed
2250

2251 2252 2253 2254 2255 2256
	if (!has_done_reservation) {
		*err = fsp_reserve_free_extents(&n_reserved, space, 2,
						FSP_NORMAL, mtr);
		if (*err != DB_SUCCESS) {
			return nullptr;
		}
osku's avatar
osku committed
2257 2258
	}

2259
	block = fseg_alloc_free_page_low(space,
2260
					 inode, iblock, hint, direction,
2261
#ifdef UNIV_DEBUG
2262
					 has_done_reservation,
2263
#endif /* UNIV_DEBUG */
2264
					 mtr, init_mtr, err);
2265 2266 2267

	/* The allocation cannot fail if we have already reserved a
	space for the page. */
2268
	ut_ad(block || !has_done_reservation || *err);
2269

osku's avatar
osku committed
2270
	if (!has_done_reservation) {
2271
		space->release_free_extents(n_reserved);
osku's avatar
osku committed
2272 2273
	}

2274
	return(block);
2275 2276
}

2277
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Check that we have at least n_pages frag pages free in the first extent
of a single-table tablespace, and they are also physically initialized to
the data file. That is we have already extended the data file so that those
pages are inside the data file. If not, this function extends the tablespace
with pages.
@param[in,out]	space	tablespace
@param[in,out]	header	tablespace header, x-latched
@param[in]	size	tablespace size in pages, less than FSP_EXTENT_SIZE
@param[in,out]	mtr	mini-transaction
@param[in]	n_pages	number of pages to reserve
@return error code
@retval DB_CORRUPTION if the first extent claims more used pages than size
@retval DB_OUT_OF_FILE_SPACE if the data file could not be extended */
static
dberr_t
fsp_reserve_free_pages(
	fil_space_t*	space,
	buf_block_t*	header,
	ulint		size,
	mtr_t*		mtr,
	uint32_t	n_pages)
{
  ut_ad(space != fil_system.sys_space && space != fil_system.temp_space);
  ut_ad(size < FSP_EXTENT_SIZE);

  dberr_t err= DB_OUT_OF_FILE_SPACE;
  const xdes_t *first_extent=
    xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err);
  if (!first_extent)
    return err;

  const uint32_t n_used= xdes_get_n_used(first_extent);
  const uint32_t n_needed= n_used + n_pages;

  if (size >= n_needed)
  {
    /* The data file already covers the requested pages. */
    return DB_SUCCESS;
  }

  if (n_used > size)
    return DB_CORRUPTION;

  /* Extend the file so that page number n_needed - 1 exists. */
  if (fsp_try_extend_data_file_with_pages(space, n_needed - 1, header, mtr))
    return DB_SUCCESS;

  return DB_OUT_OF_FILE_SPACE;
}

2317
/** Reserve free extents in a tablespace. Every mini-transaction that may
use several pages from the tablespace should call this function beforehand
and reserve enough free extents so that it certainly will be able
to do its operation, like a B-tree page split, fully. Reservations
must be released with function fil_space_t::release_free_extents()!

Meaning of alloc_type: FSP_NORMAL is an operation which will probably
result in more space usage, like an insert in a B-tree; FSP_UNDO is an
allocation to undo logs: if we are deleting rows, then this allocation
will in the long run result in less space usage (after a purge);
FSP_CLEANING is an allocation done in a physical record delete (like in
a purge) or other cleaning operation which will result in less space
usage in the long run. FSP_NORMAL keeps 2 extents + 1 % of the extents
in the file untouched and FSP_UNDO keeps 1 extent + 0.5 %; FSP_CLEANING
and FSP_BLOB keep nothing back. Thus, when space is scarce, FSP_NORMAL
requests stop succeeding first. The purpose is to avoid a dead end where
the database is full but the user cannot free any space because the
freeing operations temporarily reserve some space.

Tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
case. In this function we would liberally reserve several extents for
every page split or merge in a B-tree. But we do not want to waste disk space
if the table only occupies < FSP_EXTENT_SIZE pages. That is why, when
additionally n_pages < FSP_EXTENT_SIZE / 2, the request is delegated to
fsp_reserve_free_pages(), which just ensures that there are n_pages
free pages available.

@param[out]	n_reserved	number of extents actually reserved; set to
				0 when the small-tablespace rule is applied,
				otherwise it is n_ext
@param[in,out]	space		tablespace
@param[in]	n_ext		number of extents to reserve
@param[in]	alloc_type	page reservation type (FSP_BLOB, etc)
@param[in,out]	mtr		the mini transaction
@param[in]	n_pages		for small tablespaces (tablespace size is
				less than FSP_EXTENT_SIZE), number of free
				pages to reserve.
@return error code
@retval DB_SUCCESS if we were able to make the reservation
@retval DB_OUT_OF_FILE_SPACE if the reservation failed and the data file
could not be extended */
dberr_t
fsp_reserve_free_extents(
	uint32_t*	n_reserved,
	fil_space_t*	space,
	uint32_t	n_ext,
	fsp_reserve_t	alloc_type,
	mtr_t*		mtr,
	uint32_t	n_pages)
{
	ut_ad(mtr);
	*n_reserved = n_ext;

	const uint32_t extent_size = FSP_EXTENT_SIZE;

	mtr->x_lock_space(space);
	const unsigned physical_size = space->physical_size();

	dberr_t err;
	buf_block_t* header = fsp_get_header(space, mtr, &err);
	if (!header) {
		return err;
	}

	/* Each round re-reads the header fields; a new round is started
	only after fsp_try_extend_data_file() reported success. */
	do {
		const uint32_t size = mach_read_from_4(
			FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame);
		ut_ad(size == space->size_in_header);

		if (size < extent_size && n_pages < extent_size / 2) {
			/* Use different rules for small single-table
			tablespaces */
			*n_reserved = 0;
			return fsp_reserve_free_pages(space, header, size,
						      mtr, n_pages);
		}

		const uint32_t n_free_list_ext = flst_get_len(
			FSP_HEADER_OFFSET + FSP_FREE + header->page.frame);
		ut_ad(space->free_len == n_free_list_ext);

		const uint32_t free_limit = mach_read_from_4(
			FSP_HEADER_OFFSET + FSP_FREE_LIMIT
			+ header->page.frame);
		ut_ad(space->free_limit == free_limit);

		/* Below we play safe when counting free extents above
		the free limit: some of them will contain extent
		descriptor pages, and therefore will not be free extents */
		uint32_t n_free_up = 0;

		if (size >= free_limit) {
			n_free_up = (size - free_limit) / extent_size;
			if (n_free_up) {
				n_free_up--;
				n_free_up -= n_free_up
					/ (physical_size / extent_size);
			}
		} else {
			ut_ad(alloc_type == FSP_BLOB);
		}

		const uint32_t n_free = n_free_list_ext + n_free_up;

		/* Number of extents that this allocation type must
		leave untouched, and whether that limit applies at all */
		ulint reserve = 0;
		bool keep_reserve = true;

		switch (alloc_type) {
		case FSP_NORMAL:
			/* We reserve 1 extent + 0.5 % of the space size
			to undo logs and 1 extent + 0.5 % to cleaning
			operations; NOTE: this source code is duplicated
			in the function below! */
			reserve = 2 + ((size / extent_size) * 2) / 200;
			break;
		case FSP_UNDO:
			/* We reserve 0.5 % of the space size to cleaning
			operations */
			reserve = 1 + ((size / extent_size) * 1) / 200;
			break;
		case FSP_CLEANING:
		case FSP_BLOB:
			keep_reserve = false;
			break;
		default:
			ut_error;
		}

		if ((!keep_reserve || n_free > reserve + n_ext)
		    && space->reserve_free_extents(n_free, n_ext)) {
			return DB_SUCCESS;
		}
	} while (fsp_try_extend_data_file(space, header, mtr));

	return DB_OUT_OF_FILE_SPACE;
}

2462
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Frees a single page of a segment.
@param[in]	seg_inode	segment inode
@param[in,out]	iblock		block whose page frame contains seg_inode
@param[in,out]	space		tablespace
@param[in]	offset		page number
@param[in,out]	mtr		mini-transaction
@param[in]	ahi		Drop adaptive hash index
@return error code
@retval DB_CORRUPTION if the page is already marked free, if a fragment
page is not found in the fragment array of the segment, if the extent
does not belong to the segment, or if the usage counter of the
not-full list would underflow */
static
dberr_t
fseg_free_page_low(
	fseg_inode_t*		seg_inode,
	buf_block_t*		iblock,
	fil_space_t*		space,
	uint32_t		offset,
	mtr_t*			mtr
#ifdef BTR_CUR_HASH_ADAPT
	,bool			ahi=false
#endif /* BTR_CUR_HASH_ADAPT */
	)
{
	ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
	ut_ad(iblock->page.frame == page_align(seg_inode));
	ut_d(space->modify_check(*mtr));

#ifdef BTR_CUR_HASH_ADAPT
	if (ahi) {
		btr_search_drop_page_hash_when_freed(
			page_id_t(space->id, offset));
	}
#endif /* BTR_CUR_HASH_ADAPT */

	const uint32_t extent_size = FSP_EXTENT_SIZE;
	ut_ad(ut_is_2pow(extent_size));
	buf_block_t* xdes;
	dberr_t err;
	xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes);

	if (!descr) {
		return err;
	}
	if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) {
corrupted:
		space->set_corrupted();
		return DB_CORRUPTION;
	}

	if (xdes_get_state(descr) != XDES_FSEG) {
		/* The page is in the fragment pages of the segment */
		for (ulint i = 0;; i++) {
			if (UNIV_UNLIKELY(i >= FSEG_FRAG_ARR_N_SLOTS)) {
				/* No fragment slot refers to the page.
				Stop here instead of reading beyond the
				end of the fragment array. */
				goto corrupted;
			}

			if (fseg_get_nth_frag_page_no(seg_inode, i)
			    != offset) {
				continue;
			}

			/* Reset the slot to FIL_NULL */
			compile_time_assert(FIL_NULL == 0xffffffff);
			mtr->memset(iblock, uint16_t(seg_inode
						     - iblock->page.frame)
				    + FSEG_FRAG_ARR
				    + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
			break;
		}

		return fsp_free_page(space, offset, mtr);
	}

	/* If we get here, the page is in some extent of the segment */

	if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) {
		goto corrupted;
	}

	byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
	uint32_t not_full_n_used = mach_read_from_4(p_not_full);
	const uint16_t xoffset= uint16_t(descr - xdes->page.frame
					 + XDES_FLST_NODE);
	const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);

	if (xdes_is_full(descr)) {
		/* The fragment is full: move it to another list */
		err = flst_remove(iblock,
				  static_cast<uint16_t>(FSEG_FULL + ioffset),
				  xdes, xoffset, mtr);
		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			return err;
		}
		err = flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
								  + ioffset),
				    xdes, xoffset, mtr);
		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			return err;
		}
		/* The extent joins the not-full list with all but the
		page being freed in use */
		not_full_n_used += extent_size - 1;
	} else {
		if (!not_full_n_used) {
			goto corrupted;
		}
		not_full_n_used--;
	}

	mtr->write<4>(*iblock, p_not_full, not_full_n_used);
	xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr);

	if (!xdes_get_n_used(descr)) {
		/* No page of the extent remains in use: return the
		extent to the tablespace */
		err = flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
								+ ioffset),
				  xdes, xoffset, mtr);
		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			return err;
		}
		err = fsp_free_extent(space, offset, mtr);
		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			return err;
		}
	}

	mtr->free(*space, static_cast<uint32_t>(offset));
	return DB_SUCCESS;
}

2583 2584 2585 2586
/** Free a page in a file segment.
@param[in,out]	seg_header	file segment header
@param[in,out]	space		tablespace
@param[in]	offset		page number
@param[in,out]	mtr		mini-transaction
@param[in]	have_latch	whether space->x_lock() was already called;
				if not, the tablespace is X-latched here
@return error code; if the segment inode cannot be obtained, the error
reported by fseg_inode_try_get() */
dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space,
                       uint32_t offset, mtr_t *mtr, bool have_latch)
{
  if (!have_latch)
  {
    mtr->x_lock_space(space);
  }
  else
  {
    ut_ad(space->is_owner());
  }

  DBUG_PRINT("fseg_free_page",
             ("space_id: " ULINTPF ", page_no: %u", space->id, offset));

  dberr_t err;
  buf_block_t *iblock;
  fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header, space->id,
                                              space->zip_size(),
                                              mtr, &iblock, &err);
  if (!seg_inode)
    return err;

  if (!space->full_crc32())
    fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);

  return fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
}

2615 2616 2617 2618 2619 2620 2621
/** Determine whether a page is allocated.
@param space   tablespace
@param page    page number
@return error code
@retval DB_SUCCESS             if the page is marked as free, or if it is
                               not below both space->free_limit and
                               space->size_in_header
@retval DB_SUCCESS_LOCKED_REC  if the page is marked as allocated
@retval DB_CORRUPTION          if the descriptor page is page 0 and its
                               FSP_FREE_LIMIT or FSP_SIZE field disagrees
                               with the fields of space */
dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
{
  const unsigned zip_size= space->zip_size();
  const uint32_t dpage= xdes_calc_descriptor_page(zip_size, page);
  dberr_t err= DB_SUCCESS;
  mtr_t mtr;

  mtr.start();
  if (!space->is_owner())
    mtr.x_lock_space(space);

  if (page < space->free_limit && page < space->size_in_header)
  {
    if (const buf_block_t *b=
        buf_page_get_gen(page_id_t(space->id, dpage), zip_size,
                         RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
                         &mtr, &err))
    {
      /* Only page 0 carries the tablespace header fields that can be
      cross-checked against the in-memory copies. */
      bool header_mismatch= false;
      if (!dpage)
        header_mismatch=
          space->free_limit !=
          mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET +
                           b->page.frame) ||
          space->size_in_header !=
          mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame);

      if (header_mismatch)
        err= DB_CORRUPTION;
      else if (xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE
                            * xdes_calc_descriptor_index(zip_size, page),
                            page & (FSP_EXTENT_SIZE - 1)))
        err= DB_SUCCESS;
      else
        err= DB_SUCCESS_LOCKED_REC;
    }
  }

  mtr.commit();
  return err;
}

2657
MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Free an extent of a segment to the space free list.
@param[in,out]	seg_inode	segment inode
@param[in,out]	iblock		block whose page frame contains seg_inode
@param[in,out]	space		tablespace
@param[in]	page		page number in the extent
@param[in,out]	mtr		mini-transaction
@param[in]	ahi		whether to drop the adaptive hash index of
				the pages that are in use in the extent
@return error code
@retval DB_CORRUPTION if the extent is not in state XDES_FSEG, does not
carry the segment id of seg_inode, the inode lacks its magic number, or
the not-full usage counter is smaller than the extent's usage */
static
dberr_t
fseg_free_extent(
	fseg_inode_t*		seg_inode,
	buf_block_t*		iblock,
	fil_space_t*		space,
	uint32_t		page,
	mtr_t*			mtr
#ifdef BTR_CUR_HASH_ADAPT
	,bool			ahi=false
#endif /* BTR_CUR_HASH_ADAPT */
	)
{
	dberr_t		err;
	buf_block_t*	xdes;
	xdes_t*		descr = xdes_get_descriptor(space, page, mtr,
						    &err, &xdes);

	if (!descr) {
		return err;
	}

	if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG
			  || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)
			  || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N
				    + seg_inode, 4))) {
		return DB_CORRUPTION;
	}
	ut_d(space->modify_check(*mtr));
	const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);

	const uint16_t xoffset= uint16_t(descr - xdes->page.frame
					 + XDES_FLST_NODE);
	const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);

#ifdef BTR_CUR_HASH_ADAPT
	if (ahi) {
		for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
			if (xdes_is_free(descr, i)) {
				continue;
			}
			/* Drop search system page hash index
			if the page is found in the pool and
			is hashed */
			btr_search_drop_page_hash_when_freed(
				page_id_t(space->id,
					  first_page_in_extent + i));
		}
	}
#endif /* BTR_CUR_HASH_ADAPT */

	/* Determine which list of the segment currently holds the extent */
	uint16_t lst;
	bool partly_used = false;

	if (xdes_is_full(descr)) {
		lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
	} else if (!xdes_get_n_used(descr)) {
		lst = static_cast<uint16_t>(FSEG_FREE + ioffset);
	} else {
		lst = static_cast<uint16_t>(FSEG_NOT_FULL + ioffset);
		partly_used = true;
	}

	err = flst_remove(iblock, lst, xdes, xoffset, mtr);
	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
		return err;
	}

	if (partly_used) {
		/* The pages used in this extent no longer count towards
		the not-full list of the segment */
		const uint32_t not_full_n_used = mach_read_from_4(
			FSEG_NOT_FULL_N_USED + seg_inode);
		const uint32_t descr_n_used = xdes_get_n_used(descr);
		if (not_full_n_used < descr_n_used) {
			return DB_CORRUPTION;
		}
		mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
			      not_full_n_used - descr_n_used);
	}

	/* Remember which pages are in use before the extent is freed;
	an index within an extent fits in uint8_t */
	std::vector<uint8_t> going_to_free;
	static_assert(FSP_EXTENT_SIZE_MIN == 256, "compatibility");
	static_assert(FSP_EXTENT_SIZE_MAX == 64, "compatibility");

	for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
		if (xdes_is_free(descr, i)) {
			continue;
		}
		going_to_free.emplace_back(uint8_t(i));
	}

	if (dberr_t free_err = fsp_free_extent(space, page, mtr)) {
		return free_err;
	}

	for (const uint8_t slot : going_to_free) {
		const uint32_t page_no = first_page_in_extent + slot;
		mtr->free(*space, page_no);
		buf_page_free(space, page_no, mtr);
	}

	return DB_SUCCESS;
}

2764 2765 2766 2767 2768 2769 2770 2771 2772
/** Frees part of a segment. This function can be used to free
a segment by repeatedly calling this function in different
mini-transactions. Doing the freeing in a single mini-transaction
might result in too big a mini-transaction.
@param	header	segment header; NOTE: if the header resides on first
		page of the frag list of the segment, this pointer
		becomes obsolete after the last freeing step
@param	mtr	mini-transaction
@param	ahi	Drop the adaptive hash index
2773
@return whether the freeing was completed */
2774
bool
2775
fseg_free_step(
2776 2777 2778 2779 2780 2781
	fseg_header_t*	header,
	mtr_t*		mtr
#ifdef BTR_CUR_HASH_ADAPT
	,bool		ahi
#endif /* BTR_CUR_HASH_ADAPT */
	)
osku's avatar
osku committed
2782 2783 2784 2785
{
	ulint		n;
	fseg_inode_t*	inode;

2786 2787
	const uint32_t space_id = page_get_space_id(page_align(header));
	const uint32_t header_page = page_get_page_no(page_align(header));
2788

2789
	fil_space_t* space = mtr->x_lock_space(space_id);
2790 2791 2792
	xdes_t* descr = xdes_get_descriptor(space, header_page, mtr);

	if (!descr) {
2793
		return true;
2794
	}
osku's avatar
osku committed
2795 2796 2797 2798

	/* Check that the header resides on a page which has not been
	freed yet */

2799 2800 2801 2802 2803 2804
	if (UNIV_UNLIKELY(xdes_is_free(descr,
				       header_page & (FSP_EXTENT_SIZE - 1)))) {
		/* Some corruption was detected: stop the freeing
		in order to prevent a crash. */
		return true;
	}
2805
	buf_block_t* iblock;
2806 2807
	const ulint zip_size = space->zip_size();
	inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock);
2808 2809
	if (!inode || space->is_stopping()) {
		return true;
2810
	}
osku's avatar
osku committed
2811

2812 2813 2814
	if (!space->full_crc32()) {
		fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
	}
osku's avatar
osku committed
2815

2816 2817
	dberr_t err;
	descr = fseg_get_first_extent(inode, space, mtr, &err);
2818

2819
	if (descr) {
osku's avatar
osku committed
2820
		/* Free the extent held by the segment */
2821 2822
		return fseg_free_extent(inode, iblock, space,
					xdes_get_offset(descr), mtr
2823
#ifdef BTR_CUR_HASH_ADAPT
2824
					, ahi
2825
#endif /* BTR_CUR_HASH_ADAPT */
2826 2827 2828 2829 2830
					) != DB_SUCCESS;
	}

	if (err != DB_SUCCESS || space->is_stopping()) {
		return true;
osku's avatar
osku committed
2831 2832 2833
	}

	/* Free a frag page */
2834
	n = fseg_find_last_used_frag_page_slot(inode);
osku's avatar
osku committed
2835 2836 2837

	if (n == ULINT_UNDEFINED) {
		/* Freeing completed: free the segment inode */
2838
		fsp_free_seg_inode(space, inode, iblock, mtr);
2839
		return true;
osku's avatar
osku committed
2840 2841
	}

2842
	uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
2843

2844
	if (fseg_free_page_low(inode, iblock, space, page_no, mtr
2845
#ifdef BTR_CUR_HASH_ADAPT
2846
			       , ahi
2847
#endif /* BTR_CUR_HASH_ADAPT */
2848 2849 2850
			       ) != DB_SUCCESS) {
		return true;
	}
2851

Marko Mäkelä's avatar
Marko Mäkelä committed
2852
	buf_page_free(space, page_no, mtr);
osku's avatar
osku committed
2853

2854
	n = fseg_find_last_used_frag_page_slot(inode);
osku's avatar
osku committed
2855 2856 2857

	if (n == ULINT_UNDEFINED) {
		/* Freeing completed: free the segment inode */
2858
		fsp_free_seg_inode(space, inode, iblock, mtr);
osku's avatar
osku committed
2859

2860
		return true;
osku's avatar
osku committed
2861 2862
	}

2863
	return false;
osku's avatar
osku committed
2864 2865
}

2866
bool
2867
fseg_free_step_not_header(
2868 2869 2870 2871 2872 2873
	fseg_header_t*	header,
	mtr_t*		mtr
#ifdef BTR_CUR_HASH_ADAPT
	,bool		ahi
#endif /* BTR_CUR_HASH_ADAPT */
	)
osku's avatar
osku committed
2874 2875
{
	fseg_inode_t*	inode;
2876

2877
	const uint32_t space_id = page_get_space_id(page_align(header));
2878
	ut_ad(mtr->is_named_space(space_id));
2879

2880
	fil_space_t*		space = mtr->x_lock_space(space_id);
2881
	buf_block_t*		iblock;
osku's avatar
osku committed
2882

2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895
	inode = fseg_inode_try_get(header, space_id, space->zip_size(),
				   mtr, &iblock);
	if (space->is_stopping()) {
		return true;
	}

	if (!inode) {
		ib::warn() << "Double free of "
			   << page_id_t(space_id,
					page_get_page_no(page_align(header)));
		return true;
	}

2896 2897 2898
	if (!space->full_crc32()) {
		fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
	}
osku's avatar
osku committed
2899

2900 2901
	dberr_t err;
	if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) {
osku's avatar
osku committed
2902
		/* Free the extent held by the segment */
2903 2904 2905
		return fseg_free_extent(inode, iblock, space,
					xdes_get_offset(descr),
					mtr
2906
#ifdef BTR_CUR_HASH_ADAPT
2907
					, ahi
2908
#endif /* BTR_CUR_HASH_ADAPT */
2909 2910 2911
					) != DB_SUCCESS;
	} else if (err != DB_SUCCESS) {
		return true;
osku's avatar
osku committed
2912 2913 2914 2915
	}

	/* Free a frag page */

2916
	ulint n = fseg_find_last_used_frag_page_slot(inode);
osku's avatar
osku committed
2917

2918 2919 2920
	if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
		return true;
	}
osku's avatar
osku committed
2921

2922
	uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
2923

2924
	if (page_no == page_get_page_no(page_align(header))) {
2925
		return true;
osku's avatar
osku committed
2926
	}
2927

2928
	if (fseg_free_page_low(inode, iblock, space, page_no, mtr
2929
#ifdef BTR_CUR_HASH_ADAPT
2930
			       , ahi
2931
#endif /* BTR_CUR_HASH_ADAPT */
2932 2933 2934
			       ) != DB_SUCCESS) {
		return true;
	}
Marko Mäkelä's avatar
Marko Mäkelä committed
2935
	buf_page_free(space, page_no, mtr);
2936
	return false;
osku's avatar
osku committed
2937 2938
}

2939 2940 2941 2942
/** Returns the first extent descriptor for a segment.
We think of the extent lists of the segment catenated in the order
FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@param[in]	inode		segment inode
2943
@param[in]	space		tablespace
2944
@param[in,out]	mtr		mini-transaction
2945 2946
@return the first extent descriptor
@retval nullptr if none, or on corruption */
2947
MY_ATTRIBUTE((nonnull, warn_unused_result))
osku's avatar
osku committed
2948 2949 2950
static
xdes_t*
fseg_get_first_extent(
2951
	fseg_inode_t*		inode,
2952
	const fil_space_t*	space,
2953 2954
	mtr_t*			mtr,
	dberr_t*		err)
osku's avatar
osku committed
2955
{
2956 2957 2958 2959 2960 2961 2962
  if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) ||
                    memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4)))
  {
  corrupted:
    *err= DB_CORRUPTION;
    return nullptr;
  }
osku's avatar
osku committed
2963

2964
  fil_addr_t first;
osku's avatar
osku committed
2965

2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976
  if (flst_get_len(inode + FSEG_FULL))
    first= flst_get_first(inode + FSEG_FULL);
  else if (flst_get_len(inode + FSEG_NOT_FULL))
    first= flst_get_first(inode + FSEG_NOT_FULL);
  else if (flst_get_len(inode + FSEG_FREE))
    first= flst_get_first(inode + FSEG_FREE);
  else
  {
    *err= DB_SUCCESS;
    return nullptr;
  }
osku's avatar
osku committed
2977

2978 2979
  if (first.page == FIL_NULL)
    goto corrupted;
2980

2981
  return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err);
osku's avatar
osku committed
2982 2983
}

2984
#ifdef UNIV_BTR_PRINT
2985
/*******************************************************************//**
osku's avatar
osku committed
2986
Writes info of a segment. */
2987
static void fseg_print_low(const fseg_inode_t *inode)
osku's avatar
osku committed
2988 2989 2990 2991 2992 2993 2994 2995 2996 2997
{
	ulint	space;
	ulint	n_used;
	ulint	n_frag;
	ulint	n_free;
	ulint	n_not_full;
	ulint	n_full;
	ulint	reserved;
	ulint	used;
	ulint	page_no;
2998
	ib_id_t	seg_id;
2999

3000 3001
	space = page_get_space_id(page_align(inode));
	page_no = page_get_page_no(page_align(inode));
osku's avatar
osku committed
3002

3003
	reserved = fseg_n_reserved_pages_low(inode, &used);
osku's avatar
osku committed
3004

3005
	seg_id = mach_read_from_8(inode + FSEG_ID);
3006
	n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED);
3007
	n_frag = fseg_get_n_frag_pages(inode);
3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020
	n_free = flst_get_len(inode + FSEG_FREE);
	n_not_full = flst_get_len(inode + FSEG_NOT_FULL);
	n_full = flst_get_len(inode + FSEG_FULL);

	ib::info() << "SEGMENT id " << seg_id
		<< " space " << space << ";"
		<< " page " << page_no << ";"
		<< " res " << reserved << " used " << used << ";"
		<< " full ext " << n_full << ";"
		<< " fragm pages " << n_frag << ";"
		<< " free extents " << n_free << ";"
		<< " not full extents " << n_not_full << ": pages " << n_used;

3021
	ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
osku's avatar
osku committed
3022 3023
}

3024
/*******************************************************************//**
osku's avatar
osku committed
3025 3026 3027 3028
Writes info of a segment. */
void
fseg_print(
/*=======*/
3029
	fseg_header_t*	header, /*!< in: segment header */
3030
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
osku's avatar
osku committed
3031
{
3032 3033 3034 3035 3036 3037
  const fil_space_t *space=
    mtr->x_lock_space(page_get_space_id(page_align(header)));
  buf_block_t *block;
  if (fseg_inode_t *inode=
      fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block))
    fseg_print_low(inode);
osku's avatar
osku committed
3038
}
3039
#endif /* UNIV_BTR_PRINT */
osku's avatar
osku committed
3040

3041
#ifdef UNIV_DEBUG
3042
std::ostream &fseg_header::to_stream(std::ostream &out) const
osku's avatar
osku committed
3043
{
3044 3045 3046 3047 3048
  out << "[fseg_header_t: space="
      << mach_read_from_4(m_header + FSEG_HDR_SPACE)
      << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO)
      << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]";
  return out;
osku's avatar
osku committed
3049
}
3050
#endif /* UNIV_DEBUG */
3051 3052 3053

/** Get the extent descriptor page: reuse the block if the
mini-transaction already holds it with MTR_MEMO_PAGE_SX_FIX,
otherwise acquire it with RW_SX_LATCH.
@param page_id  page identifier to be acquired
@param mtr      mini-transaction
@param err      error code; passed on to buf_page_get_gen(), and left
                untouched when the block was already latched
@return block descriptor
@retval nullptr if the page could not be acquired */
static
buf_block_t *fsp_get_latched_xdes_page(
  page_id_t page_id, mtr_t *mtr, dberr_t *err)
{
  if (buf_block_t *latched=
      mtr->get_already_latched(page_id, MTR_MEMO_PAGE_SX_FIX))
    return latched;

  return buf_page_get_gen(page_id, 0, RW_SX_LATCH, nullptr,
                          BUF_GET_POSSIBLY_FREED, mtr, err);
}

/** Used during system tablespace truncation. Stores
the "to be modified" extent descriptor page and its
old page state */
class fsp_xdes_old_page
{
  std::vector<buf_block_t*> m_old_xdes_pages;
3078
  const uint32_t m_space;
3079
public:
3080
  fsp_xdes_old_page(uint32_t space):m_space(space) {}
3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096
  ulint n_pages()
  {
    uint32_t count=0;
    for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
      if (m_old_xdes_pages[i]) count++;
    return count;
  }

  __attribute__((warn_unused_result))
  dberr_t insert(uint32_t page_no, mtr_t *mtr)
  {
    uint32_t m_index= page_no >> srv_page_size_shift;
    if (m_old_xdes_pages.size() > m_index &&
        m_old_xdes_pages[m_index] != nullptr)
      return DB_SUCCESS;

3097
    DBUG_EXECUTE_IF("shrink_buffer_pool_full",
3098 3099
                    return DB_OUT_OF_MEMORY;);
    dberr_t err= DB_SUCCESS;
3100 3101
    buf_block_t *block= fsp_get_latched_xdes_page(
                          page_id_t(m_space, page_no), mtr, &err);
3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130
    if (block)
    {
      buf_block_t *old= buf_LRU_get_free_block(have_no_mutex_soft);
      if (!old) return DB_OUT_OF_MEMORY;

      memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
        old->page.frame, block->page.frame, srv_page_size);

      if (m_index >= m_old_xdes_pages.size())
        m_old_xdes_pages.resize(m_index + 1);
      m_old_xdes_pages[m_index] = old;
    }
    return err;
  }

  buf_block_t *search(uint32_t page_no)
  {
    uint32_t m_index= page_no >> srv_page_size_shift;
    if (m_index > m_old_xdes_pages.size())
      return nullptr;
    return m_old_xdes_pages[m_index];
  }

  void restore(mtr_t *mtr)
  {
    for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
    {
      if (m_old_xdes_pages[i] == nullptr) continue;
      buf_block_t *block= mtr->get_already_latched(
3131
        page_id_t{m_space, i << srv_page_size_shift},
3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163
        MTR_MEMO_PAGE_SX_FIX);
      ut_ad(block);
      memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
        block->page.frame, m_old_xdes_pages[i]->page.frame, srv_page_size);
    }
  }

  ~fsp_xdes_old_page()
  {
    for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
      if (m_old_xdes_pages[i])
        buf_block_free(m_old_xdes_pages[i]);
  }
};

/** Link the current descriptor entry directly after the last valid
descriptor entry, skipping the truncated entries in between, and
reduce the list length by the number of skipped entries.
@param header          block that contains the list base node
@param hdr_offset      byte offset of the list base node in the page
                       frame of header (for FSP_FREE or FSP_FREE_FRAG)
@param cur_addr        current descriptor
@param last_valid_addr last valid descriptor; page is FIL_NULL if the
                       current descriptor becomes the first list node
@param skip_len        number of truncated extent descriptor entry
@param mtr             mini-transaction
@return error code or DB_SUCCESS */
__attribute__((warn_unused_result))
static
dberr_t fsp_lst_update_skip(
  buf_block_t *header, uint16_t hdr_offset,
  fil_addr_t cur_addr, fil_addr_t last_valid_addr,
  uint32_t skip_len, mtr_t *mtr)
{
  dberr_t err= DB_SUCCESS;
  const uint32_t space_id= header->page.id().space();
  buf_block_t *cur= fsp_get_latched_xdes_page(
    page_id_t(space_id, cur_addr.page), mtr, &err);

  if (!cur)
    return err;

  if (last_valid_addr.page == FIL_NULL)
  {
    /* First node, so update the FIRST pointer of base
    with current extent descriptor */
    flst_write_addr(
      *header,
      header->page.frame + hdr_offset + FLST_FIRST,
      cur_addr.page, cur_addr.boffset, mtr);
  }
  else
  {
    /* Both descriptors may reside on the same page */
    buf_block_t *prev= cur;
    if (cur->page.id().page_no() != last_valid_addr.page)
    {
      prev= fsp_get_latched_xdes_page(
              page_id_t(space_id, last_valid_addr.page),
              mtr, &err);
      if (!prev)
        return err;
    }

    /* Update the NEXT pointer of last valid extent
    descriptor entry with current extent descriptor */
    flst_write_addr(
      *prev,
      prev->page.frame + last_valid_addr.boffset + FLST_NEXT,
      cur_addr.page, cur_addr.boffset, mtr);
  }

  /* Update the PREV pointer of current extent descriptor entry
  with last valid extent descriptor (FIL_NULL for the first node) */
  flst_write_addr(
    *cur,
    cur->page.frame + cur_addr.boffset + FLST_PREV,
    last_valid_addr.page, last_valid_addr.boffset, mtr);

  byte *len_bytes= &header->page.frame[hdr_offset + FLST_LEN];
  const uint32_t len= mach_read_from_4(len_bytes);
  ut_ad(len > skip_len);
  mtr->write<4>(*header, len_bytes, len - skip_len);
  return DB_SUCCESS;
}

/** Terminate a shrunk free list at its last surviving node: point
FLST_LAST of the base node at that node, write FIL_NULL into the node's
FLST_NEXT, and reduce FLST_LEN by the number of removed trailing entries.
For the FSP_FREE_FRAG list, FSP_FRAG_N_USED is adjusted as well.
@param header          tablespace header page holding the list base node
@param hdr_offset      FSP_HEADER_OFFSET + FSP_FREE or FSP_FREE_FRAG
@param cur_addr        last valid extent descriptor of the list, or
                       a FIL_NULL page if no descriptor remains
@param skip_len        number of truncated extent descriptor entries
                       at the end of the list
@param orig_len        original length of the list
@param mtr             mini-transaction
@return error code or DB_SUCCESS */
__attribute__((warn_unused_result))
dberr_t
fsp_lst_write_end(
  buf_block_t *header, uint16_t hdr_offset,
  fil_addr_t cur_addr, uint32_t skip_len, uint32_t orig_len,
  mtr_t *mtr)
{
  byte *const len_field= &header->page.frame[hdr_offset + FLST_LEN];
  uint32_t list_len= mach_read_from_4(len_field);

  if (skip_len != 0)
  {
    if (cur_addr.page != FIL_NULL)
    {
      /* Make the last valid extent descriptor the new tail:
      first in the base node ... */
      flst_write_addr(
        *header,
        header->page.frame + hdr_offset + FLST_LAST,
        cur_addr.page, cur_addr.boffset, mtr);

      dberr_t err= DB_SUCCESS;
      buf_block_t *tail_block= fsp_get_latched_xdes_page(
        page_id_t(header->page.id().space(), cur_addr.page),
        mtr, &err);
      if (!tail_block)
        return err;

      /* ... and then in the descriptor itself, which no longer
      has a successor */
      flst_write_addr(
        *tail_block,
        tail_block->page.frame + cur_addr.boffset + FLST_NEXT,
        FIL_NULL, 0, mtr);
    }
    else
    {
      /* Nothing is left in the list, so reset the page numbers of
      FLST_FIRST and FLST_LAST in the base node to FIL_NULL */
      mtr->memset(
        header,
        FLST_FIRST + FIL_ADDR_PAGE + hdr_offset, 4, 0xff);
      mtr->memset(
        header,
        FLST_LAST + FIL_ADDR_PAGE + hdr_offset, 4, 0xff);
    }

    ut_ad(list_len >= skip_len);
    list_len-= skip_len;
    mtr->write<4>(*header, len_field, list_len);
  }

  if (hdr_offset == FSP_FREE_FRAG + FSP_HEADER_OFFSET &&
      list_len != orig_len)
  {
    /* Update the FSP_FRAG_N_USED value after removing the truncated
    pages from the FSP_FREE_FRAG list. Each removed extent is counted
    as 2 used pages (presumably the extent descriptor page and its
    companion bitmap page; confirm against the FREE_FRAG invariants). */
    byte *n_used_field= &header->page.frame[
      FSP_HEADER_OFFSET + FSP_FRAG_N_USED];
    const uint32_t n_used= mach_read_from_4(n_used_field);
    mtr->write<4>(*header, n_used_field,
                  n_used - ((orig_len - list_len) * 2));
  }
  return DB_SUCCESS;
}

/** Remove the truncated extents from a free list of the tablespace.
The list is walked from its first node. A node is dropped when its
descriptor page number, or the extent it describes, is at or beyond
the threshold. Runs of dropped nodes are bridged with
fsp_lst_update_skip(), and the tail is fixed by fsp_lst_write_end().
@param header     tablespace header (page 0)
@param hdr_offset FSP_HEADER_OFFSET + FSP_FREE or FSP_FREE_FRAG
@param threshold  Remove the extents from the list which are at or
                  beyond this page number
@param mtr        mini-transaction to remove the extents
@return DB_SUCCESS on success or error code */
__attribute__((warn_unused_result))
static
dberr_t fsp_shrink_list(buf_block_t *header, uint16_t hdr_offset,
                        uint32_t threshold, mtr_t *mtr)
{
  ut_ad(mach_read_from_4(header->page.frame + FIL_PAGE_OFFSET) == 0);
  const uint32_t len= flst_get_len(hdr_offset + header->page.frame);
  if (!len)
    return DB_SUCCESS;

  dberr_t err= DB_SUCCESS;
  buf_block_t *xdes_block= nullptr;
  /* Number of consecutive to-be-removed nodes seen since last_kept */
  uint32_t n_skipped= 0;
  fil_addr_t last_kept{FIL_NULL, 0};
  fil_addr_t node= flst_get_first(header->page.frame + hdr_offset);

  for (uint32_t remaining= len; remaining; remaining--)
  {
    ut_d(fil_space_t *space= header->page.id().space() == 0
                             ? fil_system.sys_space
                             : fil_system.temp_space);
    ut_ad(node.page < space->size);
    ut_ad(!(node.page & (srv_page_size - 1)));

    /* Latch the descriptor page only when it differs from the one
    used for the previous node */
    if (!xdes_block || xdes_block->page.id().page_no() != node.page)
    {
      xdes_block= fsp_get_latched_xdes_page(
        page_id_t(header->page.id().space(), node.page), mtr, &err);
      if (!xdes_block)
        return err;
    }

    const bool page_truncated= node.page >= threshold;

    if (page_truncated)
      n_skipped++;
    else
    {
      /* The descriptor page survives. Before deciding about this node,
      link it to the last kept node across any skipped ones. */
      if (n_skipped)
      {
        err= fsp_lst_update_skip(
          header, hdr_offset, node, last_kept, n_skipped, mtr);
        if (err) return err;
        n_skipped= 0;
      }

      /* The descriptor may still describe an extent that is
      going to be truncated */
      if (xdes_get_offset(xdes_block->page.frame + node.boffset
                          - XDES_FLST_NODE) >= threshold)
        n_skipped++;
      else
        last_kept= node;
    }

    const fil_addr_t next= flst_get_next_addr(
      xdes_block->page.frame + node.boffset);

    if (page_truncated && next.page != node.page)
    {
      /* Done with a descriptor page beyond the threshold: release
      the most recently latched page (presumably that descriptor
      page) from the mini-transaction */
      mtr->release_last_page();
      xdes_block= nullptr;
    }

    if (next.page == FIL_NULL)
    {
      err= fsp_lst_write_end(header, hdr_offset, last_kept,
                             n_skipped, len, mtr);
      break;
    }
    node= next;
  }
  ut_d(if (err == DB_SUCCESS) flst_validate(header, hdr_offset, mtr););
  return err;
}

/** Reset the XDES_BITMAP for the truncated extents
@param  space      tablespace to be truncated
@param  threshold  truncated size
@param  mtr        mini-transaction to reset XDES_BITMAP
@return DB_SUCCESS or error code on failure */
__attribute__((warn_unused_result))
static
3376
dberr_t fsp_xdes_reset(uint32_t space_id, uint32_t threshold, mtr_t *mtr)
3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389
{
  if (!(threshold & (srv_page_size - 1)))
    return DB_SUCCESS;

  uint32_t cur_descr_page= xdes_calc_descriptor_page(0, threshold);
  ulint descr_offset= XDES_ARR_OFFSET + XDES_SIZE
          * xdes_calc_descriptor_index(0, threshold);
  ulint last_descr_offset= XDES_ARR_OFFSET + XDES_SIZE
          * xdes_calc_descriptor_index(
               0, (cur_descr_page + srv_page_size - 1));
  last_descr_offset+= XDES_SIZE;
  dberr_t err= DB_SUCCESS;
  buf_block_t *block= fsp_get_latched_xdes_page(
3390
    page_id_t(space_id, cur_descr_page), mtr, &err);
3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440
  if (!block)
    return err;
  mtr->memset(
    block, descr_offset, (last_descr_offset - descr_offset), 0);
  return err;
}

/** Traverse the extent descriptors of a tablespace downwards, starting
from the extent that contains page (free_limit - 1). Depending on
old_xdes_entry, this does one of 2 things:
1) Find the last used extent (old_xdes_entry == nullptr): the walk
stops at the first extent that is neither XDES_FREE nor an
XDES_FREE_FRAG extent that starts a descriptor page and has exactly
2 used pages.
2) Store the old page frame of the "to be modified" extent
descriptor pages (old_xdes_entry != nullptr).
@param space             tablespace to traverse
@param last_used_extent  in: lowest page number to visit (0 when
                         finding the last used extent, else the
                         truncation threshold);
                         out (mode 1 only): first page number of the
                         lowest extent of the trailing run of free
                         extents, or free_limit if there is none
@param mtr               mini-transaction
@param old_xdes_entry    nullptr or object to store the
                         old page content of "to be modified"
                         extent descriptor pages
@return DB_SUCCESS or error code */
__attribute__((warn_unused_result))
dberr_t fsp_traverse_extents(
  fil_space_t *space, uint32_t *last_used_extent, mtr_t *mtr,
  fsp_xdes_old_page *old_xdes_entry= nullptr)
{
  dberr_t err= DB_SUCCESS;
  bool find_last_used_extent= (old_xdes_entry == nullptr);
  uint32_t threshold= *last_used_extent;
  uint32_t last_descr_page_no= xdes_calc_descriptor_page(
    0, space->free_limit - 1);

  if (find_last_used_extent)
    *last_used_extent= space->free_limit;
  else
  {
    /* Page 0 and the descriptor page that covers the threshold
    are saved up front */
    err= old_xdes_entry->insert(0, mtr);
    if (err) return err;
    if (threshold & (srv_page_size - 1))
    {
      err= old_xdes_entry->insert(
        xdes_calc_descriptor_page(0, threshold), mtr);
      /* This failure must not be lost: err is reused by the page
      lookups below and could be overwritten before the caller
      gets to see it. */
      if (err) return err;
    }
  }

  buf_block_t *block= nullptr;
  std::vector<uint32_t> modified_xdes;

  for (uint32_t cur_extent=
       ((space->free_limit - 1)/ FSP_EXTENT_SIZE) * FSP_EXTENT_SIZE;
       cur_extent >= threshold;)
  {
    if (!block)
    {
      block= fsp_get_latched_xdes_page(
               page_id_t(space->id, last_descr_page_no),
               mtr, &err);
      if (!block) return err;
    }

    xdes_t *descr= XDES_ARR_OFFSET + XDES_SIZE
      * xdes_calc_descriptor_index(0, cur_extent)
      + block->page.frame;

    if (find_last_used_extent)
    {
      ulint state= xdes_get_state(descr);
      if (state == XDES_FREE)
        *last_used_extent= cur_extent;
      else if (state == XDES_FREE_FRAG &&
               !(cur_extent & (srv_page_size - 1)) &&
               xdes_get_n_used(descr) == 2)
        /* Extent Descriptor Page */
        *last_used_extent= cur_extent;
      else return DB_SUCCESS;
    }
    else
    {
      fil_addr_t prev_addr= flst_get_prev_addr(
                              descr + XDES_FLST_NODE);
      ut_ad(prev_addr.page < space->size ||
            prev_addr.page == FIL_NULL);
      ut_ad(prev_addr.page == FIL_NULL ||
            !(prev_addr.page & (srv_page_size - 1)));

      fil_addr_t next_addr= flst_get_next_addr(
                              descr + XDES_FLST_NODE);
      ut_ad(next_addr.page < space->size ||
            next_addr.page == FIL_NULL);
      ut_ad(next_addr.page == FIL_NULL ||
            !(next_addr.page & (srv_page_size - 1)));

      /* List neighbours on descriptor pages below the threshold are
      remembered here; their pages are saved after the walk */
      if (prev_addr.page < threshold)
        modified_xdes.push_back(prev_addr.page);

      if (next_addr.page < threshold)
        modified_xdes.push_back(next_addr.page);
    }

    cur_extent-= FSP_EXTENT_SIZE;
    uint32_t cur_descr_page= xdes_calc_descriptor_page(0, cur_extent);
    if (last_descr_page_no != cur_descr_page)
    {
      /* Moving on to another descriptor page; a page at or beyond
      the threshold is released from the mini-transaction */
      if (last_descr_page_no >= threshold)
        mtr->release_last_page();
      last_descr_page_no= cur_descr_page;
      block= nullptr;
    }
  }

  if (!find_last_used_extent)
  {
    for (auto it : modified_xdes)
    {
      err= old_xdes_entry->insert(it, mtr);
      if (err) return err;
    }
    modified_xdes.clear();
  }
  return err;
}

#ifdef UNIV_DEBUG
/** Validate the file-based lists anchored in the tablespace header,
using a mini-transaction of its own.
@param space  tablespace whose header lists are validated
@return DB_SUCCESS, or the error reported by fsp_get_header() */
__attribute__((warn_unused_result))
dberr_t fsp_tablespace_validate(fil_space_t *space)
{
  /* All the lists to check, as offsets within the header page */
  static const uint16_t list_offsets[]=
  {
    uint16_t(FSP_FREE + FSP_HEADER_OFFSET),
    uint16_t(FSP_FREE_FRAG + FSP_HEADER_OFFSET),
    uint16_t(FSP_HEADER_OFFSET + FSP_FULL_FRAG),
    uint16_t(FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL),
    uint16_t(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE)
  };

  dberr_t err= DB_SUCCESS;
  mtr_t local_mtr;
  local_mtr.start();

  buf_block_t *header= fsp_get_header(space, &local_mtr, &err);
  if (header)
    for (const uint16_t list_offset : list_offsets)
      flst_validate(header, list_offset, &local_mtr);

  local_mtr.commit();
  return err;
}
#endif /* UNIV_DEBUG */

void fsp_system_tablespace_truncate()
{
  uint32_t last_used_extent= 0;
  fil_space_t *space= fil_system.sys_space;
  mtr_t mtr;
  mtr.start();
  mtr.x_lock_space(space);
  dberr_t err= fsp_traverse_extents(space, &last_used_extent, &mtr);
  if (err != DB_SUCCESS)
  {
func_exit:
    sql_print_warning("InnoDB: Cannot shrink the system tablespace "
                      "due to %s", ut_strerr(err));
    mtr.commit();
    return;
  }
  uint32_t fixed_size= srv_sys_space.get_min_size(),
           header_size= space->size_in_header;
  mtr.commit();

  if (last_used_extent >= header_size || fixed_size >= header_size)
    /* Tablespace is being used within fixed size */
    return;

  /* Set fixed size as threshold to truncate */
  if (fixed_size > last_used_extent)
    last_used_extent= fixed_size;

  my_bool old_dblwr_buf= srv_use_doublewrite_buf;
  /* Flush all pages in buffer pool, so that it doesn't have to
  use doublewrite buffer and disable dblwr and there should
  be enough space in redo log */
  log_make_checkpoint();
  srv_use_doublewrite_buf= false;

  buf_block_t *header= nullptr;
3572
  ut_ad(!fsp_tablespace_validate(space));
3573 3574 3575 3576 3577 3578 3579

  mtr.start();
  mtr.x_lock_space(space);

  {
    /* Take the rough estimation of modified extent
    descriptor page and store their old state */
3580
    fsp_xdes_old_page old_xdes_list(space->id);
3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597
    err= fsp_traverse_extents(space, &last_used_extent, &mtr, &old_xdes_list);

    if (err == DB_OUT_OF_MEMORY)
    {
      mtr.commit();
      sql_print_warning("InnoDB: Cannot shrink the system "
                        "tablespace from " UINT32PF" to "
                        UINT32PF " pages due to insufficient "
                        "innodb_buffer_pool_size", space->size,
                        last_used_extent);
      return;
    }

    sql_print_information("InnoDB: Truncating system tablespace from "
                          UINT32PF " to " UINT32PF " pages", space->size,
                          last_used_extent);

3598 3599
    header= fsp_get_latched_xdes_page(
              page_id_t(space->id, 0), &mtr, &err);
3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620
    if (!header)
      goto func_exit;

    mtr.write<4, mtr_t::FORCED>(
      *header, FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame,
      last_used_extent);

    if (space->free_limit > last_used_extent)
      mtr.write<4,mtr_t::MAYBE_NOP>(*header, FSP_HEADER_OFFSET
                                    + FSP_FREE_LIMIT + header->page.frame,
                                    last_used_extent);
    err= fsp_shrink_list(
      header, FSP_HEADER_OFFSET + FSP_FREE, last_used_extent, &mtr);
    if (err != DB_SUCCESS)
      goto func_exit;

    err= fsp_shrink_list(
      header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, last_used_extent, &mtr);
    if (err != DB_SUCCESS)
      goto func_exit;

3621
    err= fsp_xdes_reset(space->id, last_used_extent, &mtr);
3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639
    if (err != DB_SUCCESS)
      goto func_exit;

    mtr.trim_pages(page_id_t(0, last_used_extent));
    size_t shrink_redo_size= mtr.get_log_size();

    DBUG_EXECUTE_IF("mtr_log_max_size", goto mtr_max;);
    if (shrink_redo_size >
          (2 << 20) - 8 /* encryption nonce */ - 5 /* EOF, checksum */)
    {
#ifndef DBUG_OFF
mtr_max:
#endif
      /* Replace the modified copy from buffer pool with
      original copy of the pages. */
      old_xdes_list.restore(&mtr);
      mtr.discard_modifications();
      mtr.commit();
3640
      ut_ad(!fsp_tablespace_validate(space));
3641 3642 3643 3644 3645 3646 3647 3648 3649
      sql_print_error(
        "InnoDB: Cannot shrink the system tablespace "
        "because the mini-transaction log size (%zu bytes) "
        "exceeds 2 MiB", shrink_redo_size + 8 + 5);
      return;
    }
  }

  if (space->free_limit > last_used_extent)
Marko Mäkelä's avatar
Marko Mäkelä committed
3650 3651 3652 3653 3654
    space->free_limit= last_used_extent;
  space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE +
                                header->page.frame);

  mtr.commit_shrink(*space, last_used_extent);
3655 3656 3657
  sql_print_information("InnoDB: System tablespace truncated successfully");
  srv_use_doublewrite_buf= old_dblwr_buf;
}
3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773

/** Forget the freed page ranges that lie at or beyond the truncation
point of the temporary tablespace. A range that straddles the point
is clipped so that it ends at page (threshold - 1).
@param threshold  first page number that is being truncated away */
inline void fil_space_t::clear_freed_ranges(uint32_t threshold)
{
  ut_ad(id == SRV_TMP_SPACE_ID);
  std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
  range_set current_ranges;
  for (const auto &range : freed_ranges)
  {
    if (range.first >= threshold)
      /* The whole range is beyond the new end */
      continue;
    else if (range.last >= threshold)
    {
      /* range.last is inclusive (the clipped range ends at
      threshold - 1), so a range that ends exactly on the page
      number threshold must be clipped as well */
      range_t new_range{range.first, threshold - 1};
      current_ranges.add_range(new_range);
      continue;
    }
    current_ranges.add_range(range);
  }
  freed_ranges= std::move(current_ranges);
}

void fsp_shrink_temp_space()
{
  uint32_t last_used_extent= 0;
  fil_space_t *space= fil_system.temp_space;
  mtr_t mtr;
  mtr.start();
  mtr.set_log_mode(MTR_LOG_NO_REDO);
  mtr.x_lock_space(space);
  dberr_t err= fsp_traverse_extents(space, &last_used_extent, &mtr);
  if (err != DB_SUCCESS)
  {
func_exit:
    sql_print_warning("InnoDB: Cannot shrink the temporary tablespace "
                      "due to %s", ut_strerr(err));
    mtr.commit();
    return;
  }
  uint32_t fixed_size= srv_tmp_space.get_min_size(),
           header_size= space->size_in_header;

  if (last_used_extent >= header_size || fixed_size >= header_size)
  {
    /* Tablespace is being used within fixed size */
    mtr.commit();
    return;
  }

  /* Set fixed size as threshold to truncate */
  if (fixed_size > last_used_extent)
    last_used_extent= fixed_size;

  sql_print_information("InnoDB: Truncating temporary tablespace from "
                        UINT32PF " to " UINT32PF " pages", space->size,
                        last_used_extent);

  buf_block_t *header= fsp_get_latched_xdes_page(
      page_id_t(space->id, 0), &mtr, &err);
  if (!header)
    goto func_exit;

  mach_write_to_4(
    FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame,
    last_used_extent);

  if (space->free_limit > last_used_extent)
    mach_write_to_4(
      FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame,
      last_used_extent);

  mtr.set_modified(*header);

  err= fsp_shrink_list(header, FSP_HEADER_OFFSET + FSP_FREE,
                       last_used_extent, &mtr);

  if (err != DB_SUCCESS)
    goto func_exit;

  err= fsp_shrink_list(
         header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
         last_used_extent, &mtr);
  DBUG_EXECUTE_IF("fail_temp_truncate", err= DB_ERROR;);
  if (err != DB_SUCCESS)
    goto func_exit;

  err= fsp_xdes_reset(space->id, last_used_extent, &mtr);
  if (err != DB_SUCCESS)
    goto func_exit;

  space->clear_freed_ranges(last_used_extent);
  buf_LRU_truncate_temp(last_used_extent);
  mysql_mutex_lock(&fil_system.mutex);

  space->size= last_used_extent;
  if (space->free_limit > last_used_extent)
    space->free_limit= space->size;

  space->free_len= flst_get_len(
    FSP_HEADER_OFFSET + FSP_FREE+ header->page.frame);

  /* Last file new size after truncation */
  uint32_t new_last_file_size=
    last_used_extent -
    (fixed_size - srv_tmp_space.m_files.at(
     srv_tmp_space.m_files.size() - 1).param_size());

  space->size_in_header= space->size;
  space->chain.end->size= new_last_file_size;
  srv_tmp_space.set_last_file_size(new_last_file_size);
  mysql_mutex_unlock(&fil_system.mutex);
  os_file_truncate(
    space->chain.end->name, space->chain.end->handle,
    os_offset_t{space->chain.end->size} << srv_page_size_shift, true);
  mtr.commit();
  sql_print_information("InnoDB: Temporary tablespace truncated successfully");
}
3774 3775


3776
static uint32_t binlog_size_in_pages;
3777 3778
buf_block_t *binlog_cur_block;
uint32_t binlog_cur_page_no;
3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807
uint32_t binlog_cur_page_offset;
/*
  Mutex protecting active_binlog_file_no and active_binlog_space.
*/
mysql_mutex_t active_binlog_mutex;
pthread_cond_t active_binlog_cond;
static std::thread binlog_prealloc_thr_obj;
static bool prealloc_thread_end= false;
/* The currently being written binlog tablespace. */
std::atomic<uint64_t> active_binlog_file_no;
fil_space_t* active_binlog_space;
/*
  The first binlog tablespace that is still open.
  This can be equal to active_binlog_file_no, if the tablespace prior to the
  active one has been fully flushed out to disk and closed.
  Or it can be one less, if the prior tablespace is still being written out and
  closed.
*/
std::atomic<uint64_t> first_open_binlog_file_no;
/*
  The most recent created and open tablespace.
  This can be equal to active_binlog_file_no+1, if the next tablespace to be
  used has already been pre-allocated and opened.
  Or it can be the same as active_binlog_file_no, if the pre-allocation of the
  next tablespace is still pending.
*/
uint64_t last_created_binlog_file_no;
fil_space_t *last_created_binlog_space;

3808 3809 3810 3811
/*
  Point at which it is guaranteed that all data has been written out to the
  binlog file (on the OS level; not necessarily fsync()'ed yet).

3812
  Stores the most recent two values, each corresponding to active_binlog_file_no&1.
3813 3814 3815 3816 3817
*/
/* ToDo: maintain this offset value as up to where data has been written out to the OS. Needs to be binary-searched in current binlog file at server restart; which is also a reason why it might not be a multiple of the page size. */
std::atomic<uint64_t> binlog_cur_written_offset[2];
/* Offset of last valid byte of data in most recent 2 binlog files. */
std::atomic<uint64_t> binlog_cur_end_offset[2];
3818

3819
static void fsp_binlog_prealloc_thread();
3820
static int fsp_binlog_discover();
3821

3822

3823 3824 3825 3826
#define BINLOG_NAME_BASE "binlog-"
#define BINLOG_NAME_EXT ".ibb"
/*
  '.' + '/' + "binlog-" + (<=20 digits) + '.' + "ibb" + '\0'.
  Parenthesized so that the macro is a single operand wherever it is used.
*/
#define BINLOG_NAME_LEN (1 + 1 + 7 + 20 + 1 + 3 + 1)

/*
  Write the path of the binlog file with number file_no into name_buf, as
  "./" BINLOG_NAME_BASE, the number in decimal zero-padded to at least 6
  digits, and BINLOG_NAME_EXT.
  name_buf must have room for BINLOG_NAME_LEN bytes; the write is bounded by
  that size.
*/
static inline void
binlog_name_make(char name_buf[BINLOG_NAME_LEN], uint64_t file_no)
{
  snprintf(name_buf, BINLOG_NAME_LEN,
           "./" BINLOG_NAME_BASE "%06" PRIu64 BINLOG_NAME_EXT, file_no);
}


/*
  Check if this is an InnoDB binlog file name, that is BINLOG_NAME_BASE
  followed by a decimal number and then BINLOG_NAME_EXT.
  On a match, store the number (the index/file_no) in *out_idx and return
  true. Otherwise return false and leave *out_idx alone.
*/
static bool
is_binlog_name(const char *name, uint64_t *out_idx)
{
  /* Lengths without the '\0' terminator */
  const size_t prefix_len= sizeof(BINLOG_NAME_BASE) - 1;
  const size_t suffix_len= sizeof(BINLOG_NAME_EXT) - 1;

  if (strncmp(name, BINLOG_NAME_BASE, prefix_len) != 0)
    return false;

  /* At least one digit is needed between the prefix and the extension */
  const size_t total_len= strlen(name);
  if (total_len < prefix_len + 1 + suffix_len)
    return false;

  const char *const digits= name + prefix_len;
  const char *const suffix= name + (total_len - suffix_len);
  if (strcmp(suffix, BINLOG_NAME_EXT) != 0)
    return false;

  /* strtoull() would accept leading white space or a sign; require a digit */
  if (!std::isdigit((unsigned char)(*digits)))
    return false;

  char *parse_end= nullptr;
  const unsigned long long file_no= std::strtoull(digits, &parse_end, 10);
  /* The number must run right up to the extension and must not be the value
  that strtoull() returns on overflow */
  if (parse_end != suffix || file_no == ULLONG_MAX)
    return false;

  *out_idx= (uint64_t)file_no;
  return true;
}


/** Write out all pages, flush, and close/detach a binlog tablespace.
The tablespace is looked up by the id SRV_SPACE_ID_BINLOG0 + (file_no & 1).
@param[in] file_no	 Index of the binlog tablespace
@return DB_SUCCESS, or DB_ERROR if the tablespace is not found */
static dberr_t
fsp_binlog_tablespace_close(uint64_t file_no)
{
  const uint32_t space_id= SRV_SPACE_ID_BINLOG0 + (file_no & 1);

  mysql_mutex_lock(&fil_system.mutex);
  fil_space_t *const space= fil_space_get_by_id(space_id);
  mysql_mutex_unlock(&fil_system.mutex);
  if (space == nullptr)
    return DB_ERROR;

  /*
    Write out any remaining pages in the buffer pool to the binlog tablespace.
    Then flush the file to disk, and close the old tablespace.
  */
  while (buf_flush_list_space(space)) {
    /* Keep going for as long as the call returns true. */
  }
  os_aio_wait_until_no_pending_writes(false);
  space->flush<false>();

  mysql_mutex_lock(&fil_system.mutex);
  fil_system.detach(space, false);
  mysql_mutex_unlock(&fil_system.mutex);

  return DB_SUCCESS;
}


3900 3901 3902
/*
  Initialize the InnoDB implementation of binlog.
  Note that we do not create or open any binlog tablespaces here.
  This is only done if InnoDB binlog is enabled on the server level.

  What is set up here is active_binlog_mutex and active_binlog_cond.
*/
void
fsp_binlog_init()
{
  mysql_mutex_init(fsp_active_binlog_mutex_key, &active_binlog_mutex, nullptr);
  pthread_cond_init(&active_binlog_cond, nullptr);
}


/*
  Open the InnoDB binlog implementation.
  This is called from server binlog layer if the user configured the binlog to
  use the innodb implementation (with --binlog-storage-engine=innodb).
*/
bool
innodb_binlog_init(size_t binlog_size)
{
  uint64_t pages= binlog_size >> srv_page_size_shift;
  if (UNIV_LIKELY(pages > (uint64_t)UINT32_MAX)) {
    pages= UINT32_MAX;
    ib::warn() << "Requested max_binlog_size is larger than the maximum " <<
      "InnoDB tablespace size, truncated to " <<
      (pages << srv_page_size_shift) << ".";
  } else if (pages < 2) {  /* Minimum one data page and one index page. */
    pages= 2;
    ib::warn() << "Requested max_binlog_size is smaller than the minimum " <<
      "size supported by InnoDB, truncated to " <<
      (pages << srv_page_size_shift) << ".";
  }
  binlog_size_in_pages= (uint32_t)pages;

  first_open_binlog_file_no.store(~(uint64_t)0, std::memory_order_relaxed);
  last_created_binlog_file_no= ~(uint64_t)0;
  active_binlog_file_no.store(~(uint64_t)0, std::memory_order_release);
  active_binlog_space= nullptr;
3939
  binlog_cur_page_no= 0;
3940
  binlog_cur_page_offset= FIL_PAGE_DATA;
3941 3942
  /* Find any existing binlog files and continue writing in them. */
  fsp_binlog_discover();
3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956

  /* Start pre-allocating new binlog files. */
  binlog_prealloc_thr_obj= std::thread{fsp_binlog_prealloc_thread};

  mysql_mutex_lock(&active_binlog_mutex);
  while (last_created_binlog_file_no == ~(uint64_t)0) {
    /* Wait for the first binlog file to be available. */
    my_cond_wait(&active_binlog_cond, &active_binlog_mutex.m_mutex);
  }
  mysql_mutex_unlock(&active_binlog_mutex);

  return false;
}

3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322

/*
  Result of scanning for existing binlog files, as maintained by
  process_binlog_name().
*/
struct found_binlogs {
  /* File number of the last file, and of the one just before it. */
  uint64_t last_file_no, prev_file_no;
  /* The sizes that were passed in along with those two file numbers. */
  size_t last_size, prev_size;
  /*
    How many of the above are valid: 0 (nothing seen yet), 1 (only last_*),
    or 2 (both last_* and prev_*).
  */
  int found_binlogs;
};


/*
  Compute the (so far) last and last-but-one binlog files found.
  The last-but-one file is only recorded when its number is exactly one less
  than that of the last file.
*/
static void
process_binlog_name(found_binlogs *bls, uint64_t idx, size_t size)
{
  const bool becomes_last=
    bls->found_binlogs == 0 || idx > bls->last_file_no;

  if (!becomes_last) {
    /* An older file; it only matters if it directly precedes the last one. */
    if (bls->found_binlogs == 1 && idx + 1 == bls->last_file_no) {
      bls->prev_file_no= idx;
      bls->prev_size= size;
      bls->found_binlogs= 2;
    }
    return;
  }

  if (bls->found_binlogs >= 1 && idx == bls->last_file_no + 1) {
    /* The previous last file directly precedes the new one: shift it down. */
    bls->prev_file_no= bls->last_file_no;
    bls->prev_size= bls->last_size;
    bls->found_binlogs= 2;
  } else {
    /* First file seen, or a gap in the numbering: start over with one file. */
    bls->found_binlogs= 1;
  }
  bls->last_file_no= idx;
  bls->last_size= size;
}


/*
  Open an existing tablespace. The filehandle fh is taken over by the tablespace
  (or closed in case of error).
*/
static fil_space_t *
fsp_binlog_open(const char *file_name, pfs_os_file_t fh,
                uint64_t file_no, size_t file_size, bool open_empty)
{
  const uint32_t page_size= (uint32_t)srv_page_size;
  const uint32_t page_size_shift= srv_page_size_shift;

  os_offset_t binlog_size= max_binlog_size;
  if (open_empty && file_size < binlog_size) {
    /*
      A crash may have left a partially pre-allocated file. If so, extend it
      to the required size.
      Note that this may also extend a previously pre-allocated file to the new
      binlog configured size, if the configuration changed during server
      restart.
    */
    if (!os_file_set_size(file_name, fh, binlog_size, false)) {
      ib::warn() << "Failed to change the size of InnoDB binlog file " <<
        file_name << " from " << file_size << " to " << binlog_size <<
        " bytes (error code: " << errno << ").";
    } else {
      file_size= binlog_size;
    }
  }
  if (file_size < 2*page_size)
  {
    ib::warn() << "InnoDB binlog file number " << file_no << " is too short"
      " (" << file_size << " bytes), should be at least " << 2*page_size <<
      " bytes.";
    os_file_close(fh);
    return nullptr;
  }

  uint32_t space_id= SRV_SPACE_ID_BINLOG0 + (file_no & 1);

  if (!open_empty) {
    page_t *page_buf= static_cast<byte*>(aligned_malloc(page_size, page_size));
    if (!page_buf) {
      os_file_close(fh);
      return nullptr;
    }

    dberr_t err= os_file_read(IORequestRead, fh, page_buf, 0, page_size, nullptr);
    if (err != DB_SUCCESS) {
      ib::warn() << "Unable to read first page of file " << file_name;
      aligned_free(page_buf);
      os_file_close(fh);
      return nullptr;
    }

    /* ToDo: Maybe use leaner page format for binlog tablespace? */
    uint32_t id1= mach_read_from_4(FIL_PAGE_SPACE_ID + page_buf);
    if (id1 != space_id) {
      ib::warn() << "Binlog file " << file_name <<
        " has inconsistent tablespace id " << id1 <<
        " (expected " << space_id << ")";
      aligned_free(page_buf);
      os_file_close(fh);
      return nullptr;
    }
    // ToDo: should we here check buf_page_is_corrupted() ?

    aligned_free(page_buf);
  }

  uint32_t fsp_flags=
    FSP_FLAGS_FCRC32_MASK_MARKER | FSP_FLAGS_FCRC32_PAGE_SSIZE();
  /* ToDo: Enryption. */
  fil_encryption_t mode= FIL_ENCRYPTION_OFF;
  fil_space_crypt_t* crypt_data= nullptr;
  fil_space_t *space;

  mysql_mutex_lock(&fil_system.mutex);
  if (!(space= fil_space_t::create(space_id, fsp_flags,
                                   FIL_TYPE_TABLESPACE, crypt_data,
                                   mode, true))) {
    mysql_mutex_unlock(&fil_system.mutex);
    os_file_close(fh);
    return nullptr;
  }

  space->add(file_name, fh, (uint32_t)(file_size >> page_size_shift),
             false, true);

  first_open_binlog_file_no.store(file_no, std::memory_order_release);
  if (last_created_binlog_file_no == ~(uint64_t)0 ||
      file_no > last_created_binlog_file_no) {
    last_created_binlog_file_no= file_no;
    last_created_binlog_space= space;
  }

  mysql_mutex_unlock(&fil_system.mutex);
  return space;
}


static bool
binlog_page_empty(const byte *page)
{
  return page[FIL_PAGE_DATA] == 0;
}


/*
  Find the last written position in the binlog file.
  Do a binary search through the pages to find the last non-empty page, then
  scan the page to find the place to start writing new binlog data.

  Returns:
     1 position found, output in *out_space, *out_page_no and *out_pos_in_page.
     0 binlog file is empty.
    -1 error.
*/

static int
find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf,
                   fil_space_t **out_space,
                   uint32_t *out_page_no, uint32_t *out_pos_in_page)
{
  const uint32_t page_size= (uint32_t)srv_page_size;
  const uint32_t page_size_shift= (uint32_t)srv_page_size_shift;
  const uint32_t idx= file_no & 1;
  char file_name[BINLOG_NAME_LEN];
  uint32_t p_0, p_1, p_2, last_nonempty;
  dberr_t err;
  byte *p, *page_end;
  bool ret;

  *out_page_no= 0;
  *out_pos_in_page= FIL_PAGE_DATA;

  binlog_name_make(file_name, file_no);
  pfs_os_file_t fh= os_file_create(innodb_data_file_key, file_name,
                                   OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
                                   OS_FILE_AIO, OS_DATA_FILE,
                                   srv_read_only_mode, &ret);
  if (!ret) {
    ib::warn() << "Unable to open file " << file_name;
    return -1;
  }

  err= os_file_read(IORequestRead, fh, page_buf, 0, page_size, nullptr);
  if (err != DB_SUCCESS) {
    os_file_close(fh);
    return -1;
  }
  if (binlog_page_empty(page_buf)) {
    *out_space= fsp_binlog_open(file_name, fh, file_no, file_size, true);
    binlog_cur_written_offset[idx].store(0, std::memory_order_relaxed);
    binlog_cur_end_offset[idx].store(0, std::memory_order_relaxed);
    return (*out_space ? 0 : -1);
  }
  last_nonempty= 0;

  /*
    During the binary search, p_0-1 is the largest page number that is know to
    be non-empty. And p_2 is the first page that is known to be empty.
  */
  p_0= 1;
  p_2= (uint32_t)(file_size / page_size);
  for (;;) {
    if (p_0 == p_2)
      break;
    ut_ad(p_0 < p_2);
    p_1= (p_0 + p_2) / 2;
    err= os_file_read(IORequestRead, fh, page_buf, p_1 << page_size_shift,
                      page_size, nullptr);
    if (err != DB_SUCCESS) {
      os_file_close(fh);
      return -1;
    }
    if (binlog_page_empty(page_buf)) {
      p_2= p_1;
    } else {
      p_0= p_1 + 1;
      last_nonempty= p_1;
    }
  }
  /* At this point, p_0 == p_2 is the first empty page. */
  ut_ad(p_0 >= 1);

  /*
    This sometimes does an extra read, but as this is only during startup it
    does not matter.
  */
  err= os_file_read(IORequestRead, fh, page_buf,
                    last_nonempty << page_size_shift, page_size, nullptr);
  if (err != DB_SUCCESS) {
    os_file_close(fh);
    return -1;
  }

  /* Now scan the last page to find the position in it to continue. */
  p= &page_buf[FIL_PAGE_DATA];
  page_end= &page_buf[page_size - FIL_PAGE_DATA_END];
  while (*p && p < page_end) {
    if (*p == 0xff) {
      p= page_end;
      break;
    }
    p += 3 + (((uint32_t)p[2] << 8) | ((uint32_t)p[1] & 0xff));
    // ToDo: How to handle page corruption?
    ut_a(p <= page_end);
  }

  *out_page_no= p_0 - 1;
  *out_pos_in_page= (uint32_t)(p - page_buf);

  *out_space= fsp_binlog_open(file_name, fh, file_no, file_size, false);
  uint64_t pos= (*out_page_no << page_size_shift) | *out_pos_in_page;
  binlog_cur_written_offset[idx].store(pos, std::memory_order_relaxed);
  binlog_cur_end_offset[idx].store(pos, std::memory_order_relaxed);
  return (*out_space ? 1 : -1);
}


/*
  Returns:
    -1     error
     0     No binlogs found
     1     Just one binlog file found
     2     Found two (or more) existing binlog files
*/
static int
fsp_binlog_discover()
{
  uint64_t file_no;
  const uint32_t page_size= (uint32_t)srv_page_size;
  const uint32_t page_size_shift= (uint32_t)srv_page_size_shift;
  MY_DIR *dir= my_dir(".", MYF(MY_WME|MY_WANT_STAT));  // ToDo: configurable binlog directory, and don't ask my_dir to stat every file found
  if (!dir)
    return -1;

  struct found_binlogs binlog_files;
  binlog_files.found_binlogs= 0;
  size_t num_entries= dir->number_of_files;
  fileinfo *entries= dir-> dir_entry;
  for (size_t i= 0; i < num_entries; ++i) {
    const char *name= entries[i].name;
    uint64_t idx;
    if (!is_binlog_name(name, &idx))
      continue;
    process_binlog_name(&binlog_files, idx, entries[i].mystat->st_size);
  }
  my_dirend(dir);

  /*
    Now, if we found any binlog files, locate the point in one of them where
    binlogging stopped, and where we should continue writing new binlog data.
  */
  fil_space_t *space, *prev_space;
  uint32_t page_no, prev_page_no, pos_in_page, prev_pos_in_page;
  // ToDo: Do we need aligned_malloc() for page_buf, to be able to read a page into it (like IO_DIRECT maybe) ?
  std::unique_ptr<byte[]> page_buf(new byte[page_size]);
  if (!page_buf)
    return -1;
  if (binlog_files.found_binlogs >= 1) {
    int res= find_pos_in_binlog(binlog_files.last_file_no,
                                binlog_files.last_size,
                                page_buf.get(),
                                &space, &page_no, &pos_in_page);
    if (res < 0) {
      file_no= binlog_files.last_file_no;
      active_binlog_file_no.store(file_no, std::memory_order_release);
      ib::warn() << "Binlog number " << binlog_files.last_file_no <<
        " could no be opened. Starting a new binlog file from number " <<
        (file_no + 1) << ".";
      return 0;
    }

    if (res > 0) {
      /* Found start position in the last binlog file. */
      file_no= binlog_files.last_file_no;
      active_binlog_file_no.store(file_no, std::memory_order_release);
      active_binlog_space= space;
      binlog_cur_page_no= page_no;
      binlog_cur_page_offset= pos_in_page;
      ib::info() << "Continuing binlog number " << file_no << " from position "
                 << (((uint64_t)page_no << page_size_shift) | pos_in_page)
                 << ".";
      return binlog_files.found_binlogs;
    }

    /* res == 0, the last binlog is empty. */
    if (binlog_files.found_binlogs >= 2) {
      /* The last binlog is empty, try the previous one. */
      res= find_pos_in_binlog(binlog_files.prev_file_no,
                              binlog_files.prev_size,
                              page_buf.get(),
                              &prev_space, &prev_page_no, &prev_pos_in_page);
      if (res < 0) {
        file_no= binlog_files.last_file_no;
        active_binlog_file_no.store(file_no, std::memory_order_release);
        active_binlog_space= space;
        binlog_cur_page_no= page_no;
        binlog_cur_page_offset= pos_in_page;
        ib::warn() << "Binlog number " << binlog_files.prev_file_no
                   << " could not be opened, starting from binlog number "
                   << file_no << " instead." ;
        return 1;
      }
      file_no= binlog_files.prev_file_no;
      active_binlog_file_no.store(file_no, std::memory_order_release);
      active_binlog_space= prev_space;
      binlog_cur_page_no= prev_page_no;
      binlog_cur_page_offset= prev_pos_in_page;
      ib::info() << "Continuing binlog number " << file_no << " from position "
                 << (((uint64_t)prev_page_no << page_size_shift) |
                     prev_pos_in_page)
                 << ".";
      return binlog_files.found_binlogs;
    }

    /* Just one empty binlog file found. */
    file_no= binlog_files.last_file_no;
    active_binlog_file_no.store(file_no, std::memory_order_release);
    active_binlog_space= space;
    binlog_cur_page_no= page_no;
    binlog_cur_page_offset= pos_in_page;
    ib::info() << "Continuing binlog number " << file_no << " from position "
               << FIL_PAGE_DATA << ".";
    return binlog_files.found_binlogs;
  }

  /* No binlog files found, start from scratch. */
  file_no= 0;
  ib::info() << "Starting a new binlog from file number " << file_no << ".";
  return 0;
}


4323 4324
void fsp_binlog_close()
{
4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341
  if (binlog_prealloc_thr_obj.joinable()) {
    mysql_mutex_lock(&active_binlog_mutex);
    prealloc_thread_end= true;
    pthread_cond_signal(&active_binlog_cond);
    mysql_mutex_unlock(&active_binlog_mutex);
    binlog_prealloc_thr_obj.join();
  }

  uint64_t file_no= first_open_binlog_file_no.load(std::memory_order_relaxed);
  if (file_no != ~(uint64_t)0) {
    if (file_no <= last_created_binlog_file_no) {
      fsp_binlog_tablespace_close(file_no);
      if (file_no + 1 <= last_created_binlog_file_no) {
        fsp_binlog_tablespace_close(file_no + 1);
      }
    }
  }
4342 4343 4344 4345 4346 4347 4348 4349
  /*
    ToDo: This doesn't seem to free all memory. I'm still getting leaks in eg. --valgrind. Find out why and fix. Example:
==3464576==    at 0x48407B4: malloc (vg_replace_malloc.c:381)
==3464576==    by 0x15318CD: mem_strdup(char const*) (mem0mem.inl:452)
==3464576==    by 0x15321DF: fil_space_t::add(char const*, pfs_os_file_t, unsigned int, bool, bool, unsigned int) (fil0fil.cc:306)
==3464576==    by 0x1558445: fsp_binlog_tablespace_create(unsigned long) (fsp0fsp.cc:3900)
==3464576==    by 0x1558C70: fsp_binlog_write_cache(st_io_cache*, unsigned long, mtr_t*) (fsp0fsp.cc:4013)
  */
4350 4351
  pthread_cond_destroy(&active_binlog_cond);
  mysql_mutex_destroy(&active_binlog_mutex);
4352 4353 4354
}


4355
/** Create a binlog tablespace file
4356 4357
@param[in]  file_no	 Index of the binlog tablespace
@param[out] new_space	 The newly created tablespace
4358
@return DB_SUCCESS or error code */
4359
dberr_t fsp_binlog_tablespace_create(uint64_t file_no, fil_space_t **new_space)
4360 4361 4362 4363
{
	pfs_os_file_t	fh;
	bool		ret;

4364 4365
        *new_space= nullptr;
	uint32_t size= binlog_size_in_pages;
4366 4367 4368
	if(srv_read_only_mode)
		return DB_ERROR;

4369 4370
        char name[BINLOG_NAME_LEN];
        binlog_name_make(name, file_no);
4371

4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399
	os_file_create_subdirs_if_needed(name);

	/* ToDo: Do we need here an mtr.log_file_op(FILE_CREATE) like in fil_ibd_create(()? */
	fh = os_file_create(
		innodb_data_file_key,
		name,
		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
		OS_FILE_AIO, OS_DATA_FILE, srv_read_only_mode, &ret);

	if (!ret) {
		os_file_close(fh);
		return DB_ERROR;
	}

	/* ToDo: Enryption? */
	fil_encryption_t mode= FIL_ENCRYPTION_OFF;
	fil_space_crypt_t* crypt_data= nullptr;

	/* We created the binlog file and now write it full of zeros */
	if (!os_file_set_size(name, fh,
			      os_offset_t{size} << srv_page_size_shift)) {
		ib::error() << "Unable to allocate " << name;
		os_file_close(fh);
		os_file_delete(innodb_data_file_key, name);
		return DB_ERROR;
	}

	mysql_mutex_lock(&fil_system.mutex);
4400 4401
        /* ToDo: Need to ensure file (N-2) is no longer active before creating (N). */
	uint32_t space_id= SRV_SPACE_ID_BINLOG0 + (file_no & 1);
4402
	if (!(*new_space= fil_space_t::create(space_id,
4403 4404 4405 4406 4407
                                                ( FSP_FLAGS_FCRC32_MASK_MARKER |
						  FSP_FLAGS_FCRC32_PAGE_SSIZE()),
						FIL_TYPE_TABLESPACE, crypt_data,
						mode, true))) {
		mysql_mutex_unlock(&fil_system.mutex);
4408 4409
		os_file_close(fh);
		os_file_delete(innodb_data_file_key, name);
4410 4411 4412
		return DB_ERROR;
	}

4413
	fil_node_t* node = (*new_space)->add(name, fh, size, false, true);
4414 4415 4416 4417 4418 4419
	IF_WIN(node->find_metadata(), node->find_metadata(fh, true));
	mysql_mutex_unlock(&fil_system.mutex);

	return DB_SUCCESS;
}

4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489

/*
  Background thread to close old binlog tablespaces and pre-allocate new ones.

  The thread loops while holding active_binlog_mutex, and in each iteration
  does at most one of the following:

   - Create the next binlog tablespace, when the most recently created one
     is the active one.
   - Close the oldest open tablespace, when it is older than the active one.
   - Exit, when prealloc_thread_end has been set (see fsp_binlog_close()).
   - Otherwise sleep on active_binlog_cond until signalled.

  The mutex is released while a tablespace is being created or closed, and
  the loop is restarted afterwards so that the shared state is read afresh.
*/
static void
fsp_binlog_prealloc_thread()
{

  mysql_mutex_lock(&active_binlog_mutex);
  while (1)
  {
    /* Snapshot of the shared state, taken under the mutex. */
    uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
    uint64_t first_open= first_open_binlog_file_no.load(std::memory_order_relaxed);

    /* Pre-allocate the next tablespace (if not done already). */
    uint64_t last_created= last_created_binlog_file_no;
    if (last_created <= active && last_created <= first_open) {
      fil_space_t *new_space;
      ut_ad(last_created == active);
      ut_ad(last_created == first_open || first_open == ~(uint64_t)0);
      /*
        Note: `last_created` is initialized to ~0, so incrementing it here
        makes us start from binlog file 0.
      */
      ++last_created;
      /* Do not hold the mutex while the file is being created. */
      mysql_mutex_unlock(&active_binlog_mutex);
      dberr_t res2= fsp_binlog_tablespace_create(last_created, &new_space);
      mysql_mutex_lock(&active_binlog_mutex);
      ut_a(res2 == DB_SUCCESS /* ToDo: Error handling. */);
      ut_a(new_space);
      last_created_binlog_file_no= last_created;
      last_created_binlog_space= new_space;

      /* If we created the initial tablespace file, make it the active one. */
      ut_ad(active < ~(uint64_t)0 || last_created == 0);
      if (active == ~(uint64_t)0) {
        active_binlog_file_no.store(last_created, std::memory_order_relaxed);
        active_binlog_space= last_created_binlog_space;
      }
      /* Likewise, the very first tablespace created is the first open one. */
      if (first_open == ~(uint64_t)0)
        first_open_binlog_file_no.store(first_open= last_created,
                                        std::memory_order_relaxed);

      /* Wake up anyone waiting for the new tablespace to become available. */
      pthread_cond_signal(&active_binlog_cond);
      continue;  /* Re-start loop after releasing/reacquiring mutex. */
    }

    /*
      Flush out to disk and close any binlog tablespace that has been
      completely written.
    */
    if (first_open < active) {
      ut_ad(first_open == active - 1);
      /* Do not hold the mutex while the tablespace is being closed. */
      mysql_mutex_unlock(&active_binlog_mutex);
      fsp_binlog_tablespace_close(active - 1);
      mysql_mutex_lock(&active_binlog_mutex);
      first_open_binlog_file_no.store(first_open + 1, std::memory_order_relaxed);
      continue;  /* Re-start loop after releasing/reacquiring mutex. */
    }

    /* Exit thread at server shutdown. */
    if (prealloc_thread_end)
      break;
    /* Nothing to do right now; sleep until signalled on active_binlog_cond. */
    my_cond_wait(&active_binlog_cond, &active_binlog_mutex.m_mutex);

  }
  mysql_mutex_unlock(&active_binlog_mutex);
}


4490 4491 4492
void fsp_binlog_write_start(uint32_t page_no,
                            const uchar *data, uint32_t len, mtr_t *mtr)
{
4493
	buf_block_t *block= fsp_page_create(active_binlog_space, page_no, mtr);
4494 4495 4496 4497 4498 4499 4500 4501 4502 4503
	mtr->memcpy<mtr_t::MAYBE_NOP>(*block, FIL_PAGE_DATA + block->page.frame,
				      data, len);
	binlog_cur_block= block;
}

void fsp_binlog_write_offset(uint32_t page_no, uint32_t offset,
                             const uchar *data, uint32_t len, mtr_t *mtr)
{
	dberr_t err;
        /* ToDo: Is RW_SX_LATCH appropriate here? */
4504
	buf_block_t *block= buf_page_get_gen(page_id_t{active_binlog_space->id, page_no},
4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537
					     0, RW_SX_LATCH, binlog_cur_block,
					     BUF_GET, mtr, &err);
	ut_a(err == DB_SUCCESS);
	mtr->memcpy<mtr_t::MAYBE_NOP>(*block,
                                      offset + block->page.frame,
                                      data, len);
}

/*
  Append `len` raw bytes from `data` at the current binlog position
  (binlog_cur_page_no / binlog_cur_page_offset), continuing on following
  pages as needed, and advance the current position accordingly.

  A page is considered full once fewer than 4 bytes remain before its
  trailer (FIL_PAGE_DATA_END); writing then continues at FIL_PAGE_DATA of
  the next page.
*/
void fsp_binlog_append(const uchar *data, uint32_t len, mtr_t *mtr)
{
  ut_ad(binlog_cur_page_offset <= srv_page_size - FIL_PAGE_DATA_END);
  const uint32_t page_end= (uint32_t)srv_page_size - FIL_PAGE_DATA_END;
  // ToDo: Some kind of mutex to protect binlog access.
  while (len > 0) {
    /*
      Recompute the free space from the current offset in every iteration.
      Using a value computed before earlier writes would let a later write
      run past the usable end of the page.
    */
    uint32_t remain= page_end - binlog_cur_page_offset;
    if (remain < 4) {
      binlog_cur_page_offset= FIL_PAGE_DATA;
      remain= page_end - binlog_cur_page_offset;
      ++binlog_cur_page_no;
    }
    uint32_t this_len= std::min<uint32_t>(len, remain);
    /* The first write to a page creates it; later writes fetch it. */
    if (binlog_cur_page_offset == FIL_PAGE_DATA)
      fsp_binlog_write_start(binlog_cur_page_no, data, this_len, mtr);
    else
      fsp_binlog_write_offset(binlog_cur_page_no, binlog_cur_page_offset,
                              data, this_len, mtr);
    len-= this_len;
    data+= this_len;
    binlog_cur_page_offset+= this_len;
  }
}

4538 4539
void fsp_binlog_write_cache(IO_CACHE *cache, size_t main_size, mtr_t *mtr)
{
4540 4541
  uint32_t page_size= (uint32_t)srv_page_size;
  uint32_t page_size_shift= srv_page_size_shift;
4542
  fil_space_t *space= active_binlog_space;
4543
  const uint32_t page_end= page_size - FIL_PAGE_DATA_END;
4544 4545 4546 4547
  uint32_t page_no= binlog_cur_page_no;
  uint32_t page_offset= binlog_cur_page_offset;
  /* ToDo: What is the lifetime of what's pointed to by binlog_cur_block, is there some locking needed around it or something? */
  buf_block_t *block= binlog_cur_block;
4548
  uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
4549
  uint64_t pending_prev_end_offset= 0;
4550 4551 4552 4553 4554 4555 4556

  /*
    Write out the event data in chunks of whatever size will fit in the current
    page, until all data has been written.
  */
  size_t remain= my_b_tell(cache);
  ut_ad(remain > main_size);
4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572
  if (cache->pos_in_file > 0) {
    /*
      ToDo: A limitation in mysys IO_CACHE. If I change (reinit_io_cache())
      the cache from WRITE_CACHE to READ_CACHE without seeking out of the
      current buffer, then the cache will not be flushed to disk (which is
      good for small cache that fits completely in buffer). But then if I
      later my_b_seek() or reinit_io_cache() it again and seek out of the
      current buffer, the buffered data will not be flushed to the file
      because the cache is now a READ_CACHE! The result is that the end of the
      cache will be lost if the cache doesn't fit in memory.

      So for now, have to do this somewhat in-elegant conditional flush
      myself.
    */
    flush_io_cache(cache);
  }
4573 4574 4575 4576 4577 4578
  /* Start with the GTID event, which is put at the end of the IO_CACHE. */
  my_bool res= reinit_io_cache(cache, READ_CACHE, main_size, 0, 0);
  ut_a(!res /* ToDo: Error handling. */);
  size_t gtid_remain= remain - main_size;
  while (remain > 0) {
    if (page_offset == FIL_PAGE_DATA) {
4579
      if (UNIV_UNLIKELY(page_no >= space->size)) {
4580
        /*
4581 4582 4583 4584
          Signal to the pre-allocation thread that this tablespace has been
          written full, so that it can be closed and a new one pre-allocated
          in its place. Then wait for a new tablespace to be pre-allocated that
          we can use.
4585

4586 4587 4588 4589
          The normal case is that the next tablespace is already pre-allocated
          and available; binlog tablespace N is active while (N+1) is being
          pre-allocated. Only under extreme I/O pressure should be need to
          stall here.
4590 4591 4592 4593 4594 4595 4596 4597 4598

          ToDo: Handle recovery. Idea: write the current LSN at the start of
          the binlog tablespace when we create it. At recovery, we should open
          the (at most) 2 most recent binlog tablespaces. Whenever we have a
          redo record, skip it if its LSN is smaller than the one stored in the
          tablespace corresponding to its space_id. This way, it should be safe
          to re-use tablespace ids between just two, SRV_SPACE_ID_BINLOG0 and
          SRV_SPACE_ID_BINLOG1.
        */
4599
        pending_prev_end_offset= page_no << page_size_shift;
4600 4601 4602 4603 4604 4605 4606 4607
        mysql_mutex_lock(&active_binlog_mutex);
        /* ToDo: Make this wait killable?. */
        /* ToDo2: Handle not stalling infinitely if the new tablespace cannot be created due to eg. I/O error. Or should we in this case loop and repeatedly retry the create? */
        while (last_created_binlog_file_no <= file_no) {
          my_cond_wait(&active_binlog_cond, &active_binlog_mutex.m_mutex);
        }

        // ToDo: assert that a single write doesn't span more than two binlog files.
4608
        ++file_no;
4609 4610 4611 4612 4613 4614
        binlog_cur_written_offset[file_no & 1].store(0, std::memory_order_relaxed);
        binlog_cur_end_offset[file_no & 1].store(0, std::memory_order_relaxed);
        active_binlog_file_no.store(file_no, std::memory_order_release);
        active_binlog_space= space= last_created_binlog_space;
        pthread_cond_signal(&active_binlog_cond);
        mysql_mutex_unlock(&active_binlog_mutex);
4615 4616
        binlog_cur_page_no= page_no= 0;
      }
4617 4618 4619 4620 4621
      block= fsp_page_create(space, page_no, mtr);
    } else {
      dberr_t err;
      /* ToDo: Is RW_SX_LATCH appropriate here? */
      block= buf_page_get_gen(page_id_t{space->id, page_no},
4622
                              0, RW_SX_LATCH, block,
4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679
                              BUF_GET, mtr, &err);
      ut_a(err == DB_SUCCESS);
    }

    ut_ad(page_offset < page_end);
    uint32_t page_remain= page_end - page_offset;
    byte *ptr= page_offset + block->page.frame;
    /* ToDo: Do this check at the end instead, to save one buf_page_get_gen()? */
    if (page_remain < 4) {
      /* Pad the remaining few bytes, and move to next page. */
      mtr->memset(block, page_offset, page_remain, 0xff);
      block= nullptr;
      ++page_no;
      page_offset= FIL_PAGE_DATA;
      continue;
    }
    page_remain-= 3;    /* Type byte and 2-byte length. */
    uint32_t size= 0;
    /* Write GTID data, if any still available. */
    if (gtid_remain > 0)
    {
      size= gtid_remain > page_remain ? page_remain : (uint32_t)gtid_remain;
      int res2= my_b_read(cache, ptr+3, size);
      ut_a(!res2 /* ToDo: Error handling */);
      gtid_remain-= size;
      page_remain-= size;
      if (gtid_remain == 0)
        my_b_seek(cache, 0);    /* Move to read the rest of the events. */
    }
    /* Write remaining data, if any available _and_ more room on page. */
    ut_ad(remain >= size);
    size_t remain2= remain - size;
    if (remain2 + page_remain > 0) {
      uint32_t size2= remain2 > page_remain ? page_remain : (uint32_t)remain2;
      int res2= my_b_read(cache, ptr+3+size, size2);
      ut_a(!res2 /* ToDo: Error handling */);
      size+= size2;
      page_remain-= size2;
    }
    ptr[0]= 0x01 /* ToDo: FSP_BINLOG_TYPE_COMMIT */ | ((size < remain) << 7);
    ptr[1]= size & 0xff;
    ptr[2]= (byte)(size >> 8);
    ut_ad(size <= 0xffff);

    mtr->memcpy(*block, page_offset, size+3);
    remain-= size;
    if (page_remain == 0) {
      block= nullptr;
      page_offset= FIL_PAGE_DATA;
      ++page_no;
    } else {
      page_offset+= size+3;
    }
  }
  binlog_cur_block= block;
  binlog_cur_page_no= page_no;
  binlog_cur_page_offset= page_offset;
4680
  if (UNIV_UNLIKELY(pending_prev_end_offset))
4681 4682 4683 4684
    binlog_cur_end_offset[(file_no-1) & 1].store(pending_prev_end_offset,
                                                 std::memory_order_relaxed);
  binlog_cur_end_offset[file_no & 1].store((page_no << page_size_shift) + page_offset,
                                           std::memory_order_relaxed);
4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702
}


extern "C" void binlog_get_cache(THD *, IO_CACHE **, size_t *);

/*
  Write the binlog data of a transaction into the binlog tablespace.

  Does nothing when the transaction has no connection (mysql_thd) attached,
  or when binlog_get_cache() reports a main size of zero.
*/
void
fsp_binlog_trx(trx_t *trx, mtr_t *mtr)
{
  if (trx->mysql_thd) {
    IO_CACHE *binlog_cache;
    size_t binlog_main_size;

    binlog_get_cache(trx->mysql_thd, &binlog_cache, &binlog_main_size);
    if (binlog_main_size != 0)
      fsp_binlog_write_cache(binlog_cache, binlog_main_size, mtr);
  }
}

4703 4704 4705 4706 4707

void fsp_binlog_test(const uchar *data, uint32_t len)
{
  mtr_t mtr;
  mtr.start();
4708 4709
  if (!active_binlog_space)
    fsp_binlog_tablespace_create(0, &active_binlog_space);
4710 4711 4712
  fsp_binlog_append(data, len, &mtr);
  mtr.commit();
}
4713 4714 4715 4716 4717


/*
  Reader of binlog data stored in InnoDB binlog tablespaces.

  The single public operation is read_binlog_data(); the private helpers
  read from a buffer pool page, from the file, or from a page image already
  in memory.
*/
class ha_innodb_binlog_reader : public handler_binlog_reader {
private:
  /*
    Buffer to hold a page read directly from the binlog file. Allocated in
    the constructor and released in the destructor.
  */
  uchar *page_buf;
  /* Length of the currently open file (cur_file). */
  uint64_t cur_file_length;
  /* Used to keep track of partial chunk returned to reader. */
  uint32_t chunk_pos;
  uint32_t chunk_remain;

  int read_from_buffer_pool_page(buf_block_t *block, uint64_t end_offset,
                                 uchar *buf, uint32_t len);
  int read_from_file(uint64_t end_offset, uchar *buf, uint32_t len);
  int read_from_page(uchar *page_ptr, uint64_t end_offset,
                     uchar *buf, uint32_t len);

public:
  ha_innodb_binlog_reader();
  ~ha_innodb_binlog_reader();
  /* Read up to `len` bytes from the current binlog position into `buf`. */
  virtual int read_binlog_data(uchar *buf, uint32_t len) final;
};


/*
  Set up a reader positioned at binlog file number 0, with a page-sized
  buffer and no partially returned chunk.

  NOTE(review): cur_file_length is not given a value here, although
  read_binlog_data() compares against it; confirm that it is set before the
  first such comparison.
*/
ha_innodb_binlog_reader::ha_innodb_binlog_reader()
  : page_buf((uchar *)my_malloc(PSI_NOT_INSTRUMENTED, srv_page_size, MYF(0))), /* ToDo: InnoDB alloc function? */
    chunk_pos(0), chunk_remain(0)
{
  // ToDo: Need some mechanism to find where to start reading. This is just "start from 0" for early testing.
  cur_file_no= 0;
}


/*
  Close the binlog file if one is open (cur_file is not -1), and release the
  page buffer.
*/
ha_innodb_binlog_reader::~ha_innodb_binlog_reader()
{
  const bool file_is_open= cur_file != (File)-1;
  if (file_is_open)
    my_close(cur_file, MYF(0));
  my_free(page_buf); /* ToDo: InnoDB alloc function? */
}


/*
  Read data from current position in binlog.

  If the data is written to disk (visible at the OS level, even if not
  necessarily fsync()'ed to disk), we can read directly from the file.
  Otherwise, the data must still be available in the buffer pool and
  we can read it from there.

  First try a dirty read of current state; if this says the data is available
  to read from the file, this is safe to do (data cannot become un-written).

  If not, then check if the page is in the buffer pool; if it is not, then
  likewise we know it's safe to read from the file directly.

  Finally, do another check of the current state. This will catch the case
  where we looked for a page in binlog file N, but its tablespace id has been
  recycled, so we got a page from (N+2) instead. In this case also, we can
  then read from the real file.
*/
int ha_innodb_binlog_reader::read_binlog_data(uchar *buf, uint32_t len)
{
  int res;
  uint64_t file_no= cur_file_no;
  uint64_t offset= cur_file_offset;
4778 4779
  uint64_t active_file_no= active_binlog_file_no.load(std::memory_order_acquire);
  if (first_open_binlog_file_no.load(std::memory_order_relaxed) > file_no + /* Temporary hack to work-around the next line ToDo: comment */ (offset >= cur_file_length)) {
4780
    // ToDo: I think there is a bug here, if we're at the end of active_file_no-2, we will be reading directly from active_file_no-1 without checking properly if buffer pool is needed instead. */
4781
    return read_from_file(~(uint64_t)0, buf, len);
4782
  }
4783

4784
  ut_ad(active_file_no >= file_no);
4785 4786 4787
  uint32_t idx= file_no & 1;
  uint64_t write_offset=
    binlog_cur_written_offset[idx].load(std::memory_order_relaxed);
4788
  // ToDo: I'm not 100% confident about this dirty read of the end_offset. I need to make sure it's not possible to end up using a wrong end offset when reading from a file. When reading from a file that is not the latest, active binlog file, the end offset should basically always come from the file size.
4789 4790 4791
  uint64_t end_offset=
    binlog_cur_end_offset[idx].load(std::memory_order_relaxed);
  /* ToDo: Should I check end_offset? It might be stale and completely wrong? But on the other hand, I _must_ check it somehow, otherwise I might read not yet committed data (possibly never committed). But I should be able to read and check, it cannot be completely wrong, as I have the per-tablespace-id values. And in case of stale, I will then go to lock and wait and get the real value in a safe way, which also results in correct behaviour. And I can never read stale data from the file, data will not be written out until valid and synced in the redo log. */
4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808
  if (file_no == active_file_no) {
    ut_ad(end_offset >= offset);
    if (end_offset <= offset)
      return 0;
  } else {  /* file_no == active_file_no - 1 */
    if (offset >= end_offset) {  // ToDo what if this end_pos is stale? Need somehow an extra check afterwards if we are now active-2, and then do a simple file read, not EOF on the stale end_offset.
      /* Handle moving to the currently active file. */
      cur_file_no= ++file_no;
      cur_file_offset= offset= 0;
      idx= file_no & 1;
      write_offset=
        binlog_cur_written_offset[idx].load(std::memory_order_relaxed);
      end_offset=
        binlog_cur_end_offset[idx].load(std::memory_order_relaxed);
    }
  }

4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845
  if (write_offset > offset)
    return read_from_file(std::min(write_offset, end_offset), buf, len);

  /*
    The data we need may not yet been written and available to read from the
    file directly. So try to find it in the buffer pool. But don't ask the pool
    to bring in the page if it turns out not to be there, in that case we will
    just read it directly ourselves.
  */
  mtr_t mtr;
  mtr.start();
  /*
    ToDo: Here could remember last block and use as guess for the page lookup.
    This might be useful for reading at the active position in the binlog,
    where we might read successive events repeatedly from same page. Though
    there will be a (small) cost to maintaining the hint also, and we should
    clear the hint block whenever we read a page to completion. Also, care
    should be taken about the lifetime, if the hint block is required to point
    to a valid page and does not any more. */
  buf_block_t *hint_block= nullptr;
  uint32_t space_id= SRV_SPACE_ID_BINLOG0 + idx;
  uint32_t page_no= (uint32_t)(offset >> srv_page_size_shift);
  dberr_t err= DB_SUCCESS;
  buf_block_t *block=
    buf_page_get_gen(page_id_t{space_id, page_no}, 0,
                     RW_S_LATCH, hint_block, BUF_GET_IF_IN_POOL, &mtr, &err);
  if (err != DB_SUCCESS)
    res= -1;  // ToDo: More error handling here? Probably this is not expected to fail. Or what is the return if the page is not found?
  else if (!block)
    res= read_from_file(end_offset, buf, len);
  else {
    /*
      Ok, we got a page from the buffer pool. Before reading the data from it
      though, check again that the tablespace ID has not been recycled. If it
      has, then the page is invalid (it is from a newer binlog file tablespace
      file_no + 2), and we should just read from the real file directly.
    */
4846
    active_file_no= active_binlog_file_no.load(std::memory_order_acquire);
4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872
    if (active_file_no > file_no + 1)
      res= read_from_file(~(uint64_t)0, buf, len);
    else
      res= read_from_buffer_pool_page(block, end_offset, buf, len);
  }
  mtr.commit();

  return res;
}


/*
  Read binlog data out of a page found in the buffer pool.

  block      Buffer pool block holding the page at the current position
  end_offset Current end of binlog file, no reads past this point
  buf        Destination buffer to read into
  len        Maximum number of bytes to read

  Returns whatever read_from_page() returns for the block's page frame.
*/
int
ha_innodb_binlog_reader::read_from_buffer_pool_page(buf_block_t *block,
                                                    uint64_t end_offset,
                                                    uchar *buf, uint32_t len)
{
  uchar *frame= block->page.frame;
  return read_from_page(frame, end_offset, buf, len);
}


int
ha_innodb_binlog_reader::read_from_file(uint64_t end_offset,
                                        uchar *buf, uint32_t len)
{
  uint64_t mask= ((uint64_t)1 << srv_page_size_shift) - 1;
  uint64_t offset= cur_file_offset;
4873
  uint64_t page_start_offset;
4874

4875 4876 4877 4878 4879
  if (cur_file < (File)0 || cur_file_offset >= cur_file_length) {
    if (!(cur_file < (File)0)) {
      my_close(cur_file, MYF(0));
      ++cur_file_no;
    }
4880 4881
    char filename[BINLOG_NAME_LEN];
    binlog_name_make(filename, cur_file_no);
4882
    if ((cur_file= my_open(filename, O_RDONLY | O_BINARY, MYF(MY_WME))) < (File)0)
4883 4884
      return -1;
    /* ToDo: Handle closing the file when we reach the end. In fact, handle reaching the end of a file in the first place. */
4885 4886 4887 4888 4889 4890 4891 4892 4893
    MY_STAT stat_buf;
    if (my_fstat(cur_file, &stat_buf, MYF(0))) {
      my_error(ER_CANT_GET_STAT, MYF(0), filename, errno);
      my_close(cur_file, MYF(0));
      cur_file= (File)-1;
      return -1;
    }
    cur_file_length= stat_buf.st_size;
    cur_file_offset= offset= 0;
4894
  }
4895 4896

  page_start_offset= offset & ~mask;
4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945
  size_t res= my_pread(cur_file, page_buf, srv_page_size, page_start_offset,
                       MYF(MY_WME));
  if (res == (size_t)-1)
    return -1;

  return read_from_page(page_buf, end_offset, buf, len);
}


/*
  Read out max `len` bytes from the chunks stored in a page.

  page_ptr   Points to start of data for current page matching cur_file_offset
  end_offset Current end of binlog file, no reads past this point
  buf        Destination buffer to read into
  len        Maximum number of bytes to read

  Each chunk on the page is a type byte, followed by a 2-byte little-endian
  data size, followed by the data. Type 0x00 means no more data has been
  written to the page yet, type 0xff means the rest of the page is filler.
  Only the data of chunks whose type (with the high bit masked off) is 1 is
  returned; other chunks are skipped.

  If a chunk holds more data than requested, cur_file_offset is left pointing
  at the header of that chunk, and chunk_pos / chunk_remain record how many of
  its data bytes have been returned and how many are left. The next call
  continues from there.

  Updates cur_file_offset to the position from which the next read should
  continue.

  Returns number of bytes actually read.
*/
int
ha_innodb_binlog_reader::read_from_page(uchar *page_ptr, uint64_t end_offset,
                                        uchar *buf, uint32_t len)
{
  /* Size of the chunk header: type byte plus 2-byte size. */
  const uint32_t chunk_hdr_len= 3;
  uint32_t page_size= (uint32_t)srv_page_size;
  uint64_t mask= ((uint64_t)1 << srv_page_size_shift) - 1;
  uint64_t page_start_offset= cur_file_offset & ~mask;
  uint32_t page_end=
    end_offset > page_start_offset + (page_size - FIL_PAGE_DATA_END) ?
      (page_size - FIL_PAGE_DATA_END) :
      (uint32_t)(end_offset & mask);
  uint32_t in_page_offset= (uint32_t)(cur_file_offset & mask);
  uint32_t sofar= 0;

  ut_ad(in_page_offset < page_size - FIL_PAGE_DATA_END);
  if (in_page_offset < FIL_PAGE_DATA)
    in_page_offset= FIL_PAGE_DATA;

  /* First return data from any partially-read chunk. */
  if (chunk_remain) {
    /*
      in_page_offset points at the header of the partially-read chunk, so the
      unread data starts after the header and the chunk_pos bytes that were
      already returned.
    */
    const uchar *chunk_data=
      page_ptr + in_page_offset + chunk_hdr_len + chunk_pos;
    if (chunk_remain > len) {
      /*
        Still more left than requested. Only chunk_pos / chunk_remain move;
        cur_file_offset must stay at the chunk header, otherwise the next call
        would advance twice.
      */
      memcpy(buf, chunk_data, len);
      chunk_pos+= len;
      chunk_remain-= len;
      return len;
    }
    sofar= chunk_remain;
    memcpy(buf, chunk_data, sofar);
    /* Step over the whole chunk: header, earlier data, and this data. */
    in_page_offset+= chunk_hdr_len + chunk_pos + sofar;
    chunk_pos= 0;
    chunk_remain= 0;
  }

  while (sofar < len && in_page_offset < page_end)
  {
    uchar type= page_ptr[in_page_offset];
    if (type == 0x00)
      break;  /* No more data on the page yet */
    if (type == 0xff /* ToDo FSP_BINLOG_TYPE_FILLER */) {
      in_page_offset= page_size;  /* Point to start of next page */
      break;  /* No more data on page */
    }
    uint32_t size=
      page_ptr[in_page_offset + 1] +
      (uint32_t)(page_ptr[in_page_offset + 2] << 8);
    if ((type & 0x7f) != 1 /* ToDo FSP_BINLOG_TYPE_COMMIT */) {
      /* Skip non-binlog-event record. */
      in_page_offset += chunk_hdr_len + size;
      continue;
    }

    /* Now grab the data in the chunk, or however much the caller requested. */
    uint32_t rest = len - sofar;
    if (size > rest) {
      /*
        Chunk contains more data than reader requested.
        Return what was requested, and remember the remaining partial data
        for the next read. in_page_offset is deliberately not advanced, so
        that cur_file_offset ends up pointing at this chunk's header.
      */
      memcpy(buf + sofar, page_ptr + (in_page_offset + chunk_hdr_len), rest);
      chunk_pos= rest;
      chunk_remain= size - rest;
      sofar+= rest;
      break;
    }

    memcpy(buf + sofar, page_ptr + (in_page_offset + chunk_hdr_len), size);
    in_page_offset= in_page_offset + chunk_hdr_len + size;
    sofar+= size;
  }

  if (in_page_offset >= page_size - FIL_PAGE_DATA_END)
    cur_file_offset= page_start_offset + page_size; // To start of next page
  else
    cur_file_offset= page_start_offset | in_page_offset;
  return sofar;
}


/*
  Create a new InnoDB binlog reader object.

  Returns a heap-allocated ha_innodb_binlog_reader through its
  handler_binlog_reader base pointer. Presumably the caller is responsible
  for deleting it -- confirm against the caller.
*/
handler_binlog_reader *
innodb_get_binlog_reader()
{
  handler_binlog_reader *reader= new ha_innodb_binlog_reader();
  return reader;
}


/*
  Write the contents of an IO_CACHE to the binlog inside a single
  mini-transaction.

  cache      The cache holding the data to write
  main_size  Passed through unchanged to fsp_binlog_write_cache()

  Returns false always; no error is currently reported to the caller.
*/
bool
innobase_binlog_write_direct(IO_CACHE *cache, size_t main_size)
{
  mtr_t write_mtr;

  write_mtr.start();
  fsp_binlog_write_cache(cache, main_size, &write_mtr);
  write_mtr.commit();

  /*
    ToDo: Should we sync the log here? Maybe depending on an extra bool
    parameter?
  */
  /*
    ToDo: Presumably fsp_binlog_write_cache() should be able to fail in some
    cases? Then return any such error to the caller.
  */
  return false;
}