log0log.cc 36.9 KB
Newer Older
1 2
/*****************************************************************************

3
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4
Copyright (c) 2009, Google Inc.
5
Copyright (c) 2014, 2021, MariaDB Corporation.
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
22
this program; if not, write to the Free Software Foundation, Inc.,
23
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24

25 26
*****************************************************************************/

27
/**************************************************//**
28
@file log/log0log.cc
osku's avatar
osku committed
29 30 31 32 33
Database log

Created 12/9/1995 Heikki Tuuri
*******************************************************/

34
#include "univ.i"
35
#include <debug_sync.h>
36
#include <my_service_manager.h>
37

osku's avatar
osku committed
38
#include "log0log.h"
39
#include "log0crypt.h"
osku's avatar
osku committed
40 41
#include "buf0buf.h"
#include "buf0flu.h"
42
#include "lock0lock.h"
osku's avatar
osku committed
43 44
#include "log0recv.h"
#include "fil0fil.h"
45
#include "dict0stats_bg.h"
46
#include "btr0defragment.h"
osku's avatar
osku committed
47 48 49 50
#include "srv0srv.h"
#include "srv0start.h"
#include "trx0sys.h"
#include "trx0trx.h"
Sergei Golubchik's avatar
Sergei Golubchik committed
51
#include "trx0roll.h"
52
#include "srv0mon.h"
53
#include "sync0sync.h"
54
#include "buf0dump.h"
55
#include "log0sync.h"
osku's avatar
osku committed
56 57 58 59

/*
General philosophy of InnoDB redo-logs:

60 61 62
Every change to a contents of a data page must be done
through mtr_t, and mtr_t::commit() will write log records
to the InnoDB redo log. */
osku's avatar
osku committed
63

64
/** Redo log system */
65
log_t	log_sys;
osku's avatar
osku committed
66 67

/* A margin for free space in the log buffer before a log entry is catenated */
68
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
osku's avatar
osku committed
69 70 71

/* Margins for free space in the log buffer after a log entry is catenated */
#define LOG_BUF_FLUSH_RATIO	2
72 73
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN		\
				 + (4U << srv_page_size_shift))
osku's avatar
osku committed
74

Sergei Golubchik's avatar
Sergei Golubchik committed
75
/** Extends the log buffer.
76
@param[in]	len	requested minimum size in bytes */
77
void log_buffer_extend(ulong len)
Sergei Golubchik's avatar
Sergei Golubchik committed
78
{
79
	const size_t new_buf_size = ut_calc_align(len, srv_page_size);
Marko Mäkelä's avatar
Marko Mäkelä committed
80 81 82 83
	byte* new_buf = static_cast<byte*>
		(ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
	byte* new_flush_buf = static_cast<byte*>
		(ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
Sergei Golubchik's avatar
Sergei Golubchik committed
84

85
	mysql_mutex_lock(&log_sys.mutex);
Sergei Golubchik's avatar
Sergei Golubchik committed
86

Marko Mäkelä's avatar
Marko Mäkelä committed
87
	if (len <= srv_log_buffer_size) {
88
		/* Already extended enough by the others */
89
		mysql_mutex_unlock(&log_sys.mutex);
90 91
		ut_free_dodump(new_buf, new_buf_size);
		ut_free_dodump(new_flush_buf, new_buf_size);
92
		return;
93 94
	}

Marko Mäkelä's avatar
Marko Mäkelä committed
95 96 97
	ib::warn() << "The redo log transaction size " << len <<
		" exceeds innodb_log_buffer_size="
		<< srv_log_buffer_size << " / 2). Trying to extend it.";
98

99 100
	byte* old_buf = log_sys.buf;
	byte* old_flush_buf = log_sys.flush_buf;
Marko Mäkelä's avatar
Marko Mäkelä committed
101
	const ulong old_buf_size = srv_log_buffer_size;
102
	srv_log_buffer_size = static_cast<ulong>(new_buf_size);
Marko Mäkelä's avatar
Marko Mäkelä committed
103
	log_sys.buf = new_buf;
104
	log_sys.flush_buf = new_flush_buf;
Marko Mäkelä's avatar
Marko Mäkelä committed
105
	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(new_buf, old_buf,
106
					       log_sys.buf_free);
107

Marko Mäkelä's avatar
Marko Mäkelä committed
108
	log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
Sergei Golubchik's avatar
Sergei Golubchik committed
109 110
		- LOG_BUF_FLUSH_MARGIN;

111
	mysql_mutex_unlock(&log_sys.mutex);
Sergei Golubchik's avatar
Sergei Golubchik committed
112

Marko Mäkelä's avatar
Marko Mäkelä committed
113
	ut_free_dodump(old_buf, old_buf_size);
114
	ut_free_dodump(old_flush_buf, old_buf_size);
Sergei Golubchik's avatar
Sergei Golubchik committed
115

116
	ib::info() << "innodb_log_buffer_size was extended to "
Marko Mäkelä's avatar
Marko Mäkelä committed
117
		<< new_buf_size << ".";
Sergei Golubchik's avatar
Sergei Golubchik committed
118 119
}

120
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
121
and lsn - buf_pool.get_oldest_modification().
122
@param[in]	file_size	requested innodb_log_file_size
123 124
@retval true on success
@retval false if the smallest log group is too small to
125
accommodate the number of OS threads in the database server */
126
bool
127
log_set_capacity(ulonglong file_size)
osku's avatar
osku committed
128
{
129 130 131 132 133 134
	/* Margin for the free space in the smallest log, before a new query
	step which modifies the database, is started */
	const size_t LOG_CHECKPOINT_FREE_PER_THREAD = 4U
						      << srv_page_size_shift;
	const size_t LOG_CHECKPOINT_EXTRA_FREE = 8U << srv_page_size_shift;

135
	lsn_t		margin;
osku's avatar
osku committed
136 137
	ulint		free;

138
	lsn_t smallest_capacity = file_size - LOG_FILE_HDR_SIZE;
osku's avatar
osku committed
139
	/* Add extra safety */
140
	smallest_capacity -= smallest_capacity / 10;
osku's avatar
osku committed
141 142 143 144 145

	/* For each OS thread we must reserve so much free space in the
	smallest log group that it can accommodate the log entries produced
	by single query steps: running out of free log space is a serious
	system error which requires rebooting the database. */
146

147
	free = LOG_CHECKPOINT_FREE_PER_THREAD * 10
148
		+ LOG_CHECKPOINT_EXTRA_FREE;
osku's avatar
osku committed
149
	if (free >= smallest_capacity / 2) {
150 151 152
		ib::error() << "Cannot continue operation because log file is "
			       "too small. Increase innodb_log_file_size "
			       "or decrease innodb_thread_concurrency. "
153
			    << INNODB_PARAMETERS_MSG;
154
		return false;
osku's avatar
osku committed
155 156
	}

157
	margin = smallest_capacity - free;
osku's avatar
osku committed
158 159
	margin = margin - margin / 10;	/* Add still some extra safety */

160
	mysql_mutex_lock(&log_sys.mutex);
161

162
	log_sys.log_capacity = smallest_capacity;
osku's avatar
osku committed
163

164
	log_sys.max_modified_age_async = margin - margin / 8;
165
	log_sys.max_checkpoint_age = margin;
osku's avatar
osku committed
166

167
	mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
168

169
	return(true);
osku's avatar
osku committed
170 171
}

172 173
/** Initialize the redo log subsystem. */
void log_t::create()
osku's avatar
osku committed
174
{
175 176 177 178
  ut_ad(this == &log_sys);
  ut_ad(!is_initialised());
  m_initialised= true;

179 180
  mysql_mutex_init(log_sys_mutex_key, &mutex, nullptr);
  mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr);
181 182 183 184

  /* Start the lsn from one log block from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */

185
  set_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
186
  set_flushed_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
187 188 189 190

  ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
  ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);

Marko Mäkelä's avatar
Marko Mäkelä committed
191 192
  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
                                             PSI_INSTRUMENT_ME));
193
  TRASH_ALLOC(buf, srv_log_buffer_size);
Marko Mäkelä's avatar
Marko Mäkelä committed
194 195
  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
                                                   PSI_INSTRUMENT_ME));
196
  TRASH_ALLOC(flush_buf, srv_log_buffer_size);
197 198 199

  max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
    LOG_BUF_FLUSH_MARGIN;
200
  set_check_flush_or_checkpoint();
201 202 203 204 205

  n_log_ios_old= n_log_ios;
  last_printout_time= time(NULL);

  buf_next_to_write= 0;
206
  last_checkpoint_lsn= write_lsn= LOG_START_LSN;
207 208
  n_log_ios= 0;
  n_log_ios_old= 0;
209
  log_capacity= 0;
210 211 212 213 214 215
  max_modified_age_async= 0;
  max_checkpoint_age= 0;
  next_checkpoint_no= 0;
  next_checkpoint_lsn= 0;
  n_pending_checkpoint_writes= 0;

216
  log_block_init(buf, LOG_START_LSN);
217 218 219
  log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);

  buf_free= LOG_BLOCK_HDR_SIZE;
220 221
  checkpoint_buf= static_cast<byte*>
    (aligned_malloc(OS_FILE_LOG_BLOCK_SIZE, OS_FILE_LOG_BLOCK_SIZE));
osku's avatar
osku committed
222 223
}

224 225 226 227 228 229
mapped_file_t::~mapped_file_t() noexcept
{
  if (!m_area.empty())
    unmap();
}

230 231
dberr_t mapped_file_t::map(const char *path, bool read_only,
                           bool nvme) noexcept
232
{
233 234
  auto fd= mysql_file_open(innodb_log_file_key, path,
                           read_only ? O_RDONLY : O_RDWR, MYF(MY_WME));
235 236 237
  if (fd == -1)
    return DB_ERROR;

238
  const auto file_size= os_file_get_size(path).m_total_size;
239

240 241 242 243
  const int nvme_flag= nvme ? MAP_SYNC : 0;
  void *ptr= my_mmap(0, static_cast<size_t>(file_size),
                     read_only ? PROT_READ : PROT_READ | PROT_WRITE,
                     MAP_SHARED_VALIDATE | nvme_flag, fd, 0);
244 245 246 247 248 249
  mysql_file_close(fd, MYF(MY_WME));

  if (ptr == MAP_FAILED)
    return DB_ERROR;

  m_area= {static_cast<byte *>(ptr),
Marko Mäkelä's avatar
Marko Mäkelä committed
250
           static_cast<span<byte>::size_type>(file_size)};
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
  return DB_SUCCESS;
}

dberr_t mapped_file_t::unmap() noexcept
{
  ut_ad(!m_area.empty());

  if (my_munmap(m_area.data(), m_area.size()))
    return DB_ERROR;

  m_area= {};
  return DB_SUCCESS;
}

file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd)
Eugene Kosov's avatar
Eugene Kosov committed
266 267 268
{
  rhs.m_fd= OS_FILE_CLOSED;
}
269 270

file_os_io &file_os_io::operator=(file_os_io &&rhs)
Eugene Kosov's avatar
Eugene Kosov committed
271 272 273 274 275
{
  std::swap(m_fd, rhs.m_fd);
  return *this;
}

276
file_os_io::~file_os_io() noexcept
Eugene Kosov's avatar
Eugene Kosov committed
277 278
{
  if (is_opened())
279
    close();
Eugene Kosov's avatar
Eugene Kosov committed
280 281
}

282
dberr_t file_os_io::open(const char *path, bool read_only) noexcept
Eugene Kosov's avatar
Eugene Kosov committed
283
{
284
  ut_ad(!is_opened());
Eugene Kosov's avatar
Eugene Kosov committed
285 286

  bool success;
287 288
  auto tmp_fd= os_file_create(
      innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
289
      OS_FILE_NORMAL, OS_LOG_FILE, read_only, &success);
Eugene Kosov's avatar
Eugene Kosov committed
290
  if (!success)
291
    return DB_ERROR;
Eugene Kosov's avatar
Eugene Kosov committed
292

293 294 295
  m_durable_writes= srv_file_flush_method == SRV_O_DSYNC;
  m_fd= tmp_fd;
  return success ? DB_SUCCESS : DB_ERROR;
Eugene Kosov's avatar
Eugene Kosov committed
296 297
}

298
dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept
Eugene Kosov's avatar
Eugene Kosov committed
299
{
300 301 302 303 304 305 306
  return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
                                                                 : DB_ERROR;
}

dberr_t file_os_io::close() noexcept
{
  if (!os_file_close(m_fd))
Eugene Kosov's avatar
Eugene Kosov committed
307
    return DB_ERROR;
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324

  m_fd= OS_FILE_CLOSED;
  return DB_SUCCESS;
}

dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept
{
  return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
}

dberr_t file_os_io::write(const char *path, os_offset_t offset,
                          span<const byte> buf) noexcept
{
  return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset,
                       buf.size());
}

325
dberr_t file_os_io::flush() noexcept
326
{
327
  return os_file_flush(m_fd) ? DB_SUCCESS : DB_ERROR;
328 329 330 331 332 333 334 335 336
}

#ifdef HAVE_PMEM

#include <libpmem.h>

static bool is_pmem(const char *path) noexcept
{
  mapped_file_t mf;
337
  return mf.map(path, true, true) == DB_SUCCESS ? true : false;
338 339 340 341 342 343 344
}

class file_pmem_io final : public file_io
{
public:
  file_pmem_io() noexcept : file_io(true) {}

345
  dberr_t open(const char *path, bool read_only) noexcept final
346
  {
347
    return m_file.map(path, read_only, true);
348 349 350 351 352
  }
  dberr_t rename(const char *old_path, const char *new_path) noexcept final
  {
    return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
                                                                   : DB_ERROR;
Eugene Kosov's avatar
Eugene Kosov committed
353
  }
354 355 356 357 358 359 360 361 362 363 364 365
  dberr_t close() noexcept final { return m_file.unmap(); }
  dberr_t read(os_offset_t offset, span<byte> buf) noexcept final
  {
    memcpy(buf.data(), m_file.data() + offset, buf.size());
    return DB_SUCCESS;
  }
  dberr_t write(const char *, os_offset_t offset,
                span<const byte> buf) noexcept final
  {
    pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size());
    return DB_SUCCESS;
  }
366
  dberr_t flush() noexcept final
367 368 369 370 371 372 373 374 375 376
  {
    ut_ad(0);
    return DB_SUCCESS;
  }

private:
  mapped_file_t m_file;
};
#endif

377
dberr_t log_file_t::open(bool read_only) noexcept
378 379 380 381 382 383 384 385 386 387 388
{
  ut_a(!is_opened());

#ifdef HAVE_PMEM
  auto ptr= is_pmem(m_path.c_str())
                ? std::unique_ptr<file_io>(new file_pmem_io)
                : std::unique_ptr<file_io>(new file_os_io);
#else
  auto ptr= std::unique_ptr<file_io>(new file_os_io);
#endif

389
  if (dberr_t err= ptr->open(m_path.c_str(), read_only))
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
    return err;

  m_file= std::move(ptr);
  return DB_SUCCESS;
}

bool log_file_t::is_opened() const noexcept
{
  return static_cast<bool>(m_file);
}

dberr_t log_file_t::rename(std::string new_path) noexcept
{
  if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str()))
    return err;

Eugene Kosov's avatar
Eugene Kosov committed
406 407 408 409
  m_path = std::move(new_path);
  return DB_SUCCESS;
}

410
dberr_t log_file_t::close() noexcept
Eugene Kosov's avatar
Eugene Kosov committed
411 412
{
  ut_a(is_opened());
413 414 415 416 417 418

  if (dberr_t err= m_file->close())
    return err;

  m_file.reset();
  return DB_SUCCESS;
Eugene Kosov's avatar
Eugene Kosov committed
419 420
}

421
dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
Eugene Kosov's avatar
Eugene Kosov committed
422 423
{
  ut_ad(is_opened());
424 425 426 427 428 429
  return m_file->read(offset, buf);
}

bool log_file_t::writes_are_durable() const noexcept
{
  return m_file->writes_are_durable();
Eugene Kosov's avatar
Eugene Kosov committed
430 431
}

432
dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
Eugene Kosov's avatar
Eugene Kosov committed
433 434
{
  ut_ad(is_opened());
435
  return m_file->write(m_path.c_str(), offset, buf);
Eugene Kosov's avatar
Eugene Kosov committed
436 437
}

438
dberr_t log_file_t::flush() noexcept
Eugene Kosov's avatar
Eugene Kosov committed
439 440
{
  ut_ad(is_opened());
441
  return m_file->flush();
Eugene Kosov's avatar
Eugene Kosov committed
442 443
}

444
void log_t::file::open_file(std::string path)
445
{
446
  fd= log_file_t(std::move(path));
447
  if (const dberr_t err= fd.open(srv_read_only_mode))
448
    ib::fatal() << "open(" << fd.get_path() << ") returned " << err;
449 450
}

Eugene Kosov's avatar
Eugene Kosov committed
451 452 453 454 455 456 457 458 459 460 461 462 463
/** Update the log block checksum. */
static void log_block_store_checksum(byte* block)
{
  log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
}

void log_t::file::write_header_durable(lsn_t lsn)
{
  ut_ad(lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
  ut_ad(!recv_no_log_write);
  ut_ad(log_sys.log.format == log_t::FORMAT_10_5 ||
        log_sys.log.format == log_t::FORMAT_ENC_10_5);

464 465
  byte *buf= log_sys.checkpoint_buf;
  memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
Eugene Kosov's avatar
Eugene Kosov committed
466 467 468 469 470 471 472 473 474 475 476 477

  mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
  mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
  mach_write_to_8(buf + LOG_HEADER_START_LSN, lsn);
  strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
         LOG_HEADER_CREATOR_CURRENT);
  ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR >=
        sizeof LOG_HEADER_CREATOR_CURRENT);
  log_block_store_checksum(buf);

  DBUG_PRINT("ib_log", ("write " LSN_PF, lsn));

478
  log_sys.log.write(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
Eugene Kosov's avatar
Eugene Kosov committed
479
  if (!log_sys.log.writes_are_durable())
480
    log_sys.log.flush();
Eugene Kosov's avatar
Eugene Kosov committed
481 482
}

483
void log_t::file::read(os_offset_t offset, span<byte> buf)
484
{
485 486
  if (const dberr_t err= fd.read(offset, buf))
    ib::fatal() << "read(" << fd.get_path() << ") returned "<< err;
487 488
}

489
bool log_t::file::writes_are_durable() const noexcept
490
{
491
  return fd.writes_are_durable();
492 493
}

494
void log_t::file::write(os_offset_t offset, span<byte> buf)
495
{
Eugene Kosov's avatar
Eugene Kosov committed
496
  srv_stats.os_log_pending_writes.inc();
497 498
  if (const dberr_t err= fd.write(offset, buf))
    ib::fatal() << "write(" << fd.get_path() << ") returned " << err;
Eugene Kosov's avatar
Eugene Kosov committed
499 500 501 502
  srv_stats.os_log_pending_writes.dec();
  srv_stats.os_log_written.add(buf.size());
  srv_stats.log_writes.inc();
  log_sys.n_log_ios++;
503 504
}

505
void log_t::file::flush()
506 507
{
  log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
508 509
  if (const dberr_t err= fd.flush())
    ib::fatal() << "flush(" << fd.get_path() << ") returned " << err;
Eugene Kosov's avatar
Eugene Kosov committed
510
  log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
511 512 513
  log_sys.flushes.fetch_add(1, std::memory_order_release);
}

514
void log_t::file::close_file()
515
{
516 517 518 519 520 521
  if (fd.is_opened())
  {
    if (const dberr_t err= fd.close())
      ib::fatal() << "close(" << fd.get_path() << ") returned " << err;
  }
  fd.free();                                    // Free path
522 523
}

524 525
/** Initialize the redo log. */
void log_t::file::create()
osku's avatar
osku committed
526
{
527 528 529
  ut_ad(this == &log_sys.log);
  ut_ad(log_sys.is_initialised());

530
  format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5;
531
  subformat= 2;
532 533 534
  file_size= srv_log_file_size;
  lsn= LOG_START_LSN;
  lsn_offset= LOG_FILE_HDR_SIZE;
osku's avatar
osku committed
535 536
}

537
/******************************************************//**
538
Writes a buffer to a log file. */
539
static
osku's avatar
osku committed
540
void
541
log_write_buf(
542 543
	byte*		buf,		/*!< in: buffer */
	ulint		len,		/*!< in: buffer len; must be divisible
osku's avatar
osku committed
544
					by OS_FILE_LOG_BLOCK_SIZE */
545 546 547
#ifdef UNIV_DEBUG
	ulint		pad_len,	/*!< in: pad len in the buffer len */
#endif /* UNIV_DEBUG */
548
	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
osku's avatar
osku committed
549 550
					be divisible by
					OS_FILE_LOG_BLOCK_SIZE */
551
	ulint		new_data_offset)/*!< in: start offset of new data in
osku's avatar
osku committed
552 553 554 555
					buf: this parameter is used to decide
					if we have to write a new log file
					header */
{
556 557 558
	ulint		write_len;
	lsn_t		next_offset;
	ulint		i;
559

560
	ut_ad(log_write_lock_own());
561
	ut_ad(!recv_no_log_write);
osku's avatar
osku committed
562
	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
563
	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
osku's avatar
osku committed
564 565 566 567 568 569 570

loop:
	if (len == 0) {

		return;
	}

571
	next_offset = log_sys.log.calc_lsn_offset(start_lsn);
osku's avatar
osku committed
572

573 574
	if ((next_offset % log_sys.log.file_size) + len
	    > log_sys.log.file_size) {
575 576
		/* if the above condition holds, then the below expression
		is < len which is ulint, so the typecast is ok */
577 578
		write_len = ulint(log_sys.log.file_size
				  - (next_offset % log_sys.log.file_size));
osku's avatar
osku committed
579 580 581 582
	} else {
		write_len = len;
	}

583 584
	DBUG_PRINT("ib_log",
		   ("write " LSN_PF " to " LSN_PF
585
		    ": len " ULINTPF
586 587
		    " blocks " ULINTPF ".." ULINTPF,
		    start_lsn, next_offset,
588
		    write_len,
589 590 591 592 593 594 595 596 597
		    log_block_get_hdr_no(buf),
		    log_block_get_hdr_no(
			    buf + write_len
			    - OS_FILE_LOG_BLOCK_SIZE)));

	ut_ad(pad_len >= len
	      || log_block_get_hdr_no(buf)
		 == log_block_convert_lsn_to_no(start_lsn));

osku's avatar
osku committed
598 599 600 601
	/* Calculate the checksums for each log block and write them to
	the trailer fields of the log blocks */

	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
602 603 604 605 606
#ifdef UNIV_DEBUG
		ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
		DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
				hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
#endif
607
		ut_ad(pad_len >= len
608 609
			|| i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
			|| log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
osku's avatar
osku committed
610 611 612
		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
	}

613
	log_sys.log.write(next_offset, {buf, write_len});
osku's avatar
osku committed
614 615

	if (write_len < len) {
616
		start_lsn += write_len;
osku's avatar
osku committed
617 618 619 620 621 622
		len -= write_len;
		buf += write_len;
		goto loop;
	}
}

623
/** Flush the recently written changes to the log file.
624
and invoke mysql_mutex_lock(&log_sys.mutex). */
625
static void log_write_flush_to_disk_low(lsn_t lsn)
626
{
627 628
  if (!log_sys.log.writes_are_durable())
    log_sys.log.flush();
629 630
  ut_a(lsn >= log_sys.get_flushed_lsn());
  log_sys.set_flushed_lsn(lsn);
631 632
}

633 634
/** Swap log buffers, and copy the content of last block
from old buf to the head of the new buf. Thus, buf_free and
635 636 637 638 639
buf_next_to_write would be changed accordingly */
static inline
void
log_buffer_switch()
{
640
	mysql_mutex_assert_owner(&log_sys.mutex);
641
	ut_ad(log_write_lock_own());
642

643 644
	size_t		area_end = ut_calc_align<size_t>(
		log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE);
645 646

	/* Copy the last block to new buf */
647
	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
Marko Mäkelä's avatar
Marko Mäkelä committed
648 649
		log_sys.flush_buf,
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
650
		OS_FILE_LOG_BLOCK_SIZE);
651

652 653
	std::swap(log_sys.buf, log_sys.flush_buf);

654 655
	log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
	log_sys.buf_next_to_write = log_sys.buf_free;
656 657
}

658 659 660 661
/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
log writes have been completed. */
void log_flush_notify(lsn_t flush_lsn);

662 663 664
/**
Writes log buffer to disk
which is the "write" part of log_write_up_to().
osku's avatar
osku committed
665

666
This function does not flush anything.
osku's avatar
osku committed
667

668
Note : the caller must have log_sys.mutex locked, and this
669
mutex is released in the function.
670

671 672 673
*/
static void log_write(bool rotate_key)
{
674
	mysql_mutex_assert_owner(&log_sys.mutex);
675
	ut_ad(!recv_no_log_write);
676 677 678
	lsn_t write_lsn;
	if (log_sys.buf_free == log_sys.buf_next_to_write) {
		/* Nothing to write */
679
		mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
680 681 682
		return;
	}

683 684 685 686 687 688 689 690
	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
691
			      log_sys.write_lsn,
692
			      log_sys.get_lsn()));
693

osku's avatar
osku committed
694

695 696
	start_offset = log_sys.buf_next_to_write;
	end_offset = log_sys.buf_free;
osku's avatar
osku committed
697

698 699 700
	area_start = ut_2pow_round(start_offset,
				   ulint(OS_FILE_LOG_BLOCK_SIZE));
	area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
osku's avatar
osku committed
701 702 703

	ut_ad(area_end - area_start > 0);

704
	log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
705
	log_block_set_checkpoint_no(
706 707
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys.next_checkpoint_no);
708

709
	write_lsn = log_sys.get_lsn();
710
	byte *write_buf = log_sys.buf;
711 712 713

	log_buffer_switch();

714
	log_sys.log.set_fields(log_sys.write_lsn);
715

716
	mysql_mutex_unlock(&log_sys.mutex);
717
	/* Erase the end of the last log block. */
718 719
	memset(write_buf + end_offset, 0,
	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
720

721 722 723 724
	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		ulint	end_offset_in_unit;
725 726
		lsn_t	end_offset = log_sys.log.calc_lsn_offset(
			ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
727 728 729 730 731 732 733
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
734
			pad_size = std::min<ulint>(
735
				ulint(write_ahead_size) - end_offset_in_unit,
736
				srv_log_buffer_size - area_end);
737
			::memset(write_buf + area_end, 0, pad_size);
738 739
		}
	}
740

741
	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
742 743
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "InnoDB log write: "
744
					       LSN_PF, log_sys.write_lsn);
745 746
	}

747 748
	if (log_sys.is_encrypted()) {
		log_crypt(write_buf + area_start, log_sys.write_lsn,
749 750
			  area_end - area_start,
			  rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT);
751 752
	}

753
	/* Do the write to the log file */
754 755
	log_write_buf(
		write_buf + area_start, area_end - area_start + pad_size,
756 757 758
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
759
		ut_uint64_align_down(log_sys.write_lsn,
760 761 762
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);
	srv_stats.log_padded.add(pad_size);
763
	log_sys.write_lsn = write_lsn;
764
	if (log_sys.log.writes_are_durable()) {
765
		log_sys.set_flushed_lsn(write_lsn);
766 767
		log_flush_notify(write_lsn);
	}
768 769
	return;
}
osku's avatar
osku committed
770

771 772
static group_commit_lock write_lock;
static group_commit_lock flush_lock;
osku's avatar
osku committed
773

774 775 776 777 778 779
#ifdef UNIV_DEBUG
bool log_write_lock_own()
{
  return write_lock.is_owner();
}
#endif
780

781 782 783 784 785 786 787 788 789 790 791 792
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system
@param[in]	rotate_key	whether to rotate the encryption key */
void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
{
  ut_ad(!srv_read_only_mode);
  ut_ad(!rotate_key || flush_to_disk);
793
  ut_ad(lsn != LSN_MAX);
794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809

  if (recv_no_ibuf_operations)
  {
    /* Recovery is running and no operations on the log files are
    allowed yet (the variable name .._no_ibuf_.. is misleading) */
    return;
  }

  if (flush_to_disk &&
    flush_lock.acquire(lsn) != group_commit_lock::ACQUIRED)
  {
    return;
  }

  if (write_lock.acquire(lsn) == group_commit_lock::ACQUIRED)
  {
810
    mysql_mutex_lock(&log_sys.mutex);
811
    lsn_t write_lsn= log_sys.get_lsn();
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827
    write_lock.set_pending(write_lsn);

    log_write(rotate_key);

    ut_a(log_sys.write_lsn == write_lsn);
    write_lock.release(write_lsn);
  }

  if (!flush_to_disk)
  {
    return;
  }

  /* Flush the highest written lsn.*/
  auto flush_lsn = write_lock.value();
  flush_lock.set_pending(flush_lsn);
828
  log_write_flush_to_disk_low(flush_lsn);
829 830
  flush_lock.release(flush_lsn);

831
  log_flush_notify(flush_lsn);
osku's avatar
osku committed
832 833
}

834 835
/** Write to the log file up to the last log entry.
@param sync  whether to wait for a durable write to complete */
836
void log_buffer_flush_to_disk(bool sync)
osku's avatar
osku committed
837
{
838 839
  ut_ad(!srv_read_only_mode);
  log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync);
osku's avatar
osku committed
840 841
}

842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
ATTRIBUTE_COLD void log_write_and_flush_prepare()
{
  mysql_mutex_assert_not_owner(&log_sys.mutex);

  while (flush_lock.acquire(log_sys.get_lsn() + 1) !=
         group_commit_lock::ACQUIRED);
  while (write_lock.acquire(log_sys.get_lsn() + 1) !=
         group_commit_lock::ACQUIRED);
}

/** Durably write the log and release log_sys.mutex */
ATTRIBUTE_COLD void log_write_and_flush()
{
  ut_ad(!srv_read_only_mode);
  auto lsn= log_sys.get_lsn();
  write_lock.set_pending(lsn);
  log_write(false);
  ut_a(log_sys.write_lsn == lsn);
  write_lock.release(lsn);

  lsn= write_lock.value();
  flush_lock.set_pending(lsn);
  log_write_flush_to_disk_low(lsn);
  flush_lock.release(lsn);
}

869 870
/********************************************************************

osku's avatar
osku committed
871 872
Tries to establish a big enough margin of free space in the log buffer, such
that a new log entry can be catenated without an immediate need for a flush. */
873
ATTRIBUTE_COLD static void log_flush_margin()
osku's avatar
osku committed
874
{
875
	lsn_t	lsn	= 0;
osku's avatar
osku committed
876

877
	mysql_mutex_lock(&log_sys.mutex);
osku's avatar
osku committed
878

879
	if (log_sys.buf_free > log_sys.max_buf_free) {
880
		/* We can write during flush */
881
		lsn = log_sys.get_lsn();
osku's avatar
osku committed
882 883
	}

884
	mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
885

marko's avatar
marko committed
886
	if (lsn) {
887
		log_write_up_to(lsn, false);
osku's avatar
osku committed
888 889
	}
}
890

891
/** Write checkpoint info to the log header and release log_sys.mutex.
892
@param[in]	end_lsn	start LSN of the FILE_CHECKPOINT mini-transaction */
893
ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn)
osku's avatar
osku committed
894
{
895
	ut_ad(!srv_read_only_mode);
896
	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
897 898
	ut_ad(end_lsn <= log_sys.get_lsn());
	ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.get_lsn()
899
	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
900

901
	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
902
			      " written",
903 904
			      log_sys.next_checkpoint_no,
			      log_sys.next_checkpoint_lsn));
905

906
	byte* buf = log_sys.checkpoint_buf;
907
	memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
908

909 910
	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);
osku's avatar
osku committed
911

912
	if (log_sys.is_encrypted()) {
913 914
		log_crypt_write_checkpoint_buf(buf);
	}
Monty's avatar
Monty committed
915

916 917
	lsn_t lsn_offset
		= log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
918
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
919 920
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
			srv_log_buffer_size);
921
	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);
osku's avatar
osku committed
922

923
	log_block_store_checksum(buf);
osku's avatar
osku committed
924

925 926
	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);
927

928 929
	++log_sys.n_pending_checkpoint_writes;

930
	mysql_mutex_unlock(&log_sys.mutex);
931

932 933
	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */
934

935 936 937
	log_sys.log.write((log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2
							   : LOG_CHECKPOINT_1,
			  {buf, OS_FILE_LOG_BLOCK_SIZE});
osku's avatar
osku committed
938

939
	log_sys.log.flush();
940

941
	mysql_mutex_lock(&log_sys.mutex);
942

943 944
	--log_sys.n_pending_checkpoint_writes;
	ut_ad(log_sys.n_pending_checkpoint_writes == 0);
osku's avatar
osku committed
945

946
	log_sys.next_checkpoint_no++;
osku's avatar
osku committed
947

948
	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
949

950 951
	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
			      ", flushed to " LSN_PF,
952
			      lsn_t{log_sys.last_checkpoint_lsn},
953
			      log_sys.get_flushed_lsn()));
954 955 956

	MONITOR_INC(MONITOR_NUM_CHECKPOINT);

957
	DBUG_EXECUTE_IF("crash_after_checkpoint", DBUG_SUICIDE(););
958

959
	mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
960 961
}

962
/****************************************************************//**
963
Tries to establish a big enough margin of free space in the log, such
osku's avatar
osku committed
964 965 966
that a new log entry can be catenated without an immediate need for a
checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects! */
967
ATTRIBUTE_COLD static void log_checkpoint_margin()
osku's avatar
osku committed
968
{
969 970
  while (log_sys.check_flush_or_checkpoint())
  {
971
    mysql_mutex_lock(&log_sys.mutex);
972
    ut_ad(!recv_no_log_write);
osku's avatar
osku committed
973

974 975 976
    if (!log_sys.check_flush_or_checkpoint())
    {
func_exit:
977
      mysql_mutex_unlock(&log_sys.mutex);
978 979
      return;
    }
osku's avatar
osku committed
980

981 982 983 984 985 986 987 988
    const lsn_t lsn= log_sys.get_lsn();
    const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
    const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
    if (lsn <= sync_lsn)
    {
      log_sys.set_check_flush_or_checkpoint(false);
      goto func_exit;
    }
osku's avatar
osku committed
989

990
    mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
991

992 993 994 995
    /* We must wait to prevent the tail of the log overwriting the head. */
    buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
    os_thread_sleep(10000); /* Sleep 10ms to avoid a thundering herd */
  }
osku's avatar
osku committed
996 997
}

998 999 1000 1001 1002
/**
Checks that there is enough free space in the log to start a new query step.
Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
function may only be called if the calling thread owns no synchronization
objects! */
1003
ATTRIBUTE_COLD void log_check_margins()
osku's avatar
osku committed
1004
{
1005 1006 1007 1008 1009 1010 1011
  do
  {
    log_flush_margin();
    log_checkpoint_margin();
    ut_ad(!recv_no_log_write);
  }
  while (log_sys.check_flush_or_checkpoint());
osku's avatar
osku committed
1012 1013
}

1014
extern void buf_resize_shutdown();
1015 1016

/** Make a checkpoint at the latest lsn on shutdown. */
1017
ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
osku's avatar
osku committed
1018
{
1019
	lsn_t			lsn;
1020
	ulint			count = 0;
osku's avatar
osku committed
1021

1022
	ib::info() << "Starting shutdown...";
1023

osku's avatar
osku committed
1024 1025
	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */
1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
	bool do_srv_shutdown = false;
	if (srv_master_timer) {
		do_srv_shutdown = srv_fast_shutdown < 2;
		srv_master_timer.reset();
	}

	/* Wait for the end of the buffer resize task.*/
	buf_resize_shutdown();
	dict_stats_shutdown();
	btr_defragment_shutdown();
osku's avatar
osku committed
1036 1037

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049

	if (srv_buffer_pool_dump_at_shutdown &&
		!srv_read_only_mode && srv_fast_shutdown < 2) {
		buf_dump_start();
	}
	srv_monitor_timer.reset();
	lock_sys.timeout_timer.reset();
	if (do_srv_shutdown) {
		srv_shutdown(srv_fast_shutdown == 0);
	}


osku's avatar
osku committed
1050
loop:
1051
	ut_ad(lock_sys.is_initialised() || !srv_was_started);
1052
	ut_ad(log_sys.is_initialised() || !srv_was_started);
1053
	ut_ad(fil_system.is_initialised() || !srv_was_started);
Marko Mäkelä's avatar
Marko Mäkelä committed
1054

1055 1056
#define COUNT_INTERVAL 600U
#define CHECK_INTERVAL 100000U
1057
	os_thread_sleep(CHECK_INTERVAL);
osku's avatar
osku committed
1058

1059 1060
	count++;

1061 1062 1063 1064
	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */
osku's avatar
osku committed
1065

1066 1067
	if (ulint total_trx = srv_was_started && !srv_read_only_mode
	    && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
1068
	    ? trx_sys.any_active_transactions() : 0) {
1069

1070 1071 1072
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(
				COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1073 1074
				"Waiting for %lu active transactions to finish",
				(ulong) total_trx);
1075 1076
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";
1077

1078 1079 1080 1081 1082 1083
			count = 0;
		}

		goto loop;
	}

1084 1085
	/* We need these threads to stop early in shutdown. */
	const char* thread_name;
1086

1087
   if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
1088
		thread_name = "rollback of recovered transactions";
1089 1090 1091
	} else {
		thread_name = NULL;
	}
1092

1093 1094 1095
	if (thread_name) {
		ut_ad(!srv_read_only_mode);
wait_suspend_loop:
1096 1097 1098 1099
		service_manager_extend_timeout(
			COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
			"Waiting for %s to exit", thread_name);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
Marko Mäkelä's avatar
Marko Mäkelä committed
1100
			ib::info() << "Waiting for " << thread_name
1101
				   << " to exit";
1102 1103
			count = 0;
		}
osku's avatar
osku committed
1104 1105 1106
		goto loop;
	}

1107 1108
	/* Check that the background threads are suspended */

1109
	ut_ad(!srv_any_background_activity());
1110
	if (srv_n_fil_crypt_threads_started) {
1111 1112 1113 1114 1115
		os_event_set(fil_crypt_threads_event);
		thread_name = "fil_crypt_thread";
		goto wait_suspend_loop;
	}

1116 1117
	if (buf_page_cleaner_is_active) {
		thread_name = "page cleaner thread";
1118
		pthread_cond_signal(&buf_pool.do_flush_list);
1119 1120
		goto wait_suspend_loop;
	}
1121

1122
	buf_load_dump_end();
1123

1124 1125 1126 1127 1128 1129
	if (!buf_pool.is_initialised()) {
		ut_ad(!srv_was_started);
	} else if (ulint pending_io = buf_pool.io_pending()) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
1130 1131
			count = 0;
		}
1132 1133 1134

		goto loop;
	} else {
1135
		buf_flush_buffer_pool();
1136 1137
	}

1138
	if (log_sys.is_initialised()) {
1139
		mysql_mutex_lock(&log_sys.mutex);
1140
		const ulint	n_write	= log_sys.n_pending_checkpoint_writes;
1141
		const ulint	n_flush	= log_sys.pending_flushes;
1142
		mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
1143

1144
		if (n_write || n_flush) {
1145 1146 1147 1148 1149 1150 1151 1152
			if (srv_print_verbose_log && count > 600) {
				ib::info() << "Pending checkpoint_writes: "
					<< n_write
					<< ". Pending log flush writes: "
					<< n_flush;
				count = 0;
			}
			goto loop;
1153
		}
osku's avatar
osku committed
1154 1155
	}

1156 1157
	if (srv_fast_shutdown == 2 || !srv_was_started) {
		if (!srv_read_only_mode && srv_was_started) {
1158 1159 1160 1161
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";
1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();
1174 1175 1176 1177 1178
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
		return;
	}
osku's avatar
osku committed
1179

1180
	if (!srv_read_only_mode) {
1181 1182
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
			"ensuring dirty buffer pool are written to log");
1183
		log_make_checkpoint();
osku's avatar
osku committed
1184

1185
		mysql_mutex_lock(&log_sys.mutex);
osku's avatar
osku committed
1186

1187
		lsn = log_sys.get_lsn();
osku's avatar
osku committed
1188

1189 1190 1191
		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn
			&& lsn != log_sys.last_checkpoint_lsn
			+ SIZE_OF_FILE_CHECKPOINT;
1192
		ut_ad(lsn >= log_sys.last_checkpoint_lsn);
osku's avatar
osku committed
1193

1194
		mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
1195

Marko Mäkelä's avatar
Marko Mäkelä committed
1196
		if (lsn_changed) {
1197 1198
			goto loop;
		}
osku's avatar
osku committed
1199

1200
		log_sys.log.flush();
1201
	} else {
1202
		lsn = recv_sys.recovered_lsn;
osku's avatar
osku committed
1203 1204 1205 1206 1207
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
1208
	ut_ad(!srv_any_background_activity());
1209

1210 1211
	service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
				       "Free innodb buffer pool");
1212
	ut_d(buf_pool.assert_all_freed());
1213

1214
	ut_a(lsn == log_sys.get_lsn()
1215
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
osku's avatar
osku committed
1216

1217
	if (UNIV_UNLIKELY(lsn < recv_sys.recovered_lsn)) {
1218
		ib::error() << "Shutdown LSN=" << lsn
1219 1220
			    << " is less than start LSN="
			    << recv_sys.recovered_lsn;
osku's avatar
osku committed
1221 1222 1223 1224
	}

	srv_shutdown_lsn = lsn;

1225
	if (!srv_read_only_mode) {
Marko Mäkelä's avatar
Marko Mäkelä committed
1226
		dberr_t err = fil_write_flushed_lsn(lsn);
1227 1228 1229

		if (err != DB_SUCCESS) {
			ib::error() << "Writing flushed lsn " << lsn
1230
				<< " failed; error=" << err;
1231
		}
1232
	}
osku's avatar
osku committed
1233 1234

	/* Make some checks that the server really is quiet */
1235
	ut_ad(!srv_any_background_activity());
1236

1237
	ut_a(lsn == log_sys.get_lsn()
1238
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
osku's avatar
osku committed
1239 1240
}

1241
/******************************************************//**
osku's avatar
osku committed
1242 1243 1244 1245
Prints info of the log. */
void
log_print(
/*======*/
1246
	FILE*	file)	/*!< in: file where to print */
osku's avatar
osku committed
1247 1248 1249 1250
{
	double	time_elapsed;
	time_t	current_time;

1251
	mysql_mutex_lock(&log_sys.mutex);
osku's avatar
osku committed
1252

1253 1254 1255 1256 1257
	const lsn_t lsn= log_sys.get_lsn();
	mysql_mutex_lock(&buf_pool.flush_list_mutex);
	const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn);
	mysql_mutex_unlock(&buf_pool.flush_list_mutex);

osku's avatar
osku committed
1258
	fprintf(file,
1259 1260 1261 1262
		"Log sequence number " LSN_PF "\n"
		"Log flushed up to   " LSN_PF "\n"
		"Pages flushed up to " LSN_PF "\n"
		"Last checkpoint at  " LSN_PF "\n",
1263
		lsn,
1264
		log_sys.get_flushed_lsn(),
1265
		pages_flushed,
1266
		lsn_t{log_sys.last_checkpoint_lsn});
osku's avatar
osku committed
1267 1268

	current_time = time(NULL);
1269

1270
	time_elapsed = difftime(current_time,
1271
				log_sys.last_printout_time);
1272 1273 1274 1275 1276

	if (time_elapsed <= 0) {
		time_elapsed = 1;
	}

osku's avatar
osku committed
1277
	fprintf(file,
1278 1279 1280
		ULINTPF " pending log flushes, "
		ULINTPF " pending chkp writes\n"
		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
1281
		log_sys.pending_flushes.load(),
1282 1283
		log_sys.n_pending_checkpoint_writes,
		log_sys.n_log_ios,
1284
		static_cast<double>(
1285
			log_sys.n_log_ios - log_sys.n_log_ios_old)
1286
		/ time_elapsed);
osku's avatar
osku committed
1287

1288 1289
	log_sys.n_log_ios_old = log_sys.n_log_ios;
	log_sys.last_printout_time = current_time;
osku's avatar
osku committed
1290

1291
	mysql_mutex_unlock(&log_sys.mutex);
osku's avatar
osku committed
1292 1293
}

1294
/**********************************************************************//**
osku's avatar
osku committed
1295 1296 1297 1298 1299
Refreshes the statistics used to print per-second averages. */
void
log_refresh_stats(void)
/*===================*/
{
1300 1301
	log_sys.n_log_ios_old = log_sys.n_log_ios;
	log_sys.last_printout_time = time(NULL);
osku's avatar
osku committed
1302
}
1303

1304
/** Shut down the redo log subsystem. */
1305
void log_t::close()
1306
{
1307 1308
  ut_ad(this == &log_sys);
  if (!is_initialised()) return;
1309
  m_initialised= false;
1310
  log.close();
1311

1312
  ut_free_dodump(buf, srv_log_buffer_size);
1313
  buf= nullptr;
1314
  ut_free_dodump(flush_buf, srv_log_buffer_size);
1315
  flush_buf= nullptr;
1316

1317 1318
  mysql_mutex_destroy(&mutex);
  mysql_mutex_destroy(&flush_order_mutex);
1319

1320
  recv_sys.close();
1321 1322 1323

  aligned_free(checkpoint_buf);
  checkpoint_buf= nullptr;
1324
}
Monty's avatar
Monty committed
1325

1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
std::string get_log_file_path(const char *filename)
{
  const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 +
                     strlen(filename) + /* longest suffix */ 3;
  std::string path;
  path.reserve(size);
  path.assign(srv_log_group_home_dir);

  std::replace(path.begin(), path.end(), OS_PATH_SEPARATOR_ALT,
	       OS_PATH_SEPARATOR);

  if (path.back() != OS_PATH_SEPARATOR)
    path.push_back(OS_PATH_SEPARATOR);
  path.append(filename);

  return path;
}

std::vector<std::string> get_existing_log_files_paths() {
  std::vector<std::string> result;

  for (int i= 0; i < 101; i++) {
    auto path= get_log_file_path(LOG_FILE_NAME_PREFIX)
                                 .append(std::to_string(i));
    os_file_stat_t stat;
    dberr_t err= os_file_get_status(path.c_str(), &stat, false, true);
    if (err)
      break;

    if (stat.type != OS_FILE_TYPE_FILE)
      break;

    result.push_back(std::move(path));
  }

  return result;
}