ma_checkpoint.c 46.1 KB
Newer Older
unknown's avatar
unknown committed
1
/* Copyright (C) 2006,2007 MySQL AB
unknown's avatar
unknown committed
2 3 4

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
unknown's avatar
unknown committed
6 7 8 9 10 11 12 13 14 15

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

16 17 18 19 20 21 22
/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
*/

/* Here is the implementation of this module */

unknown's avatar
unknown committed
23
/** @todo RECOVERY BUG this is unreviewed code */
24 25
/*
  Summary:
unknown's avatar
unknown committed
26 27 28 29 30
  checkpoints are done either by a background thread (checkpoint every Nth
  second) or by a client.
  In ha_maria, it's not made available to clients, and will soon be done by a
  background thread (periodically taking checkpoints and flushing dirty
  pages).
31 32
*/

unknown's avatar
unknown committed
33 34 35 36
#include "maria_def.h"
#include "ma_pagecache.h"
#include "ma_blockrec.h"
#include "ma_checkpoint.h"
37
#include "ma_loghandler_lsn.h"
38
#include "ma_servicethread.h"
39

40

unknown's avatar
unknown committed
41 42 43 44 45 46
/** @brief type of checkpoint currently running */
static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
/** @brief protects checkpoint_in_progress */
static pthread_mutex_t LOCK_checkpoint;
/** @brief for killing the background checkpoint thread */
static pthread_cond_t  COND_checkpoint;
/** @brief control structure for checkpoint background thread */
static MA_SERVICE_THREAD_CONTROL checkpoint_control=
  {THREAD_DEAD, FALSE, &LOCK_checkpoint, &COND_checkpoint};
/* is ulong like pagecache->blocks_changed */
static ulong pages_to_flush_before_next_checkpoint;
static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
  *dfiles_end; /**< list of data files ends here */
static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
  *kfiles_end; /**< list of index files ends here */
/* those two statistics below could serve in SHOW GLOBAL STATUS */
static uint checkpoints_total= 0, /**< all checkpoint requests made */
  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */

struct st_filter_param
{
  LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */
  uint max_pages; /**< stop after flushing this number pages */
}; /**< information to determine which dirty pages should be flushed */

/* dirty-page filter for MEDIUM checkpoints (see definition below) */
static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t page,
                         LSN rec_lsn, void *arg);
/* dirty-page filter for FULL checkpoints (see definition below) */
static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t page,
                       LSN rec_lsn, void *arg);
/* dirty-page filter for the background flusher (see definition below) */
static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static int really_execute_checkpoint(void);
pthread_handler_t ma_checkpoint_background(void *arg);
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
81

unknown's avatar
unknown committed
82 83
/**
   @brief Does a checkpoint

   @param  level               what level of checkpoint to do
   @param  no_wait             if another checkpoint of same or stronger level
                               is already running, consider our job done

   @note In ha_maria, there can never be two threads trying a checkpoint at
   the same time.

   @return Operation status
    @retval 0 ok
    @retval !=0 error
*/

int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
{
  int result= 0;
  DBUG_ENTER("ma_checkpoint_execute");

  if (!checkpoint_control.inited)
  {
    /*
      If ha_maria failed to start, maria_panic_hton is called, we come here.
    */
    DBUG_RETURN(0);
  }
  DBUG_ASSERT(level > CHECKPOINT_NONE);

  /* look for already running checkpoints */
  pthread_mutex_lock(&LOCK_checkpoint);
  while (checkpoint_in_progress != CHECKPOINT_NONE)
  {
    if (no_wait && (checkpoint_in_progress >= level))
    {
      /*
        If we are the checkpoint background thread, we don't wait (it's
        smarter to flush pages instead of waiting here while the other thread
        finishes its checkpoint).
      */
      pthread_mutex_unlock(&LOCK_checkpoint);
      goto end;
    }
    /* wait for the running checkpoint to finish, then re-check the state */
    pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
  }

  /* claim the checkpoint slot before releasing the lock */
  checkpoint_in_progress= level;
  pthread_mutex_unlock(&LOCK_checkpoint);
  /* from then on, we are sure to be and stay the only checkpointer */

  result= really_execute_checkpoint();
  DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });

  /* wake up any thread blocked in the wait loop above */
  pthread_cond_broadcast(&COND_checkpoint);
end:
  DBUG_RETURN(result);
}


unknown's avatar
unknown committed
142 143 144 145 146 147 148 149 150
/**
   @brief Does a checkpoint, really; expects no other checkpoints
   running.

   Checkpoint level requested is read from checkpoint_in_progress.

   @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

static int really_execute_checkpoint(void)
{
  uint i, error= 0;
  /** @brief checkpoint_start_log_horizon will be stored there */
  char *ptr;
  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
  DBUG_ENTER("really_execute_checkpoint");
  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
  /* zero the pieces so my_free() in 'end:' is safe even on early error */
  bzero(&record_pieces, sizeof(record_pieces));

  /*
    STEP 1: record current end-of-log position using log's lock. It is
    critical for the correctness of Checkpoint (related to memory visibility
    rules, the log's lock is a mutex).
    "Horizon" is a lower bound of the LSN of the next log record.
  */
  checkpoint_start_log_horizon= translog_get_horizon();
  DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);

  /*
    STEP 2: fetch information about transactions.
    We must fetch transactions before dirty pages. Indeed, a transaction
    first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
    to 0. If we fetched pages first, we may see no dirty page yet, then we
    fetch transactions but the transaction has already reset its rec_lsn to 0
    so we miss rec_lsn again.
    For a similar reason (over-allocated bitmap pages) we have to fetch
    transactions before flushing bitmap pages.

    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
    (down from checkpoint_start_log_horizon).
 */
  if (unlikely(trnman_collect_transactions(&record_pieces[0],
                                           &record_pieces[1],
                                           &min_trn_rec_lsn,
                                           &min_first_undo_lsn)))
    goto err;


  /* STEP 3: fetch information about table files */
  if (unlikely(collect_tables(&record_pieces[2],
                              checkpoint_start_log_horizon)))
    goto err;


  /* STEP 4: fetch information about dirty pages */
  /*
    It's better to do it _after_ having flushed some data pages (which
    collect_tables() may have done), because those are now non-dirty and so we
    have a more up-to-date dirty pages list to put into the checkpoint record,
    and thus we will have less work at Recovery.
  */
  /* Using default pagecache for now */
  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                         &record_pieces[3],
                                                         &min_page_rec_lsn)))
    goto err;


  /* LAST STEP: now write the checkpoint log record */
  {
    LSN lsn;
    translog_size_t total_rec_length;
    /*
      the log handler is allowed to modify "str" and "length" (but not "*str")
      of its argument, so we must not pass it record_pieces directly,
      otherwise we would later not know what memory pieces to my_free().
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
      (uchar*) checkpoint_start_log_horizon_char;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
      sizeof(checkpoint_start_log_horizon_char);
    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    {
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]=
        *(LEX_CUSTRING *)&record_pieces[i];
      total_rec_length+= (translog_size_t) record_pieces[i].length;
    }
    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
                                       &dummy_transaction_object, NULL,
                                       total_rec_length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      goto err;
    translog_lock();
    /*
      This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
      such hook would be called before translog_flush (and we must be sure
      that log was flushed before we write to the control file).
    */
    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
                                                 max_trid_in_control_file,
                                                 recovery_failures)))
    {
      translog_unlock();
      goto err;
    }
    translog_unlock();
  }

  /*
    Note that we should not alter memory structures until we have successfully
    written the checkpoint record and control file.
  */
  /* checkpoint succeeded */
  /* first 4 bytes of the dirty-pages piece hold the count of pages to flush */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
                           (uint)pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
  {
    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
    /**
       Now purge unneeded logs.
       As some systems have an unreliable fsync (drive lying), we could try to
       be robust against that: remember a few previous checkpoints in the
       control file, and not purge logs immediately... Think about it.
    */
    if (translog_purge(log_low_water_mark))
      ma_message_no_user(0, "log purging failed");
  }

  goto end;

err:
  error= 1;
  ma_message_no_user(0, "checkpoint failed");
  /* we were possibly not able to determine what pages to flush */
  pages_to_flush_before_next_checkpoint= 0;

end:
  /* common cleanup for both the success and error paths */
  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
  pthread_mutex_lock(&LOCK_checkpoint);
  checkpoint_in_progress= CHECKPOINT_NONE;
  checkpoints_total++;
  checkpoints_ok_total+= !error;
  pthread_mutex_unlock(&LOCK_checkpoint);
  DBUG_RETURN(error);
}
unknown's avatar
unknown committed
304

305

unknown's avatar
unknown committed
306 307 308
/**
   @brief Initializes the checkpoint module

unknown's avatar
unknown committed
309 310 311 312 313 314 315 316 317 318 319 320
   @param  interval           If one wants the module to create a
                              thread which will periodically do
                              checkpoints, and flush dirty pages, in the
                              background, it should specify a non-zero
                              interval in seconds. The thread will then be
                              created and will take checkpoints separated by
                              approximately 'interval' second.

   @note A checkpoint is taken only if there has been some significant
   activity since the previous checkpoint. Between checkpoint N and N+1 the
   thread flushes all dirty pages which were already dirty at the time of
   checkpoint N.
unknown's avatar
unknown committed
321 322 323 324 325 326

   @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

unknown's avatar
unknown committed
327
int ma_checkpoint_init(ulong interval)
unknown's avatar
unknown committed
328 329 330 331
{
  pthread_t th;
  int res= 0;
  DBUG_ENTER("ma_checkpoint_init");
332
  if (ma_service_thread_control_init(&checkpoint_control))
unknown's avatar
unknown committed
333
    res= 1;
unknown's avatar
unknown committed
334
  else if (interval > 0)
unknown's avatar
unknown committed
335
  {
unknown's avatar
unknown committed
336 337 338
    compile_time_assert(sizeof(void *) >= sizeof(ulong));
    if (!(res= pthread_create(&th, NULL, ma_checkpoint_background,
                              (void *)interval)))
339 340 341 342
    {
      /* thread lives, will have to be killed */
      checkpoint_control.status= THREAD_RUNNING;
    }
unknown's avatar
unknown committed
343
  }
unknown's avatar
unknown committed
344
  DBUG_RETURN(res);
345 346 347
}


348 349 350 351 352 353 354
#ifndef DBUG_OFF
/**
   Function used to test recovery: flush some table pieces and then caller
   crashes.

   @param  what_to_flush   0: current bitmap and all data pages
                           1: state
                           2: all bitmap pages
*/
static void flush_all_tables(int what_to_flush)
{
  int res= 0;
  LIST *pos; /**< to iterate over open tables */
  pthread_mutex_lock(&THR_LOCK_maria);
  for (pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    /* only transactional tables participate in recovery */
    if (info->s->now_transactional)
    {
      switch (what_to_flush)
      {
      case 0:
        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                   FLUSH_KEEP, FLUSH_KEEP);
        break;
      case 1:
        res= _ma_state_info_write(info->s,
                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
                                  MA_STATE_INFO_WRITE_LOCK);
        DBUG_PRINT("maria_flush_states",
                   ("is_of_horizon: LSN (%lu,0x%lx)",
                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
        break;
      case 2:
        res= _ma_bitmap_flush_all(info->s);
        break;
      }
    }
    /* debug-only helper: any flush failure here is a test bug */
    DBUG_ASSERT(res == 0);
  }
  pthread_mutex_unlock(&THR_LOCK_maria);
}
#endif


unknown's avatar
unknown committed
393 394
/**
   @brief Destroys the checkpoint module
*/

void ma_checkpoint_end(void)
{
  DBUG_ENTER("ma_checkpoint_end");
  /*
    Some intentional crash methods, usually triggered by
    SET MARIA_CHECKPOINT_INTERVAL=X
  */
  DBUG_EXECUTE_IF("maria_flush_bitmap",
                  {
                    DBUG_PRINT("maria_flush_bitmap", ("now"));
                    flush_all_tables(2);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
                    flush_all_tables(0);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  /*
    Note that for WAL reasons, maria_flush_states requires
    maria_flush_whole_log.
  */
  DBUG_EXECUTE_IF("maria_flush_states",
                  {
                    DBUG_PRINT("maria_flush_states", ("now"));
                    flush_all_tables(1);
                  });
  DBUG_EXECUTE_IF("maria_crash",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });

  if (checkpoint_control.inited)
  {
    /* stop the background thread before freeing the file lists it may use */
    ma_service_thread_control_end(&checkpoint_control);
    my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
    my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
    dfiles= kfiles= NULL;
  }
  DBUG_VOID_RETURN;
}

unknown's avatar
unknown committed
441 442 443 444 445 446

/**
   @brief dirty-page filtering criteria for MEDIUM checkpoint.

   We flush data/index pages which have been dirty since the previous
   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
unknown's avatar
unknown committed
447 448
   to start from earlier than the next-to-last checkpoint).
   Bitmap pages are handled by _ma_bitmap_flush_all().
unknown's avatar
unknown committed
449 450 451 452 453 454 455

   @param  type                Page's type
   @param  pageno              Page's number
   @param  rec_lsn             Page's rec_lsn
   @param  arg                 filter_param
*/

unknown's avatar
unknown committed
456
static enum pagecache_flush_filter_result
unknown's avatar
unknown committed
457
filter_flush_file_medium(enum pagecache_page_type type,
unknown's avatar
unknown committed
458
                         pgcache_page_no_t pageno __attribute__ ((unused)),
unknown's avatar
unknown committed
459
                         LSN rec_lsn, void *arg)
unknown's avatar
unknown committed
460 461
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
unknown's avatar
unknown committed
462 463
  return (type == PAGECACHE_LSN_PAGE) &&
    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
unknown's avatar
unknown committed
464 465 466 467 468 469
}


/**
   @brief dirty-page filtering criteria for FULL checkpoint.

unknown's avatar
unknown committed
470 471
   We flush all dirty data/index pages.
   Bitmap pages are handled by _ma_bitmap_flush_all().
unknown's avatar
unknown committed
472 473 474 475 476

   @param  type                Page's type
   @param  pageno              Page's number
   @param  rec_lsn             Page's rec_lsn
   @param  arg                 filter_param
477
*/
unknown's avatar
unknown committed
478

unknown's avatar
unknown committed
479
static enum pagecache_flush_filter_result
unknown's avatar
unknown committed
480
filter_flush_file_full(enum pagecache_page_type type,
unknown's avatar
unknown committed
481
                       pgcache_page_no_t pageno __attribute__ ((unused)),
unknown's avatar
unknown committed
482
                       LSN rec_lsn __attribute__ ((unused)),
unknown's avatar
unknown committed
483
                       void *arg __attribute__ ((unused)))
484
{
unknown's avatar
unknown committed
485
  return (type == PAGECACHE_LSN_PAGE);
unknown's avatar
unknown committed
486 487 488 489 490 491
}


/**
   @brief dirty-page filtering criteria for background flushing thread.

unknown's avatar
unknown committed
492 493 494 495 496
   We flush data/index pages which have been dirty since the previous
   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
   to start from earlier than the next-to-last checkpoint), and no
   bitmap pages. But we flush no more than a certain number of pages (to have
   an even flushing, no write burst).
unknown's avatar
unknown committed
497 498
   The reason to not flush bitmap pages is that they may not be in a flushable
   state at this moment and we don't want to wait for them.
499

unknown's avatar
unknown committed
500 501 502 503 504 505
   @param  type                Page's type
   @param  pageno              Page's number
   @param  rec_lsn             Page's rec_lsn
   @param  arg                 filter_param
*/

unknown's avatar
unknown committed
506
static enum pagecache_flush_filter_result
unknown's avatar
unknown committed
507 508 509
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
unknown's avatar
unknown committed
510 511 512
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  if (unlikely(param->max_pages == 0)) /* all flushed already */
unknown's avatar
unknown committed
513
    return FLUSH_FILTER_SKIP_ALL;
unknown's avatar
unknown committed
514 515
  if ((type == PAGECACHE_LSN_PAGE) &&
      (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
516
  {
unknown's avatar
unknown committed
517
    param->max_pages--;
unknown's avatar
unknown committed
518
    return FLUSH_FILTER_OK;
519
  }
unknown's avatar
unknown committed
520
  return FLUSH_FILTER_SKIP_TRY_NEXT;
unknown's avatar
unknown committed
521 522 523 524 525 526
}


/**
   @brief Background thread which does checkpoints and flushes periodically.

unknown's avatar
unknown committed
527 528 529 530
   Takes a checkpoint. After this, all pages dirty at the time of that
   checkpoint are flushed evenly until it is time to take another checkpoint.
   This ensures that the REDO phase starts at earliest (in LSN time) at the
   next-to-last checkpoint record ("two-checkpoint rule").
unknown's avatar
unknown committed
531 532 533 534 535

   @note MikaelR questioned why the same thread does two different jobs, the
   risk could be that while a checkpoint happens no LRD flushing happens.
*/

536 537 538
/* Skip a checkpoint unless at least this many pagecache bytes were written */
static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
/* Set in ha_maria.cc */
ulong maria_checkpoint_min_log_activity= 1*1024*1024;
Michael Widenius's avatar
Michael Widenius committed
539

unknown's avatar
unknown committed
540
pthread_handler_t ma_checkpoint_background(void *arg)
unknown's avatar
unknown committed
541
{
unknown's avatar
unknown committed
542
  /** @brief At least this of log/page bytes written between checkpoints */
unknown's avatar
unknown committed
543 544 545 546 547 548 549
  /*
    If the interval could be changed by the user while we are in this thread,
    it could be annoying: for example it could cause "case 2" to be executed
    right after "case 0", thus having 'dfile' unset. So the thread cares only
    about the interval's value when it started.
  */
  const ulong interval= (ulong)arg;
unknown's avatar
unknown committed
550 551 552 553 554
  uint sleeps, sleep_time;
  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
    translog_get_horizon();
  ulonglong pagecache_flushes_at_last_checkpoint=
    maria_pagecache->global_cache_write;
unknown's avatar
unknown committed
555 556 557 558 559 560 561
  uint pages_bunch_size;
  struct st_filter_param filter_param;
  PAGECACHE_FILE *dfile; /**< data file currently being flushed */
  PAGECACHE_FILE *kfile; /**< index file currently being flushed */
  LINT_INIT(kfile);
  LINT_INIT(dfile);
  LINT_INIT(pages_bunch_size);
562

unknown's avatar
unknown committed
563 564
  my_thread_init();
  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
unknown's avatar
unknown committed
565 566 567 568 569 570 571 572 573 574
  DBUG_ASSERT(interval > 0);

  /*
    Recovery ended with all tables closed and a checkpoint: no need to take
    one immediately.
  */
  sleeps= 1;
  pages_to_flush_before_next_checkpoint= 0;

  for(;;) /* iterations of checkpoints and dirty page flushing */
unknown's avatar
unknown committed
575 576 577 578
  {
#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
    sleeps=0;
#endif
unknown's avatar
unknown committed
579
    switch (sleeps % interval)
unknown's avatar
unknown committed
580 581
    {
    case 0:
582 583 584
    {
      TRANSLOG_ADDRESS horizon= translog_get_horizon();

unknown's avatar
unknown committed
585 586 587 588 589 590
      /*
        With background flushing evenly distributed over the time
        between two checkpoints, we should have only little flushing to do
        in the checkpoint.
      */
      /*
unknown's avatar
unknown committed
591
        No checkpoint if little work of interest for recovery was done
unknown's avatar
unknown committed
592 593 594
        since last checkpoint. Such work includes log writing (lengthens
        recovery, checkpoint would shorten it), page flushing (checkpoint
        would decrease the amount of read pages in recovery).
unknown's avatar
unknown committed
595 596
        In case of one short statement per minute (very low load), we don't
        want to checkpoint every minute, hence the positive
Michael Widenius's avatar
Michael Widenius committed
597
        maria_checkpoint_min_activity.
unknown's avatar
unknown committed
598
      */
599
      if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
600 601 602
          maria_checkpoint_min_log_activity &&
          ((ulonglong) (maria_pagecache->global_cache_write -
                        pagecache_flushes_at_last_checkpoint) *
603 604
             maria_pagecache->block_size) <=
          maria_checkpoint_min_cache_activity)
unknown's avatar
unknown committed
605
      {
606 607 608 609
        /*
          Not enough has happend since last checkpoint.
          Sleep for a while and try again later
        */
unknown's avatar
unknown committed
610
        sleep_time= interval;
unknown's avatar
unknown committed
611 612
        break;
      }
unknown's avatar
unknown committed
613
      sleep_time= 1;
unknown's avatar
unknown committed
614 615 616 617 618 619 620 621 622 623 624 625 626 627 628
      ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
      /*
        Snapshot this kind of "state" of the engine. Note that the value below
        is possibly greater than last_checkpoint_lsn.
      */
      log_horizon_at_last_checkpoint= translog_get_horizon();
      pagecache_flushes_at_last_checkpoint=
        maria_pagecache->global_cache_write;
      /*
        If the checkpoint above succeeded it has set d|kfiles and
        d|kfiles_end. If is has failed, it has set
        pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
        and sleep until the next checkpoint.
      */
      break;
629
    }
unknown's avatar
unknown committed
630 631 632
    case 1:
      /* set up parameters for background page flushing */
      filter_param.up_to_lsn= last_checkpoint_lsn;
unknown's avatar
unknown committed
633
      pages_bunch_size= pages_to_flush_before_next_checkpoint / interval;
unknown's avatar
unknown committed
634 635 636 637 638 639
      dfile= dfiles;
      kfile= kfiles;
      /* fall through */
    default:
      if (pages_bunch_size > 0)
      {
unknown's avatar
unknown committed
640 641 642
        DBUG_PRINT("checkpoint",
                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
unknown's avatar
unknown committed
643 644 645 646
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
        while (dfile != dfiles_end)
        {
unknown's avatar
unknown committed
647 648 649 650
          /*
            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
            smarter to move to the next file than wait for this one to be
            completely flushed, which may take long.
651 652 653 654 655 656 657 658
            StaleFilePointersInFlush: notice how below we use "dfile" which
            is an OS file descriptor plus some function and MARIA_SHARE
            pointers; this data dates from a previous checkpoint; since then,
            the table may have been closed (so MARIA_SHARE* became stale), and
            the file descriptor reassigned to another table which does not
            have the same CRC-read-set callbacks: it is thus important that
            flush_pagecache_blocks_with_filter() does not use the pointers,
            only the OS file descriptor.
unknown's avatar
unknown committed
659
          */
unknown's avatar
unknown committed
660 661
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
unknown's avatar
unknown committed
662
                                               dfile, FLUSH_KEEP_LAZY,
unknown's avatar
unknown committed
663
                                               filter_flush_file_evenly,
unknown's avatar
unknown committed
664
                                               &filter_param);
665
          if (unlikely(res & PCFLUSH_ERROR))
666
            ma_message_no_user(0, "background data page flush failed");
unknown's avatar
unknown committed
667 668 669
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          dfile++; /* otherwise all this file is flushed, move to next file */
unknown's avatar
unknown committed
670 671 672 673 674 675 676
          /*
            MikaelR noted that he observed that Linux's file cache may never
            fsync to  disk until this cache is full, at which point it decides
            to empty the cache, making the machine very slow. A solution was
            to fsync after writing 2 MB. So we might want to fsync() here if
            we wrote enough pages.
          */
unknown's avatar
unknown committed
677 678 679 680 681
        }
        while (kfile != kfiles_end)
        {
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
unknown's avatar
unknown committed
682 683
                                               kfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
unknown's avatar
unknown committed
684
                                               &filter_param);
685
          if (unlikely(res & PCFLUSH_ERROR))
686
            ma_message_no_user(0, "background index page flush failed");
unknown's avatar
unknown committed
687 688 689 690
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
unknown's avatar
unknown committed
691 692 693 694 695 696
        sleep_time= 1;
      }
      else
      {
        /* Can directly sleep until the next checkpoint moment */
        sleep_time= interval - (sleeps % interval);
unknown's avatar
unknown committed
697 698
      }
    }
699 700
    if (my_service_thread_sleep(&checkpoint_control,
                                sleep_time * 1000000000ULL))
unknown's avatar
unknown committed
701
      break;
unknown's avatar
unknown committed
702
    sleeps+= sleep_time;
unknown's avatar
unknown committed
703 704
  }
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
unknown's avatar
unknown committed
705 706 707 708 709 710 711 712 713
  {
    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
    /*
      That's the final one, which guarantees that a clean shutdown always ends
      with a checkpoint.
    */
    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
    ma_checkpoint_execute(level, FALSE);
  }
714
  my_service_thread_signal_end(&checkpoint_control);
unknown's avatar
unknown committed
715 716
  my_thread_end();
  return 0;
717
}
/**
   @brief Allocates a buffer and stores in it some info about open tables,
   and does some flushing on those tables.

   Does the allocation because the caller cannot know the size itself.
   Freeing the memory is the caller's responsibility (if the "str" member of
   the LEX_STRING is not NULL).
   The caller is taking a checkpoint.

   @param[out]  str        pointer to where the allocated buffer,
                           and its size, will be put; buffer will be filled
                           with info about open tables
   @param       checkpoint_start_log_horizon  Of the in-progress checkpoint
                                              record.

   @return Operation status
     @retval 0      OK
     @retval 1      Error
*/
unknown's avatar
unknown committed
739 740

static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
unknown's avatar
unknown committed
741
{
unknown's avatar
unknown committed
742 743 744 745 746 747 748 749 750 751 752 753 754 755
  MARIA_SHARE **distinct_shares= NULL;
  char *ptr;
  uint error= 1, sync_error= 0, nb, nb_stored, i;
  my_bool unmark_tables= TRUE;
  uint total_names_length;
  LIST *pos; /**< to iterate over open tables */
  struct st_state_copy {
    uint index;
    MARIA_STATE_INFO state;
  };
  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
    *state_copies_end, /**< cache ends here */
    *state_copy; /**< iterator in cache */
  TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
unknown's avatar
unknown committed
756 757
  struct st_filter_param filter_param;
  PAGECACHE_FLUSH_FILTER filter;
unknown's avatar
unknown committed
758 759
  DBUG_ENTER("collect_tables");

unknown's avatar
unknown committed
760
  LINT_INIT(state_copies_horizon);
unknown's avatar
unknown committed
761 762 763 764 765 766 767 768 769 770 771 772
  /* let's make a list of distinct shares */
  pthread_mutex_lock(&THR_LOCK_maria);
  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    /* the first three variables below can never change */
    if (share->base.born_transactional && !share->temporary &&
        share->mode != O_RDONLY &&
        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
    {
      /*
773 774
        Apart from us, only maria_close() reads/sets in_checkpoint but cannot
        run now as we hold THR_LOCK_maria.
unknown's avatar
unknown committed
775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
      */
      /*
        This table is relevant for checkpoint and not already seen. Mark it,
        so that it is not seen again in the loop.
      */
      nb++;
      DBUG_ASSERT(share->in_checkpoint == 0);
      /* This flag ensures that we count only _distinct_ shares. */
      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
    }
  }
  if (unlikely((distinct_shares=
                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
                                          MYF(MY_WME))) == NULL))
    goto err;
  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
    {
      distinct_shares[i++]= share;
      /*
        With this we prevent the share from going away while we later flush
        and force it without holding THR_LOCK_maria. For example if the share
        could be my_free()d by maria_close() we would have a problem when we
        access it to flush the table. We "pin" the share pointer.
        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
        not seen again in the loop.
      */
      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
806
      total_names_length+= share->open_file_name.length;
unknown's avatar
unknown committed
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
    }
  }

  DBUG_ASSERT(i == nb);
  pthread_mutex_unlock(&THR_LOCK_maria);
  DBUG_PRINT("info",("found %u table shares", nb));

  str->length=
    4 +               /* number of tables */
    (2 +              /* short id */
     LSN_STORE_SIZE + /* first_log_write_at_lsn */
     1                /* end-of-name 0 */
     ) * nb + total_names_length;
  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
    goto err;

  ptr= str->str;
  ptr+= 4; /* real number of stored tables is not yet know */

  /* only possible checkpointer, so can do the read below without mutex */
  filter_param.up_to_lsn= last_checkpoint_lsn;
  switch(checkpoint_in_progress)
  {
  case CHECKPOINT_MEDIUM:
unknown's avatar
unknown committed
831
    filter= &filter_flush_file_medium;
unknown's avatar
unknown committed
832 833
    break;
  case CHECKPOINT_FULL:
unknown's avatar
unknown committed
834
    filter= &filter_flush_file_full;
unknown's avatar
unknown committed
835 836
    break;
  case CHECKPOINT_INDIRECT:
unknown's avatar
unknown committed
837
    filter= NULL;
unknown's avatar
unknown committed
838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853
    break;
  default:
    DBUG_ASSERT(0);
    goto err;
  }

  /*
    The principle of reading/writing the state below is explained in
    ma_recovery.c, look for "Recovery of the state".
  */
#define STATE_COPIES 1024
  state_copies= (struct st_state_copy *)
    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
                                       /* avoid size of 0 for my_realloc */
                                       max(1, nb) * sizeof(PAGECACHE_FILE),
854
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
unknown's avatar
unknown committed
855 856 857
  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
                                       /* avoid size of 0 for my_realloc */
                                       max(1, nb) * sizeof(PAGECACHE_FILE),
858
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
unknown's avatar
unknown committed
859 860 861 862 863 864 865 866 867 868 869
  if (unlikely((state_copies == NULL) ||
               (dfiles == NULL) || (kfiles == NULL)))
    goto err;
  state_copy= state_copies_end= NULL;
  dfiles_end= dfiles;
  kfiles_end= kfiles;

  for (nb_stored= 0, i= 0; i < nb; i++)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
unknown's avatar
unknown committed
870
    my_bool ignore_share;
unknown's avatar
unknown committed
871 872
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
873 874 875 876
      /*
        No need for a mutex to read the above, only us can write *this* bit of
        the in_checkpoint bitmap
      */
unknown's avatar
unknown committed
877 878
      continue;
    }
unknown's avatar
unknown committed
879 880 881 882
    /**
       @todo We should not look at tables which didn't change since last
       checkpoint.
    */
883
    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
unknown's avatar
unknown committed
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910
    if (state_copy == state_copies_end) /* we have no more cached states */
    {
      /*
        Collect and cache a bunch of states. We do this for many states at a
        time, to not lock/unlock the log's lock too often.
      */
      uint j, bound= min(nb, i + STATE_COPIES);
      state_copy= state_copies;
      /* part of the state is protected by log's lock */
      translog_lock();
      state_copies_horizon= translog_get_horizon_no_lock();
      for (j= i; j < bound; j++)
      {
        MARIA_SHARE *share2= distinct_shares[j];
        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
          continue;
        state_copy->index= j;
        state_copy->state= share2->state; /* we copy the state */
        state_copy++;
        /*
          data_file_length is not updated under log's lock by the bitmap
          code, but writing a wrong data_file_length is ok: a next
          maria_close() will correct it; if we crash before, Recovery will
          set it to the true physical size.
        */
      }
      translog_unlock();
unknown's avatar
unknown committed
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934
      /**
         We are going to flush these states.
         Before, all records describing how to undo such state must be
         in the log (WAL). Usually this means UNDOs. In the special case of
         data|key_file_length, recovery just needs to open the table to fix the
         length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
         understand it must open a table, is enough; so as long as
         data|key_file_length is updated after writing any log record it's ok:
         if we copied new value above, it means the record was before
         state_copies_horizon and we flush such record below.
         Apart from data|key_file_length which are easily recoverable from the
         real file's size, all other state members must be updated only when
         writing the UNDO; otherwise, if updated before, if their new value is
         flushed by a checkpoint and there is a crash before UNDO is written,
         their REDO group will be missing or at least incomplete and skipped
         by recovery, so bad state value will stay. For example, setting
         key_root before writing the UNDO: the table would have old index
         pages (they were pinned at time of crash) and a new, thus wrong,
         key_root.
         @todo RECOVERY BUG check that all code honours that.
      */
      if (translog_flush(state_copies_horizon))
        goto err;
      /* now we have cached states and they are WAL-safe*/
unknown's avatar
unknown committed
935 936 937 938 939 940 941 942 943
      state_copies_end= state_copy;
      state_copy= state_copies;
    }

    /* locate our state among these cached ones */
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy < state_copies_end);

    /* OS file descriptors are ints which we stored in 4 bytes */
unknown's avatar
unknown committed
944
    compile_time_assert(sizeof(int) <= 4);
945 946 947 948 949 950 951 952
    /*
      Protect against maria_close() (which does some memory freeing in
      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
      sufficient as we, as well as maria_close(), are going to unlock
      intern_lock in the middle of manipulating the table. Serializing us and
      maria_close() should help avoid problems.
    */
    pthread_mutex_lock(&share->close_lock);
unknown's avatar
unknown committed
953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
    pthread_mutex_lock(&share->intern_lock);
    /*
      Tables in a normal state have their two file descriptors open.
      In some rare cases like REPAIR, some descriptor may be closed or even
      -1. If that happened, the _ma_state_info_write() may fail. This is
      prevented by enclosing all all places which close/change kfile.file with
      intern_lock.
    */
    kfile= share->kfile;
    dfile= share->bitmap.file;
    /*
      Ignore table which has no logged writes (all its future log records will
      be found naturally by Recovery). Ignore obsolete shares (_before_
      setting themselves to last_version=0 they already did all flush and
      sync; if we flush their state now we may be flushing an obsolete state
      onto a newer one (assuming the table has been reopened with a different
      share but of course same physical index file).
    */
unknown's avatar
unknown committed
971 972 973
    ignore_share= (share->id == 0) | (share->last_version == 0);
    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
    if (!ignore_share)
unknown's avatar
unknown committed
974
    {
975
      uint open_file_name_len= share->open_file_name.length + 1;
unknown's avatar
unknown committed
976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995
      /* remember the descriptors for background flush */
      *(dfiles_end++)= dfile;
      *(kfiles_end++)= kfile;
      /* we will store this table in the record */
      nb_stored++;
      int2store(ptr, share->id);
      ptr+= 2;
      lsn_store(ptr, share->lsn_of_file_id);
      ptr+= LSN_STORE_SIZE;
      /*
        first_bitmap_with_space is not updated under log's lock, and is
        important. We would need the bitmap's lock to get it right. Recovery
        of this is not clear, so we just play safe: write it out as
        unknown: if crash, _ma_bitmap_init() at next open (for example in
        Recovery) will convert it to 0 and thus the first insertion will
        search for free space from the file's first bitmap (0) -
        under-optimal but safe.
        If no crash, maria_close() will write the exact value.
      */
      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
996
      memcpy(ptr, share->open_file_name.str, open_file_name_len);
unknown's avatar
unknown committed
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
      ptr+= open_file_name_len;
      if (cmp_translog_addr(share->state.is_of_horizon,
                            checkpoint_start_log_horizon) >= 0)
      {
        /*
          State was flushed recently, it does not hold down the log's
          low-water mark and will not give avoidable work to Recovery. So we
          needn't flush it. Also, it is possible that while we copied the
          state above (under log's lock, without intern_lock) it was being
          modified in memory or flushed to disk (without log's lock, under
          intern_lock, like in maria_extra()), so our copy may be incorrect
          and we should not flush it.
          It may also be a share which got last_version==0 since we checked
          last_version; in this case, it flushed its state and the LSN test
          above will catch it.
        */
      }
      else
      {
        /*
          We could do the state flush only if share->changed, but it's
          tricky.
          Consider a maria_write() which has written REDO,UNDO, and before it
          calls _ma_writeinfo() (setting share->changed=1), checkpoint
          happens and sees share->changed=0, does not flush state. It is
          possible that Recovery does not start from before the REDO and thus
          the state is not recovered. A solution may be to set
          share->changed=1 under log mutex when writing log records.
Michael Widenius's avatar
Michael Widenius committed
1025 1026 1027 1028 1029

          The current solution is to keep a copy the last saved state and
          not write the state if it was same as last time. It's ok if
          is_of_horizon would be different on disk if all other data is
          the same.
unknown's avatar
unknown committed
1030 1031 1032
        */
        DBUG_ASSERT(share->last_version != 0);
        state_copy->state.is_of_horizon= share->state.is_of_horizon=
Michael Widenius's avatar
Michael Widenius committed
1033 1034 1035 1036 1037
          share->checkpoint_state.is_of_horizon= state_copies_horizon;
        if (kfile.file >= 0 && memcmp(&share->checkpoint_state,
                                      &state_copy->state,
                                      sizeof(state_copy->state)))
        {
unknown's avatar
unknown committed
1038
          sync_error|=
1039 1040
            _ma_state_info_write_sub(kfile.file, &state_copy->state,
                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
Michael Widenius's avatar
Michael Widenius committed
1041 1042 1043
          memcpy(&share->checkpoint_state,
                 &state_copy->state, sizeof(state_copy->state));
        }
unknown's avatar
unknown committed
1044 1045 1046 1047 1048 1049 1050
        /*
          We don't set share->changed=0 because it may interfere with a
          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
          effect). The sad consequence is that we will flush the same state at
          each checkpoint if the table was once written and then not anymore.
        */
      }
1051
    }
Michael Widenius's avatar
Michael Widenius committed
1052 1053 1054 1055 1056 1057 1058 1059
#ifdef EXTRA_DEBUG_BITMAP
    else
    {
      DBUG_ASSERT(share->bitmap.changed == 0 &&
                  share->bitmap.changed_not_flushed == 0);
    }
#endif

    /*
      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
      otherwise this would deadlock with allocate_and_write_block_record()
      calling _ma_set_share_data_file_length()
    */
    pthread_mutex_unlock(&share->intern_lock);
    
    if (!ignore_share)
    {
      /*
        share->bitmap is valid because it's destroyed under close_lock which
        we hold.
      */
      if (_ma_bitmap_flush_all(share))
      {
        sync_error= 1;
        /** @todo all write failures should mark table corrupted */
        ma_message_no_user(0, "checkpoint bitmap page flush failed");
      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    /*
      Clean up any unused states.
      TODO: Only do this call if there have been # (10?) ended transactions
      since last call.
      We had to release intern_lock to respect lock order with LOCK_trn_list.
    */
    _ma_remove_not_visible_states_with_lock(share, FALSE);

    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    {
      /*
        maria_close() left us to free the share. When it ran it set share->id
        to 0. As it ran before we locked close_lock, we should have seen this,
        so this assertion should be true:
      */
      DBUG_ASSERT(ignore_share);
      pthread_mutex_destroy(&share->intern_lock);
      pthread_mutex_unlock(&share->close_lock);
      pthread_mutex_destroy(&share->close_lock);
      my_free((uchar *)share, MYF(0));
    }
    else
    {
      /* share goes back to normal state */
      share->in_checkpoint= 0;
      pthread_mutex_unlock(&share->close_lock);
    }

    /*
      We do the big disk writes out of intern_lock to not block other
      users of this table (intern_lock is taken at the start and end of
      every statement). This means that file descriptors may be invalid
      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
      under Windows, or REPAIR). This should not be a problem as we use
      MY_IGNORE_BADFD. Descriptors may even point to other files but then
      the old blocks (of before the close) must have been flushed for sure,
      so our flush will flush new blocks (of after the latest open) and that
      should do no harm.
    */
    /*
      If CHECKPOINT_MEDIUM, this big flush below may result in a
      serious write burst. Realize that all pages dirtied between the
      last checkpoint and the one we are doing now, will be flushed at
      next checkpoint, except those evicted by LRU eviction (depending on
      the size of the page cache compared to the size of the working data
      set, eviction may be rare or frequent).
      We avoid that burst by anticipating: those pages are flushed
      in bunches spanned regularly over the time interval between now and
      the next checkpoint, by a background thread. Thus the next checkpoint
      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
      only a little slower than CHECKPOINT_INDIRECT).
    */

    /*
      PageCacheFlushConcurrencyBugs
      Inside the page cache, calls to flush_pagecache_blocks_int() on the same
      file are serialized. Examples of concurrency bugs which happened when we
      didn't have this serialization:
      - maria_chk_size() (via CHECK TABLE) happens concurrently with
      Checkpoint: Checkpoint is flushing a page: it pins the page and is
      pre-empted, maria_chk_size() wants to flush this page too so gets an
      error because Checkpoint pinned this page. Such error makes
      maria_chk_size() mark the table as corrupted.
      - maria_close() happens concurrently with Checkpoint:
      Checkpoint is flushing a page: it registers a request on the page, is
      pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
      FLUSH_RELEASE will cause a free_block() which assumes the page is in the
      LRU, but it is not (as Checkpoint registered a request). Crash.
      - one thread is evicting a page of the file out of the LRU: it marks it
      iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
      of the same file concurrently (like above). Then one flusher sees the
      page is in switch, removes it from changed_blocks[] and puts it in its
      first_in_switch, so the other flusher will not see the page at all and
      return too early. If it's maria_close() which returns too early, then
      maria_close() may close the file descriptor, and the other flusher, and
      the evicter will fail to write their page: corruption.
    */

    if (!ignore_share)
    {
      if (filter != NULL)
      {
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
      }
      /*
        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
        per second, so if you have touched 1000 files it's 7 seconds).
      */
      sync_error|=
        my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
        my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
      /*
        in case of error, we continue because writing other tables to disk is
        still useful.
      */
    }
  }

  if (sync_error)
    goto err;
  /* We maybe over-estimated (due to share->id==0 or last_version==0) */
  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
  str->length= (uint)(ptr - str->str);
  /*
    As we support max 65k tables open at a time (2-byte short id), we
    assume uint is enough for the cumulated length of table names; and
    LEX_STRING::length is uint.
  */
  int4store(str->str, nb_stored);
  error= unmark_tables= 0;

err:
  if (unlikely(unmark_tables))
  {
    /* maria_close() uses THR_LOCK_maria from start to end */
    pthread_mutex_lock(&THR_LOCK_maria);
    for (i= 0; i < nb; i++)
    {
      MARIA_SHARE *share= distinct_shares[i];
      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
      {
        /* maria_close() left us to free the share */
        pthread_mutex_destroy(&share->intern_lock);
        my_free((uchar *)share, MYF(0));
      }
      else
      {
        /* share goes back to normal state */
        share->in_checkpoint= 0;
      }
    }
    pthread_mutex_unlock(&THR_LOCK_maria);
  }
  my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
  my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
  DBUG_RETURN(error);
}