Merge gbichot@bk-internal.mysql.com:/home/bk/mysql-maria

into gbichot4.local:/home/mysql_src/mysql-maria-monty

Merge gbichot@bk-internal.mysql.com:/home/bk/mysql-maria
into gbichot4.local:/home/mysql_src/mysql-maria-monty
8e8362cc · unknown · 313bb483 · d72c22de · 8e8362cc · 8e8362cc
Commit 8e8362cc authored Dec 14, 2007 by unknown
19 changed files
--- a/mysql-test/r/maria-recovery-bitmap.result
+++ b/mysql-test/r/maria-recovery-bitmap.result
+drop database if exists mysqltest;
+create database mysqltest;
+use mysqltest;
+* shut down mysqld, removed logs, restarted it
+use mysqltest;
+create table t1 (a varchar(10000)) engine=maria;
+* TEST of over-allocated bitmap not flushed by checkpoint
+insert into t1 values ("bbbbbbb");
+flush table t1;
+* copied t1 for comparison
+insert into t1 values ("bbbbbbb");
+delete from t1 limit 1;
+set session debug="+d,info,enter,exit,maria_over_alloc_bitmap";
+insert into t1 values ("aaaaaaaaa");
+set global maria_checkpoint_interval=1;
+SET SESSION debug="+d,maria_crash";
+* crashing mysqld intentionally
+set global maria_checkpoint_interval=1;
+ERROR HY000: Lost connection to MySQL server during query
+* recovery happens
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+* testing that checksum after recovery is as expected
+Checksum-check
+ok
+use mysqltest;
+drop database mysqltest_for_comparison;
+drop database mysqltest;
--- a/mysql-test/r/maria-recovery.result
+++ b/mysql-test/r/maria-recovery.result
+set global maria_log_file_size=4294967296;
 drop database if exists mysqltest;
 create database mysqltest;
 use mysqltest;
@@ -118,6 +119,7 @@ a
 00000000
 00000000
 drop table t1;
+* TEST of two REDOs for same page in one REDO group
 * shut down mysqld, removed logs, restarted it
 use mysqltest;
 CREATE TABLE t1 (
@@ -150,6 +152,7 @@ SELECT LENGTH(b) FROM t1 WHERE i=3;
 LENGTH(b)
 5001
 drop table t1;
+* TEST of INSERT vs state.auto_increment
 * shut down mysqld, removed logs, restarted it
 use mysqltest;
 CREATE TABLE t1 (
@@ -184,6 +187,7 @@ t1	CREATE TABLE `t1` (
  PRIMARY KEY (`i`),
  KEY `c` (`c`)
 ) ENGINE=MARIA AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+* TEST of UPDATE vs state.auto_increment
 * copied t1 for feeding_recovery
 update t1 set i=15 where c="a";
 flush table t1;

--- a/mysql-test/t/maria-recovery-bitmap-master.opt
+++ b/mysql-test/t/maria-recovery-bitmap-master.opt
+--skip-stack-trace --skip-core-file
+
--- a/mysql-test/t/maria-recovery-bitmap.test
+++ b/mysql-test/t/maria-recovery-bitmap.test
+# Tests of Maria's recovery of the bitmap pages
+
+--source include/not_embedded.inc
+# Don't test this under valgrind, memory leaks will occur as we crash
+--source include/not_valgrind.inc
+# Binary must be compiled with debug for crash to occur
+--source include/have_debug.inc
+--source include/have_maria.inc
+
+--disable_warnings
+drop database if exists mysqltest;
+--enable_warnings
+create database mysqltest;
+
+# Include scripts can perform SQL. For it to not influence the main test
+# they use a separate connection. This way if they use a DDL it would
+# not autocommit in the main test.
+connect (admin, 127.0.0.1, root,,mysqltest,,);
+--enable_reconnect
+
+connection default;
+use mysqltest;
+--enable_reconnect
+
+-- source include/maria_empty_logs.inc
+let $mms_tables=1;
+create table t1 (a varchar(10000)) engine=maria;
+
+# we want recovery to use the tables as they were at time of crash
+let $mvr_restore_old_snapshot=0;
+# UNDO phase prevents physical comparison, normally,
+# so we'll only use checksums to compare.
+let $mms_compare_physically=0;
+let $mvr_crash_statement= set global maria_checkpoint_interval=1;
+
+--echo * TEST of over-allocated bitmap not flushed by checkpoint
+let $mvr_debug_option="+d,maria_crash";
+insert into t1 values ("bbbbbbb");
+-- source include/maria_make_snapshot_for_comparison.inc
+# make_snapshot_for_comparison closed the table, which lost its id.
+# So we do a no-op pair of statements (insert a row, then delete one) just
+# to give the table a short id again, so that the checkpoint includes the
+# table (otherwise there is nothing to test).
+insert into t1 values ("bbbbbbb");
+delete from t1 limit 1;
+set session debug="+d,info,enter,exit,maria_over_alloc_bitmap";
+send insert into t1 values ("aaaaaaaaa");
+connection admin;
+# Leave time for INSERT to block after modifying bitmap;
+# in the future we should not use sleep but something like
+# debug_sync_point().
+sleep 5;
+# force a checkpoint, which could, if buggy, flush over-allocated
+# bitmap page; as REDO-UNDO was not written, bitmap and data page
+# would be inconsistent. Correct checkpoint will wait until UNDO is
+# written.
+set global maria_checkpoint_interval=1;
+-- source include/maria_verify_recovery.inc
+
+# disabled until pagecache callback framework is coded at which point
+# we can add a get_lsn() callback for bitmaps, fixing the below bug.
+if (0)
+{
+--echo * TEST of bitmap flushed without REDO-UNDO in the log (WAL violation)
+# before crashing we'll flush the bitmap page
+let $mvr_debug_option="+d,maria_flush_bitmap,maria_crash";
+-- source include/maria_make_snapshot_for_comparison.inc
+lock tables t1 write;
+insert into t1 values (REPEAT('a', 6000));
+# bitmap of after-INSERT will be on disk, but data pages will not; if
+# log is not flushed the bitmap is inconsistent with the data.
+-- source include/maria_verify_recovery.inc
+drop table t1;
+}
+
+# clean up everything
+let $mms_purpose=comparison;
+eval drop database mysqltest_for_$mms_purpose;
+drop database mysqltest;
--- a/mysql-test/t/maria-recovery.test
+++ b/mysql-test/t/maria-recovery.test
@@ -122,6 +122,7 @@ drop table t1;
 # the rewrite was ignored.
 #

+--echo * TEST of two REDOs for same page in one REDO group
 -- source include/maria_empty_logs.inc
 let $mms_tables=1;
 CREATE TABLE t1 (
@@ -144,6 +145,7 @@ SELECT LENGTH(b) FROM t1 WHERE i=3;
 drop table t1;

 # Test that INSERT's effect on auto-increment is recovered
+--echo * TEST of INSERT vs state.auto_increment
 -- source include/maria_empty_logs.inc
 let $mms_tables=1;
 CREATE TABLE t1 (
@@ -165,6 +167,7 @@ let $mvr_crash_statement= set global maria_checkpoint_interval=1;
 show create table t1;

 # Test that UPDATE's effect on auto-increment is recovered
+--echo * TEST of UPDATE vs state.auto_increment
 -- source include/maria_make_snapshot_for_feeding_recovery.inc
 update t1 set i=15 where c="a";
 -- source include/maria_make_snapshot_for_comparison.inc

--- a/storage/maria/ma_bitmap.c
+++ b/storage/maria/ma_bitmap.c
@@ -132,6 +132,8 @@ uchar maria_bitmap_marker[4]=
 {(uchar) 255, (uchar) 255, (uchar) 255, (uchar) 254};
 uchar maria_normal_page_marker[4]=
 {(uchar) 255, (uchar) 255, (uchar) 255, (uchar) 255};
+/*#define WRONG_BITMAP_FLUSH 1*/ /*define only for provoking bugs*/
+#undef WRONG_BITMAP_FLUSH

 static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
                                    MARIA_FILE_BITMAP *bitmap,
@@ -143,14 +145,48 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
 static inline my_bool write_changed_bitmap(MARIA_SHARE *share,
                                           MARIA_FILE_BITMAP *bitmap)
 {
+  DBUG_ENTER("write_changed_bitmap");
  DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
-  return (pagecache_write(share->pagecache,
+  DBUG_PRINT("info", ("bitmap->flushable: %d", bitmap->flushable));
+  if (bitmap->flushable
+#ifdef WRONG_BITMAP_FLUSH
+      || 1
+#endif
+      )
+  {
+    my_bool res= pagecache_write(share->pagecache,
                                 &bitmap->file, bitmap->page, 0,
                                 (uchar*) bitmap->map, PAGECACHE_PLAIN_PAGE,
                                 PAGECACHE_LOCK_LEFT_UNLOCKED,
                                 PAGECACHE_PIN_LEFT_UNPINNED,
-                          PAGECACHE_WRITE_DELAY, 0,
-                          LSN_IMPOSSIBLE));
+                                 PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE);
+    DBUG_RETURN(res);
+  }
+  else
+  {
+    /**
+      @todo RECOVERY BUG
+      Not flushable: its content is not reflected by the log, to honour WAL we
+      must keep the bitmap page pinned. Scenario of INSERT:
+      REDO - UNDO (written to log but not forced)
+      bitmap goes to page cache (because other INSERT needs to)
+      and then to disk (pagecache eviction)
+      crash: recovery will not find REDO-UNDO, table is corrupted.
+      Solutions:
+      give LSNs to bitmap pages or change pagecache to flush all log when
+      flushing a bitmap page or keep bitmap page pinned until checkpoint.
+    */
+    MARIA_PINNED_PAGE page_link;
+    int res= pagecache_write(share->pagecache,
+                             &bitmap->file, bitmap->page, 0,
+                             (uchar*) bitmap->map, PAGECACHE_PLAIN_PAGE,
+                             PAGECACHE_LOCK_WRITE, PAGECACHE_PIN,
+                             PAGECACHE_WRITE_DELAY, &page_link.link,
+                             LSN_IMPOSSIBLE);
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    push_dynamic(&bitmap->pinned_pages, (void*) &page_link);
+    DBUG_RETURN(res);
+  }
 }

 /*
@@ -180,7 +216,9 @@ my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
  size*= 2;
 #endif

-  if (!(bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))))
+  if (((bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))) == NULL) ||
+      my_init_dynamic_array(&bitmap->pinned_pages,
+                            sizeof(MARIA_PINNED_PAGE), 1, 1))
    return 1;

  bitmap->file.file= file;
@@ -193,6 +231,7 @@ my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
    The +1 is to add the bitmap page, as this doesn't have to be covered
  */
  bitmap->pages_covered= aligned_bit_blocks * 16 + 1;
+  bitmap->flushable= TRUE;

  /* Update size for bits */
  /* TODO; Make this dependent of the row size */
@@ -207,6 +246,7 @@ my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
  bitmap->sizes[7]= 0;

  pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW);
+  pthread_cond_init(&share->bitmap.bitmap_cond, 0);

  _ma_bitmap_reset_cache(share);

@@ -231,6 +271,8 @@ my_bool _ma_bitmap_end(MARIA_SHARE *share)
 {
  my_bool res= _ma_bitmap_flush(share);
  pthread_mutex_destroy(&share->bitmap.bitmap_lock);
+  pthread_cond_destroy(&share->bitmap.bitmap_cond);
+  delete_dynamic(&share->bitmap.pinned_pages);
  my_free((uchar*) share->bitmap.map, MYF(MY_ALLOW_ZERO_PTR));
  share->bitmap.map= 0;
  return res;
@@ -273,6 +315,104 @@ my_bool _ma_bitmap_flush(MARIA_SHARE *share)
 }


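+/**
+   Dirty-page filtering criteria for bitmap pages
+
+   @param  type                Page's type (unused)
+   @param  pageno              Page's number
+   @param  rec_lsn             Page's rec_lsn (unused)
+   @param  arg                 Pointer to the bitmap's pages_covered (ulong)
+
+   @return Non-zero if pageno is a multiple of pages_covered, i.e. the page
+           is a bitmap page; zero otherwise
+*/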
+
+static enum pagecache_flush_filter_result
+filter_flush_bitmap_pages(enum pagecache_page_type type
+                          __attribute__ ((unused)),
+                          pgcache_page_no_t pageno,
+                          LSN rec_lsn __attribute__ ((unused)),
+                          void *arg)
+{
+  return ((pageno % (*(ulong*)arg)) == 0);
+}
+
+
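+/**
+   Flushes current bitmap page to the pagecache, and then all bitmap pages
+   from pagecache to the file. Used by Checkpoint.
+   Does nothing if the bitmap is not marked as changed. Otherwise, unless
+   WRONG_BITMAP_FLUSH is defined, first waits on bitmap_cond until the bitmap
+   is flushable.
+
+   @param  share               Table's share
+
+   @return Operation status
+     @retval 0      ok
+     @retval !=0    writing the bitmap page failed, or the pagecache flush
+                    reported pinned pages or an error
+*/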
+
+my_bool _ma_bitmap_flush_all(MARIA_SHARE *share)
+{
+  my_bool res= 0;
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  DBUG_ENTER("_ma_bitmap_flush_all");
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  if (bitmap->changed)
+  {
+#ifndef WRONG_BITMAP_FLUSH
+    while (!bitmap->flushable)
+    {
+      DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
+      pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+    }
+#endif
+    /*
+      Bitmap is in a flushable state: its contents in memory are reflected by
+      log records (complete REDO-UNDO groups) and all bitmap pages are
+      unpinned. We keep the mutex to preserve this situation, and flush to the
+      file.
+    */
+    res= write_changed_bitmap(share, bitmap);
+    bitmap->changed= FALSE;
+    /*
+      We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap
+      pages have been flushed. That's a condition of correctness of
+      Recovery: data pages may have been all flushed, if we write the
+      checkpoint record Recovery will start from after their REDOs. If
+      bitmap page was not flushed, as the REDOs about it will be skipped, it
+      will wrongly not be recovered. If bitmap pages had a rec_lsn it would
+      be different.
+      There should be no pinned pages as bitmap->flushable is true.
+    */
+    if (flush_pagecache_blocks_with_filter(share->pagecache,
+                                           &bitmap->file, FLUSH_KEEP,
+                                           filter_flush_bitmap_pages,
+                                           &bitmap->pages_covered) &
+        PCFLUSH_PINNED_AND_ERROR)
+      res= TRUE;
+  }
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Unpin all pinned bitmap pages
+
+  @param  share            Table's share
+
+  @note Returns nothing: every page recorded in bitmap->pinned_pages is
+    unlocked and unpinned, then the array is emptied.
+*/
+
+static void _ma_bitmap_unpin_all(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*)
+                                 dynamic_array_ptr(&bitmap->pinned_pages, 0));
+  MARIA_PINNED_PAGE *pinned_page= page_link + bitmap->pinned_pages.elements;
+  DBUG_ENTER("_ma_bitmap_unpin_all");
+  DBUG_PRINT("info", ("pinned: %u", bitmap->pinned_pages.elements));
+  while (pinned_page-- != page_link)
+    pagecache_unlock_by_link(share->pagecache, pinned_page->link,
+                             pinned_page->unlock, PAGECACHE_UNPIN,
+                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE);
+  bitmap->pinned_pages.elements= 0;
+  DBUG_VOID_RETURN;
+}
+
+
 /*
  Initialize bitmap in memory to a zero bitmap

@@ -684,12 +824,6 @@ static my_bool _ma_change_bitmap_page(MARIA_HA *info,

  if (bitmap->changed)
  {
-    /**
-       @todo RECOVERY BUG this is going to flush the bitmap page possibly to
-       disk even though it could be over-allocated with not yet any REDO-UNDO
-       complete group (WAL violation: no way to undo the over-allocation if
-       crash). See also collect_tables().
-    */
    if (write_changed_bitmap(info->s, bitmap))
      DBUG_RETURN(1);
    bitmap->changed= 0;
@@ -1973,6 +2107,46 @@ my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
 }


+/**
+   Make a transition of MARIA_FILE_BITMAP::flushable.
+   If the bitmap becomes flushable, which requires that REDO-UNDO has been
+   logged and all bitmap pages touched by the thread have a correct
+   allocation, it unpins all bitmap pages, and if checkpoint is waiting, it
+   wakes it up.
+   If the bitmap becomes unflushable, it just records it.
+
+   @param  share               Table's share
+   @param  flushable           New state
+*/
+
+void _ma_bitmap_flushable(MARIA_SHARE *share, my_bool flushable)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  if (flushable)
+  {
+    pthread_mutex_lock(&bitmap->bitmap_lock);
+    _ma_bitmap_unpin_all(share);
+    bitmap->flushable= TRUE;
+    pthread_mutex_unlock(&bitmap->bitmap_lock);
+    /*
+      Ok to read in_checkpoint without mutex, as it is set before Checkpoint
+      calls _ma_bitmap_flush_all().
+    */
+    if (share->in_checkpoint)
+    {
+      DBUG_PRINT("info", ("bitmap ready waking up checkpoint"));
+      pthread_cond_broadcast(&bitmap->bitmap_cond);
+    }
+    return;
+  }
+  /*
+    Ok to set without mutex: we didn't touch the bitmap yet; when we touch it
+    we will take the mutex.
+  */
+  bitmap->flushable= FALSE;
+}
+
+
 /*
  Correct bitmap pages to reflect the true allocation

@@ -2015,7 +2189,7 @@ my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks)
  */
  current_bitmap_value= FULL_HEAD_PAGE;

-  pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+  pthread_mutex_lock(&bitmap->bitmap_lock);

  /* First handle head block */
  if (block->used & BLOCKUSED_USED)
@@ -2065,11 +2239,19 @@ my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks)
                                        block->page, page_count))
      goto err;
  }
-  pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+
+  _ma_bitmap_unpin_all(info->s);
+  bitmap->flushable= TRUE;
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  if (info->s->in_checkpoint)
+  {
+    DBUG_PRINT("info", ("bitmap ready waking up checkpoint"));
+    pthread_cond_broadcast(&bitmap->bitmap_cond);
+  }
  DBUG_RETURN(0);

 err:
-  pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
  DBUG_RETURN(1);
 }


--- a/storage/maria/ma_blockrec.c
+++ b/storage/maria/ma_blockrec.c
@@ -2692,32 +2692,21 @@ static my_bool allocate_and_write_block_record(MARIA_HA *info,
  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
  DBUG_ENTER("allocate_and_write_block_record");

+  _ma_bitmap_flushable(info->s, FALSE);
  if (_ma_bitmap_find_place(info, row, blocks))
-    DBUG_RETURN(1);                         /* Error reading bitmap */
+    goto err;                         /* Error reading bitmap */

-#ifdef RECOVERY_EXTRA_DEBUG
-  /* Send this over-allocated bitmap to disk and crash, see if recovers */
-  DBUG_EXECUTE_IF("maria_flush_bitmap",
-                  {
-                    DBUG_PRINT("maria_flush_bitmap", ("now"));
-                    _ma_bitmap_flush(info->s);
-                    _ma_flush_table_files(info, MARIA_FLUSH_DATA |
-                                          MARIA_FLUSH_INDEX,
-                                          FLUSH_KEEP, FLUSH_KEEP);
-                  });
-  DBUG_EXECUTE_IF("maria_crash",
-                  {
-                    DBUG_PRINT("maria_crash", ("now"));
-                    fflush(DBUG_FILE);
-                    abort();
-                  });
-#endif
+  /*
+    Sleep; a checkpoint will happen and should not send this over-allocated
+    bitmap to disk but rather wait.
+  */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););

  /* page will be pinned & locked by get_head_or_tail_page */
  if (get_head_or_tail_page(info, blocks->block, info->buff,
                            row->space_on_head_page, HEAD_PAGE,
                            PAGECACHE_LOCK_WRITE, &row_pos))
-    DBUG_RETURN(1);
+    goto err;
  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
  if (info->s->calc_checksum)
  {
@@ -2732,11 +2721,17 @@ static my_bool allocate_and_write_block_record(MARIA_HA *info,
  if (write_block_record(info, (uchar*) 0, record, row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
-    DBUG_RETURN(1);                         /* Error reading bitmap */
+    goto err;                         /* Error reading bitmap */
  DBUG_PRINT("exit", ("Rowid: %lu (%lu:%u)", (ulong) row->lastpos,
                      (ulong) ma_recordpos_to_page(row->lastpos),
                      ma_recordpos_to_dir_entry(row->lastpos)));
+  /* Now let checkpoint happen but don't commit */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
  DBUG_RETURN(0);
+err:
+  _ma_bitmap_flushable(info->s, TRUE);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
 }


@@ -2806,6 +2801,7 @@ my_bool _ma_write_abort_block_record(MARIA_HA *info)
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_write_abort_block_record");

+  _ma_bitmap_flushable(share, FALSE);
  if (delete_head_or_tail(info,
                          ma_recordpos_to_page(info->cur_row.lastpos),
                          ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
@@ -2840,6 +2836,7 @@ my_bool _ma_write_abort_block_record(MARIA_HA *info)
                      &lsn, (void*) 0))
      res= 1;
  }
+  _ma_bitmap_flushable(share, TRUE);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);
 }
@@ -2889,12 +2886,13 @@ static my_bool _ma_update_block_record2(MARIA_HA *info,
  calc_record_size(info, record, new_row);
  page= ma_recordpos_to_page(record_pos);

+  _ma_bitmap_flushable(share, FALSE);
  DBUG_ASSERT(share->pagecache->block_size == block_size);
  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile, (pgcache_page_no_t) page, 0,
                             info->buff, share->page_type,
                             PAGECACHE_LOCK_WRITE, &page_link.link)))
-    DBUG_RETURN(1);
+    goto err;
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
@@ -2918,7 +2916,7 @@ static my_bool _ma_update_block_record2(MARIA_HA *info,
    if (extend_area_on_page(buff, dir, rownr, share->block_size,
                            new_row->total_length, &org_empty_size,
                            &rec_offset, &length))
-      DBUG_RETURN(1);
+      goto err;

    row_pos.buff= buff;
    row_pos.rownr= rownr;
@@ -2980,6 +2978,7 @@ static my_bool _ma_update_block_record2(MARIA_HA *info,
  DBUG_RETURN(res);

 err:
+  _ma_bitmap_flushable(share, TRUE);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
 }
@@ -3288,6 +3287,7 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
  DBUG_PRINT("enter", ("Rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

+  _ma_bitmap_flushable(share, FALSE);
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;
@@ -3334,10 +3334,12 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)

  }

+  _ma_bitmap_flushable(share, TRUE);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

 err:
+  _ma_bitmap_flushable(share, TRUE);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
 }
@@ -5509,10 +5511,14 @@ uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
        enum pagecache_page_pin unpin_method;
        uint length;

-        if ((page * info->s->block_size) > info->state->data_file_length)
+        if (((page + 1) * info->s->block_size) >
+            info->state->data_file_length)
        {
          /* New page or half written page at end of file */
-          info->state->data_file_length= page * info->s->block_size;
+          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
+                              (ulong) info->state->data_file_length,
+                              (ulong) ((page + 1 ) * info->s->block_size)));
+          info->state->data_file_length= (page + 1) * info->s->block_size;
          buff= info->keyread_buff;
          info->keyread_buff_used= 1;
          make_empty_page(info, buff, BLOB_PAGE);
@@ -5540,7 +5546,12 @@ uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                       LSN_IMPOSSIBLE, 0);
              DBUG_RETURN(my_errno);
            }
-            /* Physical file was too short; Create new page */
+            /*
+              Physical file was too short, create new page. It can be that
+              recovery started with a file with N pages, wrote page N+2 into
+              pagecache (increased data_file_length but not physical file
+              length), now reads page N+1: the read fails.
+            */
            buff= info->keyread_buff;
            info->keyread_buff_used= 1;
            make_empty_page(info, buff, BLOB_PAGE);
@@ -5637,6 +5648,7 @@ my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
  if (read_row_extent_info(info, buff, rownr))
    DBUG_RETURN(1);

+  _ma_bitmap_flushable(share, FALSE);
  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;
@@ -5653,6 +5665,7 @@ my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,

  res= 0;
 err:
+  _ma_bitmap_flushable(share, TRUE);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);
 }

--- a/storage/maria/ma_blockrec.h
+++ b/storage/maria/ma_blockrec.h
@@ -171,6 +171,7 @@ my_bool _ma_compare_block_record(register MARIA_HA *info,
 my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
 my_bool _ma_bitmap_end(MARIA_SHARE *share);
 my_bool _ma_bitmap_flush(MARIA_SHARE *share);
+my_bool _ma_bitmap_flush_all(MARIA_SHARE *share);
 void _ma_bitmap_reset_cache(MARIA_SHARE *share);
 my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
                              MARIA_BITMAP_BLOCKS *result_blocks);
@@ -198,6 +199,7 @@ my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
                                       uint *bitmap_pattern);
 void _ma_bitmap_delete_all(MARIA_SHARE *share);
 int  _ma_bitmap_create_first(MARIA_SHARE *share);
+void _ma_bitmap_flushable(MARIA_SHARE *share, my_bool flushable);
 #ifndef DBUG_OFF
 void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
                      ulonglong page);

--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -59,9 +59,7 @@ static uint checkpoints_total= 0, /**< all checkpoint requests made */

 struct st_filter_param
 {
-  my_bool is_data_file; /**< is the file about data or index */
  LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */
-  ulong pages_covered_by_bitmap; /**< to know which page is a bitmap page */
  uint max_pages; /**< stop after flushing this number pages */
 }; /**< information to determine which dirty pages should be flushed */

@@ -74,10 +72,6 @@ filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t page,
                       LSN rec_lsn, void *arg);
 static enum pagecache_flush_filter_result
-filter_flush_file_indirect(enum pagecache_page_type type,
-                           pgcache_page_no_t page,
-                           LSN rec_lsn, void *arg);
-static enum pagecache_flush_filter_result
 filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
@@ -264,7 +258,7 @@ static int really_execute_checkpoint(void)
  /* checkpoint succeeded */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
-  DBUG_PRINT("info",("%u pages to flush before next checkpoint",
+  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
                           (uint)pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
@@ -350,9 +344,11 @@ int ma_checkpoint_init(ulong interval)

   @param  what_to_flush   0: current bitmap and all data pages
                           1: state
+                           2: all bitmap pages
 */
 static void flush_all_tables(int what_to_flush)
 {
+  int res= 0;
  LIST *pos; /**< to iterate over open tables */
  pthread_mutex_lock(&THR_LOCK_maria);
  for (pos= maria_open_list; pos; pos= pos->next)
@@ -363,17 +359,21 @@ static void flush_all_tables(int what_to_flush)
      switch (what_to_flush)
      {
      case 0:
-        _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                              FLUSH_KEEP, FLUSH_KEEP);
        break;
      case 1:
-        _ma_state_info_write(info->s, 1|4);
+        res= _ma_state_info_write(info->s, 1|4);
        DBUG_PRINT("maria_flush_states",
                   ("is_of_horizon: LSN (%lu,0x%lx)",
                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
        break;
+      case 2:
+        res= _ma_bitmap_flush_all(info->s);
+        break;
      }
    }
+    DBUG_ASSERT(res == 0);
  }
  pthread_mutex_unlock(&THR_LOCK_maria);
 }
@@ -387,6 +387,11 @@ static void flush_all_tables(int what_to_flush)
 void ma_checkpoint_end(void)
 {
  DBUG_ENTER("ma_checkpoint_end");
+  DBUG_EXECUTE_IF("maria_flush_bitmap",
+                  {
+                    DBUG_PRINT("maria_flush_bitmap", ("now"));
+                    flush_all_tables(2);
+                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
@@ -447,8 +452,8 @@ void ma_checkpoint_end(void)

   We flush data/index pages which have been dirty since the previous
   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
-   to start from earlier than the next-to-last checkpoint), and all dirty
-   bitmap pages.
+   to start from earlier than the next-to-last checkpoint).
+   Bitmap pages are handled by _ma_bitmap_flush_all().

   @param  type                Page's type
   @param  pageno              Page's number
@@ -458,21 +463,20 @@ void ma_checkpoint_end(void)

 static enum pagecache_flush_filter_result
 filter_flush_file_medium(enum pagecache_page_type type,
-                         pgcache_page_no_t pageno,
+                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
 {
  struct st_filter_param *param= (struct st_filter_param *)arg;
-  return ((type == PAGECACHE_LSN_PAGE) &&
-          (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) ||
-    (param->is_data_file &&
-     ((pageno % param->pages_covered_by_bitmap) == 0));
+  return (type == PAGECACHE_LSN_PAGE) &&
+    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
 }


 /**
   @brief dirty-page filtering criteria for FULL checkpoint.

-   We flush all dirty data/index pages and all dirty bitmap pages.
+   We flush all dirty data/index pages.
+   Bitmap pages are handled by _ma_bitmap_flush_all().

   @param  type                Page's type
   @param  pageno              Page's number
@@ -482,39 +486,11 @@ filter_flush_file_medium(enum pagecache_page_type type,

 static enum pagecache_flush_filter_result
 filter_flush_file_full(enum pagecache_page_type type,
-                       pgcache_page_no_t pageno,
-                       LSN rec_lsn __attribute__ ((unused)),
-                       void *arg)
-{
-  struct st_filter_param *param= (struct st_filter_param *)arg;
-  return (type == PAGECACHE_LSN_PAGE) ||
-    (param->is_data_file &&
-     ((pageno % param->pages_covered_by_bitmap) == 0));
-}
-
-
-/**
-   @brief dirty-page filtering criteria for INDIRECT checkpoint.
-
-   We flush all dirty bitmap pages.
-
-   @param  type                Page's type
-   @param  pageno              Page's number
-   @param  rec_lsn             Page's rec_lsn
-   @param  arg                 filter_param
-*/
-
-static enum pagecache_flush_filter_result
-filter_flush_file_indirect(enum pagecache_page_type type
-                           __attribute__ ((unused)),
-                           pgcache_page_no_t pageno,
+                       pgcache_page_no_t pageno __attribute__ ((unused)),
                       LSN rec_lsn __attribute__ ((unused)),
-                           void *arg)
+                       void *arg __attribute__ ((unused)))
 {
-  struct st_filter_param *param= (struct st_filter_param *)arg;
-  return
-    (param->is_data_file &&
-     ((pageno % param->pages_covered_by_bitmap) == 0));
+  return (type == PAGECACHE_LSN_PAGE);
 }


@@ -526,6 +502,8 @@ filter_flush_file_indirect(enum pagecache_page_type type
   to start from earlier than the next-to-last checkpoint), and no
   bitmap pages. But we flush no more than a certain number of pages (to have
   an even flushing, no write burst).
+   The reason to not flush bitmap pages is that they may not be in a flushable
+   state at this moment and we don't want to wait for them.

   @param  type                Page's type
   @param  pageno              Page's number
@@ -574,9 +552,11 @@ pthread_handler_t ma_checkpoint_background(void *arg)
    about the interval's value when it started.
  */
  const ulong interval= (ulong)arg;
-  uint sleeps;
-  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint= LSN_IMPOSSIBLE;
-  ulonglong pagecache_flushes_at_last_checkpoint= 0;
+  uint sleeps, sleep_time;
+  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
+    translog_get_horizon();
+  ulonglong pagecache_flushes_at_last_checkpoint=
+    maria_pagecache->global_cache_write;
  uint pages_bunch_size;
  struct st_filter_param filter_param;
  PAGECACHE_FILE *dfile; /**< data file currently being flushed */
@@ -602,7 +582,7 @@ pthread_handler_t ma_checkpoint_background(void *arg)
    sleeps=0;
 #endif
    struct timespec abstime;
-    switch((sleeps++) % interval)
+    switch (sleeps % interval)
    {
    case 0:
      /*
@@ -626,8 +606,10 @@ pthread_handler_t ma_checkpoint_background(void *arg)
      {
        /* don't take checkpoint, so don't know what to flush */
        pages_to_flush_before_next_checkpoint= 0;
+        sleep_time= interval;
        break;
      }
+      sleep_time= 1;
      ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
      /*
        Snapshot this kind of "state" of the engine. Note that the value below
@@ -653,11 +635,11 @@ pthread_handler_t ma_checkpoint_background(void *arg)
    default:
      if (pages_bunch_size > 0)
      {
-        DBUG_PRINT("info", ("Maria background checkpoint thread: %u pages",
+        DBUG_PRINT("checkpoint",
+                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
-        filter_param.is_data_file= TRUE;
        while (dfile != dfiles_end)
        {
          /*
@@ -683,7 +665,6 @@ pthread_handler_t ma_checkpoint_background(void *arg)
            we wrote enough pages.
          */
        }
-        filter_param.is_data_file= FALSE;
        while (kfile != kfiles_end)
        {
          int res=
@@ -697,6 +678,12 @@ pthread_handler_t ma_checkpoint_background(void *arg)
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
+        sleep_time= 1;
+      }
+      else
+      {
+        /* Can directly sleep until the next checkpoint moment */
+        sleep_time= interval - (sleeps % interval);
      }
    }
    pthread_mutex_lock(&LOCK_checkpoint);
@@ -708,12 +695,14 @@ pthread_handler_t ma_checkpoint_background(void *arg)
    pthread_mutex_lock(&LOCK_checkpoint);
 #else
    /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
-    set_timespec(abstime, 1);
+    DBUG_PRINT("info", ("sleeping %u seconds", sleep_time));
+    set_timespec(abstime, sleep_time);
    pthread_cond_timedwait(&COND_checkpoint, &LOCK_checkpoint, &abstime);
 #endif
    if (checkpoint_thread_die == 1)
      break;
    pthread_mutex_unlock(&LOCK_checkpoint);
+    sleeps+= sleep_time;
  }
  pthread_mutex_unlock(&LOCK_checkpoint);
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
@@ -855,7 +844,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
    filter= &filter_flush_file_full;
    break;
  case CHECKPOINT_INDIRECT:
-    filter= &filter_flush_file_indirect;
+    filter= NULL;
    break;
  default:
    DBUG_ASSERT(0);
@@ -888,6 +877,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
+    my_bool ignore_share;
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
      /* No need for a mutex to read the above, only us can write this flag */
@@ -957,7 +947,6 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy < state_copies_end);

-    filter_param.pages_covered_by_bitmap= share->bitmap.pages_covered;
    /* OS file descriptors are ints which we stored in 4 bytes */
    compile_time_assert(sizeof(int) <= 4);
    pthread_mutex_lock(&share->intern_lock);
@@ -978,7 +967,9 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
      onto a newer one (assuming the table has been reopened with a different
      share but of course same physical index file).
    */
-    if ((share->id != 0) && (share->last_version != 0))
+    ignore_share= (share->id == 0) | (share->last_version == 0);
+    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
+    if (!ignore_share)
    {
      /** @todo avoid strlen */
      uint open_file_name_len= strlen(share->open_file_name) + 1;
@@ -1061,14 +1052,12 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
          each checkpoint if the table was once written and then not anymore.
        */
      }
-      /**
-         @todo RECOVERY BUG this is going to flush the bitmap page possibly to
-         disk even though it could be over-allocated with not yet any
-         REDO-UNDO complete group (WAL violation: no way to undo the
-         over-allocation if crash); see also _ma_change_bitmap_page().
-      */
-      sync_error|=
-        _ma_bitmap_flush(share); /* after that, all is in page cache */
+      if (_ma_bitmap_flush_all(share))
+      {
+        sync_error= 1;
+        /** @todo all write failures should mark table corrupted */
+        ma_message_no_user(0, "checkpoint bitmap page flush failed");
+      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
@@ -1135,26 +1124,21 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
      the evicter will fail to write their page: corruption.
    */

-    /*
-      We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap pages
-      have been flushed. That's a condition of correctness of Recovery: data
-      pages may have been all flushed, if we write the checkpoint record
-      Recovery will start from after their REDOs. If bitmap page was not
-      flushed, as the REDOs about it will be skipped, it will wrongly not be
-      recovered. If bitmap pages had a rec_lsn it would be different.
-    */
-    if ((filter_param.is_data_file= TRUE),
-        (flush_pagecache_blocks_with_filter(maria_pagecache,
-                                            &dfile, FLUSH_KEEP,
+    if (!ignore_share)
+    {
+      if (filter != NULL)
+      {
+        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
-    if ((filter_param.is_data_file= FALSE),
-        (flush_pagecache_blocks_with_filter(maria_pagecache,
-                                            &kfile, FLUSH_KEEP,
+        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
+      }
      /*
        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
        per second, so if you have touched 1000 files it's 7 seconds).
@@ -1167,6 +1151,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
        still useful.
      */
    }
+  }

  if (sync_error)
    goto err;

--- a/storage/maria/ma_commit.c
+++ b/storage/maria/ma_commit.c
@@ -51,12 +51,6 @@ int ma_commit(TRN *trn)
    So we need to go the first way.
  */

-  /**
-     @todo RECOVERY share's state is written to disk only in
-     maria_lock_database(), so COMMIT record is not the last record of the
-     transaction! It is probably an issue. Recovery of the state is a problem
-     not yet solved.
-  */
  /*
    We do not store "thd->transaction.xid_state.xid" for now, it will be
    needed only when we support XA.

--- a/storage/maria/ma_key_recover.c
+++ b/storage/maria/ma_key_recover.c
@@ -175,7 +175,7 @@ my_bool write_hook_for_clr_end(enum translog_record_type type


 /**
-  @brief write hook for undo key insert
+  @brief write hook for undo key
 */

 my_bool write_hook_for_undo_key(enum translog_record_type type,

--- a/storage/maria/ma_loghandler.c
+++ b/storage/maria/ma_loghandler.c
@@ -389,8 +389,6 @@ static LOG_DESC INIT_LOGREC_REDO_NOT_USED=
 {LOGRECTYPE_VARIABLE_LENGTH, 0, 8, NULL, write_hook_for_redo, NULL, 0,
 "redo_insert_row_blob", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

-/** @todo RECOVERY BUG handle it in recovery */
-/*QQ:TODO:header???*/
 static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS=
 {LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,

--- a/storage/maria/ma_open.c
+++ b/storage/maria/ma_open.c
@@ -1100,7 +1100,6 @@ uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)

 uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
 {
-  /** @todo RECOVERY write it only at checkpoint time */
  uchar  buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
  uchar *ptr=buff;
  uint	i, keys= (uint) state->header.keys;
@@ -1143,7 +1142,6 @@ uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
  {
    mi_sizestore(ptr,state->key_root[i]);		ptr+= 8;
  }
-  /** @todo RECOVERY BUG key_del is a problem for recovery */
  mi_sizestore(ptr,state->key_del);	        	ptr+= 8;
  if (pWrite & 2)				/* From maria_chk */
  {

--- a/storage/maria/ma_pagecache.c
+++ b/storage/maria/ma_pagecache.c
@@ -601,6 +601,10 @@ static uint pagecache_fwrite(PAGECACHE *pagecache,
 {
  DBUG_ENTER("pagecache_fwrite");
  DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+  /**
+    @todo RECOVERY BUG Here, we should call a callback get_lsn(): it will use
+    lsn_korr() for LSN pages, and translog_get_horizon() for bitmap pages.
+  */
  if (type == PAGECACHE_LSN_PAGE)
  {
    LSN lsn;
@@ -4185,18 +4189,8 @@ my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
      */
      DBUG_ASSERT(block->hash_link != NULL);
      DBUG_ASSERT(block->status & PCBLOCK_CHANGED);
-      /**
-         @todo RECOVERY BUG
-         REDO phase uses PAGECACHE_PLAIN_PAGE, so the lines below would
-         confuse the indirect Checkpoint taken at the end of the REDO phase.
-         So we below collect even dirty pages of temporary tables as a result
-         :( Soon we should have the MARIA_SHARE accessible from the
-         pagecache's block and then we can test born_transactional.
-      */
-#ifdef TRANS_TABLES_ALWAYS_USE_LSN_PAGE
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it */
-#endif
      stored_list_size++;
    }
  }
@@ -4221,10 +4215,8 @@ my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
         block;
         block= block->next_changed)
    {
-#ifdef TRANS_TABLES_ALWAYS_USE_LSN_PAGE
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it in the checkpoint record */
-#endif
      compile_time_assert(sizeof(block->hash_link->file.file) <= 4);
      compile_time_assert(sizeof(block->hash_link->pageno) <= 4);
      int4store(ptr, block->hash_link->file.file);

--- a/storage/maria/ma_recovery.c
+++ b/storage/maria/ma_recovery.c
@@ -348,11 +348,14 @@ int maria_apply_log(LSN from_lsn, enum maria_apply_log_way apply,
     REDO phase does not fill blocks' rec_lsn, so a checkpoint now would be
     wrong: if a future recovery used it, the REDO phase would always
     start from the checkpoint and never from before, wrongly skipping REDOs
-     (tested).
+     (tested). Another problem is that the REDO phase uses
+     PAGECACHE_PLAIN_PAGE, while Checkpoint only collects PAGECACHE_LSN_PAGE.

-     @todo fix this; pagecache_write() now can have a rec_lsn argument.
+     @todo fix this. pagecache_write() now can have a rec_lsn argument. And we
+     could make a function which goes through pages at end of REDO phase and
+     changes their type.
  */
-#if 0
+#ifdef FIX_AND_ENABLE_LATER
  if (take_checkpoints && checkpoint_useful)
  {
    /*
@@ -478,14 +481,11 @@ prototype_redo_exec_hook(LONG_TRANSACTION_ID)
 {
  uint16 sid= rec->short_trid;
  TrID long_trid= all_active_trans[sid].long_trid;
-  /* abort group of this trn (must be of before a crash) */
-  LSN gslsn= all_active_trans[sid].group_start_lsn;
-  if (gslsn != LSN_IMPOSSIBLE)
-  {
-    tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u incomplete\n",
-           LSN_IN_PARTS(gslsn), sid);
-    all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
-  }
+  /*
+    Any incomplete group should stem from an older crash whose recovery
+    already logged INCOMPLETE_GROUP, a record we must have seen by now.
+  */
+  DBUG_ASSERT(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE);
  if (long_trid != 0)
  {
    LSN ulsn= all_active_trans[sid].undo_lsn;
@@ -1160,6 +1160,7 @@ static int new_table(uint16 sid, const char *name,
  }
  if (maria_is_crashed(info))
  {
+    /** @todo what should we do? how to continue recovery? */
    tprint(tracef, "Table is crashed, can't apply log records to it\n");
    goto end;
  }
@@ -1566,10 +1567,6 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT)
      }
      share->state.state.checksum+= ha_checksum_korr(buff);
    }
-    /**
-       @todo some bits below will rather be set when executing UNDOs related
-       to keys
-    */
    info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
  }
  tprint(tracef, "   rows' count %lu\n", (ulong)info->s->state.state.records);
@@ -1605,8 +1602,8 @@ prototype_redo_exec_hook(UNDO_ROW_DELETE)
      }
      share->state.state.checksum+= ha_checksum_korr(buff);
    }
-    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
-                            STATE_NOT_OPTIMIZED_ROWS);
+    share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+      STATE_NOT_OPTIMIZED_ROWS;
  }
  tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
  _ma_unpin_all_pages(info, rec->lsn);
@@ -1743,6 +1740,7 @@ prototype_redo_exec_hook(COMMIT)
  {
    tprint(tracef, "We don't know about transaction with short_trid %u;"
           "it probably committed long ago, forget it\n", sid);
+    bzero(&all_active_trans[sid], sizeof(all_active_trans[sid]));
    return 0;
  }
  llstr(long_trid, llbuf);
@@ -1792,6 +1790,7 @@ prototype_redo_exec_hook(CLR_END)
      break;
    case LOGREC_UNDO_ROW_INSERT:
      share->state.state.records--;
+      share->state.changed|= STATE_NOT_OPTIMIZED_ROWS;
      row_entry= 1;
      break;
    case LOGREC_UNDO_ROW_UPDATE:
@@ -1865,7 +1864,8 @@ prototype_undo_exec_hook(UNDO_ROW_INSERT)
    return 1;
  }
  share= info->s;
-  share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
+  share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+    STATE_NOT_OPTIMIZED_ROWS;

  record_ptr= rec->header;
  if (share->calc_checksum)
@@ -2205,8 +2205,9 @@ static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply)
        if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF)
        {
          /*
-            can happen if the transaction got a table write error, then
-            unlocked tables thus wrote a COMMIT record.
+            Can happen if the transaction got a table write error, then
+            unlocked tables and thus wrote a COMMIT record. Or this record
+            can be an INCOMPLETE_GROUP record written by a previous recovery.
          */
          tprint(tracef, "\nDiscarding incomplete group before this record\n");
          all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
@@ -2677,6 +2678,8 @@ static LSN parse_checkpoint_record(LSN lsn)
  tprint(tracef, "%u active transactions\n", nb_active_transactions);
  LSN minimum_rec_lsn_of_active_transactions= lsn_korr(ptr);
  ptr+= LSN_STORE_SIZE;
+  max_long_trid= transid_korr(ptr);
+  ptr+= TRANSID_SIZE;

  /*
    how much brain juice and discussions there was to come to writing this

--- a/storage/maria/maria_chk.c
+++ b/storage/maria/maria_chk.c
@@ -104,8 +104,8 @@ int main(int argc, char **argv)
  maria_init();

  /*
-    If we are doing a repair and we have requested logging (on by default),
-    enable transaction log handling.
+    If we are doing a repair, the user may want to record it in the log so
+    that the log holds a complete history and can be used for replay.
  */
  if (opt_transaction_logging && (check_param.testflag & T_REP_ANY) &&
      (ma_control_file_create_or_open() ||

--- a/storage/maria/maria_def.h
+++ b/storage/maria/maria_def.h
@@ -217,16 +217,19 @@ typedef struct st_maria_file_bitmap
  ulonglong page;                      /* Page number for current bitmap */
  uint used_size;                      /* Size of bitmap head that is not 0 */
  my_bool changed;                     /* 1 if page needs to be flushed */
+  my_bool flushable;                   /**< If bitmap and log are in sync */
  PAGECACHE_FILE file;		       /* datafile where bitmap is stored */

 #ifdef THREAD
  pthread_mutex_t bitmap_lock;
+  pthread_cond_t bitmap_cond;          /**< When bitmap becomes flushable */
 #endif
  /* Constants, allocated when initiating bitmaps */
  uint sizes[8];                      /* Size per bit combination */
  uint total_size;		      /* Total usable size of bitmap page */
  uint block_size;                    /* Block size of file */
  ulong pages_covered;                /* Pages covered by bitmap + 1 */
+  DYNAMIC_ARRAY pinned_pages;         /**< not-yet-flushable bitmap pages */
 } MARIA_FILE_BITMAP;

 #define MARIA_CHECKPOINT_LOOKS_AT_ME 1
@@ -511,7 +514,6 @@ struct st_maria_handler

 #define USE_WHOLE_KEY	65535         /* Use whole key in _search() */
 #define F_EXTRA_LCK	-1
-#define TRANSID_SIZE		6

 /* bits in opt_flag */
 #define MEMMAP_USED	32

--- a/storage/maria/trnman.c
+++ b/storage/maria/trnman.c
@@ -598,6 +598,7 @@ my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
  pthread_mutex_lock(&LOCK_trn_list);
  str_act->length= 2 + /* number of active transactions */
    LSN_STORE_SIZE + /* minimum of their rec_lsn */
+    TRANSID_SIZE + /* current TrID generator value */
    (2 + /* short id */
     6 + /* long id */
     LSN_STORE_SIZE + /* undo_lsn */
@@ -618,6 +619,8 @@ my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
    goto err;
  /* First, the active transactions */
  ptr= str_act->str + 2 + LSN_STORE_SIZE;
+  transid_store(ptr, global_trid_generator);
+  ptr+= TRANSID_SIZE;
  for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
  {
    /*

--- a/storage/maria/trnman_public.h
+++ b/storage/maria/trnman_public.h
@@ -55,6 +55,8 @@ my_bool trnman_has_locked_tables(TRN *trn);
 void trnman_reset_locked_tables(TRN *trn);
 TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid);
 TRN *trnman_get_any_trn();
-
+#define TRANSID_SIZE		6
+#define transid_store(dst, id) int6store(dst,id)
+#define transid_korr(P) uint6korr(P)
 C_MODE_END
 #endif