Commit 0f1feefa authored by unknown's avatar unknown

WL#3071 Maria checkpoint

Ability for flush_pagecache_blocks() to flush only certain pages of
a file, as instructed by an option "filter" pointer-to-function argument;
Checkpoint and background dirty page flushing use that to flush only
pages which have been dirty for long enough and bitmap pages.
Fix for a bug in flush_cached_blocks() (no idea if it could produce
a bug in real life, but theoretically it is).
Testing checkpoint in ma_test_recovery via ma_test1 and ma_test2.
Background checkpoint & dirty pages flush thread is still disabled
by default in ha_maria.


mysql-test/r/maria.result:
  result update
storage/maria/ha_maria.cc:
  blank after function comment
storage/maria/ma_checkpoint.c:
  Using an enum instead of 0/1/2 (applying Sanja's review comments).
  The comment about "this is an horizon" can be removed as Sanja
  created translog_next_LSN() which parse_checkpoint_record() uses.
  Variables in ma_checkpoint_background() cannot be declared in the
  for() as their value must not be reset at each iteration!
storage/maria/ma_pagecache.c:
  adding to flush_pagecache_blocks() optional arguments 'filter'
  (pointer to function) and 'filter_arg'; if filter!=NULL this function
  will be called for each block of the file and will reply if this
  block and following ones should be flushed or not (3 possible
  replies).
  Fixing a bug when flush_cached_blocks() skips a pinned page: it has
  to unset PCBLOCK_IN_FLUSH set by flush_pagecache_blocks_int().
storage/maria/ma_pagecache.h:
  flush_pagecache_blocks() is changed to take "filter" and "filter_arg"
  arguments. "filter", if it is not NULL, may return one value
  among enum pagecache_flush_filter_result.
storage/maria/ma_recovery.c:
  open_count=0 when closing tables at the end of recovery.
storage/maria/ma_test1.c:
  Optional checkpoints (-H#) at various stages (stages similar
  to --testflag), for testing of checkpoints.
storage/maria/ma_test2.c:
  Optional checkpoints (-H#) at various stages (stages similar
  to -t), for testing of checkpoints.
storage/maria/ma_test_recovery.expected:
  Result update: the results of the additional test run with -H#
  (checkpoints) are added here. They are exactly identical to without
  checkpoints except that the index's Root (printed by maria_chk)
  is more correct when using checkpoints. This is because checkpoint
  flushed the state, so it happens to be correct, while no-checkpoint
  does not flush the state, and recovery does not recover indexes
  so Root is never fixed. When we recover indices, this will go away.
storage/maria/ma_test_recovery:
  We duplicate the loop of tests to add an additional run with
  checkpoints at various stages, to see if maria_read_log
  uses them fine.
parent f2a675b3
......@@ -1976,7 +1976,7 @@ drop table t1;
show variables like 'maria%';
Variable_name Value
maria_block_size 8192
maria_checkpoint_frequency 30
maria_checkpoint_frequency 0
maria_max_sort_file_size 9223372036853727232
maria_pagecache_age_threshold 300
maria_pagecache_buffer_size 8384512
......
......@@ -95,7 +95,8 @@ static MYSQL_SYSVAR_ULONG(checkpoint_frequency, maria_checkpoint_frequency,
PLUGIN_VAR_RQCMDARG,
"Frequency of automatic checkpoints, in seconds;"
" 0 means 'no checkpoints'.",
NULL, update_checkpoint_frequency, 30, 0, UINT_MAX, 1);
/* disabled for now */
NULL, update_checkpoint_frequency, 0, 0, UINT_MAX, 1);
static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size,
maria_max_temp_length, PLUGIN_VAR_RQCMDARG,
......@@ -2541,6 +2542,7 @@ static struct st_mysql_sys_var* system_variables[]= {
checkpoint frequency. So when the user wants to modify it, we stop and
restart the thread.
*/
static void update_checkpoint_frequency(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, void *save)
......
This diff is collapsed.
......@@ -841,7 +841,7 @@ static int flush_all_key_blocks(PAGECACHE *pagecache)
KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
#endif
if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file,
FLUSH_RELEASE))
FLUSH_RELEASE, NULL, NULL))
return 1;
break;
}
......@@ -3489,7 +3489,7 @@ static int flush_cached_blocks(PAGECACHE *pagecache,
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
/*
As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH
we are guarantied no thread will change them
we are guaranteed that no thread will change them
*/
qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
......@@ -3506,6 +3506,8 @@ static int flush_cached_blocks(PAGECACHE *pagecache,
DBUG_PRINT("info", ("block: %u (0x%lx) pinned",
PCBLOCK_NUMBER(pagecache, block), (ulong)block));
PCBLOCK_INFO(block);
/* undo the mark put by flush_pagecache_blocks_int(): */
block->status&= ~PCBLOCK_IN_FLUSH;
last_errno= -1;
unreg_request(pagecache, block, 1);
continue;
......@@ -3573,11 +3575,15 @@ static int flush_cached_blocks(PAGECACHE *pagecache,
/**
@brief flush all key blocks for a file to disk but don't do any mutex locks
@brief flush all blocks for a file to disk but don't do any mutex locks
@param pagecache pointer to a pagecache data structure
@param file handler for the file to flush to
@param flush_type type of the flush
@param filter optional function which tells what blocks to flush;
can be non-NULL only if FLUSH_KEEP or FLUSH_FORCE_WRITE.
@param filter_arg an argument to pass to 'filter'. Information about
the block will be passed too.
@note
This function doesn't do any mutex locks because it needs to be called
......@@ -3591,7 +3597,9 @@ static int flush_cached_blocks(PAGECACHE *pagecache,
static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
PAGECACHE_FILE *file,
enum flush_type type)
enum flush_type type,
PAGECACHE_FLUSH_FILTER filter,
void *filter_arg)
{
PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
int last_errno= 0;
......@@ -3622,9 +3630,29 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
if (type != FLUSH_IGNORE_CHANGED)
{
/*
/**
Count how many key blocks we have to cache to be able
to flush all dirty pages with minimum seek moves
to flush all dirty pages with minimum seek moves.
@todo RECOVERY BUG
We will soon here put code to wait if another thread is flushing the
same file, to avoid concurrency bugs. Examples of concurrency bugs
which happened without serialization:
- assume maria_chk_size() (via CHECK TABLE) happens
concurrently with Checkpoint: Checkpoint may be flushing a page, and
maria_chk_size() wants to flush this page too so gets an error
because Checkpoint pinned this page. Such error leads to marking the
table corrupted.
- assume maria_close() happens concurrently with Checkpoint:
Checkpoint may be flushing a page, and maria_close() flushes this
page too with FLUSH_RELEASE: the FLUSH_RELEASE will cause a
free_block() which assumes the page is in the LRU, but it is not (as
Checkpoint is flushing it). Crash.
- assume two flushes of the same file happen concurrently (like
above), and a third thread is pushing a page of this file out of the
LRU and runs first. Then one flusher will remove the page from
changed_blocks[] and put it in its first_in_switch, so the other
flusher will not see the page at all and return too early.
*/
for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
block;
......@@ -3659,7 +3687,19 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
#endif
next= block->next_changed;
if (block->hash_link->file.file == file->file)
if (block->hash_link->file.file != file->file)
continue;
if (filter != NULL)
{
int filter_res= (*filter)(block->type, block->hash_link->pageno,
block->rec_lsn, filter_arg);
DBUG_PRINT("info",("filter returned %d", filter_res));
if (filter_res == FLUSH_FILTER_SKIP_TRY_NEXT)
continue;
if (filter_res == FLUSH_FILTER_SKIP_ALL)
break;
DBUG_ASSERT(filter_res == FLUSH_FILTER_OK);
}
{
/*
Mark the block with BLOCK_IN_FLUSH in order not to let
......@@ -3775,6 +3815,11 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
/* The following happens very seldom */
if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
{
/*
this code would free all blocks while filter maybe handled only a
few, that is not possible.
*/
DBUG_ASSERT(filter == NULL);
#if defined(PAGECACHE_DEBUG)
cnt=0;
#endif
......@@ -3810,23 +3855,27 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
}
/*
Flush all blocks for a file to disk
SYNOPSIS
/**
@brief flush all blocks for a file to disk
flush_pagecache_blocks()
pagecache pointer to a page cache data structure
file handler for the file to flush to
flush_type type of the flush
@param pagecache pointer to a pagecache data structure
@param file handler for the file to flush to
@param flush_type type of the flush
@param filter optional function which tells what blocks to flush;
can be non-NULL only if FLUSH_KEEP or FLUSH_FORCE_WRITE.
@param filter_arg an argument to pass to 'filter'. Information about
the block will be passed too.
RETURN
0 OK
1 error
@return Operation status
@retval 0 OK
@retval 1 Error
*/
int flush_pagecache_blocks(PAGECACHE *pagecache,
PAGECACHE_FILE *file, enum flush_type type)
int flush_pagecache_blocks_with_filter(PAGECACHE *pagecache,
PAGECACHE_FILE *file,
enum flush_type type,
PAGECACHE_FLUSH_FILTER filter,
void *filter_arg)
{
int res;
DBUG_ENTER("flush_pagecache_blocks");
......@@ -3836,7 +3885,7 @@ int flush_pagecache_blocks(PAGECACHE *pagecache,
DBUG_RETURN(0);
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
inc_counter_for_resize_op(pagecache);
res= flush_pagecache_blocks_int(pagecache, file, type);
res= flush_pagecache_blocks_int(pagecache, file, type, filter, filter_arg);
dec_counter_for_resize_op(pagecache);
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
DBUG_RETURN(res);
......
......@@ -161,6 +161,18 @@ typedef struct st_pagecache
my_bool in_init; /* Set to 1 in MySQL during init/resize */
} PAGECACHE;
/** @brief Return values for PAGECACHE_FLUSH_FILTER */
enum pagecache_flush_filter_result
{
FLUSH_FILTER_SKIP_TRY_NEXT= 0,/**< skip page and move on to next one */
FLUSH_FILTER_OK, /**< flush page and move on to next one */
FLUSH_FILTER_SKIP_ALL /**< skip page and all next ones */
};
/** @brief a filter function type for flush_pagecache_blocks_with_filter() */
typedef enum pagecache_flush_filter_result
(*PAGECACHE_FLUSH_FILTER)(enum pagecache_page_type type, pgcache_page_no_t page,
LSN rec_lsn, void *arg);
/* The default key cache */
extern PAGECACHE dflt_pagecache_var, *dflt_pagecache;
......@@ -228,9 +240,13 @@ extern void pagecache_unpin(PAGECACHE *pagecache,
extern void pagecache_unpin_by_link(PAGECACHE *pagecache,
PAGECACHE_BLOCK_LINK *link,
LSN lsn);
extern int flush_pagecache_blocks(PAGECACHE *keycache,
PAGECACHE_FILE *file,
enum flush_type type);
#define flush_pagecache_blocks(A,B,C) \
flush_pagecache_blocks_with_filter(A,B,C,NULL,NULL)
extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache,
PAGECACHE_FILE *file,
enum flush_type type,
PAGECACHE_FLUSH_FILTER filter,
void *filter_arg);
extern my_bool pagecache_delete(PAGECACHE *pagecache,
PAGECACHE_FILE *file,
pgcache_page_no_t pageno,
......
......@@ -2255,6 +2255,13 @@ static int close_all_tables(void)
next_open= list_element->next;
info= (MARIA_HA*)list_element->data;
pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */
/*
Tables which we see here are exactly those which were open at time of
crash. They might have open_count>0 as Checkpoint maybe flushed their
state while they were used. As Recovery corrected them, don't alarm the
user, don't ask for a table check:
*/
info->s->state.open_count= 0;
prepare_table_for_close(info, addr);
error|= maria_close(info);
pthread_mutex_lock(&THR_LOCK_maria);
......
......@@ -20,6 +20,7 @@
#include <m_string.h>
#include "ma_control_file.h"
#include "ma_loghandler.h"
#include "ma_checkpoint.h"
#include "trnman.h"
extern PAGECACHE *maria_log_pagecache;
......@@ -29,7 +30,7 @@ extern const char *maria_data_root;
static void usage();
static int rec_pointer_size=0, flags[50], testflag;
static int rec_pointer_size=0, flags[50], testflag, checkpoint;
static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE;
static int key_type=HA_KEYTYPE_NUM;
static int create_flag=0;
......@@ -82,7 +83,7 @@ int main(int argc,char *argv[])
translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
0, 0, maria_log_pagecache,
TRANSLOG_DEFAULT_FLAGS) ||
(transactional && trnman_init(0)))
(transactional && (trnman_init(0) || ma_checkpoint_init(FALSE))))
{
fprintf(stderr, "Error in initialization");
exit(1);
......@@ -226,6 +227,9 @@ static int run_test(const char *filename)
if (maria_commit(file) || maria_begin(file))
goto err;
if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (testflag == 1)
goto end;
......@@ -246,6 +250,9 @@ static int run_test(const char *filename)
flags[0]=2;
}
if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (testflag == 2)
{
printf("Terminating after inserts\n");
......@@ -307,6 +314,9 @@ static int run_test(const char *filename)
maria_scan_end(file);
}
if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (testflag == 3)
{
printf("Terminating after updates\n");
......@@ -370,6 +380,9 @@ static int run_test(const char *filename)
}
}
if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (testflag == 4)
{
printf("Terminating after deletes\n");
......@@ -672,6 +685,8 @@ static void update_record(uchar *record)
static struct my_option my_long_options[] =
{
{"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint,
(uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"checksum", 'c', "Undocumented",
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
#ifndef DBUG_OFF
......
......@@ -28,7 +28,7 @@
#include "trnman.h"
#include <m_ctype.h>
#include <my_bit.h>
#include "ma_checkpoint.h"
#define STANDARD_LENGTH 37
#define MARIA_KEYS 6
......@@ -51,7 +51,7 @@ static int verbose=0,testflag=0,
opt_quick_mode=0, transactional= 0, skip_update= 0,
die_in_middle_of_transaction= 0;
static int pack_seg=HA_SPACE_PACK,pack_type=HA_PACK_KEY,remove_count=-1;
static int create_flag= 0, srand_arg= 0;
static int create_flag= 0, srand_arg= 0, checkpoint= 0;
static ulong pagecache_size=IO_SIZE*16;
static enum data_file_type record_type= DYNAMIC_RECORD;
......@@ -98,7 +98,7 @@ int main(int argc, char *argv[])
translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
0, 0, maria_log_pagecache,
TRANSLOG_DEFAULT_FLAGS) ||
(transactional && trnman_init(0)))
(transactional && (trnman_init(0) || ma_checkpoint_init(FALSE))))
{
fprintf(stderr, "Error in initialization");
exit(1);
......@@ -240,6 +240,8 @@ int main(int argc, char *argv[])
maria_begin(file);
if (testflag == 1)
goto end;
if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (!silent)
printf("- Writing key:s\n");
if (locking)
......@@ -302,6 +304,8 @@ int main(int argc, char *argv[])
}
if (testflag == 2)
goto end;
if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (write_cacheing)
{
......@@ -353,6 +357,8 @@ int main(int argc, char *argv[])
}
if (testflag == 3)
goto end;
if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (!silent)
printf("- Update\n");
......@@ -414,6 +420,8 @@ int main(int argc, char *argv[])
}
if (testflag == 4)
goto end;
if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
for (i=999, dupp_keys=j=0 ; i>0 ; i--)
{
......@@ -824,6 +832,8 @@ int main(int argc, char *argv[])
if (testflag == 5)
goto end;
if (checkpoint == 5 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
goto err;
if (!silent)
printf("- Removing keys\n");
......@@ -1057,6 +1067,9 @@ static void get_options(int argc, char **argv)
if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS)
first_key=0;
break;
case 'H':
checkpoint= atoi(++pos);
break;
case 'k':
if ((keys=(uint) atoi(++pos)) < 1 ||
keys > (uint) (MARIA_KEYS-first_key))
......
......@@ -126,16 +126,22 @@ echo "Testing the REDO AND UNDO PHASE"
# Then we run it again and let it exit at T2. Then we compare
# and expect identity.
for take_checkpoint in "no" "yes"
do
for blobs in "" "-b" # we test table without blobs and then table with blobs
do
for test_undo in 1 2 3
do
# first iteration tests rollback of insert, second tests rollback of delete
set -- "ma_test1 $silent -M -T -c -N $blobs" "--testflag=1" "--testflag=2 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=3" "--testflag=4 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=2" "--testflag=3 --test-undo=" "ma_test2 $silent -L -K -W -P -M -T -c $blobs" "-t1" "-t2 -u"
set -- "ma_test1 $silent -M -T -c -N $blobs -H1" "--testflag=1" "--testflag=2 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs -H2" "--testflag=3" "--testflag=4 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs -H2 " "--testflag=2" "--testflag=3 --test-undo=" "ma_test2 $silent -L -K -W -P -M -T -c $blobs -H1" "-t1" "-t2 -u"
# -N (create NULL fields) is needed because --test-undo adds it anyway
while [ $# != 0 ]
do
prog=$1
if [ "$take_checkpoint" == "no" ]
then
prog=`echo $prog | sed 's/ -H[0-9]//'`
fi
commit_run_args=$2
abort_run_args=$3;
rm -f maria_log.* maria_log_control
......@@ -192,6 +198,7 @@ do
rm -f $table.* $tmp/$table* $tmp/maria_chk_*.txt $tmp/maria_read_log_$table.txt
done
done
done
) 2>&1 > $tmp/ma_test_recovery.output
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment