Commit 7199c905 authored by unknown's avatar unknown

WL#3071 Maria checkpoint

- cleanups, simplifications
- moving the construction of the "dirty pages table" into the
pagecache where it belongs (because it's the pagecache which knows
dirty pages). TODO: do the same soon for the "transactions table".
- fix for a small bug in the pagecache (missing decrement of "blocks_changed")


include/pagecache.h:
  prototype
mysys/mf_pagecache.c:
  m_string.h moves up for LEX_STRING to be known for pagecache.h.
  In pagecache_delete_page(), we must decrement "blocks_changed" even
  if we just delete the page without flushing it.
  A new function pagecache_collect_changed_blocks_with_LSN()
  (used by the Checkpoint module), which stores information about the
  changed blocks (a.k.a. "the dirty pages table") into a LEX_STRING.
  This function is not tested now, it will be when there is a Checkpoint.
storage/maria/ma_checkpoint.c:
  refining the checkpoint code: factoring functions, moving the
  construction of the "dirty pages table" into mf_pagecache.c
  (I'll do the same with the construction of the "transactions table"
  once Serg tells me what's the best way to do it).
storage/maria/ma_least_recently_dirtied.c:
  Simplifying the thread which does background flushing of
  least-recently-dirtied pages:
  - in first version that thread will not flush, just do checkpoints
  - in 2nd version, flushing should re-use existing page cache functions
  like flush_pagecache_blocks().
unittest/mysys/test_file.h:
  m_string.h moves up for LEX_STRING to be known in pagecache.h
parent 71b40497
......@@ -221,6 +221,9 @@ extern my_bool pagecache_delete_page(PAGECACHE *pagecache,
enum pagecache_page_lock lock,
my_bool flush);
extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
extern my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
LEX_STRING *str,
LSN *max_lsn);
C_MODE_END
#endif /* _keycache_h */
......@@ -40,9 +40,9 @@
*/
#include "mysys_priv.h"
#include <m_string.h>
#include <pagecache.h>
#include "my_static.h"
#include <m_string.h>
#include <my_bit.h>
#include <errno.h>
#include <stdarg.h>
......@@ -295,7 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
ulonglong rec_lsn; /* LSN when first became dirty */
LSN rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
......@@ -2988,33 +2988,35 @@ my_bool pagecache_delete_page(PAGECACHE *pagecache,
goto restart;
}
if (block->status & BLOCK_CHANGED && flush)
if (block->status & BLOCK_CHANGED)
{
/* The block contains a dirty page - push it out of the cache */
KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
/*
The call is thread safe because only the current
thread might change the block->hash_link value
*/
DBUG_ASSERT(block->pins == 1);
error= pagecache_fwrite(pagecache,
&block->hash_link->file,
block->buffer,
block->hash_link->pageno,
block->type,
MYF(MY_NABP | MY_WAIT_IF_FULL));
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
pagecache->global_cache_write++;
if (error)
if (flush)
{
block->status|= BLOCK_ERROR;
goto err;
/* The block contains a dirty page - push it out of the cache */
KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
/*
The call is thread safe because only the current
thread might change the block->hash_link value
*/
DBUG_ASSERT(block->pins == 1);
error= pagecache_fwrite(pagecache,
&block->hash_link->file,
block->buffer,
block->hash_link->pageno,
block->type,
MYF(MY_NABP | MY_WAIT_IF_FULL));
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
pagecache->global_cache_write++;
if (error)
{
block->status|= BLOCK_ERROR;
goto err;
}
}
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
......@@ -3793,6 +3795,132 @@ int reset_key_cache_counters(const char *name, PAGECACHE *key_cache)
}
/*
Allocates a buffer and stores in it some information about all dirty pages
of type PAGECACHE_LSN_PAGE.
SYNOPSIS
pagecache_collect_changed_blocks_with_LSN()
pagecache pointer to the page cache
str (OUT) pointer to a LEX_STRING where the allocated buffer, and
its size, will be put
max_lsn (OUT) pointer to a LSN where the maximum rec_lsn of all
relevant dirty pages will be put
DESCRIPTION
Does the allocation because the caller cannot know the size itself.
Memory freeing is done by the caller.
Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
are not interesting for a checkpoint record.
The caller has the intention of doing checkpoints.
RETURN
0 on success
1 on error
*/
my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
                                                  LEX_STRING *str,
                                                  LSN *max_lsn)
{
  /*
    Must start as "success": when no interesting dirty page is found we jump
    straight to "end" after allocating the (count-only) buffer, without
    passing through the "error= 0" assignment at the bottom. Leaving this
    variable uninitialized made the function return garbage in that case.
  */
  my_bool error= 0;
  ulong stored_LRD_size= 0;
  uint file_hash;
  char *ptr;
  DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");

  *max_lsn= 0;
  /*
    We lock the entire cache but will be quick, just reading/writing a few MBs
    of memory at most.
    When we enter here, we must be sure that no "first_in_switch" situation
    is happening or will happen (either we have to get rid of
    first_in_switch in the code or, first_in_switch has to increment a
    "danger" counter for this function to know it has to wait). TODO.
  */
  pagecache_pthread_mutex_lock(&pagecache->cache_lock);

  /* First pass: count how many dirty pages are interesting */
  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
  {
    PAGECACHE_BLOCK_LINK *block;
    for (block= pagecache->changed_blocks[file_hash] ;
         block;
         block= block->next_changed)
    {
      /*
        Q: is there something subtle with block->hash_link: can it be NULL?
        does it have to be == hash_link->block... ?
      */
      DBUG_ASSERT(block->hash_link != NULL);
      DBUG_ASSERT(block->status & BLOCK_CHANGED);
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it */
      /*
        In the current pagecache, rec_lsn is not set correctly:
        1) it is set on pagecache_unlock(), too late (a page is dirty
        (BLOCK_CHANGED) since the first pagecache_write()). So in this
        scenario:
        thread1:                          thread2:
        write_REDO
        pagecache_write()                 checkpoint : reclsn not known
        pagecache_unlock(sets rec_lsn)
        commit
        crash,
        at recovery we will wrongly skip the REDO. It also affects the
        low-water mark's computation.
        2) sometimes the unlocking can be an implicit action of
        pagecache_write(), without any call to pagecache_unlock(), then
        rec_lsn is not set.
        1) and 2) are critical problems.
        TODO: fix this when Monty has explained how he writes BLOB pages.
      */
      if (0 == block->rec_lsn)
      {
        DBUG_ASSERT(0);
        goto err;
      }
      stored_LRD_size++;
    }
  }

  /*
    Buffer layout: 8 bytes for the number of entries, then for each entry
    4 bytes of file descriptor, 4 bytes of page number, 8 bytes of rec_lsn.
  */
  str->length= 8+(4+4+8)*stored_LRD_size;
  if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
    goto err;
  ptr= str->str;
  int8store(ptr, stored_LRD_size);
  ptr+= 8;
  if (0 == stored_LRD_size)
    goto end; /* nothing more to store; "error" is already 0 */

  /* The 4-byte slots written below rely on these sizes (loop-invariant) */
  DBUG_ASSERT((4 == sizeof(pagecache->changed_blocks[0]->hash_link->file.file)) &&
              (4 == sizeof(pagecache->changed_blocks[0]->hash_link->pageno)));

  /* Second pass: store one entry per interesting dirty page */
  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
  {
    PAGECACHE_BLOCK_LINK *block;
    for (block= pagecache->changed_blocks[file_hash] ;
         block;
         block= block->next_changed)
    {
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it in the checkpoint record */
      int4store(ptr, block->hash_link->file.file);
      ptr+= 4;
      int4store(ptr, block->hash_link->pageno);
      ptr+= 4;
      int8store(ptr, (ulonglong)block->rec_lsn);
      ptr+= 8;
      set_if_bigger(*max_lsn, block->rec_lsn);
    }
  }
  error= 0;
  goto end;

err:
  error= 1;
end:
  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
  DBUG_RETURN(error);
}
#ifndef DBUG_OFF
/*
Test if disk-cache is ok
......
......@@ -56,9 +56,9 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,..
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
/* last submitted checkpoint request; cleared only when executed */
/* last submitted checkpoint request; cleared when starts */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
CHECKPOINT_LEVEL checkpoint_in_progress= NONE;
static inline ulonglong read_non_atomic(ulonglong volatile *x);
......@@ -74,16 +74,10 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
DBUG_ASSERT(level > NONE);
lock(log_mutex);
while ((synchronous_checkpoint_in_progress != NONE) ||
(next_asynchronous_checkpoint_to_do != NONE))
while (checkpoint_in_progress != NONE)
wait_on_checkpoint_done_cond();
synchronous_checkpoint_in_progress= level;
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
synchronous_checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
......@@ -92,7 +86,7 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
request, executes it.
Is safe if multiple threads call it, though in first version only one will.
It's intended to be used by a thread which regularly calls this function;
this is why, if there is a request,it does not wait in a loop for
this is why, if there is a request, it does not wait in a loop for
synchronous checkpoints to be finished, but just exits (because the thread
may want to do something useful meanwhile (flushing dirty pages for example)
instead of waiting).
......@@ -103,27 +97,20 @@ my_bool execute_asynchronous_checkpoint_if_any()
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
/* first check without mutex, ok to see old data */
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
(checkpoint_in_progress != NONE)))
DBUG_RETURN(FALSE);
lock(log_mutex);
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
(synchronous_checkpoint_in_progress != NONE)))
(checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
level= next_asynchronous_checkpoint_to_do;
DBUG_ASSERT(level > NONE);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
/* If only one thread calls this function, "<" can never happen below */
if (next_asynchronous_checkpoint_to_do <= level)
{
/* it's our request or weaker/equal ones, all work is done */
next_asynchronous_checkpoint_to_do= NONE;
}
/* otherwise if it is a stronger request, we'll deal with it at next call */
unlock(log_mutex);
broadcast(checkpoint_done_cond);
result= execute_checkpoint(next_asynchronous_checkpoint_to_do);
DBUG_RETURN(result);
}
......@@ -135,9 +122,13 @@ my_bool execute_asynchronous_checkpoint_if_any()
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
my_bool result;
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
if (next_asynchronous_checkpoint_to_do <= level)
next_asynchronous_checkpoint_to_do= NONE;
checkpoint_in_progress= level;
if (unlikely(level > INDIRECT))
{
......@@ -166,11 +157,11 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
lock(log_mutex);
}
/*
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
DBUG_RETURN(execute_checkpoint_indirect());
result= execute_checkpoint_indirect();
checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
......@@ -181,114 +172,37 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
*/
my_bool execute_checkpoint_indirect()
{
int error= 0;
int error= 0, i;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
LEX_STRING string1={0,0}, string2={0,0}, string3={0,0};
LEX_STRING *string_array[4];
char checkpoint_start_lsn_char[8];
LEX_STRING strings[5]={ {&checkpoint_start_lsn_str, 8}, {0,0}, {0,0}, {0,0}, {0,0} };
char *ptr;
LSN checkpoint_lsn;
LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
LSN candidate_max_rec_lsn_at_last_checkpoint;
DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
safemutex_assert_owner(log_mutex);
/* STEP 1: record current end-of-log LSN */
checkpoint_start_lsn= log_read_end_lsn();
if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
int8store(strings[0].str, checkpoint_start_lsn);
/* STEP 1: fetch information about dirty pages */
/* note: this piece will move into mysys/mf_pagecache.c */
{
ulong stored_LRD_size= 0;
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
When we enter here, we must be sure that no "first_in_switch" situation
is happening or will happen (either we have to get rid of
first_in_switch in the code or, first_in_switch has to increment a
"danger" counter for Checkpoint to know it has to wait. TODO.
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
/* STEP 2: fetch information about dirty pages */
/*
This is an over-estimation, as in theory blocks_changed may contain
non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
checkpoint record; the true number of page-LRD-info we'll store into the
record is stored_LRD_size.
*/
/*
TODO: Ingo says blocks_changed is not a reliable number (see his
document); ask him.
*/
string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
if (NULL == (string1.str= my_malloc(string1.length)))
goto err;
ptr= string1.str;
int8store(ptr, checkpoint_start_lsn);
ptr+= 8+8; /* don't store stored_LRD_size now, wait */
if (pagecache->blocks_changed > 0)
{
uint file_hash;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
{
DBUG_ASSERT(block->hash_link != NULL);
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
{
continue; /* no need to store it in the checkpoint record */
}
/*
In the current pagecache, rec_lsn is not set correctly:
1) it is set on pagecache_unlock(), too late (a page is dirty
(BLOCK_CHANGED) since the first pagecache_write()). So in this
scenario:
thread1: thread2:
write_REDO
pagecache_write()
checkpoint : reclsn not known
pagecache_unlock(sets rec_lsn)
commit
crash,
at recovery we will wrongly skip the REDO. It also affects the
low-water mark's computation.
2) sometimes the unlocking can be an implicit action of
pagecache_write(), without any call to pagecache_unlock(), then
rec_lsn is not set.
1) and 2) are critical problems.
TODO: fix this when Monty has explained how he writes BLOB pages.
*/
if (0 == block->rec_lsn)
abort(); /* always fail in all builds */
int8store(ptr, block->hash_link->file.file);
ptr+= 8;
int8store(ptr, block->hash_link->pageno);
ptr+= 8;
int8store(ptr, block->rec_lsn);
ptr+= 8;
stored_LRD_size++;
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
block->rec_lsn);
}
}
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
int8store(string1.str+8, stored_LRD_size);
string1.length= 8+8+(8+8+8)*stored_LRD_size;
}
if (pagecache_collect_changed_blocks_with_LSN(pagecache, &strings[1],
&candidate_max_rec_lsn_at_last_checkpoint))
goto err;
/* STEP 2: fetch information about transactions */
/* STEP 3: fetch information about transactions */
/* note: this piece will move into trnman.c */
/*
Transactions are in the "active list" (protected by a mutex) and in a
......@@ -345,7 +259,7 @@ my_bool execute_checkpoint_indirect()
string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
/* STEP 3: fetch information about table files */
/* STEP 4: fetch information about table files */
{
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
......@@ -391,13 +305,8 @@ my_bool execute_checkpoint_indirect()
/* LAST STEP: now write the checkpoint log record */
string_array[0]= string1;
string_array[1]= string2;
string_array[2]= string3;
string_array[3]= NULL;
checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
&system_trans, string_array);
&system_trans, strings);
/*
Do nothing between the log write and the control file write, for the
......@@ -418,9 +327,8 @@ my_bool execute_checkpoint_indirect()
end:
my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
for (i= 1; i<5; i++)
my_free(strings[i], MYF(MY_ALLOW_ZERO_PTR));
/*
this portion cannot be done as a hook in write_log_record() for the
......@@ -440,7 +348,6 @@ my_bool execute_checkpoint_indirect()
lock(log_mutex);
/* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
lock(log_mutex);
......@@ -471,6 +378,8 @@ log_write_record(...)
thread" WL#3261) to do a checkpoint
*/
request_asynchronous_checkpoint(INDIRECT);
/* prevent similar redundant requests */
written_since_last_checkpoint= (my_off_t)0;
}
...;
unlock(log_mutex);
......@@ -488,16 +397,13 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
if (next_asynchronous_checkpoint_to_do < level)
if ((next_asynchronous_checkpoint_to_do < level) &&
(checkpoint_in_progress < level))
{
/* no equal or stronger running or to run, we post request */
/*
note that thousands of requests for checkpoints are going to come all
at the same time (when the log bound
MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a
good idea for each of them to broadcast a cond to wake up the background
checkpoint thread. We just don't broacast a cond, the checkpoint thread
(see least_recently_dirtied.c) will notice our request in max a few
We just don't broadcast a cond, the checkpoint thread
(see ma_least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
next_asynchronous_checkpoint_to_do= level; /* post request */
......@@ -520,6 +426,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
first_undo_lsn), this function can be used to do a read of it (without
mutex, without atomic load) which always produces a correct (though maybe
slightly old) value (even on 32-bit CPUs).
The prototype will change with Sanja's new LSN type.
*/
static inline ulonglong read_non_atomic(ulonglong volatile *x)
{
......
......@@ -36,162 +36,57 @@
#include "least_recently_dirtied.h"
/*
MikaelR suggested removing this global_LRD_mutex (I have a paper note of
comments), however at least for the first version we'll start with this
mutex (which will be a LOCK-based atomic_rwlock).
*/
pthread_mutex_t global_LRD_mutex;
/*
When we flush a page, we should pin page.
This "pin" is to protect against that:
I make copy,
you modify in memory and flush to disk and remove from LRD and from cache,
I write copy to disk,
checkpoint happens.
result: old page is on disk, page is absent from LRD, your REDO will be
wrongly ignored.
Pin: there can be multiple pins, flushing imposes that there are zero pins.
For example, pin could be a uint counter protected by the page's latch.
Maybe it's ok if when there is a page replacement, the replacer does not
remove page from the LRD (it would save global mutex); for that, background
flusher should be prepared to see pages in the LRD which are not in the page
cache (then just ignore them). However checkpoint will contain superfluous
entries and so do more work.
*/
#define PAGE_SIZE (16*1024) /* just as an example */
/*
Optimization:
LRD flusher should not flush pages one by one: to be fast, it flushes a
group of pages in sequential disk order if possible; a group of pages is just
FLUSH_GROUP_SIZE pages.
Key cache has groupping already somehow Monty said (investigate that).
*/
#define FLUSH_GROUP_SIZE 512 /* 8 MB */
/*
We don't want to probe for checkpoint requests all the time (it takes
the log mutex).
If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s
(1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every
16*8=128MB which is every 128/30=4.2second.
Using a power of 2 gives a fast modulo operation.
*/
#define CHECKPOINT_PROBING_PERIOD_LOG2 4
/*
This thread does background flush of pieces of the LRD, and all checkpoints.
This thread does background flush of pieces of the LRD, and serves
requests for asynchronous checkpoints.
Just launch it when engine starts.
MikaelR questioned why the same thread does two different jobs, the risk
could be that while a checkpoint happens no LRD flushing happens.
For now, we only do checkpoints - no LRD flushing (to be done when the
second version of the page cache is ready WL#3077).
Reasons to delay:
- Recovery will work (just slower)
- new page cache may be different, why do then re-do
- current pagecache probably has issues with flushing when somebody is
writing to the table being flushed - better avoid that.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
uint flush_calls= 0;
while (this_thread_not_killed)
{
if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
{
/* note that we don't care of the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
}
lock(global_LRD_mutex);
flush_one_group_from_LRD();
safemutex_assert_not_owner(global_LRD_mutex);
/* note that we don't care of the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
sleep(5);
/*
We are a background thread, leave time for client threads or we would
monopolize the disk:
in the final version, we will not sleep but call flush_pages_from_LRD()
repeatedly. If there are no dirty pages, we'll make sure to not have a
tight loop probing for checkpoint requests.
*/
pthread_yield();
}
my_free(flush_group_buffer);
}
/* The rest of this file will not serve in first version */
/*
flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
flushes only the first pages of the LRD.
max_this_number could be FLUSH_CACHE (of mf_pagecache.c) for example.
*/
flush_one_group_from_LRD()
flush_pages_from_LRD(uint max_this_number, LSN max_this_lsn)
{
char *ptr;
safe_mutex_assert_owner(global_LRD_mutex);
for (page= 0; page<FLUSH_GROUP_SIZE; page++)
{
copy_element_to_array;
}
/*
One rule to better observe is "page must be flushed to disk before it is
removed from LRD" (otherwise checkpoint is incomplete info, corruption).
*/
unlock(global_LRD_mutex);
/* page id is concatenation of "file id" and "number of page in file" */
qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, by_page_id);
for (scan_array)
{
if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
{
/*
page disappeared since we made the copy (it was flushed to be
replaced), remove from array (memcpy tail of array over it)...
*/
continue;
}
memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
pin_page;
page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
}
for (scan_the_array)
{
/*
As an optimization, we try to identify contiguous-in-the-file segments (to
issue one big write()).
In non-optimized version, contiguous segment is always only one page.
*/
if ((next_page.page_id - this_page.page_id) == 1)
{
/*
this page and next page are in same file and are contiguous in the
file: add page to contiguous segment...
*/
continue; /* defer write() to next pages */
}
/* contiguous segment ends */
my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);
/*
note that if we had doublewrite, doublewrite buffer may prevent us from
doing this write() grouping (if doublewrite space is shorter).
*/
}
/*
Now remove pages from LRD. As we have pinned them, all pages that we
managed to pin are still in the LRD, in the same order, we can just cut
the LRD at the last element of "array". This is more efficient that
removing element by element (which would take LRD mutex many times) in the
loop above.
Build a list of pages to flush:
changed_blocks[i] is roughly sorted by descending rec_lsn,
so we could do a merge sort of changed_blocks[] lists, stopping after we
have the max_this_number first elements or after we have found a page with
rec_lsn > max_this_lsn.
Then do like pagecache_flush_blocks_int() does (beware! this time we are
not alone on the file! there may be dangers! TODO: sort this out).
*/
lock(global_LRD_mutex);
/* cut LRD by bending LRD->first, free cut portion... */
unlock(global_LRD_mutex);
for (scan_array)
{
/*
if the page has a property "modified since last flush" (i.e. which is
redundant with the presence of the page in the LRD, this property can
just be a pointer to the LRD element) we should reset it
(note that then the property would live slightly longer than
the presence in LRD).
*/
page_cache_unpin(page_id);
/*
order between unpin and removal from LRD is not clear, depends on what
pin actually is.
*/
}
free(array);
/*
MikaelR noted that he observed that Linux's file cache may never fsync to
disk until this cache is full, at which point it decides to empty the
......@@ -201,28 +96,11 @@ flush_one_group_from_LRD()
}
/*
Flushes all page from LRD up to approximately rec_lsn>=max_lsn.
This is approximate because we flush groups, and because the LRD list may
Note that when we flush all pages from LRD up to rec_lsn>=max_lsn,
this is approximate because the LRD list may
not be exactly sorted by rec_lsn (because for a big row, all pages of the
row are inserted into the LRD with rec_lsn being the LSN of the REDO for the
first page, so if there are concurrent insertions, the last page of the big
row may have a smaller rec_lsn than the previous pages inserted by
concurrent inserters).
*/
int flush_all_LRD_to_lsn(LSN max_lsn)
{
lock(global_LRD_mutex);
if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
max_lsn= LRD->first->prev->rec_lsn;
while (LRD->first->rec_lsn < max_lsn)
{
if (flush_one_group_from_LRD()) /* will unlock LRD mutex */
return 1;
/*
The scheduler may preempt us here as we released the mutex; this is good.
*/
lock(global_LRD_mutex);
}
unlock(global_LRD_mutex);
return 0;
}
#include <m_string.h>
#include <pagecache.h>
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment