Commit 06f7675b authored by unknown's avatar unknown

Maria: first version of checkpoint (WL#3071), least-recently-dirtied page...

Maria: first version of checkpoint (WL#3071), least-recently-dirtied page flushing (WL#3261), recovery (WL#3072),
control file (WL#3234), to serve as a detailed LLD. It looks like C code, but does not compile (no point in making it compile,
as other modules on which I depend are not yet fully specified or written); some pieces are not coded and just marked in comments.
Files' organization (names, directories of C files) does not matter at this point.
I don't think I had to commit so early, but it feels good to publish something, gives me the impression of moving forward :)


storage/maria/checkpoint.c:
  WL#3071 Maria checkpoint, implementation
storage/maria/checkpoint.h:
  WL#3071 Maria checkpoint, interface
storage/maria/control_file.c:
  WL#3234 Maria control file, implementation
storage/maria/control_file.h:
  WL#3234 Maria control file, interface
storage/maria/least_recently_dirtied.c:
  WL#3261 Maria background flushing of least-recently-dirtied pages, implementation
storage/maria/least_recently_dirtied.h:
  WL#3261 Maria background flushing of least-recently-dirtied pages, interface
storage/maria/recovery.c:
  WL#3072 Maria recovery, implementation
storage/maria/recovery.h:
  WL#3072 Maria recovery, interface
parent 99a86a34
/*
WL#3071 Maria checkpoint
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the implementation of this module */
#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"
/*
this transaction is used for any system work (purge, checkpoint writing
etc), that is, background threads. It will not be declared/initialized here
in the final version.
*/
/*
  Dummy transaction used for any system work (checkpoint record writing);
  placeholder initializer, will not be declared/initialized here in the
  final version (see comment above).
*/
st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...};
/*
The maximum rec_lsn in the LRD when last checkpoint was run, serves for the
MEDIUM checkpoint.
*/
/* NOTE(review): written by checkpoint_indirect() after a successful checkpoint */
LSN max_rec_lsn_at_last_checkpoint= 0;
/* Picks a checkpoint request and executes it */
/*
  Picks the posted checkpoint request and executes it.

  Must be entered with log_mutex held (reads checkpoint_request and sets
  checkpoint_running under it); releases the mutex for the long flush and
  record-writing work, re-takes it to mark the request done, and returns
  with the mutex released.

  RETURN
    0  ok
    1  error (already reported to the error log)
*/
my_bool checkpoint()
{
  CHECKPOINT_LEVEL level;
  int error; /* was used below without being declared */
  DBUG_ENTER("checkpoint");
  level= checkpoint_running= checkpoint_request;
  unlock(log_mutex);
  DBUG_ASSERT(level != NONE);
  switch (level)
  {
  case FULL:
    /* flush all pages up to the current end of the LRD */
    flush_all_LRD_to_lsn(MAX_LSN); /* MAX_LSN==ULONGLONG_MAX */
    /* this will go full speed (normal scheduling, no sleep) */
    break;
  case MEDIUM:
    /*
      flush all pages which were already dirty at last checkpoint:
      ensures that recovery will never start from before the next-to-last
      checkpoint (two-checkpoint rule).
      It is max, not min as the WL says (TODO update WL).
    */
    flush_all_LRD_to_lsn(max_rec_lsn_at_last_checkpoint);
    /* this will go full speed (normal scheduling, no sleep) */
    break;
  default:
    /* INDIRECT: no pre-flushing, checkpoint_indirect() does all the work */
    break;
  }
  error= checkpoint_indirect();
  lock(log_mutex);
  /*
    this portion cannot be done as a hook in write_log_record() for the
    LOGREC_CHECKPOINT type because:
    - at that moment we still have not written to the control file so cannot
    mark the request as done; this could be solved by writing to the control
    file in the hook but that would be an I/O under the log's mutex, bad.
    - it would not be nice organisation of code (I tried it :).
  */
  mark_checkpoint_done(error);
  unlock(log_mutex);
  DBUG_RETURN(error);
}
/*
  Builds and writes a checkpoint log record containing three tables:
  dirty pages (from the LRD), transactions, and open shares/files; then
  durably stores the checkpoint's LSN in the control file.

  RETURN
    0  ok
    1  error (message printed to the error log)
*/
my_bool checkpoint_indirect()
{
  DBUG_ENTER("checkpoint_indirect");
  int error= 0;
  /* checkpoint record data: */
  LSN checkpoint_start_lsn;
  LEX_STRING string1={0,0}, string2={0,0}, string3={0,0};
  LEX_STRING *string_array[4];
  char *ptr;
  LSN checkpoint_lsn;
  LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
  list_element *el; /* to scan lists */
  DBUG_ASSERT(sizeof(byte *) <= 8);
  DBUG_ASSERT(sizeof(LSN) <= 8);
  lock(log_mutex); /* will probably be in log_read_end_lsn() already */
  checkpoint_start_lsn= log_read_end_lsn();
  unlock(log_mutex);
  DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
  /* STEP 1: dirty pages table, serialized under the LRD's mutex */
  lock(global_LRD_mutex);
  string1.length= 8+8+(8+8)*LRD->count;
  if (NULL == (string1.str= my_malloc(string1.length)))
    goto err;
  ptr= string1.str;
  int8store(ptr, checkpoint_start_lsn);
  ptr+= 8;
  int8store(ptr, LRD->count);
  ptr+= 8;
  if (LRD->count)
  {
    candidate_max_rec_lsn_at_last_checkpoint= LRD->last->rec_lsn;
    for (el= LRD->first; el; el= el->next)
    {
      int8store(ptr, el->page_id);
      ptr+= 8;
      int8store(ptr, el->rec_lsn);
      ptr+= 8;
    }
  }
  unlock(global_LRD_mutex);
  /*
    If trx are in more than one list (e.g. three:
    running transactions, committed transactions, purge queue), we can either
    take mutexes of all three together or do crabbing.
    But if an element can move from list 1 to list 3 without passing through
    list 2, crabbing is dangerous.
    Hopefully it's ok to take 3 mutexes together...
    Otherwise I'll have to make sure I miss no important trx and I handle dups.
  */
  /* STEP 2: transactions table */
  lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */
  /*
    was 8+(8+8)*count: each entry below is 1+7+2+8+8+8 = 34 bytes, the old
    size under-allocated and the loop overflowed the buffer
  */
  string2.length= 8+(1+7+2+8+8+8)*trx_list->count;
  if (NULL == (string2.str= my_malloc(string2.length)))
    goto err;
  ptr= string2.str;
  int8store(ptr, trx_list->count);
  ptr+= 8;
  for (el= trx_list->first; el; el= el->next)
  {
    /* possibly latch el.rwlock */
    *ptr= el->state;
    ptr++;
    int7store(ptr, el->long_trans_id);
    ptr+= 7;
    int2store(ptr, el->short_trans_id);
    ptr+= 2;
    int8store(ptr, el->undo_lsn);
    ptr+= 8;
    int8store(ptr, el->undo_purge_lsn);
    ptr+= 8;
    /*
      if no latch, use double variable of type ULONGLONG_CONSISTENT in
      st_transaction, or even no need if Intel >=486
    */
    int8store(ptr, el->first_purge_lsn);
    ptr+= 8;
    /* possibly unlatch el.rwlock */
  }
  unlock(global_transactions_list_mutex);
  /* STEP 3: shares/files table (file names are variable-length) */
  lock(global_share_list_mutex);
  /*
    NOTE(review): sizing ignores the variable-length file names copied
    below — to be fixed when the share list structure is final
  */
  string3.length= 8+(8+8)*share_list->count;
  if (NULL == (string3.str= my_malloc(string3.length)))
    goto err;
  ptr= string3.str;
  /* possibly latch each MARIA_SHARE */
  make_copy_of_global_share_list_to_array;
  unlock(global_share_list_mutex);
  /* work on copy */
  int8store(ptr, elements_in_array);
  ptr+= 8;
  for (scan_array)
  {
    int8store(ptr, array[...].file_id);
    ptr+= 8;
    memcpy(ptr, array[...].file_name, ...);
    ptr+= ...;
    /*
      these two are long ops (involving disk I/O) that's why we copied the
      list:
    */
    flush_bitmap_pages(el);
    /*
      fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
      second, so if you have touched 1000 files it's 7 seconds).
    */
    force_file(el);
  }
  /* now write the record */
  /* was string1/2/3 without '&': the array holds pointers to LEX_STRING */
  string_array[0]= &string1;
  string_array[1]= &string2;
  string_array[2]= &string3;
  string_array[3]= NULL;
  checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
                                   &system_trans, string_array);
  if (0 == checkpoint_lsn) /* maybe 0 is impossible LSN to indicate error ? */
    goto err;
  if (0 != control_file_write_and_force(checkpoint_lsn, NULL))
    goto err;
  /* was 'maximum_rec_lsn_last_checkpoint': misspelled the global's name */
  max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
  goto end; /* free the buffers on success too (they leaked before) */
err:
  error= 1;
  print_error_to_error_log(the_error_message);
end:
  /*
    was buffer1/2/3: those variables don't exist, the buffers are string1/2/3.
    Assumes log_write_record() has copied the data by the time it returns —
    TODO confirm with the log handler's contract.
  */
  my_free(string1.str, MYF(MY_ALLOW_ZERO_PTR));
  my_free(string2.str, MYF(MY_ALLOW_ZERO_PTR));
  my_free(string3.str, MYF(MY_ALLOW_ZERO_PTR));
  DBUG_RETURN(error);
}
/*
Here's what should be put in log_write_record() in the log handler:
*/
/*
  Sketch only: after appending a record, account its length; once enough
  log has been written since the last checkpoint, post a checkpoint
  request. All under log_mutex, already held here.
*/
log_write_record(...)
{
...;
lock(log_mutex);
...;
write_to_log(length);
written_since_last_checkpoint+= length;
if (written_since_last_checkpoint >
MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS)
{
/*
ask one system thread (the "LRD background flusher and checkpointer
thread" WL#3261) to do a checkpoint
*/
/* non-blocking on purpose: we hold log_mutex so we must not wait here */
request_checkpoint(INDIRECT, 0 /*wait_for_completion*/);
}
...;
unlock(log_mutex);
...;
}
/*
Call this when you want to request a checkpoint.
In real life it will be called by log_write_record() and by client thread
which explicitly wants to do checkpoint (ALTER ENGINE CHECKPOINT
checkpoint_level).
*/
int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion)
{
int error= 0;
/*
If caller wants to wait for completion we'll have to release the log mutex
to wait on condition, if caller had log mutex he may not be happy that we
release it, so we check that caller didn't have log mutex.
*/
/*
Mutex contract: wait_for_completion callers must NOT hold log_mutex (we
take and release it); non-waiting callers MUST already hold it.
*/
if (wait_for_completion)
{
lock(log_mutex);
}
else
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(checkpoint_request >= checkpoint_running);
DBUG_ASSERT(level > NONE);
if (checkpoint_request < level)
{
/* no equal or stronger running or to run, we post request */
/*
note that thousands of requests for checkpoints are going to come all
at the same time (when the log bound is passed), so it may not be a good
idea for each of them to broadcast a cond. We just don't broadcast a
cond, the checkpoint thread will wake up in max one second.
*/
checkpoint_request= level; /* post request */
}
if (wait_for_completion)
{
uint checkpoints_done_copy= checkpoints_done;
uint checkpoint_errors_copy= checkpoint_errors;
/*
note that the "==done" works when the uint counter wraps too, so counter
can even be smaller than uint if we wanted (however it should be big
enough so that max_the_int_type checkpoints cannot happen between two
wakeups of our thread below). uint sounds fine.
Wait for our checkpoint to be done:
*/
if (checkpoint_running != NONE) /* not ours, let it pass */
{
while (1)
{
if (checkpoints_done != checkpoints_done_copy)
{
if (checkpoints_done == (checkpoints_done_copy+1))
{
/* not our checkpoint, forget about it */
checkpoints_done_copy= checkpoints_done;
}
break; /* maybe even ours has been done at this stage! */
}
cond_wait(checkpoint_done_cond, log_mutex);
}
}
/* now we come to waiting for our checkpoint */
while (1)
{
if (checkpoints_done != checkpoints_done_copy)
{
/* our checkpoint has been done */
break;
}
if (checkpoint_errors != checkpoint_errors_copy)
{
/*
the one which was running a few milliseconds ago (if there was one),
and/or ours, had an error, just assume it was ours. So there
is a possibility that we return error though we succeeded, in which
case user will have to retry; but two simultaneous checkpoints have
high chances of failing together (as the error probably comes from
malloc or disk write problem), so chance of false alarm is low.
Reporting the error only to the one which caused the error would
require having a (not fixed size) list of all requests, not worth it.
*/
error= 1;
break;
}
cond_wait(checkpoint_done_cond, log_mutex);
}
unlock(log_mutex);
} /* ... if (wait_for_completion) */
/*
If wait_for_completion was false, and there was an error, only an error
message to the error log will say it; normal, for a checkpoint triggered
by a log write, we probably don't want the client's log write to throw an
error, as the log write succeeded and a checkpoint failure is not
critical: the failure in this case is more for the DBA to know than for
the end user.
*/
return error;
}
void mark_checkpoint_done(int error)
{
safemutex_assert_owner(log_mutex);
if (error)
checkpoint_errors++;
/* a checkpoint is said done even if it had an error */
checkpoints_done++;
if (checkpoint_request == checkpoint_running)
{
/*
No new request has been posted, so we satisfied all requests, forget
about them.
*/
checkpoint_request= NONE;
}
checkpoint_running= NONE;
written_since_last_checkpoint= 0;
broadcast(checkpoint_done_cond);
}
/*
Alternative (not to be done, too disturbing):
do the autocheckpoint in the thread which passed the bound first (and do the
checkpoint in the client thread which requested it).
It will give a delay to that client thread which passed the bound (time to
fsync() for example 1000 files is 16 s on my laptop). Here is code for
explicit and implicit checkpoints, where client thread does the job:
*/
#if 0
/* Disabled sketch: client thread runs the checkpoint itself (rejected). */
{
lock(log_mutex); /* explicit takes it here, implicit already has it */
while (checkpoint_running != NONE)
{
if (checkpoint_running >= my_level) /* always true for auto checkpoints */
goto end; /* we skip checkpoint */
/* a less strong is running, I'll go next */
wait_on_checkpoint_done_cond();
}
checkpoint_running= my_level;
/* NOTE(review): implies checkpoint() releases log_mutex, as we re-lock below */
checkpoint(my_level); // can gather checkpoint_start_lsn before unlock
lock(log_mutex);
checkpoint_running= NONE;
written_since_last_checkpoint= 0;
end:
unlock(log_mutex);
}
#endif
/*
WL#3071 Maria checkpoint
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* This is the interface of this module. */
/*
  Checkpoint strength, in increasing order. The numeric ordering matters:
  request_checkpoint() compares levels with < / > to decide whether a
  pending request is stronger. NONE means "no checkpoint requested/running".
*/
typedef enum enum_checkpoint_level {
  NONE=     -1,
  INDIRECT=  0, /* just write dirty_pages, transactions table and sync files */
  MEDIUM=    1, /* also flush dirty pages which were already dirty at prev checkpoint */
  FULL=      2  /* also flush all dirty pages */
} CHECKPOINT_LEVEL;
/*
Call this when you want to request a checkpoint.
In real life it will be called by log_write_record() and by client thread
which explicitly wants to do checkpoint (ALTER ENGINE CHECKPOINT
checkpoint_level).
*/
int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion);
/* that's all that's needed in the interface */
/*
WL#3234 Maria control file
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the implementation of this module */
/* Control file is 512 bytes (a disk sector), to be as atomic as possible */
/* descriptor of the open control file; set by control_file_create_or_open() */
int control_file_fd;
/*
Looks for the control file. If absent, it's a fresh start, create file.
If present, read it to find out last checkpoint's LSN and last log.
Called at engine's start.
*/
/*
  Looks for the control file. If absent, it's a fresh start: create it.
  If present, read it to find out last checkpoint's LSN and last log name.
  Called at engine's start.

  Sets last_checkpoint_lsn_at_startup and last_log_name_at_startup
  (the latter is malloc'ed here and owned by the caller/engine).

  RETURN
    0  ok
    1  error (create/read/malloc failure)
*/
int control_file_create_or_open()
{
  char buffer[8]; /* was [4]: too small for the 8-byte LSN read below */
  /* name is concatenation of Maria's home dir and "control" */
  if ((control_file_fd= my_open(name, O_RDWR)) < 0)
  {
    /* failure, try to create it */
    if ((control_file_fd= my_create(name, O_RDWR)) < 0)
      return 1;
    /*
      So this is a start from scratch, to be safer we should make sure that
      there are no logs or data/index files around (indeed it could be that
      the control file alone was deleted or not restored, and we should not
      go on with life at this point.
      For now we trust (this is alpha version), but for beta if would be great
      to verify.
      We could have a tool which can rebuild the control file, by reading the
      directory of logs, finding the newest log, reading it to find last
      checkpoint... Slow but can save your db.
    */
    last_checkpoint_lsn_at_startup= 0;
    last_log_name_at_startup= NULL;
    return 0;
  }
  /* Already existing file, read it */
  if (my_read(control_file_fd, buffer, 8, MYF(MY_FNABP)))
    return 1;
  last_checkpoint_lsn_at_startup= uint8korr(buffer);
  /* was 'if (p= my_malloc(...))': inverted test, returned error on success */
  if (!(last_log_name_at_startup= my_malloc(512-8+1)))
    return 1;
  /* was '..., 512-8), MYF(MY_FNABP)': the flags were outside the call */
  if (my_read(control_file_fd, last_log_name_at_startup, 512-8, MYF(MY_FNABP)))
    return 1;
  /* was 'last_log_name': no such variable */
  last_log_name_at_startup[512-8]= 0; /* end zero to be nice */
  return 0;
}
/*
Write information durably to the control file.
Called when we have created a new log (after syncing this log's creation)
and when we have written a checkpoint (after syncing this log record).
*/
/*
  Write information durably to the control file.
  Called when we have created a new log (after syncing this log's creation)
  and when we have written a checkpoint (after syncing this log record).

  Layout: bytes 0-7 hold the checkpoint LSN, bytes 8-511 the log name.
  Either piece may be updated alone (lsn==0 / log_name==NULL skips it),
  but at least one must be given.

  RETURN
    0  ok
    1  write or sync failure
*/
int control_file_write_and_force(LSN lsn, char *log_name)
{
  char buffer[512];
  uint start= 8, end= 8;
  if (lsn != 0) /* LSN was specified */
  {
    start= 0;
    int8store(buffer, lsn);
  }
  if (log_name != NULL) /* log name was specified */
  {
    end= 512;
    memcpy(buffer+8, log_name, 512-8);
  }
  DBUG_ASSERT(start != end); /* at least one piece must be written */
  /*
    was 'buffer': when start==8 (log name only) we must write from
    buffer+start, otherwise the wrong bytes land at file offset 8.
    Also the original statement lacked its terminating ';'.
  */
  return (my_pwrite(control_file_fd, buffer+start, end-start, start,
                    MYF(MY_FNABP)) ||
          my_sync(control_file_fd));
}
/*
WL#3234 Maria control file
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the interface of this module */
/* LSN of the last checkpoint, as read from the control file at startup */
LSN last_checkpoint_lsn_at_startup;
/* name of the last log, as read from the control file at startup (malloc'ed) */
char *last_log_name_at_startup;
/*
Looks for the control file. If absent, it's a fresh start, create file.
If present, read it to find out last checkpoint's LSN and last log.
Called at engine's start.
*/
int control_file_create_or_open();
/*
Write information durably to the control file.
Called when we have created a new log (after syncing this log's creation)
and when we have written a checkpoint (after syncing this log record).
*/
int control_file_write_and_force(LSN lsn, char *log_name);
/*
WL#3261 Maria - background flushing of the least-recently-dirtied pages
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/*
To be part of the page cache.
The pseudocode below is dependent on the page cache
which is being designed WL#3134. It is not clear if I need to do page
copies, as the page cache already keeps page copies.
So, this code will move to the page cache and take inspiration from its
methods. Below is just to give the idea of what could be done.
And I should compare my imaginations to WL#3134.
*/
/* Here is the implementation of this module */
#include "page_cache.h"
#include "least_recently_dirtied.h"
/*
When we flush a page, we should pin page.
This "pin" is to protect against that:
I make copy,
you modify in memory and flush to disk and remove from LRD and from cache,
I write copy to disk,
checkpoint happens.
result: old page is on disk, page is absent from LRD, your REDO will be
wrongly ignored.
Pin: there can be multiple pins, flushing imposes that there are zero pins.
For example, pin could be a uint counter protected by the page's latch.
Maybe it's ok if when there is a page replacement, the replacer does not
remove page from the LRD (it would save global mutex); for that, background
flusher should be prepared to see pages in the LRD which are not in the page
cache (then just ignore them). However checkpoint will contain superfluous
entries and so do more work.
*/
#define PAGE_SIZE (16*1024) /* just as an example */
/*
Optimization:
LRD flusher should not flush pages one by one: to be fast, it flushes a
group of pages in sequential disk order if possible; a group of pages is just
FLUSH_GROUP_SIZE pages.
Key cache has groupping already somehow Monty said (investigate that).
*/
#define FLUSH_GROUP_SIZE 512 /* 8 MB */
/*
This thread does background flush of pieces of the LRD, and all checkpoints.
Just launch it when engine starts.
*/
/*
  Body of the "LRD background flusher and checkpointer" thread (WL#3261).
  Once per second it either serves a posted checkpoint request or flushes
  one group of least-recently-dirtied pages. Launch it when engine starts.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
  char *flush_group_buffer;
  /* was unchecked: a failed allocation would have crashed at first use */
  if (!(flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE)))
    return 0; /* NOTE(review): decide how thread-start failure is reported */
  while (this_thread_not_killed)
  {
    lock(log_mutex);
    if (checkpoint_request)
      checkpoint(); /* will unlock mutex */
    else
    {
      unlock(log_mutex);
      lock(global_LRD_mutex);
      flush_one_group_from_LRD(); /* will unlock the LRD mutex */
      safemutex_assert_not_owner(global_LRD_mutex);
    }
    my_sleep(1000000); /* one second ? */
  }
  my_free(flush_group_buffer);
  return 0;
}
/*
flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
*/
/*
  Flushes only the first FLUSH_GROUP_SIZE pages of the LRD.

  Must be entered with global_LRD_mutex held; always returns with it
  released. Callers (flush_all_LRD_to_lsn()) test the return value, so the
  function now has an explicit int return type (it was implicit before).

  RETURN
    0  ok
*/
int flush_one_group_from_LRD()
{
  char *ptr;
  safe_mutex_assert_owner(global_LRD_mutex);
  for (page= 0; page<FLUSH_GROUP_SIZE; page++)
  {
    copy_element_to_array;
  }
  /*
    One rule to better observe is "page must be flushed to disk before it is
    removed from LRD" (otherwise checkpoint is incomplete info, corruption).
  */
  unlock(global_LRD_mutex);
  /* page id is concatenation of "file id" and "number of page in file" */
  /*
    was qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, ...): qsort takes
    (base, nmemb, size, compar) — the count and size were swapped.
  */
  qsort(array, FLUSH_GROUP_SIZE, sizeof(*element), by_page_id);
  for (scan_array)
  {
    if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
    {
      /*
        page disappeared since we made the copy (it was flushed to be
        replaced), remove from array (memcpy tail of array over it)...
      */
      continue;
    }
    memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
    pin_page;
    page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
  }
  for (scan_the_array)
  {
    /*
      As an optimization, we try to identify contiguous-in-the-file segments (to
      issue one big write()).
      In non-optimized version, contiguous segment is always only one page.
    */
    if ((next_page.page_id - this_page.page_id) == 1)
    {
      /*
        this page and next page are in same file and are contiguous in the
        file: add page to contiguous segment...
      */
      continue; /* defer write() to next pages */
    }
    /* contiguous segment ends */
    my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);
    /*
      note that if we had doublewrite, doublewrite buffer may prevent us from
      doing this write() grouping (if doublewrite space is shorter).
    */
  }
  /*
    Now remove pages from LRD. As we have pinned them, all pages that we
    managed to pin are still in the LRD, in the same order, we can just cut
    the LRD at the last element of "array". This is more efficient that
    removing element by element (which would take LRD mutex many times) in the
    loop above.
  */
  lock(global_LRD_mutex);
  /* cut LRD by bending LRD->first, free cut portion... */
  unlock(global_LRD_mutex);
  for (scan_array)
  {
    /*
      if the page has a property "modified since last flush" (i.e. which is
      redundant with the presence of the page in the LRD, this property can
      just be a pointer to the LRD element) we should reset it
      (note that then the property would live slightly longer than
      the presence in LRD).
    */
    page_cache_unpin(page_id);
    /*
      order between unpin and removal from LRD is not clear, depends on what
      pin actually is.
    */
  }
  free(array);
  return 0;
}
/* flushes all page from LRD up to approximately rec_lsn>=max_lsn */
/*
  Flushes all pages from the LRD whose rec_lsn is below max_lsn.

  @param max_lsn  flush pages with rec_lsn < max_lsn; MAX_LSN means
                  "everything currently dirty" and is snapshotted to the
                  current tail so the call terminates even while new pages
                  keep being dirtied.

  RETURN
    0  ok (or LRD empty)
    1  error from flush_one_group_from_LRD()
*/
int flush_all_LRD_to_lsn(LSN max_lsn)
{
  lock(global_LRD_mutex);
  /* was unchecked: an empty LRD would have been dereferenced below */
  if (LRD->first == NULL)
  {
    unlock(global_LRD_mutex);
    return 0;
  }
  if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
  {
    /* was LRD->first->prev: use LRD->last, as checkpoint_indirect() does */
    max_lsn= LRD->last->rec_lsn;
  }
  /* re-test LRD->first: a flushed group may have emptied the list */
  while (LRD->first && LRD->first->rec_lsn < max_lsn)
  {
    if (flush_one_group_from_LRD()) /* will unlock mutex */
      return 1;
    /* scheduler may preempt us here so that we don't take full CPU */
    lock(global_LRD_mutex);
  }
  unlock(global_LRD_mutex);
  return 0;
}
/*
WL#3261 Maria - background flushing of the least-recently-dirtied pages
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* This is the interface of this module. */
/* flushes all page from LRD up to approximately rec_lsn>=max_lsn */
int flush_all_LRD_to_lsn(LSN max_lsn);
/*
WL#3072 Maria recovery
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the implementation of this module */
#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"
typedef struct st_record_type_properties {
/* used for debug error messages or "maria_read_log" command-line tool: */
char *name,
my_bool record_ends_group;
int (*record_execute)(RECORD *); /* param will be record header instead later */
} RECORD_TYPE_PROPERTIES;
/*
  One entry per log record type, listed in the order of the "log records
  type" enumeration (so the array can be indexed by record->type).
*/
RECORD_TYPE_PROPERTIES all_record_type_properties[]=
{
  {"REDO_INSERT_HEAD", 0, redo_insert_head_execute},
  ...,
  {"UNDO_INSERT",      1, undo_insert_execute},
  /* was '{"COMMIT", , 1, ...}': stray extra comma */
  {"COMMIT",           1, commit_execute},
  ...
};
/* Executor for REDO_INSERT_HEAD records during the REDO phase. */
/* NOTE(review): stub — declared int but has no return statement yet */
int redo_insert_head_execute(RECORD *record)
{
/* write the data to the proper page */
}
/* Executor for UNDO_INSERT records: rolls one insert back. */
int undo_insert_execute(RECORD *record)
{
/*
  NOTE(review): 'record' is a pointer but is accessed with '.', and
  'short_trans_id' is not declared here — presumably both should come from
  record-> once RECORD is defined; also no return statement yet.
*/
trans_table[short_trans_id].undo_lsn= record.lsn;
/* restore the old version of the row */
}
/* Executor for COMMIT records: marks the transaction committed in memory. */
/* NOTE(review): 'short_trans_id' undeclared (presumably record->short_trans_id)
   and no return statement yet, though the return type is int. */
int commit_execute(RECORD *record)
{
trans_table[short_trans_id].state= COMMITTED;
/*
and that's all: the delete/update handler should not be woken up! as there
may be REDO for purge further in the log.
*/
}
/* Does this record's type terminate a group of log records? */
/* was '...record_ends_group)': stray unbalanced ')' at the end */
#define record_ends_group(R) \
  (all_record_type_properties[(R)->type].record_ends_group)
/*
  Dispatch a record to its type's execution function.
  NOTE(review): uses (R).type while record_ends_group uses (R)->type, yet
  recovery() passes the same 'record' to both — kept as in the original,
  to be unified once RECORD's definition is final.
*/
#define execute_log_record(R) \
  (all_record_type_properties[(R).type].record_execute(R))
int recovery()
{
control_file_create_or_open();
/*
init log handler: tell it that we are going to do large reads of the
log, sequential and backward. Log handler could decide to alloc a big
read-only IO_CACHE for this, or use its usual page cache.
*/
/* read checkpoint log record from log handler */
RECORD *checkpoint_record= log_read_record(last_checkpoint_lsn_at_start);
/* parse this record, build structs (dirty_pages, transactions table, file_map) */
/*
read log records (note: sometimes only the header is needed, for ex during
REDO phase only the header of UNDO is needed, not the 4G blob in the
variable-length part, so I could use that; however for PREPARE (which is a
variable-length record) I'll need to read the full record in the REDO
phase):
*/
record= log_read_record(min(rec_lsn, ...));
/*
if log handler knows the end LSN of the log, we could print here how many
MB of log we have to read (to give an idea of the time), and print
progress notes.
*/
while (record != NULL)
{
/*
A complete group is a set of log records with an "end mark" record
(e.g. a set of REDOs for an operation, terminated by an UNDO for this
operation); if there is no "end mark" record the group is incomplete
and won't be executed.
*/
if (record_ends_group(record)
{
/*
such end events can always be executed immediately (they don't touch
the disk).
*/
execute_log_record(record);
if (trans_table[record.short_trans_id].group_start_lsn != 0)
{
/*
There is a complete group for this transaction.
We're going to read recently read log records:
for this log_read_record() to be efficient (not touch the disk),
log handler could cache recently read pages
(can just use an IO_CACHE of 10 MB to read the log, or the normal
log handler page cache).
Without it only OS file cache will help.
*/
record2= log_read_record(trans_table[record.short_trans_id].group_start_lsn);
while (record2.lsn < record.lsn)
{
if (record2.short_trans_id == record.short_trans_id)
execute_log_record(record2); /* it's in our group */
record2= log_read_next_record();
}
trans_table[record.short_trans_id].group_start_lsn= 0; /* group finished */
/* we're now at the UNDO, re-read it to advance log pointer */
record2= log_read_next_record(); /* and throw it away */
}
}
else /* record does not end group */
{
/* just record the fact, can't know if can execute yet */
if (trans_table[short_trans_id].group_start_lsn == 0) /* group not yet started */
trans_table[short_trans_id].group_start_lsn= record.lsn;
}
/*
Later we can optimize: instead of "execute_log_record(record2)", do
copy_record_into_exec_buffer(record2):
this will just copy record into a multi-record (10 MB?) memory buffer,
and when buffer is full, will do sorting of REDOs per
page id and execute them.
This sorting will enable us to do more sequential reads of the
data/index pages.
Note that updating bitmap pages (when we have executed a REDO for a page
we update its bitmap page) may break the sequential read of pages,
so maybe we should read and cache bitmap pages in the beginning.
Or ok the sequence will be broken, but quickly all bitmap pages will be
in memory and so the sequence will not be broken anymore.
Sorting could even determine, based on physical device of files
("st_dev" in stat()), that some files should be should be taken by
different threads, if we want to do parallism.
*/
/*
Here's how to read a complete variable-length record if needed:
<sanja> read the header, allocate buffer of record length, read whole
record.
*/
record= log_read_next_record();
}
/*
Earlier or here, create true transactions in TM.
If done earlier, note that TM should not wake up the delete/update handler
when it receives a commit info, as existing REDO for purge may exist in
the log, and so the delete/update handler may do changes which conflict
with these REDOs.
Even if done here, better to not wake it up now as we're going to free the
page cache:
*/
/*
We want to have two steps:
engine->recover_with_max_memory();
next_engine->recover_with_max_memory();
engine->init_with_normal_memory();
next_engine->init_with_normal_memory();
So: in recover_with_max_memory() allocate a giant page cache, do REDO
phase, then all page cache is flushed and emptied and freed (only retain
small structures like TM): take full checkpoint, which is useful if
next engine crashes in its recovery the next second.
Destroy all shares (maria_close()), then at init_with_normal_memory() we
do this:
*/
print_information_to_error_log(nb of trans to roll back, nb of prepared trans);
/*
Launch one or more threads to do the background rollback. Don't wait for
them to complete their rollback (background rollback; for debugging, we
can have an option which waits).
Note that InnoDB's rollback-in-background works as long as InnoDB is the
last engine to recover, otherwise MySQL will refuse new connections until
the last engine has recovered so it's not "background" from the user's
point of view. InnoDB is near top of sys_table_types so all others
(e.g. BDB) recover after it... So it's really "online rollback" only if
InnoDB is the only engine.
*/
/* wake up delete/update handler */
/* tell the TM that it can now accept new transactions */
/*
mark that checkpoint requests are now allowed.
*/
/*
when all rollback threads have terminated, somebody should print "rollback
finished" to the error log.
*/
}
/*
  Background thread rolling back its share of the transactions left
  uncommitted at recovery: for each one, follows the undo_lsn chain back
  to 0, applying each UNDO record on the way.
*/
pthread_handler_decl rollback_background_thread()
{
/*
execute the normal runtime-rollback code for a bunch of transactions.
*/
/* NOTE(review): 'trans in list' is pseudocode for iterating this thread's list */
while (trans in list_of_trans_to_rollback_by_this_thread)
{
while (trans->undo_lsn != 0)
{
/* this is the normal runtime-rollback code: */
record= log_read_record(trans->undo_lsn);
execute_log_record(record);
/* each UNDO record links to the transaction's previous UNDO */
trans->undo_lsn= record.prev_undo_lsn;
}
/* remove trans from list */
}
}
/*
WL#3072 Maria recovery
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* This is the interface of this module. */
/* Performs recovery of the engine at start */
int recovery();
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment