Post-vacation-musing fixes to WL#3071 "Maria checkpoint":

changes to how synchronous checkpoint requests are executed. changes to how the background LRD flushing thread refrains from using all resources. See more comments for each file. storage/maria/checkpoint.c: I was not happy that checkpoint requests which want to know the success/error of their executed request, get inaccurate information in case of error (no error string etc). Instead of implementing a more complete communication protocol between requestor and executor, I make the requestor do the execution itself. I call this a synchronous checkpoint. For asynchronous checkpoints (requestor does not want to know success/error, does not want to wait for completion), no change, checkpoint is executed by the background thread. Comments, constants, mutex usage fixes. storage/maria/checkpoint.h: new prototype of "API" (the calls exposed by the checkpoint module) storage/maria/least_recently_dirtied.c: A better solution than sleeping one second after flushing a piece of the LRD: instead we pthread_yield(). Hopefully this will slow down the background thread (avoiding it using all the disk's bandwidth) if there are other threads competing, and will not slow it down if this thread is alone (where we do want it to run fast and not do useless sleeps). This thread will probe for asynchronous checkpoint requests every few seconds.

Post-vacation-musing fixes to WL#3071 "Maria checkpoint":
changes to how synchronous checkpoint requests are executed. changes to how the background LRD flushing thread refrains from using all resources. See more comments for each file. storage/maria/checkpoint.c: I was not happy that checkpoint requests which want to know the success/error of their executed request, get inaccurate information in case of error (no error string etc). Instead of implementing a more complete communication protocol between requestor and executor, I make the requestor do the execution itself. I call this a synchronous checkpoint. For asynchronous checkpoints (requestor does not want to know success/error, does not want to wait for completion), no change, checkpoint is executed by the background thread. Comments, constants, mutex usage fixes. storage/maria/checkpoint.h: new prototype of "API" (the calls exposed by the checkpoint module) storage/maria/least_recently_dirtied.c: A better solution than sleeping one second after flushing a piece of the LRD: instead we pthread_yield(). Hopefully this will slow down the background thread (avoiding it using all the disk's bandwidth) if there are other threads competing, and will not slow it down if this thread is alone (where we do want it to run fast and not do useless sleeps). This thread will probe for asynchronous checkpoint requests every few seconds.
30e5a9bd · unknown · c262b880 · 30e5a9bd · 30e5a9bd · 30e5a9bd
Commit 30e5a9bd authored Jul 24, 2006 by unknown
Showing with 169 additions and 179 deletions

storage/maria/checkpoint.c storage/maria/checkpoint.c +139 -160

storage/maria/checkpoint.h storage/maria/checkpoint.h +3 -7

storage/maria/least_recently_dirtied.c storage/maria/least_recently_dirtied.c +27 -12

No files found.
--- a/storage/maria/checkpoint.c
+++ b/storage/maria/checkpoint.c
@@ -6,12 +6,28 @@

 /* Here is the implementation of this module */

+/*
+  Summary:
+  - there are asynchronous checkpoints (a writer to the log notices that it's
+  been a long time since we last checkpoint-ed, so posts a request for a
+  background thread to do a checkpoint; does not care about the success of the
+  checkpoint). Then the checkpoint is done by the checkpoint thread, at an
+  unspecified moment ("later") (==soon, of course).
+  - there are synchronous checkpoints: a thread requests a checkpoint to
+  happen now and wants to know when it finishes and if it succeeded; then the
+  checkpoint is done by that same thread.
+*/
+
 #include "page_cache.h"
 #include "least_recently_dirtied.h"
 #include "transaction.h"
 #include "share.h"
 #include "log.h"

+/* could also be called LSN_ERROR */
+#define LSN_IMPOSSIBLE ((LSN)0)
+#define LSN_MAX ((LSN)ULONGLONG_MAX)
+
 /*
  this transaction is used for any system work (purge, checkpoint writing
  etc), that is, background threads. It will not be declared/initialized here
@@ -19,43 +35,112 @@
 */
 st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...};

+/* those three are protected by the log's mutex */
 /*
  The maximum rec_lsn in the LRD when last checkpoint was run, serves for the
  MEDIUM checkpoint.
 */
 LSN max_rec_lsn_at_last_checkpoint= 0;
+CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
+CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
+
+/*
+  Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA
+  ENGINE DO CHECKPOINT"), and probably by maria_panic().
+*/
+my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
+{
+  DBUG_ENTER("execute_synchronous_checkpoint");
+  DBUG_ASSERT(level > NONE);
+
+  lock(log_mutex);
+  while ((synchronous_checkpoint_in_progress != NONE) ||
+         (next_asynchronous_checkpoint_to_do != NONE))
+    wait_on_checkpoint_done_cond();
+
+  synchronous_checkpoint_in_progress= level;
+  execute_checkpoint(level);
+  safemutex_assert_owner(log_mutex);
+  synchronous_checkpoint_in_progress= NONE;
+  unlock(log_mutex);
+  broadcast(checkpoint_done_cond);
+}

-/* Picks a checkpoint request and executes it */
-my_bool checkpoint()
+/* Picks a checkpoint request, if there is one, and executes it */
+my_bool execute_asynchronous_checkpoint_if_any()
 {
  CHECKPOINT_LEVEL level;
-  DBUG_ENTER("checkpoint");
+  DBUG_ENTER("execute_asynchronous_checkpoint");
+
+  lock(log_mutex);
+  if (likely(next_asynchronous_checkpoint_to_do == NONE))
+  {
+    unlock(log_mutex);
+    DBUG_RETURN(FALSE);
+  }

-  level= checkpoint_running= checkpoint_request;
+  while (synchronous_checkpoint_in_progress)
+    wait_on_checkpoint_done_cond();
+
+do_checkpoint:
+  level= next_asynchronous_checkpoint_to_do;
+  DBUG_ASSERT(level > NONE);
+  execute_checkpoint(level);
+  safemutex_assert_owner(log_mutex);
+  if (next_asynchronous_checkpoint_to_do > level)
+    goto do_checkpoint;     /* one more request was posted */
+  else
+  {
+    DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level);
+    next_asynchronous_checkpoint_to_do= NONE; /* all work done */
+  }
  unlock(log_mutex);
+  broadcast(checkpoint_done_cond);
+}

-  DBUG_ASSERT(level != NONE);

-  switch (level)
+/*
+  Does the actual checkpointing. Called by
+  execute_synchronous_checkpoint() and
+  execute_asynchronous_checkpoint_if_any().
+*/
+my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
+{
+  LSN candidate_max_rec_lsn_at_last_checkpoint;
+  /* to avoid { lock + no-op + unlock } in the common (==indirect) case */
+  my_bool need_log_mutex;
+                    
+  DBUG_ENTER("execute_checkpoint");
+
+  safemutex_assert_owner(log_mutex);
+  copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint;
+
+  if (unlikely(need_log_mutex= (level > INDIRECT)))
  {
-  case FULL:
-    /* flush all pages up to the current end of the LRD */
-    flush_all_LRD_to_lsn(MAX_LSN); /* MAX_LSN==ULONGLONG_MAX */
-    /* this will go full speed (normal scheduling, no sleep) */
-    break;
-  case MEDIUM:
-    /*
-      flush all pages which were already dirty at last checkpoint:
-      ensures that recovery will never start from before the next-to-last
-      checkpoint (two-checkpoint rule).
-      It is max, not min as the WL says (TODO update WL).
-    */
-    flush_all_LRD_to_lsn(max_rec_lsn_at_last_checkpoint);
-    /* this will go full speed (normal scheduling, no sleep) */
-    break;
+    /* much I/O work to do, release log mutex */
+    unlock(log_mutex);
+    
+    switch (level)
+    {
+    case FULL:
+      /* flush all pages up to the current end of the LRD */
+      flush_all_LRD_to_lsn(LSN_MAX);
+      /* this will go full speed (normal scheduling, no sleep) */
+      break;
+    case MEDIUM:
+      /*
+        flush all pages which were already dirty at last checkpoint:
+        ensures that recovery will never start from before the next-to-last
+        checkpoint (two-checkpoint rule).
+        It is max, not min as the WL says (TODO update WL).
+      */
+      flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint);
+      /* this will go full speed (normal scheduling, no sleep) */
+      break;
+    }
  }
-  
-  error= checkpoint_indirect();
+
+  candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex);

  lock(log_mutex);
  /*
@@ -66,13 +151,22 @@ my_bool checkpoint()
    file in the hook but that would be an I/O under the log's mutex, bad.
    - it would not be nice organisation of code (I tried it :).
  */
-  mark_checkpoint_done(error);
-  unlock(log_mutex);
-  DBUG_RETURN(error);
+  if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
+  {
+    /* checkpoint succeeded */
+    maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
+    written_since_last_checkpoint= (my_off_t)0;
+    DBUG_RETURN(FALSE);
+  }
+  /*
+    keep mutex locked because callers will want to clear mutex-protected
+    status variables
+  */
+  DBUG_RETURN(TRUE);
 }


-my_bool checkpoint_indirect()
+LSN checkpoint_indirect(my_bool need_log_mutex)
 {
  DBUG_ENTER("checkpoint_indirect");

@@ -90,7 +184,8 @@ my_bool checkpoint_indirect()
  DBUG_ASSERT(sizeof(byte *) <= 8);
  DBUG_ASSERT(sizeof(LSN) <= 8);

-  lock(log_mutex); /* will probably be in log_read_end_lsn() already */
+  if (need_log_mutex)
+    lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */
  checkpoint_start_lsn= log_read_end_lsn();
  unlock(log_mutex);

@@ -196,15 +291,13 @@ my_bool checkpoint_indirect()
  checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
                                   &system_trans, string_array);

-  if (0 == checkpoint_lsn) /* maybe 0 is impossible LSN to indicate error ? */
+  if (LSN_IMPOSSIBLE == checkpoint_lsn)
    goto err;

  if (0 != control_file_write_and_force(checkpoint_lsn, NULL))
    goto err;

-  maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
-
-  DBUG_RETURN(0);
+  DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint);

 err:

@@ -213,7 +306,7 @@ my_bool checkpoint_indirect()
  my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
  my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));

-  DBUG_RETURN(1);
+  DBUG_RETURN(LSN_IMPOSSIBLE);
 }


@@ -235,7 +328,7 @@ log_write_record(...)
      ask one system thread (the "LRD background flusher and checkpointer
      thread" WL#3261) to do a checkpoint
    */
-    request_checkpoint(INDIRECT, 0 /*wait_for_completion*/);
+    request_asynchronous_checkpoint(INDIRECT);
  }
  ...;
  unlock(log_mutex);
@@ -243,152 +336,38 @@ log_write_record(...)
 }

 /*
-  Call this when you want to request a checkpoint.
-  In real life it will be called by log_write_record() and by client thread
+  Requests a checkpoint from the background thread, *asynchronously*
+  (requestor does not wait for completion, and does not even later check the
+  result).
+  In real life it will be called by log_write_record().
  which explicitely wants to do checkpoint (ALTER ENGINE CHECKPOINT
  checkpoint_level).
 */
-int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion)
+void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
 {
-  int error= 0;
-  /*
-    If caller wants to wait for completion we'll have to release the log mutex
-    to wait on condition, if caller had log mutex he may not be happy that we
-    release it, so we check that caller didn't have log mutex.
-  */
-  if (wait_for_completion)
-  {
-    lock(log_mutex);
-  }
-  else
-    safemutex_assert_owner(log_mutex);
+  safemutex_assert_owner(log_mutex);

-  DBUG_ASSERT(checkpoint_request >= checkpoint_running);
  DBUG_ASSERT(level > NONE);
  if (checkpoint_request < level)
  {
    /* no equal or stronger running or to run, we post request */
    /*
      note that thousands of requests for checkpoints are going to come all
-      at the same time (when the log bound is passed), so it may not be a good
-      idea for each of them to broadcast a cond. We just don't broacast a
-      cond, the checkpoint thread will wake up in max one second.
+      at the same time (when the log bound
+      MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a
+      good idea for each of them to broadcast a cond to wake up the background
+      checkpoint thread. We just don't broacast a cond, the checkpoint thread
+      will notice our request in max a few seconds.
    */
    checkpoint_request= level; /* post request */
  }
-   
-  if (wait_for_completion)
-  {
-    uint checkpoints_done_copy= checkpoints_done;
-    uint checkpoint_errors_copy= checkpoint_errors;
-    /*
-      note that the "==done" works when the uint counter wraps too, so counter
-      can even be smaller than uint if we wanted (however it should be big
-      enough so that max_the_int_type checkpoints cannot happen between two
-      wakeups of our thread below). uint sounds fine.
-      Wait for our checkpoint to be done:
-    */
-
-    if (checkpoint_running != NONE) /* not ours, let it pass */
-    {
-      while (1)
-      {
-        if (checkpoints_done != checkpoints_done_copy)
-        {
-          if (checkpoints_done == (checkpoints_done_copy+1))
-          {
-            /* not our checkpoint, forget about it */
-            checkpoints_done_copy= checkpoints_done;
-          }
-          break; /* maybe even ours has been done at this stage! */
-        }
-        cond_wait(checkpoint_done_cond, log_mutex);
-      }
-    }
-
-    /* now we come to waiting for our checkpoint */
-    while (1)
-    {
-      if (checkpoints_done != checkpoints_done_copy)
-      {
-        /* our checkpoint has been done */
-        break;
-      }
-      if (checkpoint_errors != checkpoint_errors_copy)
-      {
-        /*
-          the one which was running a few milliseconds ago (if there was one),
-          and/or ours, had an error, just assume it was ours. So there
-          is a possibility that we return error though we succeeded, in which
-          case user will have to retry; but two simultanate checkpoints have
-          high changes to fail together (as the error probably comes from
-          malloc or disk write problem), so chance of false alarm is low.
-          Reporting the error only to the one which caused the error would
-          require having a (not fixed size) list of all requests, not worth it.
-        */
-        error= 1;
-        break;
-      }
-      cond_wait(checkpoint_done_cond, log_mutex);
-    }
-    unlock(log_mutex);
-  } /* ... if (wait_for_completion) */

  /*
-    If wait_for_completion was false, and there was an error, only an error
+    If there was an error, only an error
    message to the error log will say it; normal, for a checkpoint triggered
    by a log write, we probably don't want the client's log write to throw an
    error, as the log write succeeded and a checkpoint failure is not
    critical: the failure in this case is more for the DBA to know than for
    the end user.
  */
-  return error;
-}
-
-void mark_checkpoint_done(int error)
-{
-  safemutex_assert_owner(log_mutex);
-  if (error)
-    checkpoint_errors++;
-  /* a checkpoint is said done even if it had an error */
-  checkpoints_done++;
-  if (checkpoint_request == checkpoint_running)
-  {
-    /*
-      No new request has been posted, so we satisfied all requests, forget
-      about them.
-    */
-    checkpoint_request= NONE;
-  }
-  checkpoint_running= NONE;
-  written_since_last_checkpoint= 0;
-  broadcast(checkpoint_done_cond);
-}
-
-/*
-  Alternative (not to be done, too disturbing):
-  do the autocheckpoint in the thread which passed the bound first (and do the
-  checkpoint in the client thread which requested it).
-  It will give a delay to that client thread which passed the bound (time to
-  fsync() for example 1000 files is 16 s on my laptop). Here is code for
-  explicit and implicit checkpoints, where client thread does the job:
-*/
-#if 0
-{
-  lock(log_mutex); /* explicit takes it here, implicit already has it */
-  while (checkpoint_running != NONE)
-  {
-    if (checkpoint_running >= my_level) /* always true for auto checkpoints */
-      goto end; /* we skip checkpoint */
-    /* a less strong is running, I'll go next */
-    wait_on_checkpoint_done_cond();
-  }
-  checkpoint_running= my_level;
-  checkpoint(my_level); // can gather checkpoint_start_lsn before unlock
-  lock(log_mutex);
-  checkpoint_running= NONE;
-  written_since_last_checkpoint= 0;
-end:
-  unlock(log_mutex);
 }
-#endif
--- a/storage/maria/checkpoint.h
+++ b/storage/maria/checkpoint.h
@@ -13,11 +13,7 @@ typedef enum enum_checkpoint_level {
  FULL /* also flush all dirty pages */
 } CHECKPOINT_LEVEL;

-/*
-  Call this when you want to request a checkpoint.
-  In real life it will be called by log_write_record() and by client thread
-  which explicitely wants to do checkpoint (ALTER ENGINE CHECKPOINT
-  checkpoint_level).
-*/
-int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion);
+void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
+my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level);
+my_bool execute_asynchronous_checkpoint_if_any();
 /* that's all that's needed in the interface */
--- a/storage/maria/least_recently_dirtied.c
+++ b/storage/maria/least_recently_dirtied.c
@@ -48,6 +48,15 @@
  Key cache has groupping already somehow Monty said (investigate that).
 */
 #define FLUSH_GROUP_SIZE 512 /* 8 MB */
+/*
+  We don't want to probe for checkpoint requests all the time (it takes
+  the log mutex).
+  If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s
+  (1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every
+  16*8=128MB which is every 128/30=4.2second.
+  Using a power of 2 gives a fast modulo operation.
+*/
+#define CHECKPOINT_PROBING_PERIOD_LOG2 4

 /*
  This thread does background flush of pieces of the LRD, and all checkpoints.
@@ -56,19 +65,19 @@
 pthread_handler_decl background_flush_and_checkpoint_thread()
 {
  char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
+  uint flush_calls= 0;
  while (this_thread_not_killed)
  {
-    lock(log_mutex);
-    if (checkpoint_request)
-      checkpoint(); /* will unlock mutex */
-    else
-    {
-      unlock(log_mutex);
-      lock(global_LRD_mutex);
-      flush_one_group_from_LRD();
-      safemutex_assert_not_owner(global_LRD_mutex);
-    }
-    my_sleep(1000000); /* one second ? */
+    if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
+      execute_asynchronous_checkpoint_if_any();
+    lock(global_LRD_mutex);
+    flush_one_group_from_LRD();
+    safemutex_assert_not_owner(global_LRD_mutex);
+    /*
+      We are a background thread, leave time for client threads or we would
+      monopolize the disk:
+    */
+    pthread_yield();
  }
  my_free(flush_group_buffer);
 }
@@ -155,6 +164,12 @@ flush_one_group_from_LRD()
    */
  }
  free(array);
+  /*
+    MikaelR noted that he observed that Linux's file cache may never fsync to
+    disk until this cache is full, at which point it decides to empty the
+    cache, making the machine very slow. A solution was to fsync after writing
+    2 MB.
+  */
 }

 /* flushes all page from LRD up to approximately rec_lsn>=max_lsn */
@@ -165,7 +180,7 @@ int flush_all_LRD_to_lsn(LSN max_lsn)
    max_lsn= LRD->first->prev->rec_lsn;
  while (LRD->first->rec_lsn < max_lsn)
  {
-    if (flush_one_group_from_LRD()) /* will unlock mutex */
+    if (flush_one_group_from_LRD()) /* will unlock LRD mutex */
      return 1;
    /* scheduler may preempt us here so that we don't take full CPU */
    lock(global_LRD_mutex);