DS-MRR improvements: better comments, use symbolic name instead of +1/-1 constants.

18a34850 · Sergey Petrunya · 7b9df6aa · 18a34850 · 18a34850
Commit 18a34850 authored Sep 19, 2010 by Sergey Petrunya
Show whitespace changes
Inline Side-by-side

Showing with 137 additions and 72 deletions

sql/multi_range_read.cc sql/multi_range_read.cc +18 -8

sql/multi_range_read.h sql/multi_range_read.h +119 -64

No files found.
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -338,6 +338,7 @@ bool SimpleBuffer::have_space_for(size_t bytes)
    return (write_pos - bytes >= start);
 }

+
 size_t SimpleBuffer::used_size()
 {
  return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
@@ -354,16 +355,18 @@ void SimpleBuffer::setup_reading(uchar **data1, size_t len1,
  read_size2= len2;
 }

+
 bool SimpleBuffer::read()
 {
  if (!have_data(read_size1 + (read_ptr2? read_size2 : 0)))
    return TRUE;
-  *read_ptr1 =read(read_size1);
+  *read_ptr1= read(read_size1);
  if (read_ptr2)
    *read_ptr2= read(read_size2);
  return FALSE;
 }

+
 uchar *SimpleBuffer::read(size_t bytes)
 {
  DBUG_ASSERT(have_data(bytes));
@@ -381,12 +384,14 @@ uchar *SimpleBuffer::read(size_t bytes)
  }
 }

+
 bool SimpleBuffer::have_data(size_t bytes)
 {
  return (direction == 1)? (write_pos - read_pos >= (ptrdiff_t)bytes) : 
                           (read_pos - write_pos >= (ptrdiff_t)bytes);
 }

+
 void SimpleBuffer::reset_for_writing()
 {
  if (direction == 1)
@@ -493,7 +498,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
  */
  full_buf= buf->buffer;
  full_buf_end= buf->buffer_end;
-  rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1);
+  rowid_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
  
  if (do_sort_keys)
  {
@@ -819,10 +824,11 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
  if (!do_rowid_fetch)
  {
    /* Give all space to key buffer. */
-    key_buffer.set_buffer_space(full_buf, full_buf_end, 1);
+    key_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);

    /* Just in case, tell rowid buffer that it has zero size: */
-    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 1);
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 
+                                  SimpleBuffer::FORWARD);
    return;
  }
  
@@ -865,8 +871,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
  }

  rowid_buffer_end= full_buf + bytes_for_rowids;
-  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
-  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1); 
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 
+                                SimpleBuffer::FORWARD);
+  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, 
+                              SimpleBuffer::BACKWARD); 
 }


@@ -906,8 +914,10 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
        We're using two buffers and both of them are empty now. Restore the
        original sizes
      */
-      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
-      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
+      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end,
+                                    SimpleBuffer::FORWARD);
+      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end,
+                                  SimpleBuffer::BACKWARD);
    }
    key_buffer.reset_for_writing();
    key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,

--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -6,9 +6,14 @@
 /**
  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)

-  This is a "plugin"(*) for storage engines that allows make index scans 
-  read table rows in rowid order. For disk-based storage engines, this is
-  faster than reading table rows in whatever-SQL-layer-makes-calls-in order.
+  This is a "plugin"(*) for storage engines that allows to
+    1. When doing index scans, read table rows in rowid order;
+    2. when making many index lookups, do them in key order and don't
+       lookup the same key value multiple times;
+    3. Do both #1 and #2, when applicable.
+  These changes are expected to speed up query execution for disk-based 
+  storage engines running io-bound loads and "big" queries (ie. queries that
+  do joins and enumerate lots of records).

  (*) - only conceptually. No dynamic loading or binary compatibility of any
        kind.
@@ -17,13 +22,17 @@
   
      SQL Layer code
       |   |   |
-      -v---v---v---- handler->multi_range_read_XXX() function calls
+       v   v   v 
+      -|---|---|---- handler->multi_range_read_XXX() function calls
       |   |   |
-      ____________________________________
+      _____________________________________
     / DS-MRR module                       \
-     |  (scan indexes, order rowids, do    |
-     |   full record reads in rowid order) |
-     \____________________________________/
+     | (order/de-duplicate lookup keys,    |
+     | scan indexes in key order,          |
+     | order/de-duplicate rowids,          |
+     | retrieve full record reads in rowid |
+     | order)                              |
+     \_____________________________________/
       |   |   |
      -|---|---|----- handler->read_range_first()/read_range_next(), 
       |   |   |      handler->index_read(), handler->rnd_pos() calls.
@@ -31,6 +40,7 @@
       v   v   v
      Storage engine internals

+
  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
  Potentially it can be used with any table handler that has disk-based data
  storage and has better performance when reading data in rowid order.
@@ -38,77 +48,104 @@


 /*
-  A simple memory buffer for reading and writing.
+  An in-memory buffer used by DS-MRR implementation. 
+  - The buffer contains fixed-size elements. The elements are either atomic
+    byte sequences or pairs.
+  - The buffer resides in memory provided by the user. It is possible to
+     = dynamically (ie. between write operations) add ajacent memory space to
+       the buffer
+     = dynamically remove unused space from the buffer.
+  - Buffer can be set to be either "forward" or "backward". 
+
+  The intent of the last two properties is to allow to have two buffers on
+  adjacent memory space, one is being read from (and so its space shrinks)
+  while the other is being written to (and so it needs more and more space).
+
+  Illustration of forward buffer operation:
+
+                         +-- next read will read from here
+                         |
+                         |               +-- next write will write to here
+                         v               v
+        *--------------*===============*----------------*
+        |       ^      |          ^    |                |
+        |       |      read_pos   |    write_pos        |
+        start   |                 |                     end
+                |                 |            
+              usused space         user data

-  when writing, there is no user-visible "current" position, although
-  internally 'pos' points to just after the end of used area  (or at the 
-  start of it for reverse buffer).
-
-  When reading, there is current position pointing at start (for reverse
-  buffer, end) of the element that will be read next.
-   ^^ why end for reverse? it's more logical to point at start 
 */

 class SimpleBuffer
 {
-  uchar *start;
-  uchar *end;
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /* buffer is filled/read from bigger to smaller memory addresses */
+    FORWARD=1  /* buffer is filled/read from smaller to bigger memory addresses */
+  };
+
+private:
+  enum_direction direction;
+
+  uchar *start; /* points to start of buffer space */
+  uchar *end;   /* points to just beyond the end of buffer space */
+  /*
+    Forward buffer: points to the start of the data that will be read next
+    Backward buffer: points to just beyond the end of the data that will be 
+    read next.
+  */
  uchar *read_pos;
+  /*
+    Forward buffer: points to just after the end of the used area.
+    Backward buffer: points to the start of used area.
+  */
  uchar *write_pos;

  /* 
-     1 <=> buffer grows/is filled/is read from start to end
-    -1 <=> everthing is done from end to start instead.
+    Data to be written. write() call will assume that (*write_ptr1) points to 
+    write_size1 bytes of data to be written.
+    If write_ptr2!=NULL then the buffer stores pairs, and (*write_ptr2) points
+    to write_size2 bytes of data that form the second component.
  */
-  int direction;
-  
-  /* Pointers to read data from */
  uchar **write_ptr1;
  size_t write_size1;
-  /* Same as above, but may be NULL */
  uchar **write_ptr2;
  size_t write_size2;

-  /* Pointers to write data to */
+  /*
+    read() will do reading by storing pointer to read data into *read_ptr1 (if
+    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
+    the buffer stores pairs).
+  */
+  //TODO if write_size1 == read_size1 why have two variables??
  uchar **read_ptr1;
  size_t read_size1;
-  /* Same as above, but may be NULL */
  uchar **read_ptr2;
  size_t read_size2;

-  bool have_space_for(size_t bytes);
-  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
-  size_t used_size();
-
-  void write(const uchar *data, size_t bytes);
-  uchar *read(size_t bytes);
-
 public:
-  /* Set up writing*/
+  /* Write-mode functions */
  void setup_writing(uchar **data1, size_t len1, 
                     uchar **data2, size_t len2);
-
-  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
-
-  /* Write-mode functions */
  void reset_for_writing();
-  void write();
  bool can_write();
-
-  bool is_empty() { return used_size() == 0; }
+  void write();

  /* Read-mode functions */
+  bool is_empty() { return used_size() == 0; }
  void reset_for_reading();
-  // todo: join with setup-writing? (but what for?)
  void setup_reading(uchar **data1, size_t len1, 
                     uchar **data2, size_t len2);
  bool read();

-  bool have_data(size_t bytes);
+  /* Misc functions */
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
+  bool is_reverse() { return direction == BACKWARD; }
  uchar *end_of_space();

-  /* Control functions */
-  void set_buffer_space(uchar *start_arg, uchar *end_arg, int direction_arg) 
+  /* Buffer space control functions */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg, enum_direction direction_arg) 
  {
    start= start_arg;
    end= end_arg;
@@ -118,8 +155,8 @@ public:
  }

  /*
-    Stop/return the unneded space (the one that we have wrote to and have read
-    from.
+    Stop using/return the unneded space (the one that we have already wrote 
+    to read from).
  */
  void remove_unused_space(uchar **unused_start, uchar **unused_end)
  {
@@ -142,9 +179,8 @@ public:
    uchar *tmp= read_pos;
    read_pos= write_pos;
    write_pos= tmp;
-    direction= -direction;
+    direction= (direction == FORWARD)? BACKWARD: FORWARD;
  }
-  bool is_reverse() { return direction == -1; }

  void grow(uchar *unused_start, uchar *unused_end)
  {
@@ -173,10 +209,13 @@ public:
  */
  class PeekIterator
  {
-    // if direction==1 : pointer to what to return next
-    // if direction==-1: pointer to the end of what is to be returned next
+    /*
+      if sb->direction==1 : pointer to what to return next
+      if sb->direction==-1: pointer to the end of what is to be returned next
+    */
    uchar *pos;
    SimpleBuffer *sb;
+    
  public:
    void init(SimpleBuffer *sb_arg)
    {
@@ -190,8 +229,10 @@ public:
    */
    bool read_next()
    {
-      // Always read the first component first? (because we do inverted-writes
-      // if needed, so no measures need to be taken here).
+      /* 
+        Always read the first component first (if the buffer is backwards, we
+        have written the second component first).
+      */
      uchar *res;
      if ((res= get_next(sb->read_size1)))
      {
@@ -223,6 +264,16 @@ public:
      }
    }
  };
+
+private:
+  bool have_space_for(size_t bytes);
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return (direction == FORWARD)? read_pos : write_pos; }
+  size_t used_size();
+
+  void write(const uchar *data, size_t bytes);
+  uchar *read(size_t bytes);
+  bool have_data(size_t bytes);
 };


@@ -231,11 +282,11 @@ public:
  each ha_{myisam/innobase/etc} object. That object will be further referred to
  as "the handler"

-  There are actually three strategies
-   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
+  DsMrr_impl has the following execution strategies:
+   S1. Bypass DS-MRR, pass all calls to default MRR implementation (i.e. to
      MRR-to-non-MRR calls converter)
-   S2. Regular DS-MRR 
-   S3. DS-MRR/CPK for doing scans on clustered primary keys.
+   S2. Sort Keys
+   S3. Sort Rowids

  S1 is used for cases which DS-MRR is unable to handle for some reason.

@@ -294,9 +345,13 @@ private:
  handler *h;
  TABLE *table; /* Always equal to h->table */

-  /* Secondary handler object.  It is used for scanning the index */
+  /*
+    Secondary handler object, if needed (we need it when we need to both scan
+    the index and return rows).
+  */
  handler *h2;
  
+  /* Full buffer that we're using (the buffer is obtained from SQL layer) */
  uchar *full_buf;
  uchar *full_buf_end;