DS-MRR, key-ordered retrievals: commit for buildbot

e25db795 · Sergey Petrunya · 7391175d · e25db795 · e25db795
Commit e25db795 authored Aug 08, 2010 by Sergey Petrunya
Show whitespace changes
Inline Side-by-side

Showing with 581 additions and 255 deletions

sql/multi_range_read.cc sql/multi_range_read.cc +444 -243

sql/multi_range_read.h sql/multi_range_read.h +137 -12

No files found.
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -287,6 +287,84 @@ scan_it_again:
 * DS-MRR implementation 
 ***************************************************************************/

+void SimpleBuffer::write(const uchar *data, size_t bytes)
+{
+  DBUG_ASSERT(have_space_for(bytes));
+
+  if (direction == -1)
+    write_pos -= bytes;
+
+  memcpy(write_pos, data, bytes);
+
+  if (direction == 1)
+    write_pos += bytes;
+}
+
+bool SimpleBuffer::have_space_for(size_t bytes)
+{
+  if (direction == 1)
+    return (write_pos + bytes < end);
+  else
+    return (write_pos - bytes >= start);
+}
+
+size_t SimpleBuffer::used_size()
+{
+  return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
+}
+
+uchar *SimpleBuffer::read(size_t bytes)
+{
+  DBUG_ASSERT(have_data(bytes));
+  uchar *res;
+  if (direction == 1)
+  {
+    res= read_pos;
+    read_pos += bytes;
+    return res;
+  }
+  else
+  {
+    read_pos= read_pos - bytes;
+    return read_pos;
+  }
+}
+
+bool SimpleBuffer::have_data(size_t bytes)
+{
+  return (direction == 1)? (write_pos - read_pos >= (ptrdiff_t)bytes) : 
+                           (read_pos - write_pos >= (ptrdiff_t)bytes);
+}
+
+void SimpleBuffer::reset_for_writing()
+{
+  if (direction == 1)
+    write_pos= read_pos= start;
+  else
+    write_pos= read_pos= end;
+}
+
+void SimpleBuffer::reset_for_reading()
+{
+/*
+Do we need this at all?
+  if (direction == 1)
+    pos= start;
+  else
+    pos= end;
+//end?
+*/
+}
+
+uchar *SimpleBuffer::end_of_space()
+{
+  if (direction == 1)
+    return start;
+  else
+    return end;
+//TODO: check this.
+}
+
 /**
  DS-MRR: Initialize and start MRR scan

@@ -309,7 +387,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                           void *seq_init_param, uint n_ranges, uint mode,
                           HANDLER_BUFFER *buf)
 {
-  uint elem_size;
  Item *pushed_cond= NULL;
  handler *new_h2= 0;
  THD *thd= current_thd;
@@ -331,68 +408,68 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
  use_default_impl= FALSE;
  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
  
-  // psergey2: split the buffer:
  /*
-  psergey2-note: we can't split the buffer here because we don't know how key
-  length. we'll only be able to do it when we've got the first range.
+    Figure out what steps we'll need to do
  */
+  do_sort_keys= FALSE;
  if ((mode & HA_MRR_SINGLE_POINT) && 
       optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
  {
-    //do_sort_keys= TRUE; // will use key buffer to sort keys;
+    do_sort_keys= TRUE;
    use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
  }
-  /*
+
  do_rowid_fetch= FALSE;
-  if (!doing_cpk_scan && !index_only_read)
+  doing_cpk_scan= check_cpk_scan(h->active_index, mode);
+  if (!doing_cpk_scan /* && !index_only_read */)
  {
-    do_rowid_fetch= TRUE; //will use rowid buffer to store/sort rowids, etc
+    /* Will use rowid buffer to store/sort rowids, etc */
+    do_rowid_fetch= TRUE;
  }
+  DBUG_ASSERT(do_sort_keys || do_rowid_fetch);

+  full_buf= buf->buffer;
+  full_buf_end= buf->buffer_end;
  
-  if (do_sort_keys && do_rowid_fetch)
-  {
-    split buffer space proportionally
-  }
-  else
-  {
-    // give all space to one buffer
-    if (do_sort_keys)
-    {
-      //sort_buffer_start= ...;
-    }
-    else 
-    {
-      DBUG_ASSERT(do_rowid_fetch);
-      //rowid_buffer_start= ...;
-    }
-  }
+  /* 
+    At start, alloc all of the buffer for rowids. Key sorting code will grab a
+    piece if necessary.
  */
-  mrr_buf= buf->buffer;
-  mrr_buf_end= buf->buffer_end;
+  rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1);

  if (is_mrr_assoc)
    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
  
-  if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode)))
+  /*
+    psergey2-todo: for CPK scans:
+     - use MRR irrespectively of @@mrr_sort_keys setting,
+     - dont do rowid retrieval.
+  */
+  if (do_sort_keys)
  {
    /* It's a DS-MRR/CPK scan */
-    cpk_tuple_length= 0; /* dummy value telling it needs to be inited */
-    cpk_have_range= FALSE;
+    key_tuple_length= 0; /* dummy value telling it needs to be inited */
+    key_buff_elem_size= 0;
+    in_index_range= FALSE;
    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
    h->mrr_funcs= *seq_funcs;
+    keyno= h->active_index != MAX_KEY? h->active_index : h2->active_index;
    dsmrr_fill_key_buffer();
    
-    if (dsmrr_eof)
-      buf->end_of_used_area= mrr_buf_last;
-    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
+    if (dsmrr_eof && !do_rowid_fetch)
+      buf->end_of_used_area= key_buffer.end_of_space();
  }

-  /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
-  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size;
-  mrr_buf_end= mrr_buf_last;
+  if (!do_rowid_fetch)
+  {
+    /* 
+      We have the keys and won't need to fetch rowids, as key lookup will be
+      the last operation, done in multi_range_read_next().
+    */
+    DBUG_RETURN(0);
+  }

+  rowid_buff_elem_size= h->ref_length + (is_mrr_assoc? sizeof(char*) : 0);
  /*
    psergey2: this is only needed when 
      - doing a rowid-to-row scan
@@ -416,7 +493,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
      DBUG_RETURN(1);
    DBUG_ASSERT(h->active_index != MAX_KEY);
-    uint mrr_keyno= h->active_index;
+    keyno= h->active_index;

    /* Create a separate handler object to do rnd_pos() calls. */
    if (!(new_h2= h->clone(thd->mem_root)) || 
@@ -426,7 +503,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
      DBUG_RETURN(1);
    }

-    if (mrr_keyno == h->pushed_idx_cond_keyno)
+    if (keyno == h->pushed_idx_cond_keyno)
      pushed_cond= h->pushed_idx_cond;

    /*
@@ -443,12 +520,14 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    h2= new_h2; /* Ok, now can put it into h2 */
    table->prepare_for_position();
    h2->extra(HA_EXTRA_KEYREAD);
+    h2->mrr_funcs= *seq_funcs; //psergey3-todo: sort out where to store
+    h2->mrr_iter= h->mrr_iter;

-    if (h2->ha_index_init(mrr_keyno, FALSE))
+    if (h2->ha_index_init(keyno, FALSE))
      goto error;

    if (pushed_cond)
-      h2->idx_cond_push(mrr_keyno, pushed_cond);
+      h2->idx_cond_push(keyno, pushed_cond);
  }
  else
  {
@@ -468,9 +547,14 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
      goto error;
  }
  
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                         mode, buf) ||
-      dsmrr_fill_rowid_buffer())
+  if (!do_sort_keys && 
+      h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
+                                         mode, buf))
+  {
+    goto error;
+  }
+
+  if (dsmrr_fill_rowid_buffer())
  {
    goto error;
  }
@@ -479,7 +563,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    adjust *buf to indicate that the remaining buffer space will not be used.
  */
  if (dsmrr_eof) 
-    buf->end_of_used_area= mrr_buf_last;
+    buf->end_of_used_area= rowid_buffer.end_of_space();

  /*
     h->inited == INDEX may occur when 'range checked for each record' is
@@ -526,21 +610,24 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)


 /**
-  DS-MRR: Fill the buffer with rowids and sort it by rowid
+  DS-MRR: Fill and sort the rowid buffer

  {This is an internal function of DiskSweep MRR implementation}
+
  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
  buffer. When the buffer is full or scan is completed, sort the buffer by 
  rowid and return.
  
  The function assumes that rowids buffer is empty when it is invoked. 

+  New2:
+    we will need to scan either 
+     - the source sequence getting records
+     - use dsmrr_next_from_index..
+
  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
  scanning.

-  psergey2: this func will 'fill the rowid buffer'. If filling the rowid buffer 
-  requires that key buffer is filled/sorted first, will do that, too.
-  
  @param h  Table handler

  @retval 0      OK, the next portion of rowids is in the buffer,
@@ -554,40 +641,32 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
  int res;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
  
-  mrr_buf_cur= mrr_buf;
-  mrr_buf_next_identical= mrr_buf_cur;
-  /*
-    psergey2-todo:
-      - call here fill/sort key buffer, if needed.
-
-    psergey2-todo: then, get keys either from
-      - multi_range_read_next()
-      - sorted key buffer
+  rowid_buffer.reset_for_writing();
+  identical_rowid_ptr= NULL;

-    psergey2-todo: if we're traversing an ordered key sequence,
-     check if next keys are the same as previous.
-     (note that it's easy as ordered sequence allows forward/backward
-     navigation so we don't need to buffer things)
-  */
-  while ((mrr_buf_cur < mrr_buf_end) && 
-         !(res= h2->handler::multi_range_read_next(&range_info)))
+  while (rowid_buffer.have_space_for(rowid_buff_elem_size))
  {
+    if (do_sort_keys)
+      res= dsmrr_next_from_index(&range_info);
+    else 
+      res= h2->handler::multi_range_read_next(&range_info);
+
+    if (res)
+      break;
+    
+
    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
-    if (h2->mrr_funcs.skip_index_tuple &&
+    if (!do_sort_keys && /* If keys are sorted then this check is already done */
+        h2->mrr_funcs.skip_index_tuple &&
        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
      continue;

-
    /* Put rowid, or {rowid, range_id} pair into the buffer */
    h2->position(table->record[0]);
-    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
-    mrr_buf_cur += h2->ref_length;
+    rowid_buffer.write(h2->ref, h2->ref_length);

    if (is_mrr_assoc)
-    {
-      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
-      mrr_buf_cur += sizeof(void*);
-    }
+      rowid_buffer.write((uchar*)&range_info, sizeof(void*));
  }

  if (res && res != HA_ERR_END_OF_FILE)
@@ -596,12 +675,11 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()

  /* Sort the buffer contents by rowid */
  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
+  uint n_rowids= rowid_buffer.used_size() / elem_size;
+  
+  my_qsort2(rowid_buffer.used_area(), n_rowids, elem_size, 
+            (qsort2_cmp)rowid_cmp, (void*)h);

-  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
-            (void*)h);
-  mrr_buf_last= mrr_buf_cur;
-  mrr_buf_cur=  mrr_buf;
  DBUG_RETURN(0);
 }

@@ -616,8 +694,8 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
  DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
  TABLE *table= dsmrr->h->table;
-  
-  KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
+  int res;
+  KEY_PART_INFO *part= table->key_info[dsmrr->keyno].key_part;
  
  if (dsmrr->use_key_pointers)
  {
@@ -626,15 +704,31 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
    key2= *((uchar**)key2);  // todo is this alignment-safe?
  }

-  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
+  uchar *key1_end= key1 + dsmrr->key_tuple_length;

  while (key1 < key1_end)
  {
    Field* f = part->field;
    int len = part->store_length;
-    int res = f->cmp(key1, key2);
-    if (res)
+    if (part->null_bit)
+    {
+      if (*key1) // key1 == NULL
+      {
+        if (!*key2) // key1(NULL) < key2(notNULL)
+          return -1;
+        goto equals;
+      }
+      else if (*key2) // key1(notNULL) > key2 (NULL)
+        return 1;
+      // Step over NULL byte for f->cmp().
+      key1++;
+      key2++;
+      len--;
+    }
+    
+    if ((res= f->key_cmp(key1, key2)))
      return res;
+equals:
    key1 += len;
    key2 += len;
    part++;
@@ -643,6 +737,68 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 }


+/*
+  Setup key/rowid buffer sizes based on sample_key
+
+  DESCRIPTION
+    Setup key/rowid buffer sizes based on sample_key and its length.
+
+    This function must be called when all buffer space is empty.
+*/
+
+void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
+{
+  key_tuple_length= sample_key->length;
+  key_tuple_map= sample_key->keypart_map;
+  key_size_in_keybuf= use_key_pointers ? sizeof(char*) : 
+                                       key_tuple_length;
+  key_buff_elem_size= key_size_in_keybuf + 
+                      (int)is_mrr_assoc * sizeof(void*);
+
+  uint rowid_buf_elem_size= h->ref_length + 
+                            (int)is_mrr_assoc * sizeof(char*);
+  
+  KEY *key_info= &h->table->key_info[keyno];
+  /*
+    Use rec_per_key statistics as a basis to find out how many rowids 
+    we'll get for each key value.
+     TODO: are we guaranteed to get r_p_c==1 for unique keys?
+     TODO: what should be the default value to use when there is no 
+           statistics?
+  */
+  uint parts= my_count_bits(key_tuple_map);
+  ulong rpc;
+  if ((rpc= key_info->rec_per_key[parts - 1]))
+  {
+    rowid_buf_elem_size *= rpc;
+  }
+
+  double fraction_for_rowids=
+    ((double) rowid_buf_elem_size / 
+         ((double)rowid_buf_elem_size + key_buff_elem_size));
+
+  uint bytes_for_rowids= 
+    round(fraction_for_rowids * (full_buf_end - full_buf));
+  
+  uint bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids;
+
+  if (bytes_for_keys < key_buff_elem_size + 1)
+  {
+    uint add= key_buff_elem_size + 1 - bytes_for_keys;
+    bytes_for_rowids -= add;
+    DBUG_ASSERT(bytes_for_rowids >= 
+                (h->ref_length + (int)is_mrr_assoc * sizeof(char*) + 1));
+  }
+
+  rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
+  key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1);
+
+  index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                            key_info->key_parts == 
+                              my_count_bits(sample_key->keypart_map));
+}
+
+
 /*
  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
  
@@ -650,69 +806,55 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
    DsMrr_impl::dsmrr_fill_key_buffer()

  DESCRIPTION
-    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+    DS-MRR/CPK: Enumerate the input range (=key) sequence, fill the key buffer
+    (lookup_key, range_id) pairs and sort.

    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
    we're scanning.
-
-  psergey2-q: can this be used for filling/sorting key buffer in general case?
-   a: yes. 
-  qq: can we push sequence iteration init down into here?
 */

 void DsMrr_impl::dsmrr_fill_key_buffer()
 {
-  //psergey2: here, no identicals detection is necessary since we always scan
-  //  the unordered sequence.
  int res;
  KEY_MULTI_RANGE cur_range;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");

-  mrr_buf_cur= mrr_buf;
-  while ((mrr_buf_cur < mrr_buf_end) && 
+  // reset the buffer for writing.
+  key_buffer.reset_for_writing();
+
+  while ((key_buffer.have_space_for(key_buff_elem_size)) && 
         !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
  {
    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    if (!cpk_tuple_length)
+    if (!key_tuple_length)
    {
-      cpk_tuple_length= cur_range.start_key.length;
-      key_buf_element_size= use_key_pointers ? sizeof(char*) : 
-                                           cpk_tuple_length;
-
-      cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
-                               my_count_bits(cur_range.start_key.keypart_map));
-      uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
-      mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size;
-      mrr_buf_end= mrr_buf_last;
+      /* This only happens when we've just started filling the buffer */
+      DBUG_ASSERT(key_buffer.used_size() == 0);
+      setup_buffer_sizes(&cur_range.start_key);
    }

-    //psergey2: if keys are materialized, store pointers, not copy keys
-
    /* Put key, or {key, range_id} pair into the buffer */
    if (use_key_pointers)
-      memcpy(mrr_buf_cur, &cur_range.start_key.key, sizeof(char*));
+      key_buffer.write((uchar*)&cur_range.start_key.key, sizeof(char*));
    else
-      memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-
-    mrr_buf_cur += key_buf_element_size;
+      key_buffer.write(cur_range.start_key.key, key_tuple_length);
 
    if (is_mrr_assoc)
-    {
-      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
-      mrr_buf_cur += sizeof(void*);
-    }
+      key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
  }

  dsmrr_eof= test(res);

  /* Sort the buffer contents by rowid */
-  uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
+  uint key_elem_size= key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*);
+  uint n_keys= key_buffer.used_size() / key_elem_size;
  
-  my_qsort2(mrr_buf, n_rowids, elem_size, 
+  my_qsort2(key_buffer.used_area(), n_keys, key_elem_size,
            (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
-  mrr_buf_last= mrr_buf_cur;
-  mrr_buf_cur=  mrr_buf;
+  
+  last_identical_key_ptr= NULL;
+  in_identical_keys_range= FALSE;
+
  DBUG_VOID_RETURN;
 }

@@ -721,60 +863,105 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
  DS-MRR/CPK: multi_range_read_next() function

  DESCRIPTION
-    DsMrr_impl::dsmrr_next_cpk()
+    DsMrr_impl::dsmrr_next_from_index()
      range_info  OUT  identifier of range that the returned record belongs to

  DESCRIPTION
-    DS-MRR/CPK: multi_range_read_next() function. 
-    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
-     - we get records with index_read(), not with rnd_pos()
-     - we may get multiple records for one key (=element of the buffer)
-     - unlike dsmrr_fill_rowid_buffer(), dsmrr_fill_key_buffer() never fails.
+  
+  This function walks over key buffer and does index reads, i.e. it produces
+  {current_record, range_id} pairs.
+
+  The function has the same call contract like multi_range_read_next()'s.
+
+  We actually iterate nested sequences:
+  
+  - a disjoint sequence of index ranges
+    - each range has multiple records
+      - each record goes into multiple identical ranges.

  RETURN
    0                   OK, next record was successfully read
    HA_ERR_END_OF_FILE  End of records
    Other               Some other error
-
-  psergey2-todo: this should detect identical keys.
 */

-int DsMrr_impl::dsmrr_next_cpk(char **range_info)
+int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
  int res;
+  uchar *key_in_buf;
+  handler *file= do_rowid_fetch? h2: h;

-  while (cpk_have_range)
+  while (in_identical_keys_range)
  {
+//read_and_check:
+    /* Read record/key pointer from the buffer */
+    key_in_buf= identical_key_it.get_next(key_size_in_keybuf);
+    if (is_mrr_assoc)
+      cur_range_info= (char*)identical_key_it.get_next(sizeof(void*));

-    if (h->mrr_funcs.skip_record &&
-        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
+    if (key_in_buf == last_identical_key_ptr)
    {
-      cpk_have_range= FALSE;
-      break;
+      /* We're looking at the last of the identical keys */
+      in_identical_keys_range= FALSE;
+    }
+check_record:
+    if ((h->mrr_funcs.skip_index_tuple &&
+         h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) || 
+        (h->mrr_funcs.skip_record &&
+         h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
+    {
+      continue;
    }
+    memcpy(range_info_arg, cur_range_info, sizeof(void*));

-    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
-    res= h->index_next_same(table->record[0], lookup_tuple, cpk_tuple_length);
+    return 0;
+  }
  
-    if (h->mrr_funcs.skip_index_tuple &&
-        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
-      continue;
+  /* Try returrning next record from the current range */
+  while (in_index_range)
+  {
+    res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
+                                  key_tuple_length);
    
-    if (res != HA_ERR_END_OF_FILE)
+    if (res)
    {
-      if (is_mrr_assoc)
-        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
-      return res;
+      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+        return res;  /* Fatal error */
+
+      in_index_range= FALSE; /* no more records here */
+      break;
    }
    
-    /* No more records in this range. Exit this loop and go get another range */
-    cpk_have_range= FALSE;
+    if (last_identical_key_ptr)
+    {
+      in_identical_keys_range= TRUE;
+      identical_key_it.init(&key_buffer);
+      cur_range_info= first_identical_range_info;
    }

-  do
+    goto check_record;
+   //  goto read_and_check;
+  }
+
+  while(1)
+  {
+    DBUG_ASSERT(!in_identical_keys_range && !in_index_range);
+
+    /* Jump over the keys that were handled by identical key processing */
+    if (last_identical_key_ptr)
+    {
+      while (key_buffer.read(key_size_in_keybuf) != last_identical_key_ptr)
      {
+        if (is_mrr_assoc)
+          key_buffer.read(sizeof(void*));
+      }
+      if (is_mrr_assoc)
+        key_buffer.read(sizeof(void*));
+      last_identical_key_ptr= NULL;
+    }
+
    /* First, make sure we have a range at start of the buffer */
-    if (mrr_buf_cur == mrr_buf_last)
+    if (!key_buffer.have_data(key_buff_elem_size))
    {
      if (dsmrr_eof)
      {
@@ -782,58 +969,56 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
        goto end;
      }
      dsmrr_fill_key_buffer();
-    }
-    if (mrr_buf_cur == mrr_buf_last)
+      if (!key_buffer.have_data(key_buff_elem_size))
      {
        res= HA_ERR_END_OF_FILE;
        goto end;
      }
+    }
+    
+    /* Get the next range to scan*/
+    cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf);
+    if (use_key_pointers)
+      cur_index_tuple= *((uchar**)cur_index_tuple);

-    /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
-    mrr_buf_cur += key_buf_element_size;
    if (is_mrr_assoc)
+      cur_range_info= (char*)key_buffer.read(sizeof(void*));
+      
+    /* Do index lookup */
+    if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
+                                      key_tuple_map, HA_READ_KEY_EXACT)))
    {
-      memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
-      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
+      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+        return res;
+      continue; /* to next key and make another lookup */
    }

-    if (h->mrr_funcs.skip_record &&
-        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
-      continue;
-    
-    res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
-                       HA_READ_KEY_EXACT);
-
-    /*
-      Check pushed index condition. Performance-wise, it does not make any
-      sense to put this call here (the above call has already accessed the full
-      record). That's the best I could do, though, because:
-      - ha_innobase doesn't support IndexConditionPushdown on clustered PK
-      - MRR interface doesn't allow the storage engine to refuse a pushed index
-        condition.
-      Having this call here is not fully harmless: EXPLAIN shows "pushed index
-      condition", which is technically true but doesn't bring the benefits that
-      one might expect.
-    */
-    if (h->mrr_funcs.skip_index_tuple &&
-        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
-      continue;
+    /* Check if subsequent keys in the key buffer are the same as this one */
+    {
+      uchar *ptr;
+      identical_key_it.init(&key_buffer);
+      last_identical_key_ptr= NULL;
+      while ((ptr= identical_key_it.get_next(key_size_in_keybuf)))
+      {
+        if (is_mrr_assoc)
+          identical_key_it.get_next(sizeof(void*));

-    if (res && res != HA_ERR_END_OF_FILE)
-      goto end;
+        if (key_tuple_cmp(this, key_in_buf, ptr))
+          break;

-    if (!res)
+        last_identical_key_ptr= ptr;
+      }
+      if (last_identical_key_ptr)
      {
-      memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
-      /* 
-        Attempt reading more rows from this range only if there actually can
-        be multiple matches:
-       */
-      cpk_have_range= !cpk_is_unique_scan;
-      break;
+        in_identical_keys_range= TRUE;
+        identical_key_it.init(&key_buffer);
+        first_identical_range_info= cur_range_info;
+      }
+    }
+
+    in_index_range= !index_ranges_unique;
+    goto check_record;
  }
-  } while (true);
 
 end:
  return res;
@@ -842,9 +1027,6 @@ end:

 /**
  DS-MRR implementation: multi_range_read_next() function
-
-  psergey2-todo: put identical rowid detection code here
-    it should always work because rowid sequences are always sorted
 */

 int DsMrr_impl::dsmrr_next(char **range_info)
@@ -852,89 +1034,108 @@ int DsMrr_impl::dsmrr_next(char **range_info)
  int res;
  uchar *cur_range_info= 0;
  uchar *rowid;
+  uchar *range_id;

  if (use_default_impl)
    return h->handler::multi_range_read_next(range_info);

-  if (doing_cpk_scan)
-    return dsmrr_next_cpk(range_info);
+  if (!do_rowid_fetch)
+    return dsmrr_next_from_index(range_info);
  
-  if (mrr_buf_next_identical != mrr_buf_cur)
+  while (identical_rowid_ptr)
  {
    /*
-      There are multiple rowids. Return the record again, now with different
-      range_id 
+      Current record (the one we've returned in previous call) was obtained
+      from a rowid that matched multiple range_ids. Return this record again,
+      with next matching range_id.
    */
-    do 
-    {
+    rowid= rowid_buffer.read(h->ref_length);
    if (is_mrr_assoc)
-        memcpy(range_info, mrr_buf_next_identical + h->ref_length, sizeof(uchar*));
-    } while (!h2->mrr_funcs.skip_record ||
-             !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) range_info, rowid));
-
-    mrr_buf_next_identical += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    return 0;
+    {
+      uchar *range_ptr= rowid_buffer.read(sizeof(uchar*));
+      memcpy(range_info, range_ptr, sizeof(uchar*));
    }

-  do
-  {
-    if (mrr_buf_cur == mrr_buf_last)
+    if (rowid == identical_rowid_ptr)
    {
-      if (dsmrr_eof)
+      identical_rowid_ptr= NULL; /* reached the last of identical rowids */
+    }
+
+    if (!h2->mrr_funcs.skip_record ||
+        !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) *range_info, rowid))
    {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
+      return 0;
    }
-      res= dsmrr_fill_rowid_buffer();
-      if (res)
-        goto end;
  }

-    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (mrr_buf_cur == mrr_buf_last)
+  while (1)
  {
-      res= HA_ERR_END_OF_FILE;
-      goto end;
+    if (!rowid_buffer.have_data(1))
+    {
+      if (dsmrr_eof)
+        return HA_ERR_END_OF_FILE;
+
+      if (do_sort_keys && key_buffer.used_size() == 0)
+        dsmrr_fill_key_buffer();
+
+      if ((res= dsmrr_fill_rowid_buffer()))
+        return res;
    }
-    rowid= mrr_buf_cur;
   
-    if (is_mrr_assoc)
-      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (!rowid_buffer.have_data(1))
+      return HA_ERR_END_OF_FILE;

-    size_t element_size= h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    mrr_buf_cur += element_size;
-    mrr_buf_next_identical= mrr_buf_cur;
+    rowid= rowid_buffer.read(h->ref_length);
+    identical_rowid_ptr= NULL;

+    if (is_mrr_assoc)
+    {
+      range_id= rowid_buffer.read(sizeof(uchar*));
+      memcpy(&cur_range_info, range_id, sizeof(uchar*));
+      memcpy(range_info, range_id, sizeof(uchar*));
+    }
+    
+    //psergey2-note: the below isn't right- we won't want to skip over this 
+    // rowid because this (rowid, range_id) pair has nothing.. the next 
+    // identical rowids might have something.. (but we set identicals later,
+    // dont we?)
    if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
      continue;
+
    res= h->ha_rnd_pos(table->record[0], rowid);

    if (res == HA_ERR_RECORD_DELETED)
      continue;
    
-    if (0)//(!res)
+    /* 
+      Check if subsequent buffer elements have the same rowid value as this
+      one. If yes, remember this fact so that we don't make any more rnd_pos()
+      calls with this value.
+    */
+    if (!res)
    {
      /* 
        Note: this implies that SQL layer doesn't touch table->record[0]
        between calls.
      */
-      uchar *current_el= mrr_buf_cur - element_size;
-      while (mrr_buf_cur != mrr_buf_last && 
-             !h2->cmp_ref(current_el, mrr_buf_cur))
+      uchar *ptr;
+      SimpleBuffer::PeekIterator identical_rowid_it;
+      identical_rowid_it.init(&rowid_buffer);
+      while ((ptr= identical_rowid_it.get_next(h->ref_length)))
      {
-        mrr_buf_cur += element_size;
+        if (is_mrr_assoc)
+          identical_rowid_it.get_next(sizeof(void*));
+
+        if (h2->cmp_ref(rowid, ptr))
+          break;
+        identical_rowid_ptr= ptr;
      }
    }
-    break;
-
-  } while (true);
- 
-  if (is_mrr_assoc)
-  {
-    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
+    return 0;
  }
-end:
+
  return res;
 }


--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -37,6 +37,96 @@
 */


+/*
+  A simple memory buffer for reading and writing.
+
+  when writing, there is no user-visible "current" position, although
+  internally 'pos' points to just after the end of used area  (or at the 
+  start of it for reverse buffer).
+
+  When reading, there is current position pointing at start (for reverse
+  buffer, end) of the element that will be read next.
+   ^^ why end for reverse? it's more logical to point at start 
+
+  One can peek at what's behind that element by using advance_ptr function.
+
+  TODO: will the reverse buffer store {tuple; rowid} or {rowid; tuple} pairs?
+    (why does it matter??? Read and write in the same order and then it
+    shouldn't matter.)
+*/
+
+class SimpleBuffer
+{
+  uchar *start;
+  uchar *end;
+  uchar *read_pos;
+  uchar *write_pos;
+  
+  /*
+     1 <=> buffer grows/is filled/is read  from start to end
+    -1 <=> everthing is done from end to start instead.
+  */
+  int direction;
+public:
+  /* Write-mode functions */
+  void reset_for_writing();
+  void write(const uchar *data, size_t bytes);
+  bool have_space_for(size_t bytes);
+
+  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
+  size_t used_size();
+
+  /* Read-mode functions */
+  void reset_for_reading();
+
+  uchar *read(size_t bytes);
+  bool have_data(size_t bytes);
+  uchar *end_of_space();
+
+  /* Control functions */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg, int direction_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    direction= direction_arg;
+    reset_for_writing();
+  }
+
+  friend class PeekIterator;
+  class PeekIterator
+  {
+    // if direction==1 : pointer to what to return next
+    // if direction==-1: pointer to the end of what is to be returned next
+    uchar *pos;
+    SimpleBuffer *sb;
+  public:
+    void init(SimpleBuffer *sb_arg)
+    {
+      sb= sb_arg;
+      pos= sb->read_pos;
+    }
+    /* Return pointer to next chunk of nbytes bytes and avance over it */
+    uchar *get_next(size_t nbytes)
+    {
+      if (sb->direction == 1)
+      {
+        if (pos + nbytes > sb->write_pos)
+          return NULL;
+        uchar *res= pos;
+        pos += nbytes;
+        return res;
+      }
+      else
+      {
+        if (pos - nbytes <= sb->write_pos)
+          return NULL;
+        pos -= nbytes;
+        return pos;
+      }
+    }
+  };
+};
+
 /*
  DS-MRR implementation for one table. Create/use one object of this class for
  each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -73,6 +163,8 @@
       scanning.
 */

+
+
 class DsMrr_impl
 {
 public:
@@ -108,14 +200,27 @@ private:
  /* Secondary handler object.  It is used for scanning the index */
  handler *h2;

+  uchar *full_buf;
+  uchar *full_buf_end;
+
  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *mrr_buf;
-  uchar *mrr_buf_cur;   /* Current position when reading/writing */
-  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
-  uchar *mrr_buf_end;   /* End of the buffer */
+  SimpleBuffer rowid_buffer;
  
-  uchar *mrr_buf_next_identical;
+  uchar *identical_rowid_ptr;
+  
+  /* Identical keys */
+  bool in_identical_keys_range;
+  uchar *last_identical_key_ptr;
+  SimpleBuffer::PeekIterator identical_key_it;
+
+  SimpleBuffer key_buffer;
+  
+  uint keyno;
+
+  /* Execution control */
+  bool do_sort_keys;
  bool use_key_pointers;
+  bool do_rowid_fetch;

  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */

@@ -129,18 +234,33 @@ private:
  /** DS-MRR/CPK variables start */

  /* Length of lookup tuple being used, in bytes */
-  uint cpk_tuple_length;
+  uint key_tuple_length;
+  key_part_map key_tuple_map; 
+  /*
+    This is 
+      = key_tuple_length   if we copy keys to buffer
+      = sizeof(void*)      if we're using pointers to materialized keys.
+  */
+  uint key_size_in_keybuf;
+  
+  /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */
+  uint key_buff_elem_size;
+  
+  /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
+  uint rowid_buff_elem_size;

-  uint key_buf_element_size;
  /*
    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
    can get max. one match for each key 
  */
-  bool cpk_is_unique_scan;
+  bool index_ranges_unique;
  /* TRUE<=> we're in a middle of enumerating records from a range */ 
-  bool cpk_have_range;
-  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
-  char *cpk_saved_range_info;
+  bool in_index_range;
+  uchar *cur_index_tuple;
+  /* if in_index_range==TRUE: range_id of the range we're enumerating */
+  char *cur_range_info;
+
+  char *first_identical_range_info;

  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                       COST_VECT *cost);
@@ -150,6 +270,11 @@ private:
  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
  int dsmrr_fill_rowid_buffer();
  void dsmrr_fill_key_buffer();
-  int dsmrr_next_cpk(char **range_info);
+  int dsmrr_next_from_index(char **range_info);
+
+  void setup_buffer_sizes(key_range *sample_key);
+
+  static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
+  static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
 };