Commit 531dd708 authored by Sergei Petrunia, committed by Sergei Petrunia

MDEV-27229: Estimation for filtered rows less precise ... #5

Fix special handling for values that are right next to buckets with ndv=1.
parent 67d4d042
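
The scenario, for orientation (the exact bucket layout is an assumption made for illustration, not part of the patch): the new test below loads 100 rows with a='bar' and 100 with a='qux', so the collected JSON_HB histogram presumably contains a singleton (ndv=1) bucket holding only 'bar' and covering the first half of the rows. A range endpoint equal to that value must either skip or keep the whole bucket: a > 'bar' should be estimated at about max - min = 1.0 - 0.5 = 0.5 (the 50.00 filtered in the new results), while a >= 'bar' should keep the bucket and give 100.00. The changed expected results in the existing tests (68.71 -> 60.00, 80.47 -> 75.00) show the estimates now matching the observed r_filtered values instead of overshooting.
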
@@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB {
}
explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where
Warnings:
Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where
explain extended select * from t1_json where a < 'b-1a';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
@@ -8014,7 +8014,7 @@ test.t1 analyze status OK
analyze
select c from t1 where c > '1';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
drop table t1;
#
# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
@@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where
drop table t1;
#
# MDEV-27229: Estimation for filtered rows less precise ... #5
#
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;
set histogram_type=JSON_HB;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
analyze select COUNT(*) FROM t1 WHERE a > 'foo';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a <='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
drop table t1;
@@ -390,3 +390,29 @@ analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a <> 'a';
analyze select COUNT(*) FROM t1 WHERE a < 'a';
drop table t1;
--echo #
--echo # MDEV-27229: Estimation for filtered rows less precise ... #5
--echo #
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;
set histogram_type=JSON_HB;
analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a > 'foo';
analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
analyze select COUNT(*) FROM t1 WHERE a >='aaa';
analyze select COUNT(*) FROM t1 WHERE a > 'bar';
analyze select COUNT(*) FROM t1 WHERE a >='bar';
# Can enable these after get_avg_frequency issue is resolved:
# analyze select COUNT(*) FROM t1 WHERE a < 'aaa';
# analyze select COUNT(*) FROM t1 WHERE a <='aaa';
# analyze select COUNT(*) FROM t1 WHERE a < 'bar';
analyze select COUNT(*) FROM t1 WHERE a <='bar';
drop table t1;
@@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
// If the value is outside of the histogram's range, this will "clip" it to
// first or last bucket.
bool equal;
int idx= find_bucket(field, key, &equal);
int endp_cmp;
int idx= find_bucket(field, key, &endp_cmp);
double sel;
if (buckets[idx].ndv == 1 && !equal)
if (buckets[idx].ndv == 1 && (endp_cmp!=0))
{
/*
The bucket has a single value and it doesn't match! Return a very
@@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
// Find the leftmost bucket that contains the lookup value.
// (If the lookup value is to the left of all buckets, find bucket #0)
bool equal;
int idx= find_bucket(field, min_key, &equal);
if (equal && exclusive_endp && buckets[idx].ndv==1 &&
idx < (int)buckets.size()-1)
int endp_cmp;
int idx= find_bucket(field, min_key, &endp_cmp);
double sel;
// Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{
/*
The range is "col > $CONST" and we've found a bucket that contains
only the value $CONST. Move to the next bucket.
*/
idx++;
if (endp_cmp < 0)
sel= 0.0;
else if (endp_cmp > 0)
sel= 1.0;
else // endp_cmp == 0
sel= (exclusive_endp)? 1.0 : 0.0;
}
else
{
sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));
}
double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));
min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
@@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
max_key++;
max_key_len--;
}
bool equal;
int idx= find_bucket(field, max_key, &equal);
int endp_cmp;
int idx= find_bucket(field, max_key, &endp_cmp);
if (equal && !inclusive_endp && idx > 0)
if ((endp_cmp == 0) && !inclusive_endp)
{
/*
The range is "col < $CONST" and we've found a bucket starting with
$CONST. Move to the previous bucket.
$CONST.
*/
idx--;
equal= false;
if (idx > 0)
{
// Move to the previous bucket
endp_cmp= 1;
idx--;
}
else
endp_cmp= -1;
}
double left_fract= get_left_fract(idx);
double sel;
/* Special handling for singleton buckets */
if (buckets[idx].ndv == 1 && equal)
// Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{
if (inclusive_endp)
sel= 1.0;
else
if (endp_cmp < 0)
sel= 0.0;
else if (endp_cmp > 0)
sel= 1.0;
else // endp_cmp == 0
sel= inclusive_endp? 1.0 : 0.0;
}
else
{
@@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
buckets[idx].start_value,
get_end_value(idx));
}
double left_fract= get_left_fract(idx);
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
max= 1.0;
double sel = max - min;
return sel;
return max - min;
}
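
For reference, a self-contained sketch of the all-or-nothing fraction the new ndv=1 branches compute (ToyBucket, min_endp_fraction and the two-bucket layout are hypothetical, made up for illustration; the max-endpoint side mirrors the same decision with inclusive_endp). It reproduces the 50%/100% expectations of the test added in this commit:

#include <cassert>
#include <cmath>

// Hypothetical, simplified bucket: cumulative fraction of rows up to and
// including this bucket, plus the number of distinct values it holds.
struct ToyBucket { double cum_fract; int ndv; };

// For a singleton (ndv=1) bucket hit by the *minimum* endpoint of a range:
// fraction of the bucket that lies before the range and must be subtracted.
// endp_cmp is how the endpoint compares to the bucket's single value
// (<0, 0, >0); exclusive_endp is true for "col > X".
static double min_endp_fraction(int endp_cmp, bool exclusive_endp)
{
  if (endp_cmp < 0) return 0.0;       // endpoint below the value: keep the whole bucket
  if (endp_cmp > 0) return 1.0;       // endpoint above the value: skip the whole bucket
  return exclusive_endp ? 1.0 : 0.0;  // endpoint equals the value: all or nothing
}

int main()
{
  // Two toy buckets mirroring the added test data (100 x 'bar', 100 x 'qux'),
  // assuming the histogram degenerates to one bucket per value.
  ToyBucket buckets[2]= { {0.5, 1}, {1.0, 1} };

  // "a > 'bar'": the endpoint equals bucket[0]'s only value and is exclusive,
  // so the whole bucket is treated as lying before the range.
  double left_fract= 0.0;   // nothing precedes bucket[0]
  double sel= min_endp_fraction(0, /*exclusive_endp=*/ true);
  double min= left_fract + sel * (buckets[0].cum_fract - left_fract);
  double max= 1.0;          // the query has no upper bound
  assert(std::fabs((max - min) - 0.5) < 1e-9);   // 50.00 in the test results

  // "a >= 'bar'": the bucket is kept, and the estimate becomes 100%.
  sel= min_endp_fraction(0, /*exclusive_endp=*/ false);
  min= left_fract + sel * (buckets[0].cum_fract - left_fract);
  assert(std::fabs((max - min) - 1.0) < 1e-9);   // 100.00 in the test results
  return 0;
}
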
@@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field)
}
static int SGN(int x)
{
if (!x)
return 0;
return (x < 0)? -1 : 1;
}
/*
@brief
Find the leftmost histogram bucket such that "lookup_val >= start_value".
@param field Field object (used to do value comparisons)
@param lookup_val The lookup value in KeyTupleFormat.
@param equal OUT TRUE<=> the found bucket has left_bound=lookup_val
@param cmp OUT How the lookup_val compares to found_bucket.left_bound:
0 - lookup_val == bucket.left_bound
>0 - lookup_val > bucket.left_bound (the most typical)
<0 - lookup_val < bucket.left_bound. This can only happen
for the first bucket, for all other buckets we would just
pick the previous bucket and have cmp>=0.
@return
The bucket index
*/
int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
bool *equal)
int *cmp)
{
int res;
int low= 0;
int high= (int)buckets.size() - 1;
*equal= false;
*cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)
while (low + 1 < high)
{
@@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
if (!res)
{
*equal= true;
*cmp= res;
low= middle;
goto end;
}
@@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
*/
if (low == 0)
{
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
if (!res)
*equal= true;
else if (res < 0) // buckets[0] < lookup_val
res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
if (res <= 0)
*cmp= res;
else // res>0, lookup_val > buckets[0].start_value
{
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res <= 0) // buckets[high] <= lookup_val
res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (res >= 0) // lookup_val >= buckets[high].start_value
{
// Move to that bucket
low= high;
*cmp= res;
}
else
*cmp= 1;
}
}
else if (high == (int)buckets.size() - 1)
{
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res <= 0)
res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (res >= 0)
{
// Ok the value is in the last bucket.
*cmp= res;
low= high;
}
else
{
// The value is in the 'low' bucket.
res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
*cmp= res;
}
}
end:
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
lookup_val)));
// Verification: *cmp has correct value
DBUG_ASSERT(SGN(*cmp) ==
SGN(field->key_cmp(lookup_val,
(uchar*)buckets[low].start_value.data())));
// buckets[low] <= lookup_val, with one exception of the first bucket.
DBUG_ASSERT(low == 0 ||
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
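
The contract of the reworked find_bucket() (return the bucket index plus the sign of how the lookup value compares to that bucket's start value) can be illustrated with an ordinary binary search. This is a standalone sketch under simplified assumptions, std::string start values instead of Field/KeyTupleFormat comparisons and a plain rightmost-match search rather than the loop above, not the member function itself:

#include <cassert>
#include <string>
#include <vector>

// Locate the bucket that should contain lookup_val: the last bucket whose
// start value is <= lookup_val, clipped to bucket 0 when the value precedes
// the whole histogram.  *cmp reports how lookup_val compares to the chosen
// bucket's start value (<0, 0, >0); as in the patch, a negative result is
// only possible for bucket 0.
static int find_bucket(const std::vector<std::string> &starts,
                       const std::string &lookup_val, int *cmp)
{
  assert(!starts.empty());
  int low= 0, high= (int)starts.size() - 1;
  while (low < high)
  {
    int middle= (low + high + 1) / 2;
    if (starts[middle] <= lookup_val)
      low= middle;        // starts[middle] <= lookup_val: answer is middle or later
    else
      high= middle - 1;   // starts[middle] > lookup_val: answer is before middle
  }
  *cmp= lookup_val.compare(starts[low]);
  assert(low == 0 || *cmp >= 0);   // same invariant as the DBUG_ASSERTs above
  return low;
}

int main()
{
  std::vector<std::string> starts= {"bar", "foo", "qux"};
  int cmp;
  assert(find_bucket(starts, "bar", &cmp) == 0 && cmp == 0);  // endpoint equals a bucket start
  assert(find_bucket(starts, "baz", &cmp) == 0 && cmp > 0);   // falls inside bucket 0
  assert(find_bucket(starts, "aaa", &cmp) == 0 && cmp < 0);   // below the histogram range
  assert(find_bucket(starts, "zzz", &cmp) == 2 && cmp > 0);   // at or after the last bucket
  return 0;
}
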
@@ -144,6 +144,6 @@ class Histogram_json_hb : public Histogram_base
double get_left_fract(int idx);
std::string& get_end_value(int idx);
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
int find_bucket(const Field *field, const uchar *lookup_val, int *cmp);
};