Commit dc2f0d13 authored by Monty's avatar Monty Committed by Sergei Petrunia

Fix calculation of selectivity

calculate_cond_selectivity_for_table() is largely rewritten:
- Process keys in the order of rows found, smaller ranges first. If two
  ranges have an equal number of rows, use the one with more key parts.
  This helps us to mark more used fields to not be used for further
  selectivity calculations. See cmp_quick_ranges().
- Ignore keys with fields that were used by previous keys
- Don't use rec_per_key[] to calculate selectivity for smaller
  secondary key parts.  This does not work as rec_per_key[] value
  is calculated in the context of the previous key parts, not for the
  key part itself. The one exception is if the previous key parts
  are all constants.

Other things:
- Ensure that select->cond_selectivity is always between 0 and 1.
- Ensure that select->opt_range_condition_rows is never updated to
  a higher value. It is initially set to the number of rows in table.
- We now store in table->opt_range_condition_rows the lowest number of
  rows that any row-read-method has found so far. Before it was only done
  for QUICK_SELECT_I::QS_TYPE_ROR_UNION and
  QUICK_SELECT_I::QS_TYPE_INDEX_MERGE.
  Now it is done for a lot more methods. See
  calculate_cond_selectivity_for_table() for details.
- Calculate and use selectivity for the first key part of a multiple key
  part if the first key part is a constant.
  WHERE key1_part1=5 and key2_part1=5.  If key1 is used, then we can still
  use the selectivity for key2.

Changes in test results:
- 'filtered' is slightly changed, usually to something slightly smaller.
- In a few cases the table order changed for GROUP BY queries. This was
  because the number of resulting rows from a GROUP BY query with MIN/MAX
  is now set to be smaller.
- A few indexes were changed as we now prefer an index with more key parts
  if the number of resulting rows is the same.
parent 7d0bef6c
......@@ -2460,8 +2460,8 @@ id select_type table type possible_keys key key_len ref rows Extra
EXPLAIN SELECT 1 FROM t1 AS t1_outer WHERE
a IN (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1_outer index a a 10 NULL 15 Using where; Using index
1 PRIMARY <subquery2> eq_ref distinct_key distinct_key 4 test.t1_outer.a 1
1 PRIMARY <subquery2> ALL distinct_key NULL NULL NULL 2
1 PRIMARY t1_outer ref a a 5 <subquery2>.max(b) 3 Using index
2 MATERIALIZED t1 range a a 5 NULL 5 Using where; Using index
EXPLAIN SELECT 1 FROM t1 AS t1_outer GROUP BY a HAVING
a > (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
......
......@@ -47,7 +47,7 @@ WHERE task2.`sys_id` LIKE '8e7792a7dbfffb00fff8a345ca961934%'
ORDER BY sysapproval_approver0.`order`
LIMIT 0, 50 ;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE task2 range PRIMARY,sys_class_name_2,sys_domain_path PRIMARY 96 NULL 1 0.00 98.00 100.00 Using where; Using temporary; Using filesort
1 SIMPLE task2 range PRIMARY,sys_class_name_2,sys_domain_path PRIMARY 96 NULL 1 0.00 100.00 100.00 Using where; Using temporary; Using filesort
1 SIMPLE task1 ref PRIMARY,task_parent,sys_class_name_2,sys_domain_path task_parent 99 test.task2.sys_id 1 NULL 100.00 NULL Using index condition; Using where
1 SIMPLE sysapproval_approver0 ref sysapproval_approver_ref5,sys_domain_path,sysapproval_approver_CHG1975376 sysapproval_approver_ref5 99 test.task1.sys_id 1 NULL 100.00 NULL Using index condition; Using where
drop table sysapproval_approver,task;
......
......@@ -193,7 +193,11 @@ explain select * from t1 where a=1 or b=1 {
}
},
{
"selectivity_for_indexes": [],
"selectivity_for_indexes": [
{
"use_opt_range_condition_rows_selectivity": 0.002
}
],
"selectivity_for_columns": [],
"cond_selectivity": 0.002
}
......
This diff is collapsed.
--source include/have_sequence.inc
--source include/not_embedded.inc
#
# Test changes in calculate_cond_selectivity_for_table()
#
# Three two-part indexes that share columns: once one index has been used
# for selectivity, its fields should not contribute again via another index.
create or replace table t1 (a int, b int, c int, key(a,c), key(b,c), key (c,b)) engine=aria;
# Base data: b is mod(seq,10) (values 0..9) and c is mod(seq,15).
insert into t1 select seq/100+1, mod(seq,10), mod(seq,15) from seq_1_to_10000;
# Skew the distribution of c: 1000 extra rows with c=10 make c=10
# noticeably more frequent than other c values (e.g. c=5).
insert into t1 select seq/100+1, mod(seq,10), 10 from seq_1_to_1000;
# Rebuild the table so index statistics reflect the final data distribution
optimize table t1;
# Baseline row counts for the individual predicates combined below
select count(*) from t1 where a=2;
select count(*) from t1 where b=5;
select count(*) from t1 where c=5;
select count(*) from t1 where c=10;
select count(*) from t1 where a=2 and b=5;
select count(*) from t1 where c=10 and b=5;
select count(*) from t1 where c=5 and b=5;
# Use the optimizer trace to inspect how per-index selectivities are
# computed and combined for conditions covering several indexes
set optimizer_trace="enabled=on";
select count(*) from t1 where a=2 and b=5 and c=10;
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where a=2 and b=5 and c=5;
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
--echo # Ensure that we only use selectivity from non used index for simple cases
select count(*) from t1 where (a=2 and b= 5);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
--echo # All of the following should have selectivity=1 for index 'b'
# 'b between 0 and 100' matches every row (b is always 0..9), so index 'b'
# must not lower the estimate, whatever kind of range is used on 'a'
select count(*) from t1 where (a=2 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a in (2,3) and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a>2 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a>=2 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a<=2 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a<2 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
select count(*) from t1 where (a between 2 and 3 and b between 0 and 100);
select JSON_DETAILED(JSON_EXTRACT(trace, '$**.selectivity_for_indexes')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
drop table t1;
set optimizer_trace='enabled=off';
This diff is collapsed.
......@@ -1109,6 +1109,13 @@ class QUICK_SELECT_I
*/
uint used_key_parts;
/*
Set to 1 if we used group by optimization to calculate number of rows
in the result, stored in table->opt_range_condition_rows.
This is only used for asserts.
*/
bool group_by_optimization_used;
QUICK_SELECT_I();
virtual ~QUICK_SELECT_I(){};
......
......@@ -5819,7 +5819,8 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
get_delayed_table_estimates(s->table, &s->records, &s->read_time,
&s->startup_cost);
s->found_records= s->records;
table->opt_range_condition_rows=s->records;
s->table->opt_range_condition_rows= s->records;
s->table->used_stat_records= s->records;
}
else
s->scan_time();
......@@ -5843,8 +5844,12 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
*/
add_group_and_distinct_keys(join, s);
s->table->cond_selectivity= 1.0;
/* This will be updated in calculate_cond_selectivity_for_table() */
s->table->set_cond_selectivity(1.0);
DBUG_ASSERT(s->table->used_stat_records == 0 ||
s->table->cond_selectivity <=
s->table->opt_range_condition_rows /
s->table->used_stat_records);
/*
Perform range analysis if there are keys it could use (1).
Don't do range analysis for materialized subqueries (2).
......@@ -7725,6 +7730,15 @@ double matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint,
TABLE *table= s->table;
double sel= table->cond_selectivity;
double table_records= rows2double(s->records);
DBUG_ASSERT(sel >= 0 && sel <= 1.0);
/*
table->cond_selectivity will include data from opt_range.
Here we check that this is indeed the case.
Note that if table_records == 0, then 'sel' is probably 1
*/
DBUG_ASSERT(table_records == 0 ||
sel <= s->table->opt_range_condition_rows /
table_records);
dbl_records= table_records * sel;
return dbl_records;
}
......@@ -9915,7 +9929,11 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
However if sel becomes greater than 2 then with high probability
something went wrong.
*/
sel /= (double)table->opt_range[key].rows / (double) table->stat_records();
DBUG_ASSERT(sel <= 1.0);
DBUG_ASSERT(table->opt_range[key].rows <=
(double) table->stat_records());
sel /= ((double) table->opt_range[key].rows /
(double) table->stat_records());
set_if_smaller(sel, 1.0);
used_range_selectivity= true;
}
......@@ -10059,6 +10077,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
exit:
if (ref_keyuse_steps != ref_keyuse_steps_buf)
my_free(ref_keyuse_steps);
DBUG_ASSERT(sel >= 0.0 and sel <= 1.0);
return sel;
}
......@@ -11489,6 +11508,7 @@ bool JOIN::get_best_combination()
*/
j->records_read= best_positions[tablenr].records_read;
j->cond_selectivity= best_positions[tablenr].cond_selectivity;
DBUG_ASSERT(j->cond_selectivity <= 1.0);
map2table[j->table->tablenr]= j;
/* If we've reached the end of sjm nest, switch back to main sequence */
......@@ -27824,6 +27844,15 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
KEY *key_info= 0;
uint key_len= 0;
#ifdef NOT_YET
/*
Would be good to keep this condition up to date.
Another alternative is to remove JOIN_TAB::cond_selectivity and use
TABLE::cond_selectivity everywhere
*/
DBUG_ASSERT(cond_selectivity == table->cond_selectivity);
#endif
explain_plan= eta;
eta->key.clear();
eta->quick_info= NULL;
......
......@@ -1398,6 +1398,7 @@ struct TABLE
index only access for it is stored in index_only_costs[i]
*/
double index_only_cost;
bool first_key_part_has_only_one_value;
} *opt_range;
/*
Bitmaps of key parts that =const for the duration of join execution. If
......@@ -1839,7 +1840,18 @@ struct TABLE
DBUG_ASSERT(s->period.name);
return field[s->period.end_fieldno];
}
void set_cond_selectivity(double selectivity)
{
DBUG_ASSERT(selectivity >= 0.0 && selectivity <= 1.0);
cond_selectivity= selectivity;
DBUG_PRINT("info", ("cond_selectivity: %g", cond_selectivity));
}
void multiply_cond_selectivity(double selectivity)
{
DBUG_ASSERT(selectivity >= 0.0 && selectivity <= 1.0);
cond_selectivity*= selectivity;
DBUG_PRINT("info", ("cond_selectivity: %g", cond_selectivity));
}
ulonglong vers_start_id() const;
ulonglong vers_end_id() const;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment