MDEV-30218 Incorrect optimization for rowid_filtering

Correction over the last patch for this MDEV.

MDEV-30218 Incorrect optimization for rowid_filtering
Correction over the last patch for this MDEV.
d1a46c68 · Igor Babaev · Oleksandr Byelkin · 03c9a4ef · d1a46c68 · d1a46c68
Commit d1a46c68 authored Jan 31, 2023 by Igor Babaev Committed by Oleksandr Byelkin Feb 15, 2023
11 changed files
--- a/mysql-test/main/join_nested_jcl6.result
+++ b/mysql-test/main/join_nested_jcl6.result
@@ -2085,7 +2085,7 @@ ON t6.b >= 2 AND t5.b=t7.b AND
 (t8.a > 0 OR t8.c IS NULL) AND t6.a>0 AND t7.a>0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t5	ALL	NULL	NULL	NULL	NULL	3	
-1	SIMPLE	t7	ref	PRIMARY,b_i	b_i	5	test.t5.b	2	Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
+1	SIMPLE	t7	ref|filter	PRIMARY,b_i	b_i|PRIMARY	5|4	test.t5.b	2 (29%)	Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan; Using rowid filter
 1	SIMPLE	t6	range	PRIMARY,b_i	PRIMARY	4	NULL	3	Using where; Rowid-ordered scan; Using join buffer (incremental, BNL join)
 1	SIMPLE	t8	ref	b_i	b_i	5	test.t5.b	2	Using where; Using join buffer (incremental, BKA join); Key-ordered Rowid-ordered scan
 SELECT t5.a,t5.b,t6.a,t6.b,t7.a,t7.b,t8.a,t8.b

--- a/mysql-test/main/opt_trace.result
+++ b/mysql-test/main/opt_trace.result
@@ -1016,7 +1016,6 @@ explain select * from t1,t2 where t1.a=t2.b+2 and t2.a= t1.b	{
                          "index": "a",
                          "used_range_estimates": false,
                          "cause": "not available",
-                          "rowid_filter_skipped": "cost_factor <= 0",
                          "rows": 1,
                          "cost": 200,
                          "chosen": true
@@ -1073,7 +1072,6 @@ explain select * from t1,t2 where t1.a=t2.b+2 and t2.a= t1.b	{
                          "index": "a",
                          "used_range_estimates": false,
                          "cause": "not available",
-                          "rowid_filter_skipped": "cost_factor <= 0",
                          "rows": 1,
                          "cost": 200,
                          "chosen": true
@@ -2120,7 +2118,6 @@ explain  select * from t1 where a=1 and b=2 order by c limit 1	{
                      "access_type": "ref",
                      "index": "a_c",
                      "used_range_estimates": true,
-                      "rowid_filter_skipped": "worst/max seeks clipping",
                      "rows": 180,
                      "cost": 92,
                      "chosen": true
@@ -3346,7 +3343,6 @@ explain select * from t1 where pk = 2 and a=5 and b=1	{
                      "access_type": "ref",
                      "index": "pk",
                      "used_range_estimates": true,
-                      "rowid_filter_skipped": "cost_factor <= 0",
                      "rows": 1,
                      "cost": 2,
                      "chosen": true
@@ -3355,7 +3351,6 @@ explain select * from t1 where pk = 2 and a=5 and b=1	{
                      "access_type": "ref",
                      "index": "pk_a",
                      "used_range_estimates": true,
-                      "rowid_filter_skipped": "cost_factor <= 0",
                      "rows": 1,
                      "cost": 2,
                      "chosen": false,
@@ -3365,7 +3360,6 @@ explain select * from t1 where pk = 2 and a=5 and b=1	{
                      "access_type": "ref",
                      "index": "pk_a_b",
                      "used_range_estimates": true,
-                      "rowid_filter_skipped": "cost_factor <= 0",
                      "rows": 1,
                      "cost": 1.0043,
                      "chosen": true
@@ -3974,6 +3968,7 @@ explain delete t0,t1 from t0, t1 where t0.a=t1.a and t1.a<3	{
                "best_access_path": {
                  "considered_access_paths": [
                    {
+                      "rowid_filter_skipped": "cost_factor <= 0",
                      "access_type": "range",
                      "resulting_rows": 3,
                      "cost": 1.407,
@@ -4000,7 +3995,6 @@ explain delete t0,t1 from t0, t1 where t0.a=t1.a and t1.a<3	{
                          "index": "a",
                          "used_range_estimates": false,
                          "cause": "not better than ref estimates",
-                          "rowid_filter_skipped": "cost_factor <= 0",
                          "rows": 1,
                          "cost": 3.007,
                          "chosen": true
@@ -4030,6 +4024,7 @@ explain delete t0,t1 from t0, t1 where t0.a=t1.a and t1.a<3	{
                "best_access_path": {
                  "considered_access_paths": [
                    {
+                      "rowid_filter_skipped": "cost_factor <= 0",
                      "access_type": "range",
                      "resulting_rows": 3,
                      "cost": 1.407,
@@ -4056,7 +4051,6 @@ explain delete t0,t1 from t0, t1 where t0.a=t1.a and t1.a<3	{
                          "index": "a",
                          "used_range_estimates": false,
                          "cause": "not better than ref estimates",
-                          "rowid_filter_skipped": "worst/max seeks clipping",
                          "rows": 2,
                          "cost": 3.014,
                          "chosen": true
@@ -8069,7 +8063,6 @@ JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans'))
                                "index": "b",
                                "used_range_estimates": false,
                                "cause": "not available",
-                                "rowid_filter_skipped": "cost_factor <= 0",
                                "rows": 1,
                                "cost": 20,
                                "chosen": true
@@ -8273,7 +8266,6 @@ JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans'))
                                "index": "a",
                                "used_range_estimates": false,
                                "cause": "not available",
-                                "rowid_filter_skipped": "cost_factor <= 0",
                                "rows": 1,
                                "cost": 20,
                                "chosen": true
@@ -8341,7 +8333,6 @@ JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans'))
                                "index": "a",
                                "used_range_estimates": false,
                                "cause": "not available",
-                                "rowid_filter_skipped": "cost_factor <= 0",
                                "rows": 1,
                                "cost": 200,
                                "chosen": true

--- a/mysql-test/main/opt_trace_index_merge_innodb.result
+++ b/mysql-test/main/opt_trace_index_merge_innodb.result
@@ -208,7 +208,6 @@ explain select * from t1 where pk1 != 0  and key1 = 1	{
                      "access_type": "ref",
                      "index": "key1",
                      "used_range_estimates": true,
-                      "rowid_filter_skipped": "cost_factor <= 0",
                      "rows": 1,
                      "cost": 2,
                      "chosen": true

--- a/mysql-test/main/range.result
+++ b/mysql-test/main/range.result
@@ -281,7 +281,7 @@ INSERT INTO t1 VALUES
 (33,5),(33,5),(33,5),(33,5),(34,5),(35,5);
 EXPLAIN SELECT * FROM t1 WHERE a IN(1,2) AND b=5;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	a,b	a	5	NULL	2	Using index condition; Using where
+1	SIMPLE	t1	ref|filter	a,b	b|a	5|5	const	15 (5%)	Using where; Using rowid filter
 SELECT * FROM t1 WHERE a IN(1,2) AND b=5;
 a	b
 DROP TABLE t1;

--- a/mysql-test/main/rowid_filter.result
+++ b/mysql-test/main/rowid_filter.result
--- a/mysql-test/main/rowid_filter_innodb.result
+++ b/mysql-test/main/rowid_filter_innodb.result
--- a/mysql-test/main/select.result
+++ b/mysql-test/main/select.result
@@ -3744,7 +3744,7 @@ EXPLAIN SELECT * FROM t1
 WHERE ID_better=1 AND ID1_with_null IS NULL AND 
 (ID2_with_null=1 OR ID2_with_null=2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ref	idx1,idx2	idx2	4	const	2	Using where
+1	SIMPLE	t1	ref|filter	idx1,idx2	idx1|idx2	5|4	const	2 (1%)	Using index condition; Using where; Using rowid filter
 DROP TABLE t1;
 CREATE TABLE t1 (a INT, ts TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, KEY ts(ts));
 INSERT INTO t1 VALUES (30,"2006-01-03 23:00:00"), (31,"2006-01-03 23:00:00");

--- a/mysql-test/main/select_jcl6.result
+++ b/mysql-test/main/select_jcl6.result
@@ -3755,7 +3755,7 @@ EXPLAIN SELECT * FROM t1
 WHERE ID_better=1 AND ID1_with_null IS NULL AND 
 (ID2_with_null=1 OR ID2_with_null=2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ref	idx1,idx2	idx2	4	const	2	Using where
+1	SIMPLE	t1	ref|filter	idx1,idx2	idx1|idx2	5|4	const	2 (1%)	Using index condition; Using where; Using rowid filter
 DROP TABLE t1;
 CREATE TABLE t1 (a INT, ts TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, KEY ts(ts));
 INSERT INTO t1 VALUES (30,"2006-01-03 23:00:00"), (31,"2006-01-03 23:00:00");

--- a/mysql-test/main/select_pkeycache.result
+++ b/mysql-test/main/select_pkeycache.result
@@ -3744,7 +3744,7 @@ EXPLAIN SELECT * FROM t1
 WHERE ID_better=1 AND ID1_with_null IS NULL AND 
 (ID2_with_null=1 OR ID2_with_null=2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ref	idx1,idx2	idx2	4	const	2	Using where
+1	SIMPLE	t1	ref|filter	idx1,idx2	idx1|idx2	5|4	const	2 (1%)	Using index condition; Using where; Using rowid filter
 DROP TABLE t1;
 CREATE TABLE t1 (a INT, ts TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, KEY ts(ts));
 INSERT INTO t1 VALUES (30,"2006-01-03 23:00:00"), (31,"2006-01-03 23:00:00");

--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -1661,7 +1661,7 @@ Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1`
 # gives selectivity data
 explain extended select * from t1 where a in (17,51,5) and b=2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ref	b,a	b	5	const	58	2.90	Using where
+1	SIMPLE	t1	ref|filter	b,a	b|a	5|5	const	58 (3%)	2.90	Using where; Using rowid filter
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` where `test`.`t1`.`b` = 2 and `test`.`t1`.`a` in (17,51,5)
 drop table t1;

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7557,7 +7557,6 @@ best_access_path(JOIN      *join,
        rec= MATCHING_ROWS_IN_OTHER_TABLE;      // Fix for small tables

      Json_writer_object trace_access_idx(thd);
-      double eq_ref_rows= 0;
      /*
        full text keys require special treatment
      */
@@ -7596,8 +7595,7 @@ best_access_path(JOIN      *join,
            type= JT_EQ_REF;
            trace_access_idx.add("access_type", join_type_str[type])
                            .add("index", keyinfo->name);
-            eq_ref_rows= tmp = prev_record_reads(join_positions, idx,
-                                                 found_ref);
+            tmp = prev_record_reads(join_positions, idx, found_ref);
            records=1.0;
          }
          else
@@ -7904,28 +7902,7 @@ best_access_path(JOIN      *join,
          (s->table->file->index_flags(start_key->key,0,1) &
           HA_DO_RANGE_FILTER_PUSHDOWN))
      {
-        double rows;
-        if (type == JT_EQ_REF)
-        {
-          /*
-            Treat EQ_REF access in a special way:
-            1. We have no cost for index-only read. Assume its cost is 50% of
-               the cost of the full read.
-
-            2. A regular ref access will do #record_count lookups, but eq_ref
-               has "lookup cache" which reduces the number of lookups made.
-               The estimation code uses prev_record_reads() call to estimate:
-
-                tmp = prev_record_reads(join_positions, idx, found_ref);
-
-               Set the effective number of rows from "tmp" here.
-          */
-          keyread_tmp= COST_ADD(eq_ref_rows / 2, s->startup_cost);
-          rows= eq_ref_rows;
-        }
-        else
-          rows= record_count * records;
-
+        double rows= record_count * records;
        /*
          If we use filter F with selectivity s the the cost of fetching data
          by key using this filter will be
@@ -7947,46 +7924,53 @@ best_access_path(JOIN      *join,
             cost_of_fetching_1_row = tmp/rows
             cost_of_fetching_1_key_tuple = keyread_tmp/rows

-          access_cost_factor is the gain we expect for using rowid filter.
-          An access_cost_factor of 1.0 means that keyread_tmp is 0
-          (using key read is infinitely fast) and the gain for each row when
-          using filter is great.
-          An access_cost_factor if 0.0 means that using keyread has the
-          same cost as reading rows, so there is no gain to get with
-          filter.
-          access_cost_factor should never be bigger than 1.0 (if all
-          calculations are correct) as the cost of keyread should always be
-          smaller than the cost of fetching the same number of keys + rows.
-          access_cost_factor should also never be smaller than 0.0.
-          The one exception is if number of records is 1 (eq_ref), then
-          because we are comparing rows to cost of keyread_tmp, keyread_tmp
-          is higher by 1.0. This is a big that will be fixed in a later
-          version.
-
-          If we have limited the cost (=tmp) of reading rows with 'worst_seek'
-          we cannot use filters as the cost calculation below would cause
-          tmp to become negative.  The future resultion is to not limit
-          cost with worst_seek.
+          Here's a more detailed explanation that uses the formulas behind
+          the function call filter->get_adjusted_gain(). The function
+          takes as a parameter the number of probes/look-ups into the filter
+          that is equal to the number of fetched key entries that is equal to
+          the number of row fetches when no filter is used (assuming no
+          index condition pushdown is employed for the used key access).
+          Let this number be N. Then the total gain from using the filter is
+          N*a_adj - b where b is the cost of building the filter and
+          a_adj is calculated as follows:
+          a - (1-access_cost_factor)*(1-s) =
+          (1+1_cond_eval_cost)*(1-s)-1_probe_cost - (1-access_cost_factor)*(1-s)
+          =  (1-s)*(1_cond_eval_cost+access_cost_factor) - 1_probe_cost.
+          Here (1-s)*(1_cond_eval_cost) * N is the gain from checking fewer
+          conditions pushed into the table, 1_probe_cost*N is the cost of the
+          probes and (1-s) * access_cost_factor * N must be the gain from
+          accessing fewer rows.
+          It does not matter how we calculate the cost of N full row fetches
+            cost_of_fetching_N_rows or
+          how we calculate the cost of fetching N key entries
+            cost_of_fetching_N_key_entries
+          the gain from fewer row fetches will be
+          (cost_of_fetching_N_rows - cost_of_fetching_N_key_entries) * (1-s)
+          and this should be equal to (1-s) * access_cost_factor * N.
+          Thus access_cost_factor must be calculated as
+          (cost_of_fetching_N_rows - cost_of_fetching_N_key_entries) / N.
+
+          For safety we clip cost_of_fetching_N_key_entries by the value
+          of cost_of_fetching_N_rows, though formally it's not necessary.
 	*/
-        double access_cost_factor= MY_MIN((rows - keyread_tmp) / rows, 1.0);
-        if (!(records < s->worst_seeks &&
-              records <= thd->variables.max_seeks_for_key))
-          trace_access_idx.add("rowid_filter_skipped", "worst/max seeks clipping");
-        else if (access_cost_factor <= 0.0)
-          trace_access_idx.add("rowid_filter_skipped", "cost_factor <= 0");
-        else
+        /*
+          For eq_ref access we assume that the cost of fetching N key entries
+          is equal to half the cost of fetching N rows
+	*/
+        double key_access_cost=
+	       type == JT_EQ_REF ? 0.5 * tmp : MY_MIN(tmp, keyread_tmp);
+        double access_cost_factor= MY_MIN((tmp - key_access_cost) / rows, 1.0);
+
+        filter=
+          table->best_range_rowid_filter_for_partial_join(start_key->key,
+                                                          rows,
+                                                          access_cost_factor);
+        if (filter)
        {
-          filter=
-            table->best_range_rowid_filter_for_partial_join(start_key->key,
-                                                            rows,
-                                                            access_cost_factor);
-          if (filter)
-          {
-            tmp-= filter->get_adjusted_gain(rows) - filter->get_cmp_gain(rows);
-            DBUG_ASSERT(tmp >= 0);
-            trace_access_idx.add("rowid_filter_key",
+          tmp-= filter->get_adjusted_gain(rows) - filter->get_cmp_gain(rows);
+          DBUG_ASSERT(tmp >= 0);
+          trace_access_idx.add("rowid_filter_key",
                                 s->table->key_info[filter->key_no].name);
-          }
        }
      }
      trace_access_idx.add("rows", records).add("cost", tmp);
@@ -8139,19 +8123,19 @@ best_access_path(JOIN      *join,
        uint key_no= s->quick->index;

        /* See the comment concerning using rowid filter for with ref access */
-        keyread_tmp= s->table->quick_index_only_costs[key_no] * record_count;
-        double access_cost_factor= MY_MIN((rows - keyread_tmp) / rows, 1.0);
-        if (access_cost_factor > 0.0)
+        double row_access_cost= s->quick->read_time * record_count;
+        double key_access_cost=
+	  MY_MIN(row_access_cost,
+                 s->table->quick_index_only_costs[key_no] * record_count);
+        double access_cost_factor= MY_MIN((row_access_cost - key_access_cost) /
+                                          rows, 1.0);
+        filter=
+         s->table->best_range_rowid_filter_for_partial_join(key_no, rows,
+                                                            access_cost_factor);
+        if (filter)
        {
-          filter=
-            s->table->
-            best_range_rowid_filter_for_partial_join(key_no, rows,
-                                                     access_cost_factor);
-          if (filter)
-          {
-            tmp-= filter->get_adjusted_gain(rows);
-            DBUG_ASSERT(tmp >= 0);
-          }
+          tmp-= filter->get_adjusted_gain(rows);
+          DBUG_ASSERT(tmp >= 0);
        }
        else
          trace_access_scan.add("rowid_filter_skipped", "cost_factor <= 0");