Revert these two changes (wrong push) :

MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value - Made a number of fixes in table_cond_selectivity() so that it returns correct selectivity estimates. - Added comments in related code. Better comments

Revert these two changes (wrong push) :
MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value - Made a number of fixes in table_cond_selectivity() so that it returns correct selectivity estimates. - Added comments in related code. Better comments
182f7d76 · Sergey Petrunya · 46d1b845 · 182f7d76 · 182f7d76 · 182f7d76
Commit 182f7d76 authored Apr 28, 2014 by Sergey Petrunya
4 changed files
--- a/mysql-test/r/selectivity_no_engine.result
+++ b/mysql-test/r/selectivity_no_engine.result
@@ -139,118 +139,6 @@ Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
 drop table t2, t1;
 # 
-# MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
-# 
-create table t0(a int);
-insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t1(key1 int, col1 int, key(key1));
-insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t1 persistent for all;
-Table	Op	Msg_type	Msg_text
-test.t1	analyze	status	Engine-independent statistics collected
-test.t1	analyze	status	Table is already up to date
-# 10% is ok
-explain extended select * from t1 where col1=2;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
-Warnings:
-Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` = 2)
-# Must show 100%, not 10%
-explain extended select * from t1 where key1=2;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ref	key1	key1	5	const	98	100.00	
-Warnings:
-Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`key1` = 2)
-drop table t0, t1;
-# 
-# MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
-# 
-create table t1(a int);
-insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t2(a int);
-insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
-create table t3 as select * from t2;
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t2 persistent for all;
-Table	Op	Msg_type	Msg_text
-test.t2	analyze	status	Engine-independent statistics collected
-test.t2	analyze	status	OK
-analyze table t3 persistent for all;
-Table	Op	Msg_type	Msg_text
-test.t3	analyze	status	Engine-independent statistics collected
-test.t3	analyze	status	OK
-explain extended select * from t2 A where A.a < 40;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
-Warnings:
-Note	1003	select `test`.`A`.`a` AS `a` from `test`.`t2` `A` where (`test`.`A`.`a` < 40)
-explain extended select * from t3 B where B.a < 100;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
-Warnings:
-Note	1003	select `test`.`B`.`a` AS `a` from `test`.`t3` `B` where (`test`.`B`.`a` < 100)
-explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
-1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where; Using join buffer (flat, BNL join)
-Warnings:
-Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`A`.`a` < 40) and (`test`.`B`.`a` < 100))
-explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
-1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where; Using join buffer (flat, BNL join)
-Warnings:
-Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`B`.`a` = `test`.`A`.`a`) and (`test`.`A`.`a` < 40) and (`test`.`A`.`a` < 100))
-drop table t1, t2, t3;
-select 1;
-1
-1
-# 
-# MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
-# 
-create table t0(a int);
-insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t1 (
-kp1 int, kp2 int, 
-filler1 char(100),
-filler2 char(100),
-key(kp1, kp2)
-);
-insert into t1 
-select 
-A.a,
-B.a,
-'filler-data-1',
-'filler-data-2'
-from t0 A, t0 B, t0 C;
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t1 persistent for all;
-Table	Op	Msg_type	Msg_text
-test.t1	analyze	status	Engine-independent statistics collected
-test.t1	analyze	status	Table is already up to date
-# NOTE: 10*100%, 10*100% rows is ok
-explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
-1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,func	10	100.00	Using index condition
-Warnings:
-Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1)))
-# NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
-explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
-id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
-1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,const	10	100.00	
-Warnings:
-Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4))
-drop table t0, t1;
-# 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;

--- a/mysql-test/t/selectivity_no_engine.test
+++ b/mysql-test/t/selectivity_no_engine.test
@@ -101,78 +101,6 @@ analyze table t2 persistent for all;
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);

 drop table t2, t1;
-
--echo # 
--echo # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
--echo # 
-create table t0(a int);
-insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t1(key1 int, col1 int, key(key1));
-insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
-
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t1 persistent for all;
--echo # 10% is ok
-explain extended select * from t1 where col1=2;
--echo # Must show 100%, not 10%
-explain extended select * from t1 where key1=2;
-drop table t0, t1;
-
-##--disable_parsing
--echo # 
--echo # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
--echo # 
-create table t1(a int);
-insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t2(a int);
-insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
-create table t3 as select * from t2;
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t2 persistent for all;
-analyze table t3 persistent for all;
-explain extended select * from t2 A where A.a < 40;
-explain extended select * from t3 B where B.a < 100;
-explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
-explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
-drop table t1, t2, t3;
-## --enable_parsing
-select 1;
-##--disable_parsing
--echo # 
--echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
--echo # 
-create table t0(a int);
-insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
-create table t1 (
-  kp1 int, kp2 int, 
-  filler1 char(100),
-  filler2 char(100),
-  key(kp1, kp2)
-);
-
-insert into t1 
-select 
-  A.a,
-  B.a,
-  'filler-data-1',
-  'filler-data-2'
-from t0 A, t0 B, t0 C;
-set histogram_size=100;
-set use_stat_tables='preferably';
-set optimizer_use_condition_selectivity=4;
-analyze table t1 persistent for all;
--echo # NOTE: 10*100%, 10*100% rows is ok
-explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
-
--echo # NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
-explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
-drop table t0, t1;
-##--enable_parsing
-
 --echo # 
 --echo # End of the test file
 --echo # 

--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -3406,10 +3406,6 @@ double records_in_column_ranges(PARAM *param, uint idx,
       selectivity (this is used for conditions like "column LIKE '%val%'" 
       where approaches #1 and #2 do not provide selectivity data).

-  SEE ALSO
-     table_cond_selectivity()
-     matching_candidates_in_table()
-
  NOTE
    Currently the selectivities of range conditions over different columns are
    considered independent. 

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7135,15 +7135,6 @@ double JOIN::get_examined_rows()
  @param rem_tables The bitmap of tables to be joined later
  @param keyparts   The number of key parts to used when joining s
  @param ref_keyuse_steps Array of references to keyuses employed to join s 
-
-  @detail
-  Basic idea: if the WHERE clause has an equality in form
-
-    tbl.column= ...
-
-  then this condition will have selectivity 1/#distinct_values(tbl.column),
-  unless the equality was used by ref access. If the equality is used by ref
-  access, we only get rows that satisfy it, and so its selectivity=1.
 */

 static 
@@ -7174,18 +7165,8 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      the current value of sel by this selectivity
    */
    table_map used_tables= item_equal->used_tables();
-
-    /* 
-      Equalities that do not include fields in this table do not matter 
-    */
    if (!(used_tables & table_bit))
      continue;
-
-    /* 
-      Equalities that include a constant are taken into account in
-      table->cond_selectivity. Selectivity from there is taken into account 
-      in matching_candidates_in_table() and/or table_cond_selectivity(). 
-    */
    if (item_equal->get_const())
      continue;

@@ -7197,23 +7178,14 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      Field *fld= fi.get_curr_field();
      if (fld->table->map != table_bit)
        continue;
-
      if (pos->key == 0)
-      {
-        /* 
-          No ref access used (and no const in the multi-equality). We will
-          need to adjust the selectivity.
-        */
        adjust_sel= TRUE;
-      }
      else
      {
-        /* Ok, [eq]ref access is used */
        uint i;
        KEYUSE *keyuse= pos->key;
        uint key= keyuse->key;

-        /* Find which keypart participates in the equality */
        for (i= 0; i < keyparts; i++)
 	{
          uint fldno;
@@ -7224,7 +7196,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
          if (fld->field_index == fldno)
            break;
        }
-
        if (i == keyparts)
 	{
          /* 
@@ -7250,7 +7221,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
        }          
      }
    }
-
    if (adjust_sel)
    {
      /* 
@@ -7296,10 +7266,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,

    For other access methods, we need to calculate selectivity of the whole
    condition, "COND(this_table) AND COND(this_table, previous_tables)".
-  
-  @seealso
-    calculate_cond_selectivity_for_table()
-    matching_candidates_in_table()

  @retval
    selectivity of the conditions imposed on the rows of s
@@ -7310,90 +7276,34 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
                              table_map rem_tables)
 {
  uint16 ref_keyuse_steps[MAX_REF_PARTS - 1];
+  Field *field;
  TABLE *table= s->table;
-  double sel;
+  MY_BITMAP *read_set= table->read_set;
+  double sel= s->table->cond_selectivity;
  POSITION *pos= &join->positions[idx];
  uint keyparts= 0;
  uint found_part_ref_or_null= 0;

  if (pos->key != 0)
  {
-    sel= s->table->cond_selectivity;
    /* 
-      A ref access or hash join is used for this table. ref access is created
-      from
+      A ref access or hash join is used for this table.

-        tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ...
-      
-      and it will only return rows for which this condition is satisified.
-      Suppose, certain expr{i} is a constant. Since ref access only returns
-      rows that satisfy
-        
-         tbl.keypart{i}=const       (*)
-
-      then selectivity of this equality should not be counted in return value 
-      of this function. This function uses the value of 
-       
-         table->cond_selectivity=selectivity(COND(tbl)) (**)
-      
-      as a starting point. This value includes selectivity of equality (*). We
-      should somehow discount it. 
-      
-      Looking at calculate_cond_selectivity_for_table(), one can see that that
-      the value is not necessarily a direct multiplicand in 
-      table->cond_selectivity
-
-      There are three possible ways to discount
-      1. There is a potential range access on t.keypart{i}=const. 
-         (an important special case: the used ref access has a const prefix for
-          which a range estimate is available)
+      It could have some parts with "t.key_part=const". Using ref access
+      means that we will only get records where the condition holds, so we
+      should remove its selectivity from the condition selectivity.
      
-      2. The field has a histogram. field[x]->cond_selectivity has the data.
-      
-      3. Use index stats on this index:
-         rec_per_key[key_part+1]/rec_per_key[key_part]
-
      (TODO: more details about the "t.key=othertable.col" case)
    */
    KEYUSE *keyuse= pos->key;
    KEYUSE *prev_ref_keyuse= keyuse;
    uint key= keyuse->key;
-    
-    /*
-      Check if we have a prefix of key=const that matches a quick select.
-    */
-    if (!is_hash_join_key_no(key))
-    {
-      table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1;
-      if (table->quick_rows[key] && 
-          !(quick_key_map & ~table->const_key_parts[key]))
-      {
-        /* 
-          Ok, there is an equality for each of the key parts used by the
-          quick select. This means, quick select's estimate can be reused to
-          discount the selectivity of a prefix of a ref access.
-        */
-        for (; quick_key_map & 1 ; quick_key_map>>= 1)
-        {
-          while (keyuse->keypart == keyparts)
-            keyuse++;
-          keyparts++;
-        }
-        sel /= table->quick_rows[key] / table->stat_records();
-      }
-    }
-    
-    /*
-      Go through the "keypart{N}=..." equalities and find those that were
-      already taken into account in table->cond_selectivity.
-    */
-    while (keyuse->table == table && keyuse->key == key)
+    do
    {
      if (!(keyuse->used_tables & (rem_tables | table->map)))
      {
        if (are_tables_local(s, keyuse->val->used_tables()))
-	{                            /// ^^ why val->used_tables here but just
-                                     ///       used_tables above?
+	{
          if (is_hash_join_key_no(key))
 	  {
            if (keyparts == keyuse->keypart)
@@ -7402,39 +7312,22 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
          else
 	  {
            if (keyparts == keyuse->keypart &&
-                !((keyuse->val->used_tables()) & ~pos->ref_depend_map) &&
+                !(~(keyuse->val->used_tables()) & pos->ref_depend_map) &&
                !(found_part_ref_or_null & keyuse->optimize))
 	    {
-              /* Found a KEYUSE object that will be used by ref access */
              keyparts++;
              found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
            }
          }
-
          if (keyparts > keyuse->keypart)
 	  {
-            /* Ok this is the keyuse that will be used for ref access */
            uint fldno;
            if (is_hash_join_key_no(key))
 	      fldno= keyuse->keypart;
            else
              fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1;
            if (keyuse->val->const_item())
-            {              
-              // psergey: not multiply, divide instead.
-              // before, we've had:
-              // sel*= table->field[fldno]->cond_selectivity; 
-
-              sel /= table->field[fldno]->cond_selectivity;
-              /* 
-               TODO: we could do better here:
-                 1. cond_selectivity might be =1 (the default) because quick 
-                    select on some index prevented us from analyzing 
-                    histogram for this column.
-                 2. we could get an estimate through this?
-                     rec_per_key[key_part-1] / rec_per_key[key_part]
-              */
-            }
+              sel*= table->field[fldno]->cond_selectivity; 
            if (keyparts > 1)
 	    {
              ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse;
@@ -7444,59 +7337,23 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
 	}
      }
      keyuse++;
-    }
+    } while (keyuse->table == table && keyuse->key == key);
  }
  else
  {
    /*
      The table is accessed with full table scan, or quick select.
-      Selectivity of COND(this_table) is already accounted for in 
+      Selectivity of COND(table) is already accounted for in 
      matching_candidates_in_table().
-      For COND(this_table, previous_tables) we don't have any meaningful
-      estimates.
    */
    sel= 1;
  }

-  /*
-    Selectivity and multiple equalities. Consider an example:
-
-      select * from t1, t2 where t1.col=t2.col and t2.col<5
-    
-    Suppose the join order is t1, t2. When equality propagation is used, we 
-    get:
-
-      t1:  t1.col<5
-      t2:  t2.col<5  // not generated: AND t2.col=t1.col
-
-    if we use ref access on table t2, we will not get records for which
-    "t2.col<5"
-    
-      when we get to table t2, we will not get records that have "t2.col < 5"
-
-     COND(t2) = "t2.col<5" 
-
-      ## a variant with key:
-      select * from t1, t2 where t1.col=t2.col and t2.col<5 and t2.key=t1.col2
-
+  /* 
    If the field f from the table is equal to a field from one the
    earlier joined tables then the selectivity of the range conditions
    over the field f must be discounted.
-
-
-
-    Suppose, we're now looking at selectivity for table t2.
-    - in case t2 uses full table scan (or quick select): all selectivity is
-      already accounted for in matching_candidates_in_table().
-    - in case t2 uses ref access
-       = if the equality is used for ref access, we have already 
-         discounted its selectivity above
-         (However, we have not discounted selectivity of the induced
-         equalities)
-       = if the equality is not used for ref access, we should still count its
-         selectivity.
  */ 
-#if 0
  for (Field **f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
  {
    if (!bitmap_is_set(read_set, field->field_index) ||
@@ -7514,7 +7371,6 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      }
    }
  }
-#endif 

  sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables,
                                        keyparts, ref_keyuse_steps);