Commit deb26989 authored by Sergey Petrunya's avatar Sergey Petrunya

MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%

MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
- Made a number of fixes in table_cond_selectivity() so that it returns
  correct selectivity estimates.
- Added comments in related code.
parent 872a01b6
......@@ -139,6 +139,118 @@ Warnings:
Note 1003 select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
drop table t2, t1;
#
# MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
#
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(key1 int, col1 int, key(key1));
insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
# 10% is ok
explain extended select * from t1 where col1=2;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.90 Using where
Warnings:
Note 1003 select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` = 2)
# Must show 100%, not 10%
explain extended select * from t1 where key1=2;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ref key1 key1 5 const 98 100.00
Warnings:
Note 1003 select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`key1` = 2)
drop table t0, t1;
#
# MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
#
create table t1(a int);
insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t2(a int);
insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
create table t3 as select * from t2;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t2 persistent for all;
Table Op Msg_type Msg_text
test.t2 analyze status Engine-independent statistics collected
test.t2 analyze status OK
analyze table t3 persistent for all;
Table Op Msg_type Msg_text
test.t3 analyze status Engine-independent statistics collected
test.t3 analyze status OK
explain extended select * from t2 A where A.a < 40;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where
Warnings:
Note 1003 select `test`.`A`.`a` AS `a` from `test`.`t2` `A` where (`test`.`A`.`a` < 40)
explain extended select * from t3 B where B.a < 100;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE B ALL NULL NULL NULL NULL 1000 9.90 Using where
Warnings:
Note 1003 select `test`.`B`.`a` AS `a` from `test`.`t3` `B` where (`test`.`B`.`a` < 100)
explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where
1 SIMPLE B ALL NULL NULL NULL NULL 1000 9.90 Using where; Using join buffer (flat, BNL join)
Warnings:
Note 1003 select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`A`.`a` < 40) and (`test`.`B`.`a` < 100))
explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where
1 SIMPLE B ALL NULL NULL NULL NULL 1000 4.95 Using where; Using join buffer (flat, BNL join)
Warnings:
Note 1003 select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`B`.`a` = `test`.`A`.`a`) and (`test`.`A`.`a` < 40) and (`test`.`A`.`a` < 100))
drop table t1, t2, t3;
select 1;
1
1
#
# MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
#
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1 (
kp1 int, kp2 int,
filler1 char(100),
filler2 char(100),
key(kp1, kp2)
);
insert into t1
select
A.a,
B.a,
'filler-data-1',
'filler-data-2'
from t0 A, t0 B, t0 C;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
# NOTE: 10*100%, 10*100% rows is ok
explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t0 ALL NULL NULL NULL NULL 10 100.00 Using where
1 SIMPLE t1 ref kp1 kp1 10 test.t0.a,func 10 100.00 Using index condition
Warnings:
Note 1003 select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1)))
# NOTE: t0: 10*100% is ok, t1: 10*9.90% is bad. t1 should have 10*100%.
explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t0 ALL NULL NULL NULL NULL 10 100.00 Using where
1 SIMPLE t1 ref kp1 kp1 10 test.t0.a,const 10 100.00
Warnings:
Note 1003 select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4))
drop table t0, t1;
#
# End of the test file
#
set use_stat_tables= @save_use_stat_tables;
......
......@@ -101,6 +101,78 @@ analyze table t2 persistent for all;
explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
drop table t2, t1;
--echo #
--echo # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
--echo #
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(key1 int, col1 int, key(key1));
insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t1 persistent for all;
--echo # 10% is ok
explain extended select * from t1 where col1=2;
--echo # Must show 100%, not 10%
explain extended select * from t1 where key1=2;
drop table t0, t1;
##--disable_parsing
--echo #
--echo # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
--echo #
create table t1(a int);
insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t2(a int);
insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
create table t3 as select * from t2;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t2 persistent for all;
analyze table t3 persistent for all;
explain extended select * from t2 A where A.a < 40;
explain extended select * from t3 B where B.a < 100;
explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
drop table t1, t2, t3;
## --enable_parsing
select 1;
##--disable_parsing
--echo #
--echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
--echo #
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1 (
kp1 int, kp2 int,
filler1 char(100),
filler2 char(100),
key(kp1, kp2)
);
insert into t1
select
A.a,
B.a,
'filler-data-1',
'filler-data-2'
from t0 A, t0 B, t0 C;
set histogram_size=100;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
analyze table t1 persistent for all;
--echo # NOTE: 10*100%, 10*100% rows is ok
explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
--echo # NOTE: t0: 10*100% is ok, t1: 10*9.90% is bad. t1 should have 10*100%.
explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
drop table t0, t1;
##--enable_parsing
--echo #
--echo # End of the test file
--echo #
......
......@@ -3406,6 +3406,10 @@ double records_in_column_ranges(PARAM *param, uint idx,
selectivity (this is used for conditions like "column LIKE '%val%'"
where approaches #1 and #2 do not provide selectivity data).
SEE ALSO
table_cond_selectivity()
matching_candidates_in_table()
NOTE
Currently the selectivities of range conditions over different columns are
considered independent.
......
......@@ -7165,8 +7165,18 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
the current value of sel by this selectivity
*/
table_map used_tables= item_equal->used_tables();
/*
Equalities that do not include fields in this table do not matter
*/
if (!(used_tables & table_bit))
continue;
/*
Equalities that include a constant are taken into account in
table->cond_selectivity. Selectivity from there is taken into account
in matching_candidates_in_table() and/or table_cond_selectivity().
*/
if (item_equal->get_const())
continue;
......@@ -7178,14 +7188,23 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
Field *fld= fi.get_curr_field();
if (fld->table->map != table_bit)
continue;
if (pos->key == 0)
{
/*
No ref access used (and no const in the multi-equality). We will
need to adjust the selectivity.
*/
adjust_sel= TRUE;
}
else
{
/* Ok, [eq]ref access is used */
uint i;
KEYUSE *keyuse= pos->key;
uint key= keyuse->key;
/* Find which keypart participates in the equality */
for (i= 0; i < keyparts; i++)
{
uint fldno;
......@@ -7196,6 +7215,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
if (fld->field_index == fldno)
break;
}
if (i == keyparts)
{
/*
......@@ -7221,6 +7241,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
}
}
}
if (adjust_sel)
{
/*
......@@ -7266,6 +7287,10 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
For other access methods, we need to calculate selectivity of the whole
condition, "COND(this_table) AND COND(this_table, previous_tables)".
@seealso
calculate_cond_selectivity_for_table()
matching_candidates_in_table()
@retval
selectivity of the conditions imposed on the rows of s
......@@ -7276,9 +7301,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
table_map rem_tables)
{
uint16 ref_keyuse_steps[MAX_REF_PARTS - 1];
Field *field;
TABLE *table= s->table;
MY_BITMAP *read_set= table->read_set;
double sel= s->table->cond_selectivity;
POSITION *pos= &join->positions[idx];
uint keyparts= 0;
......@@ -7287,23 +7310,79 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
if (pos->key != 0)
{
/*
A ref access or hash join is used for this table.
A ref access or hash join is used for this table. ref access is created
from
tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ...
and it will only return rows for which this condition is satisified.
Suppose, certain expr{i} is a constant. Since ref access only returns
rows that satisfy
tbl.keypart{i}=const (*)
then selectivity of this equality should not be counted in return value
of this function. This function uses the value of
table->cond_selectivity=selectivity(COND(tbl))
as a starting point. This value includes selectivity of equality (*). We
should somehow discount it.
Looking at calculate_cond_selectivity_for_table(), one can see that that
value is not necessarily a direct multiplicand in table->cond_selectivity
It could have some parts with "t.key_part=const". Using ref access
means that we will only get records where the condition holds, so we
should remove its selectivity from the condition selectivity.
There are three possible ways to discount
1. There is a range access on t.keypart{i}=const.
(an important special case: multi-keypart ref(const) access)
2. The field has a histogram. field[x]->cond_selectivity has the data.
3. Use index stats on this index:
rec_per_key[key_part+1]/rec_per_key[key_part]
(TODO: more details about the "t.key=othertable.col" case)
*/
KEYUSE *keyuse= pos->key;
KEYUSE *prev_ref_keyuse= keyuse;
uint key= keyuse->key;
do
/*
Check if we have a prefix of key=const that matches a quick select.
*/
if (!is_hash_join_key_no(key))
{
table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1;
if (table->quick_rows[key] &&
!(quick_key_map & ~table->const_key_parts[key]))
{
/*
Ok, there is an equality for each of the key parts used by the
quick select. This means, quick select's estimate can be reused to
discount the selectivity of a prefix of a ref access.
*/
for (; quick_key_map & 1 ; quick_key_map>>= 1)
{
while (keyuse->keypart == keyparts)
keyuse++;
keyparts++;
}
sel /= table->quick_rows[key] / table->stat_records();
}
}
/*
Go through the "keypart{N}=..." equalities and find those that were
already taken into account in table->cond_selectivity.
*/
//do
while (keyuse->table == table && keyuse->key == key)
{
if (!(keyuse->used_tables & (rem_tables | table->map)))
{
if (are_tables_local(s, keyuse->val->used_tables()))
{
{ /// ^^ why val->used_tables here but just
/// used_tables above?
if (is_hash_join_key_no(key))
{
if (keyparts == keyuse->keypart)
......@@ -7312,22 +7391,40 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
else
{
if (keyparts == keyuse->keypart &&
!(~(keyuse->val->used_tables()) & pos->ref_depend_map) &&
!((keyuse->val->used_tables()) & ~pos->ref_depend_map) &&
!(found_part_ref_or_null & keyuse->optimize))
{
/* Found a KEYUSE object that will be used by ref access */
keyparts++;
found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
}
}
if (keyparts > keyuse->keypart)
{
/* Ok this is the keyuse that will be used for ref access */
uint fldno;
if (is_hash_join_key_no(key))
fldno= keyuse->keypart;
else
fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1;
if (keyuse->val->const_item())
sel*= table->field[fldno]->cond_selectivity;
{
// psergey: not multiply, divide instead.
// before, we've had:
// sel*= table->field[fldno]->cond_selectivity;
sel /= table->field[fldno]->cond_selectivity;
/*
TODO: we could do better here:
1. cond_selectivity might be =1 (the default) because quick
select on some index prevented us from analyzing
histogram for this column.
2. we could get an estimate through this?
rec_per_key[key_part-1] / rec_per_key[key_part]
*/
}
if (keyparts > 1)
{
ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse;
......@@ -7337,14 +7434,16 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
}
}
keyuse++;
} while (keyuse->table == table && keyuse->key == key);
}
}
else
{
/*
The table is accessed with full table scan, or quick select.
Selectivity of COND(table) is already accounted for in
Selectivity of COND(this_table) is already accounted for in
matching_candidates_in_table().
For COND(this_table, previous_tables) we don't have any meaningful
estimates.
*/
sel= 1;
}
......@@ -7353,7 +7452,30 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
If the field f from the table is equal to a field from one the
earlier joined tables then the selectivity of the range conditions
over the field f must be discounted.
psergey: I think this is wrong. Example:
## no keys used
select * from t1, t2 where t1.col=t2.col and t2.col<5
## a variant with key:
select * from t1, t2 where t1.col=t2.col and t2.col<5 and t2.key=t1.col2
suppose the join order is t1, t2. Attached conditions:
t1: t2.col<3
t2: [t2.col=t1.col] AND t2.col<3
Suppose, we're now looking at selectivity for table t2.
- in case t2 uses full table scan (or quick select): all selectivity is
already accounted for in matching_candidates_in_table(). [YES. CHECKED]
- in case t2 uses ref access
= if the equality is used for ref access, we have already
discounted its selectivity above.
= if the equality is not used for ref access, we should still count its
selectivity.
*/
#if 0
for (Field **f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
{
if (!bitmap_is_set(read_set, field->field_index) ||
......@@ -7371,6 +7493,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
}
}
}
#endif
sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables,
keyparts, ref_keyuse_steps);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment