Commit ad842b5f authored by Sergey Petrunya's avatar Sergey Petrunya

MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong

[Attempt #2]
- Use a new selectivity calculation formula in Histogram::point_selectivity. 
  The formula is different from the old one because it was developed from scratch.
  it doesn't have any possible division-by-zero problems.
parent e59dec03
drop table if exists t1,t2,t3; drop table if exists t0,t1,t2,t3;
select @@global.use_stat_tables; select @@global.use_stat_tables;
@@global.use_stat_tables @@global.use_stat_tables
COMPLEMENTARY COMPLEMENTARY
...@@ -826,7 +826,7 @@ flush table t1; ...@@ -826,7 +826,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4; set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0; explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where 1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
Warnings: Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
drop table t1; drop table t1;
...@@ -1308,15 +1308,54 @@ test.t2 analyze status OK ...@@ -1308,15 +1308,54 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column: # The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180); explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where 1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings: Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180)) Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
explain extended select * from t2 where col1 IN (180, 20); explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where 1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings: Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20)) Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
drop table t1, t2; drop table t1, t2;
#
# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
#
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(a int);
insert into t1 select A.a from t0 A, t0 B, t0 C;
set histogram_size=20;
set histogram_type='single_prec_hb';
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status OK
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
# Should select about 10%:
explain extended select * from t1 where a=2;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
# Should select about 10%:
explain extended select * from t1 where a=1;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
# Must not have filtered=100%:
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
# Again, must not have filtered=100%:
explain extended select * from t1 where a=-1;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
drop table t0, t1;
set histogram_type=@save_histogram_type; set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size; set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
......
SET SESSION STORAGE_ENGINE='InnoDB'; SET SESSION STORAGE_ENGINE='InnoDB';
set @save_optimizer_switch_for_selectivity_test=@@optimizer_switch; set @save_optimizer_switch_for_selectivity_test=@@optimizer_switch;
set optimizer_switch='extended_keys=on'; set optimizer_switch='extended_keys=on';
drop table if exists t1,t2,t3; drop table if exists t0,t1,t2,t3;
select @@global.use_stat_tables; select @@global.use_stat_tables;
@@global.use_stat_tables @@global.use_stat_tables
COMPLEMENTARY COMPLEMENTARY
...@@ -835,7 +835,7 @@ flush table t1; ...@@ -835,7 +835,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4; set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0; explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where 1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
Warnings: Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
drop table t1; drop table t1;
...@@ -1318,15 +1318,54 @@ test.t2 analyze status OK ...@@ -1318,15 +1318,54 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column: # The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180); explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where 1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings: Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180)) Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
explain extended select * from t2 where col1 IN (180, 20); explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where 1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings: Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20)) Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
drop table t1, t2; drop table t1, t2;
#
# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
#
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(a int);
insert into t1 select A.a from t0 A, t0 B, t0 C;
set histogram_size=20;
set histogram_type='single_prec_hb';
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status OK
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
# Should select about 10%:
explain extended select * from t1 where a=2;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
# Should select about 10%:
explain extended select * from t1 where a=1;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
# Must not have filtered=100%:
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
# Again, must not have filtered=100%:
explain extended select * from t1 where a=-1;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
drop table t0, t1;
set histogram_type=@save_histogram_type; set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size; set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
......
--source include/have_stat_tables.inc --source include/have_stat_tables.inc
--disable_warnings --disable_warnings
drop table if exists t1,t2,t3; drop table if exists t0,t1,t2,t3;
--enable_warnings --enable_warnings
select @@global.use_stat_tables; select @@global.use_stat_tables;
...@@ -885,6 +885,29 @@ explain extended select * from t2 where col1 IN (180, 20); ...@@ -885,6 +885,29 @@ explain extended select * from t2 where col1 IN (180, 20);
drop table t1, t2; drop table t1, t2;
--echo #
--echo # MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
--echo #
create table t0(a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(a int);
insert into t1 select A.a from t0 A, t0 B, t0 C;
set histogram_size=20;
set histogram_type='single_prec_hb';
analyze table t1 persistent for all;
set use_stat_tables='preferably';
set optimizer_use_condition_selectivity=4;
--echo # Should select about 10%:
explain extended select * from t1 where a=2;
--echo # Should select about 10%:
explain extended select * from t1 where a=1;
--echo # Must not have filtered=100%:
explain extended select * from t1 where a=0;
--echo # Again, must not have filtered=100%:
explain extended select * from t1 where a=-1;
drop table t0, t1;
set histogram_type=@save_histogram_type; set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size; set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
......
...@@ -113,7 +113,7 @@ class Histogram ...@@ -113,7 +113,7 @@ class Histogram
private: private:
Histogram_type type; Histogram_type type;
uint8 size; uint8 size; /* Size of values array, in bytes */
uchar *values; uchar *values;
uint prec_factor() uint prec_factor()
...@@ -142,6 +142,7 @@ class Histogram ...@@ -142,6 +142,7 @@ class Histogram
private: private:
uint get_value(uint i) uint get_value(uint i)
{ {
DBUG_ASSERT(i < get_width());
switch (type) { switch (type) {
case SINGLE_PREC_HB: case SINGLE_PREC_HB:
return (uint) (((uint8 *) values)[i]); return (uint) (((uint8 *) values)[i]);
...@@ -150,7 +151,7 @@ class Histogram ...@@ -150,7 +151,7 @@ class Histogram
} }
return 0; return 0;
} }
/* Find the bucket which value 'pos' falls into. */
uint find_bucket(double pos, bool first) uint find_bucket(double pos, bool first)
{ {
uint val= (uint) (pos * prec_factor()); uint val= (uint) (pos * prec_factor());
...@@ -169,6 +170,10 @@ class Histogram ...@@ -169,6 +170,10 @@ class Histogram
else else
break; break;
} }
if (val > get_value(i))
i++;
if (val == get_value(i)) if (val == get_value(i))
{ {
if (first) if (first)
...@@ -235,23 +240,88 @@ class Histogram ...@@ -235,23 +240,88 @@ class Histogram
return sel; return sel;
} }
/*
Estimate selectivity of "col=const" using a histogram
@param pos Position of the "const" between column's min_value and
max_value. This is a number in [0..1] range.
@param avg_sel Average selectivity of condition "col=const" in this table.
It is calcuated as (#non_null_values / #distinct_values).
@return
Expected condition selectivity (a number between 0 and 1)
*/
double point_selectivity(double pos, double avg_sel) double point_selectivity(double pos, double avg_sel)
{ {
double sel; double sel;
double bucket_sel= 1.0/(get_width() + 1); /* Find the bucket that contains the value 'pos'. */
uint min= find_bucket(pos, TRUE); uint min= find_bucket(pos, TRUE);
uint pos_value= (uint) (pos * prec_factor());
/* Find how many buckets this value occupies */
uint max= min; uint max= min;
while (max + 1 < get_width() && get_value(max + 1) == get_value(max)) while (max + 1 < get_width() && get_value(max + 1) == pos_value)
max++; max++;
if (max > min)
{
/*
The value occupies multiple buckets. Use start_bucket ... end_bucket as
selectivity.
*/
double bucket_sel= 1.0/(get_width() + 1);
sel= bucket_sel * (max - min + 1);
}
else
{
/*
The value 'pos' fits within one single histogram bucket.
Histogram buckets have the same numbers of rows, but they cover
different ranges of values.
We assume that values are uniformly distributed across the [0..1] value
range.
*/
/*
If all buckets covered value ranges of the same size, the width of
value range would be:
*/
double avg_bucket_width= 1.0 / (get_width() + 1);
/*
Let's see what is the width of value range that our bucket is covering.
(min==max currently. they are kept in the formula just in case we
will want to extend it to handle multi-bucket case)
*/
double inv_prec_factor= (double) 1.0 / prec_factor(); double inv_prec_factor= (double) 1.0 / prec_factor();
double width= (max + 1 == get_width() ? double current_bucket_width=
1.0 : get_value(max) * inv_prec_factor) - (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
(min == 0 ? (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
0.0 : get_value(min-1) * inv_prec_factor);
sel= avg_sel * (bucket_sel * (max + 1 - min)) / width; /*
So:
- each bucket has the same #rows
- values are unformly distributed across the [min_value,max_value] domain.
If a bucket has value range that's N times bigger then average, than
each value will have to have N times fewer rows than average.
*/
DBUG_ASSERT(current_bucket_width);
sel= avg_sel * avg_bucket_width / current_bucket_width;
/*
(Q: if we just follow this proportion we may end up in a situation
where number of different values we expect to find in this bucket
exceeds the number of rows that this histogram has in a bucket. Are
we ok with this or we would want to have certain caps?)
*/
}
return sel; return sel;
} }
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment