Commit 76442927 authored by Vicențiu Ciorbaru's avatar Vicențiu Ciorbaru

Implement avg_frequency unsmoothed jacknife estimator

When sampling data through ANALYZE TABLE, use the estimator to get a
better estimation of avg_frequency instead of just using the raw sampled data.
parent f0773b78
...@@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type; ...@@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
# #
set @save_use_stat_tables=@@use_stat_tables; set @save_use_stat_tables=@@use_stat_tables;
set @save_analyze_sample_percentage=@@analyze_sample_percentage; set @save_analyze_sample_percentage=@@analyze_sample_percentage;
set @save_hist_size=@@histogram_size;
set session rand_seed1=42; set session rand_seed1=42;
set session rand_seed2=62; set session rand_seed2=62;
set use_stat_tables=PREFERABLY; set use_stat_tables=PREFERABLY;
...@@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
DECODE_HISTOGRAM(hist_type, histogram) DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats; from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
set analyze_sample_percentage=0.1; set analyze_sample_percentage=0.1;
# #
# This query will show an innacurate avg_frequency value. # This query will show an innacurate avg_frequency value.
...@@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
DECODE_HISTOGRAM(hist_type, histogram) DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats; from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098 t1 id 111 17026 0.0000 4.0000 10.4739 0.13649,0.14922,0.16921,0.21141,0.18355,0.15012
# #
# This query will show a better avg_frequency value. # This query will show a better avg_frequency value.
# #
...@@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
DECODE_HISTOGRAM(hist_type, histogram) DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats; from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090 t1 id 1 17384 0.0000 4.0000 14.0401 0.15566,0.15590,0.15729,0.21538,0.15790,0.15787
set analyze_sample_percentage=0; set analyze_sample_percentage=0;
# #
# Test self adjusting sampling level. # Test self adjusting sampling level.
...@@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
DECODE_HISTOGRAM(hist_type, histogram) DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats; from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086 t1 id 1 17384 0.0000 4.0000 13.9812 0.15860,0.15767,0.21515,0.15573,0.15630,0.15654
# #
# Test record estimation is working properly. # Test record estimation is working properly.
# #
...@@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
DECODE_HISTOGRAM(hist_type, histogram) DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats; from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
explain select * from t1; explain select * from t1;
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
set use_stat_tables=@save_use_stat_tables;
drop table t1; drop table t1;
set analyze_sample_percentage=@save_analyze_sample_percentage;
set histogram_size=@save_hist_size;
set use_stat_tables=@save_use_stat_tables;
...@@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type; ...@@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
--echo # --echo #
set @save_use_stat_tables=@@use_stat_tables; set @save_use_stat_tables=@@use_stat_tables;
set @save_analyze_sample_percentage=@@analyze_sample_percentage; set @save_analyze_sample_percentage=@@analyze_sample_percentage;
set @save_hist_size=@@histogram_size;
set session rand_seed1=42; set session rand_seed1=42;
set session rand_seed2=62; set session rand_seed2=62;
...@@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a ...@@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
from mysql.column_stats; from mysql.column_stats;
explain select * from t1; explain select * from t1;
set use_stat_tables=@save_use_stat_tables;
drop table t1; drop table t1;
set analyze_sample_percentage=@save_analyze_sample_percentage;
set histogram_size=@save_hist_size;
set use_stat_tables=@save_use_stat_tables;
...@@ -325,7 +325,7 @@ class Column_statistics_collected :public Column_statistics ...@@ -325,7 +325,7 @@ class Column_statistics_collected :public Column_statistics
inline void init(THD *thd, Field * table_field); inline void init(THD *thd, Field * table_field);
inline bool add(); inline bool add();
inline void finish(ha_rows rows); inline void finish(ha_rows rows, double sample_fraction);
inline void cleanup(); inline void cleanup();
}; };
...@@ -1540,6 +1540,8 @@ class Histogram_builder ...@@ -1540,6 +1540,8 @@ class Histogram_builder
uint curr_bucket; /* number of the current bucket to be built */ uint curr_bucket; /* number of the current bucket to be built */
ulonglong count; /* number of values retrieved */ ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */ ulonglong count_distinct; /* number of distinct values retrieved */
/* number of distinct values that occured only once */
ulonglong count_distinct_single_occurence;
public: public:
Histogram_builder(Field *col, uint col_len, ha_rows rows) Histogram_builder(Field *col, uint col_len, ha_rows rows)
...@@ -1554,13 +1556,20 @@ class Histogram_builder ...@@ -1554,13 +1556,20 @@ class Histogram_builder
curr_bucket= 0; curr_bucket= 0;
count= 0; count= 0;
count_distinct= 0; count_distinct= 0;
count_distinct_single_occurence= 0;
} }
ulonglong get_count_distinct() { return count_distinct; } ulonglong get_count_distinct() const { return count_distinct; }
ulonglong get_count_single_occurence() const
{
return count_distinct_single_occurence;
}
int next(void *elem, element_count elem_cnt) int next(void *elem, element_count elem_cnt)
{ {
count_distinct++; count_distinct++;
if (elem_cnt == 1)
count_distinct_single_occurence++;
count+= elem_cnt; count+= elem_cnt;
if (curr_bucket == hist_width) if (curr_bucket == hist_width)
return 0; return 0;
...@@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg) ...@@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
return hist_builder->next(elem, elem_cnt); return hist_builder->next(elem, elem_cnt);
} }
C_MODE_END
static int count_distinct_single_occurence_walk(void *elem,
element_count count, void *arg)
{
((ulonglong*)arg)[0]+= 1;
if (count == 1)
((ulonglong*)arg)[1]+= 1;
return 0;
}
C_MODE_END
/* /*
The class Count_distinct_field is a helper class used to calculate The class Count_distinct_field is a helper class used to calculate
the number of distinct values for a column. The class employs the the number of distinct values for a column. The class employs the
...@@ -1611,6 +1629,9 @@ class Count_distinct_field: public Sql_alloc ...@@ -1611,6 +1629,9 @@ class Count_distinct_field: public Sql_alloc
Unique *tree; /* The helper object to contain distinct values */ Unique *tree; /* The helper object to contain distinct values */
uint tree_key_length; /* The length of the keys for the elements of 'tree */ uint tree_key_length; /* The length of the keys for the elements of 'tree */
ulonglong distincts;
ulonglong distincts_single_occurence;
public: public:
Count_distinct_field() {} Count_distinct_field() {}
...@@ -1667,25 +1688,35 @@ class Count_distinct_field: public Sql_alloc ...@@ -1667,25 +1688,35 @@ class Count_distinct_field: public Sql_alloc
@brief @brief
Calculate the number of elements accumulated in the container of 'tree' Calculate the number of elements accumulated in the container of 'tree'
*/ */
ulonglong get_value() void walk_tree()
{ {
ulonglong count; ulonglong counts[2] = {0, 0};
if (tree->elements == 0) tree->walk(table_field->table,
return (ulonglong) tree->elements_in_tree(); count_distinct_single_occurence_walk, counts);
count= 0; distincts= counts[0];
tree->walk(table_field->table, count_distinct_walk, (void*) &count); distincts_single_occurence= counts[1];
return count;
} }
/* /*
@brief @brief
Build the histogram for the elements accumulated in the container of 'tree' Calculate a histogram of the tree
*/ */
ulonglong get_value_with_histogram(ha_rows rows) void walk_tree_with_histogram(ha_rows rows)
{ {
Histogram_builder hist_builder(table_field, tree_key_length, rows); Histogram_builder hist_builder(table_field, tree_key_length, rows);
tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder); tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
return hist_builder.get_count_distinct(); distincts= hist_builder.get_count_distinct();
distincts_single_occurence= hist_builder.get_count_single_occurence();
}
ulonglong get_count_distinct()
{
return distincts;
}
ulonglong get_count_distinct_single_occurence()
{
return distincts_single_occurence;
} }
/* /*
...@@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add() ...@@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
*/ */
inline inline
void Column_statistics_collected::finish(ha_rows rows) void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
{ {
double val; double val;
...@@ -2532,15 +2563,43 @@ void Column_statistics_collected::finish(ha_rows rows) ...@@ -2532,15 +2563,43 @@ void Column_statistics_collected::finish(ha_rows rows)
} }
if (count_distinct) if (count_distinct)
{ {
ulonglong distincts;
uint hist_size= count_distinct->get_hist_size(); uint hist_size= count_distinct->get_hist_size();
/* Compute cardinality statistics and optionally histogram. */
if (hist_size == 0) if (hist_size == 0)
distincts= count_distinct->get_value(); count_distinct->walk_tree();
else else
distincts= count_distinct->get_value_with_histogram(rows - nulls); count_distinct->walk_tree_with_histogram(rows - nulls);
ulonglong distincts= count_distinct->get_count_distinct();
ulonglong distincts_single_occurence=
count_distinct->get_count_distinct_single_occurence();
if (distincts) if (distincts)
{ {
/*
We use the unsmoothed first-order jackknife estimator" to estimate
the number of distinct values.
With a sufficient large percentage of rows sampled (80%), we revert back
to computing the avg_frequency off of the raw data.
*/
if (sample_fraction > 0.8)
val= (double) (rows - nulls) / distincts; val= (double) (rows - nulls) / distincts;
else
{
if (nulls == 1)
distincts_single_occurence+= 1;
if (nulls)
distincts+= 1;
double fraction_single_occurence=
static_cast<double>(distincts_single_occurence) / rows;
double total_number_of_rows= rows / sample_fraction;
double estimate_total_distincts= total_number_of_rows /
(distincts /
(1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
}
set_avg_frequency(val); set_avg_frequency(val);
set_not_null(COLUMN_STAT_AVG_FREQUENCY); set_not_null(COLUMN_STAT_AVG_FREQUENCY);
} }
...@@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table) ...@@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
continue; continue;
bitmap_set_bit(table->write_set, table_field->field_index); bitmap_set_bit(table->write_set, table_field->field_index);
if (!rc) if (!rc)
table_field->collected_stats->finish(rows); table_field->collected_stats->finish(rows, sample_fraction);
else else
table_field->collected_stats->cleanup(); table_field->collected_stats->cleanup();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment