Commit f0773b78 authored by Vicențiu Ciorbaru's avatar Vicențiu Ciorbaru

Introduce analyze_sample_percentage variable

The variable controls the amount of sampling analyze table performs.

If ANALYZE table with histogram collection is too slow, one can reduce the
time taken by setting analyze_sample_percentage to a lower value of the
total number of rows.
Setting it to 0 will use a formula to compute how many rows to sample:

The number of rows collected is capped to a minimum of 50000 and
increases logarithmically with a coffecient of 4096. The coffecient is
chosen so that we expect an error of less than 3% in our estimations
according to the paper:
"Random Sampling for Histogram Construction: How much is enough?”
– Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998.

The drawback of sampling is that avg_frequency number is computed
imprecisely and will yeild a smaller number than the real one.
parent 47f15ea7
......@@ -15,6 +15,10 @@ The following specify which files/extra groups are read (specified before remain
--alter-algorithm[=name]
Specify the alter table algorithm. One of: DEFAULT, COPY,
INPLACE, NOCOPY, INSTANT
--analyze-sample-percentage=#
Percentage of rows from the table ANALYZE TABLE will
sample to collect table statistics. Set to 0 to let
MariaDB decide what percentage of rows to sample.
-a, --ansi Use ANSI SQL syntax instead of MySQL syntax. This mode
will also set transaction isolation level 'serializable'.
--auto-increment-increment[=#]
......@@ -1385,6 +1389,7 @@ The following specify which files/extra groups are read (specified before remain
Variables (--variable-name=value)
allow-suspicious-udfs FALSE
alter-algorithm DEFAULT
analyze-sample-percentage 100
auto-increment-increment 1
auto-increment-offset 1
autocommit TRUE
......
......@@ -1760,3 +1760,107 @@ DROP TABLE t1;
# End of 10.2 tests
#
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
#
# Start of 10.4 tests
#
#
# Test analyze_sample_percentage system variable.
#
set @save_use_stat_tables=@@use_stat_tables;
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
set session rand_seed1=42;
set session rand_seed2=62;
set use_stat_tables=PREFERABLY;
set histogram_size=10;
CREATE TABLE t1 (id int);
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
INSERT INTO t1 (id) SELECT id FROM t1;
INSERT INTO t1 SELECT id+1 FROM t1;
INSERT INTO t1 SELECT id+2 FROM t1;
INSERT INTO t1 SELECT id+4 FROM t1;
INSERT INTO t1 SELECT id+8 FROM t1;
INSERT INTO t1 SELECT id+16 FROM t1;
INSERT INTO t1 SELECT id+32 FROM t1;
INSERT INTO t1 SELECT id+64 FROM t1;
INSERT INTO t1 SELECT id+128 FROM t1;
INSERT INTO t1 SELECT id+256 FROM t1;
INSERT INTO t1 SELECT id+512 FROM t1;
INSERT INTO t1 SELECT id+1024 FROM t1;
INSERT INTO t1 SELECT id+2048 FROM t1;
INSERT INTO t1 SELECT id+4096 FROM t1;
INSERT INTO t1 SELECT id+9192 FROM t1;
#
# This query will should show a full table scan analysis.
#
ANALYZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
set analyze_sample_percentage=0.1;
#
# This query will show an innacurate avg_frequency value.
#
ANALYZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
#
# This query will show a better avg_frequency value.
#
set analyze_sample_percentage=25;
ANALYZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
set analyze_sample_percentage=0;
#
# Test self adjusting sampling level.
#
ANALYZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
#
# Test record estimation is working properly.
#
select count(*) from t1;
count(*)
229376
explain select * from t1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 229060
set analyze_sample_percentage=100;
ANALYZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status Table is already up to date
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
explain select * from t1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
set use_stat_tables=@save_use_stat_tables;
drop table t1;
......@@ -901,3 +901,91 @@ DROP TABLE t1;
--echo #
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
--echo #
--echo # Start of 10.4 tests
--echo #
--echo #
--echo # Test analyze_sample_percentage system variable.
--echo #
set @save_use_stat_tables=@@use_stat_tables;
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
set session rand_seed1=42;
set session rand_seed2=62;
set use_stat_tables=PREFERABLY;
set histogram_size=10;
CREATE TABLE t1 (id int);
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
INSERT INTO t1 (id) SELECT id FROM t1;
INSERT INTO t1 SELECT id+1 FROM t1;
INSERT INTO t1 SELECT id+2 FROM t1;
INSERT INTO t1 SELECT id+4 FROM t1;
INSERT INTO t1 SELECT id+8 FROM t1;
INSERT INTO t1 SELECT id+16 FROM t1;
INSERT INTO t1 SELECT id+32 FROM t1;
INSERT INTO t1 SELECT id+64 FROM t1;
INSERT INTO t1 SELECT id+128 FROM t1;
INSERT INTO t1 SELECT id+256 FROM t1;
INSERT INTO t1 SELECT id+512 FROM t1;
INSERT INTO t1 SELECT id+1024 FROM t1;
INSERT INTO t1 SELECT id+2048 FROM t1;
INSERT INTO t1 SELECT id+4096 FROM t1;
INSERT INTO t1 SELECT id+9192 FROM t1;
--echo #
--echo # This query will should show a full table scan analysis.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
set analyze_sample_percentage=0.1;
--echo #
--echo # This query will show an innacurate avg_frequency value.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
--echo #
--echo # This query will show a better avg_frequency value.
--echo #
set analyze_sample_percentage=25;
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
set analyze_sample_percentage=0;
--echo #
--echo # Test self adjusting sampling level.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
-- echo #
-- echo # Test record estimation is working properly.
-- echo #
select count(*) from t1;
explain select * from t1;
set analyze_sample_percentage=100;
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
explain select * from t1;
set use_stat_tables=@save_use_stat_tables;
drop table t1;
......@@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
SESSION_VALUE 100.000000
GLOBAL_VALUE 100.000000
GLOBAL_VALUE_ORIGIN COMPILE-TIME
DEFAULT_VALUE 100.000000
VARIABLE_SCOPE SESSION
VARIABLE_TYPE DOUBLE
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 100
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME AUTOCOMMIT
SESSION_VALUE ON
GLOBAL_VALUE ON
......
......@@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
SESSION_VALUE 100.000000
GLOBAL_VALUE 100.000000
GLOBAL_VALUE_ORIGIN COMPILE-TIME
DEFAULT_VALUE 100.000000
VARIABLE_SCOPE SESSION
VARIABLE_TYPE DOUBLE
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 100
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME AUTOCOMMIT
SESSION_VALUE ON
GLOBAL_VALUE ON
......
......@@ -622,6 +622,7 @@ typedef struct system_variables
ulong optimizer_selectivity_sampling_limit;
ulong optimizer_use_condition_selectivity;
ulong use_stat_tables;
double sample_percentage;
ulong histogram_size;
ulong histogram_type;
ulong preload_buff_size;
......
......@@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
Field *table_field;
ha_rows rows= 0;
handler *file=table->file;
double sample_fraction= thd->variables.sample_percentage / 100;
const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
DBUG_ENTER("collect_statistics_for_table");
table->collected_stats->cardinality_is_null= TRUE;
table->collected_stats->cardinality= 0;
if (thd->variables.sample_percentage == 0)
{
if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
{
sample_fraction= 1;
}
else
{
sample_fraction= std::fmin(
(MIN_THRESHOLD_FOR_SAMPLING + 4096 *
log(200 * file->records())) / file->records(), 1);
}
}
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
......@@ -2758,6 +2774,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (rc)
break;
if (thd_rnd(thd) <= sample_fraction)
{
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
......@@ -2770,6 +2788,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
break;
rows++;
}
}
file->ha_rnd_end();
}
rc= (rc == HA_ERR_END_OF_FILE && !thd->killed) ? 0 : 1;
......@@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (!rc)
{
table->collected_stats->cardinality_is_null= FALSE;
table->collected_stats->cardinality= rows;
table->collected_stats->cardinality=
static_cast<ha_rows>(rows / sample_fraction);
}
bitmap_clear_all(table->write_set);
......
......@@ -350,6 +350,15 @@ static Sys_var_long Sys_pfs_connect_attrs_size(
#endif /* WITH_PERFSCHEMA_STORAGE_ENGINE */
static Sys_var_double Sys_analyze_sample_percentage(
"analyze_sample_percentage",
"Percentage of rows from the table ANALYZE TABLE will sample "
"to collect table statistics. Set to 0 to let MariaDB decide "
"what percentage of rows to sample.",
SESSION_VAR(sample_percentage),
CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 100),
DEFAULT(100));
static Sys_var_ulong Sys_auto_increment_increment(
"auto_increment_increment",
"Auto-increment columns are incremented by this",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment