Commit 5515bcba authored by unknown's avatar unknown

MWL#68 Subquery optimization: Efficient NOT IN execution with NULLs

This patch implements correct NULL semantics for materialized subquery execution.
The implementation has the following properties and main limitations:
- It passes all query result tests, but fails a number of EXPLAIN tests because of
  changed plans.
- The EXPLAIN output for partial matching is not decided yet.
- It works only when all necessary indexes fit into main memory. Notice that these
  are not the general B-tree/Hash indexes, but instead much more compact ones,
  therefore this limitation may not be a problem in many practical cases.
- It doesn't contain specialized tests.
- In several places the implementation uses methods that are modified copies of
  other similar methods. These cases need to be refactored to avoid code duplication.
- TODO: add a test if the predicate is top-level just before deciding on partial
  matching. If it is top-level, use a more efficient exec method (index lookup).
- TODO: add sorting of indexes according to their selectivity. The code is almost there.
- TODO: add more comments, and sync the existing ones with the implementation.

sql/item_cmpfunc.h:
  Expose the Arg_comparator of a comparison predicate. This makes it possible to
  directly get the comparison result {-1,0,1}, which is not possible through the
  val_XXX() methods which "fold" such results into a boolean.
sql/item_subselect.cc:
  The core of the implementation of MWL#68.
sql/item_subselect.h:
  The core of the implementation of MWL#68.
sql/opt_subselect.cc:
  Removed the limitation for materialized subquery execution that it is applicable only
  for top-level predicates.
sql/sql_class.cc:
  New class select_materialize_with_stats that collects data statistics about
  the data being inserted into the target table.
sql/sql_class.h:
  New class select_materialize_with_stats that collects data statistics about
  the data being inserted into the target table.
sql/sql_select.cc:
  - more complete initialization of the TABLE object of a temp table.
  - call setup_subquery_materialization at one more exit point.
parent d63959ee
......@@ -350,6 +350,7 @@ class Item_bool_func2 :public Item_int_func
/* Return the collation stored in the comparator of this predicate. */
CHARSET_INFO *compare_collation()
{
  return cmp.cmp_collation.collation;
}
/* The decimal precision reported by this function is always 1. */
uint decimal_precision() const
{
  return 1;
}
/* Flag this predicate as a top-level item by turning abort_on_null on. */
void top_level_item()
{
  abort_on_null= TRUE;
}
/*
  Expose the Arg_comparator of this predicate, so that a caller can get
  the comparison result {-1,0,1} directly instead of the boolean into
  which the val_XXX() methods "fold" it.
*/
Arg_comparator *get_comparator()
{
  return &cmp;
}
friend class Arg_comparator;
};
......
This diff is collapsed.
This diff is collapsed.
......@@ -187,11 +187,7 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
does not call setup_subquery_materialization(). We could make
SELECT ... FROM DUAL call that function but that doesn't seem
to be the case that is worth handling.
4. Subquery predicate is a top-level predicate
(this implies it is not negated)
TODO: this is a limitation that should be lifted once we
implement correct NULL semantics (WL#3830)
5. Subquery is non-correlated
4. Subquery is non-correlated
TODO:
This is an overly restrictive condition. It can be extended to:
(Subquery is non-correlated ||
......@@ -199,7 +195,7 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
(Subquery is correlated to the immediate outer query &&
Subquery !contains {GROUP BY, ORDER BY [LIMIT],
aggregate functions}) && subquery predicate is not under "NOT IN"))
6. No execution method was already chosen (by a prepared statement).
5. No execution method was already chosen (by a prepared statement).
(*) The subquery must be part of a SELECT statement. The current
condition also excludes multi-table update statements.
......@@ -218,9 +214,8 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
subquery_types_allow_materialization(in_subs))
{
// psergey-todo: duplicated_subselect_card_check: where it's done?
if (in_subs->is_top_level_item() && // 4
!in_subs->is_correlated && // 5
in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
if (!in_subs->is_correlated && // 4
in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 5
in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
}
......
......@@ -42,6 +42,7 @@
#include "sp_rcontext.h"
#include "sp_cache.h"
#include "sql_select.h" /* declares create_tmp_table() */
/*
The following is used to initialise Table_ident with an internal
......@@ -2877,6 +2878,71 @@ bool select_dumpvar::send_eof()
return 0;
}
/**
  Create the temporary result table together with the array that holds
  the per-column statistics.

  @param thd_arg             thread handle, forwarded to create_tmp_table()
  @param column_types        the columns of the table; their count becomes
                             tmp_table_param.field_count
  @param is_union_distinct   forwarded to create_tmp_table()
  @param options             forwarded to create_tmp_table()
  @param table_alias         forwarded to create_tmp_table()
  @param bit_fields_as_long  stored in tmp_table_param.bit_fields_as_long

  @return TRUE  if the temp table or the statistics array could not be
                created
  @return FALSE on success
*/
bool
select_materialize_with_stats::
create_result_table(THD *thd_arg, List<Item> *column_types,
                    bool is_union_distinct, ulonglong options,
                    const char *table_alias, bool bit_fields_as_long)
{
  DBUG_ASSERT(table == 0);
  tmp_table_param.field_count= column_types->elements;
  tmp_table_param.bit_fields_as_long= bit_fields_as_long;

  if (! (table= create_tmp_table(thd_arg, &tmp_table_param, *column_types,
                                 (ORDER*) 0, is_union_distinct, 1,
                                 options, HA_POS_ERROR, (char*) table_alias)))
    return TRUE;

  col_stat= (Column_statistics*) table->in_use->alloc(table->s->fields *
                                                      sizeof(Column_statistics));
  /*
    Check the pointer that was just assigned: a failed allocation has to
    be reported here, before cleanup() zero-fills the array.
  */
  if (!col_stat)
    return TRUE;

  /* Start with all statistics reset. */
  cleanup();

  table->file->extra(HA_EXTRA_WRITE_CACHE);
  table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
  return FALSE;
}
/**
  Record the NULL statistics of one row and then hand the row over to
  select_union::send_data().

  The row counter is incremented first, so row numbers start at 1. The
  i-th item of the row is accounted to the i-th entry of col_stat: for
  every NULL item the NULL count of that column is incremented and the
  first/last row number with a NULL in that column is updated. Finally
  the maximum number of NULLs seen in a single row is updated.

  @param items  the values of the current row

  @return the result of select_union::send_data()
*/
bool select_materialize_with_stats::send_data(List<Item> &items)
{
  List_iterator_fast<Item> row_it(items);
  Column_statistics *stat_of_col= col_stat;
  uint null_cnt_of_row= 0;

  ++count_rows;

  for (Item *item= row_it++; item; item= row_it++, ++stat_of_col)
  {
    if (!item->is_null())
      continue;

    ++null_cnt_of_row;
    ++stat_of_col->null_count;
    /* Row numbers start at 1, thus 0 means "no NULL seen so far". */
    if (!stat_of_col->min_null_row)
      stat_of_col->min_null_row= count_rows;
    stat_of_col->max_null_row= count_rows;
  }

  if (max_nulls_in_row < null_cnt_of_row)
    max_nulls_in_row= null_cnt_of_row;

  return select_union::send_data(items);
}
/****************************************************************************
TMP_TABLE_PARAM
****************************************************************************/
......
......@@ -2740,17 +2740,18 @@ class TMP_TABLE_PARAM :public Sql_alloc
/*
  Result interceptor that holds a TABLE pointer and the TMP_TABLE_PARAM
  that goes with it.

  NOTE(review): this hunk is shown without +/- markers. The two adjacent
  constructor lines and the two adjacent lines that start the
  create_result_table declaration look like the pre-patch and post-patch
  versions of the same declaration - confirm against the real header.
*/
class select_union :public select_result_interceptor
{
protected:
/*
  Parameters of the temp table. Protected, so that a derived class can
  fill them in from its own create_result_table().
*/
TMP_TABLE_PARAM tmp_table_param;
public:
/* The result table; 0 until a table has been created. */
TABLE *table;
select_union() :table(0) {}
/* This variant of the constructor also calls tmp_table_param.init(). */
select_union() :table(0) { tmp_table_param.init(); }
int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
bool send_data(List<Item> &items);
bool send_eof();
bool flush();
bool create_result_table(THD *thd, List<Item> *column_types,
/* Virtual, so that a derived class can override the table creation. */
virtual bool create_result_table(THD *thd, List<Item> *column_types,
bool is_distinct, ulonglong options,
const char *alias, bool bit_fields_as_long);
};
......@@ -2776,6 +2777,74 @@ class select_singlerow_subselect :public select_subselect
bool send_data(List<Item> &items);
};
/*
This class specializes select_union to collect statistics about the
data stored in the temp table. Currently the class collects statistcs
about NULLs.
*/
class select_materialize_with_stats : public select_union
{
protected:
class Column_statistics
{
public:
/* Count of NULLs per column. */
ha_rows null_count;
/* The row number that contains the first NULL in a column. */
ha_rows min_null_row;
/* The row number that contains the last NULL in a column. */
ha_rows max_null_row;
};
/* Array of statistics data per column. */
Column_statistics* col_stat;
/*
The number of columns in the biggest sub-row that consists of only
NULL values.
*/
ha_rows max_nulls_in_row;
/*
Count of rows writtent to the temp table. This is redundant as it is
already stored in handler::stats.records, however that one is relatively
expensive to compute (given we need that for evry row).
*/
ha_rows count_rows;
public:
select_materialize_with_stats() {}
virtual bool create_result_table(THD *thd, List<Item> *column_types,
bool is_distinct, ulonglong options,
const char *alias, bool bit_fields_as_long);
bool init_result_table(ulonglong select_options);
bool send_data(List<Item> &items);
void cleanup()
{
memset(col_stat, 0, table->s->fields * sizeof(Column_statistics));
max_nulls_in_row= 0;
count_rows= 0;
}
ha_rows get_null_count_of_col(uint idx)
{
DBUG_ASSERT(idx < table->s->fields);
return col_stat[idx].null_count;
}
ha_rows get_max_null_of_col(uint idx)
{
DBUG_ASSERT(idx < table->s->fields);
return col_stat[idx].max_null_row;
}
ha_rows get_min_null_of_col(uint idx)
{
DBUG_ASSERT(idx < table->s->fields);
return col_stat[idx].min_null_row;
}
ha_rows get_max_nulls_in_row() { return max_nulls_in_row; }
};
/* used in independent ALL/ANY optimisation */
class select_max_min_finder_subselect :public select_subselect
{
......
......@@ -874,6 +874,9 @@ JOIN::optimize()
{
DBUG_PRINT("info",("No tables"));
error= 0;
/* Create all structures needed for materialized subquery execution. */
if (setup_subquery_materialization())
DBUG_RETURN(1);
DBUG_RETURN(0);
}
error= -1; // Error is sent to client
......@@ -11233,7 +11236,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
param->group_buff=group_buff;
share->keys=1;
share->uniques= test(using_unique_constraint);
table->key_info=keyinfo;
table->key_info= table->s->key_info= keyinfo;
keyinfo->key_part=key_part_info;
keyinfo->flags=HA_NOSAME;
keyinfo->usable_key_parts=keyinfo->key_parts= param->group_parts;
......@@ -11319,7 +11322,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
keyinfo->key_parts * sizeof(KEY_PART_INFO))))
goto err;
bzero((void*) key_part_info, keyinfo->key_parts * sizeof(KEY_PART_INFO));
table->key_info=keyinfo;
table->key_info= table->s->key_info= keyinfo;
keyinfo->key_part=key_part_info;
keyinfo->flags=HA_NOSAME | HA_NULL_ARE_EQUAL;
keyinfo->key_length= 0; // Will compute the sum of the parts below.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment