Commit 2ad68ca3 authored by Sergey Petrunya's avatar Sergey Petrunya

BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN...

BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
- Fix equality propagation to work with SJM nests and OR clauses (full descirption of problem and
  solution in the comment in the patch)
(The second commit with post-review fixes)
parent 85af7417
......@@ -940,5 +940,108 @@ WHERE a = d AND ( pk < 2 OR d = 'z' )
);
a b c
DROP TABLE t1, t2;
#
# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
#
CREATE TABLE t1 (
a VARCHAR(1),
b VARCHAR(1) NOT NULL,
KEY(a)
) ENGINE=InnoDB;
INSERT INTO t1 VALUES
('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
('q','q'),('w','w'),('d','d'),('e','e');
CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
# This query returned 6 rows instead of 19
SELECT * FROM v1
WHERE ( a, a ) IN (
SELECT alias2.b, alias2.a
FROM t1 AS alias1, t1 AS alias2
WHERE alias2.b = alias1.a
AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
);
a b
j j
v v
c c
m m
m m
d d
d d
d d
y y
t t
s s
r r
b b
x x
g g
p p
q q
w w
e e
# Another testcase, without the VIEW:
CREATE TABLE t2 (a VARCHAR(1), b VARCHAR(1) NOT NULL, KEY(a)) ENGINE=InnoDB;
INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t2 SELECT * FROM t1;
EXPLAIN
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY <subquery2> ALL distinct_key NULL NULL NULL 19
1 PRIMARY t2 ref a a 4 test.alias1.a 1
2 MATERIALIZED alias1 ALL a NULL NULL NULL 19 Using where
2 MATERIALIZED alias2 ref a a 4 test.alias1.a 1 Using where
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
a b
j j
j j
v v
v v
c c
c c
m m
m m
m m
m m
d d
d d
d d
d d
d d
d d
y y
y y
t t
t t
s s
s s
r r
r r
b b
b b
x x
x x
g g
g g
p p
p p
q q
q q
w w
w w
e e
e e
DROP VIEW v1;
DROP TABLE t1, t2;
# This must be the last in the file:
set optimizer_switch=@subselect_sj2_tmp;
......@@ -954,6 +954,109 @@ WHERE a = d AND ( pk < 2 OR d = 'z' )
);
a b c
DROP TABLE t1, t2;
#
# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
#
CREATE TABLE t1 (
a VARCHAR(1),
b VARCHAR(1) NOT NULL,
KEY(a)
) ENGINE=InnoDB;
INSERT INTO t1 VALUES
('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
('q','q'),('w','w'),('d','d'),('e','e');
CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
# This query returned 6 rows instead of 19
SELECT * FROM v1
WHERE ( a, a ) IN (
SELECT alias2.b, alias2.a
FROM t1 AS alias1, t1 AS alias2
WHERE alias2.b = alias1.a
AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
);
a b
j j
v v
c c
m m
m m
d d
d d
d d
y y
t t
s s
r r
b b
x x
g g
p p
q q
w w
e e
# Another testcase, without the VIEW:
CREATE TABLE t2 (a VARCHAR(1), b VARCHAR(1) NOT NULL, KEY(a)) ENGINE=InnoDB;
INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t2 SELECT * FROM t1;
EXPLAIN
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY <subquery2> ALL distinct_key NULL NULL NULL 19
1 PRIMARY t2 ref a a 4 test.alias1.a 1 Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
2 MATERIALIZED alias1 ALL a NULL NULL NULL 19 Using where
2 MATERIALIZED alias2 ref a a 4 test.alias1.a 1 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
a b
j j
v v
c c
m m
d d
t t
d d
s s
r r
m m
b b
g g
p p
q q
w w
d d
e e
j j
v v
c c
m m
d d
t t
d d
s s
r r
m m
b b
g g
p p
q q
d d
e e
y y
x x
y y
x x
w w
DROP VIEW v1;
DROP TABLE t1, t2;
# This must be the last in the file:
set optimizer_switch=@subselect_sj2_tmp;
#
......
......@@ -942,6 +942,109 @@ WHERE a = d AND ( pk < 2 OR d = 'z' )
);
a b c
DROP TABLE t1, t2;
#
# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
#
CREATE TABLE t1 (
a VARCHAR(1),
b VARCHAR(1) NOT NULL,
KEY(a)
) ENGINE=InnoDB;
INSERT INTO t1 VALUES
('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
('q','q'),('w','w'),('d','d'),('e','e');
CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
# This query returned 6 rows instead of 19
SELECT * FROM v1
WHERE ( a, a ) IN (
SELECT alias2.b, alias2.a
FROM t1 AS alias1, t1 AS alias2
WHERE alias2.b = alias1.a
AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
);
a b
j j
v v
c c
m m
m m
d d
d d
d d
y y
t t
s s
r r
b b
x x
g g
p p
q q
w w
e e
# Another testcase, without the VIEW:
CREATE TABLE t2 (a VARCHAR(1), b VARCHAR(1) NOT NULL, KEY(a)) ENGINE=InnoDB;
INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t2 SELECT * FROM t1;
EXPLAIN
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY <subquery2> ALL distinct_key NULL NULL NULL 19
1 PRIMARY t2 ref a a 4 test.alias1.a 1
2 MATERIALIZED alias1 ALL a NULL NULL NULL 19 Using where
2 MATERIALIZED alias2 ref a a 4 test.alias1.a 1 Using where
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
a b
j j
j j
v v
v v
c c
c c
m m
m m
m m
m m
d d
d d
d d
d d
d d
d d
y y
y y
t t
t t
s s
s s
r r
r r
b b
b b
x x
x x
g g
g g
p p
p p
q q
q q
w w
w w
e e
e e
DROP VIEW v1;
DROP TABLE t1, t2;
# This must be the last in the file:
set optimizer_switch=@subselect_sj2_tmp;
set optimizer_switch=default;
......
......@@ -1128,5 +1128,49 @@ SELECT * FROM t1 WHERE b IN (
DROP TABLE t1, t2;
--echo #
--echo # BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
--echo #
CREATE TABLE t1 (
a VARCHAR(1),
b VARCHAR(1) NOT NULL,
KEY(a)
) ENGINE=InnoDB;
INSERT INTO t1 VALUES
('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
('q','q'),('w','w'),('d','d'),('e','e');
CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
--echo # This query returned 6 rows instead of 19
SELECT * FROM v1
WHERE ( a, a ) IN (
SELECT alias2.b, alias2.a
FROM t1 AS alias1, t1 AS alias2
WHERE alias2.b = alias1.a
AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
);
--echo # Another testcase, without the VIEW:
CREATE TABLE t2 (a VARCHAR(1), b VARCHAR(1) NOT NULL, KEY(a)) ENGINE=InnoDB;
INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t2 SELECT * FROM t1;
EXPLAIN
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
SELECT * FROM t2
WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2
WHERE
alias2.b = alias1.a AND
(alias1.b >= alias1.a OR alias2.b = 'z'));
DROP VIEW v1;
DROP TABLE t1, t2;
--echo # This must be the last in the file:
set optimizer_switch=@subselect_sj2_tmp;
......@@ -5388,7 +5388,7 @@ Item *Item_bool_rowready_func2::negated_item()
*/
Item_equal::Item_equal(Item *f1, Item *f2, bool with_const_item)
: Item_bool_func(), eval_item(0), cond_false(0)
: Item_bool_func(), eval_item(0), cond_false(0), context_field(NULL)
{
const_item_cache= 0;
with_const= with_const_item;
......@@ -5411,7 +5411,7 @@ Item_equal::Item_equal(Item *f1, Item *f2, bool with_const_item)
*/
Item_equal::Item_equal(Item_equal *item_equal)
: Item_bool_func(), eval_item(0), cond_false(0)
: Item_bool_func(), eval_item(0), cond_false(0), context_field(NULL)
{
const_item_cache= 0;
List_iterator_fast<Item> li(item_equal->equal_items);
......
......@@ -1700,9 +1700,16 @@ class Item_equal: public Item_bool_func
as datetimes. The comparator is used only if compare_as_dates=TRUE
*/
Arg_comparator cmp;
/*
For Item_equal objects inside an OR clause: one of the fields that were
used in the original equality.
*/
Item_field *context_field;
public:
inline Item_equal()
: Item_bool_func(), with_const(FALSE), eval_item(0), cond_false(0)
: Item_bool_func(), with_const(FALSE), eval_item(0), cond_false(0),
context_field(NULL)
{ const_item_cache=0 ;}
Item_equal(Item *f1, Item *f2, bool with_const_item);
Item_equal(Item_equal *item_equal);
......@@ -1729,6 +1736,8 @@ class Item_equal: public Item_bool_func
Item *transform(Item_transformer transformer, uchar *arg);
virtual void print(String *str, enum_query_type query_type);
CHARSET_INFO *compare_collation();
void set_context_field(Item_field *ctx_field) { context_field= ctx_field; }
friend class Item_equal_fields_iterator;
friend Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
Item_equal *item_equal);
......
......@@ -171,6 +171,252 @@
*/
/*
EqualityPropagationAndSjmNests
******************************
Equalities are used for:
P1. Equality propagation
P2. Equality substitution [for a certain join order]
The equality propagation is not affected by SJM nests. In fact, it is done
before we determine the execution plan, i.e. before we even know we will use
SJM-nests for execution.
The equality substitution is affected.
Substitution without SJMs
=========================
When one doesn't have SJM nests, tables have a strict join order:
--------------------------------->
t1 -- t2 -- t3 -- t4 --- t5
? ^
\
--(part-of-WHERE)
parts WHERE/ON and ref. expressions are attached at some point along the axis.
Expression is allowed to refer to a table column if the table is to the left of
the attachment point. For any given expression, we have a goal:
"Move leftmost allowed attachment point as much as possible to the left"
Substitution with SJMs - task setting
=====================================
When SJM nests are present, there is no global strict table ordering anymore:
--------------------------------->
ot1 -- ot2 --- sjm -- ot4 --- ot5
|
| Main execution
- - - - - - - - - - - - - - - - - - - - - - - -
| Materialization
it1 -- it2 --/
Besides that, we must take into account that
- values for outer table columns, otN.col, are inaccessible at
materialization step (SJM-RULE)
- values for inner table columns, itN.col, are inaccessible at Main execution
step, except for SJ-Materialization-Scan and columns that are in the
subquery's select list. (SJM-RULE)
Substitution with SJMs - solution
=================================
First, we introduce global strict table ordering like this:
ot1 - ot2 --\ /--- ot3 -- ot5
\--- it1 --- it2 --/
Now, let's see how to meet (SJM-RULE).
SJ-Materialization is only applicable for uncorrelated subqueries. From this, it
follows that any multiple equality will either
1. include only columns of outer tables, or
2. include only columns of inner tables, or
3. include columns of inner and outer tables, joined together through one
of IN-equalities.
Cases #1 and #2 can be handled in the same way as with regular inner joins.
Case #3 requires special handling, so that we don't construct violations of
(SJM-RULE). Let's consider possible ways to build violations.
Equality propagation starts with the clause in this form
top_query_where AND subquery_where AND in_equalities
First, it builds multi-equalities. It can also build a mixed multi-equality
multiple-equal(ot1.col, ot2.col, ... it1.col, itN.col)
Multi-equalities are pushed down the OR-clauses in top_query_where and in
subquery_where, so it's possible that clauses like this one are built:
subquery_cond OR (multiple-equal(it1.col, ot1.col,...) AND ...)
^^^^^^^^^^^^^ \
| this must be evaluated
\- can only be evaluated at the main phase.
at the materialization phase
Finally, equality substitution is started. It does two operations:
1. Field reference substitution
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(In the code, this is Item_field::replace_equal_field)
This is a process of replacing each reference to "tblX.col"
with the first element of the multi-equality. (REF-SUBST-ORIG)
This behaviour can cause problems with Semi-join nests. Suppose, we have a
condition:
func(it1.col, it2.col)
and a multi-equality(ot1.col, it1.col). Then, reference to "it1.col" will be
replaced with "ot1.col", constructing a condition
func(ot1.col, it2.col)
which will be a violation of (SJM-RULE).
In order to avoid this, (REF-SUBST-ORIG) is amended as follows:
- references to tables "itX.col" that are inner wrt some SJM nest, are
replaced with references to the first inner table from the same SJM nest.
- references to top-level tables "otX.col" are replaced with references to
the first element of the multi-equality, no matter if that first element is
a column of a top-level table or of table from some SJM nest.
(REF-SUBST-SJM)
The case where the first element is a table from an SJM nest $SJM is ok,
because it can be proven that $SJM uses SJ-Materialization-Scan, and
"unpacks" correct column values to the first element during the main
execution phase.
2. Item_equal elimination
~~~~~~~~~~~~~~~~~~~~~~~~~
(In the code: eliminate_item_equal) This is a process of taking
multiple-equal(a,b,c,d,e)
and replacing it with an equivalent expression which is an AND of pair-wise
equalities:
a=b AND a=c AND ...
The equalities are picked such that for any given join prefix (t1,t2...) the
subset of equalities that can be evaluated gives the most restrictive
filtering.
Without SJM nests, it is sufficient to compare every multi-equality member
with the first one:
elem1=elem2 AND elem1=elem3 AND elem1=elem4 ...
When SJM nests are present, we should take care not to construct equalities
that violate the (SJM-RULE). This is achieved by generating separate sets of
equalites for top-level tables and for inner tables. That is, for the join
order
ot1 - ot2 --\ /--- ot3 -- ot5
\--- it1 --- it2 --/
we will generate
ot1.col=ot2.col
ot1.col=ot3.col
ot1.col=ot5.col
it2.col=it1.col
2.1 The problem with Item_equals and ORs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
As has been mentioned above, multiple equalities are pushed down into OR
clauses, possibly building clauses like this:
func(it.col2) OR multiple-equal(it1.col1, it1.col2, ot1.col) (1)
where the first part of the clause has references to inner tables, while the
second has references to the top-level tables, which is a violation of
(SJM-RULE).
AND-clauses of this kind do not create problems, because make_cond_for_table()
will take them apart. OR-clauses will not be split. It is possible to
split-out the part that's dependent on the inner table:
func(it.col2) OR it1.col1=it1.col2
but this is a less-restrictive condition than condition (1). Current execution
scheme will still try to generate the "remainder" condition:
func(it.col2) OR it1.col1=ot1.col
which is a violation of (SJM-RULE).
QQ: "ot1.col=it1.col" is checked at the upper level. Why was it not removed
here?
AA: because has a proper subset of conditions that are found on this level.
consider a join order of ot, sjm(it)
and a condition
ot.col=it.col AND ( ot.col=it.col='foo' OR it.col2='bar')
we will produce:
table ot: nothing
table it: ot.col=it.col AND (ot.col='foo' OR it.col2='bar')
^^^^ ^^^^^^^^^^^^^^^^
| \ the problem is that
| this part condition didnt
| receive a substitution
|
+--- it was correct to subst, 'ot' is
the left-most.
Does it make sense to push "inner=outer" down into ORs?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Yes. Consider the query:
select * from ot
where ot.col in (select it.col from it where (it.col='foo' OR it.col='bar'))
here, it may be useful to infer that
(ot.col='foo' OR ot.col='bar') (CASE-FOR-SUBST)
and attach that condition to the table 'ot'.
Possible solutions for Item_equals and ORs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Solution #1
~~~~~~~~~~~
Let make_cond_for_table() chop analyze the OR clauses it has produced and
discard them if they violate (SJM-RULE). This solution would allow to handle
cases like (CASE-FOR-SUBST) at the expense of making semantics of
make_cond_for_table() complicated.
Solution #2
~~~~~~~~~~~
Before the equality propagation phase, none of the OR clauses violate the
(SJM-RULE). This way, if we remember which tables the original equality
referred to, we can only generate equalities that refer to the outer (or inner)
tables. Note that this will disallow handling of cases like (CASE-FOR-SUBST).
Currently, solution #2 is implemented.
*/
static
bool subquery_types_allow_materialization(Item_in_subselect *in_subs);
......
......@@ -10675,6 +10675,9 @@ Item_equal *find_item_equal(COND_EQUAL *cond_equal, Field *field,
acceptable, as this happens rarely. The implementation without
copying would be much more complicated.
For description of how equality propagation works with SJM nests, grep
for EqualityPropagationAndSjmNests.
@param left_item left term of the quality to be checked
@param right_item right term of the equality to be checked
@param item equality item if the equality originates from a condition
......@@ -10748,12 +10751,14 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
{
/* left_item_equal of an upper level contains left_item */
left_item_equal= new Item_equal(left_item_equal);
left_item_equal->set_context_field(((Item_field*) left_item));
cond_equal->current_level.push_back(left_item_equal);
}
if (right_copyfl)
{
/* right_item_equal of an upper level contains right_item */
right_item_equal= new Item_equal(right_item_equal);
right_item_equal->set_context_field(((Item_field*) right_item));
cond_equal->current_level.push_back(right_item_equal);
}
......@@ -10839,6 +10844,7 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
{
item_equal= new Item_equal(item_equal);
cond_equal->current_level.push_back(item_equal);
item_equal->set_context_field(field_item);
}
if (item_equal)
{
......@@ -11489,6 +11495,8 @@ static TABLE_LIST* embedding_sjm(Item *item)
Item_equal::get_first() also takes similar measures for dealing with
equality substitution in presense of SJM nests.
Grep for EqualityPropagationAndSjmNests for a more verbose description.
@return
- The condition with generated simple equalities or
a pointer to the simple generated equality, if success.
......@@ -11553,8 +11561,12 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
*/
if (upper)
{
TABLE_LIST *native_sjm= embedding_sjm(item_equal->context_field);
if (item_const && upper->get_const())
{
/* Upper item also has "field_item=const". Don't produce equality here */
item= 0;
}
else
{
Item_equal_fields_iterator li(*item_equal);
......@@ -11565,6 +11577,8 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
break;
}
}
if (embedding_sjm(field_item) != native_sjm)
item= NULL; /* Don't produce equality */
}
bool produce_equality= test(item == field_item);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment