IMPALA-1550: Invalid rewrite when EXISTS subqueries contain aggregate

functions

This commit fixes an issue where a [NOT] EXISTS subquery that contains
an aggregate function will sometimes be incorrectly rewritten into a
join, thereby returning incorrect results.

Change-Id: I18b211d76ee3de77d8061603ff5bb1fbceae2e60
Reviewed-on: http://gerrit.cloudera.org:8080/266
Reviewed-by: Dimitris Tsirogiannis <dtsirogiannis@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
Dimitris Tsirogiannis
2015-03-18 17:05:59 -07:00
committed by Internal Jenkins
parent 672ae91732
commit 4eceeacf16
4 changed files with 157 additions and 14 deletions

View File

@@ -235,6 +235,12 @@ public class AggregateInfo extends AggregateInfoBase {
return outputToIntermediateTupleSmap_;
}
public boolean hasAggregateExprs() {
return !aggregateExprs_.isEmpty() ||
(secondPhaseDistinctAggInfo_ != null &&
!secondPhaseDistinctAggInfo_.getAggregateExprs().isEmpty());
}
/**
* Return the tuple id produced in the final aggregation step.
*/

View File

@@ -220,6 +220,18 @@ public class StmtRewriter {
"expression: " + conjunct.toSql());
}
if (conjunct instanceof ExistsPredicate) {
// Check if we can determine the result of an ExistsPredicate during analysis.
// If so, replace the predicate with a BoolLiteral predicate and remove it from
// the list of predicates to be rewritten.
BoolLiteral boolLiteral = replaceExistsPredicate((ExistsPredicate) conjunct);
if (boolLiteral != null) {
boolLiteral.analyze(analyzer);
smap.put(conjunct, boolLiteral);
continue;
}
}
// Replace all the supported exprs with subqueries with true BoolLiterals
// using an smap.
BoolLiteral boolLiteral = new BoolLiteral(true);
@@ -241,6 +253,25 @@ public class StmtRewriter {
if (hasNewVisibleTuple) replaceUnqualifiedStarItems(stmt, numTableRefs);
}
/**
* Replace an ExistsPredicate that contains a subquery with a BoolLiteral if we
* can determine its result without evaluating it. Return null if the result of the
* ExistsPredicate can only be determined at run-time.
*/
private static BoolLiteral replaceExistsPredicate(ExistsPredicate predicate) {
Subquery subquery = predicate.getSubquery();
Preconditions.checkNotNull(subquery);
SelectStmt subqueryStmt = (SelectStmt) subquery.getStatement();
BoolLiteral boolLiteral = null;
if (subqueryStmt.getAnalyzer().hasEmptyResultSet()) {
boolLiteral = new BoolLiteral(predicate.isNotExists());
} else if (subqueryStmt.hasAggInfo() && subqueryStmt.getAggInfo().hasAggregateExprs()
&& !subqueryStmt.hasAnalyticInfo() && subqueryStmt.getHavingPred() == null) {
boolLiteral = new BoolLiteral(!predicate.isNotExists());
}
return boolLiteral;
}
/**
* Replace all BetweenPredicates containing subqueries with their
* equivalent compound predicates from the expr tree rooted at 'expr'.
@@ -326,10 +357,17 @@ public class StmtRewriter {
subqueryStmt.limitElement_ = null;
}
if (expr instanceof ExistsPredicate) {
// For uncorrelated subqueries, we limit the number of rows returned by the
// subquery.
if (onClauseConjuncts.isEmpty()) subqueryStmt.setLimit(1);
}
// Update the subquery's select list and/or its GROUP BY clause by adding
// exprs from the extracted correlated predicates.
boolean updateGroupBy = expr.getSubquery().isScalarSubquery() ||
(expr instanceof ExistsPredicate && subqueryStmt.hasAggInfo());
(expr instanceof ExistsPredicate &&
subqueryStmt.hasAggInfo() && !subqueryStmt.getSelectList().isDistinct());
List<Expr> lhsExprs = Lists.newArrayList();
List<Expr> rhsExprs = Lists.newArrayList();
for (Expr conjunct: onClauseConjuncts) {
@@ -337,12 +375,6 @@ public class StmtRewriter {
lhsExprs, rhsExprs, updateGroupBy);
}
if (expr instanceof ExistsPredicate && onClauseConjuncts.isEmpty()) {
// For uncorrelated subqueries, we limit the number of rows returned by the
// subquery.
subqueryStmt.setLimit(1);
}
// Analyzing the inline view trigger reanalysis of the subquery's select statement.
// However the statement is already analyzed and since statement analysis is not
// idempotent, the analysis needs to be reset (by a call to clone()).
@@ -713,7 +745,6 @@ public class StmtRewriter {
// Update the subquery's select list.
boolean isDistinct = stmt.selectList_.isDistinct();
Preconditions.checkState(!isDistinct);
stmt.selectList_ = new SelectList(
items, isDistinct, stmt.selectList_.getPlanHints());
// Update subquery's GROUP BY clause

View File

@@ -613,12 +613,7 @@ select 1
from functional.alltypestiny t
where exists (select * from functional.alltypessmall limit 0)
---- PLAN
02:CROSS JOIN [BROADCAST]
|
|--01:EMPTYSET
|
00:SCAN HDFS [functional.alltypestiny t]
partitions=4/4 files=4 size=460B
00:EMPTYSET
====
# Multiple nesting levels
select count(*)
@@ -1419,3 +1414,86 @@ where
02:SCAN HDFS [functional.alltypesagg tt1]
partitions=11/11 files=11 size=814.73KB
====
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
# aggregates. All predicates evaluate to FALSE. (IMPALA-1550)
select 1
from functional.alltypestiny t1
where exists
(select id
from functional.alltypes t2
where t1.int_col = t2.int_col limit 0)
and not exists
(select count(distinct int_col)
from functional.alltypesagg t3
where t1.id = t3.id)
---- PLAN
00:EMPTYSET
====
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
# aggregates. All predicates evaluate to TRUE. (IMPALA-1550)
select 1
from functional.alltypestiny t1
where not exists
(select id
from functional.alltypes t2
where t1.int_col = t2.int_col limit 0)
and exists
(select count(distinct int_col), sum(distinct int_col)
from functional.alltypesagg t3
where t1.id = t3.id)
and not exists
(select sum(int_col)
from functional.alltypessmall t4
where t1.id = t4.id limit 0)
and not exists
(select min(int_col)
from functional.alltypestiny t5
where t1.id = t5.id and false)
---- PLAN
00:SCAN HDFS [functional.alltypestiny t1]
partitions=4/4 files=4 size=460B
====
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
# aggregates. Some predicates evaluate to TRUE while others need to
# be evaluated at run-time. (IMPALA-1550)
select 1
from functional.alltypestiny t1
where not exists
(select id
from functional.alltypes t2
where t1.int_col = t2.int_col limit 0)
and exists
(select distinct int_col
from functional.alltypesagg t3
where t3.id > 100 and t1.id = t3.id)
and not exists
(select count(id)
from functional.alltypestiny t4
where t4.int_col = t1.tinyint_col
having count(id) > 200)
---- PLAN
06:HASH JOIN [LEFT ANTI JOIN]
| hash predicates: t1.tinyint_col = t4.int_col
|
|--04:AGGREGATE [FINALIZE]
| | output: count(id)
| | group by: t4.int_col
| | having: count(id) > 200
| |
| 03:SCAN HDFS [functional.alltypestiny t4]
| partitions=4/4 files=4 size=460B
|
05:HASH JOIN [RIGHT SEMI JOIN]
| hash predicates: t3.id = t1.id
|
|--00:SCAN HDFS [functional.alltypestiny t1]
| partitions=4/4 files=4 size=460B
| predicates: t1.id > 100
|
02:AGGREGATE [FINALIZE]
| group by: int_col, t3.id
|
01:SCAN HDFS [functional.alltypesagg t3]
partitions=11/11 files=11 size=814.73KB
predicates: t3.id > 100
====

View File

@@ -759,3 +759,31 @@ UNION ALL SELECT 3
---- TYPES
TINYINT
====
---- QUERY
# Correlated NOT EXISTS subquery with an aggregate function (IMPALA-1550)
SELECT t1.bigint_col
FROM alltypestiny t1
WHERE NOT EXISTS
(SELECT SUM(smallint_col) AS int_col
FROM alltypestiny
WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
GROUP BY t1.bigint_col
---- RESULTS
---- TYPES
BIGINT
====
---- QUERY
# Correlated EXISTS subquery with an aggregate function (IMPALA-1550)
SELECT t1.bigint_col
FROM alltypestiny t1
WHERE EXISTS
(SELECT SUM(smallint_col) AS int_col
FROM alltypestiny
WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
GROUP BY t1.bigint_col
---- RESULTS
0
10
---- TYPES
BIGINT
====