mirror of
https://github.com/apache/impala.git
synced 2026-01-05 21:00:54 -05:00
IMPALA-1550: Invalid rewrite when EXISTS subqueries contain aggregate
functions This commit fixes an issue where a [NOT] EXISTS subquery that contains an aggregate function will sometimes be incorrectly rewritten into a join, thereby returning incorrect results. Change-Id: I18b211d76ee3de77d8061603ff5bb1fbceae2e60 Reviewed-on: http://gerrit.cloudera.org:8080/266 Reviewed-by: Dimitris Tsirogiannis <dtsirogiannis@cloudera.com> Tested-by: Internal Jenkins
This commit is contained in:
committed by
Internal Jenkins
parent
672ae91732
commit
4eceeacf16
@@ -235,6 +235,12 @@ public class AggregateInfo extends AggregateInfoBase {
|
||||
return outputToIntermediateTupleSmap_;
|
||||
}
|
||||
|
||||
public boolean hasAggregateExprs() {
|
||||
return !aggregateExprs_.isEmpty() ||
|
||||
(secondPhaseDistinctAggInfo_ != null &&
|
||||
!secondPhaseDistinctAggInfo_.getAggregateExprs().isEmpty());
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the tuple id produced in the final aggregation step.
|
||||
*/
|
||||
|
||||
@@ -220,6 +220,18 @@ public class StmtRewriter {
|
||||
"expression: " + conjunct.toSql());
|
||||
}
|
||||
|
||||
if (conjunct instanceof ExistsPredicate) {
|
||||
// Check if we can determine the result of an ExistsPredicate during analysis.
|
||||
// If so, replace the predicate with a BoolLiteral predicate and remove it from
|
||||
// the list of predicates to be rewritten.
|
||||
BoolLiteral boolLiteral = replaceExistsPredicate((ExistsPredicate) conjunct);
|
||||
if (boolLiteral != null) {
|
||||
boolLiteral.analyze(analyzer);
|
||||
smap.put(conjunct, boolLiteral);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Replace all the supported exprs with subqueries with true BoolLiterals
|
||||
// using an smap.
|
||||
BoolLiteral boolLiteral = new BoolLiteral(true);
|
||||
@@ -241,6 +253,25 @@ public class StmtRewriter {
|
||||
if (hasNewVisibleTuple) replaceUnqualifiedStarItems(stmt, numTableRefs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace an ExistsPredicate that contains a subquery with a BoolLiteral if we
|
||||
* can determine its result without evaluating it. Return null if the result of the
|
||||
* ExistsPredicate can only be determined at run-time.
|
||||
*/
|
||||
private static BoolLiteral replaceExistsPredicate(ExistsPredicate predicate) {
|
||||
Subquery subquery = predicate.getSubquery();
|
||||
Preconditions.checkNotNull(subquery);
|
||||
SelectStmt subqueryStmt = (SelectStmt) subquery.getStatement();
|
||||
BoolLiteral boolLiteral = null;
|
||||
if (subqueryStmt.getAnalyzer().hasEmptyResultSet()) {
|
||||
boolLiteral = new BoolLiteral(predicate.isNotExists());
|
||||
} else if (subqueryStmt.hasAggInfo() && subqueryStmt.getAggInfo().hasAggregateExprs()
|
||||
&& !subqueryStmt.hasAnalyticInfo() && subqueryStmt.getHavingPred() == null) {
|
||||
boolLiteral = new BoolLiteral(!predicate.isNotExists());
|
||||
}
|
||||
return boolLiteral;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace all BetweenPredicates containing subqueries with their
|
||||
* equivalent compound predicates from the expr tree rooted at 'expr'.
|
||||
@@ -326,10 +357,17 @@ public class StmtRewriter {
|
||||
subqueryStmt.limitElement_ = null;
|
||||
}
|
||||
|
||||
if (expr instanceof ExistsPredicate) {
|
||||
// For uncorrelated subqueries, we limit the number of rows returned by the
|
||||
// subquery.
|
||||
if (onClauseConjuncts.isEmpty()) subqueryStmt.setLimit(1);
|
||||
}
|
||||
|
||||
// Update the subquery's select list and/or its GROUP BY clause by adding
|
||||
// exprs from the extracted correlated predicates.
|
||||
boolean updateGroupBy = expr.getSubquery().isScalarSubquery() ||
|
||||
(expr instanceof ExistsPredicate && subqueryStmt.hasAggInfo());
|
||||
(expr instanceof ExistsPredicate &&
|
||||
subqueryStmt.hasAggInfo() && !subqueryStmt.getSelectList().isDistinct());
|
||||
List<Expr> lhsExprs = Lists.newArrayList();
|
||||
List<Expr> rhsExprs = Lists.newArrayList();
|
||||
for (Expr conjunct: onClauseConjuncts) {
|
||||
@@ -337,12 +375,6 @@ public class StmtRewriter {
|
||||
lhsExprs, rhsExprs, updateGroupBy);
|
||||
}
|
||||
|
||||
if (expr instanceof ExistsPredicate && onClauseConjuncts.isEmpty()) {
|
||||
// For uncorrelated subqueries, we limit the number of rows returned by the
|
||||
// subquery.
|
||||
subqueryStmt.setLimit(1);
|
||||
}
|
||||
|
||||
// Analyzing the inline view trigger reanalysis of the subquery's select statement.
|
||||
// However the statement is already analyzed and since statement analysis is not
|
||||
// idempotent, the analysis needs to be reset (by a call to clone()).
|
||||
@@ -713,7 +745,6 @@ public class StmtRewriter {
|
||||
|
||||
// Update the subquery's select list.
|
||||
boolean isDistinct = stmt.selectList_.isDistinct();
|
||||
Preconditions.checkState(!isDistinct);
|
||||
stmt.selectList_ = new SelectList(
|
||||
items, isDistinct, stmt.selectList_.getPlanHints());
|
||||
// Update subquery's GROUP BY clause
|
||||
|
||||
@@ -613,12 +613,7 @@ select 1
|
||||
from functional.alltypestiny t
|
||||
where exists (select * from functional.alltypessmall limit 0)
|
||||
---- PLAN
|
||||
02:CROSS JOIN [BROADCAST]
|
||||
|
|
||||
|--01:EMPTYSET
|
||||
|
|
||||
00:SCAN HDFS [functional.alltypestiny t]
|
||||
partitions=4/4 files=4 size=460B
|
||||
00:EMPTYSET
|
||||
====
|
||||
# Multiple nesting levels
|
||||
select count(*)
|
||||
@@ -1419,3 +1414,86 @@ where
|
||||
02:SCAN HDFS [functional.alltypesagg tt1]
|
||||
partitions=11/11 files=11 size=814.73KB
|
||||
====
|
||||
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
|
||||
# aggregates. All predicates evaluate to FALSE. (IMPALA-1550)
|
||||
select 1
|
||||
from functional.alltypestiny t1
|
||||
where exists
|
||||
(select id
|
||||
from functional.alltypes t2
|
||||
where t1.int_col = t2.int_col limit 0)
|
||||
and not exists
|
||||
(select count(distinct int_col)
|
||||
from functional.alltypesagg t3
|
||||
where t1.id = t3.id)
|
||||
---- PLAN
|
||||
00:EMPTYSET
|
||||
====
|
||||
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
|
||||
# aggregates. All predicates evaluate to TRUE. (IMPALA-1550)
|
||||
select 1
|
||||
from functional.alltypestiny t1
|
||||
where not exists
|
||||
(select id
|
||||
from functional.alltypes t2
|
||||
where t1.int_col = t2.int_col limit 0)
|
||||
and exists
|
||||
(select count(distinct int_col), sum(distinct int_col)
|
||||
from functional.alltypesagg t3
|
||||
where t1.id = t3.id)
|
||||
and not exists
|
||||
(select sum(int_col)
|
||||
from functional.alltypessmall t4
|
||||
where t1.id = t4.id limit 0)
|
||||
and not exists
|
||||
(select min(int_col)
|
||||
from functional.alltypestiny t5
|
||||
where t1.id = t5.id and false)
|
||||
---- PLAN
|
||||
00:SCAN HDFS [functional.alltypestiny t1]
|
||||
partitions=4/4 files=4 size=460B
|
||||
====
|
||||
# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
|
||||
# aggregates. Some predicates evaluate to TRUE while others need to
|
||||
# be evaluated at run-time. (IMPALA-1550)
|
||||
select 1
|
||||
from functional.alltypestiny t1
|
||||
where not exists
|
||||
(select id
|
||||
from functional.alltypes t2
|
||||
where t1.int_col = t2.int_col limit 0)
|
||||
and exists
|
||||
(select distinct int_col
|
||||
from functional.alltypesagg t3
|
||||
where t3.id > 100 and t1.id = t3.id)
|
||||
and not exists
|
||||
(select count(id)
|
||||
from functional.alltypestiny t4
|
||||
where t4.int_col = t1.tinyint_col
|
||||
having count(id) > 200)
|
||||
---- PLAN
|
||||
06:HASH JOIN [LEFT ANTI JOIN]
|
||||
| hash predicates: t1.tinyint_col = t4.int_col
|
||||
|
|
||||
|--04:AGGREGATE [FINALIZE]
|
||||
| | output: count(id)
|
||||
| | group by: t4.int_col
|
||||
| | having: count(id) > 200
|
||||
| |
|
||||
| 03:SCAN HDFS [functional.alltypestiny t4]
|
||||
| partitions=4/4 files=4 size=460B
|
||||
|
|
||||
05:HASH JOIN [RIGHT SEMI JOIN]
|
||||
| hash predicates: t3.id = t1.id
|
||||
|
|
||||
|--00:SCAN HDFS [functional.alltypestiny t1]
|
||||
| partitions=4/4 files=4 size=460B
|
||||
| predicates: t1.id > 100
|
||||
|
|
||||
02:AGGREGATE [FINALIZE]
|
||||
| group by: int_col, t3.id
|
||||
|
|
||||
01:SCAN HDFS [functional.alltypesagg t3]
|
||||
partitions=11/11 files=11 size=814.73KB
|
||||
predicates: t3.id > 100
|
||||
====
|
||||
|
||||
@@ -759,3 +759,31 @@ UNION ALL SELECT 3
|
||||
---- TYPES
|
||||
TINYINT
|
||||
====
|
||||
---- QUERY
|
||||
# Correlated NOT EXISTS subquery with an aggregate function (IMPALA-1550)
|
||||
SELECT t1.bigint_col
|
||||
FROM alltypestiny t1
|
||||
WHERE NOT EXISTS
|
||||
(SELECT SUM(smallint_col) AS int_col
|
||||
FROM alltypestiny
|
||||
WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
|
||||
GROUP BY t1.bigint_col
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
BIGINT
|
||||
====
|
||||
---- QUERY
|
||||
# Correlated EXISTS subquery with an aggregate function (IMPALA-1550)
|
||||
SELECT t1.bigint_col
|
||||
FROM alltypestiny t1
|
||||
WHERE EXISTS
|
||||
(SELECT SUM(smallint_col) AS int_col
|
||||
FROM alltypestiny
|
||||
WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
|
||||
GROUP BY t1.bigint_col
|
||||
---- RESULTS
|
||||
0
|
||||
10
|
||||
---- TYPES
|
||||
BIGINT
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user