diff --git a/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java b/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java index be2f4a5d9..d772c55e6 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java @@ -235,6 +235,12 @@ public class AggregateInfo extends AggregateInfoBase { return outputToIntermediateTupleSmap_; } + public boolean hasAggregateExprs() { + return !aggregateExprs_.isEmpty() || + (secondPhaseDistinctAggInfo_ != null && + !secondPhaseDistinctAggInfo_.getAggregateExprs().isEmpty()); + } + /** * Return the tuple id produced in the final aggregation step. */ diff --git a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java index ac046f9ad..6e9b9c782 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java @@ -220,6 +220,18 @@ public class StmtRewriter { "expression: " + conjunct.toSql()); } + if (conjunct instanceof ExistsPredicate) { + // Check if we can determine the result of an ExistsPredicate during analysis. + // If so, replace the predicate with a BoolLiteral predicate and remove it from + // the list of predicates to be rewritten. + BoolLiteral boolLiteral = replaceExistsPredicate((ExistsPredicate) conjunct); + if (boolLiteral != null) { + boolLiteral.analyze(analyzer); + smap.put(conjunct, boolLiteral); + continue; + } + } + // Replace all the supported exprs with subqueries with true BoolLiterals // using an smap. BoolLiteral boolLiteral = new BoolLiteral(true); @@ -241,6 +253,25 @@ public class StmtRewriter { if (hasNewVisibleTuple) replaceUnqualifiedStarItems(stmt, numTableRefs); } + /** + * Replace an ExistsPredicate that contains a subquery with a BoolLiteral if we + * can determine its result without evaluating it. Return null if the result of the + * ExistsPredicate can only be determined at run-time. + */ + private static BoolLiteral replaceExistsPredicate(ExistsPredicate predicate) { + Subquery subquery = predicate.getSubquery(); + Preconditions.checkNotNull(subquery); + SelectStmt subqueryStmt = (SelectStmt) subquery.getStatement(); + BoolLiteral boolLiteral = null; + if (subqueryStmt.getAnalyzer().hasEmptyResultSet()) { + boolLiteral = new BoolLiteral(predicate.isNotExists()); + } else if (subqueryStmt.hasAggInfo() && subqueryStmt.getAggInfo().hasAggregateExprs() + && !subqueryStmt.hasAnalyticInfo() && subqueryStmt.getHavingPred() == null) { + boolLiteral = new BoolLiteral(!predicate.isNotExists()); + } + return boolLiteral; + } + /** * Replace all BetweenPredicates containing subqueries with their * equivalent compound predicates from the expr tree rooted at 'expr'. @@ -326,10 +357,17 @@ public class StmtRewriter { subqueryStmt.limitElement_ = null; } + if (expr instanceof ExistsPredicate) { + // For uncorrelated subqueries, we limit the number of rows returned by the + // subquery. + if (onClauseConjuncts.isEmpty()) subqueryStmt.setLimit(1); + } + // Update the subquery's select list and/or its GROUP BY clause by adding // exprs from the extracted correlated predicates. boolean updateGroupBy = expr.getSubquery().isScalarSubquery() || - (expr instanceof ExistsPredicate && subqueryStmt.hasAggInfo()); + (expr instanceof ExistsPredicate && + subqueryStmt.hasAggInfo() && !subqueryStmt.getSelectList().isDistinct()); List lhsExprs = Lists.newArrayList(); List rhsExprs = Lists.newArrayList(); for (Expr conjunct: onClauseConjuncts) { @@ -337,12 +375,6 @@ public class StmtRewriter { lhsExprs, rhsExprs, updateGroupBy); } - if (expr instanceof ExistsPredicate && onClauseConjuncts.isEmpty()) { - // For uncorrelated subqueries, we limit the number of rows returned by the - // subquery. - subqueryStmt.setLimit(1); - } - // Analyzing the inline view trigger reanalysis of the subquery's select statement. // However the statement is already analyzed and since statement analysis is not // idempotent, the analysis needs to be reset (by a call to clone()). @@ -713,7 +745,6 @@ public class StmtRewriter { // Update the subquery's select list. boolean isDistinct = stmt.selectList_.isDistinct(); - Preconditions.checkState(!isDistinct); stmt.selectList_ = new SelectList( items, isDistinct, stmt.selectList_.getPlanHints()); // Update subquery's GROUP BY clause diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test index 36df1ff5d..e4da3648c 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test @@ -613,12 +613,7 @@ select 1 from functional.alltypestiny t where exists (select * from functional.alltypessmall limit 0) ---- PLAN -02:CROSS JOIN [BROADCAST] -| -|--01:EMPTYSET -| -00:SCAN HDFS [functional.alltypestiny t] - partitions=4/4 files=4 size=460B +00:EMPTYSET ==== # Multiple nesting levels select count(*) @@ -1419,3 +1414,86 @@ where 02:SCAN HDFS [functional.alltypesagg tt1] partitions=11/11 files=11 size=814.73KB ==== +# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and +# aggregates. All predicates evaluate to FALSE. (IMPALA-1550) +select 1 +from functional.alltypestiny t1 +where exists + (select id + from functional.alltypes t2 + where t1.int_col = t2.int_col limit 0) +and not exists + (select count(distinct int_col) + from functional.alltypesagg t3 + where t1.id = t3.id) +---- PLAN +00:EMPTYSET +==== +# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and +# aggregates. All predicates evaluate to TRUE. (IMPALA-1550) +select 1 +from functional.alltypestiny t1 +where not exists + (select id + from functional.alltypes t2 + where t1.int_col = t2.int_col limit 0) +and exists + (select count(distinct int_col), sum(distinct int_col) + from functional.alltypesagg t3 + where t1.id = t3.id) +and not exists + (select sum(int_col) + from functional.alltypessmall t4 + where t1.id = t4.id limit 0) +and not exists + (select min(int_col) + from functional.alltypestiny t5 + where t1.id = t5.id and false) +---- PLAN +00:SCAN HDFS [functional.alltypestiny t1] + partitions=4/4 files=4 size=460B +==== +# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and +# aggregates. Some predicates evaluate to TRUE while others need to +# be evaluated at run-time. (IMPALA-1550) +select 1 +from functional.alltypestiny t1 +where not exists + (select id + from functional.alltypes t2 + where t1.int_col = t2.int_col limit 0) +and exists + (select distinct int_col + from functional.alltypesagg t3 + where t3.id > 100 and t1.id = t3.id) +and not exists + (select count(id) + from functional.alltypestiny t4 + where t4.int_col = t1.tinyint_col + having count(id) > 200) +---- PLAN +06:HASH JOIN [LEFT ANTI JOIN] +| hash predicates: t1.tinyint_col = t4.int_col +| +|--04:AGGREGATE [FINALIZE] +| | output: count(id) +| | group by: t4.int_col +| | having: count(id) > 200 +| | +| 03:SCAN HDFS [functional.alltypestiny t4] +| partitions=4/4 files=4 size=460B +| +05:HASH JOIN [RIGHT SEMI JOIN] +| hash predicates: t3.id = t1.id +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| partitions=4/4 files=4 size=460B +| predicates: t1.id > 100 +| +02:AGGREGATE [FINALIZE] +| group by: int_col, t3.id +| +01:SCAN HDFS [functional.alltypesagg t3] + partitions=11/11 files=11 size=814.73KB + predicates: t3.id > 100 +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/subquery.test b/testdata/workloads/functional-query/queries/QueryTest/subquery.test index fcf35a2ff..520dcceb9 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/subquery.test +++ b/testdata/workloads/functional-query/queries/QueryTest/subquery.test @@ -759,3 +759,31 @@ UNION ALL SELECT 3 ---- TYPES TINYINT ==== +---- QUERY +# Correlated NOT EXISTS subquery with an aggregate function (IMPALA-1550) +SELECT t1.bigint_col +FROM alltypestiny t1 +WHERE NOT EXISTS + (SELECT SUM(smallint_col) AS int_col + FROM alltypestiny + WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col) +GROUP BY t1.bigint_col +---- RESULTS +---- TYPES +BIGINT +==== +---- QUERY +# Correlated EXISTS subquery with an aggregate function (IMPALA-1550) +SELECT t1.bigint_col +FROM alltypestiny t1 +WHERE EXISTS + (SELECT SUM(smallint_col) AS int_col + FROM alltypestiny + WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col) +GROUP BY t1.bigint_col +---- RESULTS +0 +10 +---- TYPES +BIGINT +====