IMPALA-1550: Invalid rewrite when EXISTS subqueries contain aggregate

functions This commit fixes an issue where a [NOT] EXISTS subquery that contains an aggregate function will sometimes be incorrectly rewritten into a join, thereby returning incorrect results. Change-Id: I18b211d76ee3de77d8061603ff5bb1fbceae2e60 Reviewed-on: http://gerrit.cloudera.org:8080/266 Reviewed-by: Dimitris Tsirogiannis <dtsirogiannis@cloudera.com> Tested-by: Internal Jenkins
2026-01-05 21:00:54 -05:00 · 2015-03-18 17:05:59 -07:00
parent 672ae91732
commit 4eceeacf16
4 changed files with 157 additions and 14 deletions
--- a/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java
+++ b/fe/src/main/java/com/cloudera/impala/analysis/AggregateInfo.java
@@ -235,6 +235,12 @@ public class AggregateInfo extends AggregateInfoBase {
    return outputToIntermediateTupleSmap_;
  }

+  public boolean hasAggregateExprs() {
+    return !aggregateExprs_.isEmpty() ||
+        (secondPhaseDistinctAggInfo_ != null &&
+         !secondPhaseDistinctAggInfo_.getAggregateExprs().isEmpty());
+  }
+
  /**
   * Return the tuple id produced in the final aggregation step.
   */
--- a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java
+++ b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java
@@ -220,6 +220,18 @@ public class StmtRewriter {
            "expression: " + conjunct.toSql());
      }

+      if (conjunct instanceof ExistsPredicate) {
+        // Check if we can determine the result of an ExistsPredicate during analysis.
+        // If so, replace the predicate with a BoolLiteral predicate and remove it from
+        // the list of predicates to be rewritten.
+        BoolLiteral boolLiteral = replaceExistsPredicate((ExistsPredicate) conjunct);
+        if (boolLiteral != null) {
+          boolLiteral.analyze(analyzer);
+          smap.put(conjunct, boolLiteral);
+          continue;
+        }
+      }
+
      // Replace all the supported exprs with subqueries with true BoolLiterals
      // using an smap.
      BoolLiteral boolLiteral = new BoolLiteral(true);
@@ -241,6 +253,25 @@ public class StmtRewriter {
    if (hasNewVisibleTuple) replaceUnqualifiedStarItems(stmt, numTableRefs);
  }

+  /**
+   * Replace an ExistsPredicate that contains a subquery with a BoolLiteral if we
+   * can determine its result without evaluating it. Return null if the result of the
+   * ExistsPredicate can only be determined at run-time.
+   */
+  private static BoolLiteral replaceExistsPredicate(ExistsPredicate predicate) {
+    Subquery subquery = predicate.getSubquery();
+    Preconditions.checkNotNull(subquery);
+    SelectStmt subqueryStmt = (SelectStmt) subquery.getStatement();
+    BoolLiteral boolLiteral = null;
+    if (subqueryStmt.getAnalyzer().hasEmptyResultSet()) {
+      boolLiteral = new BoolLiteral(predicate.isNotExists());
+    } else if (subqueryStmt.hasAggInfo() && subqueryStmt.getAggInfo().hasAggregateExprs()
+          && !subqueryStmt.hasAnalyticInfo() && subqueryStmt.getHavingPred() == null) {
+      boolLiteral = new BoolLiteral(!predicate.isNotExists());
+    }
+    return boolLiteral;
+  }
+
  /**
   * Replace all BetweenPredicates containing subqueries with their
   * equivalent compound predicates from the expr tree rooted at 'expr'.
@@ -326,10 +357,17 @@ public class StmtRewriter {
      subqueryStmt.limitElement_ = null;
    }

+    if (expr instanceof ExistsPredicate) {
+      // For uncorrelated subqueries, we limit the number of rows returned by the
+      // subquery.
+      if (onClauseConjuncts.isEmpty()) subqueryStmt.setLimit(1);
+    }
+
    // Update the subquery's select list and/or its GROUP BY clause by adding
    // exprs from the extracted correlated predicates.
    boolean updateGroupBy = expr.getSubquery().isScalarSubquery() ||
-        (expr instanceof ExistsPredicate && subqueryStmt.hasAggInfo());
+        (expr instanceof ExistsPredicate &&
+         subqueryStmt.hasAggInfo() && !subqueryStmt.getSelectList().isDistinct());
    List<Expr> lhsExprs = Lists.newArrayList();
    List<Expr> rhsExprs = Lists.newArrayList();
    for (Expr conjunct: onClauseConjuncts) {
@@ -337,12 +375,6 @@ public class StmtRewriter {
          lhsExprs, rhsExprs, updateGroupBy);
    }

-    if (expr instanceof ExistsPredicate && onClauseConjuncts.isEmpty()) {
-      // For uncorrelated subqueries, we limit the number of rows returned by the
-      // subquery.
-      subqueryStmt.setLimit(1);
-    }
-
    // Analyzing the inline view trigger reanalysis of the subquery's select statement.
    // However the statement is already analyzed and since statement analysis is not
    // idempotent, the analysis needs to be reset (by a call to clone()).
@@ -713,7 +745,6 @@ public class StmtRewriter {

    // Update the subquery's select list.
    boolean isDistinct = stmt.selectList_.isDistinct();
-    Preconditions.checkState(!isDistinct);
    stmt.selectList_ = new SelectList(
        items, isDistinct, stmt.selectList_.getPlanHints());
    // Update subquery's GROUP BY clause
--- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
@@ -613,12 +613,7 @@ select 1
 from functional.alltypestiny t
 where exists (select * from functional.alltypessmall limit 0)
 ---- PLAN
-02:CROSS JOIN [BROADCAST]
-|
-|--01:EMPTYSET
-|
-00:SCAN HDFS [functional.alltypestiny t]
-   partitions=4/4 files=4 size=460B
+00:EMPTYSET
 ====
 # Multiple nesting levels
 select count(*)
@@ -1419,3 +1414,86 @@ where
 02:SCAN HDFS [functional.alltypesagg tt1]
   partitions=11/11 files=11 size=814.73KB
 ====
+# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
+# aggregates. All predicates evaluate to FALSE. (IMPALA-1550)
+select 1
+from functional.alltypestiny t1
+where exists
+  (select id
+   from functional.alltypes t2
+   where t1.int_col = t2.int_col limit 0)
+and not exists
+  (select count(distinct int_col)
+   from functional.alltypesagg t3
+   where t1.id = t3.id)
+---- PLAN
+00:EMPTYSET
+====
+# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
+# aggregates. All predicates evaluate to TRUE. (IMPALA-1550)
+select 1
+from functional.alltypestiny t1
+where not exists
+  (select id
+   from functional.alltypes t2
+   where t1.int_col = t2.int_col limit 0)
+and exists
+  (select count(distinct int_col), sum(distinct int_col)
+   from functional.alltypesagg t3
+   where t1.id = t3.id)
+and not exists
+  (select sum(int_col)
+   from functional.alltypessmall t4
+   where t1.id = t4.id limit 0)
+and not exists
+  (select min(int_col)
+   from functional.alltypestiny t5
+   where t1.id = t5.id and false)
+---- PLAN
+00:SCAN HDFS [functional.alltypestiny t1]
+   partitions=4/4 files=4 size=460B
+====
+# Correlated EXISTS and NOT EXISTS subqueries with limit 0 and
+# aggregates. Some predicates evaluate to TRUE while others need to
+# be evaluated at run-time. (IMPALA-1550)
+select 1
+from functional.alltypestiny t1
+where not exists
+  (select id
+   from functional.alltypes t2
+   where t1.int_col = t2.int_col limit 0)
+and exists
+  (select distinct int_col
+   from functional.alltypesagg t3
+   where t3.id > 100 and t1.id = t3.id)
+and not exists
+  (select count(id)
+   from functional.alltypestiny t4
+   where t4.int_col = t1.tinyint_col
+   having count(id) > 200)
+---- PLAN
+06:HASH JOIN [LEFT ANTI JOIN]
+|  hash predicates: t1.tinyint_col = t4.int_col
+|
+|--04:AGGREGATE [FINALIZE]
+|  |  output: count(id)
+|  |  group by: t4.int_col
+|  |  having: count(id) > 200
+|  |
+|  03:SCAN HDFS [functional.alltypestiny t4]
+|     partitions=4/4 files=4 size=460B
+|
+05:HASH JOIN [RIGHT SEMI JOIN]
+|  hash predicates: t3.id = t1.id
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     partitions=4/4 files=4 size=460B
+|     predicates: t1.id > 100
+|
+02:AGGREGATE [FINALIZE]
+|  group by: int_col, t3.id
+|
+01:SCAN HDFS [functional.alltypesagg t3]
+   partitions=11/11 files=11 size=814.73KB
+   predicates: t3.id > 100
+====
--- a/testdata/workloads/functional-query/queries/QueryTest/subquery.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/subquery.test
@@ -759,3 +759,31 @@ UNION ALL SELECT 3
 ---- TYPES
 TINYINT
 ====
+---- QUERY
+# Correlated NOT EXISTS subquery with an aggregate function (IMPALA-1550)
+SELECT t1.bigint_col
+FROM alltypestiny t1
+WHERE NOT EXISTS
+  (SELECT SUM(smallint_col) AS int_col
+   FROM alltypestiny
+   WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
+GROUP BY t1.bigint_col
+---- RESULTS
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Correlated EXISTS subquery with an aggregate function (IMPALA-1550)
+SELECT t1.bigint_col
+FROM alltypestiny t1
+WHERE EXISTS
+  (SELECT SUM(smallint_col) AS int_col
+   FROM alltypestiny
+   WHERE t1.date_string_col = string_col AND t1.timestamp_col = timestamp_col)
+GROUP BY t1.bigint_col
+---- RESULTS
+0
+10
+---- TYPES
+BIGINT
+====