diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java index df8d42a08..e949123dd 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Expr.java +++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java @@ -139,6 +139,18 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl } }; + // Returns true if an Expr has a subquery as a direct child. + public static final com.google.common.base.Predicate HAS_SUBQUERY_CHILD = + new com.google.common.base.Predicate() { + @Override + public boolean apply(Expr arg) { + for (Expr child : arg.getChildren()) { + if (child instanceof Subquery) return true; + } + return false; + } + }; + // Returns true if an Expr is an aggregate function that returns non-null on // an empty set (e.g. count). public static final com.google.common.base.Predicate diff --git a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java index 440dd70af..96d4b9c54 100644 --- a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java @@ -187,6 +187,16 @@ public class SelectStmt extends QueryStmt { groupingExprs_.addAll(addtlGroupingExprs); } + /** + * Remove the group by clause. Used by StmtRewriter. This changes the semantics + * of this statement and should only be called when the query is being rewritten + * in a way such that the GROUP BY is *not* required for correctness. + */ + protected void removeGroupBy() { + groupByClause_ = null; + groupingExprs_ = null; + } + // Column alias generator used during query rewriting. private ColumnAliasGenerator columnAliasGenerator_ = null; public ColumnAliasGenerator getColumnAliasGenerator() { @@ -670,13 +680,7 @@ public class SelectStmt extends QueryStmt { } private boolean checkForAggregates() throws AnalysisException { - if (groupingExprs_ == null && !selectList_.isDistinct() - && !TreeNode.contains(resultExprs_, Expr.IS_AGGREGATE) - && (havingPred_ == null - || !havingPred_.contains(Expr.IS_AGGREGATE)) - && (sortInfo_ == null - || !TreeNode.contains(sortInfo_.getSortExprs(), - Expr.IS_AGGREGATE))) { + if (!hasAggregate(/*includeDistinct=*/ true)) { // We're not computing aggregates but we still need to register the HAVING // clause which could, e.g., contain a constant expression evaluating to false. if (havingPred_ != null) analyzer_.registerConjuncts(havingPred_, true); @@ -1358,4 +1362,17 @@ public class SelectStmt extends QueryStmt { // In all other cases, return false. return false; } + + /** + * @param includeDistinct if true, a distinct in the select list is counted + * as requiring an aggregation + * @returns true if query block has an aggregate function or grouping. + */ + public boolean hasAggregate(boolean includeDistinct) throws AnalysisException { + return groupingExprs_ != null || (includeDistinct && selectList_.isDistinct()) || + TreeNode.contains(resultExprs_, Expr.IS_AGGREGATE) || + (havingPred_ != null && havingPred_.contains(Expr.IS_AGGREGATE)) || + (sortInfo_ != null && + TreeNode.contains(sortInfo_.getSortExprs(), Expr.IS_AGGREGATE)); + } } diff --git a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java index 16d6bf230..4313c1c4b 100644 --- a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java +++ b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java @@ -39,8 +39,8 @@ import static org.apache.impala.analysis.ToSqlOptions.REWRITTEN; * Class representing a statement rewriter. The base class traverses the stmt tree and * the specific rewrite rules are implemented in the subclasses and are called by the * hooks in the base class. - * TODO: Now that we have a nested-loop join supporting all join modes we could - * allow more rewrites, although it is not clear we would always want to. + * TODO: IMPALA-9948: Now that we have a nested-loop join supporting all join modes we + * could allow more rewrites, although it is not clear we would always want to. */ public class StmtRewriter { private final static Logger LOG = LoggerFactory.getLogger(StmtRewriter.class); @@ -309,6 +309,7 @@ public class StmtRewriter { boolean isScalarSubquery = expr.getSubquery().isScalarSubquery(); boolean isScalarColumn = expr.getSubquery().returnsScalarColumn(); boolean isRuntimeScalar = subqueryStmt.isRuntimeScalar(); + boolean isDisjunctive = hasSubqueryInDisjunction(expr); // Create a new inline view from the subquery stmt. The inline view will be added // to the stmt's table refs later. Explicitly set the inline view's column labels // to eliminate any chance that column aliases from the parent query could reference @@ -321,6 +322,20 @@ public class StmtRewriter { new InlineViewRef(stmt.getTableAliasGenerator().getNextAlias(), subqueryStmt, colLabels); + // To handle a subquery in a disjunct, we need to pull out the subexpression that + // is the immediate parent of the subquery and prepare to add additional predicates + // to the WHERE clause of 'stmt'. + List whereClauseConjuncts = null; + List whereClauseSmapLhs = null; + List whereClauseSmapRhs = null; + if (isDisjunctive) { + whereClauseConjuncts = new ArrayList(); + whereClauseSmapLhs = new ArrayList(); + whereClauseSmapRhs = new ArrayList(); + expr = replaceSubqueryInDisjunct(expr, inlineView, subqueryStmt, + whereClauseConjuncts, whereClauseSmapLhs, whereClauseSmapRhs); + } + // Extract all correlated predicates from the subquery. List onClauseConjuncts = extractCorrelatedPredicates(subqueryStmt); if (!onClauseConjuncts.isEmpty()) { @@ -350,15 +365,46 @@ public class StmtRewriter { // However the statement is already analyzed and since statement analysis is not // idempotent, the analysis needs to be reset. inlineView.reset(); - inlineView.analyze(analyzer); + try { + inlineView.analyze(analyzer); + } catch (AnalysisException e) { + // We can't identify all the aggregate functions until the subquery is fully + // analyzed, so we need to catch the exception here and produce a more helpful + // error message. + if (isDisjunctive && subqueryStmt.hasAggregate(/*includeDistinct=*/ false)) { + // TODO: IMPALA-5098: we could easily support this if DISTINCT and aggregates + // were supported in the same query block. + throw new AnalysisException("Aggregate functions in subquery in disjunction " + + "not supported: " + subqueryStmt.toSql()); + } + throw e; + } inlineView.setLeftTblRef(stmt.fromClause_.get(stmt.fromClause_.size() - 1)); stmt.fromClause_.add(inlineView); - JoinOperator joinOp = JoinOperator.LEFT_SEMI_JOIN; // Create a join conjunct from the expr that contains a subquery. Expr joinConjunct = - createJoinConjunct(expr, inlineView, analyzer, !onClauseConjuncts.isEmpty()); - if (joinConjunct != null) { + createJoinConjunct(expr, inlineView, analyzer, !onClauseConjuncts.isEmpty()); + JoinOperator joinOp = JoinOperator.LEFT_SEMI_JOIN; + + if (isDisjunctive) { + // Special case handling of disjunctive subqueries - add the WHERE conjuncts + // generated above and convert to a LEFT OUTER JOIN so we can reference slots + // from subquery. + for (Expr rhsExpr : whereClauseSmapRhs) { + rhsExpr.analyze(analyzer); + } + ExprSubstitutionMap smap = + new ExprSubstitutionMap(whereClauseSmapLhs, whereClauseSmapRhs); + for (Expr pred : whereClauseConjuncts) { + pred = pred.substitute(smap, analyzer, false); + stmt.whereClause_ = + CompoundPredicate.createConjunction(pred, stmt.whereClause_); + } + joinOp = JoinOperator.LEFT_OUTER_JOIN; + updateSelectList = true; + if (joinConjunct != null) onClauseConjuncts.add(joinConjunct); + } else if (joinConjunct != null) { SelectListItem firstItem = ((SelectStmt) inlineView.getViewStmt()).getSelectList().getItems().get(0); if (!onClauseConjuncts.isEmpty() && firstItem.getExpr() != null && @@ -461,7 +507,7 @@ public class StmtRewriter { } if (!hasEqJoinPred && !inlineView.isCorrelated()) { - // TODO: Requires support for non-equi joins. + // TODO: IMPALA-9948: we could support non-equi joins here // TODO: Remove this when independent subquery evaluation is implemented. // TODO: IMPALA-5100 to cover all cases, we do let through runtime scalars with // group by clauses to allow for subqueries where we haven't implemented plan time @@ -527,6 +573,78 @@ public class StmtRewriter { return updateSelectList; } + /** + * Handle a single subquery in 'expr', which is a predicate containing a disjunction, + * which in turn contains a subquery. The inline view and subqueryStmt are modified + * as needed and where clause predicates are generated and added to + * 'whereClauseConjuncts'. A smap constructed from 'smapLhs' and 'smapRhs' will be + * later applied to 'whereClauseConjuncts'. Exprs in 'smapRhs' will be analyzed by + * the caller before construction of the smap. + * + * 'subqueryStmt' must have a single item in its select list. + * + * @returns the parent expr of the subquery to be converted into a join conjunct + * in the containing statement of the subquery. + * @throws AnalysisException if this predicate cannot be converted into a join + * conjunct. + */ + static private Expr replaceSubqueryInDisjunct(Expr expr, InlineViewRef inlineView, + SelectStmt subqueryStmt, List whereClauseConjuncts, + List smapLhs, List smapRhs) throws AnalysisException { + Preconditions.checkState(subqueryStmt.getSelectList().getItems().size() == 1); + List parents = new ArrayList<>(); + expr.collect(Expr.HAS_SUBQUERY_CHILD, parents); + Preconditions.checkState(parents.size() == 1, "Must contain exactly 1 subquery"); + Expr parent = parents.get(0); + + // The caller will convert the IN predicate, a binary predicate against a + // scalar subquery and and any correlated predicates into join predicates. + // We can then replace the expression referencing the subquery with a NULL or + // IS NOT NULL referencing the select list item from the inline view, e.g.: + // + // WHERE OR inlineview.col IS NOT NULL. + // + // Other expressions are not supported and rejected earlier in analysis. + // TODO: add support for [NOT] EXISTS. We could implement [NOT] EXISTS + // support by manipulating the select list of the subquery so that it + // includes a constant value, then referencing that in the generated WHERE conjunct. + if (parent instanceof ExistsPredicate) { + throw new AnalysisException("EXISTS/NOT EXISTS subqueries in OR predicates are " + + "not supported: " + expr.toSql()); + } else if (parent instanceof InPredicate && ((InPredicate)parent).isNotIn()) { + throw new AnalysisException("NOT IN subqueries in OR predicates are not " + + "supported: " + expr.toSql()); + } else if (!(parent instanceof Predicate)) { + // If the predicate is not the parent of the subquery, it requires more work to + // convert into a join conjunct. + // TODO: IMPALA-5226: handle a broader spectrum of expressions in where clause + // conjuncts. + throw new AnalysisException("Subqueries that are arguments to non-predicate " + + "exprs are not supported inside OR: " + expr.toSql()); + } + Preconditions.checkState(parent instanceof InPredicate || + parent instanceof BinaryPredicate || parent instanceof LikePredicate, parent); + // Get a reference to the first select list item from the IN. + SlotRef slotRef = new SlotRef(Lists.newArrayList(inlineView.getUniqueAlias(), + inlineView.getColLabels().get(0))); + // Add the original predicate to the where clause, and set up the subquery to be + // replaced. + whereClauseConjuncts.add(expr); + // We are going to do a LEFT OUTER equi-join against the single select list item + // from the subquery. We need each left input row to match at most one row from + // the right input, which we can ensure by adding a distinct to the subquery. + // The distinct supersedes any pre-existing grouping. + if (!subqueryStmt.returnsSingleRow()) { + subqueryStmt.getSelectList().setIsDistinct(true); + subqueryStmt.removeGroupBy(); + } + smapLhs.add(parent); + // The new IsNullPredicate is not analyzed, but will be analyzed during + // construction of the smap. + smapRhs.add(new IsNullPredicate(slotRef, true)); + return parent; + } + /** * Replace all unqualified star exprs ('*') from stmt's select list with qualified * ones, i.e. tbl_1.*,...,tbl_n.*, where tbl_1,...,tbl_n are the visible tablerefs @@ -946,10 +1064,6 @@ public class StmtRewriter { throws AnalysisException { // Rewrite all the subqueries in the HAVING clause. if (stmt.hasHavingClause() && stmt.havingClause_.getSubquery() != null) { - if (hasSubqueryInDisjunction(stmt.havingClause_)) { - throw new AnalysisException("Subqueries in OR predicates are not supported: " - + stmt.havingClause_.toSql()); - } rewriteHavingClauseSubqueries(stmt, analyzer); } @@ -957,12 +1071,6 @@ public class StmtRewriter { if (stmt.hasWhereClause()) { // Push negation to leaf operands. stmt.whereClause_ = Expr.pushNegationToOperands(stmt.whereClause_); - // Check if we can rewrite the subqueries in the WHERE clause. OR predicates with - // subqueries are not supported. - if (hasSubqueryInDisjunction(stmt.whereClause_)) { - throw new AnalysisException("Subqueries in OR predicates are not supported: " + - stmt.whereClause_.toSql()); - } rewriteWhereClauseSubqueries(stmt, analyzer); } rewriteSelectListSubqueries(stmt, analyzer); diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeSubqueriesTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeSubqueriesTest.java index 741270940..e4be4d09c 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeSubqueriesTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeSubqueriesTest.java @@ -324,18 +324,13 @@ public class AnalyzeSubqueriesTest extends AnalyzerTest { AnalyzesOk(String.format("select id from functional.alltypestiny t where " + "int_col %s (select int_col from functional.alltypestiny t)", op)); - // OR with subquery predicates + // OR with subquery predicates. Only a single subquery is supported. AnalysisError(String.format("select * from functional.alltypes t where t.id %s " + - "(select id from functional.alltypesagg) or t.bool_col = false", op), - String.format("Subqueries in OR predicates are not supported: t.id %s " + - "(SELECT id FROM functional.alltypesagg) OR t.bool_col = FALSE", op)); - AnalysisError(String.format("select id from functional.allcomplextypes t where " + - "id %s " + - "(select f1 from t.struct_array_col a where t.int_struct_col.f1 < a.f1) " + - "or id < 10", op), - String.format("Subqueries in OR predicates are not supported: id %s " + - "(SELECT f1 FROM t.struct_array_col a WHERE t.int_struct_col.f1 < a.f1) " + - "OR id < 10", op)); + "(select id from functional.alltypesagg) or t.int_col %s " + + "(select int_col from functional.alltypesagg)", op, op), + String.format("Multiple subqueries are not supported in expression: t.id %s " + + "(SELECT id FROM functional.alltypesagg) OR t.int_col %s " + + "(SELECT int_col FROM functional.alltypesagg)", op, op)); // Binary predicate with non-comparable operands AnalysisError(String.format("select * from functional.alltypes t where " + @@ -438,29 +433,6 @@ public class AnalyzeSubqueriesTest extends AnalyzerTest { "t.id < 10 or not (t.int_col %s (select int_col from " + "functional.alltypesagg) and t.bool_col = false))", op)); } - - // Negated [NOT] IN subquery with disjunction. - AnalysisError("select * from functional.alltypes t where not (t.id in " + - "(select id from functional.alltypesagg) and t.int_col = 10)", - "Subqueries in OR predicates are not supported: t.id NOT IN " + - "(SELECT id FROM functional.alltypesagg) OR t.int_col != 10"); - AnalysisError("select * from functional.alltypes t where not (t.id not in " + - "(select id from functional.alltypesagg) and t.int_col = 10)", - "Subqueries in OR predicates are not supported: t.id IN " + - "(SELECT id FROM functional.alltypesagg) OR t.int_col != 10"); - - // Exists subquery with disjunction. - AnalysisError("select * from functional.alltypes t where exists " + - "(select * from functional.alltypesagg g where g.bool_col = false) " + - "or t.bool_col = true", "Subqueries in OR predicates are not " + - "supported: EXISTS (SELECT * FROM functional.alltypesagg g WHERE " + - "g.bool_col = FALSE) OR t.bool_col = TRUE"); - - // Comparator-based subquery with disjunction. - AnalysisError("select * from functional.alltypes t where t.id = " + - "(select min(id) from functional.alltypesagg g) or t.id = 10", - "Subqueries in OR predicates are not supported: t.id = " + - "(SELECT min(id) FROM functional.alltypesagg g) OR t.id = 10"); } @Test @@ -592,7 +564,7 @@ public class AnalyzeSubqueriesTest extends AnalyzerTest { AnalysisError(String.format("select * from functional.alltypestiny t where " + "%s (select * from functional.alltypesagg a where a.id = t.id) or %s " + "(select * from functional.alltypessmall s where s.int_col = t.int_col)", op, - op), String.format("Subqueries in OR predicates are not supported: %s " + + op), String.format("Multiple subqueries are not supported in expression: %s " + "(SELECT * FROM functional.alltypesagg a WHERE a.id = t.id) OR %s (SELECT " + "* FROM functional.alltypessmall s WHERE s.int_col = t.int_col)", op.toUpperCase(), op.toUpperCase())); @@ -778,6 +750,200 @@ public class AnalyzeSubqueriesTest extends AnalyzerTest { AnalyzesOk("select int_col from functional.alltypestiny a where " + "not exists (select 1 as int_col from functional.alltypesagg b " + "where a.int_col = b.int_col)"); + + // TODO: IMPALA-9945: EXISTS subquery inside FunctionCallExpr results in + // IllegalStateException. + AnalysisError("select * from functional.alltypes t " + + "where nvl(exists(select id from functional.alltypesagg), false)", + "Failed analysis after expr substitution."); + // TODO: IMPALA-9945: EXISTS subquery inside CaseExpr results in + // IllegalStateException. + AnalysisError("select * from functional.alltypes t " + + "where case when id % 2 = 0 then exists(select id from functional.alltypesagg) " + + "end", + "Failed analysis after expr substitution."); + } + + /** + * Test subqueries in an OR predicate. Most cases cannot be handled by the + * StmtRewriter and should fail in analysis - see IMPALA-5226. + */ + @Test + public void TestDisjunctiveSubqueries() throws AnalysisException { + // Only a single IN subquery is supported. Multiple subquery error cases are tested + // in TestInSubqueries. We test the remaining cases here. + + // Basic IN subquery. + AnalyzesOk("select * from functional.alltypes t where t.id IN " + + "(select id from functional.alltypesagg) or t.bool_col = false"); + + // IN subquery with unnesting of array. + AnalyzesOk("select id from functional.allcomplextypes t where id IN " + + "(select f1 from t.struct_array_col a where t.int_struct_col.f1 < a.f1) " + + "or id < 10"); + + // IN scalar subquery. + AnalyzesOk("select * from functional.alltypes where int_col > 10 or " + + "string_col in (select min(string_col) from functional.alltypestiny)"); + AnalyzesOk("select * from functional.alltypes where int_col > 10 or " + + "string_col in (select min(string_col) from functional.alltypestiny " + + "having min(string_col) > 'abc')"); + + // Aggregate functions with GROUP BY not supported in disjunctive subquery. + AnalysisError("select * from functional.alltypes where int_col > 10 or " + + "string_col in (select min(string_col) from functional.alltypestiny " + + "group by int_col)", + "Aggregate functions in subquery in disjunction not supported: " + + "SELECT min(string_col) FROM functional.alltypestiny GROUP BY int_col"); + + // Aggregate functions in HAVING not supported in disjunctive subquery. + AnalysisError("select * from functional.alltypes where int_col > 10 or " + + "string_col in (select string_col from functional.alltypestiny " + + "group by string_col having min(int_col) > 1)", + "Aggregate functions in subquery in disjunction not supported: " + + "SELECT string_col FROM functional.alltypestiny GROUP BY string_col " + + "HAVING min(int_col) > 1"); + + // GROUP BY in subquery can be ignored if not aggregate function are present. + AnalyzesOk("select * from functional.alltypes t where t.id IN " + + "(select int_col from functional.alltypesagg group by int_col, string_col) " + + "or t.bool_col = false"); + AnalyzesOk("select * from functional.alltypes t where t.id IN " + + "(select int_col from functional.alltypesagg group by int_col) " + + "or t.bool_col = false"); + + // DISTINCT in subquery should work. + AnalyzesOk("select * from functional.alltypes t where t.id IN " + + "(select distinct id from functional.alltypesagg) or t.bool_col = false"); + + // NOT IN, EXISTS and NOT EXISTS subqueries are not supported. + AnalysisError("select * from functional.alltypes t where t.id not in " + + "(select id from functional.alltypesagg) OR t.int_col = 10", + "NOT IN subqueries in OR predicates are not supported: t.id NOT IN " + + "(SELECT id FROM functional.alltypesagg) OR t.int_col = 10"); + AnalysisError("select * from functional.alltypes t where exists (select 1 " + + "from functional.alltypesagg t2 where t.id = t2.id) OR t.int_col = 10", + "EXISTS/NOT EXISTS subqueries in OR predicates are not supported: EXISTS " + + "(SELECT 1 FROM functional.alltypesagg t2 WHERE t.id = t2.id) OR " + + "t.int_col = 10"); + + // Negated [NOT] IN subquery with disjunction. Only IN can be rewritten by the + // analyzer. + AnalysisError("select * from functional.alltypes t where not (t.id in " + + "(select id from functional.alltypesagg) and t.int_col = 10)", + "NOT IN subqueries in OR predicates are not supported: t.id NOT IN " + + "(SELECT id FROM functional.alltypesagg) OR t.int_col != 10"); + AnalyzesOk("select * from functional.alltypes t where not (t.id not in " + + "(select id from functional.alltypesagg) and t.int_col = 10)"); + + // Comparator-based subqueries in disjunctions are supported for scalar subqueries + // but not runtime scalar subqueries. + AnalyzesOk("select * from functional.alltypes t where t.id = " + + "(select min(id) from functional.alltypesagg g) or t.id = 10"); + AnalyzesOk("select * from functional.alltypes t where t.id = " + + "(select min(id) from functional.alltypesagg g where t.int_col = g.int_col) " + + "or t.id = 10"); + AnalysisError("select * from functional.alltypes t where t.id = " + + "(select id from functional.alltypesagg g where t.int_col = g.int_col) " + + "or t.id = 10", + "Unsupported correlated subquery with runtime scalar check: SELECT id FROM " + + "functional.alltypesagg g WHERE t.int_col = g.int_col"); + + // OR predicates must be at top-level - can't be more deeply nested inside another + // predicate. + AnalysisError("select * from functional.alltypes " + + "where int_col > 10 = (string_col in ('a', 'b') or string_col in " + + "(select min(string_col) from functional.alltypestiny))", + "IN subquery predicates are not supported in binary predicates: int_col > 10 = " + + "(string_col IN ('a', 'b') OR string_col IN (SELECT min(string_col) " + + "FROM functional.alltypestiny))"); + + // IN subquery inside function call inside OR is supported. + AnalyzesOk("select * from functional.alltypes t " + + "where nvl(int_col in (select id from functional.alltypesagg), false) or id = 2"); + + // IN subquery inside case inside OR is supported. + AnalyzesOk("select * from functional.alltypes t " + + "where case when id % 2 = 0 then int_col in (" + + " select id from functional.alltypesagg) end or id = 2"); + + // BETWEEN predicate is rewritten during analysis into inequality operators. The + // subquery rewriter cannot directly handle BETWEEN predicates, but should be able + // to handle the rewritten version of BETWEEN. Ensure this is handled correctly, + // with the same validations applied as other predicates. + AnalyzesOk("select int_col from functional.alltypes t " + + " where (t.int_col is null or (t.int_col between " + + " (select min(int_col) from functional.alltypesagg t2 where t.year = t2.year)" + + " and 2))"); + AnalysisError("select int_col from functional.alltypes t " + + " where (t.int_col is null or (t.int_col between " + + " (select min(int_col) from functional.alltypesagg t2 where t.year = t2.year) " + + " and (select max(int_col) from functional.alltypesagg t3)))", + "Multiple subqueries are not supported in expression: (t.int_col IS NULL OR " + + "t.int_col >= (SELECT min(int_col) FROM functional.alltypesagg t2 WHERE " + + "t.`year` = t2.`year`) AND " + + "t.int_col <= (SELECT max(int_col) FROM functional.alltypesagg t3))"); + + // LIKE predicate can be converted into a join conjunct. + AnalyzesOk("select * from functional.alltypes t1 " + + "where string_col like (select min(string_col) from functional.alltypesagg t2 " + + " where t1.int_col = t2.int_col) or id = 2"); + + // OR predicates nested inside a different expr type should be rejected - + // the rewrite is only implemented for WHERE and HAVING conjuncts with OR + // at the top level. The different Expr subclasses should reject subqueries + // during analysis. We try to test all Expr subclasses that could contain + // a Subquery here. Note that many of these are also unsupported in general - + // i.e. even outside a disjunct. + // TODO: IMPALA-5226: support all of these that can be safely rewritten into + // joins. + // IsNullPredicate rejects nested subqueries. + AnalysisError("select * from functional.alltypes t " + + "where cast((t.id IN (select id from functional.alltypesagg) " + + " or t.bool_col = false) as string) is not null ", + "Unsupported IS NULL predicate that contains a subquery: " + + "CAST((t.id IN (SELECT id FROM functional.alltypesagg) " + + "OR t.bool_col = FALSE) AS STRING) IS NOT NULL"); + AnalysisError("select * from functional.alltypes t " + + "where cast((t.id IN (select id from functional.alltypesagg) " + + " or t.bool_col = false) as string) is null ", + "Unsupported IS NULL predicate that contains a subquery: " + + "CAST((t.id IN (SELECT id FROM functional.alltypesagg) " + + "OR t.bool_col = FALSE) AS STRING) IS NULL"); + // BinaryPredicate rejects nested subqueries. + AnalysisError("select * from functional.alltypes t " + + "where (t.id IN (select id from functional.alltypesagg) " + + "or t.bool_col = false) > 1", + "IN subquery predicates are not supported in binary predicates: " + + "(t.id IN (SELECT id FROM functional.alltypesagg) OR t.bool_col = FALSE) > 1"); + // InPredicate rejects nested subqueries. + AnalysisError("select * from functional.alltypes t1 " + + "where id in (1, (select min(id) from functional.alltypesagg t2 " + + "where t1.int_col = t2.int_col)) or t1.bool_col = false", + "Unsupported IN predicate with a subquery: id IN 1, " + + "(SELECT min(id) FROM functional.alltypesagg t2 WHERE t1.int_col = t2.int_col)"); + + // Subquery that is argument of ArithmeticExpr inside OR is not supported. + AnalysisError("select * from functional.alltypes t1 " + + "where (select min(int_col) from functional.alltypesagg t2 " + + " where t1.int_col = t2.int_col) % 2 = 0 or id = 1", + "Subqueries that are arguments to non-predicate exprs are not supported inside " + + "OR: (SELECT min(int_col) FROM functional.alltypesagg t2 " + + "WHERE t1.int_col = t2.int_col) % 2 = 0 OR id = 1"); + // Subquery that is argument of TimestampArithmeticExpr inside OR is not supported. + AnalysisError("select * from functional.alltypes t1 " + + "where (select min(timestamp_col) from functional.alltypesagg t2 " + + " where t1.int_col = t2.int_col) + interval 1 day > '2020-01-01' or id = 2", + "Subqueries that are arguments to non-predicate exprs are not supported inside " + + "OR: (SELECT min(timestamp_col) FROM functional.alltypesagg t2 " + + "WHERE t1.int_col = t2.int_col) + INTERVAL 1 day > '2020-01-01' OR id = 2"); + // Subquery that is argument of CaseExpr inside OR is not supported. + AnalysisError("select * from functional.alltypes t " + + "where case when id % 2 = 0 then (" + + " select bool_col from functional.alltypesagg) end or id = 2", + "Subqueries that are arguments to non-predicate exprs are not supported inside " + + "OR: CASE WHEN id % 2 = 0 THEN (" + + "SELECT bool_col FROM functional.alltypesagg) END OR id = 2"); } @Test diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test index cb44adcd7..b29a2fb67 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test @@ -3429,7 +3429,7 @@ PLAN-ROOT SINK HDFS partitions=11/11 files=11 size=814.73KB runtime filters: RF000 -> alltypesagg.id row-size=39B cardinality=11.00K -======= +==== # IMPALA-9905 Cardinality Check for subqueries with grouping expressions and # scalar return types select bigint_col from functional.alltypes where int_col >= @@ -3501,3 +3501,693 @@ PLAN-ROOT SINK HDFS partitions=24/24 files=24 size=478.45KB row-size=12B cardinality=7.30K ==== +# Basic IN subquery in disjunct. +select * from functional.alltypes t +where t.id IN (select id from functional.alltypesagg) or t.bool_col = false +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.id = id +| other predicates: id IS NOT NULL OR t.bool_col = FALSE +| row-size=93B cardinality=7.81K +| +|--02:AGGREGATE [FINALIZE] +| | group by: id +| | row-size=4B cardinality=10.28K +| | +| 01:SCAN HDFS [functional.alltypesagg] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.id = id +| other predicates: id IS NOT NULL OR t.bool_col = FALSE +| row-size=93B cardinality=7.81K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | group by: id +| | row-size=4B cardinality=10.28K +| | +| 04:EXCHANGE [HASH(id)] +| | +| 02:AGGREGATE [STREAMING] +| | group by: id +| | row-size=4B cardinality=10.28K +| | +| 01:SCAN HDFS [functional.alltypesagg] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# IN subquery with unnesting of array in disjunct. +select id from functional_parquet.complextypestbl t +where id in (7, 9, 10) or + id + 1 IN (select item from t.int_array a where t.nested_struct.a < a.item) +---- PLAN +PLAN-ROOT SINK +| +01:SUBPLAN +| row-size=28B cardinality=unavailable +| +|--05:NESTED LOOP JOIN [RIGHT OUTER JOIN] +| | join predicates: t.nested_struct.a < item, item = id + 1 +| | predicates: id IN (7, 9, 10) OR item IS NOT NULL +| | row-size=28B cardinality=10 +| | +| |--02:SINGULAR ROW SRC +| | row-size=24B cardinality=1 +| | +| 04:AGGREGATE [FINALIZE] +| | group by: item +| | row-size=4B cardinality=10 +| | +| 03:UNNEST [t.int_array a] +| row-size=0B cardinality=10 +| +00:SCAN HDFS [functional_parquet.complextypestbl t] + HDFS partitions=1/1 files=2 size=6.92KB + row-size=24B cardinality=unavailable +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +06:EXCHANGE [UNPARTITIONED] +| +01:SUBPLAN +| row-size=28B cardinality=unavailable +| +|--05:NESTED LOOP JOIN [RIGHT OUTER JOIN] +| | join predicates: t.nested_struct.a < item, item = id + 1 +| | predicates: id IN (7, 9, 10) OR item IS NOT NULL +| | row-size=28B cardinality=10 +| | +| |--02:SINGULAR ROW SRC +| | row-size=24B cardinality=1 +| | +| 04:AGGREGATE [FINALIZE] +| | group by: item +| | row-size=4B cardinality=10 +| | +| 03:UNNEST [t.int_array a] +| row-size=0B cardinality=10 +| +00:SCAN HDFS [functional_parquet.complextypestbl t] + HDFS partitions=1/1 files=2 size=6.92KB + row-size=24B cardinality=unavailable +==== +# IN scalar subquery in disjunct. +select * from functional.alltypes +where int_col > 10 or + string_col in ( + select min(string_col) from functional.alltypestiny + having min(string_col) > 'abc') +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: string_col = min(string_col) +| other predicates: int_col > 10 OR min(string_col) IS NOT NULL +| row-size=101B cardinality=7.30K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(string_col) +| | having: min(string_col) > 'abc' +| | row-size=12B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=13B cardinality=8 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: string_col = min(string_col) +| other predicates: int_col > 10 OR min(string_col) IS NOT NULL +| row-size=101B cardinality=7.30K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(string_col) +| | having: min(string_col) > 'abc' +| | row-size=12B cardinality=1 +| | +| 04:EXCHANGE [UNPARTITIONED] +| | +| 02:AGGREGATE +| | output: min(string_col) +| | row-size=12B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=13B cardinality=8 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# Comparator-based subquery in disjunction. +select * from functional.alltypes t +where t.id = (select min(id) from functional.alltypesagg g) or t.id = 10 +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.id = min(id) +| other predicates: min(id) IS NOT NULL OR t.id = 10 +| row-size=93B cardinality=7.30K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | row-size=4B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.id = min(id) +| other predicates: min(id) IS NOT NULL OR t.id = 10 +| row-size=93B cardinality=7.30K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(id) +| | row-size=4B cardinality=1 +| | +| 04:EXCHANGE [UNPARTITIONED] +| | +| 02:AGGREGATE +| | output: min(id) +| | row-size=4B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# Comparator-based subquery with correlated predicate in disjunction. +select * from functional.alltypes t +where int_col % 2 = 0 or + t.id <= (select min(id) from functional.alltypesagg g where t.int_col = g.int_col) +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id <= min(id) +| other predicates: int_col % 2 = 0 OR min(id) IS NOT NULL +| row-size=97B cardinality=83.91K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id <= min(id) +| other predicates: int_col % 2 = 0 OR min(id) IS NOT NULL +| row-size=97B cardinality=83.91K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 04:EXCHANGE [HASH(g.int_col)] +| | +| 02:AGGREGATE [STREAMING] +| | output: min(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# Subquery nested in a more complex expression that has an OR at the top level. +# The subquery select lists includes an alias to test alias handling. +select * from functional.alltypes +where int_col > 10 or ( + string_col in ('2', '4') and int_col > 0 and string_col in ( + select min(string_col) the_min from functional.alltypestiny)) +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: string_col = min(string_col) +| other predicates: int_col > 10 OR (string_col IN ('2', '4') AND int_col > 0 AND min(string_col) IS NOT NULL) +| row-size=101B cardinality=7.30K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(string_col) +| | row-size=12B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=13B cardinality=8 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: string_col = min(string_col) +| other predicates: int_col > 10 OR (string_col IN ('2', '4') AND int_col > 0 AND min(string_col) IS NOT NULL) +| row-size=101B cardinality=7.30K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(string_col) +| | row-size=12B cardinality=1 +| | +| 04:EXCHANGE [UNPARTITIONED] +| | +| 02:AGGREGATE +| | output: min(string_col) +| | row-size=12B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=13B cardinality=8 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# Subquery in OR in having clause. +select year, month, min(id) +from functional.alltypes +group by year, month +having (year = 2010 and month > 11) or + min(id) in (select int_col from functional.alltypestiny) +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: min(id) = int_col +| other predicates: (`year` = 2010 AND `month` > 11) OR int_col IS NOT NULL +| row-size=16B cardinality=24 +| +|--03:AGGREGATE [FINALIZE] +| | group by: int_col +| | row-size=4B cardinality=2 +| | +| 02:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=4B cardinality=8 +| +01:AGGREGATE [FINALIZE] +| output: min(id) +| group by: `year`, `month` +| row-size=12B cardinality=24 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=12B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +10:EXCHANGE [UNPARTITIONED] +| +04:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: min(id) = int_col +| other predicates: (`year` = 2010 AND `month` > 11) OR int_col IS NOT NULL +| row-size=16B cardinality=24 +| +|--09:EXCHANGE [BROADCAST] +| | +| 08:AGGREGATE [FINALIZE] +| | group by: int_col +| | row-size=4B cardinality=2 +| | +| 07:EXCHANGE [HASH(int_col)] +| | +| 03:AGGREGATE [STREAMING] +| | group by: int_col +| | row-size=4B cardinality=2 +| | +| 02:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=4B cardinality=8 +| +06:AGGREGATE [FINALIZE] +| output: min:merge(id) +| group by: `year`, `month` +| row-size=12B cardinality=24 +| +05:EXCHANGE [HASH(`year`,`month`)] +| +01:AGGREGATE [STREAMING] +| output: min(id) +| group by: `year`, `month` +| row-size=12B cardinality=24 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=12B cardinality=7.30K +==== +# Multiple conjuncts with subqueries with OR. +select id from functional.alltypes t +where (id % 1234 = 0 or + t.id <= (select min(id) from functional.alltypesagg g where t.int_col = g.int_col)) and + (id % 4321 = 0 or + t.id >= (select max(id) from functional.alltypesagg g where t.int_col = g.int_col)) +---- PLAN +PLAN-ROOT SINK +| +06:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id >= max(id) +| other predicates: (id % 4321 = 0 OR max(id) IS NOT NULL) +| row-size=24B cardinality=964.46K +| +|--04:AGGREGATE [FINALIZE] +| | output: max(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 03:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +05:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id <= min(id) +| other predicates: (id % 1234 = 0 OR min(id) IS NOT NULL) +| row-size=16B cardinality=83.91K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=8B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +13:EXCHANGE [UNPARTITIONED] +| +06:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id >= max(id) +| other predicates: (id % 4321 = 0 OR max(id) IS NOT NULL) +| row-size=24B cardinality=964.46K +| +|--12:EXCHANGE [BROADCAST] +| | +| 11:AGGREGATE [FINALIZE] +| | output: max:merge(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 10:EXCHANGE [HASH(g.int_col)] +| | +| 04:AGGREGATE [STREAMING] +| | output: max(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 03:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +05:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.int_col = g.int_col +| other join predicates: t.id <= min(id) +| other predicates: (id % 1234 = 0 OR min(id) IS NOT NULL) +| row-size=16B cardinality=83.91K +| +|--09:EXCHANGE [BROADCAST] +| | +| 08:AGGREGATE [FINALIZE] +| | output: min:merge(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 07:EXCHANGE [HASH(g.int_col)] +| | +| 02:AGGREGATE [STREAMING] +| | output: min(id) +| | group by: g.int_col +| | row-size=8B cardinality=957 +| | +| 01:SCAN HDFS [functional.alltypesagg g] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=8B cardinality=7.30K +==== +# Correlated subquery in BETWEEN in OR rewritten to LOJ. +select int_col from functional.alltypes t +where (t.int_col is null or + (t.int_col between + (select min(int_col) from functional.alltypesagg t2 where t.year = t2.year) + and 2)) +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t.`year` = t2.`year` +| other join predicates: t.int_col >= min(int_col) +| other predicates: (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +| row-size=16B cardinality=7.30K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(int_col) +| | group by: t2.`year` +| | row-size=8B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg t2] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=8B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: t.`year` = t2.`year` +| other join predicates: t.int_col >= min(int_col) +| other predicates: (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +| row-size=16B cardinality=7.30K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(int_col) +| | group by: t2.`year` +| | row-size=8B cardinality=1 +| | +| 04:EXCHANGE [HASH(t2.`year`)] +| | +| 02:AGGREGATE [STREAMING] +| | output: min(int_col) +| | group by: t2.`year` +| | row-size=8B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg t2] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=8B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=8B cardinality=7.30K +==== +# Uncorrelated subquery in BETWEEN in OR rewritten to INNER JOIN. +select int_col from functional.alltypes t +where (t.int_col is null or + (t.int_col between + (select min(int_col) from functional.alltypesagg t2) + and 2)) +---- PLAN +PLAN-ROOT SINK +| +03:NESTED LOOP JOIN [INNER JOIN] +| predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +| row-size=8B cardinality=7.30K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(int_col) +| | row-size=4B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg t2] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=4B cardinality=7.30K +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +03:NESTED LOOP JOIN [INNER JOIN, BROADCAST] +| predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +| row-size=8B cardinality=7.30K +| +|--06:EXCHANGE [BROADCAST] +| | +| 05:AGGREGATE [FINALIZE] +| | output: min:merge(int_col) +| | row-size=4B cardinality=1 +| | +| 04:EXCHANGE [UNPARTITIONED] +| | +| 02:AGGREGATE +| | output: min(int_col) +| | row-size=4B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypesagg t2] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=4B cardinality=7.30K +==== +# LIKE predicate with subquerycan be rewritten into LEFT OUTER JOIN +select * from functional.alltypes t1 +where id = 2 or string_col like ( + select min(string_col) + from functional.alltypesagg t2 + where t1.int_col = t2.int_col) +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t1.int_col = t2.int_col +| other join predicates: string_col LIKE min(string_col) +| other predicates: id = 2 OR min(string_col) IS NOT NULL +| row-size=105B cardinality=83.91K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(string_col) +| | group by: t2.int_col +| | row-size=16B cardinality=957 +| | +| 01:SCAN HDFS [functional.alltypesagg t2] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=19B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t1] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# IN Subquery inside function call (NVL) can be rewritten into LEFT OUTER JOIN +select * from functional.alltypes t +where nvl(int_col in (select id from functional.alltypesagg), false) or id = 2 +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: int_col = id +| other predicates: nvl(id IS NOT NULL, FALSE) OR id = 2 +| row-size=93B cardinality=7.81K +| +|--02:AGGREGATE [FINALIZE] +| | group by: id +| | row-size=4B cardinality=10.28K +| | +| 01:SCAN HDFS [functional.alltypesagg] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== +# IN Subquery inside case expr can be rewritten into LEFT OUTER JOIN +select * from functional.alltypes t +where bigint_col = 2 or + case when bigint_col % 2 = 0 + then int_col in (select id from functional.alltypesagg) + end +---- PLAN +PLAN-ROOT SINK +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: int_col = id +| other predicates: bigint_col = 2 OR CASE WHEN bigint_col % 2 = 0 THEN id IS NOT NULL END +| row-size=93B cardinality=7.81K +| +|--02:AGGREGATE [FINALIZE] +| | group by: id +| | row-size=4B cardinality=10.28K +| | +| 01:SCAN HDFS [functional.alltypesagg] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=4B cardinality=11.00K +| +00:SCAN HDFS [functional.alltypes t] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=89B cardinality=7.30K +==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test index d7d295307..c95a47dcf 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test @@ -10231,3 +10231,215 @@ PLAN-ROOT SINK runtime filters: RF000 -> i2.i_item_sk row-size=38B cardinality=18.00K ==== +# TPCDS-Q45 +SELECT ca_zip, + ca_city, + sum(ws_sales_price) +FROM web_sales, + customer, + customer_address, + date_dim, + item +WHERE ws_bill_customer_sk = c_customer_sk + AND c_current_addr_sk = ca_address_sk + AND ws_item_sk = i_item_sk + AND (SUBSTRING(ca_zip,1,5) IN ('85669', + '86197', + '88274', + '83405', + '86475', + '85392', + '85460', + '80348', + '81792') + OR i_item_id IN + (SELECT i_item_id + FROM item + WHERE i_item_sk IN (2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29) )) + AND ws_sold_date_sk = d_date_sk + AND d_qoy = 2 + AND d_year = 2001 +GROUP BY ca_zip, + ca_city +ORDER BY ca_zip, + ca_city +LIMIT 100; +---- PLAN +Max Per-Host Resource Reservation: Memory=62.62MB Threads=7 +Per-Host Resource Estimates: Memory=375MB +PLAN-ROOT SINK +| +13:TOP-N [LIMIT=100] +| order by: ca_zip ASC, ca_city ASC +| row-size=54B cardinality=100 +| +12:AGGREGATE [FINALIZE] +| output: sum(ws_sales_price) +| group by: ca_zip, ca_city +| row-size=54B cardinality=73.80K +| +11:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: i_item_id = i_item_id +| other predicates: (substring(ca_zip, 1, 5) IN ('85669', '86197', '88274', '83405', '86475', '85392', '85460', '80348', '81792') OR i_item_id IS NOT NULL) +| row-size=146B cardinality=73.80K +| +|--06:AGGREGATE [FINALIZE] +| | group by: i_item_id +| | row-size=28B cardinality=10 +| | +| 05:SCAN HDFS [tpcds.item] +| HDFS partitions=1/1 files=1 size=4.82MB +| predicates: i_item_sk IN (2, 3, 5, 7, 11, 13, 17, 19, 23, 29) +| row-size=36B cardinality=10 +| +10:HASH JOIN [INNER JOIN] +| hash predicates: ws_item_sk = i_item_sk +| runtime filters: RF000 <- i_item_sk +| row-size=118B cardinality=73.80K +| +|--04:SCAN HDFS [tpcds.item] +| HDFS partitions=1/1 files=1 size=4.82MB +| row-size=36B cardinality=18.00K +| +09:HASH JOIN [INNER JOIN] +| hash predicates: c_current_addr_sk = ca_address_sk +| runtime filters: RF002 <- ca_address_sk +| row-size=82B cardinality=73.80K +| +|--02:SCAN HDFS [tpcds.customer_address] +| HDFS partitions=1/1 files=1 size=5.25MB +| row-size=42B cardinality=50.00K +| +08:HASH JOIN [INNER JOIN] +| hash predicates: ws_bill_customer_sk = c_customer_sk +| runtime filters: RF004 <- c_customer_sk +| row-size=40B cardinality=73.80K +| +|--01:SCAN HDFS [tpcds.customer] +| HDFS partitions=1/1 files=1 size=12.60MB +| runtime filters: RF002 -> c_current_addr_sk +| row-size=8B cardinality=100.00K +| +07:HASH JOIN [INNER JOIN] +| hash predicates: ws_sold_date_sk = d_date_sk +| runtime filters: RF006 <- d_date_sk +| row-size=32B cardinality=73.80K +| +|--03:SCAN HDFS [tpcds.date_dim] +| HDFS partitions=1/1 files=1 size=9.84MB +| predicates: d_year = 2001, d_qoy = 2 +| row-size=12B cardinality=186 +| +00:SCAN HDFS [tpcds.web_sales] + HDFS partitions=1/1 files=1 size=140.07MB + runtime filters: RF000 -> ws_item_sk, RF004 -> ws_bill_customer_sk, RF006 -> ws_sold_date_sk + row-size=20B cardinality=719.38K +---- DISTRIBUTEDPLAN +Max Per-Host Resource Reservation: Memory=77.38MB Threads=16 +Per-Host Resource Estimates: Memory=415MB +PLAN-ROOT SINK +| +24:MERGING-EXCHANGE [UNPARTITIONED] +| order by: ca_zip ASC, ca_city ASC +| limit: 100 +| +13:TOP-N [LIMIT=100] +| order by: ca_zip ASC, ca_city ASC +| row-size=54B cardinality=100 +| +23:AGGREGATE [FINALIZE] +| output: sum:merge(ws_sales_price) +| group by: ca_zip, ca_city +| row-size=54B cardinality=73.80K +| +22:EXCHANGE [HASH(ca_zip,ca_city)] +| +12:AGGREGATE [STREAMING] +| output: sum(ws_sales_price) +| group by: ca_zip, ca_city +| row-size=54B cardinality=73.80K +| +11:HASH JOIN [LEFT OUTER JOIN, BROADCAST] +| hash predicates: i_item_id = i_item_id +| other predicates: (substring(ca_zip, 1, 5) IN ('85669', '86197', '88274', '83405', '86475', '85392', '85460', '80348', '81792') OR i_item_id IS NOT NULL) +| row-size=146B cardinality=73.80K +| +|--21:EXCHANGE [BROADCAST] +| | +| 20:AGGREGATE [FINALIZE] +| | group by: i_item_id +| | row-size=28B cardinality=10 +| | +| 19:EXCHANGE [HASH(i_item_id)] +| | +| 06:AGGREGATE [STREAMING] +| | group by: i_item_id +| | row-size=28B cardinality=10 +| | +| 05:SCAN HDFS [tpcds.item] +| HDFS partitions=1/1 files=1 size=4.82MB +| predicates: i_item_sk IN (2, 3, 5, 7, 11, 13, 17, 19, 23, 29) +| row-size=36B cardinality=10 +| +10:HASH JOIN [INNER JOIN, BROADCAST] +| hash predicates: ws_item_sk = i_item_sk +| runtime filters: RF000 <- i_item_sk +| row-size=118B cardinality=73.80K +| +|--18:EXCHANGE [BROADCAST] +| | +| 04:SCAN HDFS [tpcds.item] +| HDFS partitions=1/1 files=1 size=4.82MB +| row-size=36B cardinality=18.00K +| +09:HASH JOIN [INNER JOIN, PARTITIONED] +| hash predicates: c_current_addr_sk = ca_address_sk +| runtime filters: RF002 <- ca_address_sk +| row-size=82B cardinality=73.80K +| +|--17:EXCHANGE [HASH(ca_address_sk)] +| | +| 02:SCAN HDFS [tpcds.customer_address] +| HDFS partitions=1/1 files=1 size=5.25MB +| row-size=42B cardinality=50.00K +| +16:EXCHANGE [HASH(c_current_addr_sk)] +| +08:HASH JOIN [INNER JOIN, BROADCAST] +| hash predicates: ws_bill_customer_sk = c_customer_sk +| runtime filters: RF004 <- c_customer_sk +| row-size=40B cardinality=73.80K +| +|--15:EXCHANGE [BROADCAST] +| | +| 01:SCAN HDFS [tpcds.customer] +| HDFS partitions=1/1 files=1 size=12.60MB +| runtime filters: RF002 -> c_current_addr_sk +| row-size=8B cardinality=100.00K +| +07:HASH JOIN [INNER JOIN, BROADCAST] +| hash predicates: ws_sold_date_sk = d_date_sk +| runtime filters: RF006 <- d_date_sk +| row-size=32B cardinality=73.80K +| +|--14:EXCHANGE [BROADCAST] +| | +| 03:SCAN HDFS [tpcds.date_dim] +| HDFS partitions=1/1 files=1 size=9.84MB +| predicates: d_year = 2001, d_qoy = 2 +| row-size=12B cardinality=186 +| +00:SCAN HDFS [tpcds.web_sales] + HDFS partitions=1/1 files=1 size=140.07MB + runtime filters: RF000 -> ws_item_sk, RF004 -> ws_bill_customer_sk, RF006 -> ws_sold_date_sk + row-size=20B cardinality=719.38K +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/subquery.test b/testdata/workloads/functional-query/queries/QueryTest/subquery.test index 8565f2a30..fe674fc8f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/subquery.test +++ b/testdata/workloads/functional-query/queries/QueryTest/subquery.test @@ -1317,3 +1317,104 @@ select id, sum(int_col) from alltypestiny where ---- CATCH Subquery must not return more than one row: ==== +---- QUERY +# Basic IN subquery in OR predicate. +# Crafted so that each branch of the OR matches a handful of rows. +select id, timestamp_col from alltypes +where (timestamp_col between '2009-04-13' and '2009-04-14') or + id in (select int_col from alltypestiny) +---- TYPES +INT, TIMESTAMP +---- RESULTS +0,2009-01-01 00:00:00 +1,2009-01-01 00:01:00 +1020,2009-04-13 02:00:05.400000000 +1021,2009-04-13 02:01:05.400000000 +1022,2009-04-13 02:02:05.410000000 +1023,2009-04-13 02:03:05.430000000 +1024,2009-04-13 02:04:05.460000000 +1025,2009-04-13 02:05:05.500000000 +1026,2009-04-13 02:06:05.550000000 +1027,2009-04-13 02:07:05.610000000 +1028,2009-04-13 02:08:05.680000000 +1029,2009-04-13 02:09:05.760000000 +---- DBAPI_RESULTS +0,2009-01-01 00:00:00 +1,2009-01-01 00:01:00 +1020,2009-04-13 02:00:05.400000 +1021,2009-04-13 02:01:05.400000 +1022,2009-04-13 02:02:05.410000 +1023,2009-04-13 02:03:05.430000 +1024,2009-04-13 02:04:05.460000 +1025,2009-04-13 02:05:05.500000 +1026,2009-04-13 02:06:05.550000 +1027,2009-04-13 02:07:05.610000 +1028,2009-04-13 02:08:05.680000 +1029,2009-04-13 02:09:05.760000 +==== +---- QUERY +# Subquery in OR predicate inside non-trivial expression. +# Crafted so that each branch of the OR matches a few rows. +select year, id, int_col, string_col, date_string_col +from alltypes +where (int_col = 9 and date_string_col > '12/31/00') or + (year = 2010 and date_string_col > '12/28/10' and + string_col in (select min(string_col) from alltypestiny)) +---- TYPES +INT, INT, INT, STRING, STRING +---- RESULTS +2009,3649,9,'9','12/31/09' +2010,7270,0,'0','12/29/10' +2010,7280,0,'0','12/30/10' +2010,7290,0,'0','12/31/10' +2010,7299,9,'9','12/31/10' +==== +---- QUERY +# Subquery in HAVING clause. +# Crafted so that each branch of the OR matches one row. +select year, month, min(id) +from alltypes +group by year, month +having (year = 2010 and month > 11) or + min(id) in (select int_col from alltypestiny) +---- TYPES +INT, INT, INT +---- RESULTS +2010,12,6990 +2009,1,0 +==== +---- QUERY +# Comparator-based subquery with correlated predicate in disjunction. +select id from alltypes t +where id % 1234 = 0 or + t.id <= (select min(id) from alltypesagg g where t.int_col = g.int_col) +---- TYPES +INT +---- RESULTS +6170 +2468 +4936 +1234 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +3702 +==== +---- QUERY +# Subquery that only returns NULL values. The IN predicate should always +# evaluate to false in this case, because the hand-picked values have a null 'int_col'. +select id, int_col from functional.alltypes t +where t.id = 42 or t.int_col IN ( + select int_col from functional.alltypesagg where id in (1000,2000,3000,4000)) +---- TYPES +INT, INT +---- RESULTS +42,2 +==== diff --git a/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q45.test b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q45.test new file mode 100644 index 000000000..e5638ff96 --- /dev/null +++ b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q45.test @@ -0,0 +1,66 @@ +==== +---- QUERY: TPCDS-Q45 +SELECT ca_zip, + ca_city, + sum(ws_sales_price) +FROM web_sales, + customer, + customer_address, + date_dim, + item +WHERE ws_bill_customer_sk = c_customer_sk + AND c_current_addr_sk = ca_address_sk + AND ws_item_sk = i_item_sk + AND (SUBSTRING(ca_zip,1,5) IN ('85669', + '86197', + '88274', + '83405', + '86475', + '85392', + '85460', + '80348', + '81792') + OR i_item_id IN + (SELECT i_item_id + FROM item + WHERE i_item_sk IN (2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29) )) + AND ws_sold_date_sk = d_date_sk + AND d_qoy = 2 + AND d_year = 2001 +GROUP BY ca_zip, + ca_city +ORDER BY ca_zip, + ca_city +LIMIT 100; +---- RESULTS +'10150','Bunker Hill',20.25 +'16098','Five Points',18.05 +'28048','Salem',9.81 +'29584','Oakdale',41.21 +'31289','Lincoln',1.26 +'34975','Kingston',56.67 +'36971','Wilson',202.63 +'38354','Sulphur Springs',25.68 +'44975','Kingston',12.62 +'62808','Hamilton',85.39 +'62812','Shady Grove',48.77 +'66557','Arlington',2.68 +'68252','Maple Grove',11.37 +'69583','Jackson',54.42 +'73003','Hillcrest',36.03 +'78222','Clinton',85.87 +'83683','Plainview',43.82 +'84536','Friendship',64.44 +'88370','Oak Grove',52.42 +---- TYPES +STRING, STRING, DECIMAL +==== diff --git a/tests/query_test/test_tpcds_queries.py b/tests/query_test/test_tpcds_queries.py index 18135fcb7..f9e911dc7 100644 --- a/tests/query_test/test_tpcds_queries.py +++ b/tests/query_test/test_tpcds_queries.py @@ -447,6 +447,9 @@ class TestTpcdsDecimalV2Query(ImpalaTestSuite): def test_tpcds_q43(self, vector): self.run_test_case(self.get_workload() + '-decimal_v2-q43', vector) + def test_tpcds_q45(self, vector): + self.run_test_case(self.get_workload() + '-decimal_v2-q45', vector) + def test_tpcds_q46(self, vector): self.run_test_case(self.get_workload() + '-decimal_v2-q46', vector) diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py index c093a5dad..80b1c1d16 100644 --- a/tests/util/parse_util.py +++ b/tests/util/parse_util.py @@ -22,7 +22,7 @@ from datetime import datetime # changed, and the stress test loses the ability to run the full set of queries. Set # these constants and assert that when a workload is used, all the queries we expect to # use are there. -EXPECTED_TPCDS_QUERIES_COUNT = 86 +EXPECTED_TPCDS_QUERIES_COUNT = 87 EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22 EXPECTED_TPCH_QUERIES_COUNT = 22 # Add the number of stress test specific queries, i.e. in files like '*-stress-*.test'