diff --git a/fe/src/main/java/com/cloudera/impala/analysis/AnalysisContext.java b/fe/src/main/java/com/cloudera/impala/analysis/AnalysisContext.java index 9bfab13ad..4305561ae 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/AnalysisContext.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/AnalysisContext.java @@ -34,7 +34,6 @@ import com.cloudera.impala.catalog.ImpaladCatalog; import com.cloudera.impala.common.AnalysisException; import com.cloudera.impala.common.Pair; import com.cloudera.impala.thrift.TAccessEvent; -import com.cloudera.impala.thrift.TDescribeOutputStyle; import com.cloudera.impala.thrift.TLineageGraph; import com.cloudera.impala.thrift.TQueryCtx; import com.google.common.base.Preconditions; diff --git a/fe/src/main/java/com/cloudera/impala/analysis/CollectionTableRef.java b/fe/src/main/java/com/cloudera/impala/analysis/CollectionTableRef.java index ce6896070..deeabb072 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/CollectionTableRef.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/CollectionTableRef.java @@ -94,16 +94,6 @@ public class CollectionTableRef extends TableRef { // InlineViews are currently not supported as a parent ref. Preconditions.checkState(!(parentRef instanceof InlineViewRef)); correlatedTupleIds_.add(parentRef.getId()); - } else if (getJoinOp().isCrossJoin() || getJoinOp().isInnerJoin() - || getJoinOp() == JoinOperator.LEFT_SEMI_JOIN) { - // Generate a predicate to filter out empty collections directly - // in the parent scan. This is a performance optimization to avoid - // processing empty collections inside a subplan that would yield - // an empty result set. - IsNotEmptyPredicate isNotEmptyPred = - new IsNotEmptyPredicate(collectionExpr_.clone()); - isNotEmptyPred.analyze(analyzer); - analyzer.registerConjuncts(isNotEmptyPred, false); } } if (!isRelative()) { @@ -115,6 +105,11 @@ public class CollectionTableRef extends TableRef { } isAnalyzed_ = true; analyzeHints(analyzer); + + // TODO: For joins on nested collections some join ops can be simplified + // due to the containment relationship of the parent and child. For example, + // a FULL OUTER JOIN would become a LEFT OUTER JOIN, or a RIGHT SEMI JOIN + // would become an INNER or CROSS JOIN. analyzeJoin(analyzer); } diff --git a/fe/src/main/java/com/cloudera/impala/analysis/SelectStmt.java b/fe/src/main/java/com/cloudera/impala/analysis/SelectStmt.java index a278b6874..576514cca 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/SelectStmt.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/SelectStmt.java @@ -172,6 +172,12 @@ public class SelectStmt extends QueryStmt { throw new AnalysisException("Found missing tables. Aborting analysis."); } + // Generate !empty() predicates to filter out empty collections. + // Skip this step when analyzing a WITH-clause because CollectionTableRefs + // do not register collection slots in their parent in that context + // (see CollectionTableRef.analyze()). + if (!analyzer.isWithClause()) registerIsNotEmptyPredicates(analyzer); + // analyze plan hints from select list selectList_.analyzePlanHints(analyzer); @@ -284,6 +290,50 @@ public class SelectStmt extends QueryStmt { if (aggInfo_ != null) LOG.debug("post-analysis " + aggInfo_.debugString()); } + /** + * Generates and registers !empty() predicates to filter out empty collections directly + * in the parent scan of collection table refs. This is a performance optimization to + * avoid the expensive processing of empty collections inside a subplan that would + * yield an empty result set. + * + * For correctness purposes, the predicates are generated in cases where we can ensure + * that they will be assigned only to the parent scan, and no other plan node. + * + * The conditions are as follows: + * - collection table ref is relative and non-correlated + * - collection table ref represents the rhs of an inner/cross/semi join + * - collection table ref's parent tuple is not outer joined + * + * TODO: In some cases, it is possible to generate !empty() predicates for a correlated + * table ref, but in general, that is not correct for non-trivial query blocks. + * For example, if the block with the correlated ref has an aggregation then adding a + * !empty() predicate would incorrectly discard rows from the final result set. + * TODO: Evaluating !empty() predicates at non-scan nodes interacts poorly with our BE + * projection of collection slots. For example, rows could incorrectly be filtered if + * a !empty() predicate is assigned to a plan node that comes after the unnest of the + * collection that also performs the projection. + */ + private void registerIsNotEmptyPredicates(Analyzer analyzer) throws AnalysisException { + for (TableRef tblRef: tableRefs_) { + Preconditions.checkState(tblRef.isResolved()); + if (!(tblRef instanceof CollectionTableRef)) continue; + CollectionTableRef ref = (CollectionTableRef) tblRef; + // Skip non-relative and correlated refs. + if (!ref.isRelative() || ref.isCorrelated()) continue; + // Skip outer and anti joins. + if (ref.getJoinOp().isOuterJoin() || ref.getJoinOp().isAntiJoin()) continue; + // Do not generate a predicate if the parent tuple is outer joined. + if (analyzer.isOuterJoined(ref.getResolvedPath().getRootDesc().getId())) continue; + IsNotEmptyPredicate isNotEmptyPred = + new IsNotEmptyPredicate(ref.getCollectionExpr().clone()); + isNotEmptyPred.analyze(analyzer); + // Register the predicate as an On-clause conjunct because it should only + // affect the result of this join and not the whole FROM clause. + analyzer.registerOnClauseConjuncts( + Lists.newArrayList(isNotEmptyPred), ref); + } + } + /** * Marks all unassigned join predicates as well as exprs in aggInfo and sortInfo. */ diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test index 5737f82cc..b5e888708 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test @@ -82,7 +82,7 @@ where c_nationkey = n_nationkey and s_nationkey = n_nationkey | partitions=1/1 files=1 size=111.08MB | 05:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB runtime filters: RF000 -> c_nationkey, RF001 -> c.c_comment, RF002 -> c_comment, RF003 -> c.c_nationkey ==== # Test subplans: Cross join of parent and relative ref. @@ -132,6 +132,7 @@ where b.item % 2 = 0 | 00:SCAN HDFS [functional.allcomplextypes a] partitions=0/0 files=0 size=0B + predicates: !empty(a.int_array_col) predicates on b: b.item % 2 = 0 ==== # Test subplans: Left anti join of parent and relative ref without On-clause. @@ -168,8 +169,6 @@ where a.id < 10 predicates: a.id < 10 ==== # Test subplans: Right anti join of parent and relative ref without On-clause. -# TODO: Transform the join op into a CROSS JOIN. -# in this context? select b.item from functional.allcomplextypes a right anti join a.int_array_col b where b.item % 2 = 0 ---- PLAN @@ -219,8 +218,6 @@ where a.id < 10 predicates: a.id < 10 ==== # Test subplans: Right outer join of parent and relative ref without On-clause. -# TODO: Transform the join op into a CROSS JOIN. -# in this context? select a.id, b.item from functional.allcomplextypes a right outer join a.int_array_col b where b.item % 2 = 0 ---- PLAN @@ -237,7 +234,6 @@ where b.item % 2 = 0 predicates on b: b.item % 2 = 0 ==== # Test subplans: Full outer join of parent and relative ref without On-clause. -# TODO: Transform the join op into an INNER JOIN. select a.id, b.item from functional.allcomplextypes a full outer join a.int_array_col b where b.item % 2 = 0 and a.id < 10 ---- PLAN @@ -293,7 +289,6 @@ where a.id < 10 predicates on b: b.item % 2 = 0 ==== # Test subplans: Non-equi right semi join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select b.item from functional.allcomplextypes a right semi join a.int_array_col b on (a.id < b.item and a.id < 10) where b.item % 2 = 0 @@ -309,7 +304,7 @@ where b.item % 2 = 0 | 00:SCAN HDFS [functional.allcomplextypes a] partitions=0/0 files=0 size=0B - predicates: a.id < 10 + predicates: a.id < 10, !empty(a.int_array_col) predicates on b: b.item % 2 = 0 ==== # Test subplans: Non-equi left anti join of parent and relative ref. @@ -332,7 +327,6 @@ where a.id < 10 predicates on b: b.item % 2 = 0 ==== # Test subplans: Non-equi right anti join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select b.item from functional.allcomplextypes a right anti join a.int_array_col b on (a.id < b.item and a.id < 10) where b.item % 2 = 0 @@ -371,7 +365,6 @@ where a.id < 10 predicates on b: b.item % 2 = 0 ==== # Test subplans: Non-equi right outer join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select a.id, b.item from functional.allcomplextypes a right outer join a.int_array_col b on (a.id < b.item and a.id < 10) where b.item % 2 = 0 @@ -391,7 +384,6 @@ where b.item % 2 = 0 predicates on b: b.item % 2 = 0 ==== # Test subplans: Non-equi full outer join of parent and relative ref. -# TODO: Transform the join op into LEFT OUTER JOIN. select a.id, b.item from functional.allcomplextypes a full outer join a.int_array_col b on (a.id < b.item and a.id < 10) where b.item % 2 = 0 @@ -447,11 +439,10 @@ where a.id < 10 | 00:SCAN HDFS [functional.allcomplextypes a] partitions=0/0 files=0 size=0B - predicates: !empty(a.struct_array_col), a.id % 2 = 0, a.id < 10 + predicates: a.id % 2 = 0, !empty(a.struct_array_col), a.id < 10 predicates on b: b.f1 % 2 = 0, b.f1 < 10 ==== # Test subplans: Right-semi equi-join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select b.f1, b.f2 from functional.allcomplextypes a right semi join a.struct_array_col b on (a.id < 10 and b.f1 = a.id and b.f1 < a.year) @@ -468,7 +459,7 @@ where b.f1 % 2 = 0 | 00:SCAN HDFS [functional.allcomplextypes a] partitions=0/0 files=0 size=0B - predicates: a.id < 10, a.id % 2 = 0 + predicates: a.id < 10, !empty(a.struct_array_col), a.id % 2 = 0 predicates on b: b.f1 < 10, b.f1 % 2 = 0 ==== # Test subplans: Left-anti equi-join of parent and relative ref. @@ -492,7 +483,6 @@ where a.id < 10 predicates on b: b.f1 % 2 = 0, b.f1 < 10 ==== # Test subplans: Right-anti equi-join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select b.f1, b.f2 from functional.allcomplextypes a right anti join a.struct_array_col b on (a.id < 10 and b.f1 = a.id and b.f1 < a.year) @@ -533,7 +523,6 @@ where a.id < 10 predicates on b: b.f1 % 2 = 0, b.f1 < 10 ==== # Test subplans: Right-outer equi-join of parent and relative ref. -# TODO: Transform the join op into an INNER JOIN. select b.f1, b.f2 from functional.allcomplextypes a right outer join a.struct_array_col b on (a.id < 10 and b.f1 = a.id and b.f1 < a.year) @@ -554,7 +543,6 @@ where b.f1 % 2 = 0 predicates on b: b.f1 % 2 = 0 ==== # Test subplans: Full-outer equi-join of parent and relative ref. -# TODO: Transform the join op into an LEFT OUTER JOIN. select b.f1, b.f2 from functional.allcomplextypes a full outer join a.struct_array_col b on (b.f1 = a.id and b.f1 < a.year) @@ -689,11 +677,9 @@ inner join functional.alltypes d on (b.id = d.id) | 08:HASH JOIN [FULL OUTER JOIN] | hash predicates: b.id = a.id -| other predicates: !empty(a.struct_array_col) | |--00:SCAN HDFS [functional.allcomplextypes a] | partitions=0/0 files=0 size=0B -| predicates: !empty(a.struct_array_col) | predicates on e: e.f1 < 10 | 01:SCAN HDFS [functional.alltypestiny b] @@ -775,7 +761,7 @@ where b.item < 10 and c.int_col > 30 | 10:HASH JOIN [FULL OUTER JOIN] | hash predicates: c.id = b.item -| other predicates: !empty(a.int_array_col), !empty(a.int_map_col), b.item < 10, c.int_col > 30 +| other predicates: b.item < 10, c.int_col > 30 | |--01:SUBPLAN | | @@ -787,7 +773,6 @@ where b.item < 10 and c.int_col > 30 | | | 00:SCAN HDFS [functional.allcomplextypes a] | partitions=0/0 files=0 size=0B -| predicates: !empty(a.int_array_col), !empty(a.int_map_col) | predicates on b: b.item < 10 | 05:SCAN HDFS [functional.alltypessmall c] @@ -1225,7 +1210,7 @@ limit 10 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders), c_custkey < 10 predicates on o: !empty(o.o_lineitems), o_orderkey < 5 predicates on o_lineitems: l_linenumber < 3 @@ -1322,7 +1307,7 @@ where c.c_custkey = o.o_orderkey and c.c_custkey = o.o_shippriority | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders), c.c_custkey = c.c_nationkey predicates on o: !empty(o.o_lineitems), o.o_orderkey = o.o_shippriority predicates on l: l.l_partkey = l.l_linenumber, l.l_partkey = l.l_suppkey @@ -1429,11 +1414,9 @@ inner join t2.int_array_col | 05:HASH JOIN [LEFT OUTER JOIN] | hash predicates: t1.id = t2.id -| other predicates: !empty(t2.int_array_col) | |--01:SCAN HDFS [functional.allcomplextypes t2] | partitions=0/0 files=0 size=0B -| predicates: !empty(t2.int_array_col) | 00:SCAN HDFS [functional.allcomplextypes t1] partitions=0/0 files=0 size=0B @@ -1539,7 +1522,7 @@ where c.c_custkey in | 04:UNNEST [c.c_orders o2] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB ==== # IMPALA-2412: Test join ordering in nested subplans. Same as above # but with a few inner joins. @@ -1583,7 +1566,7 @@ where c.c_custkey in | 04:UNNEST [c.c_orders o2] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) ==== # IMPALA-2412: Test join ordering in nested subplans. @@ -1625,7 +1608,7 @@ where c.c_custkey in | 04:UNNEST [c.c_orders o2] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB ==== # IMPALA-2446: Test predicate assignment when outer join has no conjuncts in # the ON clause and there are predicates in the WHERE clause that can be assigned to @@ -1726,13 +1709,66 @@ inner join o.o_lineitems | | 05:UNNEST [o.o_lineitems] | | | 07:NESTED LOOP JOIN [RIGHT OUTER JOIN] -| | predicates: !empty(o.o_lineitems) | | | |--02:SINGULAR ROW SRC | | | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB - predicates on o: !empty(o.o_lineitems) + partitions=1/1 files=4 size=554.13MB +==== +# IMPALA-3065/IMPALA-3062: Test correct assignment of !empty() predicates. +# Predicates should not be generated if the parent tuple is outer joined. +select 1 from tpch_nested_parquet.customer c1 +inner join c1.c_orders +right outer join tpch_nested_parquet.customer c2 + on c1.c_custkey = c2.c_custkey +---- PLAN +06:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: c1.c_custkey = c2.c_custkey +| runtime filters: RF000 <- c2.c_custkey +| +|--05:SCAN HDFS [tpch_nested_parquet.customer c2] +| partitions=1/1 files=4 size=554.13MB +| +01:SUBPLAN +| +|--04:NESTED LOOP JOIN [CROSS JOIN] +| | +| |--02:SINGULAR ROW SRC +| | +| 03:UNNEST [c1.c_orders] +| +00:SCAN HDFS [tpch_nested_parquet.customer c1] + partitions=1/1 files=4 size=554.13MB + runtime filters: RF000 -> c1.c_custkey +==== +# IMPALA-3065/IMPALA-3062: Test correct assignment of !empty() predicates. +# Predicates should not be generated if the parent tuple is outer joined. +select 1 from tpch_nested_parquet.customer c1 +full outer join tpch_nested_parquet.customer c2 + on c1.c_custkey = c2.c_custkey +inner join c1.c_orders o1 +left semi join c2.c_orders o2 +---- PLAN +08:SUBPLAN +| +|--06:NESTED LOOP JOIN [LEFT SEMI JOIN] +| | +| |--04:UNNEST [c2.c_orders o2] +| | +| 05:NESTED LOOP JOIN [CROSS JOIN] +| | +| |--02:SINGULAR ROW SRC +| | +| 03:UNNEST [c1.c_orders o1] +| +07:HASH JOIN [FULL OUTER JOIN] +| hash predicates: c1.c_custkey = c2.c_custkey +| +|--01:SCAN HDFS [tpch_nested_parquet.customer c2] +| partitions=1/1 files=4 size=554.13MB +| +00:SCAN HDFS [tpch_nested_parquet.customer c1] + partitions=1/1 files=4 size=554.13MB ==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test index fcc12dc5e..97307df20 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test @@ -30,7 +30,7 @@ order by | group by: l_returnflag, l_linestatus | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate <= '1998-09-02' ---- DISTRIBUTEDPLAN 05:MERGING-EXCHANGE [UNPARTITIONED] @@ -50,7 +50,7 @@ order by | group by: l_returnflag, l_linestatus | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate <= '1998-09-02' ==== # TPCH-Q2 @@ -316,7 +316,7 @@ limit 10 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders), c_mktsegment = 'BUILDING' predicates on o: !empty(o.o_lineitems), o_orderdate < '1995-03-15' predicates on l: l_shipdate > '1995-03-15' @@ -355,7 +355,7 @@ limit 10 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders), c_mktsegment = 'BUILDING' predicates on o: !empty(o.o_lineitems), o_orderdate < '1995-03-15' predicates on l: l_shipdate > '1995-03-15' @@ -408,7 +408,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: o_orderdate >= '1993-07-01', o_orderdate < '1993-10-01' predicates on o_lineitems: l_commitdate < l_receiptdate @@ -446,7 +446,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: o_orderdate >= '1993-07-01', o_orderdate < '1993-10-01' predicates on o_lineitems: l_commitdate < l_receiptdate @@ -522,7 +522,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1994-01-01', o_orderdate < '1995-01-01' runtime filters: RF000 -> c.c_nationkey, RF002 -> c_nationkey @@ -587,7 +587,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1994-01-01', o_orderdate < '1995-01-01' runtime filters: RF000 -> c.c_nationkey, RF002 -> c_nationkey @@ -608,7 +608,7 @@ where | output: sum(l_extendedprice * l_discount) | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1994-01-01', l_shipdate < '1995-01-01', l_discount >= 0.05, l_discount <= 0.07, l_quantity < 24 ---- DISTRIBUTEDPLAN 03:AGGREGATE [FINALIZE] @@ -620,7 +620,7 @@ where | output: sum(l_extendedprice * l_discount) | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1994-01-01', l_shipdate < '1995-01-01', l_discount >= 0.05, l_discount <= 0.07, l_quantity < 24 ==== # TPCH-Q7 @@ -708,7 +708,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems) predicates on l: l_shipdate >= '1995-01-01', l_shipdate <= '1996-12-31' @@ -775,7 +775,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems) predicates on l: l_shipdate >= '1995-01-01', l_shipdate <= '1996-12-31' @@ -879,7 +879,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1995-01-01', o_orderdate <= '1996-12-31' runtime filters: RF001 -> c_nationkey @@ -962,7 +962,7 @@ order by | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1995-01-01', o_orderdate <= '1996-12-31' runtime filters: RF001 -> c_nationkey @@ -1045,7 +1045,7 @@ order by | 03:UNNEST [o.o_lineitems l] | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders o] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(o.o_lineitems) ---- DISTRIBUTEDPLAN 22:MERGING-EXCHANGE [UNPARTITIONED] @@ -1109,7 +1109,7 @@ order by | 03:UNNEST [o.o_lineitems l] | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders o] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(o.o_lineitems) ==== # TPCH-Q10 @@ -1177,7 +1177,7 @@ limit 20 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1993-10-01', o_orderdate < '1994-01-01' predicates on l: l_returnflag = 'R' @@ -1226,7 +1226,7 @@ limit 20 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems), o_orderdate >= '1993-10-01', o_orderdate < '1994-01-01' predicates on l: l_returnflag = 'R' @@ -1442,7 +1442,7 @@ order by | 03:UNNEST [o.o_lineitems l] | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders o] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(o.o_lineitems) predicates on l: l_shipmode IN ('MAIL', 'SHIP'), l_commitdate < l_receiptdate, l_shipdate < l_commitdate, l_receiptdate >= '1994-01-01', l_receiptdate < '1995-01-01' ---- DISTRIBUTEDPLAN @@ -1471,7 +1471,7 @@ order by | 03:UNNEST [o.o_lineitems l] | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders o] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(o.o_lineitems) predicates on l: l_shipmode IN ('MAIL', 'SHIP'), l_commitdate < l_receiptdate, l_shipdate < l_commitdate, l_receiptdate >= '1994-01-01', l_receiptdate < '1995-01-01' ==== @@ -1517,7 +1517,7 @@ order by | 03:UNNEST [c.c_orders] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates on c_orders: (NOT o_comment LIKE '%special%requests%') ---- DISTRIBUTEDPLAN 12:MERGING-EXCHANGE [UNPARTITIONED] @@ -1555,7 +1555,7 @@ order by | 03:UNNEST [c.c_orders] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates on c_orders: (NOT o_comment LIKE '%special%requests%') ==== # TPCH-Q14 @@ -1585,7 +1585,7 @@ where | partitions=1/1 files=1 size=6.30MB | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1995-09-01', l_shipdate < '1995-10-01' runtime filters: RF000 -> l_partkey ---- DISTRIBUTEDPLAN @@ -1607,7 +1607,7 @@ where | partitions=1/1 files=1 size=6.30MB | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1995-09-01', l_shipdate < '1995-10-01' runtime filters: RF000 -> l_partkey ==== @@ -1658,7 +1658,7 @@ order by | | group by: l_suppkey | | | 03:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | predicates: l_shipdate >= '1996-01-01', l_shipdate < '1996-04-01' | 06:HASH JOIN [INNER JOIN] @@ -1673,7 +1673,7 @@ order by | group by: l_suppkey | 01:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1996-01-01', l_shipdate < '1996-04-01' runtime filters: RF000 -> l.l_suppkey ---- DISTRIBUTEDPLAN @@ -1707,7 +1707,7 @@ order by | | group by: l_suppkey | | | 03:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | predicates: l_shipdate >= '1996-01-01', l_shipdate < '1996-04-01' | 06:HASH JOIN [INNER JOIN, PARTITIONED] @@ -1730,7 +1730,7 @@ order by | group by: l_suppkey | 01:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1996-01-01', l_shipdate < '1996-04-01' runtime filters: RF000 -> l.l_suppkey ==== @@ -1862,7 +1862,7 @@ where | | group by: l_partkey | | | 02:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | 04:HASH JOIN [INNER JOIN] | hash predicates: l_partkey = p_partkey @@ -1873,7 +1873,7 @@ where | predicates: p_brand = 'Brand#23', p_container = 'MED BOX' | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB runtime filters: RF000 -> l.l_partkey, RF001 -> l_partkey ---- DISTRIBUTEDPLAN 12:AGGREGATE [FINALIZE] @@ -1902,7 +1902,7 @@ where | | group by: l_partkey | | | 02:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | 04:HASH JOIN [INNER JOIN, BROADCAST] | hash predicates: l_partkey = p_partkey @@ -1915,7 +1915,7 @@ where | predicates: p_brand = 'Brand#23', p_container = 'MED BOX' | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB runtime filters: RF000 -> l.l_partkey, RF001 -> l_partkey ==== # TPCH-Q18 @@ -1976,7 +1976,7 @@ limit 100 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems) ---- DISTRIBUTEDPLAN @@ -2022,7 +2022,7 @@ limit 100 | 03:UNNEST [c.c_orders o] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) predicates on o: !empty(o.o_lineitems) ==== @@ -2076,7 +2076,7 @@ where | partitions=1/1 files=1 size=6.30MB | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB runtime filters: RF000 -> l_partkey ---- DISTRIBUTEDPLAN 06:AGGREGATE [FINALIZE] @@ -2098,7 +2098,7 @@ where | partitions=1/1 files=1 size=6.30MB | 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB runtime filters: RF000 -> l_partkey ==== # TPCH-Q20 @@ -2180,7 +2180,7 @@ order by | group by: l_partkey, l_suppkey | 07:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1994-01-01', l_shipdate < '1995-01-01' runtime filters: RF000 -> l.l_suppkey, RF001 -> l.l_partkey ---- DISTRIBUTEDPLAN @@ -2248,7 +2248,7 @@ order by | group by: l_partkey, l_suppkey | 07:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: l_shipdate >= '1994-01-01', l_shipdate < '1995-01-01' runtime filters: RF000 -> l.l_suppkey, RF001 -> l.l_partkey ==== @@ -2346,9 +2346,9 @@ limit 100 | 04:UNNEST [c.c_orders o] | 01:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) - predicates on o: !empty(o.o_lineitems), o_orderstatus = 'F' + predicates on o: o_orderstatus = 'F', !empty(o.o_lineitems) predicates on l1: l1.l_receiptdate > l1.l_commitdate predicates on l3: l3.l_receiptdate > l3.l_commitdate ---- DISTRIBUTEDPLAN @@ -2419,9 +2419,9 @@ limit 100 | 04:UNNEST [c.c_orders o] | 01:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: !empty(c.c_orders) - predicates on o: !empty(o.o_lineitems), o_orderstatus = 'F' + predicates on o: o_orderstatus = 'F', !empty(o.o_lineitems) predicates on l1: l1.l_receiptdate > l1.l_commitdate predicates on l3: l3.l_receiptdate > l3.l_commitdate ==== @@ -2474,7 +2474,7 @@ order by | | output: avg(c_acctbal) | | | 05:SCAN HDFS [tpch_nested_parquet.customer c] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | predicates: c_acctbal > 0.00, substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17') | 01:SUBPLAN @@ -2486,7 +2486,7 @@ order by | 03:UNNEST [c.c_orders] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17') ---- DISTRIBUTEDPLAN 15:MERGING-EXCHANGE [UNPARTITIONED] @@ -2519,7 +2519,7 @@ order by | | output: avg(c_acctbal) | | | 05:SCAN HDFS [tpch_nested_parquet.customer c] -| partitions=1/1 files=4 size=577.87MB +| partitions=1/1 files=4 size=554.13MB | predicates: c_acctbal > 0.00, substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17') | 01:SUBPLAN @@ -2531,6 +2531,6 @@ order by | 03:UNNEST [c.c_orders] | 00:SCAN HDFS [tpch_nested_parquet.customer c] - partitions=1/1 files=4 size=577.87MB + partitions=1/1 files=4 size=554.13MB predicates: substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17') ==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test index c3ddd7768..2e38d1deb 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test +++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test @@ -421,3 +421,32 @@ select id, pos from complextypestbl t1 full outer join t1.int_array t2 ---- TYPES bigint,bigint ==== +---- QUERY +# IMPALA-3065/IMPALA-3062: Test a join on a nested collection whose +# parent tuple is outer joined. This test covers the case where the +# outer joined collection is on the probe side of the outer join. +# To reliably reproduce one of the problematic cases, we need +# > batch_size matches for at least one probe row. +select straight_join count(o.pos) from tpch_nested_parquet.customer c1 +right outer join tpch_nested_parquet.customer c2 + on c1.c_custkey % 2 = c2.c_custkey % 2 +inner join c1.c_orders o +where c1.c_custkey < 10 and c2.c_custkey < 10000 +---- RESULTS +329960 +---- TYPES +bigint +==== +---- QUERY +# IMPALA-3065/IMPALA-3062: Test a join on a nested collection whose +# parent tuple is outer joined. This test covers the case where the +# outer joined collection is on the build side of the outer join. +select count(a.pos) from complextypestbl t1 +full outer join complextypestbl t2 + on t1.id = t2.id +inner join t2.int_array a +---- RESULTS +10 +---- TYPES +bigint +====